Sequence Statistics

Calculate comprehensive statistics for sequence datasets using Biopython.

Required Imports

python

from Bio import SeqIO
from Bio.SeqUtils import gc_fraction
import statistics

Basic Statistics

Sequence Count and Total Length

python

records = list(SeqIO.parse('sequences.fasta', 'fasta'))
total_seqs = len(records)
total_bp = sum(len(r.seq) for r in records)
print(f'Sequences: {total_seqs}')
print(f'Total bp: {total_bp:,}')

Length Statistics

python

lengths = [len(r.seq) for r in SeqIO.parse('sequences.fasta', 'fasta')]

print(f'Count: {len(lengths)}')
print(f'Total: {sum(lengths):,} bp')
print(f'Min: {min(lengths):,} bp')
print(f'Max: {max(lengths):,} bp')
print(f'Mean: {statistics.mean(lengths):,.1f} bp')
print(f'Median: {statistics.median(lengths):,.1f} bp')
print(f'Std Dev: {statistics.stdev(lengths):,.1f} bp')

N50 and Nx Statistics

Calculate N50

python

def calculate_n50(lengths):
    sorted_lengths = sorted(lengths, reverse=True)
    total = sum(sorted_lengths)
    cumsum = 0
    for length in sorted_lengths:
        cumsum += length
        if cumsum >= total * 0.5:
            return length
    return 0

lengths = [len(r.seq) for r in SeqIO.parse('assembly.fasta', 'fasta')]
n50 = calculate_n50(lengths)
print(f'N50: {n50:,} bp')

Calculate Any Nx Value

python

def calculate_nx(lengths, x):
    '''Calculate Nx where x is percentage (e.g., 50 for N50, 90 for N90)'''
    sorted_lengths = sorted(lengths, reverse=True)
    total = sum(sorted_lengths)
    threshold = total * (x / 100)
    cumsum = 0
    for length in sorted_lengths:
        cumsum += length
        if cumsum >= threshold:
            return length
    return 0

lengths = [len(r.seq) for r in SeqIO.parse('assembly.fasta', 'fasta')]
print(f'N50: {calculate_nx(lengths, 50):,} bp')
print(f'N75: {calculate_nx(lengths, 75):,} bp')
print(f'N90: {calculate_nx(lengths, 90):,} bp')

L50 (Number of Sequences in N50)

python

def calculate_l50(lengths):
    sorted_lengths = sorted(lengths, reverse=True)
    total = sum(sorted_lengths)
    cumsum = 0
    for i, length in enumerate(sorted_lengths, 1):
        cumsum += length
        if cumsum >= total * 0.5:
            return i
    return len(sorted_lengths)

lengths = [len(r.seq) for r in SeqIO.parse('assembly.fasta', 'fasta')]
print(f'L50: {calculate_l50(lengths)} sequences')

GC Content Statistics

Overall GC Content

python

from Bio.SeqUtils import gc_fraction

total_gc = 0
total_len = 0
for record in SeqIO.parse('sequences.fasta', 'fasta'):
    total_gc += gc_fraction(record.seq) * len(record.seq)
    total_len += len(record.seq)

overall_gc = total_gc / total_len
print(f'Overall GC: {overall_gc:.1%}')

Per-Sequence GC Distribution

python

gc_values = [gc_fraction(r.seq) for r in SeqIO.parse('sequences.fasta', 'fasta')]

print(f'Mean GC: {statistics.mean(gc_values):.1%}')
print(f'Median GC: {statistics.median(gc_values):.1%}')
print(f'Min GC: {min(gc_values):.1%}')
print(f'Max GC: {max(gc_values):.1%}')
print(f'Std Dev: {statistics.stdev(gc_values):.2%}')

GC Content Histogram Data

python

from collections import Counter

gc_values = [gc_fraction(r.seq) for r in SeqIO.parse('sequences.fasta', 'fasta')]
bins = [int(gc * 100 // 5) * 5 for gc in gc_values]  # 5% bins
histogram = Counter(bins)

for gc_bin in sorted(histogram.keys()):
    count = histogram[gc_bin]
    print(f'{gc_bin}-{gc_bin+5}%: {count} sequences')

Length Distribution

Length Histogram Data

python

from collections import Counter

lengths = [len(r.seq) for r in SeqIO.parse('sequences.fasta', 'fasta')]

# Define bins (e.g., 0-100, 100-200, etc.)
bin_size = 100
bins = [(l // bin_size) * bin_size for l in lengths]
histogram = Counter(bins)

for length_bin in sorted(histogram.keys()):
    count = histogram[length_bin]
    print(f'{length_bin}-{length_bin + bin_size}: {count}')

Length Percentiles

python

import statistics

lengths = sorted(len(r.seq) for r in SeqIO.parse('sequences.fasta', 'fasta'))

percentiles = [10, 25, 50, 75, 90, 95, 99]
for p in percentiles:
    idx = int(len(lengths) * p / 100)
    print(f'P{p}: {lengths[idx]:,} bp')

Comprehensive Summary Report

python

from Bio import SeqIO
from Bio.SeqUtils import gc_fraction
import statistics

def sequence_summary(fasta_file):
    records = list(SeqIO.parse(fasta_file, 'fasta'))
    lengths = [len(r.seq) for r in records]
    gc_values = [gc_fraction(r.seq) for r in records]

    sorted_lengths = sorted(lengths, reverse=True)
    total_bp = sum(lengths)

    # Calculate N50
    cumsum = 0
    n50 = 0
    l50 = 0
    for i, length in enumerate(sorted_lengths, 1):
        cumsum += length
        if cumsum >= total_bp * 0.5 and n50 == 0:
            n50 = length
            l50 = i

    return {
        'file': fasta_file,
        'sequences': len(records),
        'total_bp': total_bp,
        'min_length': min(lengths),
        'max_length': max(lengths),
        'mean_length': statistics.mean(lengths),
        'median_length': statistics.median(lengths),
        'n50': n50,
        'l50': l50,
        'gc_mean': statistics.mean(gc_values),
        'gc_std': statistics.stdev(gc_values) if len(gc_values) > 1 else 0
    }

stats = sequence_summary('assembly.fasta')
print(f'File: {stats["file"]}')
print(f'Sequences: {stats["sequences"]:,}')
print(f'Total bp: {stats["total_bp"]:,}')
print(f'Min/Max: {stats["min_length"]:,} / {stats["max_length"]:,} bp')
print(f'Mean: {stats["mean_length"]:,.1f} bp')
print(f'Median: {stats["median_length"]:,.1f} bp')
print(f'N50: {stats["n50"]:,} bp (L50: {stats["l50"]})')
print(f'GC: {stats["gc_mean"]:.1%} (+/- {stats["gc_std"]:.1%})')

Compare Multiple Assemblies

python

from pathlib import Path

def compare_assemblies(fasta_files):
    results = []
    for fasta_file in fasta_files:
        stats = sequence_summary(str(fasta_file))
        results.append(stats)
    return results

files = list(Path('assemblies/').glob('*.fasta'))
comparisons = compare_assemblies(files)

print(f'{"File":<30} {"Seqs":>10} {"Total bp":>15} {"N50":>12}')
print('-' * 70)
for stats in comparisons:
    print(f'{Path(stats["file"]).name:<30} {stats["sequences"]:>10,} {stats["total_bp"]:>15,} {stats["n50"]:>12,}')

Export to CSV

python

import csv
from pathlib import Path

def export_stats_csv(fasta_files, output_csv):
    fieldnames = ['file', 'sequences', 'total_bp', 'min_length', 'max_length',
                  'mean_length', 'median_length', 'n50', 'l50', 'gc_mean', 'gc_std']

    with open(output_csv, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for fasta_file in fasta_files:
            stats = sequence_summary(str(fasta_file))
            writer.writerow(stats)

files = list(Path('assemblies/').glob('*.fasta'))
export_stats_csv(files, 'assembly_stats.csv')

Nucleotide Composition

python

from collections import Counter

def nucleotide_composition(fasta_file):
    total_counts = Counter()
    for record in SeqIO.parse(fasta_file, 'fasta'):
        total_counts.update(str(record.seq).upper())

    total = sum(total_counts.values())
    return {base: count / total for base, count in total_counts.items()}

comp = nucleotide_composition('sequences.fasta')
for base in ['A', 'T', 'G', 'C', 'N']:
    if base in comp:
        print(f'{base}: {comp[base]:.2%}')

Common Metrics Reference

Metric	Description
N50	Length where 50% of total bases are in sequences >= this length
L50	Number of sequences needed to reach N50
N90	Length where 90% of total bases are in sequences >= this length
GC%	Proportion of G+C bases
Contiguity	Higher N50 = more contiguous assembly

Related Skills

read-sequences - Parse sequences for statistics calculation
batch-processing - Calculate stats across multiple files
fastq-quality - Quality score statistics for FASTQ files
sequence-manipulation/sequence-properties - Per-sequence GC content and properties
alignment-files - samtools stats/flagstat for alignment statistics

Search AI Tools

bio-sequence-statistics

Install this agent skill to your Project

SKILL.md

Sequence Statistics

Required Imports

Basic Statistics

Sequence Count and Total Length

Length Statistics

N50 and Nx Statistics

Calculate N50

Calculate Any Nx Value

L50 (Number of Sequences in N50)

GC Content Statistics

Overall GC Content

Per-Sequence GC Distribution

GC Content Histogram Data

Length Distribution

Length Histogram Data

Length Percentiles

Comprehensive Summary Report

Compare Multiple Assemblies

Export to CSV

Nucleotide Composition

Common Metrics Reference

Related Skills