diff --git a/.github/workflows/rust-publish.yml b/.github/workflows/rust-publish.yml
index 8e1c5b1a..e876c45d 100644
--- a/.github/workflows/rust-publish.yml
+++ b/.github/workflows/rust-publish.yml
@@ -157,8 +157,7 @@ jobs:
     secrets: inherit
 
   publish-wasm:
-    if: ${{ always() && inputs.wasm != false && needs.publish-all-crates.result == 'success' }}
-    needs: publish-all-crates
+    if: ${{ always() && inputs.wasm != false }}
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
diff --git a/gtars-cli/Cargo.toml b/gtars-cli/Cargo.toml
index 0bb4b8e2..f43c9c89 100644
--- a/gtars-cli/Cargo.toml
+++ b/gtars-cli/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "gtars-cli"
-version = "0.8.0"
+version = "0.9.0"
 edition = "2024"
 description = "Performance critical tools for genomic interval analysis. This is the CLI"
 homepage = "https://github.com/databio/gtars"
@@ -25,7 +25,7 @@ gtars-igd = { path = "../gtars-igd", optional=true, version="0.5.1" }
 gtars-uniwig = { path = "../gtars-uniwig", optional=true, version="0.8.0" }
 gtars-overlaprs = { path = "../gtars-overlaprs", optional = true, version="0.5.1" }
 gtars-bbcache = { path = "../gtars-bbcache", optional=true, version="0.5.3" }
-gtars-genomicdist = { path = "../gtars-genomicdist", optional=true, version="0.6.0" }
+gtars-genomicdist = { path = "../gtars-genomicdist", optional=true, version="0.8.0" }
 gtars-core = { path = "../gtars-core", version="0.5.5", features=["bigbed", "http"] }
 
 # serialization
@@ -38,7 +38,7 @@ name = "gtars"
 path = "src/main.rs"
 
 [features]
-default = []
+default = ["scoring", "uniwig", "bbcache", "igd", "fragsplit", "overlaprs", "genomicdist"]
 scoring = ["dep:gtars-scoring"]
 uniwig = ["dep:gtars-uniwig"]
 bbcache = ["dep:gtars-bbcache"]
diff --git a/gtars-cli/src/genomicdist/cli.rs b/gtars-cli/src/genomicdist/cli.rs
index 49b14da9..8d241a9d 100644
--- a/gtars-cli/src/genomicdist/cli.rs
+++ b/gtars-cli/src/genomicdist/cli.rs
@@ -24,7 +24,7 @@ pub fn create_genomicdist_cli() -> Command {
             Arg::new("chrom-sizes")
                 .long("chrom-sizes")
                 .required(false)
-                .help("Path to chrom.sizes file (enables expected partitions and promoter trimming)"),
+                .help("Path to chrom.sizes file. When provided, region distribution uses a per-chromosome bin size derived from the reference genome (stable across files). Also enables expected partitions and promoter trimming."),
         )
         .arg(
             arg!(--output <OUTPUT>)
@@ -35,7 +35,7 @@ pub fn create_genomicdist_cli() -> Command {
             arg!(--bins <BINS>)
                 .required(false)
                 .default_value("250")
-                .help("Number of bins for region distribution"),
+                .help("Number of bins for the region distribution. Bin width is derived as max_chrom_len/bins; shorter chromosomes get proportionally fewer bins."),
         )
         .arg(
             Arg::new("signal-matrix")
@@ -43,6 +43,30 @@ pub fn create_genomicdist_cli() -> Command {
                 .required(false)
                 .help("Path to open signal matrix TSV (enables cell-type open chromatin enrichment)"),
         )
+        .arg(
+            Arg::new("fasta")
+                .long("fasta")
+                .required(false)
+                .help("Path to genome FASTA (.fa) or binary FASTA (.fab) file. Enables GC content; also enables dinucleotide frequencies when --dinucl-freq is set. Use .fab format (via gtars prep --fasta) for best performance."),
+        )
+        .arg(
+            Arg::new("ignore-unk-chroms")
+                .long("ignore-unk-chroms")
+                .action(clap::ArgAction::SetTrue)
+                .help("When computing GC content, skip regions on chromosomes not in the FASTA (default: error)"),
+        )
+        .arg(
+            Arg::new("dinucl-freq")
+                .long("dinucl-freq")
+                .action(clap::ArgAction::SetTrue)
+                .help("Compute per-region dinucleotide frequencies (expensive for wide regions; opt-in even when --fasta is provided)"),
+        )
+        .arg(
+            Arg::new("dinucl-raw-counts")
+                .long("dinucl-raw-counts")
+                .action(clap::ArgAction::SetTrue)
+                .help("Return raw per-region dinucleotide counts instead of percentages (matches R GenomicDistributions' rawCounts=TRUE)"),
+        )
         .arg(
             Arg::new("promoter-upstream")
                 .long("promoter-upstream")
diff --git a/gtars-cli/src/genomicdist/handlers.rs b/gtars-cli/src/genomicdist/handlers.rs
index 41a940c4..6e3a6c24 100644
--- a/gtars-cli/src/genomicdist/handlers.rs
+++ b/gtars-cli/src/genomicdist/handlers.rs
@@ -9,12 +9,16 @@ use serde::Serialize;
 
 use gtars_core::models::{Region, RegionSet};
 use gtars_core::utils::get_chrom_sizes;
-use gtars_genomicdist::models::{ChromosomeStatistics, RegionBin, Strand, TssIndex};
+use gtars_genomicdist::models::{
+    BinaryGenomeAssembly, ChromosomeStatistics, GenomeAssembly, RegionBin, SequenceAccess,
+    Strand, TssIndex,
+};
 use gtars_genomicdist::statistics::GenomicIntervalSetStatistics;
 use gtars_genomicdist::{
     GeneModel, GenomicDistAnnotation, ExpectedPartitionResult, PartitionResult,
     calc_expected_partitions, calc_partitions, genome_partition_list,
     SignalMatrix, calc_summary_signal, ConditionStats,
+    calc_gc_content, calc_dinucl_freq, DINUCL_ORDER,
     median_abs_distance,
 };
 
@@ -28,6 +32,32 @@ struct GenomicDistOutput {
     expected_partitions: Option<ExpectedPartitionResult>,
     #[serde(skip_serializing_if = "Option::is_none")]
     open_signal: Option<OpenSignalOutput>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    gc_content: Option<GcContentOutput>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    dinucl_freq: Option<DinuclFreqOutput>,
+}
+
+#[derive(Serialize)]
+struct GcContentOutput {
+    /// Mean GC content across all regions (0–1)
+    mean: f64,
+    /// Per-region GC content values (0–1), one per region in input order
+    per_region: Vec<f64>,
+}
+
+#[derive(Serialize)]
+struct DinuclFreqOutput {
+    /// Dinucleotide names in canonical order (matches DINUCL_ORDER)
+    dinucleotides: Vec<String>,
+    /// `chr_start_end` label per region
+    region_labels: Vec<String>,
+    /// Per-region matrix: outer is regions, inner is 16 values matching
+    /// `dinucleotides` order. Percentages (0–100) by default, or raw counts
+    /// if --dinucl-raw-counts flag was passed.
+    frequencies: Vec<[f64; 16]>,
+    /// Whether `frequencies` are raw counts (true) or percentages (false)
+    raw_counts: bool,
 }
 
 #[derive(Serialize)]
@@ -65,6 +95,8 @@ pub fn run_genomicdist(matches: &ArgMatches) -> Result<()> {
     let chrom_sizes_path = matches.get_one::<String>("chrom-sizes");
     let output_path = matches.get_one::<String>("output");
     let signal_matrix_path = matches.get_one::<String>("signal-matrix");
+    let fasta_path = matches.get_one::<String>("fasta");
+    let ignore_unk_chroms = matches.get_flag("ignore-unk-chroms");
     let n_bins: u32 = matches
         .get_one::<String>("bins")
         .unwrap()
@@ -91,7 +123,22 @@ pub fn run_genomicdist(matches: &ArgMatches) -> Result<()> {
     // --- Unconditional computations ---
     let widths = rs.calc_widths();
     let chromosome_stats = rs.chromosome_statistics();
-    let region_dist_map = rs.region_distribution_with_bins(n_bins);
+    let region_dist_map = match explicit_chrom_sizes.as_ref() {
+        Some(cs) => rs.region_distribution_with_chrom_sizes(n_bins, cs),
+        None => {
+            eprintln!(
+                "warning: --chrom-sizes not provided; using BED-file-derived bin width."
+            );
+            eprintln!(
+                "         Outputs will NOT be comparable across files or aligned with"
+            );
+            eprintln!(
+                "         reference genome positions. Pass --chrom-sizes <file> for"
+            );
+            eprintln!("         reference-aligned bins.");
+            rs.region_distribution_with_bins(n_bins)
+        }
+    };
     let neighbor_distances = rs
         .calc_neighbor_distances()
         .map_err(|e| anyhow::anyhow!("Failed to compute neighbor distances: {}", e))?;
@@ -214,6 +261,58 @@ pub fn run_genomicdist(matches: &ArgMatches) -> Result<()> {
         None => None,
     };
 
+    // --- Optional: GC content + dinucleotide frequencies (require FASTA) ---
+    // --fasta enables GC content. Dinucleotide frequencies are additionally
+    // opt-in via --dinucl-freq because they can be expensive for region sets
+    // with very wide regions (each dinucleotide window of every region is
+    // inspected; width dominates cost).
+    let dinucl_raw_counts = matches.get_flag("dinucl-raw-counts");
+    let compute_dinucl = matches.get_flag("dinucl-freq");
+    let (gc_content_out, dinucl_freq_out) = match fasta_path {
+        Some(p) => {
+            // Auto-detect .fab binary format vs plain FASTA (HashMap fallback)
+            let assembly: Box<dyn SequenceAccess> = if p.ends_with(".fab") {
+                Box::new(BinaryGenomeAssembly::try_from(p.as_str())
+                    .map_err(|e| anyhow::anyhow!("Failed to load .fab: {}", e))?)
+            } else {
+                Box::new(GenomeAssembly::try_from(p.as_str())
+                    .map_err(|e| anyhow::anyhow!("Failed to load FASTA: {}", e))?)
+            };
+
+            let gc_per_region = calc_gc_content(&rs, assembly.as_ref(), ignore_unk_chroms)
+                .map_err(|e| anyhow::anyhow!("Failed to compute GC content: {}", e))?;
+            let gc_mean = if gc_per_region.is_empty() {
+                0.0
+            } else {
+                gc_per_region.iter().sum::<f64>() / gc_per_region.len() as f64
+            };
+            let gc_out = GcContentOutput {
+                mean: gc_mean,
+                per_region: gc_per_region,
+            };
+
+            let dinucl_out = if compute_dinucl {
+                let (labels, matrix) = calc_dinucl_freq(
+                    &rs, assembly.as_ref(), dinucl_raw_counts, ignore_unk_chroms,
+                )
+                .map_err(|e| anyhow::anyhow!("Failed to compute dinucl freq: {}", e))?;
+                Some(DinuclFreqOutput {
+                    dinucleotides: DINUCL_ORDER
+                        .iter()
+                        .map(|d| d.to_string().unwrap_or_default())
+                        .collect(),
+                    region_labels: labels,
+                    frequencies: matrix,
+                    raw_counts: dinucl_raw_counts,
+                })
+            } else {
+                None
+            };
+            (Some(gc_out), dinucl_out)
+        }
+        None => (None, None),
+    };
+
     // --- Build output ---
     let output = GenomicDistOutput {
         scalars: Scalars {
@@ -232,6 +331,8 @@ pub fn run_genomicdist(matches: &ArgMatches) -> Result<()> {
         },
         expected_partitions,
         open_signal,
+        gc_content: gc_content_out,
+        dinucl_freq: dinucl_freq_out,
     };
 
     let compact = matches.get_flag("compact");
diff --git a/gtars-cli/src/overlaprs/cli.rs b/gtars-cli/src/overlaprs/cli.rs
index ed6485ea..6382b661 100644
--- a/gtars-cli/src/overlaprs/cli.rs
+++ b/gtars-cli/src/overlaprs/cli.rs
@@ -1,14 +1,26 @@
-use clap::{Command, arg};
+use clap::{Arg, Command, arg};
 
 pub const OVERLAP_CMD: &str = "overlaprs";
 
 pub fn create_overlap_cli() -> Command {
     Command::new(OVERLAP_CMD)
         .author("NJL")
-        .about("Tokenize data into a universe")
+        .about("Tokenize a BED file against a universe of regions (overlap-based encoding).")
         .arg_required_else_help(true)
-        .arg(arg!(-q <query> "The file you are tokenizing"))
-        .arg(arg!(-u <universe> "The universe you are tokenizing into"))
+        .arg(
+            Arg::new("query")
+                .short('q')
+                .long("query")
+                .required(true)
+                .help("Path to the BED file to tokenize"),
+        )
+        .arg(
+            Arg::new("universe")
+                .short('u')
+                .long("universe")
+                .required(true)
+                .help("Path to the universe BED file to tokenize against"),
+        )
         .arg(arg!(-e --backend <backend> "Which backend to use (ailist or bits)"))
         .arg(arg!(--streaming "Use streaming mode for very large universes (lower memory usage)"))
 }
diff --git a/gtars-cli/src/prep/cli.rs b/gtars-cli/src/prep/cli.rs
index f707da24..d150d38f 100644
--- a/gtars-cli/src/prep/cli.rs
+++ b/gtars-cli/src/prep/cli.rs
@@ -4,7 +4,7 @@ pub const PREP_CMD: &str = "prep";
 
 pub fn create_prep_cli() -> Command {
     Command::new(PREP_CMD)
-        .about("Pre-serialize GTF gene models or signal matrices to binary for fast loading.")
+        .about("Pre-serialize GTF gene models, signal matrices, or FASTA files to binary for fast loading.")
         .arg(
             Arg::new("gtf")
                 .long("gtf")
@@ -17,11 +17,17 @@ pub fn create_prep_cli() -> Command {
                 .required(false)
                 .help("Path to signal matrix TSV/TSV.gz to serialize"),
         )
+        .arg(
+            Arg::new("fasta")
+                .long("fasta")
+                .required(false)
+                .help("Path to FASTA file to convert to .fab binary (zero-copy mmap format)"),
+        )
         .arg(
             Arg::new("output")
                 .long("output")
                 .short('o')
                 .required(false)
-                .help("Output path (default: input path with .bin extension, stripping .gz first)"),
+                .help("Output path (default: input path with .bin/.fab extension)"),
         )
 }
diff --git a/gtars-cli/src/prep/handlers.rs b/gtars-cli/src/prep/handlers.rs
index 6494a056..60bf08cd 100644
--- a/gtars-cli/src/prep/handlers.rs
+++ b/gtars-cli/src/prep/handlers.rs
@@ -5,6 +5,7 @@ use anyhow::Result;
 use clap::ArgMatches;
 
 use gtars_genomicdist::{GenomicDistAnnotation, SignalMatrix};
+use gtars_genomicdist::models::BinaryGenomeAssembly;
 
 /// Derive the default output path: strip `.gz` then append `.bin`.
 fn default_output_path(input: &str) -> String {
@@ -15,10 +16,11 @@ fn default_output_path(input: &str) -> String {
 pub fn run_prep(matches: &ArgMatches) -> Result<()> {
     let gtf_path = matches.get_one::<String>("gtf");
     let signal_path = matches.get_one::<String>("signal-matrix");
+    let fasta_path = matches.get_one::<String>("fasta");
     let output_path = matches.get_one::<String>("output");
 
-    if gtf_path.is_none() && signal_path.is_none() {
-        anyhow::bail!("Provide at least one of --gtf or --signal-matrix");
+    if gtf_path.is_none() && signal_path.is_none() && fasta_path.is_none() {
+        anyhow::bail!("Provide at least one of --gtf, --signal-matrix, or --fasta");
     }
 
     if let Some(gtf) = gtf_path {
@@ -79,5 +81,29 @@ pub fn run_prep(matches: &ArgMatches) -> Result<()> {
         );
     }
 
+    if let Some(fa) = fasta_path {
+        let out = output_path
+            .cloned()
+            .unwrap_or_else(|| {
+                let stripped = fa.strip_suffix(".gz").unwrap_or(fa);
+                format!("{}.fab", stripped)
+            });
+
+        eprintln!("Converting FASTA to .fab: {}", fa);
+        let start = Instant::now();
+        BinaryGenomeAssembly::write_from_fasta(Path::new(fa), Path::new(&out))
+            .map_err(|e| anyhow::anyhow!("Failed to create .fab: {}", e))?;
+
+        let size = std::fs::metadata(&out)
+            .map(|m| m.len())
+            .unwrap_or(0);
+        eprintln!(
+            "  wrote {} ({:.1} GB) in {:.1}s",
+            out,
+            size as f64 / 1_073_741_824.0,
+            start.elapsed().as_secs_f64()
+        );
+    }
+
     Ok(())
 }
diff --git a/gtars-cli/src/ranges/cli.rs b/gtars-cli/src/ranges/cli.rs
index a8f3377e..a6c2fe5c 100644
--- a/gtars-cli/src/ranges/cli.rs
+++ b/gtars-cli/src/ranges/cli.rs
@@ -133,8 +133,14 @@ pub fn create_ranges_cli() -> Command {
         )
         .subcommand(
             Command::new("gaps")
-                .about("Compute gaps between regions per chromosome.")
+                .about("Compute gaps between regions per chromosome, bounded by chrom sizes.")
                 .arg(arg!(--input <BED> "Input BED file").required(true))
+                .arg(
+                    Arg::new("chrom-sizes")
+                        .long("chrom-sizes")
+                        .required(true)
+                        .help("Path to chrom.sizes file"),
+                )
                 .arg(arg!(--output <OUTPUT> "Output BED file (default: stdout)").required(false)),
         )
         .subcommand(
diff --git a/gtars-cli/src/ranges/handlers.rs b/gtars-cli/src/ranges/handlers.rs
index a090d3ee..beab1af2 100644
--- a/gtars-cli/src/ranges/handlers.rs
+++ b/gtars-cli/src/ranges/handlers.rs
@@ -122,7 +122,11 @@ pub fn run_ranges(matches: &ArgMatches) -> Result<()> {
         }
         Some(("gaps", m)) => {
             let rs = load_input(m)?;
-            let result = rs.gaps();
+            let cs_path = m
+                .get_one::<String>("chrom-sizes")
+                .expect("--chrom-sizes is required");
+            let chrom_sizes = get_chrom_sizes(cs_path);
+            let result = rs.gaps(&chrom_sizes);
             write_output(&result, m.get_one::<String>("output"))
         }
         Some(("intersect", m)) => {
diff --git a/gtars-genomicdist/Cargo.toml b/gtars-genomicdist/Cargo.toml
index 93e7f383..3b7f2eca 100644
--- a/gtars-genomicdist/Cargo.toml
+++ b/gtars-genomicdist/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "gtars-genomicdist"
-version = "0.6.0"
+version = "0.8.0"
 edition = "2024"
 description = "Rust port of GenomicDistributions: tools for computing statistics for genomic interval sets"
 license = "MIT"
@@ -15,6 +15,7 @@ regex = "1.11.1"
 thiserror = { workspace = true }
 serde = { workspace = true }
 bio = "3.0.0"
+memmap2 = "0.9"
 
 
 [dev-dependencies]
diff --git a/gtars-genomicdist/src/interval_ranges.rs b/gtars-genomicdist/src/interval_ranges.rs
index aa2b952b..b013bbb8 100644
--- a/gtars-genomicdist/src/interval_ranges.rs
+++ b/gtars-genomicdist/src/interval_ranges.rs
@@ -138,11 +138,26 @@ pub trait IntervalRanges {
     /// tiles the covered positions exactly, with no overlaps.
     fn disjoin(&self) -> RegionSet;
 
-    /// Return the gaps between regions per chromosome.
+    /// Return the gaps between regions per chromosome, bounded by chromosome sizes.
     ///
-    /// Reduces the input first, then emits intervals between consecutive
-    /// reduced regions on each chromosome.
-    fn gaps(&self) -> RegionSet;
+    /// Reduces the input first, then emits intervals that tile the peak-free
+    /// regions of each chromosome listed in `chrom_sizes`:
+    ///
+    /// - a **leading gap** from position 0 to the first region's start
+    ///   (omitted if the first region starts at 0),
+    /// - an **inter-region gap** between each consecutive pair of reduced
+    ///   regions,
+    /// - a **trailing gap** from the last region's end to the chromosome
+    ///   size (omitted if the last region already reaches the chromosome
+    ///   end, or extends past it due to assembly mismatch),
+    /// - a **full-chromosome gap** `0..chrom_size` for any chromosome in
+    ///   `chrom_sizes` that has no regions at all.
+    ///
+    /// Regions on chromosomes not present in `chrom_sizes` are skipped.
+    /// Regions that extend past the stated chromosome size are clipped to
+    /// `chrom_size` when computing the trailing gap, matching the
+    /// clipping behavior of `trim()`.
+    fn gaps(&self, chrom_sizes: &HashMap<String, u32>) -> RegionSet;
 
     /// Range-level intersection of two region sets.
     ///
@@ -584,44 +599,97 @@ impl IntervalRanges for RegionSet {
         RegionSet::from(result)
     }
 
-    fn gaps(&self) -> RegionSet {
+    fn gaps(&self, chrom_sizes: &HashMap<String, u32>) -> RegionSet {
         let reduced = self.reduce();
-        if reduced.regions.is_empty() {
-            return RegionSet::from(Vec::<Region>::new());
+
+        // Group reduced regions by chromosome so we can emit per-chrom gaps
+        // and also detect chromosomes with zero regions (full-chrom gaps).
+        let mut by_chr: HashMap<&str, Vec<&Region>> = HashMap::new();
+        for r in &reduced.regions {
+            // Skip chromosomes we don't have a size for — can't bound trailing gaps,
+            // and including leading gaps for unknown-size chromosomes is misleading.
+            if chrom_sizes.contains_key(&r.chr) {
+                by_chr.entry(r.chr.as_str()).or_default().push(r);
+            }
         }
 
         let mut result: Vec<Region> = Vec::new();
 
-        let mut i = 0;
-        while i < reduced.regions.len() {
-            let chr = &reduced.regions[i].chr;
-            let mut j = i + 1;
-            // Leading gap from position 0 to the first region on this chromosome
-            if reduced.regions[i].start > 0 {
-                result.push(Region {
-                    chr: chr.clone(),
-                    start: 0,
-                    end: reduced.regions[i].start,
-                    rest: None,
-                });
+        // Emit gaps for every chromosome named in chrom_sizes, not just those
+        // present in the input — this way a chromosome with zero regions
+        // contributes a full-chromosome gap, matching bedtools complement.
+        for (chr_name, &chrom_size) in chrom_sizes.iter() {
+            if chrom_size == 0 {
+                continue;
             }
-            while j < reduced.regions.len() && reduced.regions[j].chr == *chr {
-                // Gap between consecutive reduced regions
-                let gap_start = reduced.regions[j - 1].end;
-                let gap_end = reduced.regions[j].start;
-                if gap_start < gap_end {
+
+            match by_chr.get(chr_name.as_str()) {
+                None => {
+                    // No regions on this chromosome — whole chromosome is a gap.
                     result.push(Region {
-                        chr: chr.clone(),
-                        start: gap_start,
-                        end: gap_end,
+                        chr: chr_name.clone(),
+                        start: 0,
+                        end: chrom_size,
                         rest: None,
                     });
                 }
-                j += 1;
+                Some(regions) => {
+                    // Leading gap from 0 to the first region's start.
+                    if regions[0].start > 0 {
+                        // Leading gap is clipped to chrom_size as a safety net:
+                        // if the first region starts past chrom_size (assembly
+                        // mismatch) we still produce a valid [0, chrom_size) gap.
+                        let lead_end = regions[0].start.min(chrom_size);
+                        result.push(Region {
+                            chr: chr_name.clone(),
+                            start: 0,
+                            end: lead_end,
+                            rest: None,
+                        });
+                    }
+
+                    // Inter-region gaps.
+                    for pair in regions.windows(2) {
+                        let gap_start = pair[0].end;
+                        let gap_end = pair[1].start;
+                        if gap_start < gap_end {
+                            // Clip both bounds to chrom_size so the whole emitted
+                            // gap lies within the chromosome.
+                            let cs = chrom_size;
+                            let clipped_start = gap_start.min(cs);
+                            let clipped_end = gap_end.min(cs);
+                            if clipped_start < clipped_end {
+                                result.push(Region {
+                                    chr: chr_name.clone(),
+                                    start: clipped_start,
+                                    end: clipped_end,
+                                    rest: None,
+                                });
+                            }
+                        }
+                    }
+
+                    // Trailing gap from last region's end to chrom_size.
+                    let last_end = regions[regions.len() - 1].end;
+                    if last_end < chrom_size {
+                        result.push(Region {
+                            chr: chr_name.clone(),
+                            start: last_end,
+                            end: chrom_size,
+                            rest: None,
+                        });
+                    }
+                }
             }
-            i = j.max(i + 1);
         }
 
+        // Karyotypic chromosome ordering so output is stable across runs.
+        result.sort_by(|a, b| {
+            crate::utils::chrom_karyotype_key(&a.chr)
+                .cmp(&crate::utils::chrom_karyotype_key(&b.chr))
+                .then(a.start.cmp(&b.start))
+        });
+
         RegionSet::from(result)
     }
 
@@ -1140,6 +1208,140 @@ pub fn pairwise_jaccard(sets: &[RegionSet]) -> Vec<f64> {
     matrix
 }
 
+// --- Indexed operations on RegionSetList ---
+//
+// These let callers operate on pairs by index without cloning full RegionSets
+// across an FFI boundary (wasm, Python, R).
+
+use gtars_core::models::RegionSetList;
+
+/// Indexed pair operations on a RegionSetList.
+pub trait RegionSetListOps {
+    fn pintersect_at(&self, i: usize, j: usize) -> Option<RegionSet>;
+    fn pintersect_count(&self, i: usize, j: usize) -> Option<u32>;
+    fn jaccard_at(&self, i: usize, j: usize) -> Option<f64>;
+    fn union_at(&self, i: usize, j: usize) -> Option<RegionSet>;
+    fn setdiff_at(&self, i: usize, j: usize) -> Option<RegionSet>;
+    fn region_count(&self, i: usize) -> Option<u32>;
+    fn union_except(&self, skip: usize) -> Option<RegionSet>;
+    /// Compute union-of-all and all N union-except results in O(n) unions
+    /// using prefix/suffix arrays. Returns (full_union, vec_of_union_except).
+    fn bulk_union_except(&self) -> Option<(RegionSet, Vec<RegionSet>)>;
+    /// Fold all sets into a single union.
+    fn union_all(&self) -> Option<RegionSet>;
+    /// Fold all sets into a single intersection.
+    fn intersect_all(&self) -> Option<RegionSet>;
+}
+
+impl RegionSetListOps for RegionSetList {
+    fn pintersect_at(&self, i: usize, j: usize) -> Option<RegionSet> {
+        let a = self.get(i)?;
+        let b = self.get(j)?;
+        Some(a.pintersect(b))
+    }
+
+    fn pintersect_count(&self, i: usize, j: usize) -> Option<u32> {
+        self.pintersect_at(i, j).map(|rs| rs.len() as u32)
+    }
+
+    fn jaccard_at(&self, i: usize, j: usize) -> Option<f64> {
+        let a = self.get(i)?;
+        let b = self.get(j)?;
+        Some(a.jaccard(b))
+    }
+
+    fn union_at(&self, i: usize, j: usize) -> Option<RegionSet> {
+        let a = self.get(i)?;
+        let b = self.get(j)?;
+        Some(a.union(b))
+    }
+
+    fn setdiff_at(&self, i: usize, j: usize) -> Option<RegionSet> {
+        let a = self.get(i)?;
+        let b = self.get(j)?;
+        Some(a.setdiff(b))
+    }
+
+    fn region_count(&self, i: usize) -> Option<u32> {
+        self.get(i).map(|rs| rs.len() as u32)
+    }
+
+    fn union_except(&self, skip: usize) -> Option<RegionSet> {
+        let n = self.len();
+        if n < 2 || skip >= n { return None; }
+        let first = if skip == 0 { 1 } else { 0 };
+        let mut acc = self.get(first)?.clone();
+        for k in (first + 1)..n {
+            if k == skip { continue; }
+            if let Some(other) = self.get(k) {
+                acc = acc.union(other);
+            }
+        }
+        Some(acc)
+    }
+
+    fn bulk_union_except(&self) -> Option<(RegionSet, Vec<RegionSet>)> {
+        let n = self.len();
+        if n < 2 { return None; }
+
+        // prefix[i] = union(set[0]..=set[i])
+        let mut prefix = Vec::with_capacity(n);
+        prefix.push(self.get(0)?.clone());
+        for i in 1..n {
+            let prev = &prefix[i - 1];
+            prefix.push(prev.union(self.get(i)?));
+        }
+
+        // suffix[i] = union(set[i]..=set[n-1]), built incrementally from right
+        let mut suffix = vec![None; n];
+        suffix[n - 1] = Some(self.get(n - 1)?.clone());
+        for i in (0..n - 1).rev() {
+            suffix[i] = Some(self.get(i)?.union(suffix[i + 1].as_ref().unwrap()));
+        }
+        let suffix: Vec<RegionSet> = suffix.into_iter().map(|s| s.unwrap()).collect();
+
+        let full_union = prefix[n - 1].clone();
+
+        // union_except[i] = union(prefix[i-1], suffix[i+1])
+        let mut results = Vec::with_capacity(n);
+        for i in 0..n {
+            let except = match (i > 0, i < n - 1) {
+                (false, true) => suffix[1].clone(),
+                (true, false) => prefix[i - 1].clone(),
+                (true, true) => prefix[i - 1].union(&suffix[i + 1]),
+                (false, false) => unreachable!(), // n >= 2
+            };
+            results.push(except);
+        }
+
+        Some((full_union, results))
+    }
+
+    fn union_all(&self) -> Option<RegionSet> {
+        let n = self.len();
+        if n == 0 { return None; }
+        let mut acc = self.get(0)?.clone();
+        for i in 1..n {
+            if let Some(other) = self.get(i) {
+                acc = acc.union(other);
+            }
+        }
+        Some(acc)
+    }
+
+    fn intersect_all(&self) -> Option<RegionSet> {
+        let n = self.len();
+        if n == 0 { return None; }
+        let mut acc = self.get(0)?.clone();
+        for i in 1..n {
+            if let Some(other) = self.get(i) {
+                acc = acc.intersect(other);
+            }
+        }
+        Some(acc)
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -2262,6 +2464,187 @@ mod tests {
         assert_eq!(ids[1], 0); // original idx 1 maps to cluster 0
     }
 
+    // ── gaps tests ──────────────────────────────────────────────────────
+
+    fn chrom_sizes(entries: &[(&str, u32)]) -> HashMap<String, u32> {
+        entries
+            .iter()
+            .map(|(c, s)| (c.to_string(), *s))
+            .collect()
+    }
+
+    #[test]
+    fn test_gaps_basic() {
+        // Three peaks on chr1 with gaps between them; leading + trailing
+        // gaps also present.
+        let rs = make_regionset(vec![
+            ("chr1", 10, 20),
+            ("chr1", 30, 40),
+            ("chr1", 50, 60),
+        ]);
+        let cs = chrom_sizes(&[("chr1", 100)]);
+        let result = rs.gaps(&cs);
+        let gaps: Vec<(&str, u32, u32)> = result
+            .regions
+            .iter()
+            .map(|r| (r.chr.as_str(), r.start, r.end))
+            .collect();
+        assert_eq!(
+            gaps,
+            vec![
+                ("chr1", 0, 10),   // leading
+                ("chr1", 20, 30),  // between peak 1 and 2
+                ("chr1", 40, 50),  // between peak 2 and 3
+                ("chr1", 60, 100), // trailing
+            ]
+        );
+    }
+
+    #[test]
+    fn test_gaps_peak_at_origin_no_leading() {
+        // First peak starts at 0 — no leading gap.
+        let rs = make_regionset(vec![("chr1", 0, 10), ("chr1", 20, 30)]);
+        let cs = chrom_sizes(&[("chr1", 100)]);
+        let result = rs.gaps(&cs);
+        let gaps: Vec<(u32, u32)> = result
+            .regions
+            .iter()
+            .map(|r| (r.start, r.end))
+            .collect();
+        assert_eq!(gaps, vec![(10, 20), (30, 100)]);
+    }
+
+    #[test]
+    fn test_gaps_peak_at_chrom_end_no_trailing() {
+        // Last peak ends at chrom_size — no trailing gap.
+        let rs = make_regionset(vec![("chr1", 10, 20), ("chr1", 80, 100)]);
+        let cs = chrom_sizes(&[("chr1", 100)]);
+        let result = rs.gaps(&cs);
+        let gaps: Vec<(u32, u32)> = result
+            .regions
+            .iter()
+            .map(|r| (r.start, r.end))
+            .collect();
+        assert_eq!(gaps, vec![(0, 10), (20, 80)]);
+    }
+
+    #[test]
+    fn test_gaps_peak_past_chrom_end_clipped() {
+        // Last peak extends past chrom_size — should be clipped, no trailing.
+        let rs = make_regionset(vec![("chr1", 10, 20), ("chr1", 80, 150)]);
+        let cs = chrom_sizes(&[("chr1", 100)]);
+        let result = rs.gaps(&cs);
+        let gaps: Vec<(u32, u32)> = result
+            .regions
+            .iter()
+            .map(|r| (r.start, r.end))
+            .collect();
+        assert_eq!(gaps, vec![(0, 10), (20, 80)]);
+    }
+
+    #[test]
+    fn test_gaps_empty_regionset_populated_chrom_sizes() {
+        // No regions, but chrom_sizes has entries — emit whole-chrom gaps.
+        let rs = RegionSet::from(Vec::<Region>::new());
+        let cs = chrom_sizes(&[("chr1", 100), ("chr2", 50)]);
+        let result = rs.gaps(&cs);
+        let mut gaps: Vec<(String, u32, u32)> = result
+            .regions
+            .iter()
+            .map(|r| (r.chr.clone(), r.start, r.end))
+            .collect();
+        gaps.sort();
+        assert_eq!(
+            gaps,
+            vec![
+                ("chr1".to_string(), 0, 100),
+                ("chr2".to_string(), 0, 50),
+            ]
+        );
+    }
+
+    #[test]
+    fn test_gaps_empty_regionset_empty_chrom_sizes() {
+        let rs = RegionSet::from(Vec::<Region>::new());
+        let cs: HashMap<String, u32> = HashMap::new();
+        let result = rs.gaps(&cs);
+        assert!(result.regions.is_empty());
+    }
+
+    #[test]
+    fn test_gaps_chromosome_not_in_chrom_sizes_skipped() {
+        // Peak on chr2 with no chr2 entry in chrom_sizes — should be ignored.
+        let rs = make_regionset(vec![("chr1", 10, 20), ("chr2", 5, 15)]);
+        let cs = chrom_sizes(&[("chr1", 100)]);
+        let result = rs.gaps(&cs);
+        // Only chr1 gaps emitted.
+        for r in &result.regions {
+            assert_eq!(r.chr, "chr1");
+        }
+    }
+
+    #[test]
+    fn test_gaps_full_chrom_gap_for_unrepresented_chrom() {
+        // chrom_sizes has chr2 but input has no chr2 peaks — emit whole chr2.
+        let rs = make_regionset(vec![("chr1", 10, 20)]);
+        let cs = chrom_sizes(&[("chr1", 100), ("chr2", 200)]);
+        let result = rs.gaps(&cs);
+        let chr2_gaps: Vec<(u32, u32)> = result
+            .regions
+            .iter()
+            .filter(|r| r.chr == "chr2")
+            .map(|r| (r.start, r.end))
+            .collect();
+        assert_eq!(chr2_gaps, vec![(0, 200)]);
+    }
+
+    #[test]
+    fn test_gaps_overlapping_peaks_reduced() {
+        // Overlapping peaks get merged by reduce() before gap computation.
+        let rs = make_regionset(vec![
+            ("chr1", 10, 30),
+            ("chr1", 25, 40), // overlaps with previous
+            ("chr1", 50, 60),
+        ]);
+        let cs = chrom_sizes(&[("chr1", 100)]);
+        let result = rs.gaps(&cs);
+        let gaps: Vec<(u32, u32)> = result
+            .regions
+            .iter()
+            .map(|r| (r.start, r.end))
+            .collect();
+        // After reduce: [10,40], [50,60] → gaps: [0,10], [40,50], [60,100]
+        assert_eq!(gaps, vec![(0, 10), (40, 50), (60, 100)]);
+    }
+
+    #[test]
+    fn test_gaps_karyotypic_ordering() {
+        // Output should be karyotypically ordered regardless of chrom_sizes insertion order.
+        let rs = make_regionset(vec![
+            ("chr2", 10, 20),
+            ("chr1", 10, 20),
+            ("chr10", 10, 20),
+        ]);
+        let cs = chrom_sizes(&[("chr10", 100), ("chr1", 100), ("chr2", 100)]);
+        let result = rs.gaps(&cs);
+        // First gap on each chr; collect chr names in order of appearance.
+        let order: Vec<&str> = result
+            .regions
+            .iter()
+            .map(|r| r.chr.as_str())
+            .scan("", |prev, chr| {
+                if chr != *prev {
+                    *prev = chr;
+                    Some(chr)
+                } else {
+                    Some("") // repeat, skip
+                }
+            })
+            .filter(|s| !s.is_empty())
+            .collect();
+        assert_eq!(order, vec!["chr1", "chr2", "chr10"]);
+    }
+
     #[test]
     fn test_resize_center_large_coords() {
         // Regression: coordinates near u32::MAX/2 should not overflow in midpoint calc
@@ -2420,4 +2803,283 @@ mod tests {
         // overlap with [0,10): [5,10)=5bp, overlap with [20,30): [20,25)=5bp
         assert_eq!(super::merge_intersection_bp(&a, &b), 10);
     }
+
+    // ── RegionSetListOps tests ─────────────────────────────────────────
+
+    fn make_rsl(sets: Vec<RegionSet>) -> RegionSetList {
+        RegionSetList::from(sets)
+    }
+
+    #[rstest]
+    fn test_rsl_pintersect_count() {
+        let a = make_regionset(vec![("chr1", 0, 100), ("chr1", 200, 300)]);
+        let b = make_regionset(vec![("chr1", 50, 150), ("chr1", 250, 350)]);
+        let rsl = make_rsl(vec![a, b]);
+        // a has 2 regions, b has 2 regions, both overlap each other
+        let count = rsl.pintersect_count(0, 1).unwrap();
+        assert_eq!(count, 2); // both pairs overlap
+    }
+
+    #[rstest]
+    fn test_rsl_pintersect_count_no_overlap() {
+        let a = make_regionset(vec![("chr1", 0, 10)]);
+        let b = make_regionset(vec![("chr1", 100, 200)]);
+        let rsl = make_rsl(vec![a, b]);
+        // paired by index: chr1:0-10 vs chr1:100-200 → no genomic overlap,
+        // but pintersect produces a zero-width region (start=end) per pair
+        let count = rsl.pintersect_count(0, 1).unwrap();
+        assert_eq!(count, 1); // zero-width region still counted
+    }
+
+    #[rstest]
+    fn test_rsl_pintersect_count_out_of_bounds() {
+        let a = make_regionset(vec![("chr1", 0, 100)]);
+        let rsl = make_rsl(vec![a]);
+        assert!(rsl.pintersect_count(0, 5).is_none());
+    }
+
+    #[rstest]
+    fn test_rsl_jaccard_at() {
+        let a = make_regionset(vec![("chr1", 0, 100)]);
+        let b = make_regionset(vec![("chr1", 0, 100)]);
+        let rsl = make_rsl(vec![a, b]);
+        let j = rsl.jaccard_at(0, 1).unwrap();
+        assert!((j - 1.0).abs() < 1e-9, "identical sets should have jaccard=1.0");
+    }
+
+    #[rstest]
+    fn test_rsl_jaccard_at_disjoint() {
+        let a = make_regionset(vec![("chr1", 0, 100)]);
+        let b = make_regionset(vec![("chr1", 200, 300)]);
+        let rsl = make_rsl(vec![a, b]);
+        let j = rsl.jaccard_at(0, 1).unwrap();
+        assert!((j - 0.0).abs() < 1e-9, "disjoint sets should have jaccard=0.0");
+    }
+
+    #[rstest]
+    fn test_rsl_union_at() {
+        let a = make_regionset(vec![("chr1", 0, 100)]);
+        let b = make_regionset(vec![("chr1", 50, 150)]);
+        let rsl = make_rsl(vec![a, b]);
+        let u = rsl.union_at(0, 1).unwrap();
+        assert_eq!(u.regions.len(), 1);
+        assert_eq!(u.regions[0].start, 0);
+        assert_eq!(u.regions[0].end, 150);
+    }
+
+    #[rstest]
+    fn test_rsl_setdiff_at() {
+        let a = make_regionset(vec![("chr1", 0, 100)]);
+        let b = make_regionset(vec![("chr1", 50, 150)]);
+        let rsl = make_rsl(vec![a, b]);
+        let diff = rsl.setdiff_at(0, 1).unwrap();
+        // a minus b: chr1:0-50
+        assert_eq!(diff.regions.len(), 1);
+        assert_eq!(diff.regions[0].start, 0);
+        assert_eq!(diff.regions[0].end, 50);
+    }
+
+    #[rstest]
+    fn test_rsl_region_count() {
+        let a = make_regionset(vec![("chr1", 0, 100), ("chr1", 200, 300)]);
+        let b = make_regionset(vec![("chr1", 50, 150)]);
+        let rsl = make_rsl(vec![a, b]);
+        assert_eq!(rsl.region_count(0).unwrap(), 2);
+        assert_eq!(rsl.region_count(1).unwrap(), 1);
+        assert!(rsl.region_count(5).is_none());
+    }
+
+    #[rstest]
+    fn test_rsl_union_all() {
+        let a = make_regionset(vec![("chr1", 0, 100)]);
+        let b = make_regionset(vec![("chr1", 50, 200)]);
+        let c = make_regionset(vec![("chr1", 150, 300)]);
+        let rsl = make_rsl(vec![a, b, c]);
+        let u = rsl.union_all().unwrap();
+        assert_eq!(u.regions.len(), 1);
+        assert_eq!(u.regions[0].start, 0);
+        assert_eq!(u.regions[0].end, 300);
+    }
+
+    #[rstest]
+    fn test_rsl_union_all_empty() {
+        let rsl = make_rsl(vec![]);
+        assert!(rsl.union_all().is_none());
+    }
+
+    #[rstest]
+    fn test_rsl_union_all_single() {
+        let a = make_regionset(vec![("chr1", 10, 50)]);
+        let rsl = make_rsl(vec![a]);
+        let u = rsl.union_all().unwrap();
+        assert_eq!(u.regions.len(), 1);
+        assert_eq!(u.regions[0].start, 10);
+        assert_eq!(u.regions[0].end, 50);
+    }
+
+    #[rstest]
+    fn test_rsl_intersect_all() {
+        // Three overlapping sets — intersection is the region shared by all three
+        let a = make_regionset(vec![("chr1", 0, 100)]);
+        let b = make_regionset(vec![("chr1", 30, 200)]);
+        let c = make_regionset(vec![("chr1", 60, 150)]);
+        let rsl = make_rsl(vec![a, b, c]);
+        let inter = rsl.intersect_all().unwrap();
+        assert_eq!(inter.regions.len(), 1);
+        assert_eq!(inter.regions[0].start, 60);
+        assert_eq!(inter.regions[0].end, 100);
+    }
+
+    #[rstest]
+    fn test_rsl_intersect_all_disjoint() {
+        let a = make_regionset(vec![("chr1", 0, 50)]);
+        let b = make_regionset(vec![("chr1", 100, 200)]);
+        let rsl = make_rsl(vec![a, b]);
+        let inter = rsl.intersect_all().unwrap();
+        assert_eq!(inter.regions.len(), 0);
+    }
+
+    #[rstest]
+    fn test_rsl_intersect_all_empty() {
+        let rsl = make_rsl(vec![]);
+        assert!(rsl.intersect_all().is_none());
+    }
+
+    #[rstest]
+    fn test_rsl_intersect_all_different_sizes() {
+        // This is the case where pintersect would give wrong results:
+        // sets have different numbers of regions, but share genomic coverage
+        let a = make_regionset(vec![("chr1", 0, 100), ("chr1", 200, 300)]);
+        let b = make_regionset(vec![("chr1", 50, 250)]);
+        let rsl = make_rsl(vec![a, b]);
+        let inter = rsl.intersect_all().unwrap();
+        // Shared coverage: [50,100) and [200,250)
+        assert_eq!(inter.regions.len(), 2);
+        assert_eq!(inter.regions[0].start, 50);
+        assert_eq!(inter.regions[0].end, 100);
+        assert_eq!(inter.regions[1].start, 200);
+        assert_eq!(inter.regions[1].end, 250);
+    }
+
+    #[rstest]
+    fn test_rsl_union_except() {
+        let a = make_regionset(vec![("chr1", 0, 100)]);
+        let b = make_regionset(vec![("chr1", 200, 300)]);
+        let c = make_regionset(vec![("chr1", 400, 500)]);
+        let rsl = make_rsl(vec![a, b, c]);
+        // union_except(1) = union of sets 0 and 2 (skip set 1)
+        let ue = rsl.union_except(1).unwrap();
+        assert_eq!(ue.regions.len(), 2);
+        assert_eq!(ue.regions[0].start, 0);
+        assert_eq!(ue.regions[0].end, 100);
+        assert_eq!(ue.regions[1].start, 400);
+        assert_eq!(ue.regions[1].end, 500);
+    }
+
+    #[rstest]
+    fn test_rsl_union_except_too_small() {
+        let a = make_regionset(vec![("chr1", 0, 100)]);
+        let rsl = make_rsl(vec![a]);
+        assert!(rsl.union_except(0).is_none());
+    }
+
+    #[rstest]
+    fn test_rsl_bulk_union_except_n2() {
+        let a = make_regionset(vec![("chr1", 0, 100)]);
+        let b = make_regionset(vec![("chr1", 200, 300)]);
+        let rsl = make_rsl(vec![a, b]);
+        let (full_union, excepts) = rsl.bulk_union_except().unwrap();
+
+        // Full union covers both regions
+        assert_eq!(full_union.regions.len(), 2);
+
+        // except[0] = union of everything except set 0 = set 1
+        assert_eq!(excepts.len(), 2);
+        assert_eq!(excepts[0].regions.len(), 1);
+        assert_eq!(excepts[0].regions[0].start, 200);
+        assert_eq!(excepts[0].regions[0].end, 300);
+
+        // except[1] = union of everything except set 1 = set 0
+        assert_eq!(excepts[1].regions.len(), 1);
+        assert_eq!(excepts[1].regions[0].start, 0);
+        assert_eq!(excepts[1].regions[0].end, 100);
+    }
+
+    #[rstest]
+    fn test_rsl_bulk_union_except_n3() {
+        let a = make_regionset(vec![("chr1", 0, 100)]);
+        let b = make_regionset(vec![("chr1", 200, 300)]);
+        let c = make_regionset(vec![("chr1", 400, 500)]);
+        let rsl = make_rsl(vec![a, b, c]);
+        let (full_union, excepts) = rsl.bulk_union_except().unwrap();
+
+        assert_eq!(full_union.regions.len(), 3);
+        assert_eq!(excepts.len(), 3);
+
+        // except[0] = union(b, c)
+        assert_eq!(excepts[0].regions.len(), 2);
+        assert_eq!(excepts[0].regions[0].start, 200);
+        assert_eq!(excepts[0].regions[1].start, 400);
+
+        // except[1] = union(a, c)
+        assert_eq!(excepts[1].regions.len(), 2);
+        assert_eq!(excepts[1].regions[0].start, 0);
+        assert_eq!(excepts[1].regions[1].start, 400);
+
+        // except[2] = union(a, b)
+        assert_eq!(excepts[2].regions.len(), 2);
+        assert_eq!(excepts[2].regions[0].start, 0);
+        assert_eq!(excepts[2].regions[1].start, 200);
+    }
+
+    #[rstest]
+    fn test_rsl_bulk_union_except_too_small() {
+        let a = make_regionset(vec![("chr1", 0, 100)]);
+        let rsl = make_rsl(vec![a]);
+        assert!(rsl.bulk_union_except().is_none());
+
+        let rsl_empty = make_rsl(vec![]);
+        assert!(rsl_empty.bulk_union_except().is_none());
+    }
+
+    #[rstest]
+    fn test_rsl_bulk_union_except_matches_union_except() {
+        // Verify bulk algorithm produces same results as individual union_except calls
+        let a = make_regionset(vec![("chr1", 0, 100), ("chr2", 50, 200)]);
+        let b = make_regionset(vec![("chr1", 80, 180), ("chr2", 100, 300)]);
+        let c = make_regionset(vec![("chr1", 150, 250)]);
+        let d = make_regionset(vec![("chr2", 0, 150)]);
+        let rsl = make_rsl(vec![a, b, c, d]);
+
+        let (_, bulk_excepts) = rsl.bulk_union_except().unwrap();
+
+        for i in 0..4 {
+            let individual = rsl.union_except(i).unwrap();
+            assert_eq!(
+                bulk_excepts[i].regions.len(),
+                individual.regions.len(),
+                "region count mismatch at index {}",
+                i
+            );
+            for (j, (bulk_r, indiv_r)) in bulk_excepts[i]
+                .regions
+                .iter()
+                .zip(individual.regions.iter())
+                .enumerate()
+            {
+                assert_eq!(
+                    bulk_r.chr, indiv_r.chr,
+                    "chr mismatch at except[{}][{}]", i, j
+                );
+                assert_eq!(
+                    bulk_r.start, indiv_r.start,
+                    "start mismatch at except[{}][{}]", i, j
+                );
+                assert_eq!(
+                    bulk_r.end, indiv_r.end,
+                    "end mismatch at except[{}][{}]", i, j
+                );
+            }
+        }
+    }
 }
diff --git a/gtars-genomicdist/src/lib.rs b/gtars-genomicdist/src/lib.rs
index f935cc26..8bf59a29 100644
--- a/gtars-genomicdist/src/lib.rs
+++ b/gtars-genomicdist/src/lib.rs
@@ -41,6 +41,7 @@ pub use gtars_core::models::CoordinateMode;
 pub use consensus::{ConsensusRegion, consensus};
 pub use interval_ranges::IntervalRanges;
 pub use interval_ranges::pairwise_jaccard;
+pub use interval_ranges::RegionSetListOps;
 pub use partitions::{
     calc_expected_partitions, calc_partitions, genome_partition_list, ExpectedPartitionResult,
     ExpectedPartitionRow, GeneModel, PartitionList, PartitionResult,
@@ -49,4 +50,4 @@ pub use models::{SortedRegionSet, Strand, StrandedRegionSet};
 pub use signal::{calc_summary_signal, ConditionStats, SignalMatrix, SignalSummaryResult};
 pub use statistics::GenomicIntervalSetStatistics;
 pub use utils::{chrom_karyotype_key, median_abs_distance};
-pub use statistics::{calc_dinucl_freq, calc_dinucl_freq_per_region, calc_gc_content, DINUCL_ORDER};
+pub use statistics::{calc_dinucl_freq, calc_gc_content, DINUCL_ORDER};
diff --git a/gtars-genomicdist/src/models.rs b/gtars-genomicdist/src/models.rs
index 58e08f69..079d4dd7 100644
--- a/gtars-genomicdist/src/models.rs
+++ b/gtars-genomicdist/src/models.rs
@@ -1,9 +1,12 @@
 use crate::errors::GtarsGenomicDistError;
 use bio::io::fasta;
 use gtars_core::models::{CoordinateMode, Region, RegionSet};
+use memmap2::Mmap;
 use serde::{Deserialize, Serialize};
-use std::{collections::HashMap, fmt::Debug};
+use std::collections::HashMap;
+use std::fmt::Debug;
 use std::fs::File;
+use std::io::{BufWriter, Write};
 use std::path::Path;
 
 /// A `RegionSet` that is guaranteed to be sorted by (chr, start).
@@ -131,13 +134,31 @@ pub struct RegionBin {
     pub rid: u32,
 }
 
+/// Trait for types that provide sequence access to a reference genome.
+///
+/// Implemented by [`GenomeAssembly`] (in-memory HashMap) and
+/// [`BinaryGenomeAssembly`] (mmap .fab binary). Functions like
+/// `calc_gc_content` accept `&impl SequenceAccess` to work with either.
+pub trait SequenceAccess {
+    /// Get the sequence for a genomic region. Returns owned bytes.
+    fn get_sequence(&self, coords: &Region) -> Result<Vec<u8>, GtarsGenomicDistError>;
+
+    /// Check whether a chromosome exists in the assembly.
+    fn contains_chr(&self, chr: &str) -> bool;
+}
+
+/// In-memory genome assembly backed by a HashMap of chromosome sequences.
+///
+/// Loads the entire FASTA into memory on construction. Slower to construct
+/// (~2s for hg38) but provides zero-copy `&[u8]` access to sequences,
+/// making per-region operations like GC content highly vectorizable.
+/// No `.fai` index required.
 pub struct GenomeAssembly {
     seq_map: HashMap<String, Vec<u8>>,
 }
 
 impl TryFrom<&str> for GenomeAssembly {
     type Error = GtarsGenomicDistError;
-
     fn try_from(value: &str) -> Result<Self, GtarsGenomicDistError> {
         GenomeAssembly::try_from(Path::new(value))
     }
@@ -145,9 +166,7 @@ impl TryFrom<&str> for GenomeAssembly {
 
 impl TryFrom<String> for GenomeAssembly {
     type Error = GtarsGenomicDistError;
-
     fn try_from(value: String) -> Result<Self, GtarsGenomicDistError> {
-        // println!("Converting String to Path: {}", value);
         GenomeAssembly::try_from(Path::new(&value))
     }
 }
@@ -155,16 +174,11 @@ impl TryFrom<String> for GenomeAssembly {
 impl TryFrom<&Path> for GenomeAssembly {
     type Error = GtarsGenomicDistError;
 
-    ///
-    /// Create a new [GenomeAssembly] from fasta file
-    ///
     fn try_from(value: &Path) -> Result<GenomeAssembly, GtarsGenomicDistError> {
         let file = File::open(value)?;
         let genome = fasta::Reader::new(file);
-
         let records = genome.records();
 
-        // store the genome in a hashmap
         let mut seq_map: HashMap<String, Vec<u8>> = HashMap::new();
         for record in records {
             match record {
@@ -179,13 +193,12 @@ impl TryFrom<&Path> for GenomeAssembly {
                 }
             }
         }
-
         Ok(GenomeAssembly { seq_map })
     }
 }
 
 impl GenomeAssembly {
-    pub fn seq_from_region<'a>(&self, coords: &Region) -> Result<&[u8], GtarsGenomicDistError> {
+    pub fn seq_from_region(&self, coords: &Region) -> Result<&[u8], GtarsGenomicDistError> {
         let chr = &coords.chr;
         let start = coords.start as usize;
         let end = coords.end as usize;
@@ -196,10 +209,7 @@ impl GenomeAssembly {
             } else {
                 Err(GtarsGenomicDistError::CustomError(format!(
                     "Invalid range: start={}, end={} for chromosome {} with length {}",
-                    start,
-                    end,
-                    chr,
-                    seq.len()
+                    start, end, chr, seq.len()
                 )))
             }
         } else {
@@ -215,6 +225,235 @@ impl GenomeAssembly {
     }
 }
 
+impl SequenceAccess for GenomeAssembly {
+    fn get_sequence(&self, coords: &Region) -> Result<Vec<u8>, GtarsGenomicDistError> {
+        self.seq_from_region(coords).map(|s| s.to_vec())
+    }
+
+    fn contains_chr(&self, chr: &str) -> bool {
+        self.seq_map.contains_key(chr)
+    }
+}
+
+// --- Binary FASTA (.fab) format ---
+
+const FAB_MAGIC: &[u8; 4] = b"GFAB";
+const FAB_VERSION: u8 = 1;
+
+/// Memory-mapped genome assembly backed by a binary FASTA (.fab) file.
+///
+/// The .fab format stores sequences contiguously without line wrapping,
+/// enabling mmap + zero-copy `&[u8]` access with both instant construction
+/// and zero-copy per-region performance.
+///
+/// Create .fab files with [`BinaryGenomeAssembly::write_from_fasta`] or
+/// via `gtars prep --fasta <file>`.
+#[derive(Debug)]
+pub struct BinaryGenomeAssembly {
+    mmap: Mmap,
+    /// name → (offset_in_file, sequence_length)
+    index: HashMap<String, (usize, usize)>,
+}
+
+impl BinaryGenomeAssembly {
+    /// Open a .fab binary FASTA file via mmap.
+    pub fn from_file(path: &Path) -> Result<Self, GtarsGenomicDistError> {
+        let file = File::open(path).map_err(|e| {
+            GtarsGenomicDistError::CustomError(format!(
+                "Failed to open .fab file '{}': {}",
+                path.display(), e
+            ))
+        })?;
+        let mmap = unsafe { Mmap::map(&file) }.map_err(|e| {
+            GtarsGenomicDistError::CustomError(format!(
+                "Failed to mmap .fab file '{}': {}",
+                path.display(), e
+            ))
+        })?;
+
+        // Parse header
+        if mmap.len() < 9 {
+            return Err(GtarsGenomicDistError::CustomError(
+                "Invalid .fab file: too short".into(),
+            ));
+        }
+        if &mmap[0..4] != FAB_MAGIC {
+            return Err(GtarsGenomicDistError::CustomError(
+                "Invalid .fab file: bad magic bytes".into(),
+            ));
+        }
+        let version = mmap[4];
+        if version != FAB_VERSION {
+            return Err(GtarsGenomicDistError::CustomError(format!(
+                "Unsupported .fab version: {} (expected {})",
+                version, FAB_VERSION
+            )));
+        }
+        let n_chroms = u32::from_le_bytes(mmap[5..9].try_into().unwrap()) as usize;
+
+        // Parse index
+        let mut pos = 9;
+        let mut index = HashMap::with_capacity(n_chroms);
+        for _ in 0..n_chroms {
+            if pos + 2 > mmap.len() {
+                return Err(GtarsGenomicDistError::CustomError(
+                    "Invalid .fab file: truncated index".into(),
+                ));
+            }
+            let name_len = u16::from_le_bytes(mmap[pos..pos + 2].try_into().unwrap()) as usize;
+            pos += 2;
+            if pos + name_len + 16 > mmap.len() {
+                return Err(GtarsGenomicDistError::CustomError(
+                    "Invalid .fab file: truncated index entry".into(),
+                ));
+            }
+            let name = std::str::from_utf8(&mmap[pos..pos + name_len])
+                .map_err(|e| {
+                    GtarsGenomicDistError::CustomError(format!(
+                        "Invalid .fab file: non-UTF8 chromosome name: {}",
+                        e
+                    ))
+                })?
+                .to_string();
+            pos += name_len;
+            let offset =
+                u64::from_le_bytes(mmap[pos..pos + 8].try_into().unwrap()) as usize;
+            pos += 8;
+            let length =
+                u64::from_le_bytes(mmap[pos..pos + 8].try_into().unwrap()) as usize;
+            pos += 8;
+            index.insert(name, (offset, length));
+        }
+
+        Ok(BinaryGenomeAssembly { mmap, index })
+    }
+
+    /// Get the sequence for a region as a zero-copy `&[u8]` slice.
+    pub fn seq_from_region(&self, coords: &Region) -> Result<&[u8], GtarsGenomicDistError> {
+        let chr = &coords.chr;
+        let start = coords.start as usize;
+        let end = coords.end as usize;
+
+        let &(offset, length) = self.index.get(chr).ok_or_else(|| {
+            GtarsGenomicDistError::CustomError(format!(
+                "Unknown chromosome found in region set: {}",
+                chr
+            ))
+        })?;
+
+        if end > length || start > end {
+            return Err(GtarsGenomicDistError::CustomError(format!(
+                "Invalid range: start={}, end={} for chromosome {} with length {}",
+                start, end, chr, length
+            )));
+        }
+
+        let file_start = offset + start;
+        let file_end = offset + end;
+        if file_end > self.mmap.len() {
+            return Err(GtarsGenomicDistError::CustomError(format!(
+                "Corrupted .fab file: sequence data for {} extends beyond file boundary",
+                chr
+            )));
+        }
+
+        Ok(&self.mmap[file_start..file_end])
+    }
+
+    pub fn contains_chr(&self, chr: &str) -> bool {
+        self.index.contains_key(chr)
+    }
+
+    /// Convert a FASTA file to .fab binary format.
+    pub fn write_from_fasta(
+        fasta_path: &Path,
+        output_path: &Path,
+    ) -> Result<(), GtarsGenomicDistError> {
+        // Read all sequences into memory (same as GenomeAssembly)
+        let file = File::open(fasta_path)?;
+        let reader = fasta::Reader::new(file);
+
+        let mut chroms: Vec<(String, Vec<u8>)> = Vec::new();
+        for record in reader.records() {
+            let record = record.map_err(|e| {
+                GtarsGenomicDistError::CustomError(format!(
+                    "Error reading FASTA: {}", e
+                ))
+            })?;
+            chroms.push((record.id().to_string(), record.seq().to_owned()));
+        }
+
+        // Compute index: header size first
+        let mut header_size: usize = 4 + 1 + 4; // magic + version + n_chroms
+        for (name, _) in &chroms {
+            header_size += 2 + name.len() + 8 + 8; // name_len + name + offset + length
+        }
+
+        // Write
+        let out = File::create(output_path).map_err(|e| {
+            GtarsGenomicDistError::CustomError(format!(
+                "Failed to create .fab file '{}': {}",
+                output_path.display(), e
+            ))
+        })?;
+        let mut w = BufWriter::new(out);
+
+        // Header
+        w.write_all(FAB_MAGIC)?;
+        w.write_all(&[FAB_VERSION])?;
+        w.write_all(&(chroms.len() as u32).to_le_bytes())?;
+
+        // Index
+        let mut offset = header_size;
+        for (name, seq) in &chroms {
+            w.write_all(&(name.len() as u16).to_le_bytes())?;
+            w.write_all(name.as_bytes())?;
+            w.write_all(&(offset as u64).to_le_bytes())?;
+            w.write_all(&(seq.len() as u64).to_le_bytes())?;
+            offset += seq.len();
+        }
+
+        // Sequences
+        for (_, seq) in &chroms {
+            w.write_all(seq)?;
+        }
+
+        w.flush()?;
+        Ok(())
+    }
+}
+
+impl TryFrom<&str> for BinaryGenomeAssembly {
+    type Error = GtarsGenomicDistError;
+    fn try_from(value: &str) -> Result<Self, GtarsGenomicDistError> {
+        BinaryGenomeAssembly::from_file(Path::new(value))
+    }
+}
+
+impl TryFrom<String> for BinaryGenomeAssembly {
+    type Error = GtarsGenomicDistError;
+    fn try_from(value: String) -> Result<Self, GtarsGenomicDistError> {
+        BinaryGenomeAssembly::from_file(Path::new(&value))
+    }
+}
+
+impl TryFrom<&Path> for BinaryGenomeAssembly {
+    type Error = GtarsGenomicDistError;
+    fn try_from(value: &Path) -> Result<Self, GtarsGenomicDistError> {
+        BinaryGenomeAssembly::from_file(value)
+    }
+}
+
+impl SequenceAccess for BinaryGenomeAssembly {
+    fn get_sequence(&self, coords: &Region) -> Result<Vec<u8>, GtarsGenomicDistError> {
+        self.seq_from_region(coords).map(|s| s.to_vec())
+    }
+
+    fn contains_chr(&self, chr: &str) -> bool {
+        self.index.contains_key(chr)
+    }
+}
+
 #[derive(Debug, PartialEq, Eq, Hash, Clone, Copy)]
 pub enum Dinucleotide {
     Aa,
@@ -672,6 +911,80 @@ mod tests {
         assert!(ga.unwrap().contains_chr("chr1"));
     }
 
+    #[test]
+    fn test_binary_genome_assembly_round_trip() {
+        // Write .fab from base.fa, then read it back and verify sequences match
+        let fasta_path = get_fasta_path("base.fa");
+        let fab_path = fasta_path.with_extension("fa.test.fab");
+
+        BinaryGenomeAssembly::write_from_fasta(&fasta_path, &fab_path).unwrap();
+        let bga = BinaryGenomeAssembly::from_file(&fab_path).unwrap();
+
+        // Verify chromosomes exist
+        assert!(bga.contains_chr("chr1"));
+        assert!(bga.contains_chr("chr2"));
+        assert!(bga.contains_chr("chrX"));
+        assert!(!bga.contains_chr("chr3"));
+
+        // Verify sequences match HashMap GenomeAssembly
+        let ga = GenomeAssembly::try_from(fasta_path.as_path()).unwrap();
+
+        let region1 = Region { chr: "chr1".into(), start: 0, end: 4, rest: None };
+        assert_eq!(bga.seq_from_region(&region1).unwrap(), ga.seq_from_region(&region1).unwrap());
+        assert_eq!(bga.seq_from_region(&region1).unwrap(), b"GGAA");
+
+        let region2 = Region { chr: "chrX".into(), start: 2, end: 6, rest: None };
+        assert_eq!(bga.seq_from_region(&region2).unwrap(), ga.seq_from_region(&region2).unwrap());
+        assert_eq!(bga.seq_from_region(&region2).unwrap(), b"GGGG");
+
+        // Out-of-bounds error
+        let bad_region = Region { chr: "chr1".into(), start: 0, end: 100, rest: None };
+        assert!(bga.seq_from_region(&bad_region).is_err());
+
+        // Unknown chromosome error
+        let unk_region = Region { chr: "chr99".into(), start: 0, end: 1, rest: None };
+        assert!(bga.seq_from_region(&unk_region).is_err());
+
+        // Clean up
+        std::fs::remove_file(&fab_path).ok();
+    }
+
+    #[test]
+    fn test_binary_genome_assembly_bad_magic() {
+        let dir = tempfile::tempdir().unwrap();
+        let fab_path = dir.path().join("bad.fab");
+        // Need at least 9 bytes to pass the "too short" check
+        std::fs::write(&fab_path, b"XXXX\x01\x00\x00\x00\x00").unwrap();
+        let result = BinaryGenomeAssembly::from_file(&fab_path);
+        assert!(result.is_err());
+        assert!(result.unwrap_err().to_string().contains("bad magic"));
+    }
+
+    #[test]
+    fn test_binary_genome_assembly_gc_parity() {
+        // Verify calc_gc_content produces identical results from .fab and HashMap
+        use crate::statistics::calc_gc_content;
+
+        let fasta_path = get_fasta_path("base.fa");
+        let fab_path = fasta_path.with_extension("fa.test2.fab");
+        BinaryGenomeAssembly::write_from_fasta(&fasta_path, &fab_path).unwrap();
+
+        let ga = GenomeAssembly::try_from(fasta_path.as_path()).unwrap();
+        let bga = BinaryGenomeAssembly::from_file(&fab_path).unwrap();
+
+        let regions = vec![
+            Region { chr: "chr1".into(), start: 0, end: 4, rest: None },
+            Region { chr: "chr2".into(), start: 0, end: 4, rest: None },
+        ];
+        let rs = RegionSet::from(regions);
+
+        let gc_hashmap = calc_gc_content(&rs, &ga, false).unwrap();
+        let gc_fab = calc_gc_content(&rs, &bga, false).unwrap();
+        assert_eq!(gc_hashmap, gc_fab);
+
+        std::fs::remove_file(&fab_path).ok();
+    }
+
     #[test]
     fn test_tss_index_try_from_path() {
         let path = get_test_path("dummy_tss.bed").unwrap();
diff --git a/gtars-genomicdist/src/statistics.rs b/gtars-genomicdist/src/statistics.rs
index 615a197b..38205951 100644
--- a/gtars-genomicdist/src/statistics.rs
+++ b/gtars-genomicdist/src/statistics.rs
@@ -9,7 +9,9 @@ use std::collections::HashMap;
 use gtars_core::models::{Region, RegionSet};
 
 use crate::errors::GtarsGenomicDistError;
-use crate::models::{ChromosomeStatistics, Dinucleotide, GenomeAssembly, RegionBin};
+use crate::models::{
+    ChromosomeStatistics, Dinucleotide, RegionBin, SequenceAccess,
+};
 
 /// Trait for computing statistics and distributions of genomic intervals.
 pub trait GenomicIntervalSetStatistics {
@@ -34,6 +36,13 @@ pub trait GenomicIntervalSetStatistics {
     /// Like `region_distribution_with_bins` but uses actual chromosome sizes
     /// to create bins per-chromosome, matching R's `getGenomeBins(chromSizes)`.
     /// Each chromosome gets `n_bins` bins sized to that chromosome's length.
+    ///
+    /// Regions on chromosomes not present in `chrom_sizes` are skipped.
+    /// Regions whose midpoint falls beyond the stated chromosome size are also
+    /// skipped (common with assembly mismatches, e.g. an hg19 BED paired with
+    /// hg38 chrom_sizes). The total bin count may therefore be lower than the
+    /// input region count; callers who need to detect mismatches can compare
+    /// `sum(bin.n)` against their input region count.
     fn region_distribution_with_chrom_sizes(
         &self,
         n_bins: u32,
@@ -45,7 +54,12 @@ pub trait GenomicIntervalSetStatistics {
     /// For each pair of adjacent regions on the same chromosome, returns the
     /// signed gap: `next.start - current.end`. Positive values indicate a gap,
     /// negative values indicate overlapping regions, zero means adjacent/abutting.
-    /// Regions on chromosomes with fewer than 2 regions are skipped.
+    ///
+    /// Regions on chromosomes with fewer than 2 regions are skipped (they have
+    /// no neighbors to measure against). Output length equals the total number
+    /// of *gaps* across all multi-region chromosomes — generally shorter than
+    /// the input region count. Output is not aligned 1:1 with input regions.
+    /// No sentinel values are emitted.
     fn calc_neighbor_distances(&self) -> Result<Vec<i64>, GtarsGenomicDistError>;
 
     /// Compute the distance from each region to its nearest neighbor.
@@ -54,6 +68,13 @@ pub trait GenomicIntervalSetStatistics {
     /// and downstream neighbors. First and last regions on each chromosome
     /// use their only neighbor's distance. Overlapping neighbors have distance 0.
     ///
+    /// Regions on chromosomes with only one region are skipped (they have no
+    /// neighbors). Output length equals the total number of regions across all
+    /// multi-region chromosomes — generally shorter than the input region
+    /// count, and NOT aligned 1:1 with input regions. No sentinel values are
+    /// emitted. Callers who need 1:1 alignment must filter their input to
+    /// multi-region chromosomes first.
+    ///
     /// Port of R GenomicDistributions `calcNearestNeighbors()`.
     fn calc_nearest_neighbors(&self) -> Result<Vec<u32>, GtarsGenomicDistError>;
 
@@ -176,6 +197,13 @@ impl GenomicIntervalSetStatistics for RegionSet {
             return HashMap::new();
         }
 
+        // Proportional binning: the longest chromosome gets n_bins bins,
+        // shorter chromosomes get proportionally fewer. This produces a
+        // uniform bin width (in bp) across all chromosomes so that
+        // positional heatmaps are reference-aligned.
+        let max_chrom_len = chrom_sizes.values().copied().max().unwrap_or(1) as u64;
+        let bin_width = (max_chrom_len / n_bins as u64).max(1);
+
         let mut plot_results: HashMap<String, RegionBin> = HashMap::new();
 
         for region in &self.regions {
@@ -183,9 +211,16 @@ impl GenomicIntervalSetStatistics for RegionSet {
                 Some(&s) => s,
                 None => continue, // skip regions on chromosomes not in chrom_sizes
             };
-            let bin_size = (chrom_size / n_bins).max(1);
+            let bin_size = bin_width as u32;
 
             let mid = region.mid_point();
+            // Skip regions whose midpoint falls beyond the stated chromosome size
+            // (e.g. BED file assembled against a different reference than the one
+            // supplied by chrom_sizes). Without this, rid would exceed n_bins and
+            // produce bins with end < start.
+            if mid >= chrom_size {
+                continue;
+            }
             let rid = mid / bin_size;
             let bin_start = rid * bin_size;
             let bin_end = (bin_start + bin_size).min(chrom_size);
@@ -285,7 +320,7 @@ impl GenomicIntervalSetStatistics for RegionSet {
 ///
 pub fn calc_gc_content(
     region_set: &RegionSet,
-    genome: &GenomeAssembly,
+    genome: &(impl SequenceAccess + ?Sized),
     ignore_unk_chroms: bool,
 ) -> Result<Vec<f64>, GtarsGenomicDistError> {
     // for region in region_set
@@ -299,11 +334,11 @@ pub fn calc_gc_content(
         for region in region_set.iter_chr_regions(chr) {
             let mut gc_count: u32 = 0;
             let mut total_count: u32 = 0;
-            let seq = genome.seq_from_region(region);
+            let seq = genome.get_sequence(region);
 
             match seq {
                 Ok(seq) => {
-                    for base in seq {
+                    for &base in &seq {
                         match base.to_ascii_lowercase() {
                             b'g' | b'c' => {
                                 gc_count += 1;
@@ -337,38 +372,6 @@ pub fn calc_gc_content(
     Ok(gc_contents)
 }
 
-///
-///  Calculate Dinucleotide frequencies
-///
-/// Arguments:
-/// - region_set: RegionSet object
-/// - genome: GenomeAssembly object (reference genome)
-/// Return: Result of hashmap of dinucleotide and frequencies e.g. 'Aa: 13142'
-pub fn calc_dinucl_freq(
-    region_set: &RegionSet,
-    genome: &GenomeAssembly,
-) -> Result<HashMap<Dinucleotide, u64>, GtarsGenomicDistError> {
-    let mut dinucl_freqs: HashMap<Dinucleotide, u64> = HashMap::new();
-
-    for chr in region_set.iter_chroms() {
-        for region in region_set.iter_chr_regions(chr) {
-            let seq = genome.seq_from_region(region)?;
-            for aas in seq.windows(2) {
-                let dinucl = Dinucleotide::from_bytes(aas);
-                match dinucl {
-                    Some(dinucl) => {
-                        let current_freq = dinucl_freqs.entry(dinucl).or_insert(0);
-                        *current_freq += 1;
-                    }
-                    None => continue,
-                }
-            }
-        }
-    }
-
-    Ok(dinucl_freqs)
-}
-
 /// Canonical ordering of dinucleotides, matching GenomicDistributions' column order.
 pub const DINUCL_ORDER: [Dinucleotide; 16] = [
     Dinucleotide::Aa, Dinucleotide::Ac, Dinucleotide::Ag, Dinucleotide::At,
@@ -377,24 +380,62 @@ pub const DINUCL_ORDER: [Dinucleotide; 16] = [
     Dinucleotide::Ta, Dinucleotide::Tc, Dinucleotide::Tg, Dinucleotide::Tt,
 ];
 
-/// Per-region dinucleotide frequencies as percentages.
+/// Per-region dinucleotide frequencies.
 ///
-/// Returns a tuple of (region_labels, frequency_matrix) where:
-/// - `region_labels` is `chr_start_end` for each region
-/// - `frequency_matrix` is a `Vec<[f64; 16]>` — one row per region, columns
-///   in [`DINUCL_ORDER`] order, values are percentages (0–100).
+/// Matches R GenomicDistributions `calcDinuclFreq`: one row per region,
+/// 16 dinucleotide columns in [`DINUCL_ORDER`] order.
 ///
-/// This matches the output format of R's GenomicDistributions::calcDinuclFreq.
-pub fn calc_dinucl_freq_per_region(
+/// Arguments:
+/// - `region_set`: RegionSet object
+/// - `genome`: GenomeAssembly object (reference genome)
+/// - `raw_counts`: if `true`, return raw integer-valued counts;
+///   if `false`, return percentages (0–100) per row (matches R default)
+/// - `ignore_unk_chroms`: if `true`, skip regions on chromosomes not in
+///   the assembly; if `false`, error on unknown chromosomes
+///
+/// Returns a tuple `(region_labels, frequency_matrix)`:
+/// - `region_labels`: `chr_start_end` for each region
+/// - `frequency_matrix`: `Vec<[f64; 16]>` — one row per region
+///
+/// For pooled global counts across all regions, sum the columns of the
+/// raw-counts matrix:
+/// ```no_run
+/// # use gtars_genomicdist::{calc_dinucl_freq, DINUCL_ORDER};
+/// # use gtars_genomicdist::models::GenomeAssembly;
+/// # use gtars_core::models::RegionSet;
+/// # fn example(rs: &RegionSet, assembly: &GenomeAssembly) -> Result<(), Box<dyn std::error::Error>> {
+/// let (_, matrix) = calc_dinucl_freq(rs, assembly, true, false)?;
+/// let mut totals = [0.0f64; 16];
+/// for row in &matrix {
+///     for (i, &c) in row.iter().enumerate() {
+///         totals[i] += c;
+///     }
+/// }
+/// # Ok(()) }
+/// ```
+pub fn calc_dinucl_freq(
     region_set: &RegionSet,
-    genome: &GenomeAssembly,
+    genome: &(impl SequenceAccess + ?Sized),
+    raw_counts: bool,
+    ignore_unk_chroms: bool,
 ) -> Result<(Vec<String>, Vec<[f64; 16]>), GtarsGenomicDistError> {
     let mut labels: Vec<String> = Vec::new();
     let mut matrix: Vec<[f64; 16]> = Vec::new();
 
     for chr in region_set.iter_chroms() {
+        if ignore_unk_chroms && !genome.contains_chr(chr) {
+            continue;
+        }
         for region in region_set.iter_chr_regions(chr) {
-            let seq = genome.seq_from_region(region)?;
+            let seq = match genome.get_sequence(region) {
+                Ok(s) => s,
+                Err(e) => {
+                    if ignore_unk_chroms {
+                        continue;
+                    }
+                    return Err(e);
+                }
+            };
             let mut counts = [0u64; 16];
             let mut total: u64 = 0;
 
@@ -406,19 +447,25 @@ pub fn calc_dinucl_freq_per_region(
                 }
             }
 
-            let freqs: [f64; 16] = if total > 0 {
-                let mut f = [0.0f64; 16];
+            let row: [f64; 16] = if raw_counts {
+                let mut r = [0.0f64; 16];
+                for (i, &c) in counts.iter().enumerate() {
+                    r[i] = c as f64;
+                }
+                r
+            } else if total > 0 {
+                let mut r = [0.0f64; 16];
                 for (i, &c) in counts.iter().enumerate() {
-                    f[i] = (c as f64 / total as f64) * 100.0;
+                    r[i] = (c as f64 / total as f64) * 100.0;
                 }
-                f
+                r
             } else {
                 [0.0; 16]
             };
 
             labels.push(format!("{}_{}_{}",
                 region.chr, region.start, region.end));
-            matrix.push(freqs);
+            matrix.push(row);
         }
     }
 
@@ -428,6 +475,7 @@ pub fn calc_dinucl_freq_per_region(
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::models::GenomeAssembly;
 
     use pretty_assertions::assert_eq;
     use rstest::*;
@@ -572,8 +620,8 @@ mod tests {
     // --- Dinucleotide frequency ---
 
     #[rstest]
-    fn test_calc_dinucl_freq() {
-        // base.fa: chr1=GGAA → dinucleotides: GG, GA, AA
+    fn test_calc_dinucl_freq_raw_counts() {
+        // base.fa: chr1=GGAA → dinucleotides: GG, GA, AA (3 total)
         let path = get_fasta_path("base.fa");
         let ga = GenomeAssembly::try_from(path.to_str().unwrap()).unwrap();
 
@@ -581,19 +629,25 @@ mod tests {
             Region { chr: "chr1".into(), start: 0, end: 4, rest: None },
         ];
         let rs = RegionSet::from(regions);
-        let freq = calc_dinucl_freq(&rs, &ga).unwrap();
+        let (labels, matrix) = calc_dinucl_freq(&rs, &ga, true, false).unwrap();
 
-        assert_eq!(*freq.get(&Dinucleotide::Gg).unwrap_or(&0), 1);
-        assert_eq!(*freq.get(&Dinucleotide::Ga).unwrap_or(&0), 1);
-        assert_eq!(*freq.get(&Dinucleotide::Aa).unwrap_or(&0), 1);
+        assert_eq!(labels, vec!["chr1_0_4"]);
+        assert_eq!(matrix.len(), 1);
+        let row = &matrix[0];
+        let gg_idx = DINUCL_ORDER.iter().position(|d| *d == Dinucleotide::Gg).unwrap();
+        let ga_idx = DINUCL_ORDER.iter().position(|d| *d == Dinucleotide::Ga).unwrap();
+        let aa_idx = DINUCL_ORDER.iter().position(|d| *d == Dinucleotide::Aa).unwrap();
+        assert_eq!(row[gg_idx], 1.0);
+        assert_eq!(row[ga_idx], 1.0);
+        assert_eq!(row[aa_idx], 1.0);
         // total should be 3 (4 bases → 3 dinucleotides)
-        let total: u64 = freq.values().sum();
-        assert_eq!(total, 3);
+        let total: f64 = row.iter().sum();
+        assert_eq!(total, 3.0);
     }
 
     #[rstest]
-    fn test_calc_dinucl_freq_per_region() {
-        // base.fa: chr2=GCGC → dinucleotides: GC, CG, GC
+    fn test_calc_dinucl_freq_percentages() {
+        // base.fa: chr2=GCGC → dinucleotides: GC, CG, GC (percentages)
         let path = get_fasta_path("base.fa");
         let ga = GenomeAssembly::try_from(path.to_str().unwrap()).unwrap();
 
@@ -601,14 +655,13 @@ mod tests {
             Region { chr: "chr2".into(), start: 0, end: 4, rest: None },
         ];
         let rs = RegionSet::from(regions);
-        let (labels, matrix) = calc_dinucl_freq_per_region(&rs, &ga).unwrap();
+        let (labels, matrix) = calc_dinucl_freq(&rs, &ga, false, false).unwrap();
 
         assert_eq!(labels.len(), 1);
         assert_eq!(labels[0], "chr2_0_4");
         assert_eq!(matrix.len(), 1);
 
         // GC appears 2/3 times, CG appears 1/3 times
-        // Find indices in DINUCL_ORDER
         let gc_idx = DINUCL_ORDER.iter().position(|d| *d == Dinucleotide::Gc).unwrap();
         let cg_idx = DINUCL_ORDER.iter().position(|d| *d == Dinucleotide::Cg).unwrap();
 
@@ -616,11 +669,34 @@ mod tests {
         assert!((row[gc_idx] - 200.0 / 3.0).abs() < 0.1); // ~66.67%
         assert!((row[cg_idx] - 100.0 / 3.0).abs() < 0.1);  // ~33.33%
 
-        // all other dinucleotides should be 0
+        // percentages sum to 100
         let total: f64 = row.iter().sum();
         assert!((total - 100.0).abs() < 0.1);
     }
 
+    #[rstest]
+    fn test_calc_dinucl_freq_global_derivable() {
+        // Global counts are derivable by column-summing the raw-counts matrix.
+        // Two regions: chr1=GGAA, chr2=GCGC → pooled: GG×1, GA×1, AA×1, GC×2, CG×1 = 6 total
+        let path = get_fasta_path("base.fa");
+        let ga = GenomeAssembly::try_from(path.to_str().unwrap()).unwrap();
+        let regions = vec![
+            Region { chr: "chr1".into(), start: 0, end: 4, rest: None },
+            Region { chr: "chr2".into(), start: 0, end: 4, rest: None },
+        ];
+        let rs = RegionSet::from(regions);
+        let (_, matrix) = calc_dinucl_freq(&rs, &ga, true, false).unwrap();
+
+        let mut totals = [0.0f64; 16];
+        for row in &matrix {
+            for (i, &c) in row.iter().enumerate() {
+                totals[i] += c;
+            }
+        }
+        let grand: f64 = totals.iter().sum();
+        assert_eq!(grand, 6.0);
+    }
+
     // --- Empty RegionSet edge cases ---
 
     #[rstest]
@@ -747,6 +823,38 @@ mod tests {
         );
     }
 
+    #[rstest]
+    fn test_region_distribution_with_chrom_sizes_skips_out_of_range() {
+        // Regions whose midpoint falls beyond the stated chromosome size are
+        // silently skipped (assembly mismatch case). Previously this produced
+        // bins with end < start or rid >= n_bins.
+        let regions = vec![
+            Region { chr: "chr1".into(), start: 100, end: 200, rest: None },  // mid=150, in range
+            Region { chr: "chr1".into(), start: 800, end: 900, rest: None },  // mid=850, in range
+            Region { chr: "chr1".into(), start: 1200, end: 1300, rest: None },// mid=1250, out of range (chrom_size=1000)
+            Region { chr: "chr2".into(), start: 300, end: 400, rest: None },  // mid=350, in range
+            Region { chr: "chr2".into(), start: 2000, end: 2100, rest: None },// mid=2050, out of range
+            Region { chr: "chr3".into(), start: 0, end: 100, rest: None },    // chr3 not in chrom_sizes
+        ];
+        let rs = RegionSet::from(regions);
+        let mut chrom_sizes = HashMap::new();
+        chrom_sizes.insert("chr1".to_string(), 1000u32);
+        chrom_sizes.insert("chr2".to_string(), 500u32);
+
+        let bins = rs.region_distribution_with_chrom_sizes(10, &chrom_sizes);
+
+        // Every bin should have end > start and rid < n_bins
+        for bin in bins.values() {
+            assert!(bin.end > bin.start, "bin {:?} has end <= start", bin);
+            assert!(bin.rid < 10, "bin {:?} has rid >= n_bins", bin);
+        }
+
+        // Total counted regions: 2 (chr1) + 1 (chr2) = 3
+        // (chr1's third region, chr2's second, and chr3's only region are all skipped)
+        let total: u32 = bins.values().map(|b| b.n).sum();
+        assert_eq!(total, 3, "expected 3 in-range regions counted");
+    }
+
     #[rstest]
     fn test_gc_content_in_valid_range() {
         // R GenomicDistributions tests: all GC values should be in [0, 1]
@@ -800,4 +908,20 @@ mod tests {
         // Mean width should be 1 billion
         assert!((s.mean_region_length - 1_000_000_000.0).abs() < 1.0);
     }
+
+    // ── spatial-arrangement feature tests ───────────────────────────────
+
+    fn make_rs(regions: &[(&str, u32, u32)]) -> RegionSet {
+        let regs: Vec<Region> = regions
+            .iter()
+            .map(|(chr, s, e)| Region {
+                chr: chr.to_string(),
+                start: *s,
+                end: *e,
+                rest: None,
+            })
+            .collect();
+        RegionSet::from(regs)
+    }
+
 }
diff --git a/gtars-lola/Cargo.toml b/gtars-lola/Cargo.toml
index 48cbcf70..bdf0ffef 100644
--- a/gtars-lola/Cargo.toml
+++ b/gtars-lola/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "gtars-lola"
-version = "0.1.0"
+version = "0.2.0"
 edition = "2024"
 description = "LOLA (Locus Overlap Analysis) for genomic region enrichment testing"
 license = "MIT"
diff --git a/gtars-lola/src/database.rs b/gtars-lola/src/database.rs
index d2d5737f..0b6ba5d7 100644
--- a/gtars-lola/src/database.rs
+++ b/gtars-lola/src/database.rs
@@ -24,14 +24,14 @@ pub struct CollectionAnno {
 #[derive(Debug, Clone, Default)]
 pub struct RegionSetAnno {
     pub filename: String,
-    pub cell_type: String,
-    pub description: String,
-    pub tissue: String,
-    pub data_source: String,
-    pub antibody: String,
-    pub treatment: String,
+    pub cell_type: Option<String>,
+    pub description: Option<String>,
+    pub tissue: Option<String>,
+    pub data_source: Option<String>,
+    pub antibody: Option<String>,
+    pub treatment: Option<String>,
     /// Which collection this file belongs to.
-    pub collection: String,
+    pub collection: Option<String>,
 }
 
 /// A LOLA region database: IGD index + original region sets + annotations.
@@ -145,12 +145,12 @@ impl RegionDB {
                         // Fall back to collection description if file-level is empty.
                         let mut anno = anno_map.get(fname).cloned().unwrap_or(RegionSetAnno {
                             filename: fname.clone(),
-                            collection: coll_name.clone(),
+                            collection: Some(coll_name.clone()),
                             ..Default::default()
                         });
-                        if anno.description.is_empty() {
+                        if anno.description.is_none() {
                             // R LOLA uses the collection folder name as fallback
-                            anno.description = coll_name.clone();
+                            anno.description = Some(coll_name.clone());
                         }
                         all_region_anno.push(anno);
                         files_loaded += 1;
@@ -248,7 +248,7 @@ impl RegionDB {
             .iter()
             .filter(|a| {
                 if let Some(filter) = collections {
-                    filter.iter().any(|f| *f == a.collection)
+                    a.collection.as_deref().map_or(false, |c| filter.contains(&c))
                 } else {
                     true
                 }
@@ -269,7 +269,7 @@ impl RegionDB {
             .filter(|(_, a)| {
                 let name_match = filenames.iter().any(|f| *f == a.filename);
                 let coll_match = if let Some(filter) = collections {
-                    filter.iter().any(|f| *f == a.collection)
+                    a.collection.as_deref().map_or(false, |c| filter.contains(&c))
                 } else {
                     true
                 };
@@ -429,23 +429,30 @@ fn parse_index_txt(path: &Path, collection_name: &str) -> Vec<RegionSetAnno> {
 
         let fields: Vec<&str> = line.split(sep).collect();
 
-        let get = |key: &str| -> String {
+        let get_required = |key: &str| -> String {
             col_map
                 .get(key)
                 .and_then(|&i| fields.get(i))
                 .map(|s| s.trim().to_string())
                 .unwrap_or_default()
         };
+        let get_optional = |key: &str| -> Option<String> {
+            col_map
+                .get(key)
+                .and_then(|&i| fields.get(i))
+                .map(|s| s.trim().to_string())
+                .filter(|s| !s.is_empty())
+        };
 
         annos.push(RegionSetAnno {
-            filename: get("filename"),
-            cell_type: get("cellType"),
-            description: get("description"),
-            tissue: get("tissue"),
-            data_source: get("dataSource"),
-            antibody: get("antibody"),
-            treatment: get("treatment"),
-            collection: collection_name.to_string(),
+            filename: get_required("filename"),
+            cell_type: get_optional("cellType"),
+            description: get_optional("description"),
+            tissue: get_optional("tissue"),
+            data_source: get_optional("dataSource"),
+            antibody: get_optional("antibody"),
+            treatment: get_optional("treatment"),
+            collection: Some(collection_name.to_string()),
         });
     }
 
@@ -504,8 +511,8 @@ mod tests {
         assert_eq!(db.collection_anno.len(), 1);
         assert_eq!(db.collection_anno[0].collector, "John");
         assert_eq!(db.region_anno.len(), 2);
-        assert_eq!(db.region_anno[0].cell_type, "K562");
-        assert_eq!(db.region_anno[1].cell_type, "HeLa");
+        assert_eq!(db.region_anno[0].cell_type.as_deref(), Some("K562"));
+        assert_eq!(db.region_anno[1].cell_type.as_deref(), Some("HeLa"));
 
         // Region sets should have correct counts
         assert_eq!(db.region_sets[0].regions.len(), 3); // file1: 3 regions
@@ -599,7 +606,7 @@ mod tests {
 
         let anno = RegionSetAnno {
             filename: "test.bed".to_string(),
-            cell_type: "K562".to_string(),
+            cell_type: Some("K562".to_string()),
             ..Default::default()
         };
 
@@ -665,12 +672,12 @@ mod tests {
         assert_eq!(db.num_region_sets(), 1);
         // R LOLA uses the collection folder name as fallback, not collection.txt description
         assert_eq!(
-            db.region_anno[0].description, "fallback_coll",
+            db.region_anno[0].description.as_deref(), Some("fallback_coll"),
             "Description should fall back to collection folder name when index.txt description is empty"
         );
         // Other fields from index.txt should still be present
-        assert_eq!(db.region_anno[0].cell_type, "K562");
-        assert_eq!(db.region_anno[0].tissue, "blood");
+        assert_eq!(db.region_anno[0].cell_type.as_deref(), Some("K562"));
+        assert_eq!(db.region_anno[0].tissue.as_deref(), Some("blood"));
     }
 
     #[test]
@@ -728,7 +735,7 @@ mod tests {
             .region_anno
             .iter()
             .enumerate()
-            .filter(|(_, a)| a.collection == "coll1")
+            .filter(|(_, a)| a.collection.as_deref() == Some("coll1"))
             .map(|(i, _)| i)
             .collect();
 
diff --git a/gtars-lola/src/enrichment.rs b/gtars-lola/src/enrichment.rs
index d3c60c7e..705c0a3d 100644
--- a/gtars-lola/src/enrichment.rs
+++ b/gtars-lola/src/enrichment.rs
@@ -264,13 +264,13 @@ pub fn run_lola(
                 d,
                 q_value: None,
                 filename,
-                collection: String::new(),
-                description: String::new(),
-                cell_type: String::new(),
-                tissue: String::new(),
-                antibody: String::new(),
-                treatment: String::new(),
-                data_source: String::new(),
+                collection: None,
+                description: None,
+                cell_type: None,
+                tissue: None,
+                antibody: None,
+                treatment: None,
+                data_source: None,
                 db_set_size: 0,
             });
         }
diff --git a/gtars-lola/src/models.rs b/gtars-lola/src/models.rs
index 259c13b1..fc177aef 100644
--- a/gtars-lola/src/models.rs
+++ b/gtars-lola/src/models.rs
@@ -83,19 +83,19 @@ pub struct LolaResult {
     /// DB set filename (from IGD file_info).
     pub filename: String,
     /// Collection name this DB set belongs to.
-    pub collection: String,
+    pub collection: Option<String>,
     /// Description from index.txt.
-    pub description: String,
+    pub description: Option<String>,
     /// Cell type annotation.
-    pub cell_type: String,
+    pub cell_type: Option<String>,
     /// Tissue annotation.
-    pub tissue: String,
+    pub tissue: Option<String>,
     /// Antibody annotation.
-    pub antibody: String,
+    pub antibody: Option<String>,
     /// Treatment annotation.
-    pub treatment: String,
+    pub treatment: Option<String>,
     /// Data source annotation.
-    pub data_source: String,
+    pub data_source: Option<String>,
     /// Number of regions in the DB set.
     pub db_set_size: u64,
 }
diff --git a/gtars-lola/src/output.rs b/gtars-lola/src/output.rs
index 64e34edb..aee22290 100644
--- a/gtars-lola/src/output.rs
+++ b/gtars-lola/src/output.rs
@@ -1,7 +1,6 @@
 //! Output formatting, FDR correction, and annotation.
 
 use std::io::Write;
-use std::path::Path;
 
 use crate::database::RegionDB;
 use crate::models::LolaResult;
@@ -16,7 +15,7 @@ pub fn annotate_results(results: &mut [LolaResult], db: &RegionDB) {
             let anno = &db.region_anno[r.db_set];
             r.collection = anno.collection.clone();
             // Truncate description to 80 chars (matches R LOLA behavior)
-            r.description = anno.description.chars().take(80).collect();
+            r.description = anno.description.as_ref().map(|d| d.chars().take(80).collect());
             r.cell_type = anno.cell_type.clone();
             r.tissue = anno.tissue.clone();
             r.antibody = anno.antibody.clone();
@@ -105,6 +104,94 @@ pub fn apply_fdr_correction(results: &mut [LolaResult]) {
     }
 }
 
+/// Column-oriented representation of LOLA results.
+///
+/// Each field is a parallel Vec — row `i` across all fields describes one result.
+/// Bindings should convert this to their native columnar type (JS object, PyDict,
+/// R data.frame) rather than reimplementing the row→column pivot.
+#[derive(Debug, Clone)]
+pub struct LolaColumnar {
+    pub user_set: Vec<usize>,
+    pub db_set: Vec<usize>,
+    pub p_value_log: Vec<f64>,
+    pub odds_ratio: Vec<f64>,
+    pub support: Vec<u64>,
+    pub rnk_pv: Vec<usize>,
+    pub rnk_or: Vec<usize>,
+    pub rnk_sup: Vec<usize>,
+    pub max_rnk: Vec<usize>,
+    pub mean_rnk: Vec<f64>,
+    pub b: Vec<i64>,
+    pub c: Vec<i64>,
+    pub d: Vec<i64>,
+    pub q_value: Vec<Option<f64>>,
+    pub filename: Vec<String>,
+    pub collection: Vec<Option<String>>,
+    pub description: Vec<Option<String>>,
+    pub cell_type: Vec<Option<String>>,
+    pub tissue: Vec<Option<String>>,
+    pub antibody: Vec<Option<String>>,
+    pub treatment: Vec<Option<String>>,
+    pub data_source: Vec<Option<String>>,
+    pub db_set_size: Vec<u64>,
+}
+
+/// Convert a slice of LolaResults into column-oriented vectors.
+pub fn results_to_columns(results: &[LolaResult]) -> LolaColumnar {
+    let n = results.len();
+    let mut c = LolaColumnar {
+        user_set: Vec::with_capacity(n),
+        db_set: Vec::with_capacity(n),
+        p_value_log: Vec::with_capacity(n),
+        odds_ratio: Vec::with_capacity(n),
+        support: Vec::with_capacity(n),
+        rnk_pv: Vec::with_capacity(n),
+        rnk_or: Vec::with_capacity(n),
+        rnk_sup: Vec::with_capacity(n),
+        max_rnk: Vec::with_capacity(n),
+        mean_rnk: Vec::with_capacity(n),
+        b: Vec::with_capacity(n),
+        c: Vec::with_capacity(n),
+        d: Vec::with_capacity(n),
+        q_value: Vec::with_capacity(n),
+        filename: Vec::with_capacity(n),
+        collection: Vec::with_capacity(n),
+        description: Vec::with_capacity(n),
+        cell_type: Vec::with_capacity(n),
+        tissue: Vec::with_capacity(n),
+        antibody: Vec::with_capacity(n),
+        treatment: Vec::with_capacity(n),
+        data_source: Vec::with_capacity(n),
+        db_set_size: Vec::with_capacity(n),
+    };
+    for r in results {
+        c.user_set.push(r.user_set);
+        c.db_set.push(r.db_set);
+        c.p_value_log.push(r.p_value_log);
+        c.odds_ratio.push(r.odds_ratio);
+        c.support.push(r.support);
+        c.rnk_pv.push(r.rnk_pv);
+        c.rnk_or.push(r.rnk_or);
+        c.rnk_sup.push(r.rnk_sup);
+        c.max_rnk.push(r.max_rnk);
+        c.mean_rnk.push(r.mean_rnk);
+        c.b.push(r.b);
+        c.c.push(r.c);
+        c.d.push(r.d);
+        c.q_value.push(r.q_value);
+        c.filename.push(r.filename.clone());
+        c.collection.push(r.collection.clone());
+        c.description.push(r.description.clone());
+        c.cell_type.push(r.cell_type.clone());
+        c.tissue.push(r.tissue.clone());
+        c.antibody.push(r.antibody.clone());
+        c.treatment.push(r.treatment.clone());
+        c.data_source.push(r.data_source.clone());
+        c.db_set_size.push(r.db_set_size);
+    }
+    c
+}
+
 /// Write LOLA results as TSV matching R LOLA's `writeCombinedEnrichment` format.
 pub fn write_results_tsv<W: Write>(
     writer: &mut W,
@@ -124,14 +211,13 @@ pub fn write_results_tsv<W: Write>(
             .q_value
             .map(|q| format!("{:.6e}", q))
             .unwrap_or_else(|| "NA".to_string());
-
         writeln!(
             writer,
             "{}\t{}\t{}\t{:.4}\t{:.4}\t{}\t{}\t{}\t{}\t{}\t{:.2}\t{}\t{}\t{}\t\
              {}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}",
             r.user_set + 1, // 1-based for R compatibility
             r.db_set + 1,
-            r.collection,
+            r.collection.as_deref().unwrap_or(""),
             r.p_value_log,
             r.odds_ratio,
             r.support,
@@ -143,12 +229,12 @@ pub fn write_results_tsv<W: Write>(
             r.b,
             r.c,
             r.d,
-            r.description,
-            r.cell_type,
-            r.tissue,
-            r.antibody,
-            r.treatment,
-            r.data_source,
+            r.description.as_deref().unwrap_or(""),
+            r.cell_type.as_deref().unwrap_or(""),
+            r.tissue.as_deref().unwrap_or(""),
+            r.antibody.as_deref().unwrap_or(""),
+            r.treatment.as_deref().unwrap_or(""),
+            r.data_source.as_deref().unwrap_or(""),
             r.filename,
             qv,
             r.db_set_size,
@@ -158,15 +244,6 @@ pub fn write_results_tsv<W: Write>(
     Ok(())
 }
 
-/// Write results to a TSV file on disk.
-pub fn write_results_to_file(
-    path: &Path,
-    results: &[LolaResult],
-) -> std::io::Result<()> {
-    let mut file = std::fs::File::create(path)?;
-    write_results_tsv(&mut file, results)
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -189,13 +266,13 @@ mod tests {
             d: 100,
             q_value: None,
             filename: format!("file{}.bed", db_set),
-            collection: String::new(),
-            description: String::new(),
-            cell_type: String::new(),
-            tissue: String::new(),
-            antibody: String::new(),
-            treatment: String::new(),
-            data_source: String::new(),
+            collection: None,
+            description: None,
+            cell_type: None,
+            tissue: None,
+            antibody: None,
+            treatment: None,
+            data_source: None,
             db_set_size: 0,
         }
     }
@@ -400,4 +477,55 @@ mod tests {
         let output = String::from_utf8(buf).unwrap();
         assert!(output.contains("NA")); // q_value should be NA
     }
+
+    #[test]
+    fn test_results_to_columns_basic() {
+        let results = vec![
+            make_result(0, 0, 3.0),
+            make_result(1, 2, 5.0),
+        ];
+        let c = results_to_columns(&results);
+
+        assert_eq!(c.user_set.len(), 2);
+        assert_eq!(c.user_set, vec![0, 1]);
+        assert_eq!(c.db_set, vec![0, 2]);
+        assert_eq!(c.p_value_log, vec![3.0, 5.0]);
+        assert_eq!(c.odds_ratio, vec![1.0, 1.0]);
+        assert_eq!(c.support, vec![10, 10]);
+        assert_eq!(c.b, vec![5, 5]);
+        assert_eq!(c.c, vec![5, 5]);
+        assert_eq!(c.d, vec![100, 100]);
+        assert_eq!(c.filename, vec!["file0.bed", "file2.bed"]);
+        assert_eq!(c.db_set_size, vec![0, 0]);
+        // empty strings → None
+        assert_eq!(c.collection, vec![None, None]);
+        assert_eq!(c.description, vec![None, None]);
+        assert_eq!(c.cell_type, vec![None, None]);
+        assert_eq!(c.tissue, vec![None, None]);
+        assert_eq!(c.antibody, vec![None, None]);
+        assert_eq!(c.treatment, vec![None, None]);
+        assert_eq!(c.data_source, vec![None, None]);
+    }
+
+    #[test]
+    fn test_results_to_columns_empty() {
+        let c = results_to_columns(&[]);
+        assert!(c.user_set.is_empty());
+        assert!(c.filename.is_empty());
+    }
+
+    #[test]
+    fn test_results_to_columns_with_metadata() {
+        let mut r = make_result(0, 0, 1.0);
+        r.collection = Some("ENCODE".to_string());
+        r.cell_type = Some("K562".to_string());
+        r.tissue = None; // stays None
+        r.q_value = Some(0.05);
+
+        let c = results_to_columns(&[r]);
+        assert_eq!(c.collection, vec![Some("ENCODE".to_string())]);
+        assert_eq!(c.cell_type, vec![Some("K562".to_string())]);
+        assert_eq!(c.tissue, vec![None]);
+        assert_eq!(c.q_value, vec![Some(0.05)]);
+    }
 }
diff --git a/gtars-python/Cargo.toml b/gtars-python/Cargo.toml
index b2bf21ad..588f1e7d 100644
--- a/gtars-python/Cargo.toml
+++ b/gtars-python/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "gtars-py"
-version = "0.8.0"
+version = "0.9.0"
 edition = "2024"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
diff --git a/gtars-python/py_src/gtars/genomic_distributions/__init__.pyi b/gtars-python/py_src/gtars/genomic_distributions/__init__.pyi
index 0a89d7f8..5ac30fbb 100644
--- a/gtars-python/py_src/gtars/genomic_distributions/__init__.pyi
+++ b/gtars-python/py_src/gtars/genomic_distributions/__init__.pyi
@@ -1,28 +1,43 @@
-from typing import Optional, List, Dict
-from gtars.models import RegionSet, GenomeAssembly, PartitionList, SignalMatrix
+from typing import Optional, Union, List, Dict
+from gtars.models import RegionSet, GenomeAssembly, BinaryGenomeAssembly, PartitionList, SignalMatrix
 
 def calc_gc_content(
-    rs: RegionSet, genome: GenomeAssembly, ignore_unk_chroms: Optional[bool] = False
+    rs: RegionSet,
+    genome: Union[GenomeAssembly, BinaryGenomeAssembly],
+    ignore_unk_chroms: Optional[bool] = False,
 ) -> List[float]:
     """
     Calculate GC content for each region.
 
     :param rs: RegionSet object
-    :param genome: GenomeAssembly object (reference genome)
+    :param genome: GenomeAssembly or BinaryGenomeAssembly (reference genome)
     :param ignore_unk_chroms: ignore unknown chromosomes
     :return: GC content per region (0 to 1)
     """
     ...
 
-def calc_dinucleotide_frequency(
-    rs: RegionSet, genome: GenomeAssembly
-) -> Dict[str, int]:
+def calc_dinucl_freq(
+    rs: RegionSet,
+    genome: Union[GenomeAssembly, BinaryGenomeAssembly],
+    raw_counts: bool = False,
+    ignore_unk_chroms: bool = False,
+) -> Dict[str, object]:
     """
-    Calculate dinucleotide frequencies across all regions.
+    Per-region dinucleotide frequencies, matching R GenomicDistributions ``calcDinuclFreq``.
 
     :param rs: RegionSet object
-    :param genome: GenomeAssembly object (reference genome)
-    :return: dict mapping dinucleotide names to counts
+    :param genome: GenomeAssembly or BinaryGenomeAssembly (reference genome)
+    :param raw_counts: if False (default), return percentages (0–100) per row;
+        if True, return raw integer-valued counts
+    :param ignore_unk_chroms: skip regions on chromosomes not in the assembly
+    :return: dict with keys:
+        - ``region_labels``: list of ``chr_start_end`` strings (one per region)
+        - ``dinucleotides``: list of 16 dinucleotide names in canonical order
+        - ``frequencies``: list of lists — one row per region, 16 values per row
+          matching ``dinucleotides`` order
+
+    For pooled global counts across all regions, sum each column of the
+    raw-counts matrix.
     """
     ...
 
diff --git a/gtars-python/py_src/gtars/models/__init__.pyi b/gtars-python/py_src/gtars/models/__init__.pyi
index 77ca0550..12a5b815 100644
--- a/gtars-python/py_src/gtars/models/__init__.pyi
+++ b/gtars-python/py_src/gtars/models/__init__.pyi
@@ -1,4 +1,4 @@
-from typing import List, Dict, Optional
+from typing import List, Dict, Optional, Tuple
 
 class Region:
     chr: str
@@ -124,25 +124,45 @@ class RegionSet:
         """
         ...
 
-    def neighbor_distances(self) -> List[Optional[float]]:
+    def neighbor_distances(self) -> List[int]:
         """
-        Distances between consecutive regions on each chromosome.
-        Returns None for missing values.
+        Signed gaps between consecutive regions on each chromosome.
+
+        Output length may be shorter than input region count — chromosomes with
+        fewer than 2 regions are skipped (no neighbors to measure against).
+        Output is NOT aligned 1:1 with input regions.
         """
         ...
 
-    def nearest_neighbors(self) -> List[Optional[float]]:
+    def nearest_neighbors(self) -> List[int]:
         """
-        Distance from each region to its nearest neighbor.
-        Returns None for regions with no neighbor on the same chromosome.
+        Distance from each region to its nearest neighbor on the same chromosome.
+
+        Output length may be shorter than input region count — chromosomes with
+        only one region are skipped. Output is NOT aligned 1:1 with input regions.
         """
         ...
 
-    def distribution(self, n_bins: int = 250) -> List[Dict[str, object]]:
+    def distribution(
+        self,
+        n_bins: int = 250,
+        chrom_sizes: Optional[Dict[str, int]] = None,
+    ) -> List[Dict[str, object]]:
         """
         Region distribution across genomic bins.
 
-        :param n_bins: number of bins (default 250)
+        :param n_bins: number of bins for the longest chromosome (default 250)
+        :param chrom_sizes: optional mapping of chromosome name to length. When provided,
+            per-chromosome bin sizes are derived from the reference genome
+            (bin_size = chrom_size / n_bins per chrom). This produces outputs that are
+            comparable across BED files and aligned with reference genome positions.
+            When absent, bin size is derived from the BED file's observed max end
+            coordinate — outputs will NOT be comparable across files.
+
+            When ``chrom_sizes`` is provided, regions on chromosomes not listed in
+            ``chrom_sizes`` are skipped, as are regions whose midpoint falls beyond
+            the stated chromosome size (common with assembly mismatches). The summed
+            bin counts may therefore be lower than the input region count.
         :return: list of dicts with keys: chr, start, end, n, rid
         """
         ...
@@ -311,6 +331,17 @@ class RegionSet:
         """
         ...
 
+    def gaps(self, chrom_sizes: Dict[str, int]) -> "RegionSet":
+        """
+        Return gaps between regions per chromosome, bounded by chrom sizes.
+
+        Emits leading gaps (from 0), inter-region gaps, trailing gaps
+        (to ``chrom_size``), and full-chromosome gaps for any chromosome
+        in ``chrom_sizes`` that has no regions. Regions on chromosomes
+        not listed in ``chrom_sizes`` are skipped.
+        """
+        ...
+
     def __len__(self) -> int:
         """
         Size of the regionset
@@ -332,6 +363,20 @@ class GenomeAssembly:
     def __str__(self) -> str: ...
     def __repr__(self) -> str: ...
 
+class BinaryGenomeAssembly:
+    def __init__(self, path: str) -> "BinaryGenomeAssembly":
+        """
+        Open a .fab binary FASTA file for zero-copy mmap access.
+
+        Create .fab files with ``gtars prep --fasta <file>``.
+
+        :param path: path to the .fab file
+        """
+        ...
+
+    def __str__(self) -> str: ...
+    def __repr__(self) -> str: ...
+
 class TssIndex:
     def __init__(self, path: str) -> "TssIndex":
         """
diff --git a/gtars-python/src/genomic_distributions/mod.rs b/gtars-python/src/genomic_distributions/mod.rs
index 13681967..ddf21c12 100644
--- a/gtars-python/src/genomic_distributions/mod.rs
+++ b/gtars-python/src/genomic_distributions/mod.rs
@@ -2,14 +2,14 @@ use pyo3::prelude::*;
 
 mod tools;
 pub use self::tools::{
-    py_calc_dinucleotide_frequency, py_calc_expected_partitions, py_calc_gc_content,
-    py_calc_partitions, py_calc_summary_signal, py_consensus, py_median_abs_distance,
+    py_calc_dinucl_freq, py_calc_expected_partitions, py_calc_gc_content, py_calc_partitions,
+    py_calc_summary_signal, py_consensus, py_median_abs_distance,
 };
 
 #[pymodule]
 pub fn genomic_distributions(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_function(wrap_pyfunction!(py_calc_gc_content, m)?)?;
-    m.add_function(wrap_pyfunction!(py_calc_dinucleotide_frequency, m)?)?;
+    m.add_function(wrap_pyfunction!(py_calc_dinucl_freq, m)?)?;
     m.add_function(wrap_pyfunction!(py_calc_partitions, m)?)?;
     m.add_function(wrap_pyfunction!(py_calc_expected_partitions, m)?)?;
     m.add_function(wrap_pyfunction!(py_calc_summary_signal, m)?)?;
diff --git a/gtars-python/src/genomic_distributions/tools.rs b/gtars-python/src/genomic_distributions/tools.rs
index 7fe78412..6ae31fb2 100644
--- a/gtars-python/src/genomic_distributions/tools.rs
+++ b/gtars-python/src/genomic_distributions/tools.rs
@@ -1,7 +1,7 @@
 use pyo3::prelude::*;
 use pyo3::types::PyDict;
 
-use crate::models::{PyGenomeAssembly, PyPartitionList, PyRegionSet, PySignalMatrix};
+use crate::models::{PyBinaryGenomeAssembly, PyGenomeAssembly, PyPartitionList, PyRegionSet, PySignalMatrix};
 use gtars_genomicdist::{self, statistics};
 use std::collections::HashMap;
 
@@ -9,30 +9,64 @@ use std::collections::HashMap;
 #[pyo3(signature = (rs, genome, ignore_unk_chroms = Some(false)))]
 pub fn py_calc_gc_content(
     rs: &PyRegionSet,
-    genome: &PyGenomeAssembly,
+    genome: &Bound<'_, PyAny>,
     ignore_unk_chroms: Option<bool>,
 ) -> anyhow::Result<Vec<f64>> {
-    let result = statistics::calc_gc_content(
-        &rs.regionset,
-        &genome.genome_assembly,
-        ignore_unk_chroms.unwrap_or(false),
-    )?;
-    Ok(result)
+    let ignore = ignore_unk_chroms.unwrap_or(false);
+    if let Ok(g) = genome.cast::<PyBinaryGenomeAssembly>() {
+        let g = g.borrow();
+        Ok(statistics::calc_gc_content(&rs.regionset, &g.assembly, ignore)?)
+    } else if let Ok(g) = genome.cast::<PyGenomeAssembly>() {
+        let g = g.borrow();
+        Ok(statistics::calc_gc_content(&rs.regionset, &g.genome_assembly, ignore)?)
+    } else {
+        Err(anyhow::anyhow!("genome must be a GenomeAssembly or BinaryGenomeAssembly"))
+    }
 }
 
-#[pyfunction(name = "calc_dinucleotide_frequency")]
-pub fn py_calc_dinucleotide_frequency(
+/// Per-region dinucleotide frequencies, matching R GenomicDistributions `calcDinuclFreq`.
+///
+/// Arguments:
+///   - ``rs``: RegionSet
+///   - ``genome``: GenomeAssembly (reference)
+///   - ``raw_counts``: if False (default), return percentages (0–100) per row;
+///     if True, return raw integer-valued counts
+///
+/// Returns a dict:
+///   - ``region_labels``: list of ``chr_start_end`` strings (one per region)
+///   - ``dinucleotides``: list of 16 dinucleotide names in canonical order
+///   - ``frequencies``: list of lists — one row per region, 16 values per row
+///     matching ``dinucleotides`` order
+///
+/// For pooled global counts, sum each column of the raw-counts matrix.
+#[pyfunction(name = "calc_dinucl_freq")]
+#[pyo3(signature = (rs, genome, raw_counts = false, ignore_unk_chroms = false))]
+pub fn py_calc_dinucl_freq<'py>(
+    py: Python<'py>,
     rs: &PyRegionSet,
-    genome: &PyGenomeAssembly,
-) -> anyhow::Result<HashMap<String, u64>> {
-    let frequencies = statistics::calc_dinucl_freq(&rs.regionset, &genome.genome_assembly)?;
-    let mut freq_map: HashMap<String, u64> = HashMap::new();
-    // Convert Dinucleotide to String and push to HashMap
-    for (di, freq) in frequencies {
-        freq_map.insert(di.to_string()?, freq);
-    }
-
-    Ok(freq_map)
+    genome: &Bound<'py, PyAny>,
+    raw_counts: bool,
+    ignore_unk_chroms: bool,
+) -> anyhow::Result<Bound<'py, PyDict>> {
+    let (labels, matrix) = if let Ok(g) = genome.cast::<PyBinaryGenomeAssembly>() {
+        let g = g.borrow();
+        statistics::calc_dinucl_freq(&rs.regionset, &g.assembly, raw_counts, ignore_unk_chroms)?
+    } else if let Ok(g) = genome.cast::<PyGenomeAssembly>() {
+        let g = g.borrow();
+        statistics::calc_dinucl_freq(&rs.regionset, &g.genome_assembly, raw_counts, ignore_unk_chroms)?
+    } else {
+        return Err(anyhow::anyhow!("genome must be a GenomeAssembly or BinaryGenomeAssembly"));
+    };
+    let dinucl_names: Vec<String> = statistics::DINUCL_ORDER
+        .iter()
+        .map(|d| d.to_string().unwrap_or_default())
+        .collect();
+    let freqs_nested: Vec<Vec<f64>> = matrix.into_iter().map(|row| row.to_vec()).collect();
+    let result = PyDict::new(py);
+    result.set_item("region_labels", labels)?;
+    result.set_item("dinucleotides", dinucl_names)?;
+    result.set_item("frequencies", freqs_nested)?;
+    Ok(result)
 }
 
 #[pyfunction(name = "calc_partitions")]
diff --git a/gtars-python/src/lola/mod.rs b/gtars-python/src/lola/mod.rs
index 6d042360..63e4b9a0 100644
--- a/gtars-python/src/lola/mod.rs
+++ b/gtars-python/src/lola/mod.rs
@@ -125,13 +125,13 @@ impl PyRegionDB {
         for a in &self.inner.region_anno {
             let d = PyDict::new(py);
             d.set_item("filename", &a.filename)?;
-            d.set_item("cellType", &a.cell_type)?;
-            d.set_item("description", &a.description)?;
-            d.set_item("tissue", &a.tissue)?;
-            d.set_item("dataSource", &a.data_source)?;
-            d.set_item("antibody", &a.antibody)?;
-            d.set_item("treatment", &a.treatment)?;
-            d.set_item("collection", &a.collection)?;
+            d.set_item("cellType", a.cell_type.as_deref())?;
+            d.set_item("description", a.description.as_deref())?;
+            d.set_item("tissue", a.tissue.as_deref())?;
+            d.set_item("dataSource", a.data_source.as_deref())?;
+            d.set_item("antibody", a.antibody.as_deref())?;
+            d.set_item("treatment", a.treatment.as_deref())?;
+            d.set_item("collection", a.collection.as_deref())?;
             result.push(d);
         }
         Ok(result)
@@ -242,82 +242,33 @@ fn results_to_dict<'py>(
     py: Python<'py>,
     results: &[LolaResult],
 ) -> PyResult<Bound<'py, PyDict>> {
-    let n = results.len();
-    let mut user_set = Vec::with_capacity(n);
-    let mut db_set = Vec::with_capacity(n);
-    let mut p_value_log = Vec::with_capacity(n);
-    let mut odds_ratio = Vec::with_capacity(n);
-    let mut support = Vec::with_capacity(n);
-    let mut rnk_pv = Vec::with_capacity(n);
-    let mut rnk_or = Vec::with_capacity(n);
-    let mut rnk_sup = Vec::with_capacity(n);
-    let mut max_rnk = Vec::with_capacity(n);
-    let mut mean_rnk = Vec::with_capacity(n);
-    let mut b_vec = Vec::with_capacity(n);
-    let mut c_vec = Vec::with_capacity(n);
-    let mut d_vec = Vec::with_capacity(n);
-    let mut collection: Vec<Option<String>> = Vec::with_capacity(n);
-    let mut description: Vec<Option<String>> = Vec::with_capacity(n);
-    let mut cell_type: Vec<Option<String>> = Vec::with_capacity(n);
-    let mut tissue: Vec<Option<String>> = Vec::with_capacity(n);
-    let mut antibody: Vec<Option<String>> = Vec::with_capacity(n);
-    let mut treatment: Vec<Option<String>> = Vec::with_capacity(n);
-    let mut data_source: Vec<Option<String>> = Vec::with_capacity(n);
-    let mut filename = Vec::with_capacity(n);
-    let mut db_set_size = Vec::with_capacity(n);
-
-    for r in results {
-        user_set.push(r.user_set);
-        db_set.push(r.db_set);
-        p_value_log.push(r.p_value_log);
-        odds_ratio.push(r.odds_ratio);
-        support.push(r.support);
-        rnk_pv.push(r.rnk_pv);
-        rnk_or.push(r.rnk_or);
-        rnk_sup.push(r.rnk_sup);
-        max_rnk.push(r.max_rnk);
-        mean_rnk.push(r.mean_rnk);
-        b_vec.push(r.b);
-        c_vec.push(r.c);
-        d_vec.push(r.d);
-        collection.push(empty_to_none(&r.collection));
-        description.push(empty_to_none(&r.description));
-        cell_type.push(empty_to_none(&r.cell_type));
-        tissue.push(empty_to_none(&r.tissue));
-        antibody.push(empty_to_none(&r.antibody));
-        treatment.push(empty_to_none(&r.treatment));
-        data_source.push(empty_to_none(&r.data_source));
-        filename.push(r.filename.clone());
-        db_set_size.push(r.db_set_size);
-    }
-
-    let q_value: Vec<Option<f64>> = results.iter().map(|r| r.q_value).collect();
+    use gtars_lola::output::results_to_columns;
 
+    let c = results_to_columns(results);
     let dict = PyDict::new(py);
-    dict.set_item("userSet", user_set)?;
-    dict.set_item("dbSet", db_set)?;
-    dict.set_item("collection", collection)?;
-    dict.set_item("pValueLog", p_value_log)?;
-    dict.set_item("oddsRatio", odds_ratio)?;
-    dict.set_item("support", support)?;
-    dict.set_item("rnkPV", rnk_pv)?;
-    dict.set_item("rnkOR", rnk_or)?;
-    dict.set_item("rnkSup", rnk_sup)?;
-    dict.set_item("maxRnk", max_rnk)?;
-    dict.set_item("meanRnk", mean_rnk)?;
-    dict.set_item("b", b_vec)?;
-    dict.set_item("c", c_vec)?;
-    dict.set_item("d", d_vec)?;
-    dict.set_item("description", description)?;
-    dict.set_item("cellType", cell_type)?;
-    dict.set_item("tissue", tissue)?;
-    dict.set_item("antibody", antibody)?;
-    dict.set_item("treatment", treatment)?;
-    dict.set_item("dataSource", data_source)?;
-    dict.set_item("filename", filename)?;
-    dict.set_item("qValue", q_value)?;
-    dict.set_item("size", db_set_size)?;
-
+    dict.set_item("userSet", c.user_set)?;
+    dict.set_item("dbSet", c.db_set)?;
+    dict.set_item("collection", c.collection)?;
+    dict.set_item("pValueLog", c.p_value_log)?;
+    dict.set_item("oddsRatio", c.odds_ratio)?;
+    dict.set_item("support", c.support)?;
+    dict.set_item("rnkPV", c.rnk_pv)?;
+    dict.set_item("rnkOR", c.rnk_or)?;
+    dict.set_item("rnkSup", c.rnk_sup)?;
+    dict.set_item("maxRnk", c.max_rnk)?;
+    dict.set_item("meanRnk", c.mean_rnk)?;
+    dict.set_item("b", c.b)?;
+    dict.set_item("c", c.c)?;
+    dict.set_item("d", c.d)?;
+    dict.set_item("description", c.description)?;
+    dict.set_item("cellType", c.cell_type)?;
+    dict.set_item("tissue", c.tissue)?;
+    dict.set_item("antibody", c.antibody)?;
+    dict.set_item("treatment", c.treatment)?;
+    dict.set_item("dataSource", c.data_source)?;
+    dict.set_item("filename", c.filename)?;
+    dict.set_item("qValue", c.q_value)?;
+    dict.set_item("size", c.db_set_size)?;
     Ok(dict)
 }
 
@@ -408,14 +359,6 @@ fn py_build_restricted_universe(
 // Helpers
 // =========================================================================
 
-fn empty_to_none(s: &str) -> Option<String> {
-    if s.is_empty() {
-        None
-    } else {
-        Some(s.to_string())
-    }
-}
-
 fn tuples_to_regionset(tuples: Vec<(String, u32, u32)>) -> RegionSet {
     RegionSet::from(
         tuples
diff --git a/gtars-python/src/models/genome_assembly.rs b/gtars-python/src/models/genome_assembly.rs
index 4927771b..e560fb6c 100644
--- a/gtars-python/src/models/genome_assembly.rs
+++ b/gtars-python/src/models/genome_assembly.rs
@@ -1,4 +1,4 @@
-use gtars_genomicdist::models::GenomeAssembly;
+use gtars_genomicdist::models::{BinaryGenomeAssembly, GenomeAssembly};
 use pyo3::prelude::*;
 
 #[pyclass(name = "GenomeAssembly")]
@@ -22,3 +22,29 @@ impl PyGenomeAssembly {
         Ok(PyGenomeAssembly { genome_assembly })
     }
 }
+
+/// Memory-mapped genome assembly backed by a .fab binary file.
+///
+/// Near-instant construction via mmap with zero-copy sequence access.
+/// Create .fab files with ``gtars prep --fasta <file>``.
+#[pyclass(name = "BinaryGenomeAssembly")]
+pub struct PyBinaryGenomeAssembly {
+    pub assembly: BinaryGenomeAssembly,
+}
+
+#[pymethods]
+impl PyBinaryGenomeAssembly {
+    #[new]
+    /// Open a .fab binary FASTA file.
+    ///
+    /// Args:
+    ///     path: path to the .fab file
+    ///
+    /// Returns:
+    ///     BinaryGenomeAssembly object
+    pub fn new(path: &Bound<'_, PyAny>) -> PyResult<Self> {
+        let assembly = BinaryGenomeAssembly::try_from(path.to_string())
+            .map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))?;
+        Ok(PyBinaryGenomeAssembly { assembly })
+    }
+}
diff --git a/gtars-python/src/models/mod.rs b/gtars-python/src/models/mod.rs
index 0529af33..0cf9d038 100644
--- a/gtars-python/src/models/mod.rs
+++ b/gtars-python/src/models/mod.rs
@@ -13,12 +13,11 @@ pub(crate) mod tss_index;
 
 pub use self::gda::PyGenomicDistAnnotation;
 pub use self::gene_model::PyGeneModel;
-pub use self::genome_assembly::PyGenomeAssembly;
+pub use self::genome_assembly::{PyBinaryGenomeAssembly, PyGenomeAssembly};
 pub use self::interval::PyInterval;
 pub use self::partition_list::PyPartitionList;
 pub use self::region::PyRegion;
-pub use self::region_set::PyChromosomeStatistics;
-pub use self::region_set::PyRegionSet;
+pub use self::region_set::{PyChromosomeStatistics, PyRegionSet};
 pub use self::region_set_list::PyRegionSetList;
 pub use self::signal_matrix::PySignalMatrix;
 pub use self::tss_index::PyTssIndex;
@@ -30,6 +29,7 @@ pub fn models(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::<PyRegionSet>()?;
     m.add_class::<PyChromosomeStatistics>()?;
     m.add_class::<PyGenomeAssembly>()?;
+    m.add_class::<PyBinaryGenomeAssembly>()?;
     m.add_class::<PyTssIndex>()?;
     m.add_class::<PyGeneModel>()?;
     m.add_class::<PyPartitionList>()?;
diff --git a/gtars-python/src/models/region_set.rs b/gtars-python/src/models/region_set.rs
index 720eb977..90a59c48 100644
--- a/gtars-python/src/models/region_set.rs
+++ b/gtars-python/src/models/region_set.rs
@@ -298,43 +298,38 @@ impl PyRegionSet {
         self.regionset.calc_widths()
     }
 
-    fn neighbor_distances(&self) -> PyResult<Vec<Option<f64>>> {
-        let dists = self
-            .regionset
+    /// Signed gaps between consecutive regions on each chromosome.
+    ///
+    /// Output length may be shorter than input region count — chromosomes with
+    /// fewer than 2 regions are skipped (no neighbors to measure against). Output
+    /// is not aligned 1:1 with input regions.
+    fn neighbor_distances(&self) -> PyResult<Vec<i64>> {
+        self.regionset
             .calc_neighbor_distances()
-            .map_err(|e| PyValueError::new_err(e.to_string()))?;
-        Ok(dists
-            .into_iter()
-            .map(|d| {
-                if d == i64::MAX {
-                    None
-                } else {
-                    Some(d as f64)
-                }
-            })
-            .collect())
+            .map_err(|e| PyValueError::new_err(e.to_string()))
     }
 
-    fn nearest_neighbors(&self) -> PyResult<Vec<Option<f64>>> {
-        let dists = self
-            .regionset
+    /// Distance from each region to its nearest neighbor on the same chromosome.
+    ///
+    /// Output length may be shorter than input region count — chromosomes with
+    /// only one region are skipped. Output is not aligned 1:1 with input regions.
+    fn nearest_neighbors(&self) -> PyResult<Vec<u32>> {
+        self.regionset
             .calc_nearest_neighbors()
-            .map_err(|e| PyValueError::new_err(e.to_string()))?;
-        Ok(dists
-            .into_iter()
-            .map(|d| {
-                if d == u32::MAX {
-                    None
-                } else {
-                    Some(d as f64)
-                }
-            })
-            .collect())
+            .map_err(|e| PyValueError::new_err(e.to_string()))
     }
 
-    #[pyo3(signature = (n_bins = 250))]
-    fn distribution<'py>(&self, py: Python<'py>, n_bins: u32) -> PyResult<Vec<Bound<'py, PyDict>>> {
-        let bins = self.regionset.region_distribution_with_bins(n_bins);
+    #[pyo3(signature = (n_bins = 250, chrom_sizes = None))]
+    fn distribution<'py>(
+        &self,
+        py: Python<'py>,
+        n_bins: u32,
+        chrom_sizes: Option<HashMap<String, u32>>,
+    ) -> PyResult<Vec<Bound<'py, PyDict>>> {
+        let bins = match chrom_sizes {
+            Some(cs) => self.regionset.region_distribution_with_chrom_sizes(n_bins, &cs),
+            None => self.regionset.region_distribution_with_bins(n_bins),
+        };
         let mut sorted_bins: Vec<_> = bins.into_values().collect();
         sorted_bins.sort_by(|a, b| {
             a.chr
@@ -512,6 +507,17 @@ impl PyRegionSet {
 
         result
     }
+
+    /// Return gaps between regions per chromosome, bounded by chrom sizes.
+    ///
+    /// Emits leading gaps, inter-region gaps, trailing gaps (to chrom_size),
+    /// and full-chromosome gaps for any chromosome in ``chrom_sizes`` that
+    /// has no regions at all.
+    fn gaps(&self, chrom_sizes: HashMap<String, u32>) -> PyResult<Self> {
+        let rs = self.regionset.gaps(&chrom_sizes);
+        Ok(Self::from_regionset(rs))
+    }
+
 }
 
 #[pymethods]
@@ -556,3 +562,4 @@ impl PyChromosomeStatistics {
         self.median_region_length
     }
 }
+
diff --git a/gtars-python/src/refget/mod.rs b/gtars-python/src/refget/mod.rs
index 9febc122..d2b9a54b 100644
--- a/gtars-python/src/refget/mod.rs
+++ b/gtars-python/src/refget/mod.rs
@@ -2522,9 +2522,9 @@ impl PyRefgetStore {
     ///
     /// Returns:
     ///     dict: {'sequences': [...], 'collections': [...]}
-    fn available_alias_namespaces(&self) -> PyResult<PyObject> {
+    fn available_alias_namespaces(&self) -> PyResult<Py<PyAny>> {
         let aliases = self.inner.available_alias_namespaces();
-        Python::with_gil(|py| {
+        Python::attach(|py| {
             let dict = pyo3::types::PyDict::new(py);
             let seq_list: Vec<&str> = aliases.sequences.iter().map(|s| s.as_str()).collect();
             let coll_list: Vec<&str> = aliases.collections.iter().map(|s| s.as_str()).collect();
@@ -2543,7 +2543,7 @@ impl PyRefgetStore {
     /// Returns:
     ///     dict: {pulled, skipped, not_found, conflicts}
     #[pyo3(signature = (namespace=None, strategy="keep-ours"))]
-    fn pull_aliases(&mut self, namespace: Option<&str>, strategy: &str) -> PyResult<PyObject> {
+    fn pull_aliases(&mut self, namespace: Option<&str>, strategy: &str) -> PyResult<Py<PyAny>> {
         let sync_strategy = parse_sync_strategy(strategy)?;
         let result = self
             .inner
@@ -2561,7 +2561,7 @@ impl PyRefgetStore {
     /// Returns:
     ///     dict: {pulled, skipped, not_found, conflicts}
     #[pyo3(signature = (digest=None, strategy="keep-ours"))]
-    fn pull_fhr(&mut self, digest: Option<&str>, strategy: &str) -> PyResult<PyObject> {
+    fn pull_fhr(&mut self, digest: Option<&str>, strategy: &str) -> PyResult<Py<PyAny>> {
         let sync_strategy = parse_sync_strategy(strategy)?;
         let result = self
             .inner
@@ -3102,8 +3102,8 @@ fn parse_sync_strategy(strategy: &str) -> PyResult<SyncStrategy> {
 }
 
 /// Convert a PullResult into a Python dict.
-fn pull_result_to_pyobject(result: gtars_refget::PullResult) -> PyResult<PyObject> {
-    Python::with_gil(|py| {
+fn pull_result_to_pyobject(result: gtars_refget::PullResult) -> PyResult<Py<PyAny>> {
+    Python::attach(|py| {
         let dict = pyo3::types::PyDict::new(py);
         dict.set_item("pulled", result.pulled)?;
         dict.set_item("skipped", result.skipped)?;
diff --git a/gtars-python/tests/test_genomicdist.py b/gtars-python/tests/test_genomicdist.py
index e6a4ee5a..2b1c0d5e 100644
--- a/gtars-python/tests/test_genomicdist.py
+++ b/gtars-python/tests/test_genomicdist.py
@@ -55,8 +55,18 @@ def test_neighbor_distances_same_chrom(self):
         )
         dists = rs.neighbor_distances()
         assert len(dists) == 2
-        assert dists[0] == 100.0  # 300 - 200
-        assert dists[1] == 100.0  # 500 - 400
+        assert dists[0] == 100  # 300 - 200
+        assert dists[1] == 100  # 500 - 400
+        # Return type is List[int] (no None/sentinel wrapping)
+        assert all(isinstance(d, int) for d in dists)
+
+    def test_nearest_neighbors_return_type(self):
+        # Confirm no None/Optional wrapping — the core function skips
+        # single-region chroms rather than emitting sentinels.
+        rs = make_regionset([("chr1", 0, 10), ("chr1", 20, 30)])
+        nn = rs.nearest_neighbors()
+        assert all(isinstance(d, int) for d in nn)
+        assert None not in nn
 
     def test_neighbor_distances_overlapping(self):
         rs = make_regionset([("chr1", 100, 300), ("chr1", 200, 400)])
@@ -104,6 +114,38 @@ def test_distribution_default_bins(self):
         dist = rs.distribution()
         assert len(dist) > 0
 
+    def test_distribution_with_chrom_sizes(self):
+        """Passing chrom_sizes uses reference-derived per-chrom bin sizes."""
+        rs = make_regionset([("chr1", 0, 100), ("chr2", 200, 300)])
+        # chr1 size 1000, chr2 size 500, 10 bins each → chr1 bin=100, chr2 bin=50
+        chrom_sizes = {"chr1": 1000, "chr2": 500}
+        dist = rs.distribution(10, chrom_sizes)
+        assert isinstance(dist, list)
+        # chr1 bin width should be 100 (1000/10)
+        chr1_bins = [d for d in dist if d["chr"] == "chr1"]
+        assert all(d["end"] - d["start"] == 100 for d in chr1_bins)
+        # chr2 bin width should be 50 (500/10)
+        chr2_bins = [d for d in dist if d["chr"] == "chr2"]
+        assert all(d["end"] - d["start"] == 50 for d in chr2_bins)
+
+    def test_distribution_chrom_sizes_skips_out_of_range(self):
+        """Regions beyond stated chrom_size are skipped (assembly mismatch case)."""
+        # chr1 size 1000: region at mid=1250 is out of range
+        # chr2 size 500:  region at mid=2050 is out of range
+        rs = make_regionset([
+            ("chr1", 100, 200),    # mid=150, in range
+            ("chr1", 1200, 1300),  # mid=1250, out of range
+            ("chr2", 200, 300),    # mid=250, in range
+            ("chr2", 2000, 2100),  # mid=2050, out of range
+        ])
+        chrom_sizes = {"chr1": 1000, "chr2": 500}
+        dist = rs.distribution(10, chrom_sizes)
+        # 2 of 4 regions should be in bins; bins are well-formed (no end < start)
+        total_in_bins = sum(d["n"] for d in dist)
+        assert total_in_bins == 2
+        assert all(d["end"] > d["start"] for d in dist)
+        assert all(d["rid"] < 10 for d in dist)
+
 
 # =========================================================================
 # Interval range operations
diff --git a/gtars-python/tests/test_regionset.py b/gtars-python/tests/test_regionset.py
index 8da4d628..eabc5712 100644
--- a/gtars-python/tests/test_regionset.py
+++ b/gtars-python/tests/test_regionset.py
@@ -100,3 +100,26 @@ def test_disjoin_empty(self):
         rs = _rs()
         result = rs.disjoin()
         assert len(result) == 0
+
+
+class TestGaps:
+    def test_gaps_basic(self):
+        """Gaps should emit leading, inter-region, and trailing gaps."""
+        rs = _rs(("chr1", 10, 20), ("chr1", 30, 40), ("chr1", 50, 60))
+        result = rs.gaps({"chr1": 100})
+        coords = [(r.start, r.end) for r in result]
+        assert coords == [(0, 10), (20, 30), (40, 50), (60, 100)]
+
+    def test_gaps_full_chrom_for_empty(self):
+        """A chromosome in chrom_sizes with no regions yields a full-chrom gap."""
+        rs = _rs(("chr1", 10, 20))
+        result = rs.gaps({"chr1": 100, "chr2": 50})
+        chr2 = [(r.start, r.end) for r in result if r.chr == "chr2"]
+        assert chr2 == [(0, 50)]
+
+    def test_gaps_empty(self):
+        rs = _rs()
+        result = rs.gaps({})
+        assert len(result) == 0
+
+
diff --git a/gtars-r/DESCRIPTION b/gtars-r/DESCRIPTION
index 76f270dc..b16592a7 100644
--- a/gtars-r/DESCRIPTION
+++ b/gtars-r/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: gtars
 Title: Performance critical genomic interval analysis using Rust, in R
-Version: 0.8.0
+Version: 0.9.0
 Authors@R: 
     person("Nathan", "LeRoy", , "nleroy917@gmail.com", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0002-7354-7213"))
diff --git a/gtars-r/NAMESPACE b/gtars-r/NAMESPACE
index ad5424f2..757991cb 100644
--- a/gtars-r/NAMESPACE
+++ b/gtars-r/NAMESPACE
@@ -23,6 +23,7 @@ export(calcWidth)
 export(checkUniverseAppropriateness)
 export(check_universe)
 export(chromosomeStatistics)
+export(clusterRegions)
 export(compare)
 export(concat)
 export(consensus)
@@ -85,6 +86,7 @@ export(list_sequences)
 export(loadGenomeAssembly)
 export(loadRegionDB)
 export(loadRegionDBFromBeds)
+export(load_binary_genome_assembly)
 export(load_collection_aliases)
 export(load_fhr_metadata)
 export(load_from_directory)
@@ -117,6 +119,7 @@ export(r_calc_summary_signal_from_matrix)
 export(r_calc_tss_distances)
 export(r_calc_widths)
 export(r_chromosome_statistics)
+export(r_cluster)
 export(r_concat)
 export(r_consensus)
 export(r_count_overlaps)
@@ -184,6 +187,7 @@ exportMethods("[")
 exportMethods("[[")
 exportMethods(as.data.frame)
 exportMethods(chromosomeStatistics)
+exportMethods(clusterRegions)
 exportMethods(concat)
 exportMethods(countOverlaps)
 exportMethods(disjoin)
diff --git a/gtars-r/R/RegionSet-methods.R b/gtars-r/R/RegionSet-methods.R
index 1046bbc4..8441e2e6 100644
--- a/gtars-r/R/RegionSet-methods.R
+++ b/gtars-r/R/RegionSet-methods.R
@@ -137,6 +137,30 @@ setMethod("distribution", "ANY", function(x, nBins = 250L, chromSizes = NULL, ..
   distribution(RegionSet(x), nBins = nBins, chromSizes = chromSizes)
 })
 
+#' Cluster regions via single-linkage with a stitching radius
+#'
+#' @description Assigns a cluster ID to each region. Regions within
+#'   ``maxGap`` bp of another region on the same chromosome are linked
+#'   into the same cluster; chromosome boundaries always break clusters.
+#' @param x A RegionSet, GRanges, file path, or data.frame
+#' @param maxGap Maximum bp gap between regions to link (default 0)
+#' @param ... ignored
+#' @return Integer vector of 0-based cluster IDs in original region order
+#' @export
+setGeneric("clusterRegions", function(x, maxGap = 0L, ...) standardGeneric("clusterRegions"))
+
+#' @rdname clusterRegions
+#' @export
+setMethod("clusterRegions", "RegionSet", function(x, maxGap = 0L, ...) {
+  .Call(wrap__r_cluster, .ptr(x), as.integer(maxGap))
+})
+
+#' @rdname clusterRegions
+#' @export
+setMethod("clusterRegions", "ANY", function(x, maxGap = 0L, ...) {
+  clusterRegions(RegionSet(x), maxGap = maxGap)
+})
+
 # =========================================================================
 # IRanges generics: trim, reduce, promoters
 #
@@ -380,29 +404,44 @@ setMethod("disjoin", "data.frame", function(x, ...) {
   disjoin(RegionSet(x))
 })
 
-#' Return gaps between regions per chromosome
+#' Return gaps between regions per chromosome, bounded by chromosome sizes
+#'
+#' Emits the peak-free intervals of each chromosome listed in ``chrom_sizes``:
+#' leading gaps from 0, inter-region gaps, trailing gaps to the chromosome
+#' end, and full-chromosome gaps for any chromosome with no regions.
+#' Regions on chromosomes not listed in ``chrom_sizes`` are skipped.
 #'
 #' @param x A RegionSet, GRanges, file path, or data.frame
+#' @param chrom_sizes Named integer vector of chromosome sizes (name =
+#'   chromosome, value = size in bp). Required.
+#' @param start,end Ignored. Accepted only for compatibility with the
+#'   \code{IRanges::gaps} generic signature; use ``chrom_sizes`` instead.
 #' @param ... ignored
-#' @return A RegionSet with gap regions
+#' @return A RegionSet containing the gap intervals
 #' @rdname gaps
 #' @export
-setMethod("gaps", "RegionSet", function(x, start = NA, end = NA, ...) {
-  .rs_from_ptr(.Call(wrap__r_gaps, .ptr(x)))
+setMethod("gaps", "RegionSet", function(x, start = NA, end = NA, chrom_sizes = NULL, ...) {
+  if (is.null(chrom_sizes)) {
+    stop("'chrom_sizes' is required: pass a named integer vector of chromosome sizes")
+  }
+  .rs_from_ptr(.Call(
+    wrap__r_gaps,
+    .ptr(x),
+    names(chrom_sizes),
+    as.integer(chrom_sizes)
+  ))
 })
 
 #' @rdname gaps
 #' @export
-setMethod("gaps", "character", function(x, start = NA,
-                                         end = NA, ...) {
-  gaps(RegionSet(x))
+setMethod("gaps", "character", function(x, start = NA, end = NA, chrom_sizes = NULL, ...) {
+  gaps(RegionSet(x), chrom_sizes = chrom_sizes)
 })
 
 #' @rdname gaps
 #' @export
-setMethod("gaps", "data.frame", function(x, start = NA,
-                                          end = NA, ...) {
-  gaps(RegionSet(x))
+setMethod("gaps", "data.frame", function(x, start = NA, end = NA, chrom_sizes = NULL, ...) {
+  gaps(RegionSet(x), chrom_sizes = chrom_sizes)
 })
 
 # =========================================================================
diff --git a/gtars-r/R/extendr-wrappers.R b/gtars-r/R/extendr-wrappers.R
index 22741945..09869621 100644
--- a/gtars-r/R/extendr-wrappers.R
+++ b/gtars-r/R/extendr-wrappers.R
@@ -60,23 +60,49 @@ r_chromosome_statistics <- function(rs_ptr) .Call(wrap__r_chromosome_statistics,
 #' @param n_bins Number of bins (default 250)
 r_region_distribution <- function(rs_ptr, n_bins, chrom_names, chrom_lengths) .Call(wrap__r_region_distribution, rs_ptr, n_bins, chrom_names, chrom_lengths)
 
+#' Cluster nearby regions via single-linkage with a stitching radius.
+#'
+#' Returns an integer vector of cluster IDs in original region order.
+#' Regions within `max_gap` bp of another region on the same chromosome
+#' are assigned the same cluster. Chromosome boundaries always break clusters.
+#'
+#' @export
+#' @param rs_ptr External pointer to a RegionSet
+#' @param max_gap Maximum bp gap between regions to link (non-negative)
+r_cluster <- function(rs_ptr, max_gap) .Call(wrap__r_cluster, rs_ptr, max_gap)
+
 #' Load a genome assembly from a FASTA file
 #' @export
 #' @param fasta_path Path to a FASTA file
 load_genome_assembly <- function(fasta_path) .Call(wrap__r_load_genome_assembly, fasta_path)
 
+#' Load a binary genome assembly from a .fab file
+#' @export
+#' @param fab_path Path to a .fab binary FASTA file (created by gtars prep --fasta)
+load_binary_genome_assembly <- function(fab_path) .Call(wrap__r_load_binary_genome_assembly, fab_path)
+
 #' Calculate GC content for each region
 #' @export
 #' @param rs_ptr External pointer to a RegionSet
 #' @param assembly_ptr External pointer to a GenomeAssembly
-#' @param ignore_unk_chroms Skip regions on chromosomes not in the assembly
+#' @param ignore_unk_chroms Skip regions on chromosomes not in the assembly.
+#'   Pass NULL (the default) or FALSE to error on unknown chromosomes.
 r_calc_gc_content <- function(rs_ptr, assembly_ptr, ignore_unk_chroms) .Call(wrap__r_calc_gc_content, rs_ptr, assembly_ptr, ignore_unk_chroms)
 
-#' Calculate per-region dinucleotide frequencies (percentages)
+#' Calculate per-region dinucleotide frequencies (matches R GenomicDistributions calcDinuclFreq)
+#'
+#' Returns a list with a ``region`` column and 16 dinucleotide columns
+#' (AA, AC, ..., TT). When ``raw_counts`` is FALSE (default), each row sums
+#' to 100 (percentages). When TRUE, values are raw integer counts.
+#'
+#' For pooled global counts, sum each dinucleotide column across rows
+#' (with ``raw_counts=TRUE``).
 #' @export
 #' @param rs_ptr External pointer to a RegionSet
 #' @param assembly_ptr External pointer to a GenomeAssembly
-r_calc_dinucl_freq <- function(rs_ptr, assembly_ptr) .Call(wrap__r_calc_dinucl_freq, rs_ptr, assembly_ptr)
+#' @param raw_counts Return raw counts instead of percentages (default FALSE, matches R upstream)
+#' @param ignore_unk_chroms Skip regions on chromosomes not in the assembly (default FALSE)
+r_calc_dinucl_freq <- function(rs_ptr, assembly_ptr, raw_counts, ignore_unk_chroms) .Call(wrap__r_calc_dinucl_freq, rs_ptr, assembly_ptr, raw_counts, ignore_unk_chroms)
 
 #' Trim regions to chromosome boundaries
 #' @export
@@ -166,10 +192,18 @@ r_narrow <- function(rs_ptr, start, end, width) .Call(wrap__r_narrow, rs_ptr, st
 #' @param rs_ptr External pointer to a RegionSet
 r_disjoin <- function(rs_ptr) .Call(wrap__r_disjoin, rs_ptr)
 
-#' Return gaps between regions per chromosome
+#' Return gaps between regions per chromosome, bounded by chrom sizes
+#'
+#' Emits the peak-free intervals of each chromosome listed in `chrom_sizes`,
+#' including leading gaps (from 0), inter-region gaps, trailing gaps
+#' (to `chrom_size`), and full-chromosome gaps for chromosomes with no
+#' regions. Regions on chromosomes not present in `chrom_sizes` are skipped.
+#'
 #' @export
 #' @param rs_ptr External pointer to a RegionSet
-r_gaps <- function(rs_ptr) .Call(wrap__r_gaps, rs_ptr)
+#' @param chrom_names Character vector of chromosome names
+#' @param chrom_sizes_vec Integer vector of chromosome sizes
+r_gaps <- function(rs_ptr, chrom_names, chrom_sizes_vec) .Call(wrap__r_gaps, rs_ptr, chrom_names, chrom_sizes_vec)
 
 #' Range-level intersection of two region sets
 #' @export
diff --git a/gtars-r/R/genomicdist.R b/gtars-r/R/genomicdist.R
index 30340ce2..8b4e5efb 100644
--- a/gtars-r/R/genomicdist.R
+++ b/gtars-r/R/genomicdist.R
@@ -96,18 +96,29 @@ calcGCContent <- function(query, ref, ignoreUnkChroms = FALSE) {
 #' Calculate dinucleotide frequency
 #'
 #' @description Calculates the frequency of all 16 dinucleotides for each
-#'   region as percentages. Drop-in replacement for
-#'   GenomicDistributions::calcDinuclFreq.
+#'   region. Drop-in replacement for GenomicDistributions::calcDinuclFreq,
+#'   including the \code{rawCounts} parameter.
 #'
 #' @param query A GRanges object, file path, data.frame, or RegionSet
 #' @param ref A GenomeAssembly pointer (from loadGenomeAssembly)
+#' @param rawCounts If TRUE, return raw integer counts; if FALSE (default),
+#'   return percentages per row (each row sums to 100)
+#' @param ignoreUnknownChroms If TRUE, skip regions on chromosomes not in
+#'   the assembly. If FALSE (default), error on unknown chromosomes.
 #' @return A data.frame with a \code{region} column and 16 dinucleotide
-#'   columns containing frequency percentages.
+#'   columns. For pooled global counts, call \code{colSums(result[, -1])}
+#'   on a \code{rawCounts=TRUE} result.
 #'
 #' @export
-calcDinuclFreq <- function(query, ref) {
+calcDinuclFreq <- function(query, ref, rawCounts = FALSE, ignoreUnknownChroms = FALSE) {
   query <- .ensure_regionset(query)
-  result <- .Call(wrap__r_calc_dinucl_freq, query, ref)
+  result <- .Call(
+    wrap__r_calc_dinucl_freq,
+    query,
+    ref,
+    rawCounts,
+    ignoreUnknownChroms
+  )
   as.data.frame(result, stringsAsFactors = FALSE)
 }
 
diff --git a/gtars-r/man/calcDinuclFreq.Rd b/gtars-r/man/calcDinuclFreq.Rd
index 7f257c8b..d30edd62 100644
--- a/gtars-r/man/calcDinuclFreq.Rd
+++ b/gtars-r/man/calcDinuclFreq.Rd
@@ -4,19 +4,26 @@
 \alias{calcDinuclFreq}
 \title{Calculate dinucleotide frequency}
 \usage{
-calcDinuclFreq(query, ref)
+calcDinuclFreq(query, ref, rawCounts = FALSE, ignoreUnknownChroms = FALSE)
 }
 \arguments{
 \item{query}{A GRanges object, file path, data.frame, or RegionSet}
 
 \item{ref}{A GenomeAssembly pointer (from loadGenomeAssembly)}
+
+\item{rawCounts}{If TRUE, return raw integer counts; if FALSE (default),
+return percentages per row (each row sums to 100)}
+
+\item{ignoreUnknownChroms}{If TRUE, skip regions on chromosomes not in
+the assembly. If FALSE (default), error on unknown chromosomes.}
 }
 \value{
 A data.frame with a \code{region} column and 16 dinucleotide
-columns containing frequency percentages.
+columns. For pooled global counts, call \code{colSums(result[, -1])}
+on a \code{rawCounts=TRUE} result.
 }
 \description{
 Calculates the frequency of all 16 dinucleotides for each
-region as percentages. Drop-in replacement for
-GenomicDistributions::calcDinuclFreq.
+region. Drop-in replacement for GenomicDistributions::calcDinuclFreq,
+including the \code{rawCounts} parameter.
 }
diff --git a/gtars-r/man/clusterRegions.Rd b/gtars-r/man/clusterRegions.Rd
new file mode 100644
index 00000000..deb5eedb
--- /dev/null
+++ b/gtars-r/man/clusterRegions.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/RegionSet-methods.R
+\name{clusterRegions}
+\alias{clusterRegions}
+\alias{clusterRegions,RegionSet-method}
+\alias{clusterRegions,ANY-method}
+\title{Cluster regions via single-linkage with a stitching radius}
+\usage{
+clusterRegions(x, maxGap = 0L, ...)
+
+\S4method{clusterRegions}{RegionSet}(x, maxGap = 0L, ...)
+
+\S4method{clusterRegions}{ANY}(x, maxGap = 0L, ...)
+}
+\arguments{
+\item{x}{A RegionSet, GRanges, file path, or data.frame}
+
+\item{maxGap}{Maximum bp gap between regions to link (default 0)}
+
+\item{...}{ignored}
+}
+\value{
+Integer vector of 0-based cluster IDs in original region order
+}
+\description{
+Assigns a cluster ID to each region. Regions within
+\code{maxGap} bp of another region on the same chromosome are linked
+into the same cluster; chromosome boundaries always break clusters.
+}
diff --git a/gtars-r/man/gaps.Rd b/gtars-r/man/gaps.Rd
index e11b11ee..9eb34d57 100644
--- a/gtars-r/man/gaps.Rd
+++ b/gtars-r/man/gaps.Rd
@@ -4,22 +4,31 @@
 \alias{gaps,RegionSet-method}
 \alias{gaps,character-method}
 \alias{gaps,data.frame-method}
-\title{Return gaps between regions per chromosome}
+\title{Return gaps between regions per chromosome, bounded by chromosome sizes}
 \usage{
-\S4method{gaps}{RegionSet}(x, start = NA, end = NA, ...)
+\S4method{gaps}{RegionSet}(x, start = NA, end = NA, chrom_sizes = NULL, ...)
 
-\S4method{gaps}{character}(x, start = NA, end = NA, ...)
+\S4method{gaps}{character}(x, start = NA, end = NA, chrom_sizes = NULL, ...)
 
-\S4method{gaps}{data.frame}(x, start = NA, end = NA, ...)
+\S4method{gaps}{data.frame}(x, start = NA, end = NA, chrom_sizes = NULL, ...)
 }
 \arguments{
 \item{x}{A RegionSet, GRanges, file path, or data.frame}
 
+\item{start, end}{Ignored. Accepted only for compatibility with the
+\code{IRanges::gaps} generic signature; use \code{chrom_sizes} instead.}
+
+\item{chrom_sizes}{Named integer vector of chromosome sizes (name =
+chromosome, value = size in bp). Required.}
+
 \item{...}{ignored}
 }
 \value{
-A RegionSet with gap regions
+A RegionSet containing the gap intervals
 }
 \description{
-Return gaps between regions per chromosome
+Emits the peak-free intervals of each chromosome listed in \code{chrom_sizes}:
+leading gaps from 0, inter-region gaps, trailing gaps to the chromosome
+end, and full-chromosome gaps for any chromosome with no regions.
+Regions on chromosomes not listed in \code{chrom_sizes} are skipped.
 }
diff --git a/gtars-r/man/load_binary_genome_assembly.Rd b/gtars-r/man/load_binary_genome_assembly.Rd
new file mode 100644
index 00000000..b349a274
--- /dev/null
+++ b/gtars-r/man/load_binary_genome_assembly.Rd
@@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/extendr-wrappers.R
+\name{load_binary_genome_assembly}
+\alias{load_binary_genome_assembly}
+\title{Load a binary genome assembly from a .fab file}
+\usage{
+load_binary_genome_assembly(fab_path)
+}
+\arguments{
+\item{fab_path}{Path to a .fab binary FASTA file (created by gtars prep --fasta)}
+}
+\description{
+Load a binary genome assembly from a .fab file
+}
diff --git a/gtars-r/man/r_calc_dinucl_freq.Rd b/gtars-r/man/r_calc_dinucl_freq.Rd
index d29f4941..e2d2315a 100644
--- a/gtars-r/man/r_calc_dinucl_freq.Rd
+++ b/gtars-r/man/r_calc_dinucl_freq.Rd
@@ -2,15 +2,25 @@
 % Please edit documentation in R/extendr-wrappers.R
 \name{r_calc_dinucl_freq}
 \alias{r_calc_dinucl_freq}
-\title{Calculate per-region dinucleotide frequencies (percentages)}
+\title{Calculate per-region dinucleotide frequencies (matches R GenomicDistributions calcDinuclFreq)}
 \usage{
-r_calc_dinucl_freq(rs_ptr, assembly_ptr)
+r_calc_dinucl_freq(rs_ptr, assembly_ptr, raw_counts, ignore_unk_chroms)
 }
 \arguments{
 \item{rs_ptr}{External pointer to a RegionSet}
 
 \item{assembly_ptr}{External pointer to a GenomeAssembly}
+
+\item{raw_counts}{Return raw counts instead of percentages (default FALSE, matches R upstream)}
+
+\item{ignore_unk_chroms}{Skip regions on chromosomes not in the assembly (default FALSE)}
 }
 \description{
-Calculate per-region dinucleotide frequencies (percentages)
+Returns a list with a \code{region} column and 16 dinucleotide columns
+(AA, AC, ..., TT). When \code{raw_counts} is FALSE (default), each row sums
+to 100 (percentages). When TRUE, values are raw integer counts.
+}
+\details{
+For pooled global counts, sum each dinucleotide column across rows
+(with \code{raw_counts=TRUE}).
 }
diff --git a/gtars-r/man/r_calc_gc_content.Rd b/gtars-r/man/r_calc_gc_content.Rd
index 69f277b2..eb8b77a8 100644
--- a/gtars-r/man/r_calc_gc_content.Rd
+++ b/gtars-r/man/r_calc_gc_content.Rd
@@ -11,7 +11,8 @@ r_calc_gc_content(rs_ptr, assembly_ptr, ignore_unk_chroms)
 
 \item{assembly_ptr}{External pointer to a GenomeAssembly}
 
-\item{ignore_unk_chroms}{Skip regions on chromosomes not in the assembly}
+\item{ignore_unk_chroms}{Skip regions on chromosomes not in the assembly.
+Pass NULL (the default) or FALSE to error on unknown chromosomes.}
 }
 \description{
 Calculate GC content for each region
diff --git a/gtars-r/man/r_cluster.Rd b/gtars-r/man/r_cluster.Rd
new file mode 100644
index 00000000..038e8130
--- /dev/null
+++ b/gtars-r/man/r_cluster.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/extendr-wrappers.R
+\name{r_cluster}
+\alias{r_cluster}
+\title{Cluster nearby regions via single-linkage with a stitching radius.}
+\usage{
+r_cluster(rs_ptr, max_gap)
+}
+\arguments{
+\item{rs_ptr}{External pointer to a RegionSet}
+
+\item{max_gap}{Maximum bp gap between regions to link (non-negative)}
+}
+\description{
+Returns an integer vector of cluster IDs in original region order.
+Regions within \code{max_gap} bp of another region on the same chromosome
+are assigned the same cluster. Chromosome boundaries always break clusters.
+}
diff --git a/gtars-r/man/r_gaps.Rd b/gtars-r/man/r_gaps.Rd
index 86c01554..9d4e7176 100644
--- a/gtars-r/man/r_gaps.Rd
+++ b/gtars-r/man/r_gaps.Rd
@@ -2,13 +2,20 @@
 % Please edit documentation in R/extendr-wrappers.R
 \name{r_gaps}
 \alias{r_gaps}
-\title{Return gaps between regions per chromosome}
+\title{Return gaps between regions per chromosome, bounded by chrom sizes}
 \usage{
-r_gaps(rs_ptr)
+r_gaps(rs_ptr, chrom_names, chrom_sizes_vec)
 }
 \arguments{
 \item{rs_ptr}{External pointer to a RegionSet}
+
+\item{chrom_names}{Character vector of chromosome names}
+
+\item{chrom_sizes_vec}{Integer vector of chromosome sizes}
 }
 \description{
-Return gaps between regions per chromosome
+Emits the peak-free intervals of each chromosome listed in \code{chrom_sizes},
+including leading gaps (from 0), inter-region gaps, trailing gaps
+(to \code{chrom_size}), and full-chromosome gaps for chromosomes with no
+regions. Regions on chromosomes not present in \code{chrom_sizes} are skipped.
 }
diff --git a/gtars-r/src/rust/Cargo.toml b/gtars-r/src/rust/Cargo.toml
index d148db6b..a16ab051 100644
--- a/gtars-r/src/rust/Cargo.toml
+++ b/gtars-r/src/rust/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "gtars-r"
-version = "0.8.0"
+version = "0.9.0"
 edition = "2021"
 
 [lib]
diff --git a/gtars-r/src/rust/src/genomicdist.rs b/gtars-r/src/rust/src/genomicdist.rs
index 4df57943..1b9d7b80 100644
--- a/gtars-r/src/rust/src/genomicdist.rs
+++ b/gtars-r/src/rust/src/genomicdist.rs
@@ -3,12 +3,12 @@ use std::collections::HashMap;
 use extendr_api::prelude::*;
 
 use gtars_core::models::{Region, RegionSet, RegionSetList};
-use gtars_genomicdist::models::{GenomeAssembly, TssIndex};
+use gtars_genomicdist::models::{BinaryGenomeAssembly, GenomeAssembly, TssIndex};
 use gtars_overlaprs::RegionSetOverlaps;
 use gtars_genomicdist::{
-    calc_dinucl_freq_per_region, calc_gc_content, calc_summary_signal, chrom_karyotype_key,
-    consensus, genome_partition_list, calc_expected_partitions, calc_partitions,
-    pairwise_jaccard,
+    calc_dinucl_freq, calc_gc_content, calc_summary_signal,
+    chrom_karyotype_key, consensus, genome_partition_list, calc_expected_partitions,
+    calc_partitions, pairwise_jaccard,
     CoordinateMode, GenomicDistAnnotation, GenomicIntervalSetStatistics, GeneModel, IntervalRanges,
     PartitionList, SignalMatrix, Strand, StrandedRegionSet, DINUCL_ORDER,
 };
@@ -26,12 +26,22 @@ macro_rules! with_regionset {
     }};
 }
 
+/// Try to extract either a GenomeAssembly or BinaryGenomeAssembly pointer
+/// and call the provided function with `&dyn SequenceAccess`.
 macro_rules! with_assembly {
     ($ptr:expr, $asm:ident, $body:block) => {{
-        let ext_ptr = <ExternalPtr<GenomeAssembly>>::try_from($ptr)
-            .map_err(|_| extendr_api::Error::Other("Invalid GenomeAssembly pointer".into()))?;
-        let $asm = &*ext_ptr;
-        $body
+        use gtars_genomicdist::models::SequenceAccess;
+        if let Ok(ext_ptr) = <ExternalPtr<GenomeAssembly>>::try_from($ptr.clone()) {
+            let $asm: &dyn SequenceAccess = &*ext_ptr;
+            $body
+        } else if let Ok(ext_ptr) = <ExternalPtr<BinaryGenomeAssembly>>::try_from($ptr) {
+            let $asm: &dyn SequenceAccess = &*ext_ptr;
+            $body
+        } else {
+            Err(extendr_api::Error::Other(
+                "Invalid assembly pointer: expected GenomeAssembly or BinaryGenomeAssembly".into(),
+            ))
+        }
     }};
 }
 
@@ -135,9 +145,11 @@ pub fn r_regionset_length(rs_ptr: Robj) -> extendr_api::Result<i32> {
 /// @export
 /// @param rs_ptr External pointer to a RegionSet
 #[extendr(r_name = "r_calc_widths")]
-pub fn r_calc_widths(rs_ptr: Robj) -> extendr_api::Result<Vec<i32>> {
+pub fn r_calc_widths(rs_ptr: Robj) -> extendr_api::Result<Vec<f64>> {
+    // f64 instead of i32: widths are u32 in Rust and can exceed i32::MAX (2.1 Gbp)
+    // for concatenated/genome-scale regions. f64 safely represents any u32.
     with_regionset!(rs_ptr, rs, {
-        Ok(rs.calc_widths().into_iter().map(|w| w as i32).collect())
+        Ok(rs.calc_widths().into_iter().map(|w| w as f64).collect())
     })
 }
 
@@ -158,12 +170,14 @@ pub fn r_calc_neighbor_distances(rs_ptr: Robj) -> extendr_api::Result<Vec<f64>>
 /// @export
 /// @param rs_ptr External pointer to a RegionSet
 #[extendr(r_name = "r_calc_nearest_neighbors")]
-pub fn r_calc_nearest_neighbors(rs_ptr: Robj) -> extendr_api::Result<Vec<i32>> {
+pub fn r_calc_nearest_neighbors(rs_ptr: Robj) -> extendr_api::Result<Vec<f64>> {
+    // f64 instead of i32: neighbor distances are chromosome-scale (up to ~250M on
+    // hg38) and genome-concatenated BEDs could push past i32::MAX (2.1 Gbp).
     with_regionset!(rs_ptr, rs, {
         let dists = rs
             .calc_nearest_neighbors()
             .map_err(|e| extendr_api::Error::Other(format!("{}", e)))?;
-        Ok(dists.into_iter().map(|d| d as i32).collect())
+        Ok(dists.into_iter().map(|d| d as f64).collect())
     })
 }
 
@@ -180,21 +194,23 @@ pub fn r_chromosome_statistics(rs_ptr: Robj) -> extendr_api::Result<List> {
         });
 
         let mut chr_names: Vec<String> = Vec::new();
-        let mut n_regions: Vec<i32> = Vec::new();
+        let mut n_regions: Vec<f64> = Vec::new();
         let mut start_pos: Vec<f64> = Vec::new();
         let mut end_pos: Vec<f64> = Vec::new();
-        let mut min_len: Vec<i32> = Vec::new();
-        let mut max_len: Vec<i32> = Vec::new();
+        let mut min_len: Vec<f64> = Vec::new();
+        let mut max_len: Vec<f64> = Vec::new();
         let mut mean_len: Vec<f64> = Vec::new();
         let mut median_len: Vec<f64> = Vec::new();
 
+        // f64 for count and length fields: u32 values can exceed i32::MAX (2.1 Gbp)
+        // for wide regions, and region_count has no upper bound.
         for (chr, s) in &entries {
             chr_names.push(chr.clone());
-            n_regions.push(s.number_of_regions as i32);
+            n_regions.push(s.number_of_regions as f64);
             start_pos.push(s.start_nucleotide_position as f64);
             end_pos.push(s.end_nucleotide_position as f64);
-            min_len.push(s.minimum_region_length as i32);
-            max_len.push(s.maximum_region_length as i32);
+            min_len.push(s.minimum_region_length as f64);
+            max_len.push(s.maximum_region_length as f64);
             mean_len.push(s.mean_region_length);
             median_len.push(s.median_region_length);
         }
@@ -212,6 +228,25 @@ pub fn r_chromosome_statistics(rs_ptr: Robj) -> extendr_api::Result<List> {
     })
 }
 
+/// Cluster nearby regions via single-linkage with a stitching radius.
+///
+/// Returns an integer vector of cluster IDs in original region order.
+/// Regions within `max_gap` bp of another region on the same chromosome
+/// are assigned the same cluster. Chromosome boundaries always break clusters.
+///
+/// @export
+/// @param rs_ptr External pointer to a RegionSet
+/// @param max_gap Maximum bp gap between regions to link (non-negative)
+#[extendr(r_name = "r_cluster")]
+pub fn r_cluster(rs_ptr: Robj, max_gap: i32) -> extendr_api::Result<Vec<f64>> {
+    let max_gap_u32 = checked_u32(max_gap, "max_gap")?;
+    with_regionset!(rs_ptr, rs, {
+        let ids = rs.cluster(max_gap_u32);
+        // u32 IDs → f64 to match the other count/ID returns in this module.
+        Ok(ids.into_iter().map(|id| id as f64).collect())
+    })
+}
+
 /// Calculate region distribution across chromosome bins
 /// @export
 /// @param rs_ptr External pointer to a RegionSet
@@ -282,34 +317,79 @@ pub fn r_load_genome_assembly(fasta_path: &str) -> extendr_api::Result<Robj> {
     Ok(ExternalPtr::new(assembly).into())
 }
 
+/// Load a binary genome assembly from a .fab file
+/// @export
+/// @param fab_path Path to a .fab binary FASTA file (created by gtars prep --fasta)
+#[extendr(r_name = "load_binary_genome_assembly")]
+pub fn r_load_binary_genome_assembly(fab_path: &str) -> extendr_api::Result<Robj> {
+    let assembly = BinaryGenomeAssembly::try_from(fab_path)
+        .map_err(|e| extendr_api::Error::Other(format!("Loading .fab: {}", e)))?;
+    Ok(ExternalPtr::new(assembly).into())
+}
+
 /// Calculate GC content for each region
 /// @export
 /// @param rs_ptr External pointer to a RegionSet
 /// @param assembly_ptr External pointer to a GenomeAssembly
-/// @param ignore_unk_chroms Skip regions on chromosomes not in the assembly
+/// @param ignore_unk_chroms Skip regions on chromosomes not in the assembly.
+///   Pass NULL (the default) or FALSE to error on unknown chromosomes.
 #[extendr(r_name = "r_calc_gc_content")]
 pub fn r_calc_gc_content(
     rs_ptr: Robj,
     assembly_ptr: Robj,
-    ignore_unk_chroms: bool,
+    ignore_unk_chroms: Robj,
 ) -> extendr_api::Result<Vec<f64>> {
+    // Default to false when NULL/missing — matches Python's default
+    let ignore = if ignore_unk_chroms.is_null() {
+        false
+    } else {
+        <bool>::try_from(&ignore_unk_chroms)
+            .map_err(|_| extendr_api::Error::Other("ignore_unk_chroms must be logical".into()))?
+    };
     with_regionset!(rs_ptr, rs, {
         with_assembly!(assembly_ptr, asm, {
-            calc_gc_content(rs, asm, ignore_unk_chroms)
+            calc_gc_content(rs, asm, ignore)
                 .map_err(|e| extendr_api::Error::Other(format!("{}", e)))
         })
     })
 }
 
-/// Calculate per-region dinucleotide frequencies (percentages)
+/// Calculate per-region dinucleotide frequencies (matches R GenomicDistributions calcDinuclFreq)
+///
+/// Returns a list with a ``region`` column and 16 dinucleotide columns
+/// (AA, AC, ..., TT). When ``raw_counts`` is FALSE (default), each row sums
+/// to 100 (percentages). When TRUE, values are raw integer counts.
+///
+/// For pooled global counts, sum each dinucleotide column across rows
+/// (with ``raw_counts=TRUE``).
 /// @export
 /// @param rs_ptr External pointer to a RegionSet
 /// @param assembly_ptr External pointer to a GenomeAssembly
+/// @param raw_counts Return raw counts instead of percentages (default FALSE, matches R upstream)
+/// @param ignore_unk_chroms Skip regions on chromosomes not in the assembly (default FALSE)
 #[extendr(r_name = "r_calc_dinucl_freq")]
-pub fn r_calc_dinucl_freq(rs_ptr: Robj, assembly_ptr: Robj) -> extendr_api::Result<List> {
+pub fn r_calc_dinucl_freq(
+    rs_ptr: Robj,
+    assembly_ptr: Robj,
+    raw_counts: Robj,
+    ignore_unk_chroms: Robj,
+) -> extendr_api::Result<List> {
+    // Default to false when NULL/missing — matches Python and R GenomicDistributions defaults
+    let raw = if raw_counts.is_null() {
+        false
+    } else {
+        <bool>::try_from(&raw_counts)
+            .map_err(|_| extendr_api::Error::Other("raw_counts must be logical".into()))?
+    };
+    let ignore = if ignore_unk_chroms.is_null() {
+        false
+    } else {
+        <bool>::try_from(&ignore_unk_chroms)
+            .map_err(|_| extendr_api::Error::Other("ignore_unk_chroms must be logical".into()))?
+    };
     with_regionset!(rs_ptr, rs, {
         with_assembly!(assembly_ptr, asm, {
-            let (labels, matrix) = calc_dinucl_freq_per_region(rs, asm)
+            let (labels, matrix) = calc_dinucl_freq(rs, asm, raw, ignore)
                 .map_err(|e| extendr_api::Error::Other(format!("{}", e)))?;
 
             // Build column names (uppercase to match GD: AA, AC, AG, ...)
@@ -572,13 +652,26 @@ pub fn r_disjoin(rs_ptr: Robj) -> extendr_api::Result<Robj> {
     })
 }
 
-/// Return gaps between regions per chromosome
+/// Return gaps between regions per chromosome, bounded by chrom sizes
+///
+/// Emits the peak-free intervals of each chromosome listed in `chrom_sizes`,
+/// including leading gaps (from 0), inter-region gaps, trailing gaps
+/// (to `chrom_size`), and full-chromosome gaps for chromosomes with no
+/// regions. Regions on chromosomes not present in `chrom_sizes` are skipped.
+///
 /// @export
 /// @param rs_ptr External pointer to a RegionSet
+/// @param chrom_names Character vector of chromosome names
+/// @param chrom_sizes_vec Integer vector of chromosome sizes
 #[extendr(r_name = "r_gaps")]
-pub fn r_gaps(rs_ptr: Robj) -> extendr_api::Result<Robj> {
+pub fn r_gaps(
+    rs_ptr: Robj,
+    chrom_names: Vec<String>,
+    chrom_sizes_vec: Vec<i32>,
+) -> extendr_api::Result<Robj> {
+    let chrom_sizes = chrom_sizes_from_vecs(chrom_names, chrom_sizes_vec)?;
     with_regionset!(rs_ptr, rs, {
-        let result = rs.gaps();
+        let result = rs.gaps(&chrom_sizes);
         Ok(ExternalPtr::new(result).into())
     })
 }
@@ -1248,8 +1341,11 @@ extendr_module! {
     fn r_calc_nearest_neighbors;
     fn r_chromosome_statistics;
     fn r_region_distribution;
+    // Clustering
+    fn r_cluster;
     // GC / Dinucleotide
     fn r_load_genome_assembly;
+    fn r_load_binary_genome_assembly;
     fn r_calc_gc_content;
     fn r_calc_dinucl_freq;
     // Interval Ranges
diff --git a/gtars-r/src/rust/src/lola.rs b/gtars-r/src/rust/src/lola.rs
index a7c217d4..fa69cb00 100644
--- a/gtars-r/src/rust/src/lola.rs
+++ b/gtars-r/src/rust/src/lola.rs
@@ -55,79 +55,29 @@ fn extract_region_sets(user_sets: List) -> extendr_api::Result<Vec<RegionSet>> {
     Ok(sets)
 }
 
-/// Convert an empty string to None (becomes NA in R).
-fn empty_to_na(s: &str) -> Option<String> {
-    if s.is_empty() {
-        None
-    } else {
-        Some(s.to_string())
-    }
-}
-
 /// Convert LOLA results to an R list (data.frame-like structure).
 fn results_to_list(results: &[gtars_lola::models::LolaResult]) -> List {
-    let n = results.len();
-    let mut user_set = Vec::with_capacity(n);
-    let mut db_set = Vec::with_capacity(n);
-    let mut p_value_log = Vec::with_capacity(n);
-    let mut odds_ratio = Vec::with_capacity(n);
-    let mut support: Vec<i32> = Vec::with_capacity(n);
-    let mut rnk_pv: Vec<i32> = Vec::with_capacity(n);
-    let mut rnk_or: Vec<i32> = Vec::with_capacity(n);
-    let mut rnk_sup: Vec<i32> = Vec::with_capacity(n);
-    let mut max_rnk: Vec<i32> = Vec::with_capacity(n);
-    let mut mean_rnk = Vec::with_capacity(n);
-    let mut b_vec: Vec<i32> = Vec::with_capacity(n);
-    let mut c_vec: Vec<i32> = Vec::with_capacity(n);
-    let mut d_vec: Vec<i32> = Vec::with_capacity(n);
-    let mut collection: Vec<Option<String>> = Vec::with_capacity(n);
-    let mut description: Vec<Option<String>> = Vec::with_capacity(n);
-    let mut cell_type: Vec<Option<String>> = Vec::with_capacity(n);
-    let mut tissue: Vec<Option<String>> = Vec::with_capacity(n);
-    let mut antibody: Vec<Option<String>> = Vec::with_capacity(n);
-    let mut treatment: Vec<Option<String>> = Vec::with_capacity(n);
-    let mut data_source: Vec<Option<String>> = Vec::with_capacity(n);
-    let mut filename = Vec::with_capacity(n);
-    let mut db_set_size: Vec<i32> = Vec::with_capacity(n);
-
-    let mut q_value: Vec<Option<f64>> = Vec::with_capacity(n);
-
-    for r in results {
-        user_set.push((r.user_set + 1) as i32); // 1-based for R
-        db_set.push((r.db_set + 1) as i32);
-        p_value_log.push(r.p_value_log);
-        odds_ratio.push(r.odds_ratio);
-        support.push(r.support as i32);
-        rnk_pv.push(r.rnk_pv as i32);
-        rnk_or.push(r.rnk_or as i32);
-        rnk_sup.push(r.rnk_sup as i32);
-        max_rnk.push(r.max_rnk as i32);
-        mean_rnk.push(r.mean_rnk);
-        b_vec.push(r.b as i32);
-        c_vec.push(r.c as i32);
-        d_vec.push(r.d as i32);
-        collection.push(empty_to_na(&r.collection));
-        description.push(empty_to_na(&r.description));
-        cell_type.push(empty_to_na(&r.cell_type));
-        tissue.push(empty_to_na(&r.tissue));
-        antibody.push(empty_to_na(&r.antibody));
-        treatment.push(empty_to_na(&r.treatment));
-        data_source.push(empty_to_na(&r.data_source));
-        filename.push(r.filename.clone());
-        db_set_size.push(r.db_set_size as i32);
-        q_value.push(r.q_value);
-    }
-
-    // Convert Option<f64> to Rfloat (NA for None)
-    let q_value_r: Vec<Rfloat> = q_value
-        .iter()
-        .map(|q| match q {
-            Some(v) => Rfloat::from(*v),
-            None => Rfloat::na(),
-        })
+    use gtars_lola::output::results_to_columns;
+
+    let c = results_to_columns(results);
+
+    // R-specific conversions: 1-based indices, i32 casts, NA types
+    let user_set: Vec<i32> = c.user_set.iter().map(|&v| (v + 1) as i32).collect();
+    let db_set: Vec<i32> = c.db_set.iter().map(|&v| (v + 1) as i32).collect();
+    let support: Vec<i32> = c.support.iter().map(|&v| v as i32).collect();
+    let rnk_pv: Vec<i32> = c.rnk_pv.iter().map(|&v| v as i32).collect();
+    let rnk_or: Vec<i32> = c.rnk_or.iter().map(|&v| v as i32).collect();
+    let rnk_sup: Vec<i32> = c.rnk_sup.iter().map(|&v| v as i32).collect();
+    let max_rnk: Vec<i32> = c.max_rnk.iter().map(|&v| v as i32).collect();
+    let b_vec: Vec<i32> = c.b.iter().map(|&v| v as i32).collect();
+    let c_vec: Vec<i32> = c.c.iter().map(|&v| v as i32).collect();
+    let d_vec: Vec<i32> = c.d.iter().map(|&v| v as i32).collect();
+    let db_set_size: Vec<i32> = c.db_set_size.iter().map(|&v| v as i32).collect();
+
+    let q_value_r: Vec<Rfloat> = c.q_value.iter()
+        .map(|q| match q { Some(v) => Rfloat::from(*v), None => Rfloat::na() })
         .collect();
 
-    // Convert Option<String> to Rstr (NA for None)
     let to_rstr = |v: &[Option<String>]| -> Vec<Rstr> {
         v.iter()
             .map(|s| match s {
@@ -140,25 +90,25 @@ fn results_to_list(results: &[gtars_lola::models::LolaResult]) -> List {
     list!(
         userSet = user_set,
         dbSet = db_set,
-        collection = to_rstr(&collection),
-        pValueLog = p_value_log,
-        oddsRatio = odds_ratio,
+        collection = to_rstr(&c.collection),
+        pValueLog = c.p_value_log,
+        oddsRatio = c.odds_ratio,
         support = support,
         rnkPV = rnk_pv,
         rnkOR = rnk_or,
         rnkSup = rnk_sup,
         maxRnk = max_rnk,
-        meanRnk = mean_rnk,
+        meanRnk = c.mean_rnk,
         b = b_vec,
         c = c_vec,
         d = d_vec,
-        description = to_rstr(&description),
-        cellType = to_rstr(&cell_type),
-        tissue = to_rstr(&tissue),
-        antibody = to_rstr(&antibody),
-        treatment = to_rstr(&treatment),
-        dataSource = to_rstr(&data_source),
-        filename = filename,
+        description = to_rstr(&c.description),
+        cellType = to_rstr(&c.cell_type),
+        tissue = to_rstr(&c.tissue),
+        antibody = to_rstr(&c.antibody),
+        treatment = to_rstr(&c.treatment),
+        dataSource = to_rstr(&c.data_source),
+        filename = c.filename,
         qValue = q_value_r,
         size = db_set_size
     )
@@ -432,13 +382,13 @@ pub fn r_regiondb_anno(db: Robj) -> extendr_api::Result<Robj> {
 
         for (i, a) in db_ref.region_anno.iter().enumerate() {
             filename.push(a.filename.clone());
-            cell_type.push(empty_to_na(&a.cell_type));
-            description.push(empty_to_na(&a.description));
-            tissue.push(empty_to_na(&a.tissue));
-            data_source.push(empty_to_na(&a.data_source));
-            antibody.push(empty_to_na(&a.antibody));
-            treatment.push(empty_to_na(&a.treatment));
-            collection.push(empty_to_na(&a.collection));
+            cell_type.push(a.cell_type.clone());
+            description.push(a.description.clone());
+            tissue.push(a.tissue.clone());
+            data_source.push(a.data_source.clone());
+            antibody.push(a.antibody.clone());
+            treatment.push(a.treatment.clone());
+            collection.push(a.collection.clone());
             size.push(db_ref.region_sets[i].len() as i32);
         }
 
diff --git a/gtars-r/tests/testthat/test_genomicdist.R b/gtars-r/tests/testthat/test_genomicdist.R
index 6245a055..40a8d844 100644
--- a/gtars-r/tests/testthat/test_genomicdist.R
+++ b/gtars-r/tests/testthat/test_genomicdist.R
@@ -18,9 +18,10 @@ test_that("calcWidth returns correct widths", {
   skip_if_no_data()
   rs <- RegionSet(file.path(data_dir, "dummy.bed"))
   w <- calcWidth(rs)
-  expect_type(w, "integer")
+  # doubles instead of integers: u32 widths can exceed i32::MAX (2.1 Gbp)
+  expect_type(w, "double")
   expect_equal(length(w), 4L)
-  expect_equal(w[1], 4L)
+  expect_equal(w[1], 4)
 })
 
 test_that("calcNeighborDist returns distances", {
@@ -34,7 +35,9 @@ test_that("calcNearestNeighbors returns distances", {
   skip_if_no_data()
   rs <- RegionSet(file.path(data_dir, "dummy.bed"))
   nn <- calcNearestNeighbors(rs)
-  expect_type(nn, "integer")
+  # doubles instead of integers: distances can be chromosome-scale,
+  # approaching or exceeding i32::MAX for concatenated/genome-scale regions
+  expect_type(nn, "double")
   expect_equal(length(nn), length(rs))
 })
 
@@ -77,6 +80,23 @@ test_that("calcDinuclFreq returns data.frame with 16 dinucleotide columns", {
   expect_true(is.data.frame(freq))
   # 16 dinucleotides + region column
   expect_true(ncol(freq) >= 16)
+  # Default rawCounts=FALSE: each row sums to 100
+  num_cols <- freq[, sapply(freq, is.numeric)]
+  expect_equal(sum(num_cols[1, ]), 100, tolerance = 1e-6)
+})
+
+test_that("calcDinuclFreq rawCounts=TRUE returns integer counts", {
+  fasta_path <- file.path(test_path(), "..", "..", "..", "tests", "data", "fasta", "base.fa")
+  skip_if_not(file.exists(fasta_path), "Test FASTA file not found")
+
+  assembly <- loadGenomeAssembly(fasta_path)
+  # base.fa: chrX = "TTGGGGAA" (8bp) → 7 dinucleotide windows
+  df <- data.frame(chr = "chrX", start = 0L, end = 8L)
+  freq <- calcDinuclFreq(df, assembly, rawCounts = TRUE)
+  expect_true(is.data.frame(freq))
+  num_cols <- freq[, sapply(freq, is.numeric)]
+  # 8 bases → 7 dinucleotides
+  expect_equal(sum(num_cols[1, ]), 7)
 })
 
 # =========================================================================
diff --git a/gtars-r/tests/testthat/test_regionset.R b/gtars-r/tests/testthat/test_regionset.R
index 3a547cd7..e455eaec 100644
--- a/gtars-r/tests/testthat/test_regionset.R
+++ b/gtars-r/tests/testthat/test_regionset.R
@@ -82,9 +82,10 @@ test_that("widths returns correct values", {
   # dummy.bed: chr1:2-6, chr1:4-7, chr1:5-9, chr1:7-12
   rs <- RegionSet(file.path(data_dir, "dummy.bed"))
   w <- widths(rs)
-  expect_type(w, "integer")
+  # doubles instead of integers: u32 widths can exceed i32::MAX (2.1 Gbp)
+  expect_type(w, "double")
   expect_equal(length(w), 4L)
-  expect_equal(w[1], 4L)
+  expect_equal(w[1], 4)
 })
 
 test_that("neighborDistances returns numeric vector", {
@@ -98,7 +99,8 @@ test_that("nearestNeighbors returns numeric vector", {
   skip_if_no_data()
   rs <- RegionSet(file.path(data_dir, "dummy.bed"))
   nn <- nearestNeighbors(rs)
-  expect_type(nn, "integer")
+  # doubles instead of integers: distances can be chromosome-scale
+  expect_type(nn, "double")
 })
 
 test_that("chromosomeStatistics returns data.frame", {
@@ -179,12 +181,54 @@ test_that("disjoin returns non-overlapping pieces", {
   expect_true(length(rs_dj) >= 1L)
 })
 
-test_that("gaps returns inter-region gaps", {
+test_that("gaps returns inter-region gaps bounded by chrom_sizes", {
   skip_if_no_data()
-  # dummy.bed regions all overlap, so gaps may be empty
-  rs <- RegionSet(file.path(data_dir, "dummy.bed"))
-  rs_gaps <- gaps(rs)
+  # Three non-overlapping peaks → leading, 2 inter-region, trailing = 4 gaps.
+  df <- data.frame(
+    chr = "chr1",
+    start = c(10L, 30L, 50L),
+    end   = c(20L, 40L, 60L)
+  )
+  rs <- RegionSet(df)
+  chrom_sizes <- c(chr1 = 100L)
+  rs_gaps <- gaps(rs, chrom_sizes = chrom_sizes)
   expect_s4_class(rs_gaps, "RegionSet")
+  expect_equal(length(rs_gaps), 4L)
+})
+
+test_that("gaps errors when chrom_sizes is missing", {
+  df <- data.frame(chr = "chr1", start = 10L, end = 20L)
+  rs <- RegionSet(df)
+  expect_error(gaps(rs), "chrom_sizes")
+})
+
+test_that("gaps emits full-chromosome gap for empty chromosomes", {
+  df <- data.frame(chr = "chr1", start = 10L, end = 20L)
+  rs <- RegionSet(df)
+  chrom_sizes <- c(chr1 = 100L, chr2 = 50L)
+  rs_gaps <- gaps(rs, chrom_sizes = chrom_sizes)
+  gaps_df <- as.data.frame(rs_gaps)
+  chr2 <- gaps_df[gaps_df$chr == "chr2", ]
+  expect_equal(nrow(chr2), 1L)
+  expect_equal(chr2$start, 0L)
+  expect_equal(chr2$end, 50L)
+})
+
+# =========================================================================
+# Spatial-arrangement summary statistics
+# =========================================================================
+
+test_that("clusterRegions returns cluster IDs", {
+  df <- data.frame(
+    chr = "chr1",
+    start = c(0L, 13L, 100L),
+    end   = c(10L, 20L, 110L)
+  )
+  rs <- RegionSet(df)
+  ids <- clusterRegions(rs, maxGap = 5L)
+  expect_length(ids, 3L)
+  expect_equal(ids[1], ids[2])   # first two are within 5bp → same cluster
+  expect_false(ids[1] == ids[3]) # third is far away
 })
 
 # =========================================================================
diff --git a/gtars-refget/src/store/persistence.rs b/gtars-refget/src/store/persistence.rs
index a5abd482..0a4115f7 100644
--- a/gtars-refget/src/store/persistence.rs
+++ b/gtars-refget/src/store/persistence.rs
@@ -188,7 +188,11 @@ impl ReadonlyRefgetStore {
             "#name\tlength\talphabet\tsha512t24u\tmd5\tdescription"
         )?;
 
-        for result_sr in self.sequence_store.values() {
+        // Sort by sha512t24u digest for deterministic output (sequence_store is a HashMap).
+        let mut entries: Vec<&SequenceRecord> = self.sequence_store.values().collect();
+        entries.sort_by(|a, b| a.metadata().sha512t24u.cmp(&b.metadata().sha512t24u));
+
+        for result_sr in entries {
             let result = result_sr.metadata();
             let description = result.description.as_deref().unwrap_or("");
             writeln!(
diff --git a/gtars-refget/src/store/readonly.rs b/gtars-refget/src/store/readonly.rs
index 096ea74e..feb183fb 100644
--- a/gtars-refget/src/store/readonly.rs
+++ b/gtars-refget/src/store/readonly.rs
@@ -426,11 +426,17 @@ impl ReadonlyRefgetStore {
             .map(|name_map| {
                 name_map
                     .iter()
-                    .filter_map(|(name, seq_key)| {
-                        let record = self.sequence_store.get(seq_key)?;
+                    .map(|(name, seq_key)| {
+                        let record = self.sequence_store.get(seq_key).ok_or_else(|| {
+                            anyhow!(
+                                "Sequence {} not found in store for collection {}",
+                                key_to_digest_string(seq_key),
+                                collection_digest,
+                            )
+                        })?;
                         let mut meta = record.metadata().clone();
                         meta.name = name.clone();
-                        Some(match record.sequence() {
+                        Ok(match record.sequence() {
                             Some(seq) => SequenceRecord::Full {
                                 metadata: meta,
                                 sequence: seq.to_vec(),
@@ -438,8 +444,9 @@ impl ReadonlyRefgetStore {
                             None => SequenceRecord::Stub(meta),
                         })
                     })
-                    .collect()
+                    .collect::<Result<Vec<_>>>()
             })
+            .transpose()?
             .unwrap_or_default();
 
         Ok(crate::digest::SequenceCollection {
@@ -539,11 +546,23 @@ impl ReadonlyRefgetStore {
     /// Import a single collection (with all its sequences, aliases, and FHR
     /// metadata) from another store into this store.
     ///
+    /// Both stores must be disk-backed with matching storage modes.
     /// The source store must have the collection loaded (call
     /// `load_collection()` or `load_all_collections()` first).
     pub fn import_collection(&mut self, source: &ReadonlyRefgetStore, digest: &str) -> Result<()> {
-        let collection = source.get_collection(digest)?;
-        self.add_sequence_collection(collection)?;
+        // Both stores must be disk-backed with same mode
+        if source.local_path.is_none() || self.local_path.is_none() || !self.persist_to_disk {
+            return Err(anyhow!("import_collection requires both stores to be disk-backed"));
+        }
+        if source.mode != self.mode {
+            return Err(anyhow!(
+                "import_collection requires matching storage modes (source={:?}, dest={:?})",
+                source.mode,
+                self.mode,
+            ));
+        }
+
+        self.import_collection_file_copy(source, digest)?;
 
         // Copy sequence aliases for every sequence in the imported collection
         let coll_key = digest.to_key();
@@ -569,6 +588,126 @@ impl ReadonlyRefgetStore {
         Ok(())
     }
 
+    /// File-copy based import: copies RGSI and .seq files directly from
+    /// source to destination, then registers the collection in memory.
+    fn import_collection_file_copy(
+        &mut self,
+        source: &ReadonlyRefgetStore,
+        digest: &str,
+    ) -> Result<()> {
+        use crate::collection::read_rgsi_file;
+
+        // 1. Read the source RGSI file to get collection metadata
+        let src_rgsi_path = source
+            .collection_file_path(digest)
+            .ok_or_else(|| anyhow!("Source store has no local path for collection {}", digest))?;
+        let collection = read_rgsi_file(&src_rgsi_path)
+            .with_context(|| format!("Failed to read source RGSI file: {}", src_rgsi_path.display()))?;
+
+        let mut metadata = collection.metadata.clone();
+
+        // 2. Handle ancillary digests and copy/write the RGSI file
+        let dst_rgsi_path = self
+            .collection_file_path(digest)
+            .ok_or_else(|| anyhow!("Dest store has no local path for collection {}", digest))?;
+        if let Some(parent) = dst_rgsi_path.parent() {
+            create_dir_all(parent)?;
+        }
+
+        let needs_ancillary_rewrite = self.ancillary_digests
+            && metadata.name_length_pairs_digest.is_none();
+
+        if needs_ancillary_rewrite {
+            // Source lacks ancillary digests but destination wants them --
+            // compute them and write a new RGSI file.
+            metadata.compute_ancillary_digests(&collection.sequences);
+            use crate::collection::SequenceCollectionRecordExt;
+            let record = SequenceCollectionRecord::Full {
+                metadata: metadata.clone(),
+                sequences: collection.sequences.iter()
+                    .map(|s| SequenceRecord::Stub(s.metadata().clone()))
+                    .collect(),
+            };
+            record.write_collection_rgsi(&dst_rgsi_path)?;
+        } else {
+            // Byte-for-byte copy of the RGSI file
+            fs::copy(&src_rgsi_path, &dst_rgsi_path)
+                .with_context(|| format!(
+                    "Failed to copy RGSI {} -> {}",
+                    src_rgsi_path.display(),
+                    dst_rgsi_path.display(),
+                ))?;
+        }
+
+        // 3. Copy sequence data files
+        for seq_record in &collection.sequences {
+            let seq_meta = seq_record.metadata();
+            let seq_digest = &seq_meta.sha512t24u;
+
+            let src_seq_path = source
+                .sequence_file_path(seq_digest)
+                .ok_or_else(|| anyhow!("Source has no path for sequence {}", seq_digest))?;
+            let dst_seq_path = self
+                .sequence_file_path(seq_digest)
+                .ok_or_else(|| anyhow!("Dest has no path for sequence {}", seq_digest))?;
+
+            // Skip if destination already has this sequence (dedup across collections)
+            if dst_seq_path.exists() {
+                // Still need to register in memory below
+            } else {
+                if let Some(parent) = dst_seq_path.parent() {
+                    create_dir_all(parent)?;
+                }
+                fs::copy(&src_seq_path, &dst_seq_path).with_context(|| {
+                    format!(
+                        "Failed to copy sequence {} -> {}",
+                        src_seq_path.display(),
+                        dst_seq_path.display(),
+                    )
+                })?;
+            }
+        }
+
+        // 4. Register in memory
+        let coll_key = digest.to_key();
+
+        // Build the collection record with stub sequences
+        let stub_sequences: Vec<SequenceRecord> = collection
+            .sequences
+            .iter()
+            .map(|s| SequenceRecord::Stub(s.metadata().clone()))
+            .collect();
+
+        let record = SequenceCollectionRecord::Full {
+            metadata: metadata.clone(),
+            sequences: stub_sequences,
+        };
+        self.collections.insert(coll_key, record);
+
+        // Register sequences and populate name_lookup
+        let mut name_map = IndexMap::new();
+        for seq_record in &collection.sequences {
+            let seq_meta = seq_record.metadata();
+            let seq_key = seq_meta.sha512t24u.to_key();
+
+            name_map.insert(seq_meta.name.clone(), seq_key);
+
+            // Insert stub into sequence_store (skip if already present -- dedup)
+            if !self.sequence_store.contains_key(&seq_key) {
+                self.sequence_store
+                    .insert(seq_key, SequenceRecord::Stub(seq_meta.clone()));
+                self.md5_lookup
+                    .insert(seq_meta.md5.to_key(), seq_key);
+            }
+        }
+        self.name_lookup.insert(coll_key, name_map);
+
+        // 5. Update index files
+        self.write_index_files()?;
+
+        Ok(())
+    }
+
     // =========================================================================
     // Sequence API
     // =========================================================================
@@ -825,6 +964,23 @@ impl ReadonlyRefgetStore {
         PathBuf::from(path_str)
     }
 
+    /// Return the full filesystem path to a sequence `.seq` file for the given digest.
+    ///
+    /// Returns `None` if the store has no local path or no seqdata path template.
+    pub fn sequence_file_path(&self, seq_digest: &str) -> Option<PathBuf> {
+        let local_path = self.local_path.as_ref()?;
+        let template = self.seqdata_path_template.as_ref()?;
+        Some(local_path.join(Self::expand_template(seq_digest, template)))
+    }
+
+    /// Return the full filesystem path to a collection RGSI file for the given digest.
+    ///
+    /// Returns `None` if the store has no local path.
+    pub fn collection_file_path(&self, coll_digest: &str) -> Option<PathBuf> {
+        let local_path = self.local_path.as_ref()?;
+        Some(local_path.join(format!("collections/{}.rgsi", coll_digest)))
+    }
+
     /// Validate a relative path to prevent directory traversal attacks.
     pub(crate) fn sanitize_relative_path(path: &str) -> Result<()> {
         if path.starts_with('/') || path.starts_with('\\') {
diff --git a/gtars-refget/src/store/tests.rs b/gtars-refget/src/store/tests.rs
index 6015b96f..b9bb80f4 100644
--- a/gtars-refget/src/store/tests.rs
+++ b/gtars-refget/src/store/tests.rs
@@ -1526,6 +1526,92 @@ fn test_collection_order_preserved_after_roundtrip() {
     );
 }
 
+/// Test that multiple collections sharing sequences under different names and different orderings
+/// all preserve their correct per-collection names and element orderings across a disk roundtrip.
+///
+/// This covers the intersection of two previously-fixed bugs:
+/// 1. HashMap ordering (fixed: inner map now IndexMap)
+/// 2. Global name leakage (fixed: get_collection() overrides meta.name from name_lookup)
+#[test]
+fn test_shared_sequences_order_preserved_after_disk_roundtrip() {
+    // FASTA A: base ordering — chrX first, then chr1, then chr2
+    let fasta_a = ">chrX\nTTGGGGAA\n>chr1\nGGAA\n>chr2\nGCGC\n";
+    // FASTA B: different order — chr1 first, same sequences as A
+    let fasta_b = ">chr1\nGGAA\n>chr2\nGCGC\n>chrX\nTTGGGGAA\n";
+    // FASTA C: name swap — chr2 has GGAA, chr1 has GCGC (opposite of A/B)
+    let fasta_c = ">chrX\nTTGGGGAA\n>chr2\nGGAA\n>chr1\nGCGC\n";
+
+    let dir = tempdir().unwrap();
+    let fasta_a_path = dir.path().join("a.fa");
+    let fasta_b_path = dir.path().join("b.fa");
+    let fasta_c_path = dir.path().join("c.fa");
+    fs::write(&fasta_a_path, fasta_a).unwrap();
+    fs::write(&fasta_b_path, fasta_b).unwrap();
+    fs::write(&fasta_c_path, fasta_c).unwrap();
+
+    let store_path = dir.path().join("store");
+    let mut store = RefgetStore::on_disk(&store_path).unwrap();
+    store.set_quiet(true);
+
+    let (meta_a, _) = store.add_sequence_collection_from_fasta(&fasta_a_path, FastaImportOptions::new()).unwrap();
+    let (meta_b, _) = store.add_sequence_collection_from_fasta(&fasta_b_path, FastaImportOptions::new()).unwrap();
+    let (meta_c, _) = store.add_sequence_collection_from_fasta(&fasta_c_path, FastaImportOptions::new()).unwrap();
+
+    let digest_a = meta_a.digest.clone();
+    let digest_b = meta_b.digest.clone();
+    let digest_c = meta_c.digest.clone();
+
+    // Load collections before write and record level2 output
+    store.load_all_collections().unwrap();
+    let pre_a = store.get_collection_level2(&digest_a).unwrap();
+    let pre_b = store.get_collection_level2(&digest_b).unwrap();
+    let pre_c = store.get_collection_level2(&digest_c).unwrap();
+
+    // Verify pre-write ordering for FASTA A: chrX, chr1, chr2
+    assert_eq!(pre_a.names, vec!["chrX", "chr1", "chr2"], "A: names before roundtrip");
+    // Verify pre-write ordering for FASTA B: chr1, chr2, chrX
+    assert_eq!(pre_b.names, vec!["chr1", "chr2", "chrX"], "B: names before roundtrip");
+    // Verify pre-write ordering for FASTA C: chrX, chr2, chr1 (name swap)
+    assert_eq!(pre_c.names, vec!["chrX", "chr2", "chr1"], "C: names before roundtrip");
+
+    store.write().unwrap();
+
+    // Drop and reopen from disk
+    drop(store);
+    let mut reloaded = RefgetStore::open_local(&store_path).unwrap();
+    reloaded.load_all_collections().unwrap();
+
+    let post_a = reloaded.get_collection_level2(&digest_a).unwrap();
+    let post_b = reloaded.get_collection_level2(&digest_b).unwrap();
+    let post_c = reloaded.get_collection_level2(&digest_c).unwrap();
+
+    // Names must match exactly (order-sensitive) after roundtrip
+    assert_eq!(post_a.names, pre_a.names, "A: names after roundtrip");
+    assert_eq!(post_b.names, pre_b.names, "B: names after roundtrip");
+    assert_eq!(post_c.names, pre_c.names, "C: names after roundtrip");
+
+    // Lengths must match exactly after roundtrip
+    assert_eq!(post_a.lengths, pre_a.lengths, "A: lengths after roundtrip");
+    assert_eq!(post_b.lengths, pre_b.lengths, "B: lengths after roundtrip");
+    assert_eq!(post_c.lengths, pre_c.lengths, "C: lengths after roundtrip");
+
+    // Sequence digests must match exactly after roundtrip
+    assert_eq!(post_a.sequences, pre_a.sequences, "A: sequences after roundtrip");
+    assert_eq!(post_b.sequences, pre_b.sequences, "B: sequences after roundtrip");
+    assert_eq!(post_c.sequences, pre_c.sequences, "C: sequences after roundtrip");
+
+    // Cross-check: FASTA C has chr2=GGAA and chr1=GCGC (opposite of A's chr1=GGAA, chr2=GCGC)
+    // The sequence digest for chr2 in C should equal chr1 in A
+    assert_eq!(
+        post_c.sequences[1], post_a.sequences[1],
+        "C.chr2 and A.chr1 share GGAA bytes, should have same sequence digest"
+    );
+    assert_eq!(
+        post_c.sequences[2], post_a.sequences[2],
+        "C.chr1 and A.chr2 share GCGC bytes, should have same sequence digest"
+    );
+}
+
 // =========================================================================
 // Name source tests
 // =========================================================================
@@ -1591,10 +1677,30 @@ fn test_shared_sequence_different_names() {
 // Import collection tests
 // =========================================================================
 
+/// Helper: create a disk-backed store with one collection from a FASTA string.
+fn disk_store_with_one_collection(fasta_content: &str) -> (RefgetStore, String, tempfile::TempDir, tempfile::TempDir) {
+    let store_dir = tempdir().unwrap();
+    let fasta_dir = tempdir().unwrap();
+    let fasta = fasta_dir.path().join("test.fa");
+    fs::write(&fasta, fasta_content).unwrap();
+
+    let mut store = RefgetStore::on_disk(store_dir.path()).unwrap();
+    store.set_quiet(true);
+    let (meta, _) = store
+        .add_sequence_collection_from_fasta(&fasta, FastaImportOptions::new())
+        .unwrap();
+    let digest = meta.digest.clone();
+    // Load collection so name_lookup is populated
+    store.load_all_collections().unwrap();
+    (store, digest, store_dir, fasta_dir)
+}
+
 #[test]
 fn test_import_collection_basic() {
-    let (mut source, digest) = store_with_one_collection(">chr1\nATGC\n>chr2\nGGGG\n");
-    let mut target = RefgetStore::in_memory();
+    let (mut source, digest, _src_dir, _fasta_dir) =
+        disk_store_with_one_collection(">chr1\nATGC\n>chr2\nGGGG\n");
+    let target_dir = tempdir().unwrap();
+    let mut target = RefgetStore::on_disk(target_dir.path()).unwrap();
 
     target.import_collection(&mut source, &digest).unwrap();
 
@@ -1605,7 +1711,8 @@ fn test_import_collection_basic() {
 
 #[test]
 fn test_import_collection_copies_sequence_aliases() {
-    let (mut source, digest) = store_with_one_collection(">chr1\nATGC\n>chr2\nGGGG\n");
+    let (mut source, digest, _src_dir, _fasta_dir) =
+        disk_store_with_one_collection(">chr1\nATGC\n>chr2\nGGGG\n");
 
     // Add sequence aliases in source
     let coll = source.get_collection(&digest).unwrap();
@@ -1615,7 +1722,8 @@ fn test_import_collection_copies_sequence_aliases() {
     source.add_sequence_alias("ucsc", "chr1", &seq0_digest).unwrap();
     source.add_sequence_alias("ncbi", "NC_000002.1", &seq1_digest).unwrap();
 
-    let mut target = RefgetStore::in_memory();
+    let target_dir = tempdir().unwrap();
+    let mut target = RefgetStore::on_disk(target_dir.path()).unwrap();
     target.import_collection(&mut source, &digest).unwrap();
 
     // Target should have the sequence aliases
@@ -1631,12 +1739,14 @@ fn test_import_collection_copies_sequence_aliases() {
 
 #[test]
 fn test_import_collection_copies_collection_aliases() {
-    let (mut source, digest) = store_with_one_collection(">chr1\nATGC\n>chr2\nGGGG\n");
+    let (mut source, digest, _src_dir, _fasta_dir) =
+        disk_store_with_one_collection(">chr1\nATGC\n>chr2\nGGGG\n");
 
     source.add_collection_alias("insdc", "GCA_000001.1", &digest).unwrap();
     source.add_collection_alias("refseq", "GCF_000001.1", &digest).unwrap();
 
-    let mut target = RefgetStore::in_memory();
+    let target_dir = tempdir().unwrap();
+    let mut target = RefgetStore::on_disk(target_dir.path()).unwrap();
     target.import_collection(&mut source, &digest).unwrap();
 
     let ns = target.list_collection_alias_namespaces();
@@ -1651,7 +1761,8 @@ fn test_import_collection_copies_collection_aliases() {
 fn test_import_collection_copies_fhr_metadata() {
     use super::fhr_metadata::FhrMetadata;
 
-    let (mut source, digest) = store_with_one_collection(">chr1\nATGC\n");
+    let (mut source, digest, _src_dir, _fasta_dir) =
+        disk_store_with_one_collection(">chr1\nATGC\n");
 
     let fhr = FhrMetadata {
         genome: Some("Homo sapiens".to_string()),
@@ -1659,7 +1770,8 @@ fn test_import_collection_copies_fhr_metadata() {
     };
     source.set_fhr_metadata(&digest, fhr).unwrap();
 
-    let mut target = RefgetStore::in_memory();
+    let target_dir = tempdir().unwrap();
+    let mut target = RefgetStore::on_disk(target_dir.path()).unwrap();
     target.import_collection(&mut source, &digest).unwrap();
 
     let fhr = target.get_fhr_metadata(&digest);
@@ -1682,6 +1794,7 @@ fn test_import_collection_disk_roundtrip_aliases() {
     let seq0_digest;
     {
         let mut source = RefgetStore::on_disk(source_dir.path()).unwrap();
+        source.set_quiet(true);
         let (meta, _) = source
             .add_sequence_collection_from_fasta(&fasta, FastaImportOptions::new())
             .unwrap();
@@ -1720,3 +1833,137 @@ fn test_import_collection_disk_roundtrip_aliases() {
     let coll_aliases = target.get_aliases_for_collection(&digest);
     assert_eq!(coll_aliases.len(), 1, "Expected 1 collection alias in target: {:?}", coll_aliases);
 }
+
+#[test]
+fn test_import_collection_file_copy_roundtrip() {
+    // Verify RGSI and .seq files are byte-for-byte identical after import
+    // when ancillary digests match between source and dest.
+    let (mut source, digest, src_dir, _fasta_dir) =
+        disk_store_with_one_collection(">chr1\nATGC\n>chr2\nGGGG\n");
+    let target_dir = tempdir().unwrap();
+    let mut target = RefgetStore::on_disk(target_dir.path()).unwrap();
+
+    target.import_collection(&mut source, &digest).unwrap();
+
+    // Verify RGSI file is byte-for-byte identical
+    let src_rgsi = fs::read(
+        src_dir.path().join(format!("collections/{}.rgsi", digest)),
+    ).unwrap();
+    let dst_rgsi = fs::read(
+        target_dir.path().join(format!("collections/{}.rgsi", digest)),
+    ).unwrap();
+    assert_eq!(src_rgsi, dst_rgsi, "RGSI files should be byte-identical");
+
+    // Verify .seq files are byte-for-byte identical
+    let coll = target.get_collection(&digest).unwrap();
+    for seq in &coll.sequences {
+        let seq_digest = &seq.metadata().sha512t24u;
+        let src_seq_path = source.sequence_file_path(seq_digest).unwrap();
+        let dst_seq_path = target.sequence_file_path(seq_digest).unwrap();
+        let src_data = fs::read(&src_seq_path).unwrap();
+        let dst_data = fs::read(&dst_seq_path).unwrap();
+        assert_eq!(src_data, dst_data, "Sequence file for {} should be byte-identical", seq_digest);
+    }
+
+    // Verify in-memory metadata matches
+    let src_coll = source.get_collection(&digest).unwrap();
+    assert_eq!(coll.metadata.digest, src_coll.metadata.digest);
+    assert_eq!(coll.sequences.len(), src_coll.sequences.len());
+    for (src_seq, dst_seq) in src_coll.sequences.iter().zip(coll.sequences.iter()) {
+        assert_eq!(src_seq.metadata().sha512t24u, dst_seq.metadata().sha512t24u);
+        assert_eq!(src_seq.metadata().name, dst_seq.metadata().name);
+        assert_eq!(src_seq.metadata().length, dst_seq.metadata().length);
+    }
+}
+
+#[test]
+fn test_import_collection_ancillary_digest_enrichment() {
+    // Source store has ancillary_digests: false, destination has ancillary_digests: true.
+    // The destination RGSI should contain ancillary digest headers that the source lacks.
+    let source_dir = tempdir().unwrap();
+    let fasta_dir = tempdir().unwrap();
+    let fasta = fasta_dir.path().join("test.fa");
+    fs::write(&fasta, ">chr1\nATGC\n>chr2\nGGGG\n").unwrap();
+
+    // Create source store with ancillary_digests: false
+    let mut source = RefgetStore::on_disk(source_dir.path()).unwrap();
+    source.set_quiet(true);
+    source.disable_ancillary_digests();
+    let (meta, _) = source
+        .add_sequence_collection_from_fasta(&fasta, FastaImportOptions::new())
+        .unwrap();
+    let digest = meta.digest.clone();
+    source.load_all_collections().unwrap();
+
+    // Verify source RGSI lacks ancillary digests
+    let src_rgsi_content = fs::read_to_string(
+        source_dir.path().join(format!("collections/{}.rgsi", digest)),
+    ).unwrap();
+    assert!(
+        !src_rgsi_content.contains("name_length_pairs_digest"),
+        "Source should NOT have ancillary digests",
+    );
+
+    // Create destination store with ancillary_digests: true (default)
+    let target_dir = tempdir().unwrap();
+    let mut target = RefgetStore::on_disk(target_dir.path()).unwrap();
+    target.enable_ancillary_digests();
+    target.import_collection(&mut source, &digest).unwrap();
+
+    // Verify destination RGSI has ancillary digest headers
+    let dst_rgsi_content = fs::read_to_string(
+        target_dir.path().join(format!("collections/{}.rgsi", digest)),
+    ).unwrap();
+    assert!(
+        dst_rgsi_content.contains("name_length_pairs_digest"),
+        "Destination should have ancillary digests. RGSI:\n{}",
+        dst_rgsi_content,
+    );
+    assert!(
+        dst_rgsi_content.contains("sorted_name_length_pairs_digest"),
+        "Destination should have sorted_name_length_pairs_digest",
+    );
+    assert!(
+        dst_rgsi_content.contains("sorted_sequences_digest"),
+        "Destination should have sorted_sequences_digest",
+    );
+
+    // Verify in-memory metadata has non-None ancillary fields
+    let coll_meta = target.get_collection_metadata(&digest).unwrap();
+    assert!(coll_meta.name_length_pairs_digest.is_some(), "name_length_pairs_digest should be Some");
+    assert!(coll_meta.sorted_name_length_pairs_digest.is_some(), "sorted_name_length_pairs_digest should be Some");
+    assert!(coll_meta.sorted_sequences_digest.is_some(), "sorted_sequences_digest should be Some");
+}
+
+#[test]
+fn test_import_collection_mode_mismatch_error() {
+    // Source with Raw mode, destination with Encoded mode should fail.
+    let source_dir = tempdir().unwrap();
+    let fasta_dir = tempdir().unwrap();
+    let fasta = fasta_dir.path().join("test.fa");
+    fs::write(&fasta, ">chr1\nATGC\n").unwrap();
+
+    // Create source store in Raw mode
+    let mut source = RefgetStore::on_disk(source_dir.path()).unwrap();
+    source.set_quiet(true);
+    source.disable_encoding();
+    let (meta, _) = source
+        .add_sequence_collection_from_fasta(&fasta, FastaImportOptions::new())
+        .unwrap();
+    let digest = meta.digest.clone();
+    source.load_all_collections().unwrap();
+
+    // Create destination store in Encoded mode (default)
+    let target_dir = tempdir().unwrap();
+    let mut target = RefgetStore::on_disk(target_dir.path()).unwrap();
+    // target uses Encoded mode by default
+
+    let result = target.import_collection(&mut source, &digest);
+    assert!(result.is_err(), "Should fail with mode mismatch");
+    let err_msg = result.unwrap_err().to_string();
+    assert!(
+        err_msg.contains("matching storage modes"),
+        "Error should mention storage modes: {}",
+        err_msg,
+    );
+}
diff --git a/gtars-tokenizers/src/tokenizer.rs b/gtars-tokenizers/src/tokenizer.rs
index 5dc9b5a4..fa80f9ae 100644
--- a/gtars-tokenizers/src/tokenizer.rs
+++ b/gtars-tokenizers/src/tokenizer.rs
@@ -1,5 +1,7 @@
 use std::collections::HashMap as StdHashMap;
-use std::path::{Path, PathBuf};
+use std::path::Path;
+#[cfg(feature = "huggingface")]
+use std::path::PathBuf;
 
 use fxhash::FxHashMap as HashMap;
 
diff --git a/gtars-wasm/Cargo.toml b/gtars-wasm/Cargo.toml
index 16375c40..93ae9b01 100644
--- a/gtars-wasm/Cargo.toml
+++ b/gtars-wasm/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "gtars-js"
-version = "0.8.0"
+version = "0.9.0"
 authors = ["Nathan LeRoy <NLeRoy917@gmail.com>"]
 edition = "2024"
 
@@ -15,6 +15,7 @@ wasm-bindgen = "0.2.93"
 serde = { version = "1.0", features = ["derive"] }
 serde-wasm-bindgen = "0.4"
 serde_json = "1.0"
+flate2 = { workspace = true }
 getrandom = { version = "0.2.16", features = ["js"] }
 
 # our code
@@ -35,6 +36,13 @@ console_error_panic_hook = { version = "0.1.7", optional = true }
 [dev-dependencies]
 wasm-bindgen-test = "0.3.34"
 
-[profile.release]
-# Tell `rustc` to optimize for small code size.
-opt-level = "s"
+# NOTE: profile sections in non-root workspace members are silently ignored
+# by cargo, so this block has no effect here. Left commented as a marker for
+# future work — to actually apply size optimization to the wasm bundle, this
+# needs to move to the workspace root Cargo.toml (e.g. as
+# `[profile.release.package.gtars-js]` for a per-package override, or as a
+# custom `wasm-release` profile inheriting from release).
+#
+# [profile.release]
+# # Tell `rustc` to optimize for small code size.
+# opt-level = "s"
diff --git a/gtars-wasm/src/bed_stream.rs b/gtars-wasm/src/bed_stream.rs
new file mode 100644
index 00000000..d089f81f
--- /dev/null
+++ b/gtars-wasm/src/bed_stream.rs
@@ -0,0 +1,287 @@
+use std::collections::HashMap;
+use std::io::{Cursor, Read};
+use std::sync::Mutex;
+
+use flate2::read::GzDecoder;
+use gtars_core::models::{Region, RegionSet};
+use serde::{Deserialize, Serialize};
+use wasm_bindgen::prelude::*;
+
+use crate::models::BedEntries;
+
+// ============================================================================
+// BedStreamParser — streaming BED file parser with gzip auto-detection
+// ============================================================================
+
+struct BedStreamParser {
+    regions: Vec<Region>,
+    compressed_buf: Vec<u8>,
+    text_buf: String,
+    is_gzipped: Option<bool>,
+    bytes_processed: usize,
+}
+
+impl BedStreamParser {
+    fn new() -> Self {
+        Self {
+            regions: Vec::new(),
+            compressed_buf: Vec::new(),
+            text_buf: String::new(),
+            is_gzipped: None,
+            bytes_processed: 0,
+        }
+    }
+
+    fn update(&mut self, chunk: &[u8]) -> Result<(), String> {
+        if chunk.is_empty() {
+            return Ok(());
+        }
+
+        self.bytes_processed += chunk.len();
+
+        if self.is_gzipped.is_none() {
+            self.is_gzipped = Some(chunk.len() >= 2 && chunk[0] == 0x1f && chunk[1] == 0x8b);
+        }
+
+        if self.is_gzipped == Some(true) {
+            self.compressed_buf.extend_from_slice(chunk);
+        } else {
+            let text = String::from_utf8_lossy(chunk);
+            self.text_buf.push_str(&text);
+            self.parse_complete_lines();
+        }
+
+        Ok(())
+    }
+
+    fn finish(mut self) -> Result<RegionSet, String> {
+        if self.is_gzipped == Some(true) {
+            let cursor = Cursor::new(&self.compressed_buf);
+            let mut decoder = GzDecoder::new(cursor);
+            let mut decompressed = String::new();
+            decoder
+                .read_to_string(&mut decompressed)
+                .map_err(|e| format!("Gzip decompression failed: {}", e))?;
+            self.text_buf = decompressed;
+            self.parse_complete_lines();
+        }
+
+        if !self.text_buf.is_empty() {
+            let remaining = std::mem::take(&mut self.text_buf);
+            self.parse_line(&remaining);
+        }
+
+        let mut rs = RegionSet::from(self.regions);
+        rs.sort();
+        Ok(rs)
+    }
+
+    fn region_count(&self) -> usize {
+        self.regions.len()
+    }
+
+    fn bytes_processed(&self) -> usize {
+        self.bytes_processed
+    }
+
+    fn parse_complete_lines(&mut self) {
+        let last_newline = match self.text_buf.rfind('\n') {
+            Some(pos) => pos,
+            None => return,
+        };
+
+        let complete = self.text_buf[..last_newline].to_string();
+        let remainder = self.text_buf[last_newline + 1..].to_string();
+        self.text_buf = remainder;
+
+        for line in complete.split('\n') {
+            self.parse_line(line);
+        }
+    }
+
+    fn parse_line(&mut self, line: &str) {
+        let trimmed = line.trim();
+        if trimmed.is_empty()
+            || trimmed.starts_with('#')
+            || trimmed.starts_with("browser")
+            || trimmed.starts_with("track")
+        {
+            return;
+        }
+
+        let parts: Vec<&str> = trimmed.split('\t').collect();
+        if parts.len() < 3 {
+            eprintln!(
+                "BED parse warning: expected at least 3 tab-separated fields, got {} in line: {:?}",
+                parts.len(),
+                trimmed
+            );
+            return;
+        }
+
+        let start = match parts[1].parse::<u32>() {
+            Ok(v) => v,
+            Err(e) => {
+                eprintln!(
+                    "BED parse warning: invalid start coordinate {:?} in line {:?}: {}",
+                    parts[1],
+                    trimmed,
+                    e
+                );
+                return;
+            }
+        };
+        let end = match parts[2].parse::<u32>() {
+            Ok(v) => v,
+            Err(e) => {
+                eprintln!(
+                    "BED parse warning: invalid end coordinate {:?} in line {:?}: {}",
+                    parts[2],
+                    trimmed,
+                    e
+                );
+                return;
+            }
+        };
+
+        let rest = if parts.len() > 3 {
+            Some(parts[3..].join("\t"))
+        } else {
+            None
+        };
+
+        self.regions.push(Region {
+            chr: parts[0].to_string(),
+            start,
+            end,
+            rest: rest.filter(|s| !s.is_empty()),
+        });
+    }
+}
+
+// ============================================================================
+// Global storage for streaming BED parser instances
+// ============================================================================
+
+static BED_PARSER_STORAGE: Mutex<Option<BedParserStorage>> = Mutex::new(None);
+
+struct BedParserStorage {
+    parsers: HashMap<u32, BedStreamParser>,
+    next_id: u32,
+}
+
+impl BedParserStorage {
+    fn new() -> Self {
+        Self {
+            parsers: HashMap::new(),
+            next_id: 1,
+        }
+    }
+
+    fn insert(&mut self, parser: BedStreamParser) -> u32 {
+        let mut id = self.next_id;
+        while self.parsers.contains_key(&id) || id == 0 {
+            id = id.wrapping_add(1);
+            if id == 0 {
+                id = 1;
+            }
+        }
+        self.next_id = id.wrapping_add(1);
+        if self.next_id == 0 {
+            self.next_id = 1;
+        }
+        self.parsers.insert(id, parser);
+        id
+    }
+
+    fn get_mut(&mut self, id: u32) -> Option<&mut BedStreamParser> {
+        self.parsers.get_mut(&id)
+    }
+
+    fn remove(&mut self, id: u32) -> Option<BedStreamParser> {
+        self.parsers.remove(&id)
+    }
+}
+
+fn with_bed_storage<F, R>(f: F) -> R
+where
+    F: FnOnce(&mut BedParserStorage) -> R,
+{
+    let mut guard = BED_PARSER_STORAGE
+        .lock()
+        .expect("BED_PARSER_STORAGE mutex poisoned");
+    if guard.is_none() {
+        *guard = Some(BedParserStorage::new());
+    }
+    f(guard.as_mut().unwrap())
+}
+
+// ============================================================================
+// WASM bindings
+// ============================================================================
+
+#[wasm_bindgen(js_name = "bedParserNew")]
+pub fn bed_parser_new() -> u32 {
+    with_bed_storage(|storage| storage.insert(BedStreamParser::new()))
+}
+
+#[wasm_bindgen(js_name = "bedParserUpdate")]
+pub fn bed_parser_update(handle: u32, chunk: &[u8]) -> Result<(), JsError> {
+    with_bed_storage(|storage| {
+        if let Some(parser) = storage.get_mut(handle) {
+            parser
+                .update(chunk)
+                .map_err(|e| JsError::new(&format!("Failed to process chunk: {}", e)))
+        } else {
+            Err(JsError::new("Invalid parser handle"))
+        }
+    })
+}
+
+/// Finalize parsing and return BedEntries (array of [chr, start, end, rest] tuples).
+/// The result can be passed directly to `new RegionSet(entries)`.
+#[wasm_bindgen(js_name = "bedParserFinish")]
+pub fn bed_parser_finish(handle: u32) -> Result<JsValue, JsError> {
+    let parser = with_bed_storage(|storage| storage.remove(handle))
+        .ok_or_else(|| JsError::new("Invalid parser handle"))?;
+
+    let region_set = parser
+        .finish()
+        .map_err(|e| JsError::new(&format!("Failed to finalize parser: {}", e)))?;
+
+    let entries: Vec<(String, u32, u32, String)> = region_set
+        .regions
+        .into_iter()
+        .map(|r| (r.chr, r.start, r.end, r.rest.unwrap_or_default()))
+        .collect();
+
+    serde_wasm_bindgen::to_value(&BedEntries(entries))
+        .map_err(|e| JsError::new(&format!("Serialization error: {}", e)))
+}
+
+#[wasm_bindgen(js_name = "bedParserFree")]
+pub fn bed_parser_free(handle: u32) -> bool {
+    with_bed_storage(|storage| storage.remove(handle).is_some())
+}
+
+#[wasm_bindgen(js_name = "bedParserProgress")]
+pub fn bed_parser_progress(handle: u32) -> Result<JsValue, JsError> {
+    let progress = with_bed_storage(|storage| {
+        storage.get_mut(handle).map(|parser| BedParserProgress {
+            region_count: parser.region_count(),
+            bytes_processed: parser.bytes_processed(),
+        })
+    });
+
+    match progress {
+        Some(p) => serde_wasm_bindgen::to_value(&p)
+            .map_err(|e| JsError::new(&format!("Serialization error: {}", e))),
+        None => Err(JsError::new("Invalid parser handle")),
+    }
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct BedParserProgress {
+    pub region_count: usize,
+    pub bytes_processed: usize,
+}
diff --git a/gtars-wasm/src/lib.rs b/gtars-wasm/src/lib.rs
index 10ccca90..eed4de7c 100644
--- a/gtars-wasm/src/lib.rs
+++ b/gtars-wasm/src/lib.rs
@@ -1,3 +1,4 @@
+mod bed_stream;
 mod asset;
 mod lola;
 mod models;
@@ -10,17 +11,6 @@ mod tokenizers;
 mod tss;
 mod utils;
 
-use wasm_bindgen::prelude::*;
-
-// Re-export refget functions at the top level
+// Re-export functions at the top level
+pub use bed_stream::*;
 pub use refget::*;
-
-#[wasm_bindgen]
-pub fn greet(name: &str) {
-    alert(&format!("Hello, {}!", name));
-}
-
-#[wasm_bindgen]
-extern "C" {
-    fn alert(s: &str);
-}
diff --git a/gtars-wasm/src/lola.rs b/gtars-wasm/src/lola.rs
index 0b1bbdf1..232ff94c 100644
--- a/gtars-wasm/src/lola.rs
+++ b/gtars-wasm/src/lola.rs
@@ -257,67 +257,63 @@ struct UniverseCheckResult {
     warnings: Vec<String>,
 }
 
-#[derive(serde::Serialize)]
-#[serde(rename_all = "camelCase")]
-struct LolaResults {
-    user_set: Vec<usize>,
-    db_set: Vec<usize>,
-    collection: Vec<Option<String>>,
-    p_value_log: Vec<f64>,
-    odds_ratio: Vec<f64>,
-    support: Vec<u64>,
-    rnk_pv: Vec<usize>,
-    rnk_or: Vec<usize>,
-    rnk_sup: Vec<usize>,
-    max_rnk: Vec<usize>,
-    mean_rnk: Vec<f64>,
-    b: Vec<i64>,
-    c: Vec<i64>,
-    d: Vec<i64>,
-    description: Vec<Option<String>>,
-    cell_type: Vec<Option<String>>,
-    tissue: Vec<Option<String>>,
-    antibody: Vec<Option<String>>,
-    treatment: Vec<Option<String>>,
-    data_source: Vec<Option<String>>,
-    filename: Vec<String>,
-    q_value: Vec<Option<f64>>,
-    size: Vec<u64>,
-}
-
-fn empty_to_none(s: &str) -> Option<String> {
-    if s.is_empty() {
-        None
-    } else {
-        Some(s.to_string())
+fn results_to_js(results: &[LolaResult]) -> Result<JsValue, JsValue> {
+    use gtars_lola::output::results_to_columns;
+
+    let c = results_to_columns(results);
+
+    #[derive(serde::Serialize)]
+    #[serde(rename_all = "camelCase")]
+    struct Out {
+        user_set: Vec<usize>,
+        db_set: Vec<usize>,
+        collection: Vec<Option<String>>,
+        p_value_log: Vec<f64>,
+        odds_ratio: Vec<f64>,
+        support: Vec<u64>,
+        rnk_pv: Vec<usize>,
+        rnk_or: Vec<usize>,
+        rnk_sup: Vec<usize>,
+        max_rnk: Vec<usize>,
+        mean_rnk: Vec<f64>,
+        b: Vec<i64>,
+        c: Vec<i64>,
+        d: Vec<i64>,
+        description: Vec<Option<String>>,
+        cell_type: Vec<Option<String>>,
+        tissue: Vec<Option<String>>,
+        antibody: Vec<Option<String>>,
+        treatment: Vec<Option<String>>,
+        data_source: Vec<Option<String>>,
+        filename: Vec<String>,
+        q_value: Vec<Option<f64>>,
+        size: Vec<u64>,
     }
-}
 
-fn results_to_js(results: &[LolaResult]) -> Result<JsValue, JsValue> {
-    let out = LolaResults {
-        user_set: results.iter().map(|r| r.user_set).collect(),
-        db_set: results.iter().map(|r| r.db_set).collect(),
-        collection: results.iter().map(|r| empty_to_none(&r.collection)).collect(),
-        p_value_log: results.iter().map(|r| r.p_value_log).collect(),
-        odds_ratio: results.iter().map(|r| r.odds_ratio).collect(),
-        support: results.iter().map(|r| r.support).collect(),
-        rnk_pv: results.iter().map(|r| r.rnk_pv).collect(),
-        rnk_or: results.iter().map(|r| r.rnk_or).collect(),
-        rnk_sup: results.iter().map(|r| r.rnk_sup).collect(),
-        max_rnk: results.iter().map(|r| r.max_rnk).collect(),
-        mean_rnk: results.iter().map(|r| r.mean_rnk).collect(),
-        b: results.iter().map(|r| r.b).collect(),
-        c: results.iter().map(|r| r.c).collect(),
-        d: results.iter().map(|r| r.d).collect(),
-        description: results.iter().map(|r| empty_to_none(&r.description)).collect(),
-        cell_type: results.iter().map(|r| empty_to_none(&r.cell_type)).collect(),
-        tissue: results.iter().map(|r| empty_to_none(&r.tissue)).collect(),
-        antibody: results.iter().map(|r| empty_to_none(&r.antibody)).collect(),
-        treatment: results.iter().map(|r| empty_to_none(&r.treatment)).collect(),
-        data_source: results.iter().map(|r| empty_to_none(&r.data_source)).collect(),
-        filename: results.iter().map(|r| r.filename.clone()).collect(),
-        q_value: results.iter().map(|r| r.q_value).collect(),
-        size: results.iter().map(|r| r.db_set_size).collect(),
+    let out = Out {
+        user_set: c.user_set,
+        db_set: c.db_set,
+        collection: c.collection,
+        p_value_log: c.p_value_log,
+        odds_ratio: c.odds_ratio,
+        support: c.support,
+        rnk_pv: c.rnk_pv,
+        rnk_or: c.rnk_or,
+        rnk_sup: c.rnk_sup,
+        max_rnk: c.max_rnk,
+        mean_rnk: c.mean_rnk,
+        b: c.b,
+        c: c.c,
+        d: c.d,
+        description: c.description,
+        cell_type: c.cell_type,
+        tissue: c.tissue,
+        antibody: c.antibody,
+        treatment: c.treatment,
+        data_source: c.data_source,
+        filename: c.filename,
+        q_value: c.q_value,
+        size: c.db_set_size,
     };
 
     serde_wasm_bindgen::to_value(&out).map_err(|e| JsValue::from_str(&format!("{}", e)))
diff --git a/gtars-wasm/src/regionset.rs b/gtars-wasm/src/regionset.rs
index 32e3ab02..d344e00a 100644
--- a/gtars-wasm/src/regionset.rs
+++ b/gtars-wasm/src/regionset.rs
@@ -4,11 +4,16 @@ use crate::models::BedEntries;
 use gtars_core::models::{Region, RegionSet, RegionSetList};
 use gtars_genomicdist::bed_classifier::classify_bed;
 use gtars_genomicdist::consensus;
-use gtars_genomicdist::interval_ranges::IntervalRanges;
+use gtars_genomicdist::interval_ranges::{IntervalRanges, RegionSetListOps};
 use gtars_genomicdist::models::RegionBin;
 use gtars_genomicdist::statistics::GenomicIntervalSetStatistics;
 use wasm_bindgen::prelude::*;
 
+#[inline]
+fn to_js<T: serde::Serialize>(value: &T) -> Result<JsValue, JsValue> {
+    serde_wasm_bindgen::to_value(value).map_err(|e| e.into())
+}
+
 #[wasm_bindgen(js_name = "ChromosomeStatistics")]
 #[derive(serde::Serialize)]
 pub struct JsChromosomeStatistics {
@@ -176,10 +181,31 @@ impl JsRegionSet {
         serde_wasm_bindgen::to_value(&result).map_err(|e| e.into())
     }
 
+    /// Region distribution across genomic bins.
+    ///
+    /// When `chrom_sizes` is provided (JS object: `{chr: length, ...}`), per-chromosome
+    /// bin sizes are derived from the reference genome (bin_size = chrom_size / n_bins
+    /// per chrom). Outputs are comparable across BED files and aligned with reference
+    /// genome positions. Regions on chroms not in `chrom_sizes`, or whose midpoint
+    /// falls beyond the stated chromosome size, are silently skipped.
+    ///
+    /// When `chrom_sizes` is null/undefined, bin size is derived from the BED file's
+    /// observed max end coordinate — outputs will NOT be comparable across files.
     #[wasm_bindgen(js_name = "regionDistribution")]
-    pub fn region_distribution(&self, n_bins: u32) -> Result<JsValue, JsValue> {
+    pub fn region_distribution(
+        &self,
+        n_bins: u32,
+        chrom_sizes: JsValue,
+    ) -> Result<JsValue, JsValue> {
         let distribution: HashMap<String, RegionBin> =
-            self.region_set.region_distribution_with_bins(n_bins);
+            if chrom_sizes.is_null() || chrom_sizes.is_undefined() {
+                self.region_set.region_distribution_with_bins(n_bins)
+            } else {
+                let cs: HashMap<String, u32> = serde_wasm_bindgen::from_value(chrom_sizes)
+                    .map_err(|e| JsValue::from_str(&format!("chrom_sizes: {}", e)))?;
+                self.region_set
+                    .region_distribution_with_chrom_sizes(n_bins, &cs)
+            };
 
         let mut result_vector: Vec<JsRegionDistribution> = vec![];
 
@@ -234,6 +260,18 @@ impl JsRegionSet {
         serde_wasm_bindgen::to_value(&nearest).map_err(|e| e.into())
     }
 
+    /// Single-linkage cluster IDs for each region.
+    ///
+    /// Two regions on the same chromosome are assigned the same cluster
+    /// when the bp gap between them is at most `max_gap`. Chromosome
+    /// boundaries always break clusters. Returns a `Uint32Array`-compatible
+    /// JS array in the original region order.
+    #[wasm_bindgen(js_name = "cluster")]
+    pub fn cluster(&self, max_gap: u32) -> Result<JsValue, JsValue> {
+        let ids = self.region_set.cluster(max_gap);
+        to_js(&ids)
+    }
+
     // ── Interval range methods ──────────────────────────────────────
 
     #[wasm_bindgen(js_name = "trim")]
@@ -320,9 +358,11 @@ impl JsRegionSet {
     }
 
     #[wasm_bindgen(js_name = "gaps")]
-    pub fn gaps(&self) -> JsRegionSet {
-        let result = self.region_set.gaps();
-        JsRegionSet { region_set: result }
+    pub fn gaps(&self, chrom_sizes: &JsValue) -> Result<JsRegionSet, JsValue> {
+        let sizes: HashMap<String, u32> = serde_wasm_bindgen::from_value(chrom_sizes.clone())
+            .map_err(|e| JsValue::from_str(&format!("chrom_sizes: {}", e)))?;
+        let result = self.region_set.gaps(&sizes);
+        Ok(JsRegionSet { region_set: result })
     }
 
     #[wasm_bindgen(js_name = "intersect")]
@@ -345,6 +385,25 @@ pub struct JsRegionSetList {
 
 #[wasm_bindgen(js_class = "RegionSetList")]
 impl JsRegionSetList {
+    /// Create an empty RegionSetList. Use `add()` to populate it.
+    #[wasm_bindgen(constructor)]
+    pub fn new() -> Self {
+        JsRegionSetList {
+            inner: RegionSetList::new(Vec::new()),
+        }
+    }
+
+    /// Add a RegionSet to this list, with an optional name.
+    pub fn add(&mut self, set: &JsRegionSet, name: Option<String>) {
+        let idx = self.inner.region_sets.len();
+        self.inner.region_sets.push(set.region_set.clone());
+        let name_val = name.unwrap_or_else(|| format!("set_{}", idx));
+        self.inner
+            .names
+            .get_or_insert_with(Vec::new)
+            .push(name_val);
+    }
+
     /// Number of region sets in this list.
     #[wasm_bindgen(getter)]
     pub fn length(&self) -> usize {
@@ -383,6 +442,139 @@ impl JsRegionSetList {
         }
     }
 
+    /// Build a RegionSetList directly from arrays of BED entries.
+    ///
+    /// @param entries - Array of arrays: [[[chr, start, end, rest], ...], ...]
+    /// @param names - Optional array of names, one per set
+    #[wasm_bindgen(js_name = "fromEntries")]
+    pub fn from_entries(entries: &JsValue, names: &JsValue) -> Result<JsRegionSetList, JsValue> {
+        let all_entries: Vec<BedEntries> =
+            serde_wasm_bindgen::from_value(entries.clone())?;
+        let names_vec: Option<Vec<String>> = if names.is_null() || names.is_undefined() {
+            None
+        } else {
+            Some(serde_wasm_bindgen::from_value(names.clone())?)
+        };
+
+        let mut sets = Vec::with_capacity(all_entries.len());
+        for bed_entries in all_entries {
+            let regions: Vec<Region> = bed_entries
+                .0
+                .into_iter()
+                .map(|be| Region {
+                    chr: be.0,
+                    start: be.1,
+                    end: be.2,
+                    rest: Some(be.3),
+                })
+                .collect();
+            let mut rs = RegionSet::from(regions);
+            rs.sort();
+            sets.push(rs);
+        }
+
+        Ok(JsRegionSetList {
+            inner: RegionSetList {
+                region_sets: sets,
+                names: names_vec,
+                path: None,
+            },
+        })
+    }
+
+    /// Number of regions in the set at the given index.
+    #[wasm_bindgen(js_name = "regionCount")]
+    pub fn region_count(&self, index: usize) -> Result<u32, JsValue> {
+        self.inner.region_count(index)
+            .ok_or_else(|| JsValue::from_str("Index out of range"))
+    }
+
+    /// Number of overlapping regions between two sets by index.
+    #[wasm_bindgen(js_name = "pintersectCount")]
+    pub fn pintersect_count(&self, i: usize, j: usize) -> Result<u32, JsValue> {
+        self.inner.pintersect_count(i, j)
+            .ok_or_else(|| JsValue::from_str("Index out of range"))
+    }
+
+    /// Jaccard similarity between two sets by index.
+    #[wasm_bindgen(js_name = "jaccardAt")]
+    pub fn jaccard_at(&self, i: usize, j: usize) -> Result<f64, JsValue> {
+        self.inner.jaccard_at(i, j)
+            .ok_or_else(|| JsValue::from_str("Index out of range"))
+    }
+
+    /// Union of two sets by index.
+    #[wasm_bindgen(js_name = "unionAt")]
+    pub fn union_at(&self, i: usize, j: usize) -> Result<JsRegionSet, JsValue> {
+        self.inner.union_at(i, j)
+            .map(|rs| JsRegionSet { region_set: rs })
+            .ok_or_else(|| JsValue::from_str("Index out of range"))
+    }
+
+    /// Setdiff of two sets by index (set[i] minus set[j]).
+    #[wasm_bindgen(js_name = "setdiffAt")]
+    pub fn setdiff_at(&self, i: usize, j: usize) -> Result<JsRegionSet, JsValue> {
+        self.inner.setdiff_at(i, j)
+            .map(|rs| JsRegionSet { region_set: rs })
+            .ok_or_else(|| JsValue::from_str("Index out of range"))
+    }
+
+    /// Union of all sets except the one at the given index.
+    #[wasm_bindgen(js_name = "unionExcept")]
+    pub fn union_except(&self, skip: usize) -> Result<JsRegionSet, JsValue> {
+        self.inner.union_except(skip)
+            .map(|rs| JsRegionSet { region_set: rs })
+            .ok_or_else(|| JsValue::from_str("Index out of range or list too small"))
+    }
+
+    /// Compute all N union-except results in O(n) via prefix/suffix.
+    /// Returns { union: RegionSet, excepts: RegionSet[] }.
+    #[wasm_bindgen(js_name = "bulkUnionExcept")]
+    pub fn bulk_union_except(&self) -> Result<JsValue, JsValue> {
+        let (full_union, excepts) = self.inner.bulk_union_except()
+            .ok_or_else(|| JsValue::from_str("Need at least 2 sets"))?;
+
+        #[derive(serde::Serialize)]
+        struct BulkResult {
+            union_regions: u32,
+            union_nucleotides: u32,
+            except_unique: Vec<u32>,
+        }
+
+        // For each file, compute setdiff(file_i, union_except_i).len()
+        let mut except_unique = Vec::with_capacity(excepts.len());
+        for (i, ue) in excepts.iter().enumerate() {
+            if let Some(rs) = self.inner.get(i) {
+                except_unique.push(rs.setdiff(ue).len() as u32);
+            } else {
+                except_unique.push(0);
+            }
+        }
+
+        let result = BulkResult {
+            union_regions: full_union.len() as u32,
+            union_nucleotides: full_union.nucleotides_length() as u32,
+            except_unique,
+        };
+        serde_wasm_bindgen::to_value(&result).map_err(|e| e.into())
+    }
+
+    /// Union of all sets.
+    #[wasm_bindgen(js_name = "unionAll")]
+    pub fn union_all(&self) -> Result<JsRegionSet, JsValue> {
+        self.inner.union_all()
+            .map(|rs| JsRegionSet { region_set: rs })
+            .ok_or_else(|| JsValue::from_str("Empty list"))
+    }
+
+    /// Intersection of all sets.
+    #[wasm_bindgen(js_name = "intersectAll")]
+    pub fn intersect_all(&self) -> Result<JsRegionSet, JsValue> {
+        self.inner.intersect_all()
+            .map(|rs| JsRegionSet { region_set: rs })
+            .ok_or_else(|| JsValue::from_str("Empty list"))
+    }
+
     /// Compute pairwise Jaccard similarity for all pairs of region sets.
     ///
     /// Returns { matrix: number[][], names: string[] | null }.