diff --git a/.github/workflows/rust-publish.yml b/.github/workflows/rust-publish.yml index 8e1c5b1a..e876c45d 100644 --- a/.github/workflows/rust-publish.yml +++ b/.github/workflows/rust-publish.yml @@ -157,8 +157,7 @@ jobs: secrets: inherit publish-wasm: - if: ${{ always() && inputs.wasm != false && needs.publish-all-crates.result == 'success' }} - needs: publish-all-crates + if: ${{ always() && inputs.wasm != false }} runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 diff --git a/gtars-cli/Cargo.toml b/gtars-cli/Cargo.toml index 0bb4b8e2..f43c9c89 100644 --- a/gtars-cli/Cargo.toml +++ b/gtars-cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "gtars-cli" -version = "0.8.0" +version = "0.9.0" edition = "2024" description = "Performance critical tools for genomic interval analysis. This is the CLI" homepage = "https://github.com/databio/gtars" @@ -25,7 +25,7 @@ gtars-igd = { path = "../gtars-igd", optional=true, version="0.5.1" } gtars-uniwig = { path = "../gtars-uniwig", optional=true, version="0.8.0" } gtars-overlaprs = { path = "../gtars-overlaprs", optional = true, version="0.5.1" } gtars-bbcache = { path = "../gtars-bbcache", optional=true, version="0.5.3" } -gtars-genomicdist = { path = "../gtars-genomicdist", optional=true, version="0.6.0" } +gtars-genomicdist = { path = "../gtars-genomicdist", optional=true, version="0.8.0" } gtars-core = { path = "../gtars-core", version="0.5.5", features=["bigbed", "http"] } # serialization @@ -38,7 +38,7 @@ name = "gtars" path = "src/main.rs" [features] -default = [] +default = ["scoring", "uniwig", "bbcache", "igd", "fragsplit", "overlaprs", "genomicdist"] scoring = ["dep:gtars-scoring"] uniwig = ["dep:gtars-uniwig"] bbcache = ["dep:gtars-bbcache"] diff --git a/gtars-cli/src/genomicdist/cli.rs b/gtars-cli/src/genomicdist/cli.rs index 49b14da9..8d241a9d 100644 --- a/gtars-cli/src/genomicdist/cli.rs +++ b/gtars-cli/src/genomicdist/cli.rs @@ -24,7 +24,7 @@ pub fn create_genomicdist_cli() -> Command { Arg::new("chrom-sizes") .long("chrom-sizes") .required(false) - .help("Path to chrom.sizes file (enables expected partitions and promoter trimming)"), + .help("Path to chrom.sizes file. When provided, region distribution uses a per-chromosome bin size derived from the reference genome (stable across files). Also enables expected partitions and promoter trimming."), ) .arg( arg!(--output ) @@ -35,7 +35,7 @@ pub fn create_genomicdist_cli() -> Command { arg!(--bins ) .required(false) .default_value("250") - .help("Number of bins for region distribution"), + .help("Number of bins for the region distribution. Bin width is derived as max_chrom_len/bins; shorter chromosomes get proportionally fewer bins."), ) .arg( Arg::new("signal-matrix") @@ -43,6 +43,30 @@ pub fn create_genomicdist_cli() -> Command { .required(false) .help("Path to open signal matrix TSV (enables cell-type open chromatin enrichment)"), ) + .arg( + Arg::new("fasta") + .long("fasta") + .required(false) + .help("Path to genome FASTA (.fa) or binary FASTA (.fab) file. Enables GC content; also enables dinucleotide frequencies when --dinucl-freq is set. Use .fab format (via gtars prep --fasta) for best performance."), + ) + .arg( + Arg::new("ignore-unk-chroms") + .long("ignore-unk-chroms") + .action(clap::ArgAction::SetTrue) + .help("When computing GC content, skip regions on chromosomes not in the FASTA (default: error)"), + ) + .arg( + Arg::new("dinucl-freq") + .long("dinucl-freq") + .action(clap::ArgAction::SetTrue) + .help("Compute per-region dinucleotide frequencies (expensive for wide regions; opt-in even when --fasta is provided)"), + ) + .arg( + Arg::new("dinucl-raw-counts") + .long("dinucl-raw-counts") + .action(clap::ArgAction::SetTrue) + .help("Return raw per-region dinucleotide counts instead of percentages (matches R GenomicDistributions' rawCounts=TRUE)"), + ) .arg( Arg::new("promoter-upstream") .long("promoter-upstream") diff --git a/gtars-cli/src/genomicdist/handlers.rs b/gtars-cli/src/genomicdist/handlers.rs index 41a940c4..6e3a6c24 100644 --- a/gtars-cli/src/genomicdist/handlers.rs +++ b/gtars-cli/src/genomicdist/handlers.rs @@ -9,12 +9,16 @@ use serde::Serialize; use gtars_core::models::{Region, RegionSet}; use gtars_core::utils::get_chrom_sizes; -use gtars_genomicdist::models::{ChromosomeStatistics, RegionBin, Strand, TssIndex}; +use gtars_genomicdist::models::{ + BinaryGenomeAssembly, ChromosomeStatistics, GenomeAssembly, RegionBin, SequenceAccess, + Strand, TssIndex, +}; use gtars_genomicdist::statistics::GenomicIntervalSetStatistics; use gtars_genomicdist::{ GeneModel, GenomicDistAnnotation, ExpectedPartitionResult, PartitionResult, calc_expected_partitions, calc_partitions, genome_partition_list, SignalMatrix, calc_summary_signal, ConditionStats, + calc_gc_content, calc_dinucl_freq, DINUCL_ORDER, median_abs_distance, }; @@ -28,6 +32,32 @@ struct GenomicDistOutput { expected_partitions: Option, #[serde(skip_serializing_if = "Option::is_none")] open_signal: Option, + #[serde(skip_serializing_if = "Option::is_none")] + gc_content: Option, + #[serde(skip_serializing_if = "Option::is_none")] + dinucl_freq: Option, +} + +#[derive(Serialize)] +struct GcContentOutput { + /// Mean GC content across all regions (0–1) + mean: f64, + /// Per-region GC content values (0–1), one per region in input order + per_region: Vec, +} + +#[derive(Serialize)] +struct DinuclFreqOutput { + /// Dinucleotide names in canonical order (matches DINUCL_ORDER) + dinucleotides: Vec, + /// `chr_start_end` label per region + region_labels: Vec, + /// Per-region matrix: outer is regions, inner is 16 values matching + /// `dinucleotides` order. Percentages (0–100) by default, or raw counts + /// if --dinucl-raw-counts flag was passed. + frequencies: Vec<[f64; 16]>, + /// Whether `frequencies` are raw counts (true) or percentages (false) + raw_counts: bool, } #[derive(Serialize)] @@ -65,6 +95,8 @@ pub fn run_genomicdist(matches: &ArgMatches) -> Result<()> { let chrom_sizes_path = matches.get_one::("chrom-sizes"); let output_path = matches.get_one::("output"); let signal_matrix_path = matches.get_one::("signal-matrix"); + let fasta_path = matches.get_one::("fasta"); + let ignore_unk_chroms = matches.get_flag("ignore-unk-chroms"); let n_bins: u32 = matches .get_one::("bins") .unwrap() @@ -91,7 +123,22 @@ pub fn run_genomicdist(matches: &ArgMatches) -> Result<()> { // --- Unconditional computations --- let widths = rs.calc_widths(); let chromosome_stats = rs.chromosome_statistics(); - let region_dist_map = rs.region_distribution_with_bins(n_bins); + let region_dist_map = match explicit_chrom_sizes.as_ref() { + Some(cs) => rs.region_distribution_with_chrom_sizes(n_bins, cs), + None => { + eprintln!( + "warning: --chrom-sizes not provided; using BED-file-derived bin width." + ); + eprintln!( + " Outputs will NOT be comparable across files or aligned with" + ); + eprintln!( + " reference genome positions. Pass --chrom-sizes for" + ); + eprintln!(" reference-aligned bins."); + rs.region_distribution_with_bins(n_bins) + } + }; let neighbor_distances = rs .calc_neighbor_distances() .map_err(|e| anyhow::anyhow!("Failed to compute neighbor distances: {}", e))?; @@ -214,6 +261,58 @@ pub fn run_genomicdist(matches: &ArgMatches) -> Result<()> { None => None, }; + // --- Optional: GC content + dinucleotide frequencies (require FASTA) --- + // --fasta enables GC content. Dinucleotide frequencies are additionally + // opt-in via --dinucl-freq because they can be expensive for region sets + // with very wide regions (each dinucleotide window of every region is + // inspected; width dominates cost). + let dinucl_raw_counts = matches.get_flag("dinucl-raw-counts"); + let compute_dinucl = matches.get_flag("dinucl-freq"); + let (gc_content_out, dinucl_freq_out) = match fasta_path { + Some(p) => { + // Auto-detect .fab binary format vs plain FASTA (HashMap fallback) + let assembly: Box = if p.ends_with(".fab") { + Box::new(BinaryGenomeAssembly::try_from(p.as_str()) + .map_err(|e| anyhow::anyhow!("Failed to load .fab: {}", e))?) + } else { + Box::new(GenomeAssembly::try_from(p.as_str()) + .map_err(|e| anyhow::anyhow!("Failed to load FASTA: {}", e))?) + }; + + let gc_per_region = calc_gc_content(&rs, assembly.as_ref(), ignore_unk_chroms) + .map_err(|e| anyhow::anyhow!("Failed to compute GC content: {}", e))?; + let gc_mean = if gc_per_region.is_empty() { + 0.0 + } else { + gc_per_region.iter().sum::() / gc_per_region.len() as f64 + }; + let gc_out = GcContentOutput { + mean: gc_mean, + per_region: gc_per_region, + }; + + let dinucl_out = if compute_dinucl { + let (labels, matrix) = calc_dinucl_freq( + &rs, assembly.as_ref(), dinucl_raw_counts, ignore_unk_chroms, + ) + .map_err(|e| anyhow::anyhow!("Failed to compute dinucl freq: {}", e))?; + Some(DinuclFreqOutput { + dinucleotides: DINUCL_ORDER + .iter() + .map(|d| d.to_string().unwrap_or_default()) + .collect(), + region_labels: labels, + frequencies: matrix, + raw_counts: dinucl_raw_counts, + }) + } else { + None + }; + (Some(gc_out), dinucl_out) + } + None => (None, None), + }; + // --- Build output --- let output = GenomicDistOutput { scalars: Scalars { @@ -232,6 +331,8 @@ pub fn run_genomicdist(matches: &ArgMatches) -> Result<()> { }, expected_partitions, open_signal, + gc_content: gc_content_out, + dinucl_freq: dinucl_freq_out, }; let compact = matches.get_flag("compact"); diff --git a/gtars-cli/src/overlaprs/cli.rs b/gtars-cli/src/overlaprs/cli.rs index ed6485ea..6382b661 100644 --- a/gtars-cli/src/overlaprs/cli.rs +++ b/gtars-cli/src/overlaprs/cli.rs @@ -1,14 +1,26 @@ -use clap::{Command, arg}; +use clap::{Arg, Command, arg}; pub const OVERLAP_CMD: &str = "overlaprs"; pub fn create_overlap_cli() -> Command { Command::new(OVERLAP_CMD) .author("NJL") - .about("Tokenize data into a universe") + .about("Tokenize a BED file against a universe of regions (overlap-based encoding).") .arg_required_else_help(true) - .arg(arg!(-q "The file you are tokenizing")) - .arg(arg!(-u "The universe you are tokenizing into")) + .arg( + Arg::new("query") + .short('q') + .long("query") + .required(true) + .help("Path to the BED file to tokenize"), + ) + .arg( + Arg::new("universe") + .short('u') + .long("universe") + .required(true) + .help("Path to the universe BED file to tokenize against"), + ) .arg(arg!(-e --backend "Which backend to use (ailist or bits)")) .arg(arg!(--streaming "Use streaming mode for very large universes (lower memory usage)")) } diff --git a/gtars-cli/src/prep/cli.rs b/gtars-cli/src/prep/cli.rs index f707da24..d150d38f 100644 --- a/gtars-cli/src/prep/cli.rs +++ b/gtars-cli/src/prep/cli.rs @@ -4,7 +4,7 @@ pub const PREP_CMD: &str = "prep"; pub fn create_prep_cli() -> Command { Command::new(PREP_CMD) - .about("Pre-serialize GTF gene models or signal matrices to binary for fast loading.") + .about("Pre-serialize GTF gene models, signal matrices, or FASTA files to binary for fast loading.") .arg( Arg::new("gtf") .long("gtf") @@ -17,11 +17,17 @@ pub fn create_prep_cli() -> Command { .required(false) .help("Path to signal matrix TSV/TSV.gz to serialize"), ) + .arg( + Arg::new("fasta") + .long("fasta") + .required(false) + .help("Path to FASTA file to convert to .fab binary (zero-copy mmap format)"), + ) .arg( Arg::new("output") .long("output") .short('o') .required(false) - .help("Output path (default: input path with .bin extension, stripping .gz first)"), + .help("Output path (default: input path with .bin/.fab extension)"), ) } diff --git a/gtars-cli/src/prep/handlers.rs b/gtars-cli/src/prep/handlers.rs index 6494a056..60bf08cd 100644 --- a/gtars-cli/src/prep/handlers.rs +++ b/gtars-cli/src/prep/handlers.rs @@ -5,6 +5,7 @@ use anyhow::Result; use clap::ArgMatches; use gtars_genomicdist::{GenomicDistAnnotation, SignalMatrix}; +use gtars_genomicdist::models::BinaryGenomeAssembly; /// Derive the default output path: strip `.gz` then append `.bin`. fn default_output_path(input: &str) -> String { @@ -15,10 +16,11 @@ fn default_output_path(input: &str) -> String { pub fn run_prep(matches: &ArgMatches) -> Result<()> { let gtf_path = matches.get_one::("gtf"); let signal_path = matches.get_one::("signal-matrix"); + let fasta_path = matches.get_one::("fasta"); let output_path = matches.get_one::("output"); - if gtf_path.is_none() && signal_path.is_none() { - anyhow::bail!("Provide at least one of --gtf or --signal-matrix"); + if gtf_path.is_none() && signal_path.is_none() && fasta_path.is_none() { + anyhow::bail!("Provide at least one of --gtf, --signal-matrix, or --fasta"); } if let Some(gtf) = gtf_path { @@ -79,5 +81,29 @@ pub fn run_prep(matches: &ArgMatches) -> Result<()> { ); } + if let Some(fa) = fasta_path { + let out = output_path + .cloned() + .unwrap_or_else(|| { + let stripped = fa.strip_suffix(".gz").unwrap_or(fa); + format!("{}.fab", stripped) + }); + + eprintln!("Converting FASTA to .fab: {}", fa); + let start = Instant::now(); + BinaryGenomeAssembly::write_from_fasta(Path::new(fa), Path::new(&out)) + .map_err(|e| anyhow::anyhow!("Failed to create .fab: {}", e))?; + + let size = std::fs::metadata(&out) + .map(|m| m.len()) + .unwrap_or(0); + eprintln!( + " wrote {} ({:.1} GB) in {:.1}s", + out, + size as f64 / 1_073_741_824.0, + start.elapsed().as_secs_f64() + ); + } + Ok(()) } diff --git a/gtars-cli/src/ranges/cli.rs b/gtars-cli/src/ranges/cli.rs index a8f3377e..a6c2fe5c 100644 --- a/gtars-cli/src/ranges/cli.rs +++ b/gtars-cli/src/ranges/cli.rs @@ -133,8 +133,14 @@ pub fn create_ranges_cli() -> Command { ) .subcommand( Command::new("gaps") - .about("Compute gaps between regions per chromosome.") + .about("Compute gaps between regions per chromosome, bounded by chrom sizes.") .arg(arg!(--input "Input BED file").required(true)) + .arg( + Arg::new("chrom-sizes") + .long("chrom-sizes") + .required(true) + .help("Path to chrom.sizes file"), + ) .arg(arg!(--output "Output BED file (default: stdout)").required(false)), ) .subcommand( diff --git a/gtars-cli/src/ranges/handlers.rs b/gtars-cli/src/ranges/handlers.rs index a090d3ee..beab1af2 100644 --- a/gtars-cli/src/ranges/handlers.rs +++ b/gtars-cli/src/ranges/handlers.rs @@ -122,7 +122,11 @@ pub fn run_ranges(matches: &ArgMatches) -> Result<()> { } Some(("gaps", m)) => { let rs = load_input(m)?; - let result = rs.gaps(); + let cs_path = m + .get_one::("chrom-sizes") + .expect("--chrom-sizes is required"); + let chrom_sizes = get_chrom_sizes(cs_path); + let result = rs.gaps(&chrom_sizes); write_output(&result, m.get_one::("output")) } Some(("intersect", m)) => { diff --git a/gtars-genomicdist/Cargo.toml b/gtars-genomicdist/Cargo.toml index 93e7f383..3b7f2eca 100644 --- a/gtars-genomicdist/Cargo.toml +++ b/gtars-genomicdist/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "gtars-genomicdist" -version = "0.6.0" +version = "0.8.0" edition = "2024" description = "Rust port of GenomicDistributions: tools for computing statistics for genomic interval sets" license = "MIT" @@ -15,6 +15,7 @@ regex = "1.11.1" thiserror = { workspace = true } serde = { workspace = true } bio = "3.0.0" +memmap2 = "0.9" [dev-dependencies] diff --git a/gtars-genomicdist/src/interval_ranges.rs b/gtars-genomicdist/src/interval_ranges.rs index aa2b952b..b013bbb8 100644 --- a/gtars-genomicdist/src/interval_ranges.rs +++ b/gtars-genomicdist/src/interval_ranges.rs @@ -138,11 +138,26 @@ pub trait IntervalRanges { /// tiles the covered positions exactly, with no overlaps. fn disjoin(&self) -> RegionSet; - /// Return the gaps between regions per chromosome. + /// Return the gaps between regions per chromosome, bounded by chromosome sizes. /// - /// Reduces the input first, then emits intervals between consecutive - /// reduced regions on each chromosome. - fn gaps(&self) -> RegionSet; + /// Reduces the input first, then emits intervals that tile the peak-free + /// regions of each chromosome listed in `chrom_sizes`: + /// + /// - a **leading gap** from position 0 to the first region's start + /// (omitted if the first region starts at 0), + /// - an **inter-region gap** between each consecutive pair of reduced + /// regions, + /// - a **trailing gap** from the last region's end to the chromosome + /// size (omitted if the last region already reaches the chromosome + /// end, or extends past it due to assembly mismatch), + /// - a **full-chromosome gap** `0..chrom_size` for any chromosome in + /// `chrom_sizes` that has no regions at all. + /// + /// Regions on chromosomes not present in `chrom_sizes` are skipped. + /// Regions that extend past the stated chromosome size are clipped to + /// `chrom_size` when computing the trailing gap, matching the + /// clipping behavior of `trim()`. + fn gaps(&self, chrom_sizes: &HashMap) -> RegionSet; /// Range-level intersection of two region sets. /// @@ -584,44 +599,97 @@ impl IntervalRanges for RegionSet { RegionSet::from(result) } - fn gaps(&self) -> RegionSet { + fn gaps(&self, chrom_sizes: &HashMap) -> RegionSet { let reduced = self.reduce(); - if reduced.regions.is_empty() { - return RegionSet::from(Vec::::new()); + + // Group reduced regions by chromosome so we can emit per-chrom gaps + // and also detect chromosomes with zero regions (full-chrom gaps). + let mut by_chr: HashMap<&str, Vec<&Region>> = HashMap::new(); + for r in &reduced.regions { + // Skip chromosomes we don't have a size for — can't bound trailing gaps, + // and including leading gaps for unknown-size chromosomes is misleading. + if chrom_sizes.contains_key(&r.chr) { + by_chr.entry(r.chr.as_str()).or_default().push(r); + } } let mut result: Vec = Vec::new(); - let mut i = 0; - while i < reduced.regions.len() { - let chr = &reduced.regions[i].chr; - let mut j = i + 1; - // Leading gap from position 0 to the first region on this chromosome - if reduced.regions[i].start > 0 { - result.push(Region { - chr: chr.clone(), - start: 0, - end: reduced.regions[i].start, - rest: None, - }); + // Emit gaps for every chromosome named in chrom_sizes, not just those + // present in the input — this way a chromosome with zero regions + // contributes a full-chromosome gap, matching bedtools complement. + for (chr_name, &chrom_size) in chrom_sizes.iter() { + if chrom_size == 0 { + continue; } - while j < reduced.regions.len() && reduced.regions[j].chr == *chr { - // Gap between consecutive reduced regions - let gap_start = reduced.regions[j - 1].end; - let gap_end = reduced.regions[j].start; - if gap_start < gap_end { + + match by_chr.get(chr_name.as_str()) { + None => { + // No regions on this chromosome — whole chromosome is a gap. result.push(Region { - chr: chr.clone(), - start: gap_start, - end: gap_end, + chr: chr_name.clone(), + start: 0, + end: chrom_size, rest: None, }); } - j += 1; + Some(regions) => { + // Leading gap from 0 to the first region's start. + if regions[0].start > 0 { + // Leading gap is clipped to chrom_size as a safety net: + // if the first region starts past chrom_size (assembly + // mismatch) we still produce a valid [0, chrom_size) gap. + let lead_end = regions[0].start.min(chrom_size); + result.push(Region { + chr: chr_name.clone(), + start: 0, + end: lead_end, + rest: None, + }); + } + + // Inter-region gaps. + for pair in regions.windows(2) { + let gap_start = pair[0].end; + let gap_end = pair[1].start; + if gap_start < gap_end { + // Clip both bounds to chrom_size so the whole emitted + // gap lies within the chromosome. + let cs = chrom_size; + let clipped_start = gap_start.min(cs); + let clipped_end = gap_end.min(cs); + if clipped_start < clipped_end { + result.push(Region { + chr: chr_name.clone(), + start: clipped_start, + end: clipped_end, + rest: None, + }); + } + } + } + + // Trailing gap from last region's end to chrom_size. + let last_end = regions[regions.len() - 1].end; + if last_end < chrom_size { + result.push(Region { + chr: chr_name.clone(), + start: last_end, + end: chrom_size, + rest: None, + }); + } + } } - i = j.max(i + 1); } + // Karyotypic chromosome ordering so output is stable across runs. + result.sort_by(|a, b| { + crate::utils::chrom_karyotype_key(&a.chr) + .cmp(&crate::utils::chrom_karyotype_key(&b.chr)) + .then(a.start.cmp(&b.start)) + }); + RegionSet::from(result) } @@ -1140,6 +1208,140 @@ pub fn pairwise_jaccard(sets: &[RegionSet]) -> Vec { matrix } +// --- Indexed operations on RegionSetList --- +// +// These let callers operate on pairs by index without cloning full RegionSets +// across an FFI boundary (wasm, Python, R). + +use gtars_core::models::RegionSetList; + +/// Indexed pair operations on a RegionSetList. +pub trait RegionSetListOps { + fn pintersect_at(&self, i: usize, j: usize) -> Option; + fn pintersect_count(&self, i: usize, j: usize) -> Option; + fn jaccard_at(&self, i: usize, j: usize) -> Option; + fn union_at(&self, i: usize, j: usize) -> Option; + fn setdiff_at(&self, i: usize, j: usize) -> Option; + fn region_count(&self, i: usize) -> Option; + fn union_except(&self, skip: usize) -> Option; + /// Compute union-of-all and all N union-except results in O(n) unions + /// using prefix/suffix arrays. Returns (full_union, vec_of_union_except). + fn bulk_union_except(&self) -> Option<(RegionSet, Vec)>; + /// Fold all sets into a single union. + fn union_all(&self) -> Option; + /// Fold all sets into a single intersection. + fn intersect_all(&self) -> Option; +} + +impl RegionSetListOps for RegionSetList { + fn pintersect_at(&self, i: usize, j: usize) -> Option { + let a = self.get(i)?; + let b = self.get(j)?; + Some(a.pintersect(b)) + } + + fn pintersect_count(&self, i: usize, j: usize) -> Option { + self.pintersect_at(i, j).map(|rs| rs.len() as u32) + } + + fn jaccard_at(&self, i: usize, j: usize) -> Option { + let a = self.get(i)?; + let b = self.get(j)?; + Some(a.jaccard(b)) + } + + fn union_at(&self, i: usize, j: usize) -> Option { + let a = self.get(i)?; + let b = self.get(j)?; + Some(a.union(b)) + } + + fn setdiff_at(&self, i: usize, j: usize) -> Option { + let a = self.get(i)?; + let b = self.get(j)?; + Some(a.setdiff(b)) + } + + fn region_count(&self, i: usize) -> Option { + self.get(i).map(|rs| rs.len() as u32) + } + + fn union_except(&self, skip: usize) -> Option { + let n = self.len(); + if n < 2 || skip >= n { return None; } + let first = if skip == 0 { 1 } else { 0 }; + let mut acc = self.get(first)?.clone(); + for k in (first + 1)..n { + if k == skip { continue; } + if let Some(other) = self.get(k) { + acc = acc.union(other); + } + } + Some(acc) + } + + fn bulk_union_except(&self) -> Option<(RegionSet, Vec)> { + let n = self.len(); + if n < 2 { return None; } + + // prefix[i] = union(set[0]..=set[i]) + let mut prefix = Vec::with_capacity(n); + prefix.push(self.get(0)?.clone()); + for i in 1..n { + let prev = &prefix[i - 1]; + prefix.push(prev.union(self.get(i)?)); + } + + // suffix[i] = union(set[i]..=set[n-1]), built incrementally from right + let mut suffix = vec![None; n]; + suffix[n - 1] = Some(self.get(n - 1)?.clone()); + for i in (0..n - 1).rev() { + suffix[i] = Some(self.get(i)?.union(suffix[i + 1].as_ref().unwrap())); + } + let suffix: Vec = suffix.into_iter().map(|s| s.unwrap()).collect(); + + let full_union = prefix[n - 1].clone(); + + // union_except[i] = union(prefix[i-1], suffix[i+1]) + let mut results = Vec::with_capacity(n); + for i in 0..n { + let except = match (i > 0, i < n - 1) { + (false, true) => suffix[1].clone(), + (true, false) => prefix[i - 1].clone(), + (true, true) => prefix[i - 1].union(&suffix[i + 1]), + (false, false) => unreachable!(), // n >= 2 + }; + results.push(except); + } + + Some((full_union, results)) + } + + fn union_all(&self) -> Option { + let n = self.len(); + if n == 0 { return None; } + let mut acc = self.get(0)?.clone(); + for i in 1..n { + if let Some(other) = self.get(i) { + acc = acc.union(other); + } + } + Some(acc) + } + + fn intersect_all(&self) -> Option { + let n = self.len(); + if n == 0 { return None; } + let mut acc = self.get(0)?.clone(); + for i in 1..n { + if let Some(other) = self.get(i) { + acc = acc.intersect(other); + } + } + Some(acc) + } +} + #[cfg(test)] mod tests { use super::*; @@ -2262,6 +2464,187 @@ mod tests { assert_eq!(ids[1], 0); // original idx 1 maps to cluster 0 } + // ── gaps tests ────────────────────────────────────────────────────── + + fn chrom_sizes(entries: &[(&str, u32)]) -> HashMap { + entries + .iter() + .map(|(c, s)| (c.to_string(), *s)) + .collect() + } + + #[test] + fn test_gaps_basic() { + // Three peaks on chr1 with gaps between them; leading + trailing + // gaps also present. + let rs = make_regionset(vec![ + ("chr1", 10, 20), + ("chr1", 30, 40), + ("chr1", 50, 60), + ]); + let cs = chrom_sizes(&[("chr1", 100)]); + let result = rs.gaps(&cs); + let gaps: Vec<(&str, u32, u32)> = result + .regions + .iter() + .map(|r| (r.chr.as_str(), r.start, r.end)) + .collect(); + assert_eq!( + gaps, + vec![ + ("chr1", 0, 10), // leading + ("chr1", 20, 30), // between peak 1 and 2 + ("chr1", 40, 50), // between peak 2 and 3 + ("chr1", 60, 100), // trailing + ] + ); + } + + #[test] + fn test_gaps_peak_at_origin_no_leading() { + // First peak starts at 0 — no leading gap. + let rs = make_regionset(vec![("chr1", 0, 10), ("chr1", 20, 30)]); + let cs = chrom_sizes(&[("chr1", 100)]); + let result = rs.gaps(&cs); + let gaps: Vec<(u32, u32)> = result + .regions + .iter() + .map(|r| (r.start, r.end)) + .collect(); + assert_eq!(gaps, vec![(10, 20), (30, 100)]); + } + + #[test] + fn test_gaps_peak_at_chrom_end_no_trailing() { + // Last peak ends at chrom_size — no trailing gap. + let rs = make_regionset(vec![("chr1", 10, 20), ("chr1", 80, 100)]); + let cs = chrom_sizes(&[("chr1", 100)]); + let result = rs.gaps(&cs); + let gaps: Vec<(u32, u32)> = result + .regions + .iter() + .map(|r| (r.start, r.end)) + .collect(); + assert_eq!(gaps, vec![(0, 10), (20, 80)]); + } + + #[test] + fn test_gaps_peak_past_chrom_end_clipped() { + // Last peak extends past chrom_size — should be clipped, no trailing. + let rs = make_regionset(vec![("chr1", 10, 20), ("chr1", 80, 150)]); + let cs = chrom_sizes(&[("chr1", 100)]); + let result = rs.gaps(&cs); + let gaps: Vec<(u32, u32)> = result + .regions + .iter() + .map(|r| (r.start, r.end)) + .collect(); + assert_eq!(gaps, vec![(0, 10), (20, 80)]); + } + + #[test] + fn test_gaps_empty_regionset_populated_chrom_sizes() { + // No regions, but chrom_sizes has entries — emit whole-chrom gaps. + let rs = RegionSet::from(Vec::::new()); + let cs = chrom_sizes(&[("chr1", 100), ("chr2", 50)]); + let result = rs.gaps(&cs); + let mut gaps: Vec<(String, u32, u32)> = result + .regions + .iter() + .map(|r| (r.chr.clone(), r.start, r.end)) + .collect(); + gaps.sort(); + assert_eq!( + gaps, + vec![ + ("chr1".to_string(), 0, 100), + ("chr2".to_string(), 0, 50), + ] + ); + } + + #[test] + fn test_gaps_empty_regionset_empty_chrom_sizes() { + let rs = RegionSet::from(Vec::::new()); + let cs: HashMap = HashMap::new(); + let result = rs.gaps(&cs); + assert!(result.regions.is_empty()); + } + + #[test] + fn test_gaps_chromosome_not_in_chrom_sizes_skipped() { + // Peak on chr2 with no chr2 entry in chrom_sizes — should be ignored. + let rs = make_regionset(vec![("chr1", 10, 20), ("chr2", 5, 15)]); + let cs = chrom_sizes(&[("chr1", 100)]); + let result = rs.gaps(&cs); + // Only chr1 gaps emitted. + for r in &result.regions { + assert_eq!(r.chr, "chr1"); + } + } + + #[test] + fn test_gaps_full_chrom_gap_for_unrepresented_chrom() { + // chrom_sizes has chr2 but input has no chr2 peaks — emit whole chr2. + let rs = make_regionset(vec![("chr1", 10, 20)]); + let cs = chrom_sizes(&[("chr1", 100), ("chr2", 200)]); + let result = rs.gaps(&cs); + let chr2_gaps: Vec<(u32, u32)> = result + .regions + .iter() + .filter(|r| r.chr == "chr2") + .map(|r| (r.start, r.end)) + .collect(); + assert_eq!(chr2_gaps, vec![(0, 200)]); + } + + #[test] + fn test_gaps_overlapping_peaks_reduced() { + // Overlapping peaks get merged by reduce() before gap computation. + let rs = make_regionset(vec![ + ("chr1", 10, 30), + ("chr1", 25, 40), // overlaps with previous + ("chr1", 50, 60), + ]); + let cs = chrom_sizes(&[("chr1", 100)]); + let result = rs.gaps(&cs); + let gaps: Vec<(u32, u32)> = result + .regions + .iter() + .map(|r| (r.start, r.end)) + .collect(); + // After reduce: [10,40], [50,60] → gaps: [0,10], [40,50], [60,100] + assert_eq!(gaps, vec![(0, 10), (40, 50), (60, 100)]); + } + + #[test] + fn test_gaps_karyotypic_ordering() { + // Output should be karyotypically ordered regardless of chrom_sizes insertion order. + let rs = make_regionset(vec![ + ("chr2", 10, 20), + ("chr1", 10, 20), + ("chr10", 10, 20), + ]); + let cs = chrom_sizes(&[("chr10", 100), ("chr1", 100), ("chr2", 100)]); + let result = rs.gaps(&cs); + // First gap on each chr; collect chr names in order of appearance. + let order: Vec<&str> = result + .regions + .iter() + .map(|r| r.chr.as_str()) + .scan("", |prev, chr| { + if chr != *prev { + *prev = chr; + Some(chr) + } else { + Some("") // repeat, skip + } + }) + .filter(|s| !s.is_empty()) + .collect(); + assert_eq!(order, vec!["chr1", "chr2", "chr10"]); + } + #[test] fn test_resize_center_large_coords() { // Regression: coordinates near u32::MAX/2 should not overflow in midpoint calc @@ -2420,4 +2803,283 @@ mod tests { // overlap with [0,10): [5,10)=5bp, overlap with [20,30): [20,25)=5bp assert_eq!(super::merge_intersection_bp(&a, &b), 10); } + + // ── RegionSetListOps tests ───────────────────────────────────────── + + fn make_rsl(sets: Vec) -> RegionSetList { + RegionSetList::from(sets) + } + + #[rstest] + fn test_rsl_pintersect_count() { + let a = make_regionset(vec![("chr1", 0, 100), ("chr1", 200, 300)]); + let b = make_regionset(vec![("chr1", 50, 150), ("chr1", 250, 350)]); + let rsl = make_rsl(vec![a, b]); + // a has 2 regions, b has 2 regions, both overlap each other + let count = rsl.pintersect_count(0, 1).unwrap(); + assert_eq!(count, 2); // both pairs overlap + } + + #[rstest] + fn test_rsl_pintersect_count_no_overlap() { + let a = make_regionset(vec![("chr1", 0, 10)]); + let b = make_regionset(vec![("chr1", 100, 200)]); + let rsl = make_rsl(vec![a, b]); + // paired by index: chr1:0-10 vs chr1:100-200 → no genomic overlap, + // but pintersect produces a zero-width region (start=end) per pair + let count = rsl.pintersect_count(0, 1).unwrap(); + assert_eq!(count, 1); // zero-width region still counted + } + + #[rstest] + fn test_rsl_pintersect_count_out_of_bounds() { + let a = make_regionset(vec![("chr1", 0, 100)]); + let rsl = make_rsl(vec![a]); + assert!(rsl.pintersect_count(0, 5).is_none()); + } + + #[rstest] + fn test_rsl_jaccard_at() { + let a = make_regionset(vec![("chr1", 0, 100)]); + let b = make_regionset(vec![("chr1", 0, 100)]); + let rsl = make_rsl(vec![a, b]); + let j = rsl.jaccard_at(0, 1).unwrap(); + assert!((j - 1.0).abs() < 1e-9, "identical sets should have jaccard=1.0"); + } + + #[rstest] + fn test_rsl_jaccard_at_disjoint() { + let a = make_regionset(vec![("chr1", 0, 100)]); + let b = make_regionset(vec![("chr1", 200, 300)]); + let rsl = make_rsl(vec![a, b]); + let j = rsl.jaccard_at(0, 1).unwrap(); + assert!((j - 0.0).abs() < 1e-9, "disjoint sets should have jaccard=0.0"); + } + + #[rstest] + fn test_rsl_union_at() { + let a = make_regionset(vec![("chr1", 0, 100)]); + let b = make_regionset(vec![("chr1", 50, 150)]); + let rsl = make_rsl(vec![a, b]); + let u = rsl.union_at(0, 1).unwrap(); + assert_eq!(u.regions.len(), 1); + assert_eq!(u.regions[0].start, 0); + assert_eq!(u.regions[0].end, 150); + } + + #[rstest] + fn test_rsl_setdiff_at() { + let a = make_regionset(vec![("chr1", 0, 100)]); + let b = make_regionset(vec![("chr1", 50, 150)]); + let rsl = make_rsl(vec![a, b]); + let diff = rsl.setdiff_at(0, 1).unwrap(); + // a minus b: chr1:0-50 + assert_eq!(diff.regions.len(), 1); + assert_eq!(diff.regions[0].start, 0); + assert_eq!(diff.regions[0].end, 50); + } + + #[rstest] + fn test_rsl_region_count() { + let a = make_regionset(vec![("chr1", 0, 100), ("chr1", 200, 300)]); + let b = make_regionset(vec![("chr1", 50, 150)]); + let rsl = make_rsl(vec![a, b]); + assert_eq!(rsl.region_count(0).unwrap(), 2); + assert_eq!(rsl.region_count(1).unwrap(), 1); + assert!(rsl.region_count(5).is_none()); + } + + #[rstest] + fn test_rsl_union_all() { + let a = make_regionset(vec![("chr1", 0, 100)]); + let b = make_regionset(vec![("chr1", 50, 200)]); + let c = make_regionset(vec![("chr1", 150, 300)]); + let rsl = make_rsl(vec![a, b, c]); + let u = rsl.union_all().unwrap(); + assert_eq!(u.regions.len(), 1); + assert_eq!(u.regions[0].start, 0); + assert_eq!(u.regions[0].end, 300); + } + + #[rstest] + fn test_rsl_union_all_empty() { + let rsl = make_rsl(vec![]); + assert!(rsl.union_all().is_none()); + } + + #[rstest] + fn test_rsl_union_all_single() { + let a = make_regionset(vec![("chr1", 10, 50)]); + let rsl = make_rsl(vec![a]); + let u = rsl.union_all().unwrap(); + assert_eq!(u.regions.len(), 1); + assert_eq!(u.regions[0].start, 10); + assert_eq!(u.regions[0].end, 50); + } + + #[rstest] + fn test_rsl_intersect_all() { + // Three overlapping sets — intersection is the region shared by all three + let a = make_regionset(vec![("chr1", 0, 100)]); + let b = make_regionset(vec![("chr1", 30, 200)]); + let c = make_regionset(vec![("chr1", 60, 150)]); + let rsl = make_rsl(vec![a, b, c]); + let inter = rsl.intersect_all().unwrap(); + assert_eq!(inter.regions.len(), 1); + assert_eq!(inter.regions[0].start, 60); + assert_eq!(inter.regions[0].end, 100); + } + + #[rstest] + fn test_rsl_intersect_all_disjoint() { + let a = make_regionset(vec![("chr1", 0, 50)]); + let b = make_regionset(vec![("chr1", 100, 200)]); + let rsl = make_rsl(vec![a, b]); + let inter = rsl.intersect_all().unwrap(); + assert_eq!(inter.regions.len(), 0); + } + + #[rstest] + fn test_rsl_intersect_all_empty() { + let rsl = make_rsl(vec![]); + assert!(rsl.intersect_all().is_none()); + } + + #[rstest] + fn test_rsl_intersect_all_different_sizes() { + // This is the case where pintersect would give wrong results: + // sets have different numbers of regions, but share genomic coverage + let a = make_regionset(vec![("chr1", 0, 100), ("chr1", 200, 300)]); + let b = make_regionset(vec![("chr1", 50, 250)]); + let rsl = make_rsl(vec![a, b]); + let inter = rsl.intersect_all().unwrap(); + // Shared coverage: [50,100) and [200,250) + assert_eq!(inter.regions.len(), 2); + assert_eq!(inter.regions[0].start, 50); + assert_eq!(inter.regions[0].end, 100); + assert_eq!(inter.regions[1].start, 200); + assert_eq!(inter.regions[1].end, 250); + } + + #[rstest] + fn test_rsl_union_except() { + let a = make_regionset(vec![("chr1", 0, 100)]); + let b = make_regionset(vec![("chr1", 200, 300)]); + let c = make_regionset(vec![("chr1", 400, 500)]); + let rsl = make_rsl(vec![a, b, c]); + // union_except(1) = union of sets 0 and 2 (skip set 1) + let ue = rsl.union_except(1).unwrap(); + assert_eq!(ue.regions.len(), 2); + assert_eq!(ue.regions[0].start, 0); + assert_eq!(ue.regions[0].end, 100); + assert_eq!(ue.regions[1].start, 400); + assert_eq!(ue.regions[1].end, 500); + } + + #[rstest] + fn test_rsl_union_except_too_small() { + let a = make_regionset(vec![("chr1", 0, 100)]); + let rsl = make_rsl(vec![a]); + assert!(rsl.union_except(0).is_none()); + } + + #[rstest] + fn test_rsl_bulk_union_except_n2() { + let a = make_regionset(vec![("chr1", 0, 100)]); + let b = make_regionset(vec![("chr1", 200, 300)]); + let rsl = make_rsl(vec![a, b]); + let (full_union, excepts) = rsl.bulk_union_except().unwrap(); + + // Full union covers both regions + assert_eq!(full_union.regions.len(), 2); + + // except[0] = union of everything except set 0 = set 1 + assert_eq!(excepts.len(), 2); + assert_eq!(excepts[0].regions.len(), 1); + assert_eq!(excepts[0].regions[0].start, 200); + assert_eq!(excepts[0].regions[0].end, 300); + + // except[1] = union of everything except set 1 = set 0 + assert_eq!(excepts[1].regions.len(), 1); + assert_eq!(excepts[1].regions[0].start, 0); + assert_eq!(excepts[1].regions[0].end, 100); + } + + #[rstest] + fn test_rsl_bulk_union_except_n3() { + let a = make_regionset(vec![("chr1", 0, 100)]); + let b = make_regionset(vec![("chr1", 200, 300)]); + let c = make_regionset(vec![("chr1", 400, 500)]); + let rsl = make_rsl(vec![a, b, c]); + let (full_union, excepts) = rsl.bulk_union_except().unwrap(); + + assert_eq!(full_union.regions.len(), 3); + assert_eq!(excepts.len(), 3); + + // except[0] = union(b, c) + assert_eq!(excepts[0].regions.len(), 2); + assert_eq!(excepts[0].regions[0].start, 200); + assert_eq!(excepts[0].regions[1].start, 400); + + // except[1] = union(a, c) + assert_eq!(excepts[1].regions.len(), 2); + assert_eq!(excepts[1].regions[0].start, 0); + assert_eq!(excepts[1].regions[1].start, 400); + + // except[2] = union(a, b) + assert_eq!(excepts[2].regions.len(), 2); + assert_eq!(excepts[2].regions[0].start, 0); + assert_eq!(excepts[2].regions[1].start, 200); + } + + #[rstest] + fn test_rsl_bulk_union_except_too_small() { + let a = make_regionset(vec![("chr1", 0, 100)]); + let rsl = make_rsl(vec![a]); + assert!(rsl.bulk_union_except().is_none()); + + let rsl_empty = make_rsl(vec![]); + assert!(rsl_empty.bulk_union_except().is_none()); + } + + #[rstest] + fn test_rsl_bulk_union_except_matches_union_except() { + // Verify bulk algorithm produces same results as individual union_except calls + let a = make_regionset(vec![("chr1", 0, 100), ("chr2", 50, 200)]); + let b = make_regionset(vec![("chr1", 80, 180), ("chr2", 100, 300)]); + let c = make_regionset(vec![("chr1", 150, 250)]); + let d = make_regionset(vec![("chr2", 0, 150)]); + let rsl = make_rsl(vec![a, b, c, d]); + + let (_, bulk_excepts) = rsl.bulk_union_except().unwrap(); + + for i in 0..4 { + let individual = rsl.union_except(i).unwrap(); + assert_eq!( + bulk_excepts[i].regions.len(), + individual.regions.len(), + "region count mismatch at index {}", + i + ); + for (j, (bulk_r, indiv_r)) in bulk_excepts[i] + .regions + .iter() + .zip(individual.regions.iter()) + .enumerate() + { + assert_eq!( + bulk_r.chr, indiv_r.chr, + "chr mismatch at except[{}][{}]", i, j + ); + assert_eq!( + bulk_r.start, indiv_r.start, + "start mismatch at except[{}][{}]", i, j + ); + assert_eq!( + bulk_r.end, indiv_r.end, + "end mismatch at except[{}][{}]", i, j + ); + } + } + } } diff --git a/gtars-genomicdist/src/lib.rs b/gtars-genomicdist/src/lib.rs index f935cc26..8bf59a29 100644 --- a/gtars-genomicdist/src/lib.rs +++ b/gtars-genomicdist/src/lib.rs @@ -41,6 +41,7 @@ pub use gtars_core::models::CoordinateMode; pub use consensus::{ConsensusRegion, consensus}; pub use interval_ranges::IntervalRanges; pub use interval_ranges::pairwise_jaccard; +pub use interval_ranges::RegionSetListOps; pub use partitions::{ calc_expected_partitions, calc_partitions, genome_partition_list, ExpectedPartitionResult, ExpectedPartitionRow, GeneModel, PartitionList, PartitionResult, @@ -49,4 +50,4 @@ pub use models::{SortedRegionSet, Strand, StrandedRegionSet}; pub use signal::{calc_summary_signal, ConditionStats, SignalMatrix, SignalSummaryResult}; pub use statistics::GenomicIntervalSetStatistics; pub use utils::{chrom_karyotype_key, median_abs_distance}; -pub use statistics::{calc_dinucl_freq, calc_dinucl_freq_per_region, calc_gc_content, DINUCL_ORDER}; +pub use statistics::{calc_dinucl_freq, calc_gc_content, DINUCL_ORDER}; diff --git a/gtars-genomicdist/src/models.rs b/gtars-genomicdist/src/models.rs index 58e08f69..079d4dd7 100644 --- a/gtars-genomicdist/src/models.rs +++ b/gtars-genomicdist/src/models.rs @@ -1,9 +1,12 @@ use crate::errors::GtarsGenomicDistError; use bio::io::fasta; use gtars_core::models::{CoordinateMode, Region, RegionSet}; +use memmap2::Mmap; use serde::{Deserialize, Serialize}; -use std::{collections::HashMap, fmt::Debug}; +use std::collections::HashMap; +use std::fmt::Debug; use std::fs::File; +use std::io::{BufWriter, Write}; use std::path::Path; /// A `RegionSet` that is guaranteed to be sorted by (chr, start). @@ -131,13 +134,31 @@ pub struct RegionBin { pub rid: u32, } +/// Trait for types that provide sequence access to a reference genome. +/// +/// Implemented by [`GenomeAssembly`] (in-memory HashMap) and +/// [`BinaryGenomeAssembly`] (mmap .fab binary). Functions like +/// `calc_gc_content` accept `&impl SequenceAccess` to work with either. +pub trait SequenceAccess { + /// Get the sequence for a genomic region. Returns owned bytes. + fn get_sequence(&self, coords: &Region) -> Result, GtarsGenomicDistError>; + + /// Check whether a chromosome exists in the assembly. + fn contains_chr(&self, chr: &str) -> bool; +} + +/// In-memory genome assembly backed by a HashMap of chromosome sequences. +/// +/// Loads the entire FASTA into memory on construction. Slower to construct +/// (~2s for hg38) but provides zero-copy `&[u8]` access to sequences, +/// making per-region operations like GC content highly vectorizable. +/// No `.fai` index required. pub struct GenomeAssembly { seq_map: HashMap>, } impl TryFrom<&str> for GenomeAssembly { type Error = GtarsGenomicDistError; - fn try_from(value: &str) -> Result { GenomeAssembly::try_from(Path::new(value)) } @@ -145,9 +166,7 @@ impl TryFrom<&str> for GenomeAssembly { impl TryFrom for GenomeAssembly { type Error = GtarsGenomicDistError; - fn try_from(value: String) -> Result { - // println!("Converting String to Path: {}", value); GenomeAssembly::try_from(Path::new(&value)) } } @@ -155,16 +174,11 @@ impl TryFrom for GenomeAssembly { impl TryFrom<&Path> for GenomeAssembly { type Error = GtarsGenomicDistError; - /// - /// Create a new [GenomeAssembly] from fasta file - /// fn try_from(value: &Path) -> Result { let file = File::open(value)?; let genome = fasta::Reader::new(file); - let records = genome.records(); - // store the genome in a hashmap let mut seq_map: HashMap> = HashMap::new(); for record in records { match record { @@ -179,13 +193,12 @@ impl TryFrom<&Path> for GenomeAssembly { } } } - Ok(GenomeAssembly { seq_map }) } } impl GenomeAssembly { - pub fn seq_from_region<'a>(&self, coords: &Region) -> Result<&[u8], GtarsGenomicDistError> { + pub fn seq_from_region(&self, coords: &Region) -> Result<&[u8], GtarsGenomicDistError> { let chr = &coords.chr; let start = coords.start as usize; let end = coords.end as usize; @@ -196,10 +209,7 @@ impl GenomeAssembly { } else { Err(GtarsGenomicDistError::CustomError(format!( "Invalid range: start={}, end={} for chromosome {} with length {}", - start, - end, - chr, - seq.len() + start, end, chr, seq.len() ))) } } else { @@ -215,6 +225,235 @@ impl GenomeAssembly { } } +impl SequenceAccess for GenomeAssembly { + fn get_sequence(&self, coords: &Region) -> Result, GtarsGenomicDistError> { + self.seq_from_region(coords).map(|s| s.to_vec()) + } + + fn contains_chr(&self, chr: &str) -> bool { + self.seq_map.contains_key(chr) + } +} + +// --- Binary FASTA (.fab) format --- + +const FAB_MAGIC: &[u8; 4] = b"GFAB"; +const FAB_VERSION: u8 = 1; + +/// Memory-mapped genome assembly backed by a binary FASTA (.fab) file. +/// +/// The .fab format stores sequences contiguously without line wrapping, +/// enabling mmap + zero-copy `&[u8]` access with both instant construction +/// and zero-copy per-region performance. +/// +/// Create .fab files with [`BinaryGenomeAssembly::write_from_fasta`] or +/// via `gtars prep --fasta `. +#[derive(Debug)] +pub struct BinaryGenomeAssembly { + mmap: Mmap, + /// name → (offset_in_file, sequence_length) + index: HashMap, +} + +impl BinaryGenomeAssembly { + /// Open a .fab binary FASTA file via mmap. + pub fn from_file(path: &Path) -> Result { + let file = File::open(path).map_err(|e| { + GtarsGenomicDistError::CustomError(format!( + "Failed to open .fab file '{}': {}", + path.display(), e + )) + })?; + let mmap = unsafe { Mmap::map(&file) }.map_err(|e| { + GtarsGenomicDistError::CustomError(format!( + "Failed to mmap .fab file '{}': {}", + path.display(), e + )) + })?; + + // Parse header + if mmap.len() < 9 { + return Err(GtarsGenomicDistError::CustomError( + "Invalid .fab file: too short".into(), + )); + } + if &mmap[0..4] != FAB_MAGIC { + return Err(GtarsGenomicDistError::CustomError( + "Invalid .fab file: bad magic bytes".into(), + )); + } + let version = mmap[4]; + if version != FAB_VERSION { + return Err(GtarsGenomicDistError::CustomError(format!( + "Unsupported .fab version: {} (expected {})", + version, FAB_VERSION + ))); + } + let n_chroms = u32::from_le_bytes(mmap[5..9].try_into().unwrap()) as usize; + + // Parse index + let mut pos = 9; + let mut index = HashMap::with_capacity(n_chroms); + for _ in 0..n_chroms { + if pos + 2 > mmap.len() { + return Err(GtarsGenomicDistError::CustomError( + "Invalid .fab file: truncated index".into(), + )); + } + let name_len = u16::from_le_bytes(mmap[pos..pos + 2].try_into().unwrap()) as usize; + pos += 2; + if pos + name_len + 16 > mmap.len() { + return Err(GtarsGenomicDistError::CustomError( + "Invalid .fab file: truncated index entry".into(), + )); + } + let name = std::str::from_utf8(&mmap[pos..pos + name_len]) + .map_err(|e| { + GtarsGenomicDistError::CustomError(format!( + "Invalid .fab file: non-UTF8 chromosome name: {}", + e + )) + })? + .to_string(); + pos += name_len; + let offset = + u64::from_le_bytes(mmap[pos..pos + 8].try_into().unwrap()) as usize; + pos += 8; + let length = + u64::from_le_bytes(mmap[pos..pos + 8].try_into().unwrap()) as usize; + pos += 8; + index.insert(name, (offset, length)); + } + + Ok(BinaryGenomeAssembly { mmap, index }) + } + + /// Get the sequence for a region as a zero-copy `&[u8]` slice. + pub fn seq_from_region(&self, coords: &Region) -> Result<&[u8], GtarsGenomicDistError> { + let chr = &coords.chr; + let start = coords.start as usize; + let end = coords.end as usize; + + let &(offset, length) = self.index.get(chr).ok_or_else(|| { + GtarsGenomicDistError::CustomError(format!( + "Unknown chromosome found in region set: {}", + chr + )) + })?; + + if end > length || start > end { + return Err(GtarsGenomicDistError::CustomError(format!( + "Invalid range: start={}, end={} for chromosome {} with length {}", + start, end, chr, length + ))); + } + + let file_start = offset + start; + let file_end = offset + end; + if file_end > self.mmap.len() { + return Err(GtarsGenomicDistError::CustomError(format!( + "Corrupted .fab file: sequence data for {} extends beyond file boundary", + chr + ))); + } + + Ok(&self.mmap[file_start..file_end]) + } + + pub fn contains_chr(&self, chr: &str) -> bool { + self.index.contains_key(chr) + } + + /// Convert a FASTA file to .fab binary format. + pub fn write_from_fasta( + fasta_path: &Path, + output_path: &Path, + ) -> Result<(), GtarsGenomicDistError> { + // Read all sequences into memory (same as GenomeAssembly) + let file = File::open(fasta_path)?; + let reader = fasta::Reader::new(file); + + let mut chroms: Vec<(String, Vec)> = Vec::new(); + for record in reader.records() { + let record = record.map_err(|e| { + GtarsGenomicDistError::CustomError(format!( + "Error reading FASTA: {}", e + )) + })?; + chroms.push((record.id().to_string(), record.seq().to_owned())); + } + + // Compute index: header size first + let mut header_size: usize = 4 + 1 + 4; // magic + version + n_chroms + for (name, _) in &chroms { + header_size += 2 + name.len() + 8 + 8; // name_len + name + offset + length + } + + // Write + let out = File::create(output_path).map_err(|e| { + GtarsGenomicDistError::CustomError(format!( + "Failed to create .fab file '{}': {}", + output_path.display(), e + )) + })?; + let mut w = BufWriter::new(out); + + // Header + w.write_all(FAB_MAGIC)?; + w.write_all(&[FAB_VERSION])?; + w.write_all(&(chroms.len() as u32).to_le_bytes())?; + + // Index + let mut offset = header_size; + for (name, seq) in &chroms { + w.write_all(&(name.len() as u16).to_le_bytes())?; + w.write_all(name.as_bytes())?; + w.write_all(&(offset as u64).to_le_bytes())?; + w.write_all(&(seq.len() as u64).to_le_bytes())?; + offset += seq.len(); + } + + // Sequences + for (_, seq) in &chroms { + w.write_all(seq)?; + } + + w.flush()?; + Ok(()) + } +} + +impl TryFrom<&str> for BinaryGenomeAssembly { + type Error = GtarsGenomicDistError; + fn try_from(value: &str) -> Result { + BinaryGenomeAssembly::from_file(Path::new(value)) + } +} + +impl TryFrom for BinaryGenomeAssembly { + type Error = GtarsGenomicDistError; + fn try_from(value: String) -> Result { + BinaryGenomeAssembly::from_file(Path::new(&value)) + } +} + +impl TryFrom<&Path> for BinaryGenomeAssembly { + type Error = GtarsGenomicDistError; + fn try_from(value: &Path) -> Result { + BinaryGenomeAssembly::from_file(value) + } +} + +impl SequenceAccess for BinaryGenomeAssembly { + fn get_sequence(&self, coords: &Region) -> Result, GtarsGenomicDistError> { + self.seq_from_region(coords).map(|s| s.to_vec()) + } + + fn contains_chr(&self, chr: &str) -> bool { + self.index.contains_key(chr) + } +} + #[derive(Debug, PartialEq, Eq, Hash, Clone, Copy)] pub enum Dinucleotide { Aa, @@ -672,6 +911,80 @@ mod tests { assert!(ga.unwrap().contains_chr("chr1")); } + #[test] + fn test_binary_genome_assembly_round_trip() { + // Write .fab from base.fa, then read it back and verify sequences match + let fasta_path = get_fasta_path("base.fa"); + let fab_path = fasta_path.with_extension("fa.test.fab"); + + BinaryGenomeAssembly::write_from_fasta(&fasta_path, &fab_path).unwrap(); + let bga = BinaryGenomeAssembly::from_file(&fab_path).unwrap(); + + // Verify chromosomes exist + assert!(bga.contains_chr("chr1")); + assert!(bga.contains_chr("chr2")); + assert!(bga.contains_chr("chrX")); + assert!(!bga.contains_chr("chr3")); + + // Verify sequences match HashMap GenomeAssembly + let ga = GenomeAssembly::try_from(fasta_path.as_path()).unwrap(); + + let region1 = Region { chr: "chr1".into(), start: 0, end: 4, rest: None }; + assert_eq!(bga.seq_from_region(®ion1).unwrap(), ga.seq_from_region(®ion1).unwrap()); + assert_eq!(bga.seq_from_region(®ion1).unwrap(), b"GGAA"); + + let region2 = Region { chr: "chrX".into(), start: 2, end: 6, rest: None }; + assert_eq!(bga.seq_from_region(®ion2).unwrap(), ga.seq_from_region(®ion2).unwrap()); + assert_eq!(bga.seq_from_region(®ion2).unwrap(), b"GGGG"); + + // Out-of-bounds error + let bad_region = Region { chr: "chr1".into(), start: 0, end: 100, rest: None }; + assert!(bga.seq_from_region(&bad_region).is_err()); + + // Unknown chromosome error + let unk_region = Region { chr: "chr99".into(), start: 0, end: 1, rest: None }; + assert!(bga.seq_from_region(&unk_region).is_err()); + + // Clean up + std::fs::remove_file(&fab_path).ok(); + } + + #[test] + fn test_binary_genome_assembly_bad_magic() { + let dir = tempfile::tempdir().unwrap(); + let fab_path = dir.path().join("bad.fab"); + // Need at least 9 bytes to pass the "too short" check + std::fs::write(&fab_path, b"XXXX\x01\x00\x00\x00\x00").unwrap(); + let result = BinaryGenomeAssembly::from_file(&fab_path); + assert!(result.is_err()); + assert!(result.unwrap_err().to_string().contains("bad magic")); + } + + #[test] + fn test_binary_genome_assembly_gc_parity() { + // Verify calc_gc_content produces identical results from .fab and HashMap + use crate::statistics::calc_gc_content; + + let fasta_path = get_fasta_path("base.fa"); + let fab_path = fasta_path.with_extension("fa.test2.fab"); + BinaryGenomeAssembly::write_from_fasta(&fasta_path, &fab_path).unwrap(); + + let ga = GenomeAssembly::try_from(fasta_path.as_path()).unwrap(); + let bga = BinaryGenomeAssembly::from_file(&fab_path).unwrap(); + + let regions = vec![ + Region { chr: "chr1".into(), start: 0, end: 4, rest: None }, + Region { chr: "chr2".into(), start: 0, end: 4, rest: None }, + ]; + let rs = RegionSet::from(regions); + + let gc_hashmap = calc_gc_content(&rs, &ga, false).unwrap(); + let gc_fab = calc_gc_content(&rs, &bga, false).unwrap(); + assert_eq!(gc_hashmap, gc_fab); + + std::fs::remove_file(&fab_path).ok(); + } + #[test] fn test_tss_index_try_from_path() { let path = get_test_path("dummy_tss.bed").unwrap(); diff --git a/gtars-genomicdist/src/statistics.rs b/gtars-genomicdist/src/statistics.rs index 615a197b..38205951 100644 --- a/gtars-genomicdist/src/statistics.rs +++ b/gtars-genomicdist/src/statistics.rs @@ -9,7 +9,9 @@ use std::collections::HashMap; use gtars_core::models::{Region, RegionSet}; use crate::errors::GtarsGenomicDistError; -use crate::models::{ChromosomeStatistics, Dinucleotide, GenomeAssembly, RegionBin}; +use crate::models::{ + ChromosomeStatistics, Dinucleotide, RegionBin, SequenceAccess, +}; /// Trait for computing statistics and distributions of genomic intervals. pub trait GenomicIntervalSetStatistics { @@ -34,6 +36,13 @@ pub trait GenomicIntervalSetStatistics { /// Like `region_distribution_with_bins` but uses actual chromosome sizes /// to create bins per-chromosome, matching R's `getGenomeBins(chromSizes)`. /// Each chromosome gets `n_bins` bins sized to that chromosome's length. + /// + /// Regions on chromosomes not present in `chrom_sizes` are skipped. + /// Regions whose midpoint falls beyond the stated chromosome size are also + /// skipped (common with assembly mismatches, e.g. an hg19 BED paired with + /// hg38 chrom_sizes). The total bin count may therefore be lower than the + /// input region count; callers who need to detect mismatches can compare + /// `sum(bin.n)` against their input region count. fn region_distribution_with_chrom_sizes( &self, n_bins: u32, @@ -45,7 +54,12 @@ pub trait GenomicIntervalSetStatistics { /// For each pair of adjacent regions on the same chromosome, returns the /// signed gap: `next.start - current.end`. Positive values indicate a gap, /// negative values indicate overlapping regions, zero means adjacent/abutting. - /// Regions on chromosomes with fewer than 2 regions are skipped. + /// + /// Regions on chromosomes with fewer than 2 regions are skipped (they have + /// no neighbors to measure against). Output length equals the total number + /// of *gaps* across all multi-region chromosomes — generally shorter than + /// the input region count. Output is not aligned 1:1 with input regions. + /// No sentinel values are emitted. fn calc_neighbor_distances(&self) -> Result, GtarsGenomicDistError>; /// Compute the distance from each region to its nearest neighbor. @@ -54,6 +68,13 @@ pub trait GenomicIntervalSetStatistics { /// and downstream neighbors. First and last regions on each chromosome /// use their only neighbor's distance. Overlapping neighbors have distance 0. /// + /// Regions on chromosomes with only one region are skipped (they have no + /// neighbors). Output length equals the total number of regions across all + /// multi-region chromosomes — generally shorter than the input region + /// count, and NOT aligned 1:1 with input regions. No sentinel values are + /// emitted. Callers who need 1:1 alignment must filter their input to + /// multi-region chromosomes first. + /// /// Port of R GenomicDistributions `calcNearestNeighbors()`. fn calc_nearest_neighbors(&self) -> Result, GtarsGenomicDistError>; @@ -176,6 +197,13 @@ impl GenomicIntervalSetStatistics for RegionSet { return HashMap::new(); } + // Proportional binning: the longest chromosome gets n_bins bins, + // shorter chromosomes get proportionally fewer. This produces a + // uniform bin width (in bp) across all chromosomes so that + // positional heatmaps are reference-aligned. + let max_chrom_len = chrom_sizes.values().copied().max().unwrap_or(1) as u64; + let bin_width = (max_chrom_len / n_bins as u64).max(1); + let mut plot_results: HashMap = HashMap::new(); for region in &self.regions { @@ -183,9 +211,16 @@ impl GenomicIntervalSetStatistics for RegionSet { Some(&s) => s, None => continue, // skip regions on chromosomes not in chrom_sizes }; - let bin_size = (chrom_size / n_bins).max(1); + let bin_size = bin_width as u32; let mid = region.mid_point(); + // Skip regions whose midpoint falls beyond the stated chromosome size + // (e.g. BED file assembled against a different reference than the one + // supplied by chrom_sizes). Without this, rid would exceed n_bins and + // produce bins with end < start. + if mid >= chrom_size { + continue; + } let rid = mid / bin_size; let bin_start = rid * bin_size; let bin_end = (bin_start + bin_size).min(chrom_size); @@ -285,7 +320,7 @@ impl GenomicIntervalSetStatistics for RegionSet { /// pub fn calc_gc_content( region_set: &RegionSet, - genome: &GenomeAssembly, + genome: &(impl SequenceAccess + ?Sized), ignore_unk_chroms: bool, ) -> Result, GtarsGenomicDistError> { // for region in region_set @@ -299,11 +334,11 @@ pub fn calc_gc_content( for region in region_set.iter_chr_regions(chr) { let mut gc_count: u32 = 0; let mut total_count: u32 = 0; - let seq = genome.seq_from_region(region); + let seq = genome.get_sequence(region); match seq { Ok(seq) => { - for base in seq { + for &base in &seq { match base.to_ascii_lowercase() { b'g' | b'c' => { gc_count += 1; @@ -337,38 +372,6 @@ pub fn calc_gc_content( Ok(gc_contents) } -/// -/// Calculate Dinucleotide frequencies -/// -/// Arguments: -/// - region_set: RegionSet object -/// - genome: GenomeAssembly object (reference genome) -/// Return: Result of hashmap of dinucleotide and frequencies e.g. 'Aa: 13142' -pub fn calc_dinucl_freq( - region_set: &RegionSet, - genome: &GenomeAssembly, -) -> Result, GtarsGenomicDistError> { - let mut dinucl_freqs: HashMap = HashMap::new(); - - for chr in region_set.iter_chroms() { - for region in region_set.iter_chr_regions(chr) { - let seq = genome.seq_from_region(region)?; - for aas in seq.windows(2) { - let dinucl = Dinucleotide::from_bytes(aas); - match dinucl { - Some(dinucl) => { - let current_freq = dinucl_freqs.entry(dinucl).or_insert(0); - *current_freq += 1; - } - None => continue, - } - } - } - } - - Ok(dinucl_freqs) -} - /// Canonical ordering of dinucleotides, matching GenomicDistributions' column order. pub const DINUCL_ORDER: [Dinucleotide; 16] = [ Dinucleotide::Aa, Dinucleotide::Ac, Dinucleotide::Ag, Dinucleotide::At, @@ -377,24 +380,62 @@ pub const DINUCL_ORDER: [Dinucleotide; 16] = [ Dinucleotide::Ta, Dinucleotide::Tc, Dinucleotide::Tg, Dinucleotide::Tt, ]; -/// Per-region dinucleotide frequencies as percentages. +/// Per-region dinucleotide frequencies. /// -/// Returns a tuple of (region_labels, frequency_matrix) where: -/// - `region_labels` is `chr_start_end` for each region -/// - `frequency_matrix` is a `Vec<[f64; 16]>` — one row per region, columns -/// in [`DINUCL_ORDER`] order, values are percentages (0–100). +/// Matches R GenomicDistributions `calcDinuclFreq`: one row per region, +/// 16 dinucleotide columns in [`DINUCL_ORDER`] order. /// -/// This matches the output format of R's GenomicDistributions::calcDinuclFreq. -pub fn calc_dinucl_freq_per_region( +/// Arguments: +/// - `region_set`: RegionSet object +/// - `genome`: GenomeAssembly object (reference genome) +/// - `raw_counts`: if `true`, return raw integer-valued counts; +/// if `false`, return percentages (0–100) per row (matches R default) +/// - `ignore_unk_chroms`: if `true`, skip regions on chromosomes not in +/// the assembly; if `false`, error on unknown chromosomes +/// +/// Returns a tuple `(region_labels, frequency_matrix)`: +/// - `region_labels`: `chr_start_end` for each region +/// - `frequency_matrix`: `Vec<[f64; 16]>` — one row per region +/// +/// For pooled global counts across all regions, sum the columns of the +/// raw-counts matrix: +/// ```no_run +/// # use gtars_genomicdist::{calc_dinucl_freq, DINUCL_ORDER}; +/// # use gtars_genomicdist::models::GenomeAssembly; +/// # use gtars_core::models::RegionSet; +/// # fn example(rs: &RegionSet, assembly: &GenomeAssembly) -> Result<(), Box> { +/// let (_, matrix) = calc_dinucl_freq(rs, assembly, true, false)?; +/// let mut totals = [0.0f64; 16]; +/// for row in &matrix { +/// for (i, &c) in row.iter().enumerate() { +/// totals[i] += c; +/// } +/// } +/// # Ok(()) } +/// ``` +pub fn calc_dinucl_freq( region_set: &RegionSet, - genome: &GenomeAssembly, + genome: &(impl SequenceAccess + ?Sized), + raw_counts: bool, + ignore_unk_chroms: bool, ) -> Result<(Vec, Vec<[f64; 16]>), GtarsGenomicDistError> { let mut labels: Vec = Vec::new(); let mut matrix: Vec<[f64; 16]> = Vec::new(); for chr in region_set.iter_chroms() { + if ignore_unk_chroms && !genome.contains_chr(chr) { + continue; + } for region in region_set.iter_chr_regions(chr) { - let seq = genome.seq_from_region(region)?; + let seq = match genome.get_sequence(region) { + Ok(s) => s, + Err(e) => { + if ignore_unk_chroms { + continue; + } + return Err(e); + } + }; let mut counts = [0u64; 16]; let mut total: u64 = 0; @@ -406,19 +447,25 @@ pub fn calc_dinucl_freq_per_region( } } - let freqs: [f64; 16] = if total > 0 { - let mut f = [0.0f64; 16]; + let row: [f64; 16] = if raw_counts { + let mut r = [0.0f64; 16]; + for (i, &c) in counts.iter().enumerate() { + r[i] = c as f64; + } + r + } else if total > 0 { + let mut r = [0.0f64; 16]; for (i, &c) in counts.iter().enumerate() { - f[i] = (c as f64 / total as f64) * 100.0; + r[i] = (c as f64 / total as f64) * 100.0; } - f + r } else { [0.0; 16] }; labels.push(format!("{}_{}_{}", region.chr, region.start, region.end)); - matrix.push(freqs); + matrix.push(row); } } @@ -428,6 +475,7 @@ pub fn calc_dinucl_freq_per_region( #[cfg(test)] mod tests { use super::*; + use crate::models::GenomeAssembly; use pretty_assertions::assert_eq; use rstest::*; @@ -572,8 +620,8 @@ mod tests { // --- Dinucleotide frequency --- #[rstest] - fn test_calc_dinucl_freq() { - // base.fa: chr1=GGAA → dinucleotides: GG, GA, AA + fn test_calc_dinucl_freq_raw_counts() { + // base.fa: chr1=GGAA → dinucleotides: GG, GA, AA (3 total) let path = get_fasta_path("base.fa"); let ga = GenomeAssembly::try_from(path.to_str().unwrap()).unwrap(); @@ -581,19 +629,25 @@ mod tests { Region { chr: "chr1".into(), start: 0, end: 4, rest: None }, ]; let rs = RegionSet::from(regions); - let freq = calc_dinucl_freq(&rs, &ga).unwrap(); + let (labels, matrix) = calc_dinucl_freq(&rs, &ga, true, false).unwrap(); - assert_eq!(*freq.get(&Dinucleotide::Gg).unwrap_or(&0), 1); - assert_eq!(*freq.get(&Dinucleotide::Ga).unwrap_or(&0), 1); - assert_eq!(*freq.get(&Dinucleotide::Aa).unwrap_or(&0), 1); + assert_eq!(labels, vec!["chr1_0_4"]); + assert_eq!(matrix.len(), 1); + let row = &matrix[0]; + let gg_idx = DINUCL_ORDER.iter().position(|d| *d == Dinucleotide::Gg).unwrap(); + let ga_idx = DINUCL_ORDER.iter().position(|d| *d == Dinucleotide::Ga).unwrap(); + let aa_idx = DINUCL_ORDER.iter().position(|d| *d == Dinucleotide::Aa).unwrap(); + assert_eq!(row[gg_idx], 1.0); + assert_eq!(row[ga_idx], 1.0); + assert_eq!(row[aa_idx], 1.0); // total should be 3 (4 bases → 3 dinucleotides) - let total: u64 = freq.values().sum(); - assert_eq!(total, 3); + let total: f64 = row.iter().sum(); + assert_eq!(total, 3.0); } #[rstest] - fn test_calc_dinucl_freq_per_region() { - // base.fa: chr2=GCGC → dinucleotides: GC, CG, GC + fn test_calc_dinucl_freq_percentages() { + // base.fa: chr2=GCGC → dinucleotides: GC, CG, GC (percentages) let path = get_fasta_path("base.fa"); let ga = GenomeAssembly::try_from(path.to_str().unwrap()).unwrap(); @@ -601,14 +655,13 @@ mod tests { Region { chr: "chr2".into(), start: 0, end: 4, rest: None }, ]; let rs = RegionSet::from(regions); - let (labels, matrix) = calc_dinucl_freq_per_region(&rs, &ga).unwrap(); + let (labels, matrix) = calc_dinucl_freq(&rs, &ga, false, false).unwrap(); assert_eq!(labels.len(), 1); assert_eq!(labels[0], "chr2_0_4"); assert_eq!(matrix.len(), 1); // GC appears 2/3 times, CG appears 1/3 times - // Find indices in DINUCL_ORDER let gc_idx = DINUCL_ORDER.iter().position(|d| *d == Dinucleotide::Gc).unwrap(); let cg_idx = DINUCL_ORDER.iter().position(|d| *d == Dinucleotide::Cg).unwrap(); @@ -616,11 +669,34 @@ mod tests { assert!((row[gc_idx] - 200.0 / 3.0).abs() < 0.1); // ~66.67% assert!((row[cg_idx] - 100.0 / 3.0).abs() < 0.1); // ~33.33% - // all other dinucleotides should be 0 + // percentages sum to 100 let total: f64 = row.iter().sum(); assert!((total - 100.0).abs() < 0.1); } + #[rstest] + fn test_calc_dinucl_freq_global_derivable() { + // Global counts are derivable by column-summing the raw-counts matrix. + // Two regions: chr1=GGAA, chr2=GCGC → pooled: GG×1, GA×1, AA×1, GC×2, CG×1 = 6 total + let path = get_fasta_path("base.fa"); + let ga = GenomeAssembly::try_from(path.to_str().unwrap()).unwrap(); + let regions = vec![ + Region { chr: "chr1".into(), start: 0, end: 4, rest: None }, + Region { chr: "chr2".into(), start: 0, end: 4, rest: None }, + ]; + let rs = RegionSet::from(regions); + let (_, matrix) = calc_dinucl_freq(&rs, &ga, true, false).unwrap(); + + let mut totals = [0.0f64; 16]; + for row in &matrix { + for (i, &c) in row.iter().enumerate() { + totals[i] += c; + } + } + let grand: f64 = totals.iter().sum(); + assert_eq!(grand, 6.0); + } + // --- Empty RegionSet edge cases --- #[rstest] @@ -747,6 +823,38 @@ mod tests { ); } + #[rstest] + fn test_region_distribution_with_chrom_sizes_skips_out_of_range() { + // Regions whose midpoint falls beyond the stated chromosome size are + // silently skipped (assembly mismatch case). Previously this produced + // bins with end < start or rid >= n_bins. + let regions = vec![ + Region { chr: "chr1".into(), start: 100, end: 200, rest: None }, // mid=150, in range + Region { chr: "chr1".into(), start: 800, end: 900, rest: None }, // mid=850, in range + Region { chr: "chr1".into(), start: 1200, end: 1300, rest: None },// mid=1250, out of range (chrom_size=1000) + Region { chr: "chr2".into(), start: 300, end: 400, rest: None }, // mid=350, in range + Region { chr: "chr2".into(), start: 2000, end: 2100, rest: None },// mid=2050, out of range + Region { chr: "chr3".into(), start: 0, end: 100, rest: None }, // chr3 not in chrom_sizes + ]; + let rs = RegionSet::from(regions); + let mut chrom_sizes = HashMap::new(); + chrom_sizes.insert("chr1".to_string(), 1000u32); + chrom_sizes.insert("chr2".to_string(), 500u32); + + let bins = rs.region_distribution_with_chrom_sizes(10, &chrom_sizes); + + // Every bin should have end > start and rid < n_bins + for bin in bins.values() { + assert!(bin.end > bin.start, "bin {:?} has end <= start", bin); + assert!(bin.rid < 10, "bin {:?} has rid >= n_bins", bin); + } + + // Total counted regions: 2 (chr1) + 1 (chr2) = 3 + // (chr1's third region, chr2's second, and chr3's only region are all skipped) + let total: u32 = bins.values().map(|b| b.n).sum(); + assert_eq!(total, 3, "expected 3 in-range regions counted"); + } + #[rstest] fn test_gc_content_in_valid_range() { // R GenomicDistributions tests: all GC values should be in [0, 1] @@ -800,4 +908,20 @@ mod tests { // Mean width should be 1 billion assert!((s.mean_region_length - 1_000_000_000.0).abs() < 1.0); } + + // ── spatial-arrangement feature tests ─────────────────────────────── + + fn make_rs(regions: &[(&str, u32, u32)]) -> RegionSet { + let regs: Vec = regions + .iter() + .map(|(chr, s, e)| Region { + chr: chr.to_string(), + start: *s, + end: *e, + rest: None, + }) + .collect(); + RegionSet::from(regs) + } + } diff --git a/gtars-lola/Cargo.toml b/gtars-lola/Cargo.toml index 48cbcf70..bdf0ffef 100644 --- a/gtars-lola/Cargo.toml +++ b/gtars-lola/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "gtars-lola" -version = "0.1.0" +version = "0.2.0" edition = "2024" description = "LOLA (Locus Overlap Analysis) for genomic region enrichment testing" license = "MIT" diff --git a/gtars-lola/src/database.rs b/gtars-lola/src/database.rs index d2d5737f..0b6ba5d7 100644 --- a/gtars-lola/src/database.rs +++ b/gtars-lola/src/database.rs @@ -24,14 +24,14 @@ pub struct CollectionAnno { #[derive(Debug, Clone, Default)] pub struct RegionSetAnno { pub filename: String, - pub cell_type: String, - pub description: String, - pub tissue: String, - pub data_source: String, - pub antibody: String, - pub treatment: String, + pub cell_type: Option, + pub description: Option, + pub tissue: Option, + pub data_source: Option, + pub antibody: Option, + pub treatment: Option, /// Which collection this file belongs to. - pub collection: String, + pub collection: Option, } /// A LOLA region database: IGD index + original region sets + annotations. @@ -145,12 +145,12 @@ impl RegionDB { // Fall back to collection description if file-level is empty. let mut anno = anno_map.get(fname).cloned().unwrap_or(RegionSetAnno { filename: fname.clone(), - collection: coll_name.clone(), + collection: Some(coll_name.clone()), ..Default::default() }); - if anno.description.is_empty() { + if anno.description.is_none() { // R LOLA uses the collection folder name as fallback - anno.description = coll_name.clone(); + anno.description = Some(coll_name.clone()); } all_region_anno.push(anno); files_loaded += 1; @@ -248,7 +248,7 @@ impl RegionDB { .iter() .filter(|a| { if let Some(filter) = collections { - filter.iter().any(|f| *f == a.collection) + a.collection.as_deref().map_or(false, |c| filter.contains(&c)) } else { true } @@ -269,7 +269,7 @@ impl RegionDB { .filter(|(_, a)| { let name_match = filenames.iter().any(|f| *f == a.filename); let coll_match = if let Some(filter) = collections { - filter.iter().any(|f| *f == a.collection) + a.collection.as_deref().map_or(false, |c| filter.contains(&c)) } else { true }; @@ -429,23 +429,30 @@ fn parse_index_txt(path: &Path, collection_name: &str) -> Vec { let fields: Vec<&str> = line.split(sep).collect(); - let get = |key: &str| -> String { + let get_required = |key: &str| -> String { col_map .get(key) .and_then(|&i| fields.get(i)) .map(|s| s.trim().to_string()) .unwrap_or_default() }; + let get_optional = |key: &str| -> Option { + col_map + .get(key) + .and_then(|&i| fields.get(i)) + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) + }; annos.push(RegionSetAnno { - filename: get("filename"), - cell_type: get("cellType"), - description: get("description"), - tissue: get("tissue"), - data_source: get("dataSource"), - antibody: get("antibody"), - treatment: get("treatment"), - collection: collection_name.to_string(), + filename: get_required("filename"), + cell_type: get_optional("cellType"), + description: get_optional("description"), + tissue: get_optional("tissue"), + data_source: get_optional("dataSource"), + antibody: get_optional("antibody"), + treatment: get_optional("treatment"), + collection: Some(collection_name.to_string()), }); } @@ -504,8 +511,8 @@ mod tests { assert_eq!(db.collection_anno.len(), 1); assert_eq!(db.collection_anno[0].collector, "John"); assert_eq!(db.region_anno.len(), 2); - assert_eq!(db.region_anno[0].cell_type, "K562"); - assert_eq!(db.region_anno[1].cell_type, "HeLa"); + assert_eq!(db.region_anno[0].cell_type.as_deref(), Some("K562")); + assert_eq!(db.region_anno[1].cell_type.as_deref(), Some("HeLa")); // Region sets should have correct counts assert_eq!(db.region_sets[0].regions.len(), 3); // file1: 3 regions @@ -599,7 +606,7 @@ mod tests { let anno = RegionSetAnno { filename: "test.bed".to_string(), - cell_type: "K562".to_string(), + cell_type: Some("K562".to_string()), ..Default::default() }; @@ -665,12 +672,12 @@ mod tests { assert_eq!(db.num_region_sets(), 1); // R LOLA uses the collection folder name as fallback, not collection.txt description assert_eq!( - db.region_anno[0].description, "fallback_coll", + db.region_anno[0].description.as_deref(), Some("fallback_coll"), "Description should fall back to collection folder name when index.txt description is empty" ); // Other fields from index.txt should still be present - assert_eq!(db.region_anno[0].cell_type, "K562"); - assert_eq!(db.region_anno[0].tissue, "blood"); + assert_eq!(db.region_anno[0].cell_type.as_deref(), Some("K562")); + assert_eq!(db.region_anno[0].tissue.as_deref(), Some("blood")); } #[test] @@ -728,7 +735,7 @@ mod tests { .region_anno .iter() .enumerate() - .filter(|(_, a)| a.collection == "coll1") + .filter(|(_, a)| a.collection.as_deref() == Some("coll1")) .map(|(i, _)| i) .collect(); diff --git a/gtars-lola/src/enrichment.rs b/gtars-lola/src/enrichment.rs index d3c60c7e..705c0a3d 100644 --- a/gtars-lola/src/enrichment.rs +++ b/gtars-lola/src/enrichment.rs @@ -264,13 +264,13 @@ pub fn run_lola( d, q_value: None, filename, - collection: String::new(), - description: String::new(), - cell_type: String::new(), - tissue: String::new(), - antibody: String::new(), - treatment: String::new(), - data_source: String::new(), + collection: None, + description: None, + cell_type: None, + tissue: None, + antibody: None, + treatment: None, + data_source: None, db_set_size: 0, }); } diff --git a/gtars-lola/src/models.rs b/gtars-lola/src/models.rs index 259c13b1..fc177aef 100644 --- a/gtars-lola/src/models.rs +++ b/gtars-lola/src/models.rs @@ -83,19 +83,19 @@ pub struct LolaResult { /// DB set filename (from IGD file_info). pub filename: String, /// Collection name this DB set belongs to. - pub collection: String, + pub collection: Option, /// Description from index.txt. - pub description: String, + pub description: Option, /// Cell type annotation. - pub cell_type: String, + pub cell_type: Option, /// Tissue annotation. - pub tissue: String, + pub tissue: Option, /// Antibody annotation. - pub antibody: String, + pub antibody: Option, /// Treatment annotation. - pub treatment: String, + pub treatment: Option, /// Data source annotation. - pub data_source: String, + pub data_source: Option, /// Number of regions in the DB set. pub db_set_size: u64, } diff --git a/gtars-lola/src/output.rs b/gtars-lola/src/output.rs index 64e34edb..aee22290 100644 --- a/gtars-lola/src/output.rs +++ b/gtars-lola/src/output.rs @@ -1,7 +1,6 @@ //! Output formatting, FDR correction, and annotation. use std::io::Write; -use std::path::Path; use crate::database::RegionDB; use crate::models::LolaResult; @@ -16,7 +15,7 @@ pub fn annotate_results(results: &mut [LolaResult], db: &RegionDB) { let anno = &db.region_anno[r.db_set]; r.collection = anno.collection.clone(); // Truncate description to 80 chars (matches R LOLA behavior) - r.description = anno.description.chars().take(80).collect(); + r.description = anno.description.as_ref().map(|d| d.chars().take(80).collect()); r.cell_type = anno.cell_type.clone(); r.tissue = anno.tissue.clone(); r.antibody = anno.antibody.clone(); @@ -105,6 +104,94 @@ pub fn apply_fdr_correction(results: &mut [LolaResult]) { } } +/// Column-oriented representation of LOLA results. +/// +/// Each field is a parallel Vec — row `i` across all fields describes one result. +/// Bindings should convert this to their native columnar type (JS object, PyDict, +/// R data.frame) rather than reimplementing the row→column pivot. +#[derive(Debug, Clone)] +pub struct LolaColumnar { + pub user_set: Vec, + pub db_set: Vec, + pub p_value_log: Vec, + pub odds_ratio: Vec, + pub support: Vec, + pub rnk_pv: Vec, + pub rnk_or: Vec, + pub rnk_sup: Vec, + pub max_rnk: Vec, + pub mean_rnk: Vec, + pub b: Vec, + pub c: Vec, + pub d: Vec, + pub q_value: Vec>, + pub filename: Vec, + pub collection: Vec>, + pub description: Vec>, + pub cell_type: Vec>, + pub tissue: Vec>, + pub antibody: Vec>, + pub treatment: Vec>, + pub data_source: Vec>, + pub db_set_size: Vec, +} + +/// Convert a slice of LolaResults into column-oriented vectors. +pub fn results_to_columns(results: &[LolaResult]) -> LolaColumnar { + let n = results.len(); + let mut c = LolaColumnar { + user_set: Vec::with_capacity(n), + db_set: Vec::with_capacity(n), + p_value_log: Vec::with_capacity(n), + odds_ratio: Vec::with_capacity(n), + support: Vec::with_capacity(n), + rnk_pv: Vec::with_capacity(n), + rnk_or: Vec::with_capacity(n), + rnk_sup: Vec::with_capacity(n), + max_rnk: Vec::with_capacity(n), + mean_rnk: Vec::with_capacity(n), + b: Vec::with_capacity(n), + c: Vec::with_capacity(n), + d: Vec::with_capacity(n), + q_value: Vec::with_capacity(n), + filename: Vec::with_capacity(n), + collection: Vec::with_capacity(n), + description: Vec::with_capacity(n), + cell_type: Vec::with_capacity(n), + tissue: Vec::with_capacity(n), + antibody: Vec::with_capacity(n), + treatment: Vec::with_capacity(n), + data_source: Vec::with_capacity(n), + db_set_size: Vec::with_capacity(n), + }; + for r in results { + c.user_set.push(r.user_set); + c.db_set.push(r.db_set); + c.p_value_log.push(r.p_value_log); + c.odds_ratio.push(r.odds_ratio); + c.support.push(r.support); + c.rnk_pv.push(r.rnk_pv); + c.rnk_or.push(r.rnk_or); + c.rnk_sup.push(r.rnk_sup); + c.max_rnk.push(r.max_rnk); + c.mean_rnk.push(r.mean_rnk); + c.b.push(r.b); + c.c.push(r.c); + c.d.push(r.d); + c.q_value.push(r.q_value); + c.filename.push(r.filename.clone()); + c.collection.push(r.collection.clone()); + c.description.push(r.description.clone()); + c.cell_type.push(r.cell_type.clone()); + c.tissue.push(r.tissue.clone()); + c.antibody.push(r.antibody.clone()); + c.treatment.push(r.treatment.clone()); + c.data_source.push(r.data_source.clone()); + c.db_set_size.push(r.db_set_size); + } + c +} + /// Write LOLA results as TSV matching R LOLA's `writeCombinedEnrichment` format. pub fn write_results_tsv( writer: &mut W, @@ -124,14 +211,13 @@ pub fn write_results_tsv( .q_value .map(|q| format!("{:.6e}", q)) .unwrap_or_else(|| "NA".to_string()); - writeln!( writer, "{}\t{}\t{}\t{:.4}\t{:.4}\t{}\t{}\t{}\t{}\t{}\t{:.2}\t{}\t{}\t{}\t\ {}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}", r.user_set + 1, // 1-based for R compatibility r.db_set + 1, - r.collection, + r.collection.as_deref().unwrap_or(""), r.p_value_log, r.odds_ratio, r.support, @@ -143,12 +229,12 @@ pub fn write_results_tsv( r.b, r.c, r.d, - r.description, - r.cell_type, - r.tissue, - r.antibody, - r.treatment, - r.data_source, + r.description.as_deref().unwrap_or(""), + r.cell_type.as_deref().unwrap_or(""), + r.tissue.as_deref().unwrap_or(""), + r.antibody.as_deref().unwrap_or(""), + r.treatment.as_deref().unwrap_or(""), + r.data_source.as_deref().unwrap_or(""), r.filename, qv, r.db_set_size, @@ -158,15 +244,6 @@ pub fn write_results_tsv( Ok(()) } -/// Write results to a TSV file on disk. -pub fn write_results_to_file( - path: &Path, - results: &[LolaResult], -) -> std::io::Result<()> { - let mut file = std::fs::File::create(path)?; - write_results_tsv(&mut file, results) -} - #[cfg(test)] mod tests { use super::*; @@ -189,13 +266,13 @@ mod tests { d: 100, q_value: None, filename: format!("file{}.bed", db_set), - collection: String::new(), - description: String::new(), - cell_type: String::new(), - tissue: String::new(), - antibody: String::new(), - treatment: String::new(), - data_source: String::new(), + collection: None, + description: None, + cell_type: None, + tissue: None, + antibody: None, + treatment: None, + data_source: None, db_set_size: 0, } } @@ -400,4 +477,55 @@ mod tests { let output = String::from_utf8(buf).unwrap(); assert!(output.contains("NA")); // q_value should be NA } + + #[test] + fn test_results_to_columns_basic() { + let results = vec![ + make_result(0, 0, 3.0), + make_result(1, 2, 5.0), + ]; + let c = results_to_columns(&results); + + assert_eq!(c.user_set.len(), 2); + assert_eq!(c.user_set, vec![0, 1]); + assert_eq!(c.db_set, vec![0, 2]); + assert_eq!(c.p_value_log, vec![3.0, 5.0]); + assert_eq!(c.odds_ratio, vec![1.0, 1.0]); + assert_eq!(c.support, vec![10, 10]); + assert_eq!(c.b, vec![5, 5]); + assert_eq!(c.c, vec![5, 5]); + assert_eq!(c.d, vec![100, 100]); + assert_eq!(c.filename, vec!["file0.bed", "file2.bed"]); + assert_eq!(c.db_set_size, vec![0, 0]); + // empty strings → None + assert_eq!(c.collection, vec![None, None]); + assert_eq!(c.description, vec![None, None]); + assert_eq!(c.cell_type, vec![None, None]); + assert_eq!(c.tissue, vec![None, None]); + assert_eq!(c.antibody, vec![None, None]); + assert_eq!(c.treatment, vec![None, None]); + assert_eq!(c.data_source, vec![None, None]); + } + + #[test] + fn test_results_to_columns_empty() { + let c = results_to_columns(&[]); + assert!(c.user_set.is_empty()); + assert!(c.filename.is_empty()); + } + + #[test] + fn test_results_to_columns_with_metadata() { + let mut r = make_result(0, 0, 1.0); + r.collection = Some("ENCODE".to_string()); + r.cell_type = Some("K562".to_string()); + r.tissue = None; // stays None + r.q_value = Some(0.05); + + let c = results_to_columns(&[r]); + assert_eq!(c.collection, vec![Some("ENCODE".to_string())]); + assert_eq!(c.cell_type, vec![Some("K562".to_string())]); + assert_eq!(c.tissue, vec![None]); + assert_eq!(c.q_value, vec![Some(0.05)]); + } } diff --git a/gtars-python/Cargo.toml b/gtars-python/Cargo.toml index b2bf21ad..588f1e7d 100644 --- a/gtars-python/Cargo.toml +++ b/gtars-python/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "gtars-py" -version = "0.8.0" +version = "0.9.0" edition = "2024" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/gtars-python/py_src/gtars/genomic_distributions/__init__.pyi b/gtars-python/py_src/gtars/genomic_distributions/__init__.pyi index 0a89d7f8..5ac30fbb 100644 --- a/gtars-python/py_src/gtars/genomic_distributions/__init__.pyi +++ b/gtars-python/py_src/gtars/genomic_distributions/__init__.pyi @@ -1,28 +1,43 @@ -from typing import Optional, List, Dict -from gtars.models import RegionSet, GenomeAssembly, PartitionList, SignalMatrix +from typing import Optional, Union, List, Dict +from gtars.models import RegionSet, GenomeAssembly, BinaryGenomeAssembly, PartitionList, SignalMatrix def calc_gc_content( - rs: RegionSet, genome: GenomeAssembly, ignore_unk_chroms: Optional[bool] = False + rs: RegionSet, + genome: Union[GenomeAssembly, BinaryGenomeAssembly], + ignore_unk_chroms: Optional[bool] = False, ) -> List[float]: """ Calculate GC content for each region. :param rs: RegionSet object - :param genome: GenomeAssembly object (reference genome) + :param genome: GenomeAssembly or BinaryGenomeAssembly (reference genome) :param ignore_unk_chroms: ignore unknown chromosomes :return: GC content per region (0 to 1) """ ... -def calc_dinucleotide_frequency( - rs: RegionSet, genome: GenomeAssembly -) -> Dict[str, int]: +def calc_dinucl_freq( + rs: RegionSet, + genome: Union[GenomeAssembly, BinaryGenomeAssembly], + raw_counts: bool = False, + ignore_unk_chroms: bool = False, +) -> Dict[str, object]: """ - Calculate dinucleotide frequencies across all regions. + Per-region dinucleotide frequencies, matching R GenomicDistributions ``calcDinuclFreq``. :param rs: RegionSet object - :param genome: GenomeAssembly object (reference genome) - :return: dict mapping dinucleotide names to counts + :param genome: GenomeAssembly or BinaryGenomeAssembly (reference genome) + :param raw_counts: if False (default), return percentages (0–100) per row; + if True, return raw integer-valued counts + :param ignore_unk_chroms: skip regions on chromosomes not in the assembly + :return: dict with keys: + - ``region_labels``: list of ``chr_start_end`` strings (one per region) + - ``dinucleotides``: list of 16 dinucleotide names in canonical order + - ``frequencies``: list of lists — one row per region, 16 values per row + matching ``dinucleotides`` order + + For pooled global counts across all regions, sum each column of the + raw-counts matrix. """ ... diff --git a/gtars-python/py_src/gtars/models/__init__.pyi b/gtars-python/py_src/gtars/models/__init__.pyi index 77ca0550..12a5b815 100644 --- a/gtars-python/py_src/gtars/models/__init__.pyi +++ b/gtars-python/py_src/gtars/models/__init__.pyi @@ -1,4 +1,4 @@ -from typing import List, Dict, Optional +from typing import List, Dict, Optional, Tuple class Region: chr: str @@ -124,25 +124,45 @@ class RegionSet: """ ... - def neighbor_distances(self) -> List[Optional[float]]: + def neighbor_distances(self) -> List[int]: """ - Distances between consecutive regions on each chromosome. - Returns None for missing values. + Signed gaps between consecutive regions on each chromosome. + + Output length may be shorter than input region count — chromosomes with + fewer than 2 regions are skipped (no neighbors to measure against). + Output is NOT aligned 1:1 with input regions. """ ... - def nearest_neighbors(self) -> List[Optional[float]]: + def nearest_neighbors(self) -> List[int]: """ - Distance from each region to its nearest neighbor. - Returns None for regions with no neighbor on the same chromosome. + Distance from each region to its nearest neighbor on the same chromosome. + + Output length may be shorter than input region count — chromosomes with + only one region are skipped. Output is NOT aligned 1:1 with input regions. """ ... - def distribution(self, n_bins: int = 250) -> List[Dict[str, object]]: + def distribution( + self, + n_bins: int = 250, + chrom_sizes: Optional[Dict[str, int]] = None, + ) -> List[Dict[str, object]]: """ Region distribution across genomic bins. - :param n_bins: number of bins (default 250) + :param n_bins: number of bins for the longest chromosome (default 250) + :param chrom_sizes: optional mapping of chromosome name to length. When provided, + per-chromosome bin sizes are derived from the reference genome + (bin_size = chrom_size / n_bins per chrom). This produces outputs that are + comparable across BED files and aligned with reference genome positions. + When absent, bin size is derived from the BED file's observed max end + coordinate — outputs will NOT be comparable across files. + + When ``chrom_sizes`` is provided, regions on chromosomes not listed in + ``chrom_sizes`` are skipped, as are regions whose midpoint falls beyond + the stated chromosome size (common with assembly mismatches). The summed + bin counts may therefore be lower than the input region count. :return: list of dicts with keys: chr, start, end, n, rid """ ... @@ -311,6 +331,17 @@ class RegionSet: """ ... + def gaps(self, chrom_sizes: Dict[str, int]) -> "RegionSet": + """ + Return gaps between regions per chromosome, bounded by chrom sizes. + + Emits leading gaps (from 0), inter-region gaps, trailing gaps + (to ``chrom_size``), and full-chromosome gaps for any chromosome + in ``chrom_sizes`` that has no regions. Regions on chromosomes + not listed in ``chrom_sizes`` are skipped. + """ + ... + def __len__(self) -> int: """ Size of the regionset @@ -332,6 +363,20 @@ class GenomeAssembly: def __str__(self) -> str: ... def __repr__(self) -> str: ... +class BinaryGenomeAssembly: + def __init__(self, path: str) -> "BinaryGenomeAssembly": + """ + Open a .fab binary FASTA file for zero-copy mmap access. + + Create .fab files with ``gtars prep --fasta ``. + + :param path: path to the .fab file + """ + ... + + def __str__(self) -> str: ... + def __repr__(self) -> str: ... + class TssIndex: def __init__(self, path: str) -> "TssIndex": """ diff --git a/gtars-python/src/genomic_distributions/mod.rs b/gtars-python/src/genomic_distributions/mod.rs index 13681967..ddf21c12 100644 --- a/gtars-python/src/genomic_distributions/mod.rs +++ b/gtars-python/src/genomic_distributions/mod.rs @@ -2,14 +2,14 @@ use pyo3::prelude::*; mod tools; pub use self::tools::{ - py_calc_dinucleotide_frequency, py_calc_expected_partitions, py_calc_gc_content, - py_calc_partitions, py_calc_summary_signal, py_consensus, py_median_abs_distance, + py_calc_dinucl_freq, py_calc_expected_partitions, py_calc_gc_content, py_calc_partitions, + py_calc_summary_signal, py_consensus, py_median_abs_distance, }; #[pymodule] pub fn genomic_distributions(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_function(wrap_pyfunction!(py_calc_gc_content, m)?)?; - m.add_function(wrap_pyfunction!(py_calc_dinucleotide_frequency, m)?)?; + m.add_function(wrap_pyfunction!(py_calc_dinucl_freq, m)?)?; m.add_function(wrap_pyfunction!(py_calc_partitions, m)?)?; m.add_function(wrap_pyfunction!(py_calc_expected_partitions, m)?)?; m.add_function(wrap_pyfunction!(py_calc_summary_signal, m)?)?; diff --git a/gtars-python/src/genomic_distributions/tools.rs b/gtars-python/src/genomic_distributions/tools.rs index 7fe78412..6ae31fb2 100644 --- a/gtars-python/src/genomic_distributions/tools.rs +++ b/gtars-python/src/genomic_distributions/tools.rs @@ -1,7 +1,7 @@ use pyo3::prelude::*; use pyo3::types::PyDict; -use crate::models::{PyGenomeAssembly, PyPartitionList, PyRegionSet, PySignalMatrix}; +use crate::models::{PyBinaryGenomeAssembly, PyGenomeAssembly, PyPartitionList, PyRegionSet, PySignalMatrix}; use gtars_genomicdist::{self, statistics}; use std::collections::HashMap; @@ -9,30 +9,64 @@ use std::collections::HashMap; #[pyo3(signature = (rs, genome, ignore_unk_chroms = Some(false)))] pub fn py_calc_gc_content( rs: &PyRegionSet, - genome: &PyGenomeAssembly, + genome: &Bound<'_, PyAny>, ignore_unk_chroms: Option, ) -> anyhow::Result> { - let result = statistics::calc_gc_content( - &rs.regionset, - &genome.genome_assembly, - ignore_unk_chroms.unwrap_or(false), - )?; - Ok(result) + let ignore = ignore_unk_chroms.unwrap_or(false); + if let Ok(g) = genome.cast::() { + let g = g.borrow(); + Ok(statistics::calc_gc_content(&rs.regionset, &g.assembly, ignore)?) + } else if let Ok(g) = genome.cast::() { + let g = g.borrow(); + Ok(statistics::calc_gc_content(&rs.regionset, &g.genome_assembly, ignore)?) + } else { + Err(anyhow::anyhow!("genome must be a GenomeAssembly or BinaryGenomeAssembly")) + } } -#[pyfunction(name = "calc_dinucleotide_frequency")] -pub fn py_calc_dinucleotide_frequency( +/// Per-region dinucleotide frequencies, matching R GenomicDistributions `calcDinuclFreq`. +/// +/// Arguments: +/// - ``rs``: RegionSet +/// - ``genome``: GenomeAssembly (reference) +/// - ``raw_counts``: if False (default), return percentages (0–100) per row; +/// if True, return raw integer-valued counts +/// +/// Returns a dict: +/// - ``region_labels``: list of ``chr_start_end`` strings (one per region) +/// - ``dinucleotides``: list of 16 dinucleotide names in canonical order +/// - ``frequencies``: list of lists — one row per region, 16 values per row +/// matching ``dinucleotides`` order +/// +/// For pooled global counts, sum each column of the raw-counts matrix. +#[pyfunction(name = "calc_dinucl_freq")] +#[pyo3(signature = (rs, genome, raw_counts = false, ignore_unk_chroms = false))] +pub fn py_calc_dinucl_freq<'py>( + py: Python<'py>, rs: &PyRegionSet, - genome: &PyGenomeAssembly, -) -> anyhow::Result> { - let frequencies = statistics::calc_dinucl_freq(&rs.regionset, &genome.genome_assembly)?; - let mut freq_map: HashMap = HashMap::new(); - // Convert Dinucleotide to String and push to HashMap - for (di, freq) in frequencies { - freq_map.insert(di.to_string()?, freq); - } - - Ok(freq_map) + genome: &Bound<'py, PyAny>, + raw_counts: bool, + ignore_unk_chroms: bool, +) -> anyhow::Result> { + let (labels, matrix) = if let Ok(g) = genome.cast::() { + let g = g.borrow(); + statistics::calc_dinucl_freq(&rs.regionset, &g.assembly, raw_counts, ignore_unk_chroms)? + } else if let Ok(g) = genome.cast::() { + let g = g.borrow(); + statistics::calc_dinucl_freq(&rs.regionset, &g.genome_assembly, raw_counts, ignore_unk_chroms)? + } else { + return Err(anyhow::anyhow!("genome must be a GenomeAssembly or BinaryGenomeAssembly")); + }; + let dinucl_names: Vec = statistics::DINUCL_ORDER + .iter() + .map(|d| d.to_string().unwrap_or_default()) + .collect(); + let freqs_nested: Vec> = matrix.into_iter().map(|row| row.to_vec()).collect(); + let result = PyDict::new(py); + result.set_item("region_labels", labels)?; + result.set_item("dinucleotides", dinucl_names)?; + result.set_item("frequencies", freqs_nested)?; + Ok(result) } #[pyfunction(name = "calc_partitions")] diff --git a/gtars-python/src/lola/mod.rs b/gtars-python/src/lola/mod.rs index 6d042360..63e4b9a0 100644 --- a/gtars-python/src/lola/mod.rs +++ b/gtars-python/src/lola/mod.rs @@ -125,13 +125,13 @@ impl PyRegionDB { for a in &self.inner.region_anno { let d = PyDict::new(py); d.set_item("filename", &a.filename)?; - d.set_item("cellType", &a.cell_type)?; - d.set_item("description", &a.description)?; - d.set_item("tissue", &a.tissue)?; - d.set_item("dataSource", &a.data_source)?; - d.set_item("antibody", &a.antibody)?; - d.set_item("treatment", &a.treatment)?; - d.set_item("collection", &a.collection)?; + d.set_item("cellType", a.cell_type.as_deref())?; + d.set_item("description", a.description.as_deref())?; + d.set_item("tissue", a.tissue.as_deref())?; + d.set_item("dataSource", a.data_source.as_deref())?; + d.set_item("antibody", a.antibody.as_deref())?; + d.set_item("treatment", a.treatment.as_deref())?; + d.set_item("collection", a.collection.as_deref())?; result.push(d); } Ok(result) @@ -242,82 +242,33 @@ fn results_to_dict<'py>( py: Python<'py>, results: &[LolaResult], ) -> PyResult> { - let n = results.len(); - let mut user_set = Vec::with_capacity(n); - let mut db_set = Vec::with_capacity(n); - let mut p_value_log = Vec::with_capacity(n); - let mut odds_ratio = Vec::with_capacity(n); - let mut support = Vec::with_capacity(n); - let mut rnk_pv = Vec::with_capacity(n); - let mut rnk_or = Vec::with_capacity(n); - let mut rnk_sup = Vec::with_capacity(n); - let mut max_rnk = Vec::with_capacity(n); - let mut mean_rnk = Vec::with_capacity(n); - let mut b_vec = Vec::with_capacity(n); - let mut c_vec = Vec::with_capacity(n); - let mut d_vec = Vec::with_capacity(n); - let mut collection: Vec> = Vec::with_capacity(n); - let mut description: Vec> = Vec::with_capacity(n); - let mut cell_type: Vec> = Vec::with_capacity(n); - let mut tissue: Vec> = Vec::with_capacity(n); - let mut antibody: Vec> = Vec::with_capacity(n); - let mut treatment: Vec> = Vec::with_capacity(n); - let mut data_source: Vec> = Vec::with_capacity(n); - let mut filename = Vec::with_capacity(n); - let mut db_set_size = Vec::with_capacity(n); - - for r in results { - user_set.push(r.user_set); - db_set.push(r.db_set); - p_value_log.push(r.p_value_log); - odds_ratio.push(r.odds_ratio); - support.push(r.support); - rnk_pv.push(r.rnk_pv); - rnk_or.push(r.rnk_or); - rnk_sup.push(r.rnk_sup); - max_rnk.push(r.max_rnk); - mean_rnk.push(r.mean_rnk); - b_vec.push(r.b); - c_vec.push(r.c); - d_vec.push(r.d); - collection.push(empty_to_none(&r.collection)); - description.push(empty_to_none(&r.description)); - cell_type.push(empty_to_none(&r.cell_type)); - tissue.push(empty_to_none(&r.tissue)); - antibody.push(empty_to_none(&r.antibody)); - treatment.push(empty_to_none(&r.treatment)); - data_source.push(empty_to_none(&r.data_source)); - filename.push(r.filename.clone()); - db_set_size.push(r.db_set_size); - } - - let q_value: Vec> = results.iter().map(|r| r.q_value).collect(); + use gtars_lola::output::results_to_columns; + let c = results_to_columns(results); let dict = PyDict::new(py); - dict.set_item("userSet", user_set)?; - dict.set_item("dbSet", db_set)?; - dict.set_item("collection", collection)?; - dict.set_item("pValueLog", p_value_log)?; - dict.set_item("oddsRatio", odds_ratio)?; - dict.set_item("support", support)?; - dict.set_item("rnkPV", rnk_pv)?; - dict.set_item("rnkOR", rnk_or)?; - dict.set_item("rnkSup", rnk_sup)?; - dict.set_item("maxRnk", max_rnk)?; - dict.set_item("meanRnk", mean_rnk)?; - dict.set_item("b", b_vec)?; - dict.set_item("c", c_vec)?; - dict.set_item("d", d_vec)?; - dict.set_item("description", description)?; - dict.set_item("cellType", cell_type)?; - dict.set_item("tissue", tissue)?; - dict.set_item("antibody", antibody)?; - dict.set_item("treatment", treatment)?; - dict.set_item("dataSource", data_source)?; - dict.set_item("filename", filename)?; - dict.set_item("qValue", q_value)?; - dict.set_item("size", db_set_size)?; - + dict.set_item("userSet", c.user_set)?; + dict.set_item("dbSet", c.db_set)?; + dict.set_item("collection", c.collection)?; + dict.set_item("pValueLog", c.p_value_log)?; + dict.set_item("oddsRatio", c.odds_ratio)?; + dict.set_item("support", c.support)?; + dict.set_item("rnkPV", c.rnk_pv)?; + dict.set_item("rnkOR", c.rnk_or)?; + dict.set_item("rnkSup", c.rnk_sup)?; + dict.set_item("maxRnk", c.max_rnk)?; + dict.set_item("meanRnk", c.mean_rnk)?; + dict.set_item("b", c.b)?; + dict.set_item("c", c.c)?; + dict.set_item("d", c.d)?; + dict.set_item("description", c.description)?; + dict.set_item("cellType", c.cell_type)?; + dict.set_item("tissue", c.tissue)?; + dict.set_item("antibody", c.antibody)?; + dict.set_item("treatment", c.treatment)?; + dict.set_item("dataSource", c.data_source)?; + dict.set_item("filename", c.filename)?; + dict.set_item("qValue", c.q_value)?; + dict.set_item("size", c.db_set_size)?; Ok(dict) } @@ -408,14 +359,6 @@ fn py_build_restricted_universe( // Helpers // ========================================================================= -fn empty_to_none(s: &str) -> Option { - if s.is_empty() { - None - } else { - Some(s.to_string()) - } -} - fn tuples_to_regionset(tuples: Vec<(String, u32, u32)>) -> RegionSet { RegionSet::from( tuples diff --git a/gtars-python/src/models/genome_assembly.rs b/gtars-python/src/models/genome_assembly.rs index 4927771b..e560fb6c 100644 --- a/gtars-python/src/models/genome_assembly.rs +++ b/gtars-python/src/models/genome_assembly.rs @@ -1,4 +1,4 @@ -use gtars_genomicdist::models::GenomeAssembly; +use gtars_genomicdist::models::{BinaryGenomeAssembly, GenomeAssembly}; use pyo3::prelude::*; #[pyclass(name = "GenomeAssembly")] @@ -22,3 +22,29 @@ impl PyGenomeAssembly { Ok(PyGenomeAssembly { genome_assembly }) } } + +/// Memory-mapped genome assembly backed by a .fab binary file. +/// +/// Near-instant construction via mmap with zero-copy sequence access. +/// Create .fab files with ``gtars prep --fasta ``. +#[pyclass(name = "BinaryGenomeAssembly")] +pub struct PyBinaryGenomeAssembly { + pub assembly: BinaryGenomeAssembly, +} + +#[pymethods] +impl PyBinaryGenomeAssembly { + #[new] + /// Open a .fab binary FASTA file. + /// + /// Args: + /// path: path to the .fab file + /// + /// Returns: + /// BinaryGenomeAssembly object + pub fn new(path: &Bound<'_, PyAny>) -> PyResult { + let assembly = BinaryGenomeAssembly::try_from(path.to_string()) + .map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))?; + Ok(PyBinaryGenomeAssembly { assembly }) + } +} diff --git a/gtars-python/src/models/mod.rs b/gtars-python/src/models/mod.rs index 0529af33..0cf9d038 100644 --- a/gtars-python/src/models/mod.rs +++ b/gtars-python/src/models/mod.rs @@ -13,12 +13,11 @@ pub(crate) mod tss_index; pub use self::gda::PyGenomicDistAnnotation; pub use self::gene_model::PyGeneModel; -pub use self::genome_assembly::PyGenomeAssembly; +pub use self::genome_assembly::{PyBinaryGenomeAssembly, PyGenomeAssembly}; pub use self::interval::PyInterval; pub use self::partition_list::PyPartitionList; pub use self::region::PyRegion; -pub use self::region_set::PyChromosomeStatistics; -pub use self::region_set::PyRegionSet; +pub use self::region_set::{PyChromosomeStatistics, PyRegionSet}; pub use self::region_set_list::PyRegionSetList; pub use self::signal_matrix::PySignalMatrix; pub use self::tss_index::PyTssIndex; @@ -30,6 +29,7 @@ pub fn models(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; diff --git a/gtars-python/src/models/region_set.rs b/gtars-python/src/models/region_set.rs index 720eb977..90a59c48 100644 --- a/gtars-python/src/models/region_set.rs +++ b/gtars-python/src/models/region_set.rs @@ -298,43 +298,38 @@ impl PyRegionSet { self.regionset.calc_widths() } - fn neighbor_distances(&self) -> PyResult>> { - let dists = self - .regionset + /// Signed gaps between consecutive regions on each chromosome. + /// + /// Output length may be shorter than input region count — chromosomes with + /// fewer than 2 regions are skipped (no neighbors to measure against). Output + /// is not aligned 1:1 with input regions. + fn neighbor_distances(&self) -> PyResult> { + self.regionset .calc_neighbor_distances() - .map_err(|e| PyValueError::new_err(e.to_string()))?; - Ok(dists - .into_iter() - .map(|d| { - if d == i64::MAX { - None - } else { - Some(d as f64) - } - }) - .collect()) + .map_err(|e| PyValueError::new_err(e.to_string())) } - fn nearest_neighbors(&self) -> PyResult>> { - let dists = self - .regionset + /// Distance from each region to its nearest neighbor on the same chromosome. + /// + /// Output length may be shorter than input region count — chromosomes with + /// only one region are skipped. Output is not aligned 1:1 with input regions. + fn nearest_neighbors(&self) -> PyResult> { + self.regionset .calc_nearest_neighbors() - .map_err(|e| PyValueError::new_err(e.to_string()))?; - Ok(dists - .into_iter() - .map(|d| { - if d == u32::MAX { - None - } else { - Some(d as f64) - } - }) - .collect()) + .map_err(|e| PyValueError::new_err(e.to_string())) } - #[pyo3(signature = (n_bins = 250))] - fn distribution<'py>(&self, py: Python<'py>, n_bins: u32) -> PyResult>> { - let bins = self.regionset.region_distribution_with_bins(n_bins); + #[pyo3(signature = (n_bins = 250, chrom_sizes = None))] + fn distribution<'py>( + &self, + py: Python<'py>, + n_bins: u32, + chrom_sizes: Option>, + ) -> PyResult>> { + let bins = match chrom_sizes { + Some(cs) => self.regionset.region_distribution_with_chrom_sizes(n_bins, &cs), + None => self.regionset.region_distribution_with_bins(n_bins), + }; let mut sorted_bins: Vec<_> = bins.into_values().collect(); sorted_bins.sort_by(|a, b| { a.chr @@ -512,6 +507,17 @@ impl PyRegionSet { result } + + /// Return gaps between regions per chromosome, bounded by chrom sizes. + /// + /// Emits leading gaps, inter-region gaps, trailing gaps (to chrom_size), + /// and full-chromosome gaps for any chromosome in ``chrom_sizes`` that + /// has no regions at all. + fn gaps(&self, chrom_sizes: HashMap) -> PyResult { + let rs = self.regionset.gaps(&chrom_sizes); + Ok(Self::from_regionset(rs)) + } + } #[pymethods] @@ -556,3 +562,4 @@ impl PyChromosomeStatistics { self.median_region_length } } + diff --git a/gtars-python/src/refget/mod.rs b/gtars-python/src/refget/mod.rs index 9febc122..d2b9a54b 100644 --- a/gtars-python/src/refget/mod.rs +++ b/gtars-python/src/refget/mod.rs @@ -2522,9 +2522,9 @@ impl PyRefgetStore { /// /// Returns: /// dict: {'sequences': [...], 'collections': [...]} - fn available_alias_namespaces(&self) -> PyResult { + fn available_alias_namespaces(&self) -> PyResult> { let aliases = self.inner.available_alias_namespaces(); - Python::with_gil(|py| { + Python::attach(|py| { let dict = pyo3::types::PyDict::new(py); let seq_list: Vec<&str> = aliases.sequences.iter().map(|s| s.as_str()).collect(); let coll_list: Vec<&str> = aliases.collections.iter().map(|s| s.as_str()).collect(); @@ -2543,7 +2543,7 @@ impl PyRefgetStore { /// Returns: /// dict: {pulled, skipped, not_found, conflicts} #[pyo3(signature = (namespace=None, strategy="keep-ours"))] - fn pull_aliases(&mut self, namespace: Option<&str>, strategy: &str) -> PyResult { + fn pull_aliases(&mut self, namespace: Option<&str>, strategy: &str) -> PyResult> { let sync_strategy = parse_sync_strategy(strategy)?; let result = self .inner @@ -2561,7 +2561,7 @@ impl PyRefgetStore { /// Returns: /// dict: {pulled, skipped, not_found, conflicts} #[pyo3(signature = (digest=None, strategy="keep-ours"))] - fn pull_fhr(&mut self, digest: Option<&str>, strategy: &str) -> PyResult { + fn pull_fhr(&mut self, digest: Option<&str>, strategy: &str) -> PyResult> { let sync_strategy = parse_sync_strategy(strategy)?; let result = self .inner @@ -3102,8 +3102,8 @@ fn parse_sync_strategy(strategy: &str) -> PyResult { } /// Convert a PullResult into a Python dict. -fn pull_result_to_pyobject(result: gtars_refget::PullResult) -> PyResult { - Python::with_gil(|py| { +fn pull_result_to_pyobject(result: gtars_refget::PullResult) -> PyResult> { + Python::attach(|py| { let dict = pyo3::types::PyDict::new(py); dict.set_item("pulled", result.pulled)?; dict.set_item("skipped", result.skipped)?; diff --git a/gtars-python/tests/test_genomicdist.py b/gtars-python/tests/test_genomicdist.py index e6a4ee5a..2b1c0d5e 100644 --- a/gtars-python/tests/test_genomicdist.py +++ b/gtars-python/tests/test_genomicdist.py @@ -55,8 +55,18 @@ def test_neighbor_distances_same_chrom(self): ) dists = rs.neighbor_distances() assert len(dists) == 2 - assert dists[0] == 100.0 # 300 - 200 - assert dists[1] == 100.0 # 500 - 400 + assert dists[0] == 100 # 300 - 200 + assert dists[1] == 100 # 500 - 400 + # Return type is List[int] (no None/sentinel wrapping) + assert all(isinstance(d, int) for d in dists) + + def test_nearest_neighbors_return_type(self): + # Confirm no None/Optional wrapping — the core function skips + # single-region chroms rather than emitting sentinels. + rs = make_regionset([("chr1", 0, 10), ("chr1", 20, 30)]) + nn = rs.nearest_neighbors() + assert all(isinstance(d, int) for d in nn) + assert None not in nn def test_neighbor_distances_overlapping(self): rs = make_regionset([("chr1", 100, 300), ("chr1", 200, 400)]) @@ -104,6 +114,38 @@ def test_distribution_default_bins(self): dist = rs.distribution() assert len(dist) > 0 + def test_distribution_with_chrom_sizes(self): + """Passing chrom_sizes uses reference-derived per-chrom bin sizes.""" + rs = make_regionset([("chr1", 0, 100), ("chr2", 200, 300)]) + # chr1 size 1000, chr2 size 500, 10 bins each → chr1 bin=100, chr2 bin=50 + chrom_sizes = {"chr1": 1000, "chr2": 500} + dist = rs.distribution(10, chrom_sizes) + assert isinstance(dist, list) + # chr1 bin width should be 100 (1000/10) + chr1_bins = [d for d in dist if d["chr"] == "chr1"] + assert all(d["end"] - d["start"] == 100 for d in chr1_bins) + # chr2 bin width should be 50 (500/10) + chr2_bins = [d for d in dist if d["chr"] == "chr2"] + assert all(d["end"] - d["start"] == 50 for d in chr2_bins) + + def test_distribution_chrom_sizes_skips_out_of_range(self): + """Regions beyond stated chrom_size are skipped (assembly mismatch case).""" + # chr1 size 1000: region at mid=1250 is out of range + # chr2 size 500: region at mid=2050 is out of range + rs = make_regionset([ + ("chr1", 100, 200), # mid=150, in range + ("chr1", 1200, 1300), # mid=1250, out of range + ("chr2", 200, 300), # mid=250, in range + ("chr2", 2000, 2100), # mid=2050, out of range + ]) + chrom_sizes = {"chr1": 1000, "chr2": 500} + dist = rs.distribution(10, chrom_sizes) + # 2 of 4 regions should be in bins; bins are well-formed (no end < start) + total_in_bins = sum(d["n"] for d in dist) + assert total_in_bins == 2 + assert all(d["end"] > d["start"] for d in dist) + assert all(d["rid"] < 10 for d in dist) + # ========================================================================= # Interval range operations diff --git a/gtars-python/tests/test_regionset.py b/gtars-python/tests/test_regionset.py index 8da4d628..eabc5712 100644 --- a/gtars-python/tests/test_regionset.py +++ b/gtars-python/tests/test_regionset.py @@ -100,3 +100,26 @@ def test_disjoin_empty(self): rs = _rs() result = rs.disjoin() assert len(result) == 0 + + +class TestGaps: + def test_gaps_basic(self): + """Gaps should emit leading, inter-region, and trailing gaps.""" + rs = _rs(("chr1", 10, 20), ("chr1", 30, 40), ("chr1", 50, 60)) + result = rs.gaps({"chr1": 100}) + coords = [(r.start, r.end) for r in result] + assert coords == [(0, 10), (20, 30), (40, 50), (60, 100)] + + def test_gaps_full_chrom_for_empty(self): + """A chromosome in chrom_sizes with no regions yields a full-chrom gap.""" + rs = _rs(("chr1", 10, 20)) + result = rs.gaps({"chr1": 100, "chr2": 50}) + chr2 = [(r.start, r.end) for r in result if r.chr == "chr2"] + assert chr2 == [(0, 50)] + + def test_gaps_empty(self): + rs = _rs() + result = rs.gaps({}) + assert len(result) == 0 + + diff --git a/gtars-r/DESCRIPTION b/gtars-r/DESCRIPTION index 76f270dc..b16592a7 100644 --- a/gtars-r/DESCRIPTION +++ b/gtars-r/DESCRIPTION @@ -1,6 +1,6 @@ Package: gtars Title: Performance critical genomic interval analysis using Rust, in R -Version: 0.8.0 +Version: 0.9.0 Authors@R: person("Nathan", "LeRoy", , "nleroy917@gmail.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-7354-7213")) diff --git a/gtars-r/NAMESPACE b/gtars-r/NAMESPACE index ad5424f2..757991cb 100644 --- a/gtars-r/NAMESPACE +++ b/gtars-r/NAMESPACE @@ -23,6 +23,7 @@ export(calcWidth) export(checkUniverseAppropriateness) export(check_universe) export(chromosomeStatistics) +export(clusterRegions) export(compare) export(concat) export(consensus) @@ -85,6 +86,7 @@ export(list_sequences) export(loadGenomeAssembly) export(loadRegionDB) export(loadRegionDBFromBeds) +export(load_binary_genome_assembly) export(load_collection_aliases) export(load_fhr_metadata) export(load_from_directory) @@ -117,6 +119,7 @@ export(r_calc_summary_signal_from_matrix) export(r_calc_tss_distances) export(r_calc_widths) export(r_chromosome_statistics) +export(r_cluster) export(r_concat) export(r_consensus) export(r_count_overlaps) @@ -184,6 +187,7 @@ exportMethods("[") exportMethods("[[") exportMethods(as.data.frame) exportMethods(chromosomeStatistics) +exportMethods(clusterRegions) exportMethods(concat) exportMethods(countOverlaps) exportMethods(disjoin) diff --git a/gtars-r/R/RegionSet-methods.R b/gtars-r/R/RegionSet-methods.R index 1046bbc4..8441e2e6 100644 --- a/gtars-r/R/RegionSet-methods.R +++ b/gtars-r/R/RegionSet-methods.R @@ -137,6 +137,30 @@ setMethod("distribution", "ANY", function(x, nBins = 250L, chromSizes = NULL, .. distribution(RegionSet(x), nBins = nBins, chromSizes = chromSizes) }) +#' Cluster regions via single-linkage with a stitching radius +#' +#' @description Assigns a cluster ID to each region. Regions within +#' ``maxGap`` bp of another region on the same chromosome are linked +#' into the same cluster; chromosome boundaries always break clusters. +#' @param x A RegionSet, GRanges, file path, or data.frame +#' @param maxGap Maximum bp gap between regions to link (default 0) +#' @param ... ignored +#' @return Integer vector of 0-based cluster IDs in original region order +#' @export +setGeneric("clusterRegions", function(x, maxGap = 0L, ...) standardGeneric("clusterRegions")) + +#' @rdname clusterRegions +#' @export +setMethod("clusterRegions", "RegionSet", function(x, maxGap = 0L, ...) { + .Call(wrap__r_cluster, .ptr(x), as.integer(maxGap)) +}) + +#' @rdname clusterRegions +#' @export +setMethod("clusterRegions", "ANY", function(x, maxGap = 0L, ...) { + clusterRegions(RegionSet(x), maxGap = maxGap) +}) + # ========================================================================= # IRanges generics: trim, reduce, promoters # @@ -380,29 +404,44 @@ setMethod("disjoin", "data.frame", function(x, ...) { disjoin(RegionSet(x)) }) -#' Return gaps between regions per chromosome +#' Return gaps between regions per chromosome, bounded by chromosome sizes +#' +#' Emits the peak-free intervals of each chromosome listed in ``chrom_sizes``: +#' leading gaps from 0, inter-region gaps, trailing gaps to the chromosome +#' end, and full-chromosome gaps for any chromosome with no regions. +#' Regions on chromosomes not listed in ``chrom_sizes`` are skipped. #' #' @param x A RegionSet, GRanges, file path, or data.frame +#' @param chrom_sizes Named integer vector of chromosome sizes (name = +#' chromosome, value = size in bp). Required. +#' @param start,end Ignored. Accepted only for compatibility with the +#' \code{IRanges::gaps} generic signature; use ``chrom_sizes`` instead. #' @param ... ignored -#' @return A RegionSet with gap regions +#' @return A RegionSet containing the gap intervals #' @rdname gaps #' @export -setMethod("gaps", "RegionSet", function(x, start = NA, end = NA, ...) { - .rs_from_ptr(.Call(wrap__r_gaps, .ptr(x))) +setMethod("gaps", "RegionSet", function(x, start = NA, end = NA, chrom_sizes = NULL, ...) { + if (is.null(chrom_sizes)) { + stop("'chrom_sizes' is required: pass a named integer vector of chromosome sizes") + } + .rs_from_ptr(.Call( + wrap__r_gaps, + .ptr(x), + names(chrom_sizes), + as.integer(chrom_sizes) + )) }) #' @rdname gaps #' @export -setMethod("gaps", "character", function(x, start = NA, - end = NA, ...) { - gaps(RegionSet(x)) +setMethod("gaps", "character", function(x, start = NA, end = NA, chrom_sizes = NULL, ...) { + gaps(RegionSet(x), chrom_sizes = chrom_sizes) }) #' @rdname gaps #' @export -setMethod("gaps", "data.frame", function(x, start = NA, - end = NA, ...) { - gaps(RegionSet(x)) +setMethod("gaps", "data.frame", function(x, start = NA, end = NA, chrom_sizes = NULL, ...) { + gaps(RegionSet(x), chrom_sizes = chrom_sizes) }) # ========================================================================= diff --git a/gtars-r/R/extendr-wrappers.R b/gtars-r/R/extendr-wrappers.R index 22741945..09869621 100644 --- a/gtars-r/R/extendr-wrappers.R +++ b/gtars-r/R/extendr-wrappers.R @@ -60,23 +60,49 @@ r_chromosome_statistics <- function(rs_ptr) .Call(wrap__r_chromosome_statistics, #' @param n_bins Number of bins (default 250) r_region_distribution <- function(rs_ptr, n_bins, chrom_names, chrom_lengths) .Call(wrap__r_region_distribution, rs_ptr, n_bins, chrom_names, chrom_lengths) +#' Cluster nearby regions via single-linkage with a stitching radius. +#' +#' Returns an integer vector of cluster IDs in original region order. +#' Regions within `max_gap` bp of another region on the same chromosome +#' are assigned the same cluster. Chromosome boundaries always break clusters. +#' +#' @export +#' @param rs_ptr External pointer to a RegionSet +#' @param max_gap Maximum bp gap between regions to link (non-negative) +r_cluster <- function(rs_ptr, max_gap) .Call(wrap__r_cluster, rs_ptr, max_gap) + #' Load a genome assembly from a FASTA file #' @export #' @param fasta_path Path to a FASTA file load_genome_assembly <- function(fasta_path) .Call(wrap__r_load_genome_assembly, fasta_path) +#' Load a binary genome assembly from a .fab file +#' @export +#' @param fab_path Path to a .fab binary FASTA file (created by gtars prep --fasta) +load_binary_genome_assembly <- function(fab_path) .Call(wrap__r_load_binary_genome_assembly, fab_path) + #' Calculate GC content for each region #' @export #' @param rs_ptr External pointer to a RegionSet #' @param assembly_ptr External pointer to a GenomeAssembly -#' @param ignore_unk_chroms Skip regions on chromosomes not in the assembly +#' @param ignore_unk_chroms Skip regions on chromosomes not in the assembly. +#' Pass NULL (the default) or FALSE to error on unknown chromosomes. r_calc_gc_content <- function(rs_ptr, assembly_ptr, ignore_unk_chroms) .Call(wrap__r_calc_gc_content, rs_ptr, assembly_ptr, ignore_unk_chroms) -#' Calculate per-region dinucleotide frequencies (percentages) +#' Calculate per-region dinucleotide frequencies (matches R GenomicDistributions calcDinuclFreq) +#' +#' Returns a list with a ``region`` column and 16 dinucleotide columns +#' (AA, AC, ..., TT). When ``raw_counts`` is FALSE (default), each row sums +#' to 100 (percentages). When TRUE, values are raw integer counts. +#' +#' For pooled global counts, sum each dinucleotide column across rows +#' (with ``raw_counts=TRUE``). #' @export #' @param rs_ptr External pointer to a RegionSet #' @param assembly_ptr External pointer to a GenomeAssembly -r_calc_dinucl_freq <- function(rs_ptr, assembly_ptr) .Call(wrap__r_calc_dinucl_freq, rs_ptr, assembly_ptr) +#' @param raw_counts Return raw counts instead of percentages (default FALSE, matches R upstream) +#' @param ignore_unk_chroms Skip regions on chromosomes not in the assembly (default FALSE) +r_calc_dinucl_freq <- function(rs_ptr, assembly_ptr, raw_counts, ignore_unk_chroms) .Call(wrap__r_calc_dinucl_freq, rs_ptr, assembly_ptr, raw_counts, ignore_unk_chroms) #' Trim regions to chromosome boundaries #' @export @@ -166,10 +192,18 @@ r_narrow <- function(rs_ptr, start, end, width) .Call(wrap__r_narrow, rs_ptr, st #' @param rs_ptr External pointer to a RegionSet r_disjoin <- function(rs_ptr) .Call(wrap__r_disjoin, rs_ptr) -#' Return gaps between regions per chromosome +#' Return gaps between regions per chromosome, bounded by chrom sizes +#' +#' Emits the peak-free intervals of each chromosome listed in `chrom_sizes`, +#' including leading gaps (from 0), inter-region gaps, trailing gaps +#' (to `chrom_size`), and full-chromosome gaps for chromosomes with no +#' regions. Regions on chromosomes not present in `chrom_sizes` are skipped. +#' #' @export #' @param rs_ptr External pointer to a RegionSet -r_gaps <- function(rs_ptr) .Call(wrap__r_gaps, rs_ptr) +#' @param chrom_names Character vector of chromosome names +#' @param chrom_sizes_vec Integer vector of chromosome sizes +r_gaps <- function(rs_ptr, chrom_names, chrom_sizes_vec) .Call(wrap__r_gaps, rs_ptr, chrom_names, chrom_sizes_vec) #' Range-level intersection of two region sets #' @export diff --git a/gtars-r/R/genomicdist.R b/gtars-r/R/genomicdist.R index 30340ce2..8b4e5efb 100644 --- a/gtars-r/R/genomicdist.R +++ b/gtars-r/R/genomicdist.R @@ -96,18 +96,29 @@ calcGCContent <- function(query, ref, ignoreUnkChroms = FALSE) { #' Calculate dinucleotide frequency #' #' @description Calculates the frequency of all 16 dinucleotides for each -#' region as percentages. Drop-in replacement for -#' GenomicDistributions::calcDinuclFreq. +#' region. Drop-in replacement for GenomicDistributions::calcDinuclFreq, +#' including the \code{rawCounts} parameter. #' #' @param query A GRanges object, file path, data.frame, or RegionSet #' @param ref A GenomeAssembly pointer (from loadGenomeAssembly) +#' @param rawCounts If TRUE, return raw integer counts; if FALSE (default), +#' return percentages per row (each row sums to 100) +#' @param ignoreUnknownChroms If TRUE, skip regions on chromosomes not in +#' the assembly. If FALSE (default), error on unknown chromosomes. #' @return A data.frame with a \code{region} column and 16 dinucleotide -#' columns containing frequency percentages. +#' columns. For pooled global counts, call \code{colSums(result[, -1])} +#' on a \code{rawCounts=TRUE} result. #' #' @export -calcDinuclFreq <- function(query, ref) { +calcDinuclFreq <- function(query, ref, rawCounts = FALSE, ignoreUnknownChroms = FALSE) { query <- .ensure_regionset(query) - result <- .Call(wrap__r_calc_dinucl_freq, query, ref) + result <- .Call( + wrap__r_calc_dinucl_freq, + query, + ref, + rawCounts, + ignoreUnknownChroms + ) as.data.frame(result, stringsAsFactors = FALSE) } diff --git a/gtars-r/man/calcDinuclFreq.Rd b/gtars-r/man/calcDinuclFreq.Rd index 7f257c8b..d30edd62 100644 --- a/gtars-r/man/calcDinuclFreq.Rd +++ b/gtars-r/man/calcDinuclFreq.Rd @@ -4,19 +4,26 @@ \alias{calcDinuclFreq} \title{Calculate dinucleotide frequency} \usage{ -calcDinuclFreq(query, ref) +calcDinuclFreq(query, ref, rawCounts = FALSE, ignoreUnknownChroms = FALSE) } \arguments{ \item{query}{A GRanges object, file path, data.frame, or RegionSet} \item{ref}{A GenomeAssembly pointer (from loadGenomeAssembly)} + +\item{rawCounts}{If TRUE, return raw integer counts; if FALSE (default), +return percentages per row (each row sums to 100)} + +\item{ignoreUnknownChroms}{If TRUE, skip regions on chromosomes not in +the assembly. If FALSE (default), error on unknown chromosomes.} } \value{ A data.frame with a \code{region} column and 16 dinucleotide -columns containing frequency percentages. +columns. For pooled global counts, call \code{colSums(result[, -1])} +on a \code{rawCounts=TRUE} result. } \description{ Calculates the frequency of all 16 dinucleotides for each -region as percentages. Drop-in replacement for -GenomicDistributions::calcDinuclFreq. +region. Drop-in replacement for GenomicDistributions::calcDinuclFreq, +including the \code{rawCounts} parameter. } diff --git a/gtars-r/man/clusterRegions.Rd b/gtars-r/man/clusterRegions.Rd new file mode 100644 index 00000000..deb5eedb --- /dev/null +++ b/gtars-r/man/clusterRegions.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RegionSet-methods.R +\name{clusterRegions} +\alias{clusterRegions} +\alias{clusterRegions,RegionSet-method} +\alias{clusterRegions,ANY-method} +\title{Cluster regions via single-linkage with a stitching radius} +\usage{ +clusterRegions(x, maxGap = 0L, ...) + +\S4method{clusterRegions}{RegionSet}(x, maxGap = 0L, ...) + +\S4method{clusterRegions}{ANY}(x, maxGap = 0L, ...) +} +\arguments{ +\item{x}{A RegionSet, GRanges, file path, or data.frame} + +\item{maxGap}{Maximum bp gap between regions to link (default 0)} + +\item{...}{ignored} +} +\value{ +Integer vector of 0-based cluster IDs in original region order +} +\description{ +Assigns a cluster ID to each region. Regions within +\code{maxGap} bp of another region on the same chromosome are linked +into the same cluster; chromosome boundaries always break clusters. +} diff --git a/gtars-r/man/gaps.Rd b/gtars-r/man/gaps.Rd index e11b11ee..9eb34d57 100644 --- a/gtars-r/man/gaps.Rd +++ b/gtars-r/man/gaps.Rd @@ -4,22 +4,31 @@ \alias{gaps,RegionSet-method} \alias{gaps,character-method} \alias{gaps,data.frame-method} -\title{Return gaps between regions per chromosome} +\title{Return gaps between regions per chromosome, bounded by chromosome sizes} \usage{ -\S4method{gaps}{RegionSet}(x, start = NA, end = NA, ...) +\S4method{gaps}{RegionSet}(x, start = NA, end = NA, chrom_sizes = NULL, ...) -\S4method{gaps}{character}(x, start = NA, end = NA, ...) +\S4method{gaps}{character}(x, start = NA, end = NA, chrom_sizes = NULL, ...) -\S4method{gaps}{data.frame}(x, start = NA, end = NA, ...) +\S4method{gaps}{data.frame}(x, start = NA, end = NA, chrom_sizes = NULL, ...) } \arguments{ \item{x}{A RegionSet, GRanges, file path, or data.frame} +\item{start, end}{Ignored. Accepted only for compatibility with the +\code{IRanges::gaps} generic signature; use \code{chrom_sizes} instead.} + +\item{chrom_sizes}{Named integer vector of chromosome sizes (name = +chromosome, value = size in bp). Required.} + \item{...}{ignored} } \value{ -A RegionSet with gap regions +A RegionSet containing the gap intervals } \description{ -Return gaps between regions per chromosome +Emits the peak-free intervals of each chromosome listed in \code{chrom_sizes}: +leading gaps from 0, inter-region gaps, trailing gaps to the chromosome +end, and full-chromosome gaps for any chromosome with no regions. +Regions on chromosomes not listed in \code{chrom_sizes} are skipped. } diff --git a/gtars-r/man/load_binary_genome_assembly.Rd b/gtars-r/man/load_binary_genome_assembly.Rd new file mode 100644 index 00000000..b349a274 --- /dev/null +++ b/gtars-r/man/load_binary_genome_assembly.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/extendr-wrappers.R +\name{load_binary_genome_assembly} +\alias{load_binary_genome_assembly} +\title{Load a binary genome assembly from a .fab file} +\usage{ +load_binary_genome_assembly(fab_path) +} +\arguments{ +\item{fab_path}{Path to a .fab binary FASTA file (created by gtars prep --fasta)} +} +\description{ +Load a binary genome assembly from a .fab file +} diff --git a/gtars-r/man/r_calc_dinucl_freq.Rd b/gtars-r/man/r_calc_dinucl_freq.Rd index d29f4941..e2d2315a 100644 --- a/gtars-r/man/r_calc_dinucl_freq.Rd +++ b/gtars-r/man/r_calc_dinucl_freq.Rd @@ -2,15 +2,25 @@ % Please edit documentation in R/extendr-wrappers.R \name{r_calc_dinucl_freq} \alias{r_calc_dinucl_freq} -\title{Calculate per-region dinucleotide frequencies (percentages)} +\title{Calculate per-region dinucleotide frequencies (matches R GenomicDistributions calcDinuclFreq)} \usage{ -r_calc_dinucl_freq(rs_ptr, assembly_ptr) +r_calc_dinucl_freq(rs_ptr, assembly_ptr, raw_counts, ignore_unk_chroms) } \arguments{ \item{rs_ptr}{External pointer to a RegionSet} \item{assembly_ptr}{External pointer to a GenomeAssembly} + +\item{raw_counts}{Return raw counts instead of percentages (default FALSE, matches R upstream)} + +\item{ignore_unk_chroms}{Skip regions on chromosomes not in the assembly (default FALSE)} } \description{ -Calculate per-region dinucleotide frequencies (percentages) +Returns a list with a \code{region} column and 16 dinucleotide columns +(AA, AC, ..., TT). When \code{raw_counts} is FALSE (default), each row sums +to 100 (percentages). When TRUE, values are raw integer counts. +} +\details{ +For pooled global counts, sum each dinucleotide column across rows +(with \code{raw_counts=TRUE}). } diff --git a/gtars-r/man/r_calc_gc_content.Rd b/gtars-r/man/r_calc_gc_content.Rd index 69f277b2..eb8b77a8 100644 --- a/gtars-r/man/r_calc_gc_content.Rd +++ b/gtars-r/man/r_calc_gc_content.Rd @@ -11,7 +11,8 @@ r_calc_gc_content(rs_ptr, assembly_ptr, ignore_unk_chroms) \item{assembly_ptr}{External pointer to a GenomeAssembly} -\item{ignore_unk_chroms}{Skip regions on chromosomes not in the assembly} +\item{ignore_unk_chroms}{Skip regions on chromosomes not in the assembly. +Pass NULL (the default) or FALSE to error on unknown chromosomes.} } \description{ Calculate GC content for each region diff --git a/gtars-r/man/r_cluster.Rd b/gtars-r/man/r_cluster.Rd new file mode 100644 index 00000000..038e8130 --- /dev/null +++ b/gtars-r/man/r_cluster.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/extendr-wrappers.R +\name{r_cluster} +\alias{r_cluster} +\title{Cluster nearby regions via single-linkage with a stitching radius.} +\usage{ +r_cluster(rs_ptr, max_gap) +} +\arguments{ +\item{rs_ptr}{External pointer to a RegionSet} + +\item{max_gap}{Maximum bp gap between regions to link (non-negative)} +} +\description{ +Returns an integer vector of cluster IDs in original region order. +Regions within \code{max_gap} bp of another region on the same chromosome +are assigned the same cluster. Chromosome boundaries always break clusters. +} diff --git a/gtars-r/man/r_gaps.Rd b/gtars-r/man/r_gaps.Rd index 86c01554..9d4e7176 100644 --- a/gtars-r/man/r_gaps.Rd +++ b/gtars-r/man/r_gaps.Rd @@ -2,13 +2,20 @@ % Please edit documentation in R/extendr-wrappers.R \name{r_gaps} \alias{r_gaps} -\title{Return gaps between regions per chromosome} +\title{Return gaps between regions per chromosome, bounded by chrom sizes} \usage{ -r_gaps(rs_ptr) +r_gaps(rs_ptr, chrom_names, chrom_sizes_vec) } \arguments{ \item{rs_ptr}{External pointer to a RegionSet} + +\item{chrom_names}{Character vector of chromosome names} + +\item{chrom_sizes_vec}{Integer vector of chromosome sizes} } \description{ -Return gaps between regions per chromosome +Emits the peak-free intervals of each chromosome listed in \code{chrom_sizes}, +including leading gaps (from 0), inter-region gaps, trailing gaps +(to \code{chrom_size}), and full-chromosome gaps for chromosomes with no +regions. Regions on chromosomes not present in \code{chrom_sizes} are skipped. } diff --git a/gtars-r/src/rust/Cargo.toml b/gtars-r/src/rust/Cargo.toml index d148db6b..a16ab051 100644 --- a/gtars-r/src/rust/Cargo.toml +++ b/gtars-r/src/rust/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "gtars-r" -version = "0.8.0" +version = "0.9.0" edition = "2021" [lib] diff --git a/gtars-r/src/rust/src/genomicdist.rs b/gtars-r/src/rust/src/genomicdist.rs index 4df57943..1b9d7b80 100644 --- a/gtars-r/src/rust/src/genomicdist.rs +++ b/gtars-r/src/rust/src/genomicdist.rs @@ -3,12 +3,12 @@ use std::collections::HashMap; use extendr_api::prelude::*; use gtars_core::models::{Region, RegionSet, RegionSetList}; -use gtars_genomicdist::models::{GenomeAssembly, TssIndex}; +use gtars_genomicdist::models::{BinaryGenomeAssembly, GenomeAssembly, TssIndex}; use gtars_overlaprs::RegionSetOverlaps; use gtars_genomicdist::{ - calc_dinucl_freq_per_region, calc_gc_content, calc_summary_signal, chrom_karyotype_key, - consensus, genome_partition_list, calc_expected_partitions, calc_partitions, - pairwise_jaccard, + calc_dinucl_freq, calc_gc_content, calc_summary_signal, + chrom_karyotype_key, consensus, genome_partition_list, calc_expected_partitions, + calc_partitions, pairwise_jaccard, CoordinateMode, GenomicDistAnnotation, GenomicIntervalSetStatistics, GeneModel, IntervalRanges, PartitionList, SignalMatrix, Strand, StrandedRegionSet, DINUCL_ORDER, }; @@ -26,12 +26,22 @@ macro_rules! with_regionset { }}; } +/// Try to extract either a GenomeAssembly or BinaryGenomeAssembly pointer +/// and call the provided function with `&dyn SequenceAccess`. macro_rules! with_assembly { ($ptr:expr, $asm:ident, $body:block) => {{ - let ext_ptr = >::try_from($ptr) - .map_err(|_| extendr_api::Error::Other("Invalid GenomeAssembly pointer".into()))?; - let $asm = &*ext_ptr; - $body + use gtars_genomicdist::models::SequenceAccess; + if let Ok(ext_ptr) = >::try_from($ptr.clone()) { + let $asm: &dyn SequenceAccess = &*ext_ptr; + $body + } else if let Ok(ext_ptr) = >::try_from($ptr) { + let $asm: &dyn SequenceAccess = &*ext_ptr; + $body + } else { + Err(extendr_api::Error::Other( + "Invalid assembly pointer: expected GenomeAssembly or BinaryGenomeAssembly".into(), + )) + } }}; } @@ -135,9 +145,11 @@ pub fn r_regionset_length(rs_ptr: Robj) -> extendr_api::Result { /// @export /// @param rs_ptr External pointer to a RegionSet #[extendr(r_name = "r_calc_widths")] -pub fn r_calc_widths(rs_ptr: Robj) -> extendr_api::Result> { +pub fn r_calc_widths(rs_ptr: Robj) -> extendr_api::Result> { + // f64 instead of i32: widths are u32 in Rust and can exceed i32::MAX (2.1 Gbp) + // for concatenated/genome-scale regions. f64 safely represents any u32. with_regionset!(rs_ptr, rs, { - Ok(rs.calc_widths().into_iter().map(|w| w as i32).collect()) + Ok(rs.calc_widths().into_iter().map(|w| w as f64).collect()) }) } @@ -158,12 +170,14 @@ pub fn r_calc_neighbor_distances(rs_ptr: Robj) -> extendr_api::Result> /// @export /// @param rs_ptr External pointer to a RegionSet #[extendr(r_name = "r_calc_nearest_neighbors")] -pub fn r_calc_nearest_neighbors(rs_ptr: Robj) -> extendr_api::Result> { +pub fn r_calc_nearest_neighbors(rs_ptr: Robj) -> extendr_api::Result> { + // f64 instead of i32: neighbor distances are chromosome-scale (up to ~250M on + // hg38) and genome-concatenated BEDs could push past i32::MAX (2.1 Gbp). with_regionset!(rs_ptr, rs, { let dists = rs .calc_nearest_neighbors() .map_err(|e| extendr_api::Error::Other(format!("{}", e)))?; - Ok(dists.into_iter().map(|d| d as i32).collect()) + Ok(dists.into_iter().map(|d| d as f64).collect()) }) } @@ -180,21 +194,23 @@ pub fn r_chromosome_statistics(rs_ptr: Robj) -> extendr_api::Result { }); let mut chr_names: Vec = Vec::new(); - let mut n_regions: Vec = Vec::new(); + let mut n_regions: Vec = Vec::new(); let mut start_pos: Vec = Vec::new(); let mut end_pos: Vec = Vec::new(); - let mut min_len: Vec = Vec::new(); - let mut max_len: Vec = Vec::new(); + let mut min_len: Vec = Vec::new(); + let mut max_len: Vec = Vec::new(); let mut mean_len: Vec = Vec::new(); let mut median_len: Vec = Vec::new(); + // f64 for count and length fields: u32 values can exceed i32::MAX (2.1 Gbp) + // for wide regions, and region_count has no upper bound. for (chr, s) in &entries { chr_names.push(chr.clone()); - n_regions.push(s.number_of_regions as i32); + n_regions.push(s.number_of_regions as f64); start_pos.push(s.start_nucleotide_position as f64); end_pos.push(s.end_nucleotide_position as f64); - min_len.push(s.minimum_region_length as i32); - max_len.push(s.maximum_region_length as i32); + min_len.push(s.minimum_region_length as f64); + max_len.push(s.maximum_region_length as f64); mean_len.push(s.mean_region_length); median_len.push(s.median_region_length); } @@ -212,6 +228,25 @@ pub fn r_chromosome_statistics(rs_ptr: Robj) -> extendr_api::Result { }) } +/// Cluster nearby regions via single-linkage with a stitching radius. +/// +/// Returns an integer vector of cluster IDs in original region order. +/// Regions within `max_gap` bp of another region on the same chromosome +/// are assigned the same cluster. Chromosome boundaries always break clusters. +/// +/// @export +/// @param rs_ptr External pointer to a RegionSet +/// @param max_gap Maximum bp gap between regions to link (non-negative) +#[extendr(r_name = "r_cluster")] +pub fn r_cluster(rs_ptr: Robj, max_gap: i32) -> extendr_api::Result> { + let max_gap_u32 = checked_u32(max_gap, "max_gap")?; + with_regionset!(rs_ptr, rs, { + let ids = rs.cluster(max_gap_u32); + // u32 IDs → f64 to match the other count/ID returns in this module. + Ok(ids.into_iter().map(|id| id as f64).collect()) + }) +} + /// Calculate region distribution across chromosome bins /// @export /// @param rs_ptr External pointer to a RegionSet @@ -282,34 +317,79 @@ pub fn r_load_genome_assembly(fasta_path: &str) -> extendr_api::Result { Ok(ExternalPtr::new(assembly).into()) } +/// Load a binary genome assembly from a .fab file +/// @export +/// @param fab_path Path to a .fab binary FASTA file (created by gtars prep --fasta) +#[extendr(r_name = "load_binary_genome_assembly")] +pub fn r_load_binary_genome_assembly(fab_path: &str) -> extendr_api::Result { + let assembly = BinaryGenomeAssembly::try_from(fab_path) + .map_err(|e| extendr_api::Error::Other(format!("Loading .fab: {}", e)))?; + Ok(ExternalPtr::new(assembly).into()) +} + /// Calculate GC content for each region /// @export /// @param rs_ptr External pointer to a RegionSet /// @param assembly_ptr External pointer to a GenomeAssembly -/// @param ignore_unk_chroms Skip regions on chromosomes not in the assembly +/// @param ignore_unk_chroms Skip regions on chromosomes not in the assembly. +/// Pass NULL (the default) or FALSE to error on unknown chromosomes. #[extendr(r_name = "r_calc_gc_content")] pub fn r_calc_gc_content( rs_ptr: Robj, assembly_ptr: Robj, - ignore_unk_chroms: bool, + ignore_unk_chroms: Robj, ) -> extendr_api::Result> { + // Default to false when NULL/missing — matches Python's default + let ignore = if ignore_unk_chroms.is_null() { + false + } else { + ::try_from(&ignore_unk_chroms) + .map_err(|_| extendr_api::Error::Other("ignore_unk_chroms must be logical".into()))? + }; with_regionset!(rs_ptr, rs, { with_assembly!(assembly_ptr, asm, { - calc_gc_content(rs, asm, ignore_unk_chroms) + calc_gc_content(rs, asm, ignore) .map_err(|e| extendr_api::Error::Other(format!("{}", e))) }) }) } -/// Calculate per-region dinucleotide frequencies (percentages) +/// Calculate per-region dinucleotide frequencies (matches R GenomicDistributions calcDinuclFreq) +/// +/// Returns a list with a ``region`` column and 16 dinucleotide columns +/// (AA, AC, ..., TT). When ``raw_counts`` is FALSE (default), each row sums +/// to 100 (percentages). When TRUE, values are raw integer counts. +/// +/// For pooled global counts, sum each dinucleotide column across rows +/// (with ``raw_counts=TRUE``). /// @export /// @param rs_ptr External pointer to a RegionSet /// @param assembly_ptr External pointer to a GenomeAssembly +/// @param raw_counts Return raw counts instead of percentages (default FALSE, matches R upstream) +/// @param ignore_unk_chroms Skip regions on chromosomes not in the assembly (default FALSE) #[extendr(r_name = "r_calc_dinucl_freq")] -pub fn r_calc_dinucl_freq(rs_ptr: Robj, assembly_ptr: Robj) -> extendr_api::Result { +pub fn r_calc_dinucl_freq( + rs_ptr: Robj, + assembly_ptr: Robj, + raw_counts: Robj, + ignore_unk_chroms: Robj, +) -> extendr_api::Result { + // Default to false when NULL/missing — matches Python and R GenomicDistributions defaults + let raw = if raw_counts.is_null() { + false + } else { + ::try_from(&raw_counts) + .map_err(|_| extendr_api::Error::Other("raw_counts must be logical".into()))? + }; + let ignore = if ignore_unk_chroms.is_null() { + false + } else { + ::try_from(&ignore_unk_chroms) + .map_err(|_| extendr_api::Error::Other("ignore_unk_chroms must be logical".into()))? + }; with_regionset!(rs_ptr, rs, { with_assembly!(assembly_ptr, asm, { - let (labels, matrix) = calc_dinucl_freq_per_region(rs, asm) + let (labels, matrix) = calc_dinucl_freq(rs, asm, raw, ignore) .map_err(|e| extendr_api::Error::Other(format!("{}", e)))?; // Build column names (uppercase to match GD: AA, AC, AG, ...) @@ -572,13 +652,26 @@ pub fn r_disjoin(rs_ptr: Robj) -> extendr_api::Result { }) } -/// Return gaps between regions per chromosome +/// Return gaps between regions per chromosome, bounded by chrom sizes +/// +/// Emits the peak-free intervals of each chromosome listed in `chrom_sizes`, +/// including leading gaps (from 0), inter-region gaps, trailing gaps +/// (to `chrom_size`), and full-chromosome gaps for chromosomes with no +/// regions. Regions on chromosomes not present in `chrom_sizes` are skipped. +/// /// @export /// @param rs_ptr External pointer to a RegionSet +/// @param chrom_names Character vector of chromosome names +/// @param chrom_sizes_vec Integer vector of chromosome sizes #[extendr(r_name = "r_gaps")] -pub fn r_gaps(rs_ptr: Robj) -> extendr_api::Result { +pub fn r_gaps( + rs_ptr: Robj, + chrom_names: Vec, + chrom_sizes_vec: Vec, +) -> extendr_api::Result { + let chrom_sizes = chrom_sizes_from_vecs(chrom_names, chrom_sizes_vec)?; with_regionset!(rs_ptr, rs, { - let result = rs.gaps(); + let result = rs.gaps(&chrom_sizes); Ok(ExternalPtr::new(result).into()) }) } @@ -1248,8 +1341,11 @@ extendr_module! { fn r_calc_nearest_neighbors; fn r_chromosome_statistics; fn r_region_distribution; + // Clustering + fn r_cluster; // GC / Dinucleotide fn r_load_genome_assembly; + fn r_load_binary_genome_assembly; fn r_calc_gc_content; fn r_calc_dinucl_freq; // Interval Ranges diff --git a/gtars-r/src/rust/src/lola.rs b/gtars-r/src/rust/src/lola.rs index a7c217d4..fa69cb00 100644 --- a/gtars-r/src/rust/src/lola.rs +++ b/gtars-r/src/rust/src/lola.rs @@ -55,79 +55,29 @@ fn extract_region_sets(user_sets: List) -> extendr_api::Result> { Ok(sets) } -/// Convert an empty string to None (becomes NA in R). -fn empty_to_na(s: &str) -> Option { - if s.is_empty() { - None - } else { - Some(s.to_string()) - } -} - /// Convert LOLA results to an R list (data.frame-like structure). fn results_to_list(results: &[gtars_lola::models::LolaResult]) -> List { - let n = results.len(); - let mut user_set = Vec::with_capacity(n); - let mut db_set = Vec::with_capacity(n); - let mut p_value_log = Vec::with_capacity(n); - let mut odds_ratio = Vec::with_capacity(n); - let mut support: Vec = Vec::with_capacity(n); - let mut rnk_pv: Vec = Vec::with_capacity(n); - let mut rnk_or: Vec = Vec::with_capacity(n); - let mut rnk_sup: Vec = Vec::with_capacity(n); - let mut max_rnk: Vec = Vec::with_capacity(n); - let mut mean_rnk = Vec::with_capacity(n); - let mut b_vec: Vec = Vec::with_capacity(n); - let mut c_vec: Vec = Vec::with_capacity(n); - let mut d_vec: Vec = Vec::with_capacity(n); - let mut collection: Vec> = Vec::with_capacity(n); - let mut description: Vec> = Vec::with_capacity(n); - let mut cell_type: Vec> = Vec::with_capacity(n); - let mut tissue: Vec> = Vec::with_capacity(n); - let mut antibody: Vec> = Vec::with_capacity(n); - let mut treatment: Vec> = Vec::with_capacity(n); - let mut data_source: Vec> = Vec::with_capacity(n); - let mut filename = Vec::with_capacity(n); - let mut db_set_size: Vec = Vec::with_capacity(n); - - let mut q_value: Vec> = Vec::with_capacity(n); - - for r in results { - user_set.push((r.user_set + 1) as i32); // 1-based for R - db_set.push((r.db_set + 1) as i32); - p_value_log.push(r.p_value_log); - odds_ratio.push(r.odds_ratio); - support.push(r.support as i32); - rnk_pv.push(r.rnk_pv as i32); - rnk_or.push(r.rnk_or as i32); - rnk_sup.push(r.rnk_sup as i32); - max_rnk.push(r.max_rnk as i32); - mean_rnk.push(r.mean_rnk); - b_vec.push(r.b as i32); - c_vec.push(r.c as i32); - d_vec.push(r.d as i32); - collection.push(empty_to_na(&r.collection)); - description.push(empty_to_na(&r.description)); - cell_type.push(empty_to_na(&r.cell_type)); - tissue.push(empty_to_na(&r.tissue)); - antibody.push(empty_to_na(&r.antibody)); - treatment.push(empty_to_na(&r.treatment)); - data_source.push(empty_to_na(&r.data_source)); - filename.push(r.filename.clone()); - db_set_size.push(r.db_set_size as i32); - q_value.push(r.q_value); - } - - // Convert Option to Rfloat (NA for None) - let q_value_r: Vec = q_value - .iter() - .map(|q| match q { - Some(v) => Rfloat::from(*v), - None => Rfloat::na(), - }) + use gtars_lola::output::results_to_columns; + + let c = results_to_columns(results); + + // R-specific conversions: 1-based indices, i32 casts, NA types + let user_set: Vec = c.user_set.iter().map(|&v| (v + 1) as i32).collect(); + let db_set: Vec = c.db_set.iter().map(|&v| (v + 1) as i32).collect(); + let support: Vec = c.support.iter().map(|&v| v as i32).collect(); + let rnk_pv: Vec = c.rnk_pv.iter().map(|&v| v as i32).collect(); + let rnk_or: Vec = c.rnk_or.iter().map(|&v| v as i32).collect(); + let rnk_sup: Vec = c.rnk_sup.iter().map(|&v| v as i32).collect(); + let max_rnk: Vec = c.max_rnk.iter().map(|&v| v as i32).collect(); + let b_vec: Vec = c.b.iter().map(|&v| v as i32).collect(); + let c_vec: Vec = c.c.iter().map(|&v| v as i32).collect(); + let d_vec: Vec = c.d.iter().map(|&v| v as i32).collect(); + let db_set_size: Vec = c.db_set_size.iter().map(|&v| v as i32).collect(); + + let q_value_r: Vec = c.q_value.iter() + .map(|q| match q { Some(v) => Rfloat::from(*v), None => Rfloat::na() }) .collect(); - // Convert Option to Rstr (NA for None) let to_rstr = |v: &[Option]| -> Vec { v.iter() .map(|s| match s { @@ -140,25 +90,25 @@ fn results_to_list(results: &[gtars_lola::models::LolaResult]) -> List { list!( userSet = user_set, dbSet = db_set, - collection = to_rstr(&collection), - pValueLog = p_value_log, - oddsRatio = odds_ratio, + collection = to_rstr(&c.collection), + pValueLog = c.p_value_log, + oddsRatio = c.odds_ratio, support = support, rnkPV = rnk_pv, rnkOR = rnk_or, rnkSup = rnk_sup, maxRnk = max_rnk, - meanRnk = mean_rnk, + meanRnk = c.mean_rnk, b = b_vec, c = c_vec, d = d_vec, - description = to_rstr(&description), - cellType = to_rstr(&cell_type), - tissue = to_rstr(&tissue), - antibody = to_rstr(&antibody), - treatment = to_rstr(&treatment), - dataSource = to_rstr(&data_source), - filename = filename, + description = to_rstr(&c.description), + cellType = to_rstr(&c.cell_type), + tissue = to_rstr(&c.tissue), + antibody = to_rstr(&c.antibody), + treatment = to_rstr(&c.treatment), + dataSource = to_rstr(&c.data_source), + filename = c.filename, qValue = q_value_r, size = db_set_size ) @@ -432,13 +382,13 @@ pub fn r_regiondb_anno(db: Robj) -> extendr_api::Result { for (i, a) in db_ref.region_anno.iter().enumerate() { filename.push(a.filename.clone()); - cell_type.push(empty_to_na(&a.cell_type)); - description.push(empty_to_na(&a.description)); - tissue.push(empty_to_na(&a.tissue)); - data_source.push(empty_to_na(&a.data_source)); - antibody.push(empty_to_na(&a.antibody)); - treatment.push(empty_to_na(&a.treatment)); - collection.push(empty_to_na(&a.collection)); + cell_type.push(a.cell_type.clone()); + description.push(a.description.clone()); + tissue.push(a.tissue.clone()); + data_source.push(a.data_source.clone()); + antibody.push(a.antibody.clone()); + treatment.push(a.treatment.clone()); + collection.push(a.collection.clone()); size.push(db_ref.region_sets[i].len() as i32); } diff --git a/gtars-r/tests/testthat/test_genomicdist.R b/gtars-r/tests/testthat/test_genomicdist.R index 6245a055..40a8d844 100644 --- a/gtars-r/tests/testthat/test_genomicdist.R +++ b/gtars-r/tests/testthat/test_genomicdist.R @@ -18,9 +18,10 @@ test_that("calcWidth returns correct widths", { skip_if_no_data() rs <- RegionSet(file.path(data_dir, "dummy.bed")) w <- calcWidth(rs) - expect_type(w, "integer") + # doubles instead of integers: u32 widths can exceed i32::MAX (2.1 Gbp) + expect_type(w, "double") expect_equal(length(w), 4L) - expect_equal(w[1], 4L) + expect_equal(w[1], 4) }) test_that("calcNeighborDist returns distances", { @@ -34,7 +35,9 @@ test_that("calcNearestNeighbors returns distances", { skip_if_no_data() rs <- RegionSet(file.path(data_dir, "dummy.bed")) nn <- calcNearestNeighbors(rs) - expect_type(nn, "integer") + # doubles instead of integers: distances can be chromosome-scale, + # approaching or exceeding i32::MAX for concatenated/genome-scale regions + expect_type(nn, "double") expect_equal(length(nn), length(rs)) }) @@ -77,6 +80,23 @@ test_that("calcDinuclFreq returns data.frame with 16 dinucleotide columns", { expect_true(is.data.frame(freq)) # 16 dinucleotides + region column expect_true(ncol(freq) >= 16) + # Default rawCounts=FALSE: each row sums to 100 + num_cols <- freq[, sapply(freq, is.numeric)] + expect_equal(sum(num_cols[1, ]), 100, tolerance = 1e-6) +}) + +test_that("calcDinuclFreq rawCounts=TRUE returns integer counts", { + fasta_path <- file.path(test_path(), "..", "..", "..", "tests", "data", "fasta", "base.fa") + skip_if_not(file.exists(fasta_path), "Test FASTA file not found") + + assembly <- loadGenomeAssembly(fasta_path) + # base.fa: chrX = "TTGGGGAA" (8bp) → 7 dinucleotide windows + df <- data.frame(chr = "chrX", start = 0L, end = 8L) + freq <- calcDinuclFreq(df, assembly, rawCounts = TRUE) + expect_true(is.data.frame(freq)) + num_cols <- freq[, sapply(freq, is.numeric)] + # 8 bases → 7 dinucleotides + expect_equal(sum(num_cols[1, ]), 7) }) # ========================================================================= diff --git a/gtars-r/tests/testthat/test_regionset.R b/gtars-r/tests/testthat/test_regionset.R index 3a547cd7..e455eaec 100644 --- a/gtars-r/tests/testthat/test_regionset.R +++ b/gtars-r/tests/testthat/test_regionset.R @@ -82,9 +82,10 @@ test_that("widths returns correct values", { # dummy.bed: chr1:2-6, chr1:4-7, chr1:5-9, chr1:7-12 rs <- RegionSet(file.path(data_dir, "dummy.bed")) w <- widths(rs) - expect_type(w, "integer") + # doubles instead of integers: u32 widths can exceed i32::MAX (2.1 Gbp) + expect_type(w, "double") expect_equal(length(w), 4L) - expect_equal(w[1], 4L) + expect_equal(w[1], 4) }) test_that("neighborDistances returns numeric vector", { @@ -98,7 +99,8 @@ test_that("nearestNeighbors returns numeric vector", { skip_if_no_data() rs <- RegionSet(file.path(data_dir, "dummy.bed")) nn <- nearestNeighbors(rs) - expect_type(nn, "integer") + # doubles instead of integers: distances can be chromosome-scale + expect_type(nn, "double") }) test_that("chromosomeStatistics returns data.frame", { @@ -179,12 +181,54 @@ test_that("disjoin returns non-overlapping pieces", { expect_true(length(rs_dj) >= 1L) }) -test_that("gaps returns inter-region gaps", { +test_that("gaps returns inter-region gaps bounded by chrom_sizes", { skip_if_no_data() - # dummy.bed regions all overlap, so gaps may be empty - rs <- RegionSet(file.path(data_dir, "dummy.bed")) - rs_gaps <- gaps(rs) + # Three non-overlapping peaks → leading, 2 inter-region, trailing = 4 gaps. + df <- data.frame( + chr = "chr1", + start = c(10L, 30L, 50L), + end = c(20L, 40L, 60L) + ) + rs <- RegionSet(df) + chrom_sizes <- c(chr1 = 100L) + rs_gaps <- gaps(rs, chrom_sizes = chrom_sizes) expect_s4_class(rs_gaps, "RegionSet") + expect_equal(length(rs_gaps), 4L) +}) + +test_that("gaps errors when chrom_sizes is missing", { + df <- data.frame(chr = "chr1", start = 10L, end = 20L) + rs <- RegionSet(df) + expect_error(gaps(rs), "chrom_sizes") +}) + +test_that("gaps emits full-chromosome gap for empty chromosomes", { + df <- data.frame(chr = "chr1", start = 10L, end = 20L) + rs <- RegionSet(df) + chrom_sizes <- c(chr1 = 100L, chr2 = 50L) + rs_gaps <- gaps(rs, chrom_sizes = chrom_sizes) + gaps_df <- as.data.frame(rs_gaps) + chr2 <- gaps_df[gaps_df$chr == "chr2", ] + expect_equal(nrow(chr2), 1L) + expect_equal(chr2$start, 0L) + expect_equal(chr2$end, 50L) +}) + +# ========================================================================= +# Spatial-arrangement summary statistics +# ========================================================================= + +test_that("clusterRegions returns cluster IDs", { + df <- data.frame( + chr = "chr1", + start = c(0L, 13L, 100L), + end = c(10L, 20L, 110L) + ) + rs <- RegionSet(df) + ids <- clusterRegions(rs, maxGap = 5L) + expect_length(ids, 3L) + expect_equal(ids[1], ids[2]) # first two are within 5bp → same cluster + expect_false(ids[1] == ids[3]) # third is far away }) # ========================================================================= diff --git a/gtars-refget/src/store/persistence.rs b/gtars-refget/src/store/persistence.rs index a5abd482..0a4115f7 100644 --- a/gtars-refget/src/store/persistence.rs +++ b/gtars-refget/src/store/persistence.rs @@ -188,7 +188,11 @@ impl ReadonlyRefgetStore { "#name\tlength\talphabet\tsha512t24u\tmd5\tdescription" )?; - for result_sr in self.sequence_store.values() { + // Sort by sha512t24u digest for deterministic output (sequence_store is a HashMap). + let mut entries: Vec<&SequenceRecord> = self.sequence_store.values().collect(); + entries.sort_by(|a, b| a.metadata().sha512t24u.cmp(&b.metadata().sha512t24u)); + + for result_sr in entries { let result = result_sr.metadata(); let description = result.description.as_deref().unwrap_or(""); writeln!( diff --git a/gtars-refget/src/store/readonly.rs b/gtars-refget/src/store/readonly.rs index 096ea74e..feb183fb 100644 --- a/gtars-refget/src/store/readonly.rs +++ b/gtars-refget/src/store/readonly.rs @@ -426,11 +426,17 @@ impl ReadonlyRefgetStore { .map(|name_map| { name_map .iter() - .filter_map(|(name, seq_key)| { - let record = self.sequence_store.get(seq_key)?; + .map(|(name, seq_key)| { + let record = self.sequence_store.get(seq_key).ok_or_else(|| { + anyhow!( + "Sequence {} not found in store for collection {}", + key_to_digest_string(seq_key), + collection_digest, + ) + })?; let mut meta = record.metadata().clone(); meta.name = name.clone(); - Some(match record.sequence() { + Ok(match record.sequence() { Some(seq) => SequenceRecord::Full { metadata: meta, sequence: seq.to_vec(), @@ -438,8 +444,9 @@ impl ReadonlyRefgetStore { None => SequenceRecord::Stub(meta), }) }) - .collect() + .collect::>>() }) + .transpose()? .unwrap_or_default(); Ok(crate::digest::SequenceCollection { @@ -539,11 +546,23 @@ impl ReadonlyRefgetStore { /// Import a single collection (with all its sequences, aliases, and FHR /// metadata) from another store into this store. /// + /// Both stores must be disk-backed with matching storage modes. /// The source store must have the collection loaded (call /// `load_collection()` or `load_all_collections()` first). pub fn import_collection(&mut self, source: &ReadonlyRefgetStore, digest: &str) -> Result<()> { - let collection = source.get_collection(digest)?; - self.add_sequence_collection(collection)?; + // Both stores must be disk-backed with same mode + if source.local_path.is_none() || self.local_path.is_none() || !self.persist_to_disk { + return Err(anyhow!("import_collection requires both stores to be disk-backed")); + } + if source.mode != self.mode { + return Err(anyhow!( + "import_collection requires matching storage modes (source={:?}, dest={:?})", + source.mode, + self.mode, + )); + } + + self.import_collection_file_copy(source, digest)?; // Copy sequence aliases for every sequence in the imported collection let coll_key = digest.to_key(); @@ -569,6 +588,126 @@ impl ReadonlyRefgetStore { Ok(()) } + /// File-copy based import: copies RGSI and .seq files directly from + /// source to destination, then registers the collection in memory. + fn import_collection_file_copy( + &mut self, + source: &ReadonlyRefgetStore, + digest: &str, + ) -> Result<()> { + use crate::collection::read_rgsi_file; + + // 1. Read the source RGSI file to get collection metadata + let src_rgsi_path = source + .collection_file_path(digest) + .ok_or_else(|| anyhow!("Source store has no local path for collection {}", digest))?; + let collection = read_rgsi_file(&src_rgsi_path) + .with_context(|| format!("Failed to read source RGSI file: {}", src_rgsi_path.display()))?; + + let mut metadata = collection.metadata.clone(); + + // 2. Handle ancillary digests and copy/write the RGSI file + let dst_rgsi_path = self + .collection_file_path(digest) + .ok_or_else(|| anyhow!("Dest store has no local path for collection {}", digest))?; + if let Some(parent) = dst_rgsi_path.parent() { + create_dir_all(parent)?; + } + + let needs_ancillary_rewrite = self.ancillary_digests + && metadata.name_length_pairs_digest.is_none(); + + if needs_ancillary_rewrite { + // Source lacks ancillary digests but destination wants them -- + // compute them and write a new RGSI file. + metadata.compute_ancillary_digests(&collection.sequences); + use crate::collection::SequenceCollectionRecordExt; + let record = SequenceCollectionRecord::Full { + metadata: metadata.clone(), + sequences: collection.sequences.iter() + .map(|s| SequenceRecord::Stub(s.metadata().clone())) + .collect(), + }; + record.write_collection_rgsi(&dst_rgsi_path)?; + } else { + // Byte-for-byte copy of the RGSI file + fs::copy(&src_rgsi_path, &dst_rgsi_path) + .with_context(|| format!( + "Failed to copy RGSI {} -> {}", + src_rgsi_path.display(), + dst_rgsi_path.display(), + ))?; + } + + // 3. Copy sequence data files + for seq_record in &collection.sequences { + let seq_meta = seq_record.metadata(); + let seq_digest = &seq_meta.sha512t24u; + + let src_seq_path = source + .sequence_file_path(seq_digest) + .ok_or_else(|| anyhow!("Source has no path for sequence {}", seq_digest))?; + let dst_seq_path = self + .sequence_file_path(seq_digest) + .ok_or_else(|| anyhow!("Dest has no path for sequence {}", seq_digest))?; + + // Skip if destination already has this sequence (dedup across collections) + if dst_seq_path.exists() { + // Still need to register in memory below + } else { + if let Some(parent) = dst_seq_path.parent() { + create_dir_all(parent)?; + } + fs::copy(&src_seq_path, &dst_seq_path).with_context(|| { + format!( + "Failed to copy sequence {} -> {}", + src_seq_path.display(), + dst_seq_path.display(), + ) + })?; + } + } + + // 4. Register in memory + let coll_key = digest.to_key(); + + // Build the collection record with stub sequences + let stub_sequences: Vec = collection + .sequences + .iter() + .map(|s| SequenceRecord::Stub(s.metadata().clone())) + .collect(); + + let record = SequenceCollectionRecord::Full { + metadata: metadata.clone(), + sequences: stub_sequences, + }; + self.collections.insert(coll_key, record); + + // Register sequences and populate name_lookup + let mut name_map = IndexMap::new(); + for seq_record in &collection.sequences { + let seq_meta = seq_record.metadata(); + let seq_key = seq_meta.sha512t24u.to_key(); + + name_map.insert(seq_meta.name.clone(), seq_key); + + // Insert stub into sequence_store (skip if already present -- dedup) + if !self.sequence_store.contains_key(&seq_key) { + self.sequence_store + .insert(seq_key, SequenceRecord::Stub(seq_meta.clone())); + self.md5_lookup + .insert(seq_meta.md5.to_key(), seq_key); + } + } + self.name_lookup.insert(coll_key, name_map); + + // 5. Update index files + self.write_index_files()?; + + Ok(()) + } + // ========================================================================= // Sequence API // ========================================================================= @@ -825,6 +964,23 @@ impl ReadonlyRefgetStore { PathBuf::from(path_str) } + /// Return the full filesystem path to a sequence `.seq` file for the given digest. + /// + /// Returns `None` if the store has no local path or no seqdata path template. + pub fn sequence_file_path(&self, seq_digest: &str) -> Option { + let local_path = self.local_path.as_ref()?; + let template = self.seqdata_path_template.as_ref()?; + Some(local_path.join(Self::expand_template(seq_digest, template))) + } + + /// Return the full filesystem path to a collection RGSI file for the given digest. + /// + /// Returns `None` if the store has no local path. + pub fn collection_file_path(&self, coll_digest: &str) -> Option { + let local_path = self.local_path.as_ref()?; + Some(local_path.join(format!("collections/{}.rgsi", coll_digest))) + } + /// Validate a relative path to prevent directory traversal attacks. pub(crate) fn sanitize_relative_path(path: &str) -> Result<()> { if path.starts_with('/') || path.starts_with('\\') { diff --git a/gtars-refget/src/store/tests.rs b/gtars-refget/src/store/tests.rs index 6015b96f..b9bb80f4 100644 --- a/gtars-refget/src/store/tests.rs +++ b/gtars-refget/src/store/tests.rs @@ -1526,6 +1526,92 @@ fn test_collection_order_preserved_after_roundtrip() { ); } +/// Test that multiple collections sharing sequences under different names and different orderings +/// all preserve their correct per-collection names and element orderings across a disk roundtrip. +/// +/// This covers the intersection of two previously-fixed bugs: +/// 1. HashMap ordering (fixed: inner map now IndexMap) +/// 2. Global name leakage (fixed: get_collection() overrides meta.name from name_lookup) +#[test] +fn test_shared_sequences_order_preserved_after_disk_roundtrip() { + // FASTA A: base ordering — chrX first, then chr1, then chr2 + let fasta_a = ">chrX\nTTGGGGAA\n>chr1\nGGAA\n>chr2\nGCGC\n"; + // FASTA B: different order — chr1 first, same sequences as A + let fasta_b = ">chr1\nGGAA\n>chr2\nGCGC\n>chrX\nTTGGGGAA\n"; + // FASTA C: name swap — chr2 has GGAA, chr1 has GCGC (opposite of A/B) + let fasta_c = ">chrX\nTTGGGGAA\n>chr2\nGGAA\n>chr1\nGCGC\n"; + + let dir = tempdir().unwrap(); + let fasta_a_path = dir.path().join("a.fa"); + let fasta_b_path = dir.path().join("b.fa"); + let fasta_c_path = dir.path().join("c.fa"); + fs::write(&fasta_a_path, fasta_a).unwrap(); + fs::write(&fasta_b_path, fasta_b).unwrap(); + fs::write(&fasta_c_path, fasta_c).unwrap(); + + let store_path = dir.path().join("store"); + let mut store = RefgetStore::on_disk(&store_path).unwrap(); + store.set_quiet(true); + + let (meta_a, _) = store.add_sequence_collection_from_fasta(&fasta_a_path, FastaImportOptions::new()).unwrap(); + let (meta_b, _) = store.add_sequence_collection_from_fasta(&fasta_b_path, FastaImportOptions::new()).unwrap(); + let (meta_c, _) = store.add_sequence_collection_from_fasta(&fasta_c_path, FastaImportOptions::new()).unwrap(); + + let digest_a = meta_a.digest.clone(); + let digest_b = meta_b.digest.clone(); + let digest_c = meta_c.digest.clone(); + + // Load collections before write and record level2 output + store.load_all_collections().unwrap(); + let pre_a = store.get_collection_level2(&digest_a).unwrap(); + let pre_b = store.get_collection_level2(&digest_b).unwrap(); + let pre_c = store.get_collection_level2(&digest_c).unwrap(); + + // Verify pre-write ordering for FASTA A: chrX, chr1, chr2 + assert_eq!(pre_a.names, vec!["chrX", "chr1", "chr2"], "A: names before roundtrip"); + // Verify pre-write ordering for FASTA B: chr1, chr2, chrX + assert_eq!(pre_b.names, vec!["chr1", "chr2", "chrX"], "B: names before roundtrip"); + // Verify pre-write ordering for FASTA C: chrX, chr2, chr1 (name swap) + assert_eq!(pre_c.names, vec!["chrX", "chr2", "chr1"], "C: names before roundtrip"); + + store.write().unwrap(); + + // Drop and reopen from disk + drop(store); + let mut reloaded = RefgetStore::open_local(&store_path).unwrap(); + reloaded.load_all_collections().unwrap(); + + let post_a = reloaded.get_collection_level2(&digest_a).unwrap(); + let post_b = reloaded.get_collection_level2(&digest_b).unwrap(); + let post_c = reloaded.get_collection_level2(&digest_c).unwrap(); + + // Names must match exactly (order-sensitive) after roundtrip + assert_eq!(post_a.names, pre_a.names, "A: names after roundtrip"); + assert_eq!(post_b.names, pre_b.names, "B: names after roundtrip"); + assert_eq!(post_c.names, pre_c.names, "C: names after roundtrip"); + + // Lengths must match exactly after roundtrip + assert_eq!(post_a.lengths, pre_a.lengths, "A: lengths after roundtrip"); + assert_eq!(post_b.lengths, pre_b.lengths, "B: lengths after roundtrip"); + assert_eq!(post_c.lengths, pre_c.lengths, "C: lengths after roundtrip"); + + // Sequence digests must match exactly after roundtrip + assert_eq!(post_a.sequences, pre_a.sequences, "A: sequences after roundtrip"); + assert_eq!(post_b.sequences, pre_b.sequences, "B: sequences after roundtrip"); + assert_eq!(post_c.sequences, pre_c.sequences, "C: sequences after roundtrip"); + + // Cross-check: FASTA C has chr2=GGAA and chr1=GCGC (opposite of A's chr1=GGAA, chr2=GCGC) + // The sequence digest for chr2 in C should equal chr1 in A + assert_eq!( + post_c.sequences[1], post_a.sequences[1], + "C.chr2 and A.chr1 share GGAA bytes, should have same sequence digest" + ); + assert_eq!( + post_c.sequences[2], post_a.sequences[2], + "C.chr1 and A.chr2 share GCGC bytes, should have same sequence digest" + ); +} + // ========================================================================= // Name source tests // ========================================================================= @@ -1591,10 +1677,30 @@ fn test_shared_sequence_different_names() { // Import collection tests // ========================================================================= +/// Helper: create a disk-backed store with one collection from a FASTA string. +fn disk_store_with_one_collection(fasta_content: &str) -> (RefgetStore, String, tempfile::TempDir, tempfile::TempDir) { + let store_dir = tempdir().unwrap(); + let fasta_dir = tempdir().unwrap(); + let fasta = fasta_dir.path().join("test.fa"); + fs::write(&fasta, fasta_content).unwrap(); + + let mut store = RefgetStore::on_disk(store_dir.path()).unwrap(); + store.set_quiet(true); + let (meta, _) = store + .add_sequence_collection_from_fasta(&fasta, FastaImportOptions::new()) + .unwrap(); + let digest = meta.digest.clone(); + // Load collection so name_lookup is populated + store.load_all_collections().unwrap(); + (store, digest, store_dir, fasta_dir) +} + #[test] fn test_import_collection_basic() { - let (mut source, digest) = store_with_one_collection(">chr1\nATGC\n>chr2\nGGGG\n"); - let mut target = RefgetStore::in_memory(); + let (mut source, digest, _src_dir, _fasta_dir) = + disk_store_with_one_collection(">chr1\nATGC\n>chr2\nGGGG\n"); + let target_dir = tempdir().unwrap(); + let mut target = RefgetStore::on_disk(target_dir.path()).unwrap(); target.import_collection(&mut source, &digest).unwrap(); @@ -1605,7 +1711,8 @@ fn test_import_collection_basic() { #[test] fn test_import_collection_copies_sequence_aliases() { - let (mut source, digest) = store_with_one_collection(">chr1\nATGC\n>chr2\nGGGG\n"); + let (mut source, digest, _src_dir, _fasta_dir) = + disk_store_with_one_collection(">chr1\nATGC\n>chr2\nGGGG\n"); // Add sequence aliases in source let coll = source.get_collection(&digest).unwrap(); @@ -1615,7 +1722,8 @@ fn test_import_collection_copies_sequence_aliases() { source.add_sequence_alias("ucsc", "chr1", &seq0_digest).unwrap(); source.add_sequence_alias("ncbi", "NC_000002.1", &seq1_digest).unwrap(); - let mut target = RefgetStore::in_memory(); + let target_dir = tempdir().unwrap(); + let mut target = RefgetStore::on_disk(target_dir.path()).unwrap(); target.import_collection(&mut source, &digest).unwrap(); // Target should have the sequence aliases @@ -1631,12 +1739,14 @@ fn test_import_collection_copies_sequence_aliases() { #[test] fn test_import_collection_copies_collection_aliases() { - let (mut source, digest) = store_with_one_collection(">chr1\nATGC\n>chr2\nGGGG\n"); + let (mut source, digest, _src_dir, _fasta_dir) = + disk_store_with_one_collection(">chr1\nATGC\n>chr2\nGGGG\n"); source.add_collection_alias("insdc", "GCA_000001.1", &digest).unwrap(); source.add_collection_alias("refseq", "GCF_000001.1", &digest).unwrap(); - let mut target = RefgetStore::in_memory(); + let target_dir = tempdir().unwrap(); + let mut target = RefgetStore::on_disk(target_dir.path()).unwrap(); target.import_collection(&mut source, &digest).unwrap(); let ns = target.list_collection_alias_namespaces(); @@ -1651,7 +1761,8 @@ fn test_import_collection_copies_collection_aliases() { fn test_import_collection_copies_fhr_metadata() { use super::fhr_metadata::FhrMetadata; - let (mut source, digest) = store_with_one_collection(">chr1\nATGC\n"); + let (mut source, digest, _src_dir, _fasta_dir) = + disk_store_with_one_collection(">chr1\nATGC\n"); let fhr = FhrMetadata { genome: Some("Homo sapiens".to_string()), @@ -1659,7 +1770,8 @@ fn test_import_collection_copies_fhr_metadata() { }; source.set_fhr_metadata(&digest, fhr).unwrap(); - let mut target = RefgetStore::in_memory(); + let target_dir = tempdir().unwrap(); + let mut target = RefgetStore::on_disk(target_dir.path()).unwrap(); target.import_collection(&mut source, &digest).unwrap(); let fhr = target.get_fhr_metadata(&digest); @@ -1682,6 +1794,7 @@ fn test_import_collection_disk_roundtrip_aliases() { let seq0_digest; { let mut source = RefgetStore::on_disk(source_dir.path()).unwrap(); + source.set_quiet(true); let (meta, _) = source .add_sequence_collection_from_fasta(&fasta, FastaImportOptions::new()) .unwrap(); @@ -1720,3 +1833,137 @@ fn test_import_collection_disk_roundtrip_aliases() { let coll_aliases = target.get_aliases_for_collection(&digest); assert_eq!(coll_aliases.len(), 1, "Expected 1 collection alias in target: {:?}", coll_aliases); } + +#[test] +fn test_import_collection_file_copy_roundtrip() { + // Verify RGSI and .seq files are byte-for-byte identical after import + // when ancillary digests match between source and dest. + let (mut source, digest, src_dir, _fasta_dir) = + disk_store_with_one_collection(">chr1\nATGC\n>chr2\nGGGG\n"); + let target_dir = tempdir().unwrap(); + let mut target = RefgetStore::on_disk(target_dir.path()).unwrap(); + + target.import_collection(&mut source, &digest).unwrap(); + + // Verify RGSI file is byte-for-byte identical + let src_rgsi = fs::read( + src_dir.path().join(format!("collections/{}.rgsi", digest)), + ).unwrap(); + let dst_rgsi = fs::read( + target_dir.path().join(format!("collections/{}.rgsi", digest)), + ).unwrap(); + assert_eq!(src_rgsi, dst_rgsi, "RGSI files should be byte-identical"); + + // Verify .seq files are byte-for-byte identical + let coll = target.get_collection(&digest).unwrap(); + for seq in &coll.sequences { + let seq_digest = &seq.metadata().sha512t24u; + let src_seq_path = source.sequence_file_path(seq_digest).unwrap(); + let dst_seq_path = target.sequence_file_path(seq_digest).unwrap(); + let src_data = fs::read(&src_seq_path).unwrap(); + let dst_data = fs::read(&dst_seq_path).unwrap(); + assert_eq!(src_data, dst_data, "Sequence file for {} should be byte-identical", seq_digest); + } + + // Verify in-memory metadata matches + let src_coll = source.get_collection(&digest).unwrap(); + assert_eq!(coll.metadata.digest, src_coll.metadata.digest); + assert_eq!(coll.sequences.len(), src_coll.sequences.len()); + for (src_seq, dst_seq) in src_coll.sequences.iter().zip(coll.sequences.iter()) { + assert_eq!(src_seq.metadata().sha512t24u, dst_seq.metadata().sha512t24u); + assert_eq!(src_seq.metadata().name, dst_seq.metadata().name); + assert_eq!(src_seq.metadata().length, dst_seq.metadata().length); + } +} + +#[test] +fn test_import_collection_ancillary_digest_enrichment() { + // Source store has ancillary_digests: false, destination has ancillary_digests: true. + // The destination RGSI should contain ancillary digest headers that the source lacks. + let source_dir = tempdir().unwrap(); + let fasta_dir = tempdir().unwrap(); + let fasta = fasta_dir.path().join("test.fa"); + fs::write(&fasta, ">chr1\nATGC\n>chr2\nGGGG\n").unwrap(); + + // Create source store with ancillary_digests: false + let mut source = RefgetStore::on_disk(source_dir.path()).unwrap(); + source.set_quiet(true); + source.disable_ancillary_digests(); + let (meta, _) = source + .add_sequence_collection_from_fasta(&fasta, FastaImportOptions::new()) + .unwrap(); + let digest = meta.digest.clone(); + source.load_all_collections().unwrap(); + + // Verify source RGSI lacks ancillary digests + let src_rgsi_content = fs::read_to_string( + source_dir.path().join(format!("collections/{}.rgsi", digest)), + ).unwrap(); + assert!( + !src_rgsi_content.contains("name_length_pairs_digest"), + "Source should NOT have ancillary digests", + ); + + // Create destination store with ancillary_digests: true (default) + let target_dir = tempdir().unwrap(); + let mut target = RefgetStore::on_disk(target_dir.path()).unwrap(); + target.enable_ancillary_digests(); + target.import_collection(&mut source, &digest).unwrap(); + + // Verify destination RGSI has ancillary digest headers + let dst_rgsi_content = fs::read_to_string( + target_dir.path().join(format!("collections/{}.rgsi", digest)), + ).unwrap(); + assert!( + dst_rgsi_content.contains("name_length_pairs_digest"), + "Destination should have ancillary digests. RGSI:\n{}", + dst_rgsi_content, + ); + assert!( + dst_rgsi_content.contains("sorted_name_length_pairs_digest"), + "Destination should have sorted_name_length_pairs_digest", + ); + assert!( + dst_rgsi_content.contains("sorted_sequences_digest"), + "Destination should have sorted_sequences_digest", + ); + + // Verify in-memory metadata has non-None ancillary fields + let coll_meta = target.get_collection_metadata(&digest).unwrap(); + assert!(coll_meta.name_length_pairs_digest.is_some(), "name_length_pairs_digest should be Some"); + assert!(coll_meta.sorted_name_length_pairs_digest.is_some(), "sorted_name_length_pairs_digest should be Some"); + assert!(coll_meta.sorted_sequences_digest.is_some(), "sorted_sequences_digest should be Some"); +} + +#[test] +fn test_import_collection_mode_mismatch_error() { + // Source with Raw mode, destination with Encoded mode should fail. + let source_dir = tempdir().unwrap(); + let fasta_dir = tempdir().unwrap(); + let fasta = fasta_dir.path().join("test.fa"); + fs::write(&fasta, ">chr1\nATGC\n").unwrap(); + + // Create source store in Raw mode + let mut source = RefgetStore::on_disk(source_dir.path()).unwrap(); + source.set_quiet(true); + source.disable_encoding(); + let (meta, _) = source + .add_sequence_collection_from_fasta(&fasta, FastaImportOptions::new()) + .unwrap(); + let digest = meta.digest.clone(); + source.load_all_collections().unwrap(); + + // Create destination store in Encoded mode (default) + let target_dir = tempdir().unwrap(); + let mut target = RefgetStore::on_disk(target_dir.path()).unwrap(); + // target uses Encoded mode by default + + let result = target.import_collection(&mut source, &digest); + assert!(result.is_err(), "Should fail with mode mismatch"); + let err_msg = result.unwrap_err().to_string(); + assert!( + err_msg.contains("matching storage modes"), + "Error should mention storage modes: {}", + err_msg, + ); +} diff --git a/gtars-tokenizers/src/tokenizer.rs b/gtars-tokenizers/src/tokenizer.rs index 5dc9b5a4..fa80f9ae 100644 --- a/gtars-tokenizers/src/tokenizer.rs +++ b/gtars-tokenizers/src/tokenizer.rs @@ -1,5 +1,7 @@ use std::collections::HashMap as StdHashMap; -use std::path::{Path, PathBuf}; +use std::path::Path; +#[cfg(feature = "huggingface")] +use std::path::PathBuf; use fxhash::FxHashMap as HashMap; diff --git a/gtars-wasm/Cargo.toml b/gtars-wasm/Cargo.toml index 16375c40..93ae9b01 100644 --- a/gtars-wasm/Cargo.toml +++ b/gtars-wasm/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "gtars-js" -version = "0.8.0" +version = "0.9.0" authors = ["Nathan LeRoy "] edition = "2024" @@ -15,6 +15,7 @@ wasm-bindgen = "0.2.93" serde = { version = "1.0", features = ["derive"] } serde-wasm-bindgen = "0.4" serde_json = "1.0" +flate2 = { workspace = true } getrandom = { version = "0.2.16", features = ["js"] } # our code @@ -35,6 +36,13 @@ console_error_panic_hook = { version = "0.1.7", optional = true } [dev-dependencies] wasm-bindgen-test = "0.3.34" -[profile.release] -# Tell `rustc` to optimize for small code size. -opt-level = "s" +# NOTE: profile sections in non-root workspace members are silently ignored +# by cargo, so this block has no effect here. Left commented as a marker for +# future work — to actually apply size optimization to the wasm bundle, this +# needs to move to the workspace root Cargo.toml (e.g. as +# `[profile.release.package.gtars-js]` for a per-package override, or as a +# custom `wasm-release` profile inheriting from release). +# +# [profile.release] +# # Tell `rustc` to optimize for small code size. +# opt-level = "s" diff --git a/gtars-wasm/src/bed_stream.rs b/gtars-wasm/src/bed_stream.rs new file mode 100644 index 00000000..d089f81f --- /dev/null +++ b/gtars-wasm/src/bed_stream.rs @@ -0,0 +1,287 @@ +use std::collections::HashMap; +use std::io::{Cursor, Read}; +use std::sync::Mutex; + +use flate2::read::GzDecoder; +use gtars_core::models::{Region, RegionSet}; +use serde::{Deserialize, Serialize}; +use wasm_bindgen::prelude::*; + +use crate::models::BedEntries; + +// ============================================================================ +// BedStreamParser — streaming BED file parser with gzip auto-detection +// ============================================================================ + +struct BedStreamParser { + regions: Vec, + compressed_buf: Vec, + text_buf: String, + is_gzipped: Option, + bytes_processed: usize, +} + +impl BedStreamParser { + fn new() -> Self { + Self { + regions: Vec::new(), + compressed_buf: Vec::new(), + text_buf: String::new(), + is_gzipped: None, + bytes_processed: 0, + } + } + + fn update(&mut self, chunk: &[u8]) -> Result<(), String> { + if chunk.is_empty() { + return Ok(()); + } + + self.bytes_processed += chunk.len(); + + if self.is_gzipped.is_none() { + self.is_gzipped = Some(chunk.len() >= 2 && chunk[0] == 0x1f && chunk[1] == 0x8b); + } + + if self.is_gzipped == Some(true) { + self.compressed_buf.extend_from_slice(chunk); + } else { + let text = String::from_utf8_lossy(chunk); + self.text_buf.push_str(&text); + self.parse_complete_lines(); + } + + Ok(()) + } + + fn finish(mut self) -> Result { + if self.is_gzipped == Some(true) { + let cursor = Cursor::new(&self.compressed_buf); + let mut decoder = GzDecoder::new(cursor); + let mut decompressed = String::new(); + decoder + .read_to_string(&mut decompressed) + .map_err(|e| format!("Gzip decompression failed: {}", e))?; + self.text_buf = decompressed; + self.parse_complete_lines(); + } + + if !self.text_buf.is_empty() { + let remaining = std::mem::take(&mut self.text_buf); + self.parse_line(&remaining); + } + + let mut rs = RegionSet::from(self.regions); + rs.sort(); + Ok(rs) + } + + fn region_count(&self) -> usize { + self.regions.len() + } + + fn bytes_processed(&self) -> usize { + self.bytes_processed + } + + fn parse_complete_lines(&mut self) { + let last_newline = match self.text_buf.rfind('\n') { + Some(pos) => pos, + None => return, + }; + + let complete = self.text_buf[..last_newline].to_string(); + let remainder = self.text_buf[last_newline + 1..].to_string(); + self.text_buf = remainder; + + for line in complete.split('\n') { + self.parse_line(line); + } + } + + fn parse_line(&mut self, line: &str) { + let trimmed = line.trim(); + if trimmed.is_empty() + || trimmed.starts_with('#') + || trimmed.starts_with("browser") + || trimmed.starts_with("track") + { + return; + } + + let parts: Vec<&str> = trimmed.split('\t').collect(); + if parts.len() < 3 { + eprintln!( + "BED parse warning: expected at least 3 tab-separated fields, got {} in line: {:?}", + parts.len(), + trimmed + ); + return; + } + + let start = match parts[1].parse::() { + Ok(v) => v, + Err(e) => { + eprintln!( + "BED parse warning: invalid start coordinate {:?} in line {:?}: {}", + parts[1], + trimmed, + e + ); + return; + } + }; + let end = match parts[2].parse::() { + Ok(v) => v, + Err(e) => { + eprintln!( + "BED parse warning: invalid end coordinate {:?} in line {:?}: {}", + parts[2], + trimmed, + e + ); + return; + } + }; + + let rest = if parts.len() > 3 { + Some(parts[3..].join("\t")) + } else { + None + }; + + self.regions.push(Region { + chr: parts[0].to_string(), + start, + end, + rest: rest.filter(|s| !s.is_empty()), + }); + } +} + +// ============================================================================ +// Global storage for streaming BED parser instances +// ============================================================================ + +static BED_PARSER_STORAGE: Mutex> = Mutex::new(None); + +struct BedParserStorage { + parsers: HashMap, + next_id: u32, +} + +impl BedParserStorage { + fn new() -> Self { + Self { + parsers: HashMap::new(), + next_id: 1, + } + } + + fn insert(&mut self, parser: BedStreamParser) -> u32 { + let mut id = self.next_id; + while self.parsers.contains_key(&id) || id == 0 { + id = id.wrapping_add(1); + if id == 0 { + id = 1; + } + } + self.next_id = id.wrapping_add(1); + if self.next_id == 0 { + self.next_id = 1; + } + self.parsers.insert(id, parser); + id + } + + fn get_mut(&mut self, id: u32) -> Option<&mut BedStreamParser> { + self.parsers.get_mut(&id) + } + + fn remove(&mut self, id: u32) -> Option { + self.parsers.remove(&id) + } +} + +fn with_bed_storage(f: F) -> R +where + F: FnOnce(&mut BedParserStorage) -> R, +{ + let mut guard = BED_PARSER_STORAGE + .lock() + .expect("BED_PARSER_STORAGE mutex poisoned"); + if guard.is_none() { + *guard = Some(BedParserStorage::new()); + } + f(guard.as_mut().unwrap()) +} + +// ============================================================================ +// WASM bindings +// ============================================================================ + +#[wasm_bindgen(js_name = "bedParserNew")] +pub fn bed_parser_new() -> u32 { + with_bed_storage(|storage| storage.insert(BedStreamParser::new())) +} + +#[wasm_bindgen(js_name = "bedParserUpdate")] +pub fn bed_parser_update(handle: u32, chunk: &[u8]) -> Result<(), JsError> { + with_bed_storage(|storage| { + if let Some(parser) = storage.get_mut(handle) { + parser + .update(chunk) + .map_err(|e| JsError::new(&format!("Failed to process chunk: {}", e))) + } else { + Err(JsError::new("Invalid parser handle")) + } + }) +} + +/// Finalize parsing and return BedEntries (array of [chr, start, end, rest] tuples). +/// The result can be passed directly to `new RegionSet(entries)`. +#[wasm_bindgen(js_name = "bedParserFinish")] +pub fn bed_parser_finish(handle: u32) -> Result { + let parser = with_bed_storage(|storage| storage.remove(handle)) + .ok_or_else(|| JsError::new("Invalid parser handle"))?; + + let region_set = parser + .finish() + .map_err(|e| JsError::new(&format!("Failed to finalize parser: {}", e)))?; + + let entries: Vec<(String, u32, u32, String)> = region_set + .regions + .into_iter() + .map(|r| (r.chr, r.start, r.end, r.rest.unwrap_or_default())) + .collect(); + + serde_wasm_bindgen::to_value(&BedEntries(entries)) + .map_err(|e| JsError::new(&format!("Serialization error: {}", e))) +} + +#[wasm_bindgen(js_name = "bedParserFree")] +pub fn bed_parser_free(handle: u32) -> bool { + with_bed_storage(|storage| storage.remove(handle).is_some()) +} + +#[wasm_bindgen(js_name = "bedParserProgress")] +pub fn bed_parser_progress(handle: u32) -> Result { + let progress = with_bed_storage(|storage| { + storage.get_mut(handle).map(|parser| BedParserProgress { + region_count: parser.region_count(), + bytes_processed: parser.bytes_processed(), + }) + }); + + match progress { + Some(p) => serde_wasm_bindgen::to_value(&p) + .map_err(|e| JsError::new(&format!("Serialization error: {}", e))), + None => Err(JsError::new("Invalid parser handle")), + } +} + +#[derive(Serialize, Deserialize)] +pub struct BedParserProgress { + pub region_count: usize, + pub bytes_processed: usize, +} diff --git a/gtars-wasm/src/lib.rs b/gtars-wasm/src/lib.rs index 10ccca90..eed4de7c 100644 --- a/gtars-wasm/src/lib.rs +++ b/gtars-wasm/src/lib.rs @@ -1,3 +1,4 @@ +mod bed_stream; mod asset; mod lola; mod models; @@ -10,17 +11,6 @@ mod tokenizers; mod tss; mod utils; -use wasm_bindgen::prelude::*; - -// Re-export refget functions at the top level +// Re-export functions at the top level +pub use bed_stream::*; pub use refget::*; - -#[wasm_bindgen] -pub fn greet(name: &str) { - alert(&format!("Hello, {}!", name)); -} - -#[wasm_bindgen] -extern "C" { - fn alert(s: &str); -} diff --git a/gtars-wasm/src/lola.rs b/gtars-wasm/src/lola.rs index 0b1bbdf1..232ff94c 100644 --- a/gtars-wasm/src/lola.rs +++ b/gtars-wasm/src/lola.rs @@ -257,67 +257,63 @@ struct UniverseCheckResult { warnings: Vec, } -#[derive(serde::Serialize)] -#[serde(rename_all = "camelCase")] -struct LolaResults { - user_set: Vec, - db_set: Vec, - collection: Vec>, - p_value_log: Vec, - odds_ratio: Vec, - support: Vec, - rnk_pv: Vec, - rnk_or: Vec, - rnk_sup: Vec, - max_rnk: Vec, - mean_rnk: Vec, - b: Vec, - c: Vec, - d: Vec, - description: Vec>, - cell_type: Vec>, - tissue: Vec>, - antibody: Vec>, - treatment: Vec>, - data_source: Vec>, - filename: Vec, - q_value: Vec>, - size: Vec, -} - -fn empty_to_none(s: &str) -> Option { - if s.is_empty() { - None - } else { - Some(s.to_string()) +fn results_to_js(results: &[LolaResult]) -> Result { + use gtars_lola::output::results_to_columns; + + let c = results_to_columns(results); + + #[derive(serde::Serialize)] + #[serde(rename_all = "camelCase")] + struct Out { + user_set: Vec, + db_set: Vec, + collection: Vec>, + p_value_log: Vec, + odds_ratio: Vec, + support: Vec, + rnk_pv: Vec, + rnk_or: Vec, + rnk_sup: Vec, + max_rnk: Vec, + mean_rnk: Vec, + b: Vec, + c: Vec, + d: Vec, + description: Vec>, + cell_type: Vec>, + tissue: Vec>, + antibody: Vec>, + treatment: Vec>, + data_source: Vec>, + filename: Vec, + q_value: Vec>, + size: Vec, } -} -fn results_to_js(results: &[LolaResult]) -> Result { - let out = LolaResults { - user_set: results.iter().map(|r| r.user_set).collect(), - db_set: results.iter().map(|r| r.db_set).collect(), - collection: results.iter().map(|r| empty_to_none(&r.collection)).collect(), - p_value_log: results.iter().map(|r| r.p_value_log).collect(), - odds_ratio: results.iter().map(|r| r.odds_ratio).collect(), - support: results.iter().map(|r| r.support).collect(), - rnk_pv: results.iter().map(|r| r.rnk_pv).collect(), - rnk_or: results.iter().map(|r| r.rnk_or).collect(), - rnk_sup: results.iter().map(|r| r.rnk_sup).collect(), - max_rnk: results.iter().map(|r| r.max_rnk).collect(), - mean_rnk: results.iter().map(|r| r.mean_rnk).collect(), - b: results.iter().map(|r| r.b).collect(), - c: results.iter().map(|r| r.c).collect(), - d: results.iter().map(|r| r.d).collect(), - description: results.iter().map(|r| empty_to_none(&r.description)).collect(), - cell_type: results.iter().map(|r| empty_to_none(&r.cell_type)).collect(), - tissue: results.iter().map(|r| empty_to_none(&r.tissue)).collect(), - antibody: results.iter().map(|r| empty_to_none(&r.antibody)).collect(), - treatment: results.iter().map(|r| empty_to_none(&r.treatment)).collect(), - data_source: results.iter().map(|r| empty_to_none(&r.data_source)).collect(), - filename: results.iter().map(|r| r.filename.clone()).collect(), - q_value: results.iter().map(|r| r.q_value).collect(), - size: results.iter().map(|r| r.db_set_size).collect(), + let out = Out { + user_set: c.user_set, + db_set: c.db_set, + collection: c.collection, + p_value_log: c.p_value_log, + odds_ratio: c.odds_ratio, + support: c.support, + rnk_pv: c.rnk_pv, + rnk_or: c.rnk_or, + rnk_sup: c.rnk_sup, + max_rnk: c.max_rnk, + mean_rnk: c.mean_rnk, + b: c.b, + c: c.c, + d: c.d, + description: c.description, + cell_type: c.cell_type, + tissue: c.tissue, + antibody: c.antibody, + treatment: c.treatment, + data_source: c.data_source, + filename: c.filename, + q_value: c.q_value, + size: c.db_set_size, }; serde_wasm_bindgen::to_value(&out).map_err(|e| JsValue::from_str(&format!("{}", e))) diff --git a/gtars-wasm/src/regionset.rs b/gtars-wasm/src/regionset.rs index 32e3ab02..d344e00a 100644 --- a/gtars-wasm/src/regionset.rs +++ b/gtars-wasm/src/regionset.rs @@ -4,11 +4,16 @@ use crate::models::BedEntries; use gtars_core::models::{Region, RegionSet, RegionSetList}; use gtars_genomicdist::bed_classifier::classify_bed; use gtars_genomicdist::consensus; -use gtars_genomicdist::interval_ranges::IntervalRanges; +use gtars_genomicdist::interval_ranges::{IntervalRanges, RegionSetListOps}; use gtars_genomicdist::models::RegionBin; use gtars_genomicdist::statistics::GenomicIntervalSetStatistics; use wasm_bindgen::prelude::*; +#[inline] +fn to_js(value: &T) -> Result { + serde_wasm_bindgen::to_value(value).map_err(|e| e.into()) +} + #[wasm_bindgen(js_name = "ChromosomeStatistics")] #[derive(serde::Serialize)] pub struct JsChromosomeStatistics { @@ -176,10 +181,31 @@ impl JsRegionSet { serde_wasm_bindgen::to_value(&result).map_err(|e| e.into()) } + /// Region distribution across genomic bins. + /// + /// When `chrom_sizes` is provided (JS object: `{chr: length, ...}`), per-chromosome + /// bin sizes are derived from the reference genome (bin_size = chrom_size / n_bins + /// per chrom). Outputs are comparable across BED files and aligned with reference + /// genome positions. Regions on chroms not in `chrom_sizes`, or whose midpoint + /// falls beyond the stated chromosome size, are silently skipped. + /// + /// When `chrom_sizes` is null/undefined, bin size is derived from the BED file's + /// observed max end coordinate — outputs will NOT be comparable across files. #[wasm_bindgen(js_name = "regionDistribution")] - pub fn region_distribution(&self, n_bins: u32) -> Result { + pub fn region_distribution( + &self, + n_bins: u32, + chrom_sizes: JsValue, + ) -> Result { let distribution: HashMap = - self.region_set.region_distribution_with_bins(n_bins); + if chrom_sizes.is_null() || chrom_sizes.is_undefined() { + self.region_set.region_distribution_with_bins(n_bins) + } else { + let cs: HashMap = serde_wasm_bindgen::from_value(chrom_sizes) + .map_err(|e| JsValue::from_str(&format!("chrom_sizes: {}", e)))?; + self.region_set + .region_distribution_with_chrom_sizes(n_bins, &cs) + }; let mut result_vector: Vec = vec![]; @@ -234,6 +260,18 @@ impl JsRegionSet { serde_wasm_bindgen::to_value(&nearest).map_err(|e| e.into()) } + /// Single-linkage cluster IDs for each region. + /// + /// Two regions on the same chromosome are assigned the same cluster + /// when the bp gap between them is at most `max_gap`. Chromosome + /// boundaries always break clusters. Returns a `Uint32Array`-compatible + /// JS array in the original region order. + #[wasm_bindgen(js_name = "cluster")] + pub fn cluster(&self, max_gap: u32) -> Result { + let ids = self.region_set.cluster(max_gap); + to_js(&ids) + } + // ── Interval range methods ────────────────────────────────────── #[wasm_bindgen(js_name = "trim")] @@ -320,9 +358,11 @@ impl JsRegionSet { } #[wasm_bindgen(js_name = "gaps")] - pub fn gaps(&self) -> JsRegionSet { - let result = self.region_set.gaps(); - JsRegionSet { region_set: result } + pub fn gaps(&self, chrom_sizes: &JsValue) -> Result { + let sizes: HashMap = serde_wasm_bindgen::from_value(chrom_sizes.clone()) + .map_err(|e| JsValue::from_str(&format!("chrom_sizes: {}", e)))?; + let result = self.region_set.gaps(&sizes); + Ok(JsRegionSet { region_set: result }) } #[wasm_bindgen(js_name = "intersect")] @@ -345,6 +385,25 @@ pub struct JsRegionSetList { #[wasm_bindgen(js_class = "RegionSetList")] impl JsRegionSetList { + /// Create an empty RegionSetList. Use `add()` to populate it. + #[wasm_bindgen(constructor)] + pub fn new() -> Self { + JsRegionSetList { + inner: RegionSetList::new(Vec::new()), + } + } + + /// Add a RegionSet to this list, with an optional name. + pub fn add(&mut self, set: &JsRegionSet, name: Option) { + let idx = self.inner.region_sets.len(); + self.inner.region_sets.push(set.region_set.clone()); + let name_val = name.unwrap_or_else(|| format!("set_{}", idx)); + self.inner + .names + .get_or_insert_with(Vec::new) + .push(name_val); + } + /// Number of region sets in this list. #[wasm_bindgen(getter)] pub fn length(&self) -> usize { @@ -383,6 +442,139 @@ impl JsRegionSetList { } } + /// Build a RegionSetList directly from arrays of BED entries. + /// + /// @param entries - Array of arrays: [[[chr, start, end, rest], ...], ...] + /// @param names - Optional array of names, one per set + #[wasm_bindgen(js_name = "fromEntries")] + pub fn from_entries(entries: &JsValue, names: &JsValue) -> Result { + let all_entries: Vec = + serde_wasm_bindgen::from_value(entries.clone())?; + let names_vec: Option> = if names.is_null() || names.is_undefined() { + None + } else { + Some(serde_wasm_bindgen::from_value(names.clone())?) + }; + + let mut sets = Vec::with_capacity(all_entries.len()); + for bed_entries in all_entries { + let regions: Vec = bed_entries + .0 + .into_iter() + .map(|be| Region { + chr: be.0, + start: be.1, + end: be.2, + rest: Some(be.3), + }) + .collect(); + let mut rs = RegionSet::from(regions); + rs.sort(); + sets.push(rs); + } + + Ok(JsRegionSetList { + inner: RegionSetList { + region_sets: sets, + names: names_vec, + path: None, + }, + }) + } + + /// Number of regions in the set at the given index. + #[wasm_bindgen(js_name = "regionCount")] + pub fn region_count(&self, index: usize) -> Result { + self.inner.region_count(index) + .ok_or_else(|| JsValue::from_str("Index out of range")) + } + + /// Number of overlapping regions between two sets by index. + #[wasm_bindgen(js_name = "pintersectCount")] + pub fn pintersect_count(&self, i: usize, j: usize) -> Result { + self.inner.pintersect_count(i, j) + .ok_or_else(|| JsValue::from_str("Index out of range")) + } + + /// Jaccard similarity between two sets by index. + #[wasm_bindgen(js_name = "jaccardAt")] + pub fn jaccard_at(&self, i: usize, j: usize) -> Result { + self.inner.jaccard_at(i, j) + .ok_or_else(|| JsValue::from_str("Index out of range")) + } + + /// Union of two sets by index. + #[wasm_bindgen(js_name = "unionAt")] + pub fn union_at(&self, i: usize, j: usize) -> Result { + self.inner.union_at(i, j) + .map(|rs| JsRegionSet { region_set: rs }) + .ok_or_else(|| JsValue::from_str("Index out of range")) + } + + /// Setdiff of two sets by index (set[i] minus set[j]). + #[wasm_bindgen(js_name = "setdiffAt")] + pub fn setdiff_at(&self, i: usize, j: usize) -> Result { + self.inner.setdiff_at(i, j) + .map(|rs| JsRegionSet { region_set: rs }) + .ok_or_else(|| JsValue::from_str("Index out of range")) + } + + /// Union of all sets except the one at the given index. + #[wasm_bindgen(js_name = "unionExcept")] + pub fn union_except(&self, skip: usize) -> Result { + self.inner.union_except(skip) + .map(|rs| JsRegionSet { region_set: rs }) + .ok_or_else(|| JsValue::from_str("Index out of range or list too small")) + } + + /// Compute all N union-except results in O(n) via prefix/suffix. + /// Returns { union: RegionSet, excepts: RegionSet[] }. + #[wasm_bindgen(js_name = "bulkUnionExcept")] + pub fn bulk_union_except(&self) -> Result { + let (full_union, excepts) = self.inner.bulk_union_except() + .ok_or_else(|| JsValue::from_str("Need at least 2 sets"))?; + + #[derive(serde::Serialize)] + struct BulkResult { + union_regions: u32, + union_nucleotides: u32, + except_unique: Vec, + } + + // For each file, compute setdiff(file_i, union_except_i).len() + let mut except_unique = Vec::with_capacity(excepts.len()); + for (i, ue) in excepts.iter().enumerate() { + if let Some(rs) = self.inner.get(i) { + except_unique.push(rs.setdiff(ue).len() as u32); + } else { + except_unique.push(0); + } + } + + let result = BulkResult { + union_regions: full_union.len() as u32, + union_nucleotides: full_union.nucleotides_length() as u32, + except_unique, + }; + serde_wasm_bindgen::to_value(&result).map_err(|e| e.into()) + } + + /// Union of all sets. + #[wasm_bindgen(js_name = "unionAll")] + pub fn union_all(&self) -> Result { + self.inner.union_all() + .map(|rs| JsRegionSet { region_set: rs }) + .ok_or_else(|| JsValue::from_str("Empty list")) + } + + /// Intersection of all sets. + #[wasm_bindgen(js_name = "intersectAll")] + pub fn intersect_all(&self) -> Result { + self.inner.intersect_all() + .map(|rs| JsRegionSet { region_set: rs }) + .ok_or_else(|| JsValue::from_str("Empty list")) + } + /// Compute pairwise Jaccard similarity for all pairs of region sets. /// /// Returns { matrix: number[][], names: string[] | null }.