Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
fef314e
Added streaming of bed file
khoroshevskyi Feb 11, 2026
acbdf7c
make importing from collections us copy instead of memory
nsheff Mar 18, 2026
eb7ddc6
Merge branch 'master' into dev
nsheff Mar 19, 2026
0e32da1
Add RegionSetList constructor and add() method to wasm bindings
sanghoonio Mar 19, 2026
446f607
Bump gtars-js wasm version to 0.8.1
sanghoonio Mar 19, 2026
69b3881
Add indexed operations on RegionSetList and fromEntries constructor
sanghoonio Mar 19, 2026
eb8ef5a
Factor LOLA result conversion into core, remove dead code
sanghoonio Mar 19, 2026
db0fecf
Add bulk union/intersect operations on RegionSetList
sanghoonio Mar 19, 2026
fa48fa0
Fix intersect_all to use range-level intersect, add tests for RegionS…
sanghoonio Mar 19, 2026
99a2b51
make importing from collections us copy instead of memory
nsheff Mar 18, 2026
c0e60cc
Merge branch 'dev' of github.com:databio/gtars into dev
nsheff Mar 20, 2026
87bb1de
Fix names misalignment in add(), validate union_except bounds, optimi…
sanghoonio Mar 21, 2026
c700590
Restore empty_to_na helper removed during LOLA refactor
sanghoonio Mar 21, 2026
8c7f19c
Use Option<String> for annotation fields in RegionSetAnno and LolaResult
sanghoonio Mar 21, 2026
a1a6349
Merge pull request #247 from databio/bindings-updates
sanghoonio Mar 21, 2026
fafdc04
Bump versions: genomicdist 0.7.0, lola 0.2.0, python 0.8.1, R 0.8.1
sanghoonio Mar 21, 2026
bc110d3
Allow WASM npm publish to run independently of crates.io publish
sanghoonio Mar 21, 2026
35e1dfd
Update gtars-wasm/src/bed_stream.rs
khoroshevskyi Mar 23, 2026
a032ba0
Merge branch 'dev' into streaming_bed
khoroshevskyi Mar 23, 2026
e6f39c3
Merge pull request #235 from databio/streaming_bed
khoroshevskyi Mar 23, 2026
3ff780b
Wire region_distribution_with_chrom_sizes across CLI/Python/WASM
sanghoonio Apr 5, 2026
bd1e5b7
Align bindings and add ignore_unk_chroms to calc_dinucl_freq
sanghoonio Apr 6, 2026
c65e21a
CLI UX: enable all subcommands by default, fix overlaprs help
sanghoonio Apr 6, 2026
388e68d
Add .fab binary FASTA format with zero-copy mmap access
sanghoonio Apr 6, 2026
66d23ac
Merge pull request #252 from databio/binary-fasta-and-binding-alignment
sanghoonio Apr 6, 2026
2dab451
Fix region distribution binning to use uniform bin width across chrom…
sanghoonio Apr 7, 2026
db4482c
Merge pull request #253 from databio/binary-fasta-and-binding-alignment
sanghoonio Apr 9, 2026
679a939
Add spatial-arrangement stats and fix gaps() chrom_sizes signature
sanghoonio Apr 11, 2026
f533827
Clarify density function docs for n_bins and per-chromosome bin width
sanghoonio Apr 13, 2026
90514a0
Add min_cluster_size to peak_clusters and apply filter uniformly
sanghoonio Apr 13, 2026
601f111
Merge pull request #254 from databio/feat/genomicdist-spatial-stats
sanghoonio Apr 14, 2026
4d52779
Bump versions: genomicdist 0.8.0, cli 0.9.0, python 0.9.0, wasm 0.9.0…
sanghoonio Apr 14, 2026
39c5aa0
Fix pyo3 0.27 deprecations in refget bindings
sanghoonio Apr 14, 2026
c9e184e
Clean up gtars-wasm warnings
sanghoonio Apr 14, 2026
ddfee6d
Gate PathBuf import in gtars-tokenizers behind huggingface feature
sanghoonio Apr 14, 2026
e7e29b8
Remove spatial-arrangement stats from genomicdist
sanghoonio Apr 19, 2026
d6840e3
Merge pull request #257 from databio/cleanup/remove-spatial-stats
sanghoonio Apr 19, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions .github/workflows/rust-publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -157,8 +157,7 @@ jobs:
secrets: inherit

publish-wasm:
if: ${{ always() && inputs.wasm != false && needs.publish-all-crates.result == 'success' }}
needs: publish-all-crates
if: ${{ always() && inputs.wasm != false }}
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
Expand Down
6 changes: 3 additions & 3 deletions gtars-cli/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "gtars-cli"
version = "0.8.0"
version = "0.9.0"
edition = "2024"
description = "Performance critical tools for genomic interval analysis. This is the CLI"
homepage = "https://github.com/databio/gtars"
Expand All @@ -25,7 +25,7 @@ gtars-igd = { path = "../gtars-igd", optional=true, version="0.5.1" }
gtars-uniwig = { path = "../gtars-uniwig", optional=true, version="0.8.0" }
gtars-overlaprs = { path = "../gtars-overlaprs", optional = true, version="0.5.1" }
gtars-bbcache = { path = "../gtars-bbcache", optional=true, version="0.5.3" }
gtars-genomicdist = { path = "../gtars-genomicdist", optional=true, version="0.6.0" }
gtars-genomicdist = { path = "../gtars-genomicdist", optional=true, version="0.8.0" }
gtars-core = { path = "../gtars-core", version="0.5.5", features=["bigbed", "http"] }

# serialization
Expand All @@ -38,7 +38,7 @@ name = "gtars"
path = "src/main.rs"

[features]
default = []
default = ["scoring", "uniwig", "bbcache", "igd", "fragsplit", "overlaprs", "genomicdist"]
scoring = ["dep:gtars-scoring"]
uniwig = ["dep:gtars-uniwig"]
bbcache = ["dep:gtars-bbcache"]
Expand Down
28 changes: 26 additions & 2 deletions gtars-cli/src/genomicdist/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ pub fn create_genomicdist_cli() -> Command {
Arg::new("chrom-sizes")
.long("chrom-sizes")
.required(false)
.help("Path to chrom.sizes file (enables expected partitions and promoter trimming)"),
.help("Path to chrom.sizes file. When provided, region distribution uses a per-chromosome bin size derived from the reference genome (stable across files). Also enables expected partitions and promoter trimming."),
)
.arg(
arg!(--output <OUTPUT>)
Expand All @@ -35,14 +35,38 @@ pub fn create_genomicdist_cli() -> Command {
arg!(--bins <BINS>)
.required(false)
.default_value("250")
.help("Number of bins for region distribution"),
.help("Number of bins for the region distribution. Bin width is derived as max_chrom_len/bins; shorter chromosomes get proportionally fewer bins."),
)
.arg(
Arg::new("signal-matrix")
.long("signal-matrix")
.required(false)
.help("Path to open signal matrix TSV (enables cell-type open chromatin enrichment)"),
)
.arg(
Arg::new("fasta")
.long("fasta")
.required(false)
.help("Path to genome FASTA (.fa) or binary FASTA (.fab) file. Enables GC content; also enables dinucleotide frequencies when --dinucl-freq is set. Use .fab format (via gtars prep --fasta) for best performance."),
)
.arg(
Arg::new("ignore-unk-chroms")
.long("ignore-unk-chroms")
.action(clap::ArgAction::SetTrue)
.help("When computing GC content, skip regions on chromosomes not in the FASTA (default: error)"),
)
.arg(
Arg::new("dinucl-freq")
.long("dinucl-freq")
.action(clap::ArgAction::SetTrue)
.help("Compute per-region dinucleotide frequencies (expensive for wide regions; opt-in even when --fasta is provided)"),
)
.arg(
Arg::new("dinucl-raw-counts")
.long("dinucl-raw-counts")
.action(clap::ArgAction::SetTrue)
.help("Return raw per-region dinucleotide counts instead of percentages (matches R GenomicDistributions' rawCounts=TRUE)"),
)
.arg(
Arg::new("promoter-upstream")
.long("promoter-upstream")
Expand Down
105 changes: 103 additions & 2 deletions gtars-cli/src/genomicdist/handlers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,16 @@ use serde::Serialize;

use gtars_core::models::{Region, RegionSet};
use gtars_core::utils::get_chrom_sizes;
use gtars_genomicdist::models::{ChromosomeStatistics, RegionBin, Strand, TssIndex};
use gtars_genomicdist::models::{
BinaryGenomeAssembly, ChromosomeStatistics, GenomeAssembly, RegionBin, SequenceAccess,
Strand, TssIndex,
};
use gtars_genomicdist::statistics::GenomicIntervalSetStatistics;
use gtars_genomicdist::{
GeneModel, GenomicDistAnnotation, ExpectedPartitionResult, PartitionResult,
calc_expected_partitions, calc_partitions, genome_partition_list,
SignalMatrix, calc_summary_signal, ConditionStats,
calc_gc_content, calc_dinucl_freq, DINUCL_ORDER,
median_abs_distance,
};

Expand All @@ -28,6 +32,32 @@ struct GenomicDistOutput {
expected_partitions: Option<ExpectedPartitionResult>,
#[serde(skip_serializing_if = "Option::is_none")]
open_signal: Option<OpenSignalOutput>,
#[serde(skip_serializing_if = "Option::is_none")]
gc_content: Option<GcContentOutput>,
#[serde(skip_serializing_if = "Option::is_none")]
dinucl_freq: Option<DinuclFreqOutput>,
}

/// Serializable GC-content section of the genomicdist output.
///
/// Held in the optional `gc_content` field of `GenomicDistOutput`; that field
/// is omitted from the serialized output when it is `None`, which is the case
/// when no `--fasta` path was supplied.
#[derive(Serialize)]
struct GcContentOutput {
/// Mean GC content across all regions (0–1).
/// The handler reports `0.0` here when there are no per-region values.
mean: f64,
/// Per-region GC content values (0–1), one per region in input order
per_region: Vec<f64>,
}

/// Serializable dinucleotide-frequency section of the genomicdist output.
///
/// Held in the optional `dinucl_freq` field of `GenomicDistOutput`; the
/// handler only builds it when a `--fasta` path is given and the
/// `--dinucl-freq` flag is set, otherwise the field is `None` and is omitted
/// from the serialized output.
#[derive(Serialize)]
struct DinuclFreqOutput {
/// Dinucleotide names in canonical order (matches DINUCL_ORDER)
dinucleotides: Vec<String>,
/// `chr_start_end` label per region
region_labels: Vec<String>,
/// Per-region matrix: outer is regions, inner is 16 values matching
/// `dinucleotides` order. Percentages (0–100) by default, or raw counts
/// if --dinucl-raw-counts flag was passed.
frequencies: Vec<[f64; 16]>,
/// Whether `frequencies` are raw counts (true) or percentages (false)
raw_counts: bool,
}

#[derive(Serialize)]
Expand Down Expand Up @@ -65,6 +95,8 @@ pub fn run_genomicdist(matches: &ArgMatches) -> Result<()> {
let chrom_sizes_path = matches.get_one::<String>("chrom-sizes");
let output_path = matches.get_one::<String>("output");
let signal_matrix_path = matches.get_one::<String>("signal-matrix");
let fasta_path = matches.get_one::<String>("fasta");
let ignore_unk_chroms = matches.get_flag("ignore-unk-chroms");
let n_bins: u32 = matches
.get_one::<String>("bins")
.unwrap()
Expand All @@ -91,7 +123,22 @@ pub fn run_genomicdist(matches: &ArgMatches) -> Result<()> {
// --- Unconditional computations ---
let widths = rs.calc_widths();
let chromosome_stats = rs.chromosome_statistics();
let region_dist_map = rs.region_distribution_with_bins(n_bins);
let region_dist_map = match explicit_chrom_sizes.as_ref() {
Some(cs) => rs.region_distribution_with_chrom_sizes(n_bins, cs),
None => {
eprintln!(
"warning: --chrom-sizes not provided; using BED-file-derived bin width."
);
eprintln!(
" Outputs will NOT be comparable across files or aligned with"
);
eprintln!(
" reference genome positions. Pass --chrom-sizes <file> for"
);
eprintln!(" reference-aligned bins.");
rs.region_distribution_with_bins(n_bins)
}
};
let neighbor_distances = rs
.calc_neighbor_distances()
.map_err(|e| anyhow::anyhow!("Failed to compute neighbor distances: {}", e))?;
Expand Down Expand Up @@ -214,6 +261,58 @@ pub fn run_genomicdist(matches: &ArgMatches) -> Result<()> {
None => None,
};

// --- Optional: GC content + dinucleotide frequencies (require FASTA) ---
// --fasta enables GC content. Dinucleotide frequencies are additionally
// opt-in via --dinucl-freq because they can be expensive for region sets
// with very wide regions (each dinucleotide window of every region is
// inspected; width dominates cost).
let dinucl_raw_counts = matches.get_flag("dinucl-raw-counts");
let compute_dinucl = matches.get_flag("dinucl-freq");
let (gc_content_out, dinucl_freq_out) = match fasta_path {
Some(p) => {
// Auto-detect .fab binary format vs plain FASTA (HashMap fallback)
let assembly: Box<dyn SequenceAccess> = if p.ends_with(".fab") {
Box::new(BinaryGenomeAssembly::try_from(p.as_str())
.map_err(|e| anyhow::anyhow!("Failed to load .fab: {}", e))?)
} else {
Box::new(GenomeAssembly::try_from(p.as_str())
.map_err(|e| anyhow::anyhow!("Failed to load FASTA: {}", e))?)
};

let gc_per_region = calc_gc_content(&rs, assembly.as_ref(), ignore_unk_chroms)
.map_err(|e| anyhow::anyhow!("Failed to compute GC content: {}", e))?;
let gc_mean = if gc_per_region.is_empty() {
0.0
} else {
gc_per_region.iter().sum::<f64>() / gc_per_region.len() as f64
};
let gc_out = GcContentOutput {
mean: gc_mean,
per_region: gc_per_region,
};

let dinucl_out = if compute_dinucl {
let (labels, matrix) = calc_dinucl_freq(
&rs, assembly.as_ref(), dinucl_raw_counts, ignore_unk_chroms,
)
.map_err(|e| anyhow::anyhow!("Failed to compute dinucl freq: {}", e))?;
Some(DinuclFreqOutput {
dinucleotides: DINUCL_ORDER
.iter()
.map(|d| d.to_string().unwrap_or_default())
.collect(),
region_labels: labels,
frequencies: matrix,
raw_counts: dinucl_raw_counts,
})
} else {
None
};
(Some(gc_out), dinucl_out)
}
None => (None, None),
};

// --- Build output ---
let output = GenomicDistOutput {
scalars: Scalars {
Expand All @@ -232,6 +331,8 @@ pub fn run_genomicdist(matches: &ArgMatches) -> Result<()> {
},
expected_partitions,
open_signal,
gc_content: gc_content_out,
dinucl_freq: dinucl_freq_out,
};

let compact = matches.get_flag("compact");
Expand Down
20 changes: 16 additions & 4 deletions gtars-cli/src/overlaprs/cli.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,26 @@
use clap::{Command, arg};
use clap::{Arg, Command, arg};

pub const OVERLAP_CMD: &str = "overlaprs";

/// Builds the clap [`Command`] for the `overlaprs` subcommand (named by
/// `OVERLAP_CMD`).
///
/// The command shows its help text when invoked without any arguments
/// (`arg_required_else_help(true)`). It declares:
/// - `-q` / `--query`: path to the BED file to tokenize (required).
/// - `-u` / `--universe`: path to the universe BED file to tokenize against
///   (required).
/// - `-e` / `--backend`: which backend to use (`ailist` or `bits`).
/// - `--streaming`: streaming mode for very large universes.
pub fn create_overlap_cli() -> Command {
Command::new(OVERLAP_CMD)
.author("NJL")
.about("Tokenize data into a universe")
.about("Tokenize a BED file against a universe of regions (overlap-based encoding).")
.arg_required_else_help(true)
.arg(arg!(-q <query> "The file you are tokenizing"))
.arg(arg!(-u <universe> "The universe you are tokenizing into"))
.arg(
Arg::new("query")
.short('q')
.long("query")
.required(true)
.help("Path to the BED file to tokenize"),
)
.arg(
Arg::new("universe")
.short('u')
.long("universe")
.required(true)
.help("Path to the universe BED file to tokenize against"),
)
.arg(arg!(-e --backend <backend> "Which backend to use (ailist or bits)"))
.arg(arg!(--streaming "Use streaming mode for very large universes (lower memory usage)"))
}
10 changes: 8 additions & 2 deletions gtars-cli/src/prep/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ pub const PREP_CMD: &str = "prep";

pub fn create_prep_cli() -> Command {
Command::new(PREP_CMD)
.about("Pre-serialize GTF gene models or signal matrices to binary for fast loading.")
.about("Pre-serialize GTF gene models, signal matrices, or FASTA files to binary for fast loading.")
.arg(
Arg::new("gtf")
.long("gtf")
Expand All @@ -17,11 +17,17 @@ pub fn create_prep_cli() -> Command {
.required(false)
.help("Path to signal matrix TSV/TSV.gz to serialize"),
)
.arg(
Arg::new("fasta")
.long("fasta")
.required(false)
.help("Path to FASTA file to convert to .fab binary (zero-copy mmap format)"),
)
.arg(
Arg::new("output")
.long("output")
.short('o')
.required(false)
.help("Output path (default: input path with .bin extension, stripping .gz first)"),
.help("Output path (default: input path with .bin/.fab extension)"),
)
}
30 changes: 28 additions & 2 deletions gtars-cli/src/prep/handlers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ use anyhow::Result;
use clap::ArgMatches;

use gtars_genomicdist::{GenomicDistAnnotation, SignalMatrix};
use gtars_genomicdist::models::BinaryGenomeAssembly;

/// Derive the default output path: strip `.gz` then append `.bin`.
fn default_output_path(input: &str) -> String {
Expand All @@ -15,10 +16,11 @@ fn default_output_path(input: &str) -> String {
pub fn run_prep(matches: &ArgMatches) -> Result<()> {
let gtf_path = matches.get_one::<String>("gtf");
let signal_path = matches.get_one::<String>("signal-matrix");
let fasta_path = matches.get_one::<String>("fasta");
let output_path = matches.get_one::<String>("output");

if gtf_path.is_none() && signal_path.is_none() {
anyhow::bail!("Provide at least one of --gtf or --signal-matrix");
if gtf_path.is_none() && signal_path.is_none() && fasta_path.is_none() {
anyhow::bail!("Provide at least one of --gtf, --signal-matrix, or --fasta");
}

if let Some(gtf) = gtf_path {
Expand Down Expand Up @@ -79,5 +81,29 @@ pub fn run_prep(matches: &ArgMatches) -> Result<()> {
);
}

if let Some(fa) = fasta_path {
let out = output_path
.cloned()
.unwrap_or_else(|| {
let stripped = fa.strip_suffix(".gz").unwrap_or(fa);
format!("{}.fab", stripped)
});

eprintln!("Converting FASTA to .fab: {}", fa);
let start = Instant::now();
BinaryGenomeAssembly::write_from_fasta(Path::new(fa), Path::new(&out))
.map_err(|e| anyhow::anyhow!("Failed to create .fab: {}", e))?;

let size = std::fs::metadata(&out)
.map(|m| m.len())
.unwrap_or(0);
eprintln!(
" wrote {} ({:.1} GB) in {:.1}s",
out,
size as f64 / 1_073_741_824.0,
start.elapsed().as_secs_f64()
);
}

Ok(())
}
8 changes: 7 additions & 1 deletion gtars-cli/src/ranges/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -133,8 +133,14 @@ pub fn create_ranges_cli() -> Command {
)
.subcommand(
Command::new("gaps")
.about("Compute gaps between regions per chromosome.")
.about("Compute gaps between regions per chromosome, bounded by chrom sizes.")
.arg(arg!(--input <BED> "Input BED file").required(true))
.arg(
Arg::new("chrom-sizes")
.long("chrom-sizes")
.required(true)
.help("Path to chrom.sizes file"),
)
.arg(arg!(--output <OUTPUT> "Output BED file (default: stdout)").required(false)),
)
.subcommand(
Expand Down
6 changes: 5 additions & 1 deletion gtars-cli/src/ranges/handlers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,11 @@ pub fn run_ranges(matches: &ArgMatches) -> Result<()> {
}
Some(("gaps", m)) => {
let rs = load_input(m)?;
let result = rs.gaps();
let cs_path = m
.get_one::<String>("chrom-sizes")
.expect("--chrom-sizes is required");
let chrom_sizes = get_chrom_sizes(cs_path);
let result = rs.gaps(&chrom_sizes);
write_output(&result, m.get_one::<String>("output"))
}
Some(("intersect", m)) => {
Expand Down
3 changes: 2 additions & 1 deletion gtars-genomicdist/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "gtars-genomicdist"
version = "0.6.0"
version = "0.8.0"
edition = "2024"
description = "Rust port of GenomicDistributions: tools for computing statistics for genomic interval sets"
license = "MIT"
Expand All @@ -15,6 +15,7 @@ regex = "1.11.1"
thiserror = { workspace = true }
serde = { workspace = true }
bio = "3.0.0"
memmap2 = "0.9"


[dev-dependencies]
Expand Down
Loading
Loading