Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 36 additions & 18 deletions python/integration-tests/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -83,33 +83,51 @@ mod integration_tests {
Ok(SingleSampleCounts { counts })
}

/// This is windows the "tskit-python" way,
/// meaning that the windows must span the entire sequence length
/// of the input
#[pyfunction]
/// NOTE: the implementation is extremely inefficient!
fn counts_from_ts_holder_windowed(
holder: &TreeSequenceHolder,
samples: Vec<i32>,
windows: Vec<f64>,
) -> PyResult<SingleSampleCountCollection> {
let mut vcounts = vec![];
assert!(!windows.is_empty());
assert!(windows[0] == 0.0);
assert!(windows[windows.len() - 1] == holder.ts.tables().sequence_length());
for w in windows.windows(2) {
assert!(w[0] < w[1]);
assert!(w[0] >= 0.);
assert!(w[1] <= holder.ts.tables().sequence_length());
let counts = popgen::MultiSiteCounts::try_from_tree_sequence_site_iter(
&holder.ts,
samples.iter().map(|i| i.into()),
holder
.ts
.site_iter()
.filter(|s| s.position() >= w[0] && s.position() < w[1]),
None,
)
.unwrap();
vcounts.push(SingleSampleCounts { counts });
}
let counts = popgen::MultiSiteCounts::try_from_tree_sequence_windows(
&holder.ts,
samples.iter().map(|i| i.into()),
windows.windows(2).map(|w| (w[0], w[1])),
None,
)
.unwrap();
let vcounts = counts
.into_iter()
.map(|c| SingleSampleCounts { counts: c })
.collect::<Vec<_>>();
Ok(SingleSampleCountCollection(vcounts))
}

/// More general windowing fn.
#[pyfunction]
fn counts_from_ts_holder_windowed_general(
holder: &TreeSequenceHolder,
samples: Vec<i32>,
windows: Vec<(f64, f64)>,
) -> PyResult<SingleSampleCountCollection> {
assert!(!windows.is_empty());
let counts = popgen::MultiSiteCounts::try_from_tree_sequence_windows(
&holder.ts,
samples.iter().map(|i| i.into()),
windows.into_iter(),
None,
)
.unwrap();
let vcounts = counts
.into_iter()
.map(|c| SingleSampleCounts { counts: c })
.collect::<Vec<_>>();
Ok(SingleSampleCountCollection(vcounts))
}

Expand Down
62 changes: 61 additions & 1 deletion python/integration-tests/tests/test_diversity.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import demes
import msprime
import numpy as np
import pytest

from hypothesis import given
from hypothesis.strategies import integers
Expand Down Expand Up @@ -112,3 +111,64 @@ def test_diversity_all_sites_windows(anc_seed, mut_seed, np_seed, num_windows):
assert len(tsdiv) == len(div)
for i, j in zip(tsdiv, div):
assert np.fabs(i-j) <= 1e-9


# This test hits our API harder by not requiring that windows cover the entire
# length of the "genome" by skipping the first and last window
# TODO: write a custom hypothesis strategy for "number of subwindows" and
# we an use numpy to subset the input windows...
@given(anc_seed=integers(1, 42000000), mut_seed=integers(1, 42000000), np_seed=integers(0, 42000000), num_windows=integers(4, 15))
def test_diversity_windows(anc_seed, mut_seed, np_seed, num_windows):
yaml = """
time_units: generations
demes:
- name: the_one_deme
epochs:
- start_size: 10000
"""
np.random.seed(np_seed)
graph = demes.loads(yaml)
demog = msprime.Demography.from_demes(graph)
ts = msprime.sim_ancestry(samples=[msprime.SampleSet(100)], sequence_length=1000000, demography=demog,
recombination_rate=1.2e-8, random_seed=anc_seed)
ts = msprime.sim_mutations(ts, rate=1.2e-8, random_seed=mut_seed)

samples = [i for i in ts.samples()]
windows = np.linspace(0.0, ts.sequence_length, num_windows, endpoint=True)
tsdiv = ts.diversity(
sample_sets=samples, span_normalise=False, windows=windows)
tsholder = integration_tests.ts_holder_from_tables(ts.tables.copy())
window_subset = windows[1:len(windows)-1]
windows_tupled = [(i, j)
for i, j in zip(window_subset[0:], window_subset[1:])]
assert len(window_subset) > 0, f"{window_subset}"
windowed_counts = integration_tests.counts_from_ts_holder_windowed_general(
tsholder, samples, windows_tupled)
div = windowed_counts.diversity()
assert len(tsdiv) - 2 == len(div)
for i, j in zip(tsdiv[1:len(tsdiv)-1], div):
assert np.fabs(i-j) <= 1e-9

if len(window_subset) > 2:
windowed_counts = integration_tests.counts_from_ts_holder_windowed_general(
tsholder, samples, windows_tupled[::2])
div = windowed_counts.diversity()
for i, j in zip(tsdiv[1:len(tsdiv)-1][::2], div):
assert np.fabs(i-j) <= 1e-9

if len(windows) > 3:
windows_tupled = [(i, j)
for i, j in zip(windows[0:], windows[1:])]
assert len(windows_tupled) == len(windows) - 1
windowed_counts = integration_tests.counts_from_ts_holder_windowed_general(
tsholder, samples, windows_tupled[::2])
div = windowed_counts.diversity()
for w, d in zip(windows_tupled[::2], div):
found = False
for i, j in zip(windows_tupled, tsdiv):
if i == w:
found = True
assert np.fabs(
d-j) <= 1e-9, f"{w} {i} {d} != {j}, {tsdiv} -> {tsdiv[::2]} vs {div}"
break
assert found is True
15 changes: 15 additions & 0 deletions src/counts.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,21 @@ impl MultiSiteCounts {
from_tree_sequence::try_from_tree_sequence_with_site_iter(ts, samples, sites, options)
}

#[cfg(feature = "tskit")]
pub fn try_from_tree_sequence_windows<N, W, P>(
ts: &tskit::TreeSequence,
samples: N,
windows: W,
options: Option<FromTreeSequenceOptions>,
) -> Result<Vec<Self>, PopgenError>
where
N: Iterator<Item = tskit::NodeId>,
W: Iterator<Item = (P, P)>,
P: Into<tskit::Position>,
{
crate::from_tree_sequence::try_from_tree_sequence_windows(ts, samples, windows, options)
}

/// Add a site from an iterator of potentially missing allele IDs.
/// # Errors
/// - If `samples` is empty.
Expand Down
198 changes: 198 additions & 0 deletions src/from_tree_sequence.rs
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,8 @@ where
Ok(num_sampled_genomes)
}

// TODO: why is this returning i32 when
// we always need to be casting the result to i64 immediately?
fn setup_samples<N>(
ts: &tskit::TreeSequence,
samples: N,
Expand Down Expand Up @@ -544,6 +546,202 @@ where
try_from_tree_sequence_details(ts, options, sites, sample_sets)
}

// NOTE: this fn duplicates a lot of pre-existing logic because
// we don't (yet) have a generic abstraction for doing this over
// iterators over samples, sites, windows, etc..
// The long term goal is to identify that pattern and refactor.
pub fn try_from_tree_sequence_windows<'ts, N, W, P>(
ts: &'ts tskit::TreeSequence,
samples: N,
windows: W,
options: Option<FromTreeSequenceOptions>,
) -> Result<Vec<MultiSiteCounts>, PopgenError>
where
N: Iterator<Item = tskit::NodeId>,
W: Iterator<Item = (P, P)>,
P: Into<tskit::Position>,
{
let (mut tree_data, num_sampled_genomes) = setup_samples(ts, samples)?;

let num_sampled_genomes = num_sampled_genomes as i64;
let mut alleles_at_site: Vec<&'ts [u8]> = vec![];
let mut allele_counts: Vec<i64> = vec![];
let mut counts = Vec::<MultiSiteCounts>::default();

let _options = options.unwrap_or_default();
let mut left = 0.0;
let edges_in = ts.edge_insertion_order_column();
let edges_out = ts.edge_removal_order_column();
let edges_left = ts.tables().edges().left_column();
let edges_right = ts.tables().edges().right_column();
let edges_parent = ts.tables().edges().parent_column();
let edges_child = ts.tables().edges().child_column();
let mutation_parent = ts.tables().mutations().parent_column();
let num_edges = ts.edges().num_rows().as_usize();
let mut i = 0_usize;
let mut j = 0_usize;

let mut site_iter = ts.site_iter();
let mut current_site = site_iter.next();
let mut lastpos: Option<tskit::Position> = None;
let windows = windows
.map(|(a, b)| (a.into(), b.into()))
.collect::<Vec<_>>();
if windows.is_empty() {
return Err(PopgenError::LibraryError("empty windows".to_string()));
}

for (index, i) in windows.iter().enumerate() {
for j in [i.0, i.1] {
if j < 0.0 || j > ts.tables().sequence_length() || !f64::from(j).is_finite() {
return Err(PopgenError::LibraryError(
format!("invalid Position {j}").to_string(),
));
}
}
if i.0 >= i.1 {
return Err(PopgenError::LibraryError(
format!("invalid window {i:?}").to_string(),
));
}
for j in windows.iter().skip(index + 1) {
if j.0 <= i.0 {
return Err(PopgenError::LibraryError(
format!("unsorted windows {i:?} {j:?}").to_string(),
));
}
if j.1 > i.0 && i.1 > j.0 {
return Err(PopgenError::LibraryError(
format!("overlapping windows {i:?} {j:?}").to_string(),
));
}
}
}
Comment on lines +607 to +619

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is O(n^2); surely there is a better way? It seems sufficient to verify the following:

  • Start positions are in strictly increasing order.
  • Every window w_i ends before at the same position that the next window w_(i+1) starts.

Would this not be O(n)?

let mut windows = windows.into_iter();
let mut current_window = windows.next();
// The last element will be the counts for the current window.
counts.push(MultiSiteCounts::default());
while i < num_edges && left < ts.tables().sequence_length() {
while j < num_edges && edges_right[edges_out[j]] == left {
let edge_parent = edges_parent[edges_out[j]].as_usize();
let edge_child = edges_child[edges_out[j]].as_usize();
tree_data.process_output_edge(edge_parent, edge_child);
j += 1;
}
while i < num_edges && edges_left[edges_in[i]] == left {
let edge_parent = edges_parent[edges_in[i]].as_usize();
let edge_child = edges_child[edges_in[i]].as_usize();
tree_data.process_input_edge(edge_parent, edge_child);
i += 1;
}
let right = update_right(
ts.tables().sequence_length().into(),
i,
&edges_left,
&edges_in,
);
let right = update_right(right, j, &edges_right, &edges_out);
while let Some(site_ref) = current_site.as_ref() {
// TODO: in general, we may need to ensure left <= position < right

// Outline:
// 1. If site is in current window, process it using the code below
// 2. If site is right of the current window, advance the window
// and return to 1 if the next window is_some()
// 3. If the next window is_none(), simply return counts
let mut process_site = false;

if let Some((left, right)) = current_window.as_ref() {
if site_ref.position() >= *left && site_ref.position() < *right {
process_site = true;
} else if site_ref.position() >= *right {
current_window = windows.next();
if let Some((left, right)) = current_window.as_ref() {
counts.push(MultiSiteCounts::default());
if site_ref.position() >= *left && site_ref.position() < *right {
process_site = true;
}
} else {
// Out of windows
return Ok(counts);
}
}
}

if process_site {
if site_ref.position() < right {
if let Some(lp) = lastpos.as_ref() {
if *lp >= site_ref.position() {
return Err(PopgenError::LibraryError(
"unsorted site positions".to_string(),
));
}
}
setup_alleles_at_site(ts, site_ref.id(), &mut alleles_at_site)?;
allele_counts.resize(1, 0);

// NOTE: we process in reverse order because
// more recent mutations get processed first,
// allowing the propagation of already-mutated
// nodes up the tree.
for mutation in site_ref.mutation_iter().rev() {
let num_samples_inheriting_derived_state =
tree_data.process_mutation(&mutation, &mutation_parent);
if num_samples_inheriting_derived_state > 0
&& num_samples_inheriting_derived_state < num_sampled_genomes
{
let derived_state =
*ts.mutations().derived_state(mutation.id()).as_ref().ok_or(
PopgenError::LibraryError(
"mutation is missing derived state".to_string(),
),
)?;
match alleles_at_site.iter().position(|&x| x == derived_state) {
Some(index) => {
if index > 0 {
allele_counts[index] += num_samples_inheriting_derived_state
}
}
None => {
alleles_at_site.push(derived_state);
allele_counts.push(num_samples_inheriting_derived_state);
}
}
}
}
// TODO: we should simply sum the desired quantity as we go along,
// eliminating the need for an iteration here.
allele_counts[0] =
(num_sampled_genomes) - allele_counts.iter().skip(1).sum::<i64>();
assert!(allele_counts[0] >= 0);
if allele_counts
.iter()
.filter(|&&i| i > 0 && i < num_sampled_genomes)
.count()
> 1
{
let i = counts.len() - 1;
counts[i]
.add_site_from_counts(&allele_counts, num_sampled_genomes as i32)?;
}
lastpos = Some(site_ref.position());
current_site = site_iter.next();
} else {
break;
}
} else {
lastpos = Some(site_ref.position());
current_site = site_iter.next();
}
}
if current_site.is_none() {
break;
};
left = right;
}
Ok(counts)
}

pub fn try_from_tree_sequence_multi_with_site_iter<'ts, Outer, Inner, S>(
ts: &'ts tskit::TreeSequence,
samples: Outer,
Expand Down
Loading
Loading