Skip to content
Merged
16 changes: 15 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
...

## [v0.2.0]
### Fixed
- fixed a bug in the intervals to values cuda kernel that
introduced zeros in places where there should be
"default_value" (see release v0.1.5).
### Added
- custom_position_sampler argument to bigwig_loader.dataset.BigWigDataset
and bigwig_loader.pytorch.PytorchBigWigDataset to optionally overwrite the
default random sampling of genomic coordinates from "regions of interest."
- custom_track_sampler argument to bigwig_loader.dataset.BigWigDataset
and bigwig_loader.pytorch.PytorchBigWigDataset to optionally use a different
track sampling strategy.

## [v0.1.5]
### Added
- set a default value different from 0.0
Expand Down Expand Up @@ -47,7 +60,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added
- release to pypi

[Unreleased]: https://github.com/pfizer-opensource/bigwig-loader/compare/v0.1.5...HEAD
[Unreleased]: https://github.com/pfizer-opensource/bigwig-loader/compare/v0.2.0...HEAD
[v0.2.0]: https://github.com/pfizer-opensource/bigwig-loader/compare/v0.1.5...v0.2.0
[v0.1.5]: https://github.com/pfizer-opensource/bigwig-loader/compare/v0.1.4...v0.1.5
[v0.1.4]: https://github.com/pfizer-opensource/bigwig-loader/compare/v0.1.3...v0.1.4
[v0.1.3]: https://github.com/pfizer-opensource/bigwig-loader/compare/v0.1.2...v0.1.3
Expand Down
33 changes: 22 additions & 11 deletions bigwig_loader/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from pathlib import Path
from typing import Any
from typing import Callable
from typing import Iterable
from typing import Iterator
from typing import Literal
from typing import Optional
Expand Down Expand Up @@ -78,6 +79,12 @@ class BigWigDataset:
GPU. More threads means that more IO can take place while the GPU is busy doing
calculations (decompressing or neural network training for example). More threads
also means a higher GPU memory usage. Default: 4
custom_position_sampler: if set, this sampler will be used instead of the default
position sampler (which samples randomly and uniform from regions of interest)
This should be an iterable of tuples (chromosome, center).
custom_track_sampler: if specified, this sampler will be used to sample tracks. When not
specified, each batch simply contains all tracks, or a randomly selected subset of
tracks in case sub_sample_tracks is set. Should be Iterable batches of track indices.
return_batch_objects: if True, the batches will be returned as instances of
bigwig_loader.batch.Batch
"""
Expand Down Expand Up @@ -107,6 +114,8 @@ def __init__(
repeat_same_positions: bool = False,
sub_sample_tracks: Optional[int] = None,
n_threads: int = 4,
custom_position_sampler: Optional[Iterable[tuple[str, int]]] = None,
custom_track_sampler: Optional[Iterable[list[int]]] = None,
return_batch_objects: bool = False,
):
super().__init__()
Expand Down Expand Up @@ -152,32 +161,34 @@ def __init__(
self._sub_sample_tracks = sub_sample_tracks
self._n_threads = n_threads
self._return_batch_objects = return_batch_objects

def _create_dataloader(self) -> StreamedDataloader:
position_sampler = RandomPositionSampler(
self._position_sampler = custom_position_sampler or RandomPositionSampler(
regions_of_interest=self.regions_of_interest,
buffer_size=self._position_sampler_buffer_size,
repeat_same=self._repeat_same_positions,
)
if custom_track_sampler is not None:
self._track_sampler: Optional[Iterable[list[int]]] = custom_track_sampler
elif sub_sample_tracks is not None:
self._track_sampler = TrackSampler(
total_number_of_tracks=len(self.bigwig_collection),
sample_size=sub_sample_tracks,
)
else:
self._track_sampler = None

def _create_dataloader(self) -> StreamedDataloader:
sequence_sampler = GenomicSequenceSampler(
reference_genome_path=self.reference_genome_path,
sequence_length=self.sequence_length,
position_sampler=position_sampler,
position_sampler=self._position_sampler,
maximum_unknown_bases_fraction=self.maximum_unknown_bases_fraction,
)
track_sampler = None
if self._sub_sample_tracks is not None:
track_sampler = TrackSampler(
total_number_of_tracks=len(self.bigwig_collection),
sample_size=self._sub_sample_tracks,
)

query_batch_generator = QueryBatchGenerator(
genomic_location_sampler=sequence_sampler,
center_bin_to_predict=self.center_bin_to_predict,
batch_size=self.super_batch_size,
track_sampler=track_sampler,
track_sampler=self._track_sampler,
)

return StreamedDataloader(
Expand Down
66 changes: 54 additions & 12 deletions bigwig_loader/download_example_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,20 +19,55 @@ def download_example_data() -> None:

def get_reference_genome(reference_genome_path: Path = config.reference_genome) -> Path:
compressed_file = reference_genome_path.with_suffix(".fasta.gz")
if reference_genome_path.exists():
return reference_genome_path
elif compressed_file.exists():
# subprocess.run(["bgzip", "-d", compressed_file])
unzip_gz_file(compressed_file, reference_genome_path)
else:
LOGGER.info("Need reference genome for tests. Downloading it from ENCODE.")
url = "https://www.encodeproject.org/files/GRCh38_no_alt_analysis_set_GCA_000001405.15/@@download/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.gz"
urllib.request.urlretrieve(url, compressed_file)
if compressed_file.exists() and not reference_genome_path.exists():
# subprocess.run(["bgzip", "-d", compressed_file])
unzip_gz_file(compressed_file, reference_genome_path)

if (
reference_genome_path.exists()
and checksum_md5_for_path(reference_genome_path)
!= config.reference_genome_checksum
):
LOGGER.info(
f"Reference genome checksum mismatch, downloading again from {reference_genome_path}"
)
_download_genome(
url=config.reference_genome_url,
compressed_file_path=compressed_file,
uncompressed_file_path=reference_genome_path,
md5_checksum=config.reference_genome_checksum,
)

if not reference_genome_path.exists():
LOGGER.info(
f"Reference genome not found, downloading from {config.reference_genome_url}"
)
_download_genome(
url=config.reference_genome_url,
compressed_file_path=compressed_file,
uncompressed_file_path=reference_genome_path,
md5_checksum=config.reference_genome_checksum,
)
return reference_genome_path


def _download_genome(
    url: str,
    compressed_file_path: Path,
    uncompressed_file_path: Path,
    md5_checksum: str,
) -> Path:
    """Download, unpack and verify the reference genome.

    Fetches *url* to *compressed_file_path*, gunzips it to
    *uncompressed_file_path*, and checks the MD5 of the unpacked file
    against *md5_checksum*.

    Raises:
        RuntimeError: when the unpacked file's checksum does not match.

    Returns:
        The path to the verified uncompressed file.
    """
    urllib.request.urlretrieve(url, compressed_file_path)
    unzip_gz_file(compressed_file_path, uncompressed_file_path)
    this_checksum = checksum_md5_for_path(uncompressed_file_path)
    # Fail loudly on a corrupt or truncated download rather than letting
    # a bad reference genome propagate into downstream tests.
    if this_checksum == md5_checksum:
        return uncompressed_file_path
    raise RuntimeError(
        f"{uncompressed_file_path} has incorrect checksum: {this_checksum} vs. {md5_checksum}"
    )


def unzip_gz_file(compressed_file_path: Path, output_file_path: Path) -> Path:
with gzip.open(compressed_file_path, "rb") as gz_file:
with open(output_file_path, "wb") as output_file:
Expand All @@ -52,6 +87,13 @@ def unzip_gz_file(compressed_file_path: Path, output_file_path: Path) -> Path:
}


def checksum_md5_for_path(path: Path, chunk_size: int = 10 * 1024 * 1024) -> str:
    """Return the hex MD5 digest of the file at *path*.

    The file is read in binary mode in chunks of *chunk_size* bytes so
    large files do not need to fit in memory.
    """
    with path.open(mode="rb") as file_handle:
        return checksum_md5(file_handle, chunk_size=chunk_size)


def checksum_md5(f: BinaryIO, *, chunk_size: int = 10 * 1024 * 1024) -> str:
"""return the md5sum"""
m = hashlib.md5(b"", usedforsecurity=False)
Expand All @@ -68,7 +110,7 @@ def get_example_bigwigs_files(bigwig_dir: Path = config.bigwig_dir) -> Path:
file = bigwig_dir / fn
if not file.exists():
urllib.request.urlretrieve(url, file)
with file.open(mode="rb") as f:
if checksum_md5(f) != md5:
raise RuntimeError(f"{fn} has incorrect checksum!")
checksum = checksum_md5_for_path(file)
if checksum != md5:
raise RuntimeError(f"{fn} has incorrect checksum: {checksum} vs. {md5}")
return bigwig_dir
62 changes: 49 additions & 13 deletions bigwig_loader/intervals_to_values.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import logging
import math
from math import isnan
from pathlib import Path

import cupy as cp
Expand Down Expand Up @@ -86,11 +87,15 @@ def intervals_to_values(
)

if out is None:
logging.debug(f"Creating new out tensor with default value {default_value}")

out = cp.full(
(found_starts.shape[0], len(query_starts), sequence_length // window_size),
default_value,
dtype=cp.float32,
)
logging.debug(out)

else:
logging.debug(f"Setting default value in output tensor to {default_value}")
out.fill(default_value)
Expand Down Expand Up @@ -120,6 +125,7 @@ def intervals_to_values(
array_start = cp.ascontiguousarray(array_start)
array_end = cp.ascontiguousarray(array_end)
array_value = cp.ascontiguousarray(array_value)
default_value_isnan = isnan(default_value)

cuda_kernel(
(grid_size,),
Expand All @@ -137,6 +143,8 @@ def intervals_to_values(
sequence_length,
max_number_intervals,
window_size,
cp.float32(default_value),
default_value_isnan,
out,
),
)
Expand Down Expand Up @@ -167,8 +175,10 @@ def kernel_in_python_with_window(
int,
int,
int,
cp.ndarray,
int,
float,
bool,
cp.ndarray,
],
) -> cp.ndarray:
"""Equivalent in python to cuda_kernel_with_window. Just for debugging."""
Expand All @@ -186,6 +196,8 @@ def kernel_in_python_with_window(
sequence_length,
max_number_intervals,
window_size,
default_value,
default_value_isnan,
out,
) = args

Expand Down Expand Up @@ -214,7 +226,7 @@ def kernel_in_python_with_window(
print("reduced_dim")
print(reduced_dim)

out_vector = [0.0] * reduced_dim * batch_size * num_tracks
out_vector = [default_value] * reduced_dim * batch_size * num_tracks

for thread in range(n_threads):
batch_index = thread % batch_size
Expand All @@ -235,7 +247,8 @@ def kernel_in_python_with_window(

cursor = found_start_index
window_index = 0
summation = 0
summation = 0.0
valid_count = 0

# cursor moves through the rows of the bigwig file
# window_index moves through the sequence
Expand All @@ -261,19 +274,31 @@ def kernel_in_python_with_window(
print("start index", start_index)

if start_index >= window_end:
print("CONTINUE")
out_vector[i * reduced_dim + window_index] = summation / window_size
summation = 0
if default_value_isnan:
if valid_count > 0:
out_vector[i * reduced_dim + window_index] = (
summation / valid_count
)
else:
out_vector[i * reduced_dim + window_index] = default_value
else:
summation = summation + (window_size - valid_count) * default_value
out_vector[i * reduced_dim + window_index] = summation / window_size
summation = 0.0
valid_count = 0
window_index += 1
print("CONTINUE")
continue

number = min(window_end, end_index) - max(window_start, start_index)

print(
f"Add {number} x {track_values[cursor]} = {number * track_values[cursor]} to summation"
)
summation += number * track_values[cursor]
print(f"Summation = {summation}")
if number > 0:
print(
f"Add {number} x {track_values[cursor]} = {number * track_values[cursor]} to summation"
)
summation += number * track_values[cursor]
print(f"Summation = {summation}")
valid_count += number

print("end_index", "window_end")
print(end_index, window_end)
Expand All @@ -288,8 +313,19 @@ def kernel_in_python_with_window(
print(
"cursor + 1 >= found_end_index \t\t calculate average, reset summation and move to next window"
)
out_vector[i * reduced_dim + window_index] = summation / window_size
summation = 0
# out_vector[i * reduced_dim + window_index] = summation / window_size
if default_value_isnan:
if valid_count > 0:
out_vector[i * reduced_dim + window_index] = (
summation / valid_count
)
else:
out_vector[i * reduced_dim + window_index] = default_value
else:
summation = summation + (window_size - valid_count) * default_value
out_vector[i * reduced_dim + window_index] = summation / window_size
summation = 0.0
valid_count = 0
window_index += 1
# move cursor
if end_index < window_end:
Expand Down
11 changes: 11 additions & 0 deletions bigwig_loader/pytorch.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from pathlib import Path
from typing import Any
from typing import Callable
from typing import Iterable
from typing import Iterator
from typing import Literal
from typing import Optional
Expand Down Expand Up @@ -165,6 +166,12 @@ class PytorchBigWigDataset(IterableDataset[BATCH_TYPE]):
also means a higher GPU memory usage. Default: 4
return_batch_objects: if True, the batches will be returned as instances of
bigwig_loader.pytorch.PytorchBatch
custom_position_sampler: if set, this sampler will be used instead of the default
position sampler (which samples randomly and uniform from regions of interest)
This should be an iterable of tuples (chromosome, center).
custom_track_sampler: if specified, this sampler will be used to sample tracks. When not
specified, each batch simply contains all tracks, or a randomly selected subset of
tracks in case sub_sample_tracks is set. Should be Iterable batches of track indices.
"""

def __init__(
Expand Down Expand Up @@ -192,6 +199,8 @@ def __init__(
repeat_same_positions: bool = False,
sub_sample_tracks: Optional[int] = None,
n_threads: int = 4,
custom_position_sampler: Optional[Iterable[tuple[str, int]]] = None,
custom_track_sampler: Optional[Iterable[list[int]]] = None,
return_batch_objects: bool = False,
):
super().__init__()
Expand All @@ -217,6 +226,8 @@ def __init__(
repeat_same_positions=repeat_same_positions,
sub_sample_tracks=sub_sample_tracks,
n_threads=n_threads,
custom_position_sampler=custom_position_sampler,
custom_track_sampler=custom_track_sampler,
return_batch_objects=True,
)
self._return_batch_objects = return_batch_objects
Expand Down
3 changes: 2 additions & 1 deletion bigwig_loader/sampler/genome_sampler.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from pathlib import Path
from typing import Any
from typing import Callable
from typing import Iterable
from typing import Iterator
from typing import Literal
from typing import Optional
Expand All @@ -21,7 +22,7 @@ def __init__(
self,
reference_genome_path: Path,
sequence_length: int,
position_sampler: Iterator[tuple[str, int]],
position_sampler: Iterable[tuple[str, int]],
maximum_unknown_bases_fraction: float = 0.1,
):
self.reference_genome_path = reference_genome_path
Expand Down
Loading