Skip to content
Merged
16 changes: 15 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
...

## [v0.2.0]
### Fixed
- fixed a bug in the intervals to values cuda kernel that
introduced zeros in places where there should be
"default_value" (see release v0.1.5).
### Added
- custom_position_sampler argument to bigwig_loader.dataset.BigWigDataset
and bigwig_loader.pytorch.PytorchBigWigDataset to optionally overwrite the
default random sampling of genomic coordinates from "regions of interest."
- custom_track_sampler argument to bigwig_loader.dataset.BigWigDataset
and bigwig_loader.pytorch.PytorchBigWigDataset to optionally use a different
track sampling strategy.

## [v0.1.5]
### Added
- set a default value different from 0.0
Expand Down Expand Up @@ -47,7 +60,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added
- release to pypi

[Unreleased]: https://github.com/pfizer-opensource/bigwig-loader/compare/v0.1.5...HEAD
[Unreleased]: https://github.com/pfizer-opensource/bigwig-loader/compare/v0.2.0...HEAD
[v0.2.0]: https://github.com/pfizer-opensource/bigwig-loader/compare/v0.1.5...v0.2.0
[v0.1.5]: https://github.com/pfizer-opensource/bigwig-loader/compare/v0.1.4...v0.1.5
[v0.1.4]: https://github.com/pfizer-opensource/bigwig-loader/compare/v0.1.3...v0.1.4
[v0.1.3]: https://github.com/pfizer-opensource/bigwig-loader/compare/v0.1.2...v0.1.3
Expand Down
33 changes: 22 additions & 11 deletions bigwig_loader/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from pathlib import Path
from typing import Any
from typing import Callable
from typing import Iterable
from typing import Iterator
from typing import Literal
from typing import Optional
Expand Down Expand Up @@ -78,6 +79,12 @@ class BigWigDataset:
GPU. More threads means that more IO can take place while the GPU is busy doing
calculations (decompressing or neural network training for example). More threads
also means a higher GPU memory usage. Default: 4
custom_position_sampler: if set, this sampler will be used instead of the default
position sampler (which samples randomly and uniform from regions of interest)
This should be an iterable of tuples (chromosome, center).
custom_track_sampler: if specified, this sampler will be used to sample tracks. When not
specified, each batch simply contains all tracks, or a randomly selected subset of
tracks in case sub_sample_tracks is set. Should be Iterable batches of track indices.
return_batch_objects: if True, the batches will be returned as instances of
bigwig_loader.batch.Batch
"""
Expand Down Expand Up @@ -107,6 +114,8 @@ def __init__(
repeat_same_positions: bool = False,
sub_sample_tracks: Optional[int] = None,
n_threads: int = 4,
custom_position_sampler: Optional[Iterable[tuple[str, int]]] = None,
custom_track_sampler: Optional[Iterable[list[int]]] = None,
return_batch_objects: bool = False,
):
super().__init__()
Expand Down Expand Up @@ -152,32 +161,34 @@ def __init__(
self._sub_sample_tracks = sub_sample_tracks
self._n_threads = n_threads
self._return_batch_objects = return_batch_objects

def _create_dataloader(self) -> StreamedDataloader:
position_sampler = RandomPositionSampler(
self._position_sampler = custom_position_sampler or RandomPositionSampler(
regions_of_interest=self.regions_of_interest,
buffer_size=self._position_sampler_buffer_size,
repeat_same=self._repeat_same_positions,
)
if custom_track_sampler is not None:
self._track_sampler: Optional[Iterable[list[int]]] = custom_track_sampler
elif sub_sample_tracks is not None:
self._track_sampler = TrackSampler(
total_number_of_tracks=len(self.bigwig_collection),
sample_size=sub_sample_tracks,
)
else:
self._track_sampler = None

def _create_dataloader(self) -> StreamedDataloader:
sequence_sampler = GenomicSequenceSampler(
reference_genome_path=self.reference_genome_path,
sequence_length=self.sequence_length,
position_sampler=position_sampler,
position_sampler=self._position_sampler,
maximum_unknown_bases_fraction=self.maximum_unknown_bases_fraction,
)
track_sampler = None
if self._sub_sample_tracks is not None:
track_sampler = TrackSampler(
total_number_of_tracks=len(self.bigwig_collection),
sample_size=self._sub_sample_tracks,
)

query_batch_generator = QueryBatchGenerator(
genomic_location_sampler=sequence_sampler,
center_bin_to_predict=self.center_bin_to_predict,
batch_size=self.super_batch_size,
track_sampler=track_sampler,
track_sampler=self._track_sampler,
)

return StreamedDataloader(
Expand Down
66 changes: 54 additions & 12 deletions bigwig_loader/download_example_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,20 +19,55 @@ def download_example_data() -> None:

def get_reference_genome(reference_genome_path: Path = config.reference_genome) -> Path:
compressed_file = reference_genome_path.with_suffix(".fasta.gz")
if reference_genome_path.exists():
return reference_genome_path
elif compressed_file.exists():
# subprocess.run(["bgzip", "-d", compressed_file])
unzip_gz_file(compressed_file, reference_genome_path)
else:
LOGGER.info("Need reference genome for tests. Downloading it from ENCODE.")
url = "https://www.encodeproject.org/files/GRCh38_no_alt_analysis_set_GCA_000001405.15/@@download/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.gz"
urllib.request.urlretrieve(url, compressed_file)
if compressed_file.exists() and not reference_genome_path.exists():
# subprocess.run(["bgzip", "-d", compressed_file])
unzip_gz_file(compressed_file, reference_genome_path)

if (
reference_genome_path.exists()
and checksum_md5_for_path(reference_genome_path)
!= config.reference_genome_checksum
):
LOGGER.info(
f"Reference genome checksum mismatch, downloading again from {reference_genome_path}"
)
_download_genome(
url=config.reference_genome_url,
compressed_file_path=compressed_file,
uncompressed_file_path=reference_genome_path,
md5_checksum=config.reference_genome_checksum,
)

if not reference_genome_path.exists():
LOGGER.info(
f"Reference genome not found, downloading from {config.reference_genome_url}"
)
_download_genome(
url=config.reference_genome_url,
compressed_file_path=compressed_file,
uncompressed_file_path=reference_genome_path,
md5_checksum=config.reference_genome_checksum,
)
return reference_genome_path


def _download_genome(
    url: str,
    compressed_file_path: Path,
    uncompressed_file_path: Path,
    md5_checksum: str,
) -> Path:
    """Download, unpack and verify the reference genome.

    Fetches *url* to *compressed_file_path*, gunzips it to
    *uncompressed_file_path*, and checks the MD5 of the unpacked file
    against *md5_checksum*.

    Raises:
        RuntimeError: when the unpacked file's checksum does not match.

    Returns:
        The path to the verified uncompressed file.
    """
    urllib.request.urlretrieve(url, compressed_file_path)
    unzip_gz_file(compressed_file_path, uncompressed_file_path)
    this_checksum = checksum_md5_for_path(uncompressed_file_path)
    # Fail loudly on a corrupt or truncated download rather than letting
    # a bad reference genome propagate into downstream tests.
    if this_checksum == md5_checksum:
        return uncompressed_file_path
    raise RuntimeError(
        f"{uncompressed_file_path} has incorrect checksum: {this_checksum} vs. {md5_checksum}"
    )


def unzip_gz_file(compressed_file_path: Path, output_file_path: Path) -> Path:
with gzip.open(compressed_file_path, "rb") as gz_file:
with open(output_file_path, "wb") as output_file:
Expand All @@ -52,6 +87,13 @@ def unzip_gz_file(compressed_file_path: Path, output_file_path: Path) -> Path:
}


def checksum_md5_for_path(path: Path, chunk_size: int = 10 * 1024 * 1024) -> str:
    """Return the hex MD5 digest of the file at *path*.

    The file is read in binary mode in chunks of *chunk_size* bytes so
    large files do not need to fit in memory.
    """
    with path.open(mode="rb") as file_handle:
        return checksum_md5(file_handle, chunk_size=chunk_size)


def checksum_md5(f: BinaryIO, *, chunk_size: int = 10 * 1024 * 1024) -> str:
"""return the md5sum"""
m = hashlib.md5(b"", usedforsecurity=False)
Expand All @@ -68,7 +110,7 @@ def get_example_bigwigs_files(bigwig_dir: Path = config.bigwig_dir) -> Path:
file = bigwig_dir / fn
if not file.exists():
urllib.request.urlretrieve(url, file)
with file.open(mode="rb") as f:
if checksum_md5(f) != md5:
raise RuntimeError(f"{fn} has incorrect checksum!")
checksum = checksum_md5_for_path(file)
if checksum != md5:
raise RuntimeError(f"{fn} has incorrect checksum: {checksum} vs. {md5}")
return bigwig_dir
62 changes: 49 additions & 13 deletions bigwig_loader/intervals_to_values.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import logging
import math
from math import isnan
from pathlib import Path

import cupy as cp
Expand Down Expand Up @@ -86,11 +87,15 @@ def intervals_to_values(
)

if out is None:
logging.debug(f"Creating new out tensor with default value {default_value}")

out = cp.full(
(found_starts.shape[0], len(query_starts), sequence_length // window_size),
default_value,
dtype=cp.float32,
)
logging.debug(out)

else:
logging.debug(f"Setting default value in output tensor to {default_value}")
out.fill(default_value)
Expand Down Expand Up @@ -120,6 +125,7 @@ def intervals_to_values(
array_start = cp.ascontiguousarray(array_start)
array_end = cp.ascontiguousarray(array_end)
array_value = cp.ascontiguousarray(array_value)
default_value_isnan = isnan(default_value)

cuda_kernel(
(grid_size,),
Expand All @@ -137,6 +143,8 @@ def intervals_to_values(
sequence_length,
max_number_intervals,
window_size,
cp.float32(default_value),
default_value_isnan,
out,
),
)
Expand Down Expand Up @@ -167,8 +175,10 @@ def kernel_in_python_with_window(
int,
int,
int,
cp.ndarray,
int,
float,
bool,
cp.ndarray,
],
) -> cp.ndarray:
"""Equivalent in python to cuda_kernel_with_window. Just for debugging."""
Expand All @@ -186,6 +196,8 @@ def kernel_in_python_with_window(
sequence_length,
max_number_intervals,
window_size,
default_value,
default_value_isnan,
out,
) = args

Expand Down Expand Up @@ -214,7 +226,7 @@ def kernel_in_python_with_window(
print("reduced_dim")
print(reduced_dim)

out_vector = [0.0] * reduced_dim * batch_size * num_tracks
out_vector = [default_value] * reduced_dim * batch_size * num_tracks

for thread in range(n_threads):
batch_index = thread % batch_size
Expand All @@ -235,7 +247,8 @@ def kernel_in_python_with_window(

cursor = found_start_index
window_index = 0
summation = 0
summation = 0.0
valid_count = 0

# cursor moves through the rows of the bigwig file
# window_index moves through the sequence
Expand All @@ -261,19 +274,31 @@ def kernel_in_python_with_window(
print("start index", start_index)

if start_index >= window_end:
print("CONTINUE")
out_vector[i * reduced_dim + window_index] = summation / window_size
summation = 0
if default_value_isnan:
if valid_count > 0:
out_vector[i * reduced_dim + window_index] = (
summation / valid_count
)
else:
out_vector[i * reduced_dim + window_index] = default_value
else:
summation = summation + (window_size - valid_count) * default_value
out_vector[i * reduced_dim + window_index] = summation / window_size
summation = 0.0
valid_count = 0
window_index += 1
print("CONTINUE")
continue

number = min(window_end, end_index) - max(window_start, start_index)

print(
f"Add {number} x {track_values[cursor]} = {number * track_values[cursor]} to summation"
)
summation += number * track_values[cursor]
print(f"Summation = {summation}")
if number > 0:
print(
f"Add {number} x {track_values[cursor]} = {number * track_values[cursor]} to summation"
)
summation += number * track_values[cursor]
print(f"Summation = {summation}")
valid_count += number

print("end_index", "window_end")
print(end_index, window_end)
Expand All @@ -288,8 +313,19 @@ def kernel_in_python_with_window(
print(
"cursor + 1 >= found_end_index \t\t calculate average, reset summation and move to next window"
)
out_vector[i * reduced_dim + window_index] = summation / window_size
summation = 0
# out_vector[i * reduced_dim + window_index] = summation / window_size
if default_value_isnan:
if valid_count > 0:
out_vector[i * reduced_dim + window_index] = (
summation / valid_count
)
else:
out_vector[i * reduced_dim + window_index] = default_value
else:
summation = summation + (window_size - valid_count) * default_value
out_vector[i * reduced_dim + window_index] = summation / window_size
summation = 0.0
valid_count = 0
window_index += 1
# move cursor
if end_index < window_end:
Expand Down
11 changes: 11 additions & 0 deletions bigwig_loader/pytorch.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from pathlib import Path
from typing import Any
from typing import Callable
from typing import Iterable
from typing import Iterator
from typing import Literal
from typing import Optional
Expand Down Expand Up @@ -165,6 +166,12 @@ class PytorchBigWigDataset(IterableDataset[BATCH_TYPE]):
also means a higher GPU memory usage. Default: 4
return_batch_objects: if True, the batches will be returned as instances of
bigwig_loader.pytorch.PytorchBatch
custom_position_sampler: if set, this sampler will be used instead of the default
position sampler (which samples randomly and uniform from regions of interest)
This should be an iterable of tuples (chromosome, center).
custom_track_sampler: if specified, this sampler will be used to sample tracks. When not
specified, each batch simply contains all tracks, or a randomly selected subset of
tracks in case sub_sample_tracks is set. Should be Iterable batches of track indices.
"""

def __init__(
Expand Down Expand Up @@ -192,6 +199,8 @@ def __init__(
repeat_same_positions: bool = False,
sub_sample_tracks: Optional[int] = None,
n_threads: int = 4,
custom_position_sampler: Optional[Iterable[tuple[str, int]]] = None,
custom_track_sampler: Optional[Iterable[list[int]]] = None,
return_batch_objects: bool = False,
):
super().__init__()
Expand All @@ -217,6 +226,8 @@ def __init__(
repeat_same_positions=repeat_same_positions,
sub_sample_tracks=sub_sample_tracks,
n_threads=n_threads,
custom_position_sampler=custom_position_sampler,
custom_track_sampler=custom_track_sampler,
return_batch_objects=True,
)
self._return_batch_objects = return_batch_objects
Expand Down
3 changes: 2 additions & 1 deletion bigwig_loader/sampler/genome_sampler.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from pathlib import Path
from typing import Any
from typing import Callable
from typing import Iterable
from typing import Iterator
from typing import Literal
from typing import Optional
Expand All @@ -21,7 +22,7 @@ def __init__(
self,
reference_genome_path: Path,
sequence_length: int,
position_sampler: Iterator[tuple[str, int]],
position_sampler: Iterable[tuple[str, int]],
maximum_unknown_bases_fraction: float = 0.1,
):
self.reference_genome_path = reference_genome_path
Expand Down
Loading