33from typing import Any
44from typing import Callable
55from typing import Iterator
6+ from typing import Literal
67from typing import Optional
78from typing import Sequence
89from typing import Union
@@ -38,16 +39,16 @@ class BigWigDataset:
3839 reference_genome_path: path to fasta file containing the reference genome.
3940 sequence_length: number of base pairs in input sequence
4041 center_bin_to_predict: if given, only do prediction on a central window. Should be
41- smaller than or equal to sequence_length. If not given will be the same as
42- sequence_length.
42+ smaller than or equal to sequence_length. If None, the whole sequence length
43+ will be used. Default: None
4344 window_size: used to down sample the resolution of the target from sequence_length
4445 moving_average_window_size: window size for moving average on the target. Can
4546 help to smooth out the target. Default: 1, which means no smoothing. If
4647 used in combination with window_size, the target is first downsampled and
4748 then smoothed.
4849 batch_size: batch size
4950 super_batch_size: batch size that is used in the background to load data from
50- bigwig files. Should be larget than or equal to batch_size. If None, it will
51+ bigwig files. Should be larger than or equal to batch_size. If None, it will
5152 be equal to batch_size.
5253 batches_per_epoch: because the length of an epoch is slightly arbitrary here,
5354 the number of batches can be set by hand. If not the number of batches per
@@ -61,6 +62,9 @@ class BigWigDataset:
6162 If None, no scaling is done. Keys can be (partial) file paths. See
6263 bigwig_loader.path.match_key_to_path for more information about how
6364 dict keys are mapped to paths.
65+ default_value: value to use for intervals that are not present in the
66+ bigwig file. Defaults to 0.0. Can be set to cp.nan to differentiate
67+ between missing values and values that are actually 0.0.
6468 first_n_files: Only use the first n files (handy for debugging on fewer tasks)
6569 position_sampler_buffer_size: number of intervals picked up front by the position sampler.
6670 When all intervals are used, new intervals are picked.
@@ -73,7 +77,7 @@ class BigWigDataset:
7377 n_threads: number of python threads / cuda streams to use for loading the data to
7478 GPU. More threads means that more IO can take place while the GPU is busy doing
7579 calculations (decompressing or neural network training for example). More threads
76- also means a higher GPU memory usage.
80+ also means a higher GPU memory usage. Default: 4
7781 return_batch_objects: if True, the batches will be returned as instances of
7882 bigwig_loader.batch.Batch
7983 """
@@ -92,11 +96,12 @@ def __init__(
9296 batches_per_epoch : Optional [int ] = None ,
9397 maximum_unknown_bases_fraction : float = 0.1 ,
9498 sequence_encoder : Optional [
95- Union [Callable [[Sequence [str ]], Any ], str ]
99+ Union [Callable [[Sequence [str ]], Any ], Literal [ "onehot" ] ]
96100 ] = "onehot" ,
97101 file_extensions : Sequence [str ] = (".bigWig" , ".bw" ),
98102 crawl : bool = True ,
99103 scale : Optional [dict [Union [str | Path ], Any ]] = None ,
104+ default_value : float = 0.0 ,
100105 first_n_files : Optional [int ] = None ,
101106 position_sampler_buffer_size : int = 100000 ,
102107 repeat_same_positions : bool = False ,
@@ -139,6 +144,7 @@ def __init__(
139144 self ._first_n_files = first_n_files
140145 self ._file_extensions = file_extensions
141146 self ._crawl = crawl
147+ self ._default_value = default_value
142148 self ._scale = scale
143149 self ._position_sampler_buffer_size = position_sampler_buffer_size
144150 self ._repeat_same_positions = repeat_same_positions
@@ -181,6 +187,7 @@ def _create_dataloader(self) -> StreamedDataloader:
181187 queue_size = self ._n_threads + 1 ,
182188 slice_size = self .batch_size ,
183189 window_size = self .window_size ,
190+ default_value = self ._default_value ,
184191 )
185192
186193 def __iter__ (
0 commit comments