diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml index 9705deab..efeff660 100644 --- a/.github/workflows/black.yml +++ b/.github/workflows/black.yml @@ -1,11 +1,13 @@ name: Lint -on: [pull_request] +on: + pull_request: + branches: master jobs: lint: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 - - uses: psf/black@20.8b1 + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + - uses: psf/black@stable diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml new file mode 100644 index 00000000..8c2b17ab --- /dev/null +++ b/.github/workflows/python-publish.yml @@ -0,0 +1,31 @@ + # This workflow will upload a Python Package using Twine when a release is created +# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries + +name: Upload Python Package + +on: + release: + types: [created] + +jobs: + deploy: + runs-on: ubuntu-latest + name: upload release to PyPI + permissions: + id-token: write # IMPORTANT: this permission is mandatory for trusted publishing + + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.x' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install setuptools wheel twine + - name: Build and publish + run: | + python setup.py sdist bdist_wheel + - name: Publish package distributions to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.github/workflows/run-pytest-dev.yml b/.github/workflows/run-pytest-dev.yml new file mode 100644 index 00000000..b6bd4b9d --- /dev/null +++ b/.github/workflows/run-pytest-dev.yml @@ -0,0 +1,41 @@ +name: Run pytests for dev + +on: + pull_request: + branches: [dev] + +jobs: + + pytest: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: ["3.12"] + os: [ubuntu-latest] + + steps: 
+ - uses: actions/checkout@v3 + + - name: Hack setup-python cache + + if: hashFiles('**/requirements.txt', '**/pyproject.toml') == '' + run: | + touch ./requirements.txt + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: 'pip' # caching can speed up the workflow by reusing the installed dependencies + + - name: Install uv + run: pip install uv + + - name: Install test dependencies + run: if [ -f requirements/requirements-test.txt ]; then uv pip install -r requirements/requirements-test.txt --system; fi + + - name: Install package + run: uv pip install .[ml] --system + + - name: Run pytest tests + run: pytest tests -x -vv --remote-data diff --git a/.github/workflows/run-pytest-release.yml b/.github/workflows/run-pytest-release.yml new file mode 100644 index 00000000..cbcbaeb7 --- /dev/null +++ b/.github/workflows/run-pytest-release.yml @@ -0,0 +1,35 @@ +name: Run pytests for release + +on: + pull_request: + branches: [master] + +jobs: + + pytest: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: ["3.9", "3.12"] + os: [ubuntu-latest] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install uv + run: pip install uv + + - name: Install test dependencies + run: if [ -f requirements/requirements-test.txt ]; then uv pip install -r requirements/requirements-test.txt --system; fi + + - name: Install package + run: uv pip install .[ml] --system + + - name: Run pytest tests + run: pytest tests -x -vv --remote-data + diff --git a/.github/workflows/run-pytest.yml b/.github/workflows/run-pytest.yml deleted file mode 100644 index 11cf1a05..00000000 --- a/.github/workflows/run-pytest.yml +++ /dev/null @@ -1,33 +0,0 @@ -name: Run pytests - -on: - pull_request: - branches: [master, dev] - -jobs: - pytest: - runs-on: ${{ 
matrix.os }} - strategy: - matrix: - python-version: [3.8, "3.10"] - os: [ubuntu-latest] - - steps: - - uses: actions/checkout@v2 - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - - name: Install dev dependancies - run: if [ -f requirements/requirements-dev.txt ]; then pip install -r requirements/requirements-dev.txt; fi - - - name: Install test dependancies - run: if [ -f requirements/requirements-test.txt ]; then pip install -r requirements/requirements-test.txt; fi - - - name: Install package - run: python -m pip install . - - - name: Run pytest tests - run: pytest tests -x -vv --remote-data diff --git a/.gitignore b/.gitignore index a018973c..0f68ae47 100644 --- a/.gitignore +++ b/.gitignore @@ -85,7 +85,7 @@ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: -# .python-version +.python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. @@ -159,11 +159,11 @@ cython_debug/ # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
#.idea/ -gitk/__pycache__ -gitk/eval/__pycache__ -gitk/region2vec/__pycache__ -gitk/tokenization/__pycache__ -gitk/tokenization/bedtools +geniml/__pycache__ +geniml/eval/__pycache__ +geniml/region2vec/__pycache__ +geniml/tokenization/__pycache__ +geniml/tokenization/bedtools # for testing test.* @@ -172,9 +172,30 @@ test.* # data tests/data/buenrostro2018.h5ad tests/data/buenrostro_metadata.tsv +tests/data/model-tests/* +bedshifted* +examples/sh_output* +examples/py_output* # integration test stuff tests/integration/buenrostro2018.model # examples -examples/scembed/pbmc/ \ No newline at end of file +examples/scembed/pbmc/ +examples/scembed/buenrostro +examples/scembed/atlas +examples/scembed/luecken2021 + +# vector db stuff +qdrant_storage/ + +.ruff_cache/ + + +# MacOS +.DS_Store + +local_cache + +lightning_logs +data/ \ No newline at end of file diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 00000000..0318a0f3 --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,15 @@ +version: 2 + +build: + os: ubuntu-22.04 + tools: + python: "3.11" + +mkdocs: + configuration: mkdocs.yml + fail_on_warning: false + +# Optionally declare the Python requirements required to build your docs +python: + install: + - requirements: requirements/requirements-doc.txt \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json index d99f2f30..e19736d3 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -2,5 +2,9 @@ "[python]": { "editor.defaultFormatter": "ms-python.black-formatter" }, - "python.formatting.provider": "none" + "python.testing.pytestArgs": [ + "tests" + ], + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true } \ No newline at end of file diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 00000000..26c0003c --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,9 @@ +Copyright 2023 Nathan Sheffield + +Redistribution and use in source and binary forms, with or without 
modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 00000000..7647470a --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,4 @@ +include README.md +include LICENSE.txt +include requirements/* +include pyproject.toml diff --git a/README.md b/README.md index 70c36218..0068c22b 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,35 @@ -# Genomic interval toolkit +# Genomic interval machine learning (geniml) + +Geniml is a Python package for building machine learning models of genomic interval data (BED files). It also includes ancillary functions to support other types of analyses of genomic interval data. + +Documentation is hosted at <https://docs.bedbase.org/geniml/>. + + +## Installation +### To install `geniml` use these commands. 
+ +Without specifying dependencies, the default dependencies will be installed, +which DO NOT include machine learning (ML) or heavy processing libraries. + + +From pypi: +``` +pip install geniml +``` +or install the latest version from the GitHub repository: +``` +pip install git+https://github.com/databio/geniml.git +``` + +### To install Machine learning dependencies use this command: + +From pypi: +``` +pip install geniml[ml] +``` + + +## Development + +Run tests (from `/tests`) with `pytest`. Please read the [contributor guide](https://docs.bedbase.org/geniml/contributing/) to contribute. -You can find documentation in the `docs` subfolder. diff --git a/docs/README.md b/docs/README.md index 158be068..d2cb098f 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,41 +1,8 @@ -# Genomic interval toolkit +# Geniml documentation -## Introduction +The documentation for `geniml` is now part of BEDbase. You can find -`gitk` is a suite of tools for apply machine learning approaches to genomic interval data. It is organized as a set of modules that provide related functions, such as building HMMs, assessing genomic interval universes, calculating likelihoods of consensus genomic interval sets, and computing single-cell clusters. - -## Install - -``` -pip install --user --upgrade . -``` - -## gitk modules - -- [gitk/hmm](gitk/hmm) - Building HMMs -- [gitk/assess](gitk/assess) - Assess universe fit -- [gitk/likelihood](gitk/likelihood) - Calculate likelihood of universe -- [gitk/scembed](gitk/scembed) - Compute single-cell clusters from a cell-feature matrix using Word2Vec - -## Using modules from Python - -This repo is divided into modules. Each module should be written in a way that it provides utility as a Python library. For example, you can call functions in the `hmm` module like this: - -``` -import gitk - -gitk.hmm.function() -``` - -## Command-line interfaces - -In addition to being importable from Python, *some* modules also provide a CLI. 
For these, developers provide a subcommand for CLI use. The root `gitk` package provides a generalized command-line interface with the command `gitk`. The modules that provide CLIs then correspond to CLI commands, *e.g* `gitk hmm` or `gitk likelihood`, with the corresponding code contained within a sub-folder named after the model: - -``` -gitk ... -``` - -This is implemented within each module folder with: - -- `gitk//cli.py` - defines the command-line interface and provides a subparser for this module's CLI command. +- the rendered [documentation for geniml](https://docs.bedbase.org/geniml/). +- the [repository with the documentation source](https://github.com/databio/bedbase). +If you have any questions, please open an issue on [this repository](https://github.com/databio/geniml/issues) or on the [bedbase](https://github.com/databio/bedbase/issues) repository. \ No newline at end of file diff --git a/docs/autodoc_build/gitk.md b/docs/autodoc_build/gitk.md deleted file mode 100644 index 547ee366..00000000 --- a/docs/autodoc_build/gitk.md +++ /dev/null @@ -1,32 +0,0 @@ - - - - - -# Package `gitk` Documentation - - diff --git a/docs/changelog.md b/docs/changelog.md deleted file mode 100644 index 3bb50031..00000000 --- a/docs/changelog.md +++ /dev/null @@ -1,7 +0,0 @@ -# Changelog - -This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format. - -## [0.0.x] -- 2023-XX-XX - -- Development diff --git a/docs/contributing.md b/docs/contributing.md deleted file mode 100644 index 502aa670..00000000 --- a/docs/contributing.md +++ /dev/null @@ -1,33 +0,0 @@ -# Contributor guide - -## Repository organization - -This repo is divided into modules. Each module is in a subfolder. To add functionality to gitk, you could add it to an existing module. If there's no existing module that fits, you could add your own module. 
- -## Adding a new module - -### Creating your module - -Each module should be written in a way that it provides utility as a Python library. It should contain at least these files: - -- `README.md` - describes how to use the code -- `.py`, and other `.py` files - functions that provide utility for this module. - -*All* the functions should be written to be useful via import, calling with `gitk..`. For example: - -``` -import gitk - -gitk.hmm.function() -``` - -### Adding your module to gitk - -1. Put your module in a subfolder -2. Make sure to include a `__init__.py` so it's importable. -3. Add it to list of packages in `setup.py` - -### Shared code - -Any variables, functions, or other code that is shared across modules should be placed in the parent module, which is held in the [gitk](gitk) folder. - diff --git a/docs/img/geniml_logo3.svg b/docs/img/geniml_logo3.svg new file mode 100644 index 00000000..6869ca58 --- /dev/null +++ b/docs/img/geniml_logo3.svg @@ -0,0 +1,476 @@ + + + + diff --git a/docs/img/gitk_logo.svg b/docs/img/gitk_logo.svg deleted file mode 100644 index 96bc88d8..00000000 --- a/docs/img/gitk_logo.svg +++ /dev/null @@ -1,66 +0,0 @@ - -gitk diff --git a/docs/likelihood/consensus-peaks.md b/docs/likelihood/consensus-peaks.md deleted file mode 100644 index f361c233..00000000 --- a/docs/likelihood/consensus-peaks.md +++ /dev/null @@ -1,40 +0,0 @@ -# How to create consensus peaks from a set of BED files - -## Data preprocessing - -1. install [uniwig](https://github.com/databio/uniwig/tree/smoothing), make sure to use branch smoothing -2. 
use [create_unsorted.sh](https://github.com/databio/uniwig/blob/smoothing/create_unsorted.sh) to make three bigWig from your files - -## Cut-off universe -Make cut-off universe from coverage using: -``` - gitk lh universe_hard --coverage_file coverage.bw \ - --fout universe.bed - -``` -Where: -- ```--coverage_file```, takes the path to bigWig file with cverage track -- ```--fout```, takes the path to output file - -## Maximum likelihood universe -Make likelihood model from coverage tracks using: - -``` -gitk lh build_model --model_folder model.tar \ - --file_no x \ - --coverage_folder coverage/ -``` -Where: -- ```--model_folder```, takes the name of tar archive that will contain the likelihood model -- ```--file_no```, number of files used in analysis -- ```--coverage_folder``` path to folder with coverage tracks -- ```--coverage_prefix``` prefix used in uniwig for making files - -Use likelihood model to make a maximum likelihood universe -``` -gitk lh universe_flexible --model_folder model.tar \ - --output_file universe.bed -``` -Where: -- ```--model_folder```, takes the name of tar archive that contains the likelihood model -- ```--output_file```, takes the path to output file \ No newline at end of file diff --git a/docs/support.md b/docs/support.md deleted file mode 100644 index 16e54381..00000000 --- a/docs/support.md +++ /dev/null @@ -1,3 +0,0 @@ -# Support - -Please raise any issues or questions using the [GitHub issue tracker](https://github.com/databio/gitk/issues). 
diff --git a/geniml/__init__.py b/geniml/__init__.py new file mode 100644 index 00000000..fbaa69d9 --- /dev/null +++ b/geniml/__init__.py @@ -0,0 +1,6 @@ +from logging import getLogger + +from ._version import __version__ +from .const import PKG_NAME + +_LOGGER = getLogger(PKG_NAME) diff --git a/geniml/_version.py b/geniml/_version.py new file mode 100644 index 00000000..777f190d --- /dev/null +++ b/geniml/_version.py @@ -0,0 +1 @@ +__version__ = "0.8.0" diff --git a/gitk/assess/__init__.py b/geniml/assess/__init__.py similarity index 100% rename from gitk/assess/__init__.py rename to geniml/assess/__init__.py diff --git a/geniml/assess/assess.py b/geniml/assess/assess.py new file mode 100644 index 00000000..e319211b --- /dev/null +++ b/geniml/assess/assess.py @@ -0,0 +1,338 @@ +import os +import warnings +from logging import getLogger + +import numpy as np +import pandas as pd + +from ..const import PKG_NAME +from .distance import run_distance +from .intersection import run_intersection +from .likelihood import hard_universe_likelihood, likelihood_flexible_universe +from .utils import check_if_uni_flexible + +_LOGGER = getLogger(PKG_NAME) + + +def run_all_assessment_methods( + raw_data_folder, + file_list, + universe, + no_workers, + folder_out, + pref, + save_each, + overlap=False, + distance_f_t_u=False, + distance_f_t_u_flex=False, + distance_u_t_f=False, + distance_u_t_f_flex=False, +): + """ + Assess universe fit to collection using overlap and distance metrics + :param str raw_data_folder: path to raw files from the collection + :param str file_list: path to file with list of files in the collection + :param str universe: path to universe that is being assessed + :param int no_workers: number of workers for multiprocessing + :param str folder_out: output folder + :param str pref: prefixed used for creating output files + :param bool save_each: if save output of distance metrics for each region + :param bool overlap: if calculate overlap metrics + :param bool 
distance_f_t_u: if calculate distance from file to universe metrics + :param bool distance_f_t_u_flex: if calculate flexible distance from file to universe metrics + :param bool distance_u_t_f: if calculate distance from universes to file metrics + :param bool distance_u_t_f_flex: if calculate flexible distance from universes to file metrics + """ + if not any( + [ + overlap, + distance_f_t_u, + distance_f_t_u_flex, + distance_u_t_f, + distance_u_t_f_flex, + ] + ): + raise AttributeError("Choose at least one assessment method") + if not any( + [ + distance_f_t_u, + distance_f_t_u_flex, + distance_u_t_f, + distance_u_t_f_flex, + ] + ): + warnings.warn("Unused argument: save_each") + asses_results = [] + if overlap: + r_overlap = run_intersection( + raw_data_folder, + file_list, + universe, + no_workers, + ) + r_overlap.columns = [ + "file", + "univers/file", + "file/universe", + "universe&file", + ] + asses_results.append(r_overlap) + _LOGGER.info("DONE: Overlap") + if distance_f_t_u: + r_distance = run_distance( + raw_data_folder, + file_list, + universe, + no_workers, + False, + folder_out, + pref + "_dist_file_to_universe", + save_each, + False, + ) + r_distance.columns = ["file", "median_dist_file_to_universe"] + asses_results.append(r_distance) + _LOGGER.info("DONE: Distance file to universe") + if distance_f_t_u_flex: + r_distance_flex = run_distance( + raw_data_folder, + file_list, + universe, + no_workers, + True, + folder_out, + pref + "_dist_file_to_universe_flex", + save_each, + False, + ) + r_distance_flex.columns = ["file", "median_dist_file_to_universe_flex"] + asses_results.append(r_distance_flex) + _LOGGER.info("DONE: Flexible distance file to universe") + + if distance_u_t_f: + r_distance_utf = run_distance( + raw_data_folder, + file_list, + universe, + no_workers, + False, + folder_out, + pref + "_dist_universe_to_file", + save_each, + True, + ) + r_distance_utf.columns = ["file", "median_dist_universe_to_file"] + 
asses_results.append(r_distance_utf) + _LOGGER.info("DONE: Distance universe to file") + if distance_u_t_f_flex: + r_distance_utf_flex = run_distance( + raw_data_folder, + file_list, + universe, + no_workers, + True, + folder_out, + pref + "median_dist_universe_to_file_flex", + save_each, + True, + ) + r_distance_utf_flex.columns = [ + "file", + "median_dist_universe_to_file_flex", + ] + asses_results.append(r_distance_utf_flex) + _LOGGER.info("DONE: Flexible distance universe to file") + + df = asses_results[0] + for i in asses_results[1:]: + df = pd.merge(df, i, on="file") + df.to_csv(os.path.join(folder_out, pref + "_data.csv"), index=False) + + +def get_rbs(f_t_u, u_t_f): + """ + Calculate RBS + """ + a = 101 / (f_t_u + 100) + b = 101 / (u_t_f + 100) + rbs = (10 * a + b) / 11 + return rbs + + +def get_mean_rbs(folder, file_list, universe, no_workers, flexible=False): + """ + Calculate average RBS of the collection + :param str folder: path to folder with the collection + :param str file_list: path to file with list of files in the collection + :param str universe: path to the universe + :param int no_workers: number of workers for multiprocessing + :param bool flexible: if to calculate flexible version of the metric + :return int: average RBS + """ + file_to_uni = run_distance( + folder, + file_list, + universe, + no_workers, + flexible=flexible, + uni_to_file=False, + ) + + uni_to_file = run_distance( + folder, + file_list, + universe, + no_workers, + flexible=flexible, + uni_to_file=True, + ) + rbs = get_rbs(file_to_uni[1], uni_to_file[1]) + return np.mean(rbs) + + +def get_rbs_from_assessment_file(file, cs_each_file=False, flexible=False): + """ + Calculate RBS form file with results of metrics per file + :param str file: path to file with assessment results + :param bool cs_each_file: if report RBS for each file, not average for the collection + :param bool flexible: if use flexible version of the metric + """ + df = pd.read_csv(file, index_col=(0)) + if 
flexible: + df["f_t_u"] = df["median_dist_file_to_universe_flex"] + df["u_t_f"] = df["median_dist_universe_to_file_flex"] + else: + df["f_t_u"] = df["median_dist_file_to_universe"] + df["u_t_f"] = df["median_dist_universe_to_file"] + df["RBS"] = get_rbs(df["f_t_u"], df["u_t_f"]) + if cs_each_file: + return df + else: + return df["RBS"].mean() + + +def get_f_10_score( + folder, + file_list, + universe, + no_workers, +): + """ + Get F10 score for a universes and collection of files + :param str folder: path to folder with the collection + :param str file_list: path to file with list of files in the collection + :param str universe: path to the universe + :param int no_workers: number of workers for multiprocessing + :return int: average F10 score + """ + res = run_intersection( + folder, + file_list, + universe, + no_workers, + ) + res = np.array(res) + res = res[:, 1:] + res = res.astype("float") + recall = res[:, 2] / (res[:, 2] + res[:, 1]) + precision = res[:, 2] / (res[:, 2] + res[:, 0]) + f_10 = (1 + 10**2) * (precision * recall) / ((10**2 * precision) + recall) + return np.mean(f_10) + + +def get_f_10_score_from_assessment_file(file, f10_each_file=False): + """ + Get F10 score from assessment output file + :param str file: path to file with assessment results + :param bool f10_each_file: if report F10 for each file, not average for the collection + """ + df = pd.read_csv(file, index_col=(0)) + r = df["universe&file"] / (df["universe&file"] + df["file/universe"]) + p = df["universe&file"] / (df["universe&file"] + df["univers/file"]) + df["F_10"] = (1 + 10**2) * (p * r) / ((10**2 * p) + r) + if f10_each_file: + return df["F_10"] + else: + return df["F_10"].mean() + + +def get_likelihood( + model_file, + universe, + cove_folder, + cove_prefix="all", + flexible=False, + save_peak_input=False, +): + """ + Calculate universe likelihood given collection + :param str model_file: path to file with likelihood model + :param str universe: path to the universe + :param 
str cove_folder: path to the coverage folder + :param str cove_prefix: prefixed used for generating coverage + :param bool flexible: if to calculate flexible likelihood + :param bool save_peak_input: if to save likelihood input of each region + :return: + """ + if flexible: + lh = likelihood_flexible_universe( + model_file, universe, cove_folder, cove_prefix, save_peak_input + ) + else: + if save_peak_input: + warnings.warn("Unused argument: save_peak_input") + lh = hard_universe_likelihood(model_file, universe, cove_folder, cove_prefix) + + return lh + + +def filter_universe( + universe, + universe_filtered, + min_size=0, + min_coverage=0, + filter_lh=False, + model_file=None, + cove_folder=None, + cove_prefix=None, + lh_cutoff=0, +): + """ + Filter universe by region size, coverage by collection, likelihood + :param str universe: path to input universe + :param str universe_filtered: path to output filtered universe + :param int min_size: minimum size of the region in the output universe + :param int min_coverage: minimum number coverage of universe region by collection + :param bool filter_lh: if use likelihood to filter universe + :param str model_file: path to collection likelihood model + :param str cove_folder: path to folder with coverage tracks + :param str cove_prefix: prefixed used for creating tracks + :param int lh_cutoff: minimum likelihood input + """ + if filter_lh: + check_if_uni_flexible(universe) + if not all([model_file, cove_folder, cove_prefix]): + miss_args = [] + if not model_file: + miss_args.append("model_file") + if not cove_folder: + miss_args.append("cove_folder") + if not cove_prefix: + miss_args.append("cove_prefix") + raise ValueError( + "Missing {} for peak likelihood calculations.".format(",".join(miss_args)) + ) + likelihood_flexible_universe(model_file, universe, cove_folder, cove_prefix, True) + universe = universe + "_peak_likelihood" + with open(universe) as uni: + with open(universe_filtered, "w+") as uni_flt: + for i in uni: 
+ j = i.split("\t") + j[1], j[2], j[4] = int(j[1]), int(j[2]), int(j[4]) + if j[2] - j[1] > min_size: + if j[4] > min_coverage: + if filter_lh: + if float(j[9].strip("\n")) > lh_cutoff: + uni_flt.write(i) + else: + uni_flt.write(i) diff --git a/geniml/assess/cli.py b/geniml/assess/cli.py new file mode 100644 index 00000000..2eb827f4 --- /dev/null +++ b/geniml/assess/cli.py @@ -0,0 +1,67 @@ +def build_subparser(parser): + """ + Builds argument parser. + + :return argparse.ArgumentParser: Argument parser + """ + parser.add_argument( + "--overlap", + help="if calculate base-level overlap score", + action="store_true", + ) + parser.add_argument( + "--distance", + help="if calculate distance from region in query to nearest region in the universe", + action="store_true", + ) + parser.add_argument( + "--distance-universe-to-file", + help="if calculate distance from region in the universe to nearest region query", + action="store_true", + ) + parser.add_argument( + "--distance-flexible", + help="if calculate distance from region in query to nearest region in the universe taking into account " + "universe flexibility ", + action="store_true", + ) + parser.add_argument( + "--distance-flexible-universe-to-file", + help="if calculate distance from region in the universe to nearest region in query taking into account " + "universe flexibility ", + action="store_true", + ) + parser.add_argument( + "--raw-data-folder", + help="folder with raw data", + type=str, + required=True, + ) + parser.add_argument( + "--file-list", + help="list of files that need to be assessed", + type=str, + required=True, + ) + parser.add_argument("--universe", help="universe file", type=str, required=True) + parser.add_argument( + "--no-workers", + help="number of core that should be used", + default=4, + type=int, + ) + parser.add_argument( + "--save-to-file", + help="if save statistics for each BED file to a file", + action="store_true", + ) + parser.add_argument("--folder-out", help="folder to which 
save the statistic", type=str) + parser.add_argument("--pref", help="statistic file prefix", type=str) + + parser.add_argument( + "--save-each", + help="if save distance for each peak in each file ", + action="store_true", + ) + + return parser diff --git a/geniml/assess/distance.py b/geniml/assess/distance.py new file mode 100644 index 00000000..d4ca3a0d --- /dev/null +++ b/geniml/assess/distance.py @@ -0,0 +1,321 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import os +import tempfile +from multiprocessing import Pool + +import numpy as np +import pandas as pd + +from ..utils import natural_chr_sort +from .utils import check_if_uni_flexible, check_if_uni_sorted, prep_data, process_db_line + + +def flexible_distance_between_two_regions(region, query): + """Calculate distance between region and flexible region from flexible universe + :param [int, int] region: region from flexible universe + :param int query: analyzed region + :return int: distance + """ + if region[0] <= query <= region[1]: + return 0 + else: + return min(abs(region[0] - query), abs(region[1] - query)) + + +def distance_between_two_regions(region, query): + """Calculate distance between region in database and region from the query + :param [int] region: region from hard universe + :param int query: analysed region + :return int: distance + """ + return abs(region[0] - query) + + +def distance_to_closest_region( + db, db_queue, i, current_chrom, unused_db, pos_index, flexible, uni_to_file +): + """ + Calculate distance from given peak to the closest region in database + :param file db: database file + :param list db_queue: queue of three last positions in database + :param i: analyzed position from the query + :param str current_chrom: current analyzed chromosome from query + :param list unused_db: list of positions from universe that were not compared to query + :param list pos_index: which indexes from universe region use to calculate distance + :param bool flexible: whether the universe 
if flexible + :param bool uni_to_file: whether calculate distance from universe to file + :return int: peak distance to universe + """ + if flexible: + if uni_to_file: + dist_to_db_que = [flexible_distance_between_two_regions(i, j[0]) for j in db_queue] + else: + dist_to_db_que = [flexible_distance_between_two_regions(j, i[0]) for j in db_queue] + else: + dist_to_db_que = [distance_between_two_regions(j, i[0]) for j in db_queue] + min_pos = np.argmin(dist_to_db_que) + while min_pos == 2: + d = db.readline().strip("\n") + if d == "": + return dist_to_db_que[min_pos] + pos, pos_chrom = process_db_line(d, pos_index) + if pos_chrom != current_chrom: + unused_db.append([pos, pos_chrom]) + return dist_to_db_que[min_pos] + db_queue[:-1] = db_queue[1:] + db_queue[-1] = pos + if flexible: + if uni_to_file: + dist_to_db_que = [flexible_distance_between_two_regions(i, j[0]) for j in db_queue] + else: + dist_to_db_que = [flexible_distance_between_two_regions(j, i[0]) for j in db_queue] + else: + dist_to_db_que = [distance_between_two_regions(j, i[0]) for j in db_queue] + min_pos = np.argmin(dist_to_db_que) + return dist_to_db_que[min_pos] + + +def read_in_new_universe_regions( + db, + q_chrom, + current_chrom, + unused_db, + db_queue, + waiting, + pos_index, +): + """ + Read in new universe regions closest to the peak + :param file db: universe file + :param str q_chrom: new peak's chromosome + :param str current_chrom: chromosome that was analyzed so far + :param list unused_db: list of positions from universe that were not compared to query + :param list db_queue: que of three last positions in universe + :param bool waiting: whether iterating through file, without calculating + distance, if present chromosome not present in universe + :param list pos_index: which indexes from universe region use to calculate distance + :return bool, str: if iterating through chromosome not present in universe; current chromosome in query + """ + if q_chrom != current_chrom: + # change 
chromosome + db_queue.clear() + # clean up the que + if len(unused_db) == 0: + d = db.readline().strip("\n") + if d == "": + waiting = True + return waiting, current_chrom + d_start, d_start_chrom = process_db_line(d, pos_index) + while current_chrom == d_start_chrom: + # finish reading old chromosome in DB file + d = db.readline().strip("\n") + if d == "": + break + d_start, d_start_chrom = process_db_line(d, pos_index) + unused_db.append([d_start, d_start_chrom]) + current_chrom = q_chrom + if current_chrom == unused_db[-1][1]: + waiting = False + db_queue.append(unused_db[-1][0]) + unused_db.clear() + elif natural_chr_sort(unused_db[-1][1], current_chrom) == 1: + # chrom present in file not in DB + waiting = True + return waiting, current_chrom + while len(db_queue) < 3: + d = db.readline().strip("\n") + if d == "": + break + d_start, d_start_chrom = process_db_line(d, pos_index) + if d_start_chrom == current_chrom: + db_queue.append(d_start) + elif natural_chr_sort(d_start_chrom, current_chrom) == 1: + unused_db.append([d_start, d_start_chrom]) + waiting = True + return waiting, current_chrom + if len(db_queue) == 0: + waiting = True + return waiting, current_chrom + + +def calc_distance_between_two_files( + universe, + q_folder, + q_file, + flexible, + save_each, + folder_out, + pref, + uni_to_file=False, +): + """ + Maine function for calculating distance between regions in file query to regions in database + :param str universe: path to universe + :param str q_folder: path to folder containing query files + :param str q_file: query file + :param boolean flexible: whether the universe if flexible + :param bool save_each: whether to save calculated distances for each file + :param str folder_out: output folder + :param str pref: prefix used as the name of the folder + containing calculated distance for each file + :param uni_to_file: whether to calculate distance from universe to file + :return str, int, int: file name; median od distance of starts to + starts 
in universe; median od distance of ends to ends in universe + """ + query = tempfile.NamedTemporaryFile() + prep_data(q_folder, q_file, query) + if uni_to_file: + db_start_name = query.name + db_end_name = query.name + q = open(universe) + else: + db_start_name = universe + db_end_name = universe + q = query + with open(db_start_name) as db_start, open(db_end_name) as db_end: + db_queue_start = [] + current_chrom_start = "chr0" + dist_start = [] + unused_db_start = [] + waiting_start = False + db_queue_end = [] + current_chrom_end = "chr0" + dist_end = [] + unused_db_end = [] + waiting_end = False + start_index_q, start_index_db = [1], [1] + end_index_q, end_index_db = [2], [2] + if flexible and uni_to_file: + start_index_q = [1, 6] + end_index_q = [7, 2] + if flexible and not uni_to_file: + start_index_db = [1, 6] + end_index_db = [7, 2] + for i in q: + if not uni_to_file: + i = i.decode("utf-8") + i = i.split("\t") + start = [int(i[ind]) for ind in start_index_q] + end = [int(i[ind]) for ind in end_index_q] + q_chrom = i[0] + result_start = read_in_new_universe_regions( + db_start, + q_chrom, + current_chrom_start, + unused_db_start, + db_queue_start, + waiting_start, + start_index_db, + ) + (waiting_start, current_chrom_start) = result_start + if not waiting_start: + result = distance_to_closest_region( + db_start, + db_queue_start, + start, + current_chrom_start, + unused_db_start, + start_index_db, + flexible, + uni_to_file, + ) + dist_start.append(result) + result_end = read_in_new_universe_regions( + db_end, + q_chrom, + current_chrom_end, + unused_db_end, + db_queue_end, + waiting_end, + end_index_db, + ) + (waiting_end, current_chrom_end) = result_end + if not waiting_end: + res = distance_to_closest_region( + db_end, + db_queue_end, + end, + current_chrom_end, + unused_db_end, + end_index_db, + flexible, + uni_to_file, + ) + dist_end.append(res) + query.close() + if save_each: + with open(os.path.join(folder_out, pref, q_file), "w") as f: + for i, j in 
def run_distance(
    folder,
    file_list,
    universe,
    no_workers,
    flexible=False,
    folder_out=None,
    pref=None,
    save_each=False,
    uni_to_file=False,
):
    """
    For group of files calculate distance to the nearest region in universe
    :param str folder: path to folder containing query files
    :param str file_list: path to file containing list of query files (one per line)
    :param str universe: path to universe file
    :param int no_workers: number of parallel processes
    :param bool flexible: whether the universe is flexible
    :param str folder_out: output folder
    :param str pref: prefix used for saving
    :param bool save_each: whether to save calculated distances for each file
    :param uni_to_file: whether to calculate distance from universe to file
    :return pd.DataFrame: one row per query file, built from the tuples returned
        by calc_distance_between_two_files
    """
    check_if_uni_sorted(universe)
    if flexible:
        check_if_uni_flexible(universe)
    with open(file_list) as f:
        # splitlines + filtering keeps the last entry even when the list has no
        # trailing newline, and ignores blank lines
        files = [name for name in f.read().splitlines() if name]
    if save_each:
        os.makedirs(os.path.join(folder_out, pref), exist_ok=True)
    args = [
        (
            universe,
            folder,
            name,
            flexible,
            save_each,
            folder_out,
            pref,
            uni_to_file,
        )
        for name in files
    ]
    if no_workers <= 1:
        res = [calc_distance_between_two_files(*a) for a in args]
    else:
        with Pool(no_workers) as p:
            res = p.starmap(calc_distance_between_two_files, args)
    return pd.DataFrame(res)
def relationship_helper(region_a, region_b, only_in, overlap):
    """Compare two regions where region_a starts no later than region_b.

    :param [int, int] region_a: region that starts first
    :param [int, int] region_b: region that starts second
    :param int only_in: number of positions only in a so far
    :param int overlap: number of overlapping positions so far
    :return: (only_in, inside_a, inside_b, overlap, start_a, start_b)
    """
    a_start, a_end = region_a[0], region_a[1]
    b_start, b_end = region_b[0], region_b[1]

    if b_start > a_end:
        # b begins after a ends: all of a is exclusive to a
        return only_in + (a_end - a_start), False, True, overlap, a_end, b_start

    # b begins inside a: the stretch before b is exclusive to a
    only_in += b_start - a_start
    if b_end <= a_end:
        # b ends inside a as well, so a still has a part left to analyse
        return only_in, True, False, overlap + (b_end - b_start), b_end, b_end
    # b runs past the end of a, so b still has a part left to analyse
    return only_in, False, True, overlap + (a_end - b_start), a_end, a_end
+ res = relationship_helper(region_d, region_q, only_in_d, overlap) (only_in_d, inside_d, inside_q, overlap, start_d, start_q) = res if region_d[0] > region_q[0]: - res = relationship_helper( - region_q, region_d, only_in_q, overlap, start_q, start_d - ) + res = relationship_helper(region_q, region_d, only_in_q, overlap) (only_in_q, inside_q, inside_d, overlap, start_q, start_d) = res return only_in_d, only_in_q, inside_d, inside_q, overlap, start_d, start_q -def read_in_new_line(region, start, chrom, inside, waiting, lines, cchrom, not_e): +def read_in_new_line(region, start, chrom, inside, waiting, lines, c_chrom, not_e): + """ + Read in a new line from query or universe file + """ if not inside: if not waiting: line = lines.readline() @@ -112,7 +105,7 @@ def read_in_new_line(region, start, chrom, inside, waiting, lines, cchrom, not_e line = line.strip("\n") if line != "": region, start, chrom = process_line(line) - if chrom != cchrom: + if chrom != c_chrom: waiting = True else: not_e = False @@ -120,7 +113,7 @@ def read_in_new_line(region, start, chrom, inside, waiting, lines, cchrom, not_e return region, start, chrom, waiting, not_e -def calc_stats(db, folder, query): +def calc_diff_intersection(db, folder, query): """ Difference and overlap of two files on base pair level :param str db: path to universe file @@ -131,7 +124,10 @@ def calc_stats(db, folder, query): only_in_d, only_in_q, overlap = 0, 0, 0 inside_d, inside_q = False, False # inside a region not_end_d, not_end_q = True, True # if there are regions to process - waiting_d, waiting_q = False, False # if waiting for the other file to finish chrom + waiting_d, waiting_q = ( + False, + False, + ) # if waiting for the other file to finish chrom lines_q = tempfile.NamedTemporaryFile() prep_data(folder, query, lines_q) if os.stat(lines_q.name).st_size == 0: @@ -148,7 +144,7 @@ def calc_stats(db, folder, query): else: c_chrom, waiting_d, waiting_q = chrom_cmp(chrom_d, chrom_q) while not_end_d or not_end_q: - 
def run_intersection(
    folder,
    file_list,
    universe,
    no_workers,
):
    """
    Calculate the base pair intersection of universe and group of files
    :param str folder: path to folder containing query files
    :param str file_list: path to file containing list of query files (one per line)
    :param str universe: path to universe file
    :param int no_workers: number of parallel processes
    :return pd.DataFrame: one row per query file, built from the values returned
        by calc_diff_intersection
    """
    check_if_uni_sorted(universe)
    with open(file_list) as f:
        # splitlines + filtering keeps the last entry even when the list has no
        # trailing newline, and ignores blank lines
        files = [name for name in f.read().splitlines() if name]
    if no_workers <= 1:
        res = [calc_diff_intersection(universe, folder, name) for name in files]
    else:
        with Pool(no_workers) as p:
            args = [(universe, folder, name) for name in files]
            res = p.starmap(calc_diff_intersection, args)
    return pd.DataFrame(res)
:param coverage_prefix: prefix used in uniwig for creating coverage + :param coverage_folder: path to a folder with genome coverage by tracks + :param str name: suffix of model file name, which contains information + about model type + :param int s_index: from which position in universe line take assess region + start position + :param int e_index: from which position in universe line take assess region + end position + :return float: likelihood of universe for given model + """ + current_chrom = "" + missing_chrom = "" + empty_start = 0 + res = 0 + e = 0 + done_chrom = [] + prob_array, cove_array = None, None + with open(universe) as uni: + for i in uni: + e += 1 + i = i.split("\t") + i[1], i[2] = int(i[1]), int(i[2]) + if i[0] == missing_chrom: + pass + else: + if i[0] != current_chrom: + if i[0] in chroms: + done_chrom.append(i[0]) + model_lh.clear_chrom(current_chrom) + if e != 1: + res += np.sum(prob_array[empty_start:, 0]) + + current_chrom = i[0] + model_lh.read_chrom_track(current_chrom, name) + prob_model = model_lh[current_chrom] + cove_array = read_chromosome_from_bw( + os.path.join(coverage_folder, f"{coverage_prefix}_{name}.bw"), + current_chrom, + ) + prob_array = LhModel(prob_model[name], cove_array) + empty_start = 0 + else: + print(f"Chromosome {i[0]} missing from model") + missing_chrom = i[0] + start = i[s_index] + if e_index is None: + end = i[s_index] + 1 + else: + end = i[e_index] + r1 = np.sum(prob_array[start:end, 1]) + r2 = np.sum(prob_array[empty_start:start, 0]) + res += r1 + res += r2 + empty_start = end + res += np.sum(prob_array[empty_start:, 0]) + z = set(chroms).difference(set(done_chrom)) + z = list(z) + for i in z: + model_lh.read_chrom_track(i, name) + prob_model = model_lh[i] + cove_array = read_chromosome_from_bw( + os.path.join(coverage_folder, f"{coverage_prefix}_{name}.bw"), + i, + ) + prob_array = LhModel(prob_model[name], cove_array) + res += np.sum(prob_array[:, 0]) + return res + + +def hard_universe_likelihood(model, 
def weigh_livelihood(start, end, model_process, model_cove, model_out, reverse):
    """
    Calculate weighted likelihood of flexible part of the region
    :param int start: start of the region
    :param int end: end of the region
    :param array model_process: model for analyzed type of flexible region
    :param array model_cove: model for coverage
    :param array model_out: model for flexible region that is not being analyzed
    :param bool reverse: if model_process corresponds to end we have to reverse the weights
    :return float: likelihood of flexible part of the region
    """
    width = end - start
    if width == 0:
        # Zero-width flexible part: every slice below would be empty, so the
        # contribution is 0. Returning early avoids dividing by zero.
        return 0.0
    e_w = 1 / width  # weights for processed model
    c_w = np.linspace(start=e_w, stop=1, num=width)  # weights for core in processed region
    if reverse:
        c_w = c_w[::-1]
    res = e_w * np.sum(model_process[start:end, 1])
    res += np.sum(c_w * model_cove[start:end, 1])
    res += (1 - e_w) * np.sum(model_process[start:end, 0])
    res += np.sum((1 - c_w) * model_cove[start:end, 0])
    res += np.sum(model_out[start:end, 0])
    return res
the model + :param str model_file: path to file with lh model + :param str universe: path to universe + :param cove_folder: path to a folder with genome coverage by tracks + :param cove_prefix: prefix used in uniwig for creating coverage + :param bool save_peak_input: whether to save universe with each peak lh + :return float: lh of the flexible universe + """ + current_chrom = "" + missing_chrom = "" + empty_start = 0 + res = 0 + check_if_uni_sorted(universe) + check_if_uni_flexible(universe) + model_lh = ModelLH(model_file) + chroms = model_lh.chromosomes_list + done_chroms = [] + output = [] + e = 0 # number of processed chromosomes + with open(universe) as uni: + for line in uni: + i = line.split("\t") + peak_start_s, peak_end_e = int(i[1]), int(i[2]) + peak_start_e, peak_end_s = int(i[6]), int(i[7]) + if i[0] == missing_chrom: + pass + else: + if i[0] != current_chrom: + if i[0] in chroms: + model_lh.clear_chrom(current_chrom) + if e != 0: + # if we read any chromosomes add to result background + # likelihood of part of the genome after the last region + res += background_likelihood( + empty_start, + chr_size, + prob_start, + prob_core, + prob_end, + ) + current_chrom = i[0] + done_chroms.append(current_chrom) + e += 1 + model_lh.read_chrom(current_chrom) + models_current = model_lh[current_chrom] + cove_current = read_coverage(cove_folder, cove_prefix, current_chrom) + chr_size = len(cove_current["start"]) + prob_start = LhModel(models_current["start"], cove_current["start"]) + prob_core = LhModel(models_current["core"], cove_current["core"]) + prob_end = LhModel(models_current["end"], cove_current["end"]) + + else: + print(f"Chromosome {i[0]} missing from model") + missing_chrom = i[0] + res += background_likelihood( + empty_start, + peak_start_s, + prob_start, + prob_core, + prob_end, + ) + peak_likelihood = flexible_peak_likelihood( + peak_start_s, + peak_start_e, + peak_end_s, + peak_end_e, + prob_start, + prob_core, + prob_end, + ) + res += 
def check_if_uni_flexible(universe):
    """Raise if the first line of the universe file lacks the flexible-universe columns.

    :param str universe: path to universe file
    :raises Exception: when the first line has fewer than 8 tab-separated fields
    """
    with open(universe) as u:
        fields = u.readline().split("\t")
    # Flexible-universe consumers read 0-based columns 6 and 7 (the inner
    # bounds of the region), so at least 8 columns must be present.
    if len(fields) < 8:
        raise Exception("Universe is not flexible")
class AtacformerConfig(PretrainedConfig):
    """
    This is the configuration class to store the configuration of an `AtacformerModel`.
    It inherits from [`PretrainedConfig`] and adds Atacformer-specific settings.

    Args:
        use_pos_embeddings (`bool`, *optional*, defaults to `True`):
            whether to use positional embeddings.
        vocab_size (`int`, *optional*, defaults to 890711):
            vocabulary size tailored for genomic regions.
        max_position_embeddings (`int`, *optional*, defaults to 8192):
            the maximum sequence length that this model might ever be used with.
        hidden_size (`int`, *optional*, defaults to 384):
            the size of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 1536):
            the size of the "intermediate" (often named feed-forward) layer in the transformer.
        num_hidden_layers (`int`, *optional*, defaults to 6):
            the number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            the number of attention heads in each attention layer.
        pad_token_id (`int`, *optional*, defaults to 890705):
            the id of the token used for padding.
        eos_token_id (`int`, *optional*, defaults to 890708):
            the id of the token used for the end of a sequence.
        bos_token_id (`int`, *optional*, defaults to 890709):
            the id of the token used for the beginning of a sequence.
        cls_token_id (`int`, *optional*, defaults to 890707):
            the id of the token used for classification tasks.
        sep_token_id (`int`, *optional*, defaults to 890710):
            the id of the token used to separate segments in a sequence.
        sparse_prediction (`bool`, *optional*, defaults to `True`):
            whether to use sparse prediction for the output layer.
        norm_eps (`float`, *optional*, defaults to 1e-5):
            the epsilon value used for layer normalization.
        embedding_dropout (`float`, *optional*, defaults to 0.0):
            the dropout probability for the embedding layer.
        initializer_range (`float`, *optional*, defaults to 0.02):
            the standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_cutoff_factor (`float`, *optional*, defaults to 2.0):
            the cutoff factor for the truncated normal initializer.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            whether to tie the word embeddings with the output layer.
        num_batches (`int`, *optional*, defaults to `None`):
            the number of batches when doing batch correction training.
        lambda_adv (`float`, *optional*, defaults to 1.0):
            the weight for the adversarial loss.
        grl_alpha (`float`, *optional*, defaults to 1.0):
            the alpha value for the gradient reversal layer.
        bc_unfreeze_last_n_layers (`int`, *optional*, defaults to 2):
            the number of last layers to unfreeze during training for batch correction.
        **kwargs: (additional keyword arguments, *optional*):
            additional configuration parameters.
    """

    model_type = "atacformer"

    def __init__(
        self,
        use_pos_embeddings: bool = True,
        vocab_size: int = 890711,
        max_position_embeddings: int = 8192,
        hidden_size: int = 384,
        intermediate_size: int = 1536,
        num_hidden_layers: int = 6,
        num_attention_heads: int = 8,
        pad_token_id: int = 890705,
        eos_token_id: int = 890708,
        bos_token_id: int = 890709,
        cls_token_id: int = 890707,
        sep_token_id: int = 890710,
        sparse_prediction: bool = True,
        norm_eps: float = 1e-5,
        embedding_dropout: float = 0.0,
        initializer_range: float = 0.02,
        initializer_cutoff_factor: float = 2.0,
        tie_word_embeddings: bool = True,
        num_batches: int = None,
        lambda_adv: float = 1.0,
        grl_alpha: float = 1.0,
        bc_unfreeze_last_n_layers: int = 2,
        **kwargs,
    ):
        super().__init__(
            vocab_size=vocab_size,
            max_position_embeddings=max_position_embeddings,
            hidden_size=hidden_size,
            intermediate_size=intermediate_size,
            num_hidden_layers=num_hidden_layers,
            num_attention_heads=num_attention_heads,
            pad_token_id=pad_token_id,
            eos_token_id=eos_token_id,
            bos_token_id=bos_token_id,
            cls_token_id=cls_token_id,
            sep_token_id=sep_token_id,
            sparse_prediction=sparse_prediction,
            norm_eps=norm_eps,
            embedding_dropout=embedding_dropout,
            initializer_range=initializer_range,
            initializer_cutoff_factor=initializer_cutoff_factor,
            tie_word_embeddings=tie_word_embeddings,
            num_batches=num_batches,
            lambda_adv=lambda_adv,
            grl_alpha=grl_alpha,
            bc_unfreeze_last_n_layers=bc_unfreeze_last_n_layers,
            **kwargs,
        )
        # Set explicitly after the base initializer has run.
        self.use_pos_embeddings = use_pos_embeddings
        self.num_batches = num_batches
        self.lambda_adv = lambda_adv
        self.grl_alpha = grl_alpha
        self.bc_unfreeze_last_n_layers = bc_unfreeze_last_n_layers


__all__ = ["AtacformerConfig"]
import torch
from torch.autograd import Function


class GradientReversal(Function):
    """
    Gradient reversal layer.

    The forward pass is the identity. The backward pass multiplies the incoming
    gradient by ``-alpha``, so everything upstream of this op is trained to
    *maximise* the loss computed downstream of it.
    """

    @staticmethod
    def forward(ctx, x, alpha):
        """
        Return ``x`` unchanged and remember ``alpha`` for the backward pass.

        Args:
            ctx: autograd context object.
            x (torch.Tensor): input tensor.
            alpha (float | torch.Tensor): scale applied to the reversed gradient.
        """
        # save_for_backward only accepts tensors, so a plain Python number has
        # to live directly on ctx. `x` is deliberately not saved: backward never
        # reads it, and saving it would keep the activation alive for nothing.
        if isinstance(alpha, torch.Tensor):
            ctx.save_for_backward(alpha)
            ctx.alpha = None
        else:
            ctx.alpha = alpha
        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Return ``-alpha * grad_output`` for ``x`` and ``None`` for ``alpha``.
        """
        if not ctx.needs_input_grad[0]:
            return None, None
        saved = ctx.saved_tensors
        alpha = saved[0] if saved else ctx.alpha
        return -alpha * grad_output, None


revgrad = GradientReversal.apply
00000000..39f1196f --- /dev/null +++ b/geniml/atacformer/modeling_atacformer.py @@ -0,0 +1,761 @@ +from typing import Optional, Union, Tuple, List + +import torch +import torch.nn.functional as F +import torch.nn as nn +import numpy as np +from tqdm import tqdm + +from transformers import PreTrainedModel +from transformers.utils import logging +from transformers.modeling_outputs import ( + MaskedLMOutput, + TokenClassifierOutput, + BaseModelOutput, + SequenceClassifierOutput, +) + +from .configuration_atacformer import AtacformerConfig +from .modeling_utils import freeze_except_last_n +from .functional import revgrad + +logger = logging.get_logger(__name__) + +# try to import cut cross entropy +try: + from cut_cross_entropy.linear_cross_entropy import LinearCrossEntropy + + _CCE_AVAILABLE = True +except ImportError: + logger.warning( + "Cut cross entropy not found, please install it with `pip install cut-cross-entropy`." + ) + _CCE_AVAILABLE = False + + +class AtacformerPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
class AtacformerEmbeddings(nn.Module):
    """
    Embedding layer with a learnable token table and an optional learnable
    position table.

    Position embeddings are only looked up and added when
    ``config.use_pos_embeddings`` is true. The position table is always
    created so the module's parameters do not depend on that flag.
    """

    def __init__(self, config: "AtacformerConfig"):
        """
        Args:
            config (AtacformerConfig): Configuration object for the model. Reads
                `vocab_size`, `hidden_size`, `pad_token_id`,
                `max_position_embeddings` and `use_pos_embeddings`.
        """
        super().__init__()
        self.config = config
        self.token_embeddings = nn.Embedding(
            config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id
        )
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)

    def forward(self, input_ids: torch.LongTensor) -> torch.Tensor:
        """
        Embed a batch of token ids.

        Args:
            input_ids (`torch.LongTensor`):
                Token ids of shape (batch_size, sequence_length).
        Returns:
            `torch.Tensor`: embeddings of shape (batch_size, sequence_length, hidden_size).
        """
        _, seq_len = input_ids.size()

        x = self.token_embeddings(input_ids)
        if not self.config.use_pos_embeddings:
            # skip the lookup entirely: it is wasted work, and it would raise
            # for seq_len > max_position_embeddings even though positions
            # are not used in this mode
            return x

        # positions are identical for every row, so look them up once as
        # [seq_len, hidden] and let broadcasting add them to each sample
        pos_ids = torch.arange(seq_len, device=input_ids.device)
        return x + self.position_embeddings(pos_ids).unsqueeze(0)
class EncodeTokenizedCellsMixin:
    """
    Provides a default `encode_tokenized_cells` by delegating to `self.atacformer(...)`.

    Assumes any subclass has an attribute `atacformer` that can be called as
    `self.atacformer(input_ids=..., attention_mask=...)` and returns a tensor of
    shape (batch, seq_len, hidden), plus a `config` carrying `pad_token_id` and
    `max_position_embeddings`.
    Token pooling (mean) can be disabled to return per-token embeddings.
    """

    @staticmethod
    def _subsample_to_context(seq, max_ctx: int):
        """
        Return `seq` unchanged if it fits in `max_ctx`, otherwise a random
        subset of `max_ctx` tokens in their original relative order.
        """
        if len(seq) <= max_ctx:
            return seq
        keep = np.random.choice(len(seq), size=max_ctx, replace=False)
        # sort the sampled positions so token order is preserved, matching
        # the truncation done by the training data collators
        keep.sort()
        return np.asarray(seq)[keep]

    def encode_tokenized_cells(
        self,
        input_ids: List[List[int]],
        batch_size: int = 16,
        pool_tokens: bool = True,
    ) -> torch.Tensor:
        """
        Loops internally over input_ids to produce a [N, D] matrix (if pooled) or [N, L, D] tensor (if not).

        Args:
            input_ids (List[List[int]]):
                One sequence of token ids per cell. Sequences longer than
                `config.max_position_embeddings` are randomly subsampled
                (order-preserving) down to that length.
            batch_size (int, *optional*, defaults to 16):
                The batch size to use for encoding.
            pool_tokens (bool, *optional*, defaults to True):
                Whether to mean-pool the token embeddings (True) or return per-token embeddings (False).
                When False, L is the longest padded length over all batches and
                shorter batches are right-padded with zeros.
        Returns:
            `torch.Tensor`: the stacked embeddings, on the model's device.
        Raises:
            AttributeError: if `atacformer`, `config.pad_token_id` or
                `config.max_position_embeddings` is missing.
        """
        if not hasattr(self, "atacformer"):
            raise AttributeError(
                "This class must have an 'atacformer' attribute with a forward method."
            )
        if not hasattr(self.config, "pad_token_id") or not hasattr(
            self.config, "max_position_embeddings"
        ):
            raise AttributeError(
                "This class must have 'pad_token_id' and 'max_position_embeddings' in its config."
            )

        pad_id = self.config.pad_token_id
        max_ctx = self.config.max_position_embeddings

        device = next(self.parameters()).device
        all_embs = []

        with torch.no_grad():
            for start in tqdm(range(0, len(input_ids), batch_size), desc="Encoding batches"):
                torch.cuda.empty_cache()
                batch_seqs = input_ids[start : start + batch_size]
                toks = [
                    torch.tensor(
                        self._subsample_to_context(s, max_ctx),
                        dtype=torch.long,
                        device=device,
                    )
                    for s in batch_seqs
                ]
                padded = nn.utils.rnn.pad_sequence(
                    toks, batch_first=True, padding_value=pad_id
                ).to(device)
                mask = padded != pad_id
                last_h = self.atacformer(input_ids=padded, attention_mask=mask)
                if pool_tokens:
                    # masked mean over the sequence dimension
                    masked = last_h * mask.unsqueeze(-1)
                    summed = masked.sum(dim=1)
                    lengths = mask.sum(dim=1).unsqueeze(-1)
                    batch_emb = summed / lengths.clamp(min=1)
                else:
                    batch_emb = last_h
                all_embs.append(batch_emb)

        if not pool_tokens and len(all_embs) > 1:
            # every batch was padded to its own longest sequence; bring them all
            # to one common length so they can be concatenated along dim 0
            longest = max(e.size(1) for e in all_embs)
            all_embs = [
                nn.functional.pad(e, (0, 0, 0, longest - e.size(1))) for e in all_embs
            ]
        return torch.cat(all_embs, dim=0)
+ """ + + def __init__(self, config: AtacformerConfig): + super().__init__(config) + self.atacformer = AtacformerModel(config) + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # use cut cross entropy if available + if _CCE_AVAILABLE: + self.loss_fct = LinearCrossEntropy() + else: + self.loss_fct = nn.CrossEntropyLoss( + ignore_index=-100, + reduction="mean", + ) + + self.post_init() + + def get_input_embeddings(self): + return self.atacformer.get_input_embeddings() + + def get_output_embeddings(self): + return self.lm_head + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.LongTensor] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]: + """ + Forward pass of the model. + + Args: + input_ids (`torch.LongTensor`): + Input tensor of shape (batch_size, sequence_length). + attention_mask (`torch.Tensor`, *optional*): + Mask to avoid performing attention on padding token indices. + labels (`torch.LongTensor`, *optional*): + Labels for masked language modeling. + return_dict (`bool`, *optional*): + Whether to return the outputs as a dict or a tuple. + Returns: + `MaskedLMOutput`: The output of the model. 
+ """ + if input_ids is None: + raise ValueError("You have to specify input_ids") + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # ensure attention mask is bool + if attention_mask is None: + attention_mask = torch.ones(input_ids.shape, dtype=torch.bool, device=input_ids.device) + else: + attention_mask = attention_mask.bool() + + # get embeddings + # shape (batch_size, sequence_length, hidden_size) + outputs = self.atacformer( + input_ids, attention_mask=attention_mask, return_dict=return_dict + ) + + # compute loss if labels are provided + loss = None + if labels is not None: + required_dtype = self.get_output_embeddings().weight.dtype # should be torch.bfloat16 + if outputs.dtype != required_dtype: + # logger.warning_once(f"casting hidden states from {outputs.dtype} to {required_dtype} before cce loss.") # Optional logging + outputs = outputs.to(required_dtype) + assert ( + _CCE_AVAILABLE + ), "Cut cross entropy is not available. Please install it with `pip install cut-cross-entropy`." + + loss = self.loss_fct( + e=outputs, + c=self.get_output_embeddings().weight, + targets=labels, + bias=self.get_output_embeddings().bias, + ) + + if not return_dict: + if loss is not None: + return (loss, None) + (outputs,) + + return MaskedLMOutput(loss=loss, logits=None, hidden_states=outputs, attentions=None) + + +class AtacformerDiscriminatorHead(nn.Module): + """ + 2-layer token-wise classifier, copied from Electra-Small. + Hidden size is usually the same as the backbone’s, but you + can pass a smaller `disc_hidden_size` in the config if you like. 
+ """ + + def __init__(self, config: AtacformerConfig): + super().__init__() + hidden_sz = getattr(config, "discriminator_hidden_size", config.hidden_size) + self.dense = nn.Linear(config.hidden_size, hidden_sz) + self.act = nn.GELU() + self.norm = nn.LayerNorm(hidden_sz, eps=config.norm_eps) + self.classifier = nn.Linear(hidden_sz, 1) + + def forward(self, sequence_output: torch.Tensor) -> torch.Tensor: + x = self.dense(sequence_output) + x = self.act(x) + x = self.norm(x) + logits = self.classifier(x).squeeze(-1) # (B, L) + return logits + + +class AtacformerForReplacedTokenDetection(EncodeTokenizedCellsMixin, AtacformerPreTrainedModel): + """ + Atacformer model for replaced token detection. This model uses the ELECTRA + framework to train a discriminator (this model) to detect replaced tokens. + + https://arxiv.org/abs/2003.10555 + """ + + def __init__(self, config: AtacformerConfig): + super().__init__(config) + self.atacformer = AtacformerModel(config) + self.discriminator = AtacformerDiscriminatorHead(config) + + self.post_init() + + def get_input_embeddings(self): + return self.atacformer.get_input_embeddings() + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.LongTensor] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]: + """ + Forward pass of the model. + + Args: + input_ids (`torch.LongTensor`): + Input tensor of shape (batch_size, sequence_length). + attention_mask (`torch.Tensor`, *optional*): + Mask to avoid performing attention on padding token indices. + labels (`torch.LongTensor`, *optional*): + Labels for masked language modeling. + return_dict (`bool`, *optional*): + Whether to return the outputs as a dict or a tuple. + Returns: + `TokenClassifierOutput`: The output of the model. 
class AtacformerForCellClustering(EncodeTokenizedCellsMixin, AtacformerPreTrainedModel):
    """
    Atacformer model for cell clustering. It follows a similar learning framework
    to SentenceBERT (SBERT), where the model is trained to minimize the distance
    between positive pairs and maximize the distance between negative pairs.

    Each of the three inputs (anchor / positive / negative) is encoded by the
    shared backbone, mean-pooled over non-masked tokens, and the three pooled
    vectors are fed to a triplet margin loss (margin=1.0, L2 distance).
    """

    def __init__(self, config: AtacformerConfig):
        super().__init__(config)
        self.atacformer = AtacformerModel(config)
        self.triplet_loss = torch.nn.TripletMarginLoss(margin=1.0, p=2)

        self.post_init()

    def get_input_embeddings(self):
        return self.atacformer.get_input_embeddings()

    def _mean_pooling(
        self, embeddings: torch.Tensor, attention_mask: torch.Tensor
    ) -> torch.Tensor:
        """
        Mean pooling for the embeddings.

        Args:
            embeddings (`torch.Tensor`):
                Embeddings tensor of shape (batch_size, sequence_length, hidden_size).
            attention_mask (`torch.Tensor`):
                Attention mask tensor of shape (batch_size, sequence_length).
        Returns:
            `torch.Tensor`: The mean-pooled embeddings.
        """
        # zero out masked positions, then divide by the number of kept tokens
        attention_mask = attention_mask.unsqueeze(-1).expand(embeddings.size())
        sum_embeddings = torch.sum(embeddings * attention_mask, 1)
        # clamp guards against division by zero for fully-masked rows
        sum_mask = torch.clamp(attention_mask.sum(1), min=1e-9)
        return sum_embeddings / sum_mask

    def _encode(
        self, input_ids: torch.LongTensor, attention_mask: Optional[torch.Tensor]
    ) -> torch.Tensor:
        """Encode one side of the triplet and mean-pool it into one vector per sample."""
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
        else:
            # the backbone inverts the mask with `~`, which is only a logical
            # NOT for boolean tensors
            attention_mask = attention_mask.bool()
        hidden = self.atacformer(input_ids, attention_mask=attention_mask, return_dict=False)
        return self._mean_pooling(hidden, attention_mask)

    def forward(
        self,
        input_ids_anchor: torch.LongTensor,
        attention_mask_anchor: Optional[torch.Tensor] = None,
        input_ids_positive: torch.LongTensor = None,
        attention_mask_positive: Optional[torch.Tensor] = None,
        input_ids_negative: torch.LongTensor = None,
        attention_mask_negative: Optional[torch.Tensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        """
        Forward pass of the model.

        Args:
            input_ids_anchor (`torch.LongTensor`):
                Anchor token ids of shape (batch_size, sequence_length).
            attention_mask_anchor (`torch.Tensor`, *optional*):
                Mask for the anchor; defaults to attending to every position.
            input_ids_positive (`torch.LongTensor`):
                Token ids of the positive example. Required.
            attention_mask_positive (`torch.Tensor`, *optional*):
                Mask for the positive example.
            input_ids_negative (`torch.LongTensor`):
                Token ids of the negative example. Required.
            attention_mask_negative (`torch.Tensor`, *optional*):
                Mask for the negative example.
            return_dict (`bool`, *optional*):
                If truthy, return a `SequenceClassifierOutput`; otherwise
                (including the default `None`) return a tuple.
        Raises:
            ValueError: If any of the three `input_ids_*` tensors is missing.
        Returns:
            `(loss, emb_anchor, emb_positive, emb_negative)` when `return_dict`
            is falsy, else a `SequenceClassifierOutput` with `loss` set and the
            three pooled embeddings in `hidden_states`.
        """
        if input_ids_anchor is None or input_ids_positive is None or input_ids_negative is None:
            raise ValueError(
                "You have to specify input_ids_anchor, input_ids_positive and input_ids_negative"
            )

        # encode and pool
        emb_anchor = self._encode(input_ids_anchor, attention_mask_anchor)
        emb_positive = self._encode(input_ids_positive, attention_mask_positive)
        emb_negative = self._encode(input_ids_negative, attention_mask_negative)

        # Triplet margin loss
        loss = self.triplet_loss(emb_anchor, emb_positive, emb_negative)

        if not return_dict:
            return (loss, emb_anchor, emb_positive, emb_negative)

        # BaseModelOutput has no `loss` field, so use a container that does
        return SequenceClassifierOutput(
            loss=loss,
            logits=None,
            hidden_states=(emb_anchor, emb_positive, emb_negative),
            attentions=None,
        )
+ """ + + def __init__(self, config: AtacformerConfig): + super().__init__(config) + self.atacformer = AtacformerModel(config) + self.pairwise_head = AtacformerPairwiseInteractionHead(config) + self.post_init() + + def get_input_embeddings(self): + return self.atacformer.get_input_embeddings() + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], BaseModelOutput]: + """ + Forward pass of the model. + + Args: + input_ids (`torch.LongTensor`): + Input tensor of shape (batch_size, sequence_length). + attention_mask (`torch.Tensor`, *optional*): + Mask to avoid performing attention on padding token indices. + labels (`torch.Tensor`, *optional*): + Labels for pairwise interaction prediction. + return_dict (`bool`, *optional*): + Whether to return the outputs as a dict or a tuple. + Returns: + `BaseModelOutput`: The output of the model. 
+ """ + if input_ids is None: + raise ValueError("You have to specify input_ids") + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # ensure attention mask is bool + if attention_mask is None: + attention_mask = torch.ones(input_ids.shape, dtype=torch.bool, device=input_ids.device) + else: + attention_mask = attention_mask.bool() + + # get embeddings + # shape (batch_size, sequence_length, hidden_size) + backbone_out = self.atacformer( + input_ids=input_ids, + attention_mask=attention_mask, + ) + + # get pairwise interaction scores + pairwise_scores = self.pairwise_head(backbone_out) + if not return_dict: + return (pairwise_scores,) + + return BaseModelOutput( + loss=None, + last_hidden_state=backbone_out.last_hidden_state, + hidden_states=None, + attentions=None, + ) + + +class GRL(nn.Module): + def __init__(self, alpha: float = 1.0): + super().__init__() + self.alpha = torch.tensor(alpha, requires_grad=False) + + def forward(self, x): + return revgrad(x, self.alpha) + + +class AtacformerForUnsupervisedBatchCorrection( + EncodeTokenizedCellsMixin, AtacformerPreTrainedModel +): + """ + Atacformer model for batch correction. It follows a similar learning framework + to the one used in domain adaptation, where the model is trained to correct + batch effects in the embeddings. 
+ """ + + def __init__(self, config: AtacformerConfig): + super().__init__(config) + self.atacformer = AtacformerModel(config) + self.discriminator = AtacformerDiscriminatorHead(config) + self.grl = GRL(alpha=config.grl_alpha) # gradient reversal layer + self.batch_prediction_head = nn.Linear(config.hidden_size, config.num_batches) + + self.lambda_adversarial = config.lambda_adv # weight for adversarial loss + + self.post_init() + freeze_except_last_n(self.atacformer, config.bc_unfreeze_last_n_layers) + + def get_input_embeddings(self): + return self.atacformer.get_input_embeddings() + + def _mean_pooling( + self, embeddings: torch.Tensor, attention_mask: torch.Tensor + ) -> torch.Tensor: + """ + Mean pooling for the embeddings. + + Args: + embeddings (`torch.Tensor`): + Embeddings tensor of shape (batch_size, sequence_length, hidden_size). + attention_mask (`torch.Tensor`): + Attention mask tensor of shape (batch_size, sequence_length). + Returns: + `torch.Tensor`: The mean-pooled embeddings. + """ + # apply attention mask + attention_mask = attention_mask.unsqueeze(-1).expand(embeddings.size()) + sum_embeddings = torch.sum(embeddings * attention_mask, 1) + sum_mask = torch.clamp(attention_mask.sum(1), min=1e-9) + return sum_embeddings / sum_mask + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + batch_labels: Optional[torch.Tensor] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]: + """ + Forward pass of the model. + + Args: + input_ids (`torch.LongTensor`): + Input tensor of shape (batch_size, sequence_length). + attention_mask (`torch.Tensor`, *optional*): + Mask to avoid performing attention on padding token indices. + labels (`torch.Tensor`, *optional*): + Labels for masked language modeling (ELECTRA). + batch_labels (`torch.Tensor`, *optional*): + Labels for batch prediction. 
Should be of shape (batch_size,). + return_dict (`bool`, *optional*): + Whether to return the outputs as a dict or a tuple. + Returns: + `BaseModelOutput`: The output of the model. + """ + if input_ids is None: + raise ValueError("You have to specify input_ids") + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # ensure attention mask is bool + if attention_mask is None: + attention_mask = torch.ones(input_ids.shape, dtype=torch.bool, device=input_ids.device) + else: + attention_mask = attention_mask.bool() + + # get embeddings + # shape (batch_size, sequence_length, hidden_size) + backbone_out = self.atacformer( + input_ids=input_ids, + attention_mask=attention_mask, + ) + + cell_embeddings = self._mean_pooling(backbone_out, attention_mask) + + # 1) MLM-ELECTRA loss + logits_mlm = self.discriminator(backbone_out) # (B, L) + loss_mlm = None + if labels is not None: + # labels: 1=replaced, 0=original, -100=ignore (special tokens) + active = labels != -100 + if active.any(): + loss_mlm = F.binary_cross_entropy_with_logits( + logits_mlm[active], labels.float()[active] + ) + + # 2) Adversarial loss for batch prediction + logits_adv = self.batch_prediction_head(self.grl(cell_embeddings)) # (B, num_batches) + loss_adv = None + if batch_labels is not None: + loss_adv = F.cross_entropy(logits_adv, batch_labels, ignore_index=-100) + + # total + loss = None + if loss_mlm is not None and loss_adv is not None: + loss = loss_mlm + self.lambda_adversarial * loss_adv + elif loss_mlm is not None: + loss = loss_mlm + elif loss_adv is not None: + loss = self.lambda_adversarial * loss_adv + + # keep losses for logging + # if self.keep_losses: + # self.loss_mlm = loss_mlm + # self.loss_adv = loss_adv + # self.loss = loss + + if not return_dict: + return (loss, logits_mlm, logits_adv, cell_embeddings) + + return SequenceClassifierOutput( + loss=loss, + logits=logits_mlm, + hidden_states=None, + attentions=None, + ) diff --git 
def freeze_except_last_n(model: "AtacformerModel", n: int = 2):
    """
    Leave only the last `n` encoder layers (and every LayerNorm) trainable.

    `n == 0` is a no-op: nothing is frozen and the model is left as it is.

    Args:
        model (AtacformerModel): The model to freeze; must expose `encoder.layers`.
        n (int): The number of last layers to keep trainable.
    Raises:
        ValueError: If `n` is negative or larger than the number of encoder layers.
    """
    if n < 0:
        raise ValueError("n must be a non-negative integer.")

    encoder_layers = model.encoder.layers
    num_layers = len(encoder_layers)
    if n > num_layers:
        raise ValueError(
            f"n must be less than or equal to the number of layers ({num_layers})."
        )
    if n == 0:
        return

    # start from a fully frozen model ...
    for param in model.parameters():
        param.requires_grad = False

    # ... re-enable the trailing n encoder layers ...
    for layer in encoder_layers[-n:]:
        for param in layer.parameters():
            param.requires_grad = True

    # ... and every LayerNorm, wherever it lives, for training stability
    layer_norms = (m for m in model.modules() if isinstance(m, nn.LayerNorm))
    for norm in layer_norms:
        for param in norm.parameters():
            param.requires_grad = True
def get_git_hash() -> str:
    """
    Get the current git hash of the repository.

    Runs `git rev-parse HEAD` in the current working directory.

    Returns:
        str: the commit hash of HEAD, stripped of surrounding whitespace.
    Raises:
        RuntimeError: if the command cannot be run or fails (e.g. git is not
            installed or the working directory is not inside a repository).
            The underlying error is attached as `__cause__`.
    """
    try:
        return subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("ascii").strip()
    except Exception as err:
        # chain the original error so the real reason (missing binary,
        # non-zero exit status, ...) is not lost from the traceback
        raise RuntimeError(
            "Could not get git hash. Make sure you are in a git repository."
        ) from err
+ """ + # extract regions from AnnData + # its weird because of how numpy handle Intervals, the parent class of Region, + # see here: + # https://stackoverflow.com/a/43722306/13175187 + adata_features = [ + Region(chr, int(start), int(end)) + for chr, start, end in tqdm( + zip(adata.var["chr"], adata.var["start"], adata.var["end"]), + total=adata.var.shape[0], + desc="Extracting regions from AnnData", + ) + ] + features = np.ndarray(len(adata_features), dtype=object) + for i, region in enumerate(adata_features): + features[i] = region + del adata_features + # tokenize + tokenized = [] + x = adata.X + for row in tqdm( + range(adata.shape[0]), + total=adata.shape[0], + desc="Tokenizing", + ): + _, non_zeros = x[row].nonzero() + regions = features[non_zeros] + tokenized.append(tokenizer(regions)) + return tokenized + +class WandbMixin: + def __init__(self, *args, **kwargs): + if not is_wandb_available(): + raise RuntimeError("WandbCallback requires wandb to be installed. Run `pip install wandb`.") + import wandb + self.wandb = wandb + super().__init__(*args, **kwargs) + +class DataCollatorForReplacedTokenDetection(WandbMixin, DataCollatorForLanguageModeling): + """ + Like HF’s MLM collator but: + • never uses [MASK] + • picks replacement tokens from a user-supplied distribution + • returns per-token 0/1 labels for ELECTRA-style discrimination + """ + + def __init__( + self, + tokenizer: TrainingTokenizer, + mlm_probability: float = 0.15, + seed: int | None = None, + ): + """ + Simple data collator for ELECTRA-style token replacement detection. + Args: + tokenizer (TrainingTokenizer): The tokenizer to use. + vocab_counts (torch.Tensor | None): 1-D tensor, size == vocab, log-probs OR probs + mlm_probability (float): Probability of masking a token. + seed (int | None): Random seed for reproducibility. 
+ """ + super().__init__( + tokenizer=tokenizer, + mlm=True, + mlm_probability=mlm_probability, + mask_replace_prob=0.0, # we’ll never emit [MASK] + random_replace_prob=1.0, # always replace + seed=seed, + ) + + self.mlm_probability = mlm_probability + + # generate a uniform distribution + # over the vocabulary (ignoring special tokens) + vocab_counts = torch.ones(tokenizer.vocab_size, dtype=torch.float) + vocab_counts[tokenizer.all_special_ids] = 0 + self.uniform_vocab_probs = vocab_counts / vocab_counts.sum() + + def torch_mask_tokens( + self, inputs: torch.Tensor, special_tokens_mask: torch.Tensor | None = None + ) -> tuple[torch.Tensor, torch.Tensor]: + import torch + + + original = inputs.clone() + + # 1 pick positions to corrupt + # for probability_matrix, each spot gets mlm_probability (probability of being replaced) + probability_matrix = torch.full_like(inputs, self.mlm_probability, dtype=torch.float) + if special_tokens_mask is None: + special_tokens_mask = torch.tensor( + [self.tokenizer.get_special_tokens_mask(v, None, True) for v in inputs.tolist()], + dtype=torch.bool, + device=inputs.device, + ) + probability_matrix.masked_fill_(special_tokens_mask, 0.0) + replace_mask = torch.bernoulli(probability_matrix, generator=self.generator).bool() + + # 2) sample replacements from your distribution + num_to_replace = replace_mask.sum() + if num_to_replace > 0: + sampled = torch.multinomial( + self.uniform_vocab_probs.to(inputs.device), + num_samples=num_to_replace, + replacement=True, + ) + inputs[replace_mask] = sampled.long().to(inputs.device) + + # 3 discriminator labels: 1 ≙ token was replaced, 0 ≙ original + labels = (inputs != original).long() + labels[special_tokens_mask] = -100 # ignore loss on special / pad + + return inputs, labels + +class DataCollatorForTripletLoss: + """ + A simple data collator for triplet loss to fine-tune Atacformer for cell-type clustering + """ + def __init__(self, tokenizer: TrainingTokenizer, max_position_embeddings: int = 
None): + self.tokenizer = tokenizer + self.pad_token_id = tokenizer.pad_token_id + self.max_position_embeddings = max_position_embeddings + + + def _truncate(self, seq: List[int]) -> List[int]: + if self.max_position_embeddings is not None and len(seq) > self.max_position_embeddings: + idx = np.random.choice(len(seq), size=self.max_position_embeddings, replace=False) + idx.sort() + return [seq[i] for i in idx] + return seq + + def __call__(self, features: List[Dict]) -> Dict[str, torch.Tensor]: + # Unpack triplets and truncate if needed + anchors = [torch.tensor(self._truncate(f["input_ids_anchor"]), dtype=torch.long) for f in features] + positives = [torch.tensor(self._truncate(f["input_ids_positive"]), dtype=torch.long) for f in features] + negatives = [torch.tensor(self._truncate(f["input_ids_negative"]), dtype=torch.long) for f in features] + + # Pad all + input_ids_anchor = pad_sequence(anchors, batch_first=True, padding_value=self.pad_token_id) + input_ids_positive = pad_sequence(positives, batch_first=True, padding_value=self.pad_token_id) + input_ids_negative = pad_sequence(negatives, batch_first=True, padding_value=self.pad_token_id) + + # Create attention masks + attention_mask_anchor = (input_ids_anchor != self.pad_token_id) + attention_mask_positive = (input_ids_positive != self.pad_token_id) + attention_mask_negative = (input_ids_negative != self.pad_token_id) + + return { + "input_ids_anchor": input_ids_anchor, + "input_ids_positive": input_ids_positive, + "input_ids_negative": input_ids_negative, + "attention_mask_anchor": attention_mask_anchor, + "attention_mask_positive": attention_mask_positive, + "attention_mask_negative": attention_mask_negative, + } + +class DataCollatorForUnsupervisedBatchCorrection(DataCollatorForReplacedTokenDetection): + def __init__(self, tokenizer: TrainingTokenizer, max_position_embeddings: int = None, mlm_probability: float = 0.15, seed: int | None = None): + # call parent __init__ to properly initialize the ELECTRA 
token replacement + super().__init__(tokenizer=tokenizer, mlm_probability=mlm_probability, seed=seed) + self.max_position_embeddings = max_position_embeddings + + + def _truncate(self, seq: List[int]) -> List[int]: + if self.max_position_embeddings is not None and len(seq) > self.max_position_embeddings: + idx = np.random.choice(len(seq), size=self.max_position_embeddings, replace=False) + idx.sort() + return [seq[i] for i in idx] + return seq + + def __call__(self, features: List[Dict]) -> Dict[str, torch.Tensor]: + # unpack + input_ids = [torch.tensor(self._truncate(f["input_ids"]), dtype=torch.long) for f in features] + batch_labels = torch.tensor([f["batch_id"] for f in features], dtype=torch.long) + + pad_token_id = self.tokenizer.pad_token_id + + # run the masking + input_ids, labels = super().torch_mask_tokens( + inputs=pad_sequence(input_ids, batch_first=True, padding_value=pad_token_id), + special_tokens_mask=None + ) + + # create attention mask + attention_mask = (input_ids != pad_token_id) + + return { + "input_ids": input_ids, + "attention_mask": attention_mask, + "labels": labels, + "batch_labels": batch_labels, # batch labels for batch correction + } + + +class ModelParameterChangeCallback(WandbMixin, TrainerCallback): + """ + A callback to log the changes in model parameters after training. + """ + def __init__(self, initial_params: dict[str, torch.Tensor]): + super().__init__() + + self.initial_params = initial_params + + def _compute_param_changes(self, model: torch.nn.Module): + """ + Compute the changes in model parameters after training. + + Args: + model (torch.nn.Module): The model to check. + """ + updates = defaultdict(float) + counts = defaultdict(int) + + for (name, p) in model.named_parameters(): + delta = (p.detach().cpu() - self.initial_params[name]).norm().item() + module = name.rsplit('.', 1)[0] # e.g. 
'encoder.layer1' + updates[module] += delta + counts[module] += 1 + + + for m in updates: + updates[m] = math.sqrt(updates[m] / counts[m]) + + return updates + + + def on_log(self, args, state, control, **kwargs): + """ + Log the changes in model parameters after training. + """ + model = kwargs.get("model") + step = state.global_step + if model is not None: + updates = self._compute_param_changes(model) + table = self.wandb.Table(columns=["module","delta","step"]) + + if self.wandb.run is not None: + for module, delta in updates.items(): + table.add_data(module, delta, step) + self.wandb.log({"parameter_changes": table}, step=step) + else: + print("Parameter changes:") + for module, delta in updates.items(): + print(f"{module}: {delta:.4f}") + else: + raise ValueError("Model is not available in the callback. Please check the Trainer configuration.") + +class AdjustedRandIndexCallback(WandbMixin, TrainerCallback): + """ + A callback to log the adjusted Rand index (ARI) during training. + """ + def __init__(self, input_ids: List[List[int]], cell_type_labels: List[int], pad_token_id: int, batch_size: int = 128, log_every_n_steps: int = 500): + super().__init__() + try: + from sklearn.metrics import adjusted_rand_score + from sklearn.cluster import KMeans + except ImportError: + raise ImportError("scikit-learn is required for AdjustedRandIndexCallback. Please install it with `pip install scikit-learn`.") + + assert len(input_ids) == len(cell_type_labels), "Input IDs and cell type labels must have the same length." + + self.initial_labels = cell_type_labels + self.num_classes = len(set(cell_type_labels)) + self.input_ids = input_ids + self.pad_token_id = pad_token_id + self.batch_size = batch_size + self.log_every_n_steps = log_every_n_steps + + def on_log(self, args, state, control, **kwargs): + """ + Log the adjusted Rand index (ARI) during training. 
+ """ + from sklearn.metrics import adjusted_rand_score + from sklearn.cluster import KMeans + + model = kwargs.get("model") + step = state.global_step + + assert model is not None, "Model is not available in the callback. Please check the Trainer configuration." + + if model is None: + raise ValueError("Model is not available in the callback. Please check the Trainer configuration.") + + if step % self.log_every_n_steps != 0: + # only compute ARI every n steps regardless of other logging + return + + cell_embeddings = model.encode_tokenized_cells( + self.input_ids, + batch_size=self.batch_size + ) + + # detach, move to cpu, and convert to numpy + cell_embeddings = cell_embeddings.detach().cpu().to(torch.float32).numpy() + + # perform KMeans clustering + kmeans = KMeans(n_clusters=self.num_classes, random_state=42) + kmeans.fit(cell_embeddings) + predicted_labels = kmeans.labels_ + + # compute ARI + ari = adjusted_rand_score(self.initial_labels, predicted_labels) + if self.wandb.run is not None: + self.wandb.log({"adjusted_rand_index": ari}, step=step) + else: + print(f"Adjusted Rand Index at step {step}: {ari:.4f}") + +def _get_decaying_cosine_with_hard_restarts_schedule_with_warmup_lr_lambda( + current_step: int, *, num_warmup_steps: int, num_training_steps: int, num_cycles: int +): + if current_step < num_warmup_steps: + return float(current_step) / float(max(1, num_warmup_steps)) + progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) + amp = (num_training_steps + num_warmup_steps - current_step) / (2 * num_training_steps) + if progress >= 1.0: + return 0.0 + return max(0.0, amp * (1.0 + math.cos(math.pi * ((float(num_cycles) * progress) % 1.0)))) + +def get_decaying_cosine_with_hard_restarts_schedule_with_warmup( + optimizer: Optimizer, num_warmup_steps: int, num_training_steps: int, num_cycles: int = 1, last_epoch: int = -1 +): + """ + Very similar to huggingfaces built-in cosine with restarts, however the 
amplitude slowly decreases so that + the "kick ups" are less aggressive. + + Create a schedule with a learning rate that decreases following the values of the cosine function between the + initial lr set in the optimizer to 0, with several hard restarts, after a warmup period during which it increases + linearly between 0 and the initial lr set in the optimizer. + + Args: + optimizer ([`~torch.optim.Optimizer`]): + The optimizer for which to schedule the learning rate. + num_warmup_steps (`int`): + The number of steps for the warmup phase. + num_training_steps (`int`): + The total number of training steps. + num_cycles (`int`, *optional*, defaults to 1): + The number of hard restarts to use. + last_epoch (`int`, *optional*, defaults to -1): + The index of the last epoch when resuming training. + + Return: + `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule. + """ + + lr_lambda = partial( + _get_decaying_cosine_with_hard_restarts_schedule_with_warmup_lr_lambda, + num_warmup_steps=num_warmup_steps, + num_training_steps=num_training_steps, + num_cycles=num_cycles, + ) + return LambdaLR(optimizer, lr_lambda, last_epoch) + +def _get_linear_schedule_with_floor_with_warmup_lr_lambda(current_step: int, *, num_warmup_steps: int, num_training_steps: int): + if current_step < num_warmup_steps: + return float(current_step) / float(max(1, num_warmup_steps)) + lam = max(0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))) + return max(lam, 0.5) + + +def get_linear_schedule_with_floor_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1): + """ + Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer, down to a floor value, + after a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer. + + Args: + optimizer ([`~torch.optim.Optimizer`]): + The optimizer for which to schedule the learning rate. 
+ num_warmup_steps (`int`): + The number of steps for the warmup phase. + num_training_steps (`int`): + The total number of training steps. + last_epoch (`int`, *optional*, defaults to -1): + The index of the last epoch when resuming training. + + Return: + `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule. + """ + + lr_lambda = partial( + _get_linear_schedule_with_floor_with_warmup_lr_lambda, + num_warmup_steps=num_warmup_steps, + num_training_steps=num_training_steps, + ) + return LambdaLR(optimizer, lr_lambda, last_epoch) diff --git a/geniml/bbclient/__init__.py b/geniml/bbclient/__init__.py new file mode 100644 index 00000000..4d86e27f --- /dev/null +++ b/geniml/bbclient/__init__.py @@ -0,0 +1,3 @@ +from .bbclient import BBClient + +__all__ = ["BBClient"] diff --git a/geniml/bbclient/bbclient.py b/geniml/bbclient/bbclient.py new file mode 100644 index 00000000..9f6fa393 --- /dev/null +++ b/geniml/bbclient/bbclient.py @@ -0,0 +1,537 @@ +import os +from contextlib import suppress +from logging import getLogger +from typing import Dict, List, NoReturn, Union + +import boto3 +import requests +import s3fs +import zarr +from botocore.exceptions import ClientError +from pybiocfilecache import BiocFileCache +from pybiocfilecache.exceptions import RnameExistsError +from zarr import Array +from zarr.errors import PathNotFoundError + +from gtars.models import RegionSet +from ..exceptions import TokenizedFileNotFoundError, TokenizedFileNotFoundInCacheError +from ..io.io import BedSet +from .const import ( + BED_TOKENS_PATTERN, + BEDFILE_URL_PATTERN, + BEDSET_URL_PATTERN, + DEFAULT_BEDBASE_API, + DEFAULT_BEDFILE_EXT, + DEFAULT_BEDFILE_SUBFOLDER, + DEFAULT_BEDSET_EXT, + DEFAULT_BEDSET_SUBFOLDER, + DEFAULT_BUCKET_FOLDER, + DEFAULT_BUCKET_NAME, + DEFAULT_CACHE_FOLDER, + DEFAULT_ZARR_FOLDER, + MODULE_NAME, +) +from .utils import BedCacheManager, get_abs_path + +_LOGGER = getLogger(MODULE_NAME) + + +class BBClient(BedCacheManager): + def __init__( + self, + 
cache_folder: Union[str, os.PathLike] = DEFAULT_CACHE_FOLDER, + bedbase_api: str = DEFAULT_BEDBASE_API, + ): + """ + BBClient to deal with download files from bedbase and caching them. + + :param cache_folder: path to local folder as cache of files from bedbase, + if not given it will be the environment variable `BBCLIENT_CACHE` + :param bedbase_api: url to bedbase + """ + cache_folder = get_abs_path(cache_folder) + super().__init__(cache_folder) + + self._bedfile_cache = BiocFileCache(os.path.join(cache_folder, DEFAULT_BEDFILE_SUBFOLDER)) + self._bedset_cache = BiocFileCache(os.path.join(cache_folder, DEFAULT_BEDSET_SUBFOLDER)) + + self.zarr_cache = zarr.group( + store=os.path.join(cache_folder, DEFAULT_ZARR_FOLDER), overwrite=False + ) + self.bedbase_api = bedbase_api + + def load_bedset(self, bedset_id: str) -> BedSet: + """ + Load a BEDset from cache, or download and add it to the cache with its BED files + + :param bedset_id: unique identifier of a BED set + :return: the BedSet object + """ + + file_path = self._bedset_path(bedset_id) + + if os.path.exists(file_path): + _LOGGER.info(f"BED set {bedset_id} already exists in cache.") + with open(file_path, "r") as file: + extracted_data = file.read().splitlines() + else: + extracted_data = self._download_bedset_data(bedset_id) + # write the identifiers of BED files in the BedSet to a local .txt file + with open(file_path, "w") as file: + for value in extracted_data: + file.write(value + "\n") + _LOGGER.info(f"BED set {bedset_id} downloaded and cached successfully.") + + # return the BedSet + return BedSet( + [self.load_bed(bedfile_id) for bedfile_id in extracted_data], + identifier=bedset_id, + ) + + def _download_bedset_data(self, bedset_id: str) -> List[str]: + """ + Download BED set from BEDbase API and return the list of identifiers of BED files in the set + + :param bedset_id: unique identifier of a BED set + :return: the list of identifiers of BED files in the set + """ + bedset_url = 
BEDSET_URL_PATTERN.format(bedbase_api=self.bedbase_api, bedset_id=bedset_id) + response = requests.get(bedset_url) + data = response.json()["results"] + extracted_data = [entry.get("id") for entry in data] + + return extracted_data + + def load_bed(self, bed_id: str) -> RegionSet: + """ + Loads a BED file from cache, or downloads and caches it if it doesn't exist + + :param bed_id: unique identifier of a BED file + :return: the RegionSet object + """ + file_path = self._bedfile_path(bed_id) + + if os.path.exists(file_path): + _LOGGER.info(f"BED file {bed_id} already exists in cache.") + # if not in the cache, download from BEDbase and write to file in cache + else: + bed_data = self._download_bed_file_from_bb(bed_id) + with open(file_path, "wb") as f: + f.write(bed_data) + + self._bedfile_cache.add(bed_id, fpath=file_path, action="asis") + + _LOGGER.info(f"BED file {bed_id} was downloaded and cached successfully") + + # return RegionSet(regions=file_path) + return RegionSet(file_path) + + def add_bedset_to_cache(self, bedset: BedSet) -> str: + """ + Add a BED set to the cache + + :param bedset: the BED set to be added, a BedSet class + :return: the identifier if the BedSet object + """ + + bedset_id = bedset.compute_bedset_identifier() + file_path = self._bedset_path(bedset_id) + if os.path.exists(file_path): + _LOGGER.info(f"{file_path} already exists in cache.") + else: + with open(file_path, "w") as file: + for bedfile in bedset: + bedfile_id = self.add_bed_to_cache(bedfile).identifier + file.write(bedfile_id + "\n") + self._bedset_cache.add(bedset_id, fpath=file_path, action="asis") + return bedset_id + + def add_bed_to_cache(self, bedfile: Union[RegionSet, str], force: bool = False) -> RegionSet: + """ + Add a BED file to the cache + + :param bedfile: a RegionSet object or a path or url to the BED file + :param force: whether to overwrite the existing file in cache + :return: the RegionSet identifier + """ + + if isinstance(bedfile, str): + bedfile = 
RegionSet(bedfile) + elif not isinstance(bedfile, RegionSet): + raise TypeError( + f"Input must be a RegionSet or a path to a BED file, not {type(bedfile)}" + ) + + bedfile_id = bedfile.identifier + file_path = self._bedfile_path(bedfile_id) + if os.path.exists(file_path) and not force: + _LOGGER.info(f"{file_path} already exists in cache.") + else: + # if bedfile.path is None or is_url(bedfile.path): + # bedfile.to_pandas().to_csv( + # file_path, index=False, compression="gzip", header=False, sep="\t" + # ) + # else: + # # copy the BED file out of cache + # if is_gzipped(bedfile.path): + # shutil.copyfile(bedfile.path, file_path) + # else: + # # https://docs.python.org/3/library/gzip.html + # with open(bedfile.path, "rb") as f_in: + # with gzip.open(file_path, "wb") as f_out: + # shutil.copyfileobj(f_in, f_out) + bedfile.to_bed_gz(file_path) + with suppress(RnameExistsError): + self._bedfile_cache.add(bedfile_id, fpath=file_path, action="asis") + return bedfile + + def add_bed_tokens_to_cache(self, bed_id: str, universe_id: str) -> None: + """ + Add a tokenized BED file to the cache + + :param bed_id: the identifier of the BED file + :param universe_id: the identifier of the universe + + :return: the identifier of the tokenized BED file + """ + + tokens_info_url = BED_TOKENS_PATTERN.format( + bedbase_api=DEFAULT_BEDBASE_API, bed_id=bed_id, universe_id=universe_id + ) + response = requests.get(tokens_info_url) + if response.status_code == 404: + raise TokenizedFileNotFoundError( + f"Tokenized BED file {bed_id} for {universe_id} does not exist in bedbase." + f"Please make sure the tokenized BED file is available in bedbase." 
+ ) + + tokens_info = response.json() + file_path = tokens_info["file_path"] + + s3fc_obj = s3fs.S3FileSystem(endpoint_url=tokens_info["endpoint_url"]) + zarr_store = s3fs.S3Map(root=file_path, s3=s3fc_obj, check=False, create=False) + cache_obj = zarr.LRUStoreCache(zarr_store, max_size=2**28) + + try: + tokenized_bed = zarr.open(cache_obj, mode="r") + except PathNotFoundError: + raise TokenizedFileNotFoundError( + f"Tokenized BED file {bed_id} for {universe_id} does not exist in bedbase." + f"Please make sure the tokenized BED file is available in bedbase." + ) + + self.cache_tokens(bed_id, universe_id, tokenized_bed) + + def load_bed_tokens(self, bed_id: str, universe_id: str) -> Array: + """ + Load a tokenized BED file from cache, or download and cache it if it doesn't exist + + :param bed_id: the identifier of the BED file + :param universe_id: the identifier of the universe + + :return: the zarr array of tokens + """ + try: + zarr_array = self.zarr_cache[universe_id][bed_id] + except KeyError: + try: + self.add_bed_tokens_to_cache(bed_id, universe_id) + except TokenizedFileNotFoundError: + raise TokenizedFileNotFoundInCacheError( + f"Tokenized BED file {bed_id} for {universe_id} does not exist in cache." + "And it is not available in bedbase." + ) + zarr_array = self.zarr_cache[universe_id][bed_id] + + return zarr_array + + def remove_tokens(self, bed_id: str, universe_id: str) -> None: + """ + Remove all tokenized BED files from cache + """ + try: + del self.zarr_cache[universe_id][bed_id] + except KeyError: + raise TokenizedFileNotFoundInCacheError( + f"Tokenized BED file {bed_id} for {universe_id} does not exist in cache." 
+ ) + + def cache_tokens(self, bed_id: str, universe_id: str, tokens: Union[list, Array]) -> None: + """ + Cache tokenized BED file + + :param bed_id: the identifier of the BED file + :param universe_id: the identifier of the universe + :param tokens: the list of tokens + + :return: None + """ + + univers_group = self.zarr_cache.require_group(universe_id) + univers_group.create_dataset(bed_id, data=tokens, overwrite=True) + + _LOGGER.info( + f"Tokenized BED file {bed_id} tokenized using {universe_id} was cached successfully" + ) + + def add_bed_to_s3( + self, + identifier: str, + bucket: str = DEFAULT_BUCKET_NAME, + endpoint_url: str = None, + aws_access_key_id: str = None, + aws_secret_access_key: str = None, + s3_path: str = DEFAULT_BUCKET_FOLDER, + ) -> str: + """ + Add a cached BED file to S3 + + :param identifier: the unique identifier of the BED file + :param bucket: the name of the bucket + :param endpoint_url: the URL of the S3 endpoint [Default: set up by the environment vars] + :param aws_access_key_id: the access key of the AWS account [Default: set up by the environment vars] + :param aws_secret_access_key: the secret access key of the AWS account [Default: set up by the environment vars] + :param s3_path: the path on S3 + + :return: full path on S3 + """ + s3_client = boto3.client( + "s3", + endpoint_url=endpoint_url, + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + ) + local_file_path = self.seek(identifier) + bed_file_name = os.path.basename(local_file_path) + s3_bed_path = os.path.join(identifier[0], identifier[1], bed_file_name) + if s3_path: + s3_bed_path = os.path.join(s3_path, s3_bed_path) + + s3_client.upload_file(local_file_path, bucket, s3_bed_path) + _LOGGER.info(f"Project was uploaded successfully to s3://{bucket}/{s3_bed_path}") + return s3_bed_path + + def get_bed_from_s3( + self, + identifier: str, + bucket: str = DEFAULT_BUCKET_NAME, + endpoint_url: str = None, + aws_access_key_id: str = None, + 
aws_secret_access_key: str = None, + s3_path: str = DEFAULT_BUCKET_FOLDER, + ) -> str: + """ + Get a cached BED file from S3 and cache it locally + + :param identifier: the unique identifier of the BED file + :param bucket: the name of the bucket + :param endpoint_url: the URL of the S3 endpoint [Default: set up by the environment vars] + :param aws_access_key_id: the access key of the AWS account [Default: set up by the environment vars] + :param aws_secret_access_key: the secret access key of the AWS account [Default: set up by the environment vars] + :param s3_path: the path on S3 + + :return: bed file id + :raise FileNotFoundError: if the identifier does not exist in cache + """ + s3_bed_path = os.path.join( + identifier[0], identifier[1], f"{identifier}{DEFAULT_BEDFILE_EXT}" + ) + if s3_path: + s3_bed_path = os.path.join(s3_path, s3_bed_path) + + s3_client = boto3.client( + "s3", + endpoint_url=endpoint_url, + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + ) + try: + s3_client.download_file( + bucket, s3_bed_path, self._bedfile_path(identifier, create=True) + ) + except ClientError as e: + if e.response["Error"]["Code"] == "404": + raise FileNotFoundError(f"{identifier} does not exist in S3.") + else: + raise e + + return identifier + + def seek(self, identifier: str) -> str: + """ + Get local path to BED file or BED set with specific identifier + + :param identifier: the unique identifier + :return: the local path of the file + :raise FileNotFoundError: if the identifier does not exist in cache + """ + + # bedfile + file_path = self._bedset_path(identifier, False) + if os.path.exists(file_path): + return file_path + else: + # bedset + file_path = self._bedfile_path(identifier, False) + if os.path.exists(file_path): + return file_path + else: + raise FileNotFoundError(f"{identifier} does not exist in cache.") + + def remove_bedset_from_cache(self, bedset_id: str, remove_bed_files: bool = False) -> NoReturn: + """ + Remove 
a BED set from cache + + :param bedset_id: the identifier of BED set + :param remove_bed_files: whether also remove BED files in the BED set + :raise FileNotFoundError: if the BED set does not exist in cache + """ + + file_path = self.seek(bedset_id) + if remove_bed_files: + with open(file_path, "r") as file: + extracted_data = file.readlines() + for bedfile_id in extracted_data: + self.remove_bedfile_from_cache(bedfile_id) + + self._bedset_cache.remove(bedset_id) + # commented due to bioc file cache removal: + # self._remove(file_path) + + def list_beds(self) -> Dict[str, str]: + """ + List all BED files in cache + + :return: the list of identifiers of BED files + """ + + resources = self._bedfile_cache.list_resources() + + results = {} + for resource in resources: + results[resource.rname] = resource.fpath + + results = dict(sorted(results.items())) + return results + + def list_bedsets(self) -> Dict[str, str]: + """ + List all BED sets in cache + + :return: the list of identifiers of BED sets + """ + + resources = self._bedset_cache.list_resources() + + results = {} + for resource in resources: + results[resource.rname] = resource.fpath + + results = dict(sorted(results.items())) + return results + + def _download_bed_file_from_bb(self, bedfile: str) -> bytes: + """ + Download BED file from BEDbase API and return the file content as bytes + + :param bedfile: unique identifier of a BED file + :return: the file content as bytes + """ + + bed_url = BEDFILE_URL_PATTERN.format(bedbase_api=self.bedbase_api, bed_id=bedfile) + response = requests.get(bed_url) + response.raise_for_status() + return response.content + + def _bedset_path(self, bedset_id: str, create: bool = True) -> str: + """ + Get the path of a BED set's .txt file with given identifier + + :param bedset_id: the identifier of BED set + :param create: whether the cache path needs creating + :return: the path to the .txt file + """ + + subfolder_name = DEFAULT_BEDSET_SUBFOLDER + file_extension = 
DEFAULT_BEDSET_EXT + + return self._cache_path(bedset_id, subfolder_name, file_extension, create) + + def _bedfile_path(self, bedfile_id: str, create: bool = True) -> str: + """ + Get the path of a BED file's .bed.gz file with given identifier + + :param bedfile_id: the identifier of BED file + :param create: whether the cache path needs creating + :return: the path to the .bed.gz file + """ + + subfolder_name = DEFAULT_BEDFILE_SUBFOLDER + file_extension = DEFAULT_BEDFILE_EXT + + return self._cache_path(bedfile_id, subfolder_name, file_extension, create) + + def _cache_path( + self, + identifier: str, + subfolder_name: str, + file_extension: str, + create: bool = True, + ) -> str: + """ + Get the path of a file in cache folder + + :param identifier: the identifier of BED set or BED file + :param subfolder_name: "bedsets" or "bedfiles" + :param file_extension: ".txt" or ".bed.gz" + :param create: whether the cache path needs creating + :return: the path to the file + """ + + filename = f"{identifier}{file_extension}" + folder_name = os.path.join(self.cache_folder, subfolder_name, identifier[0], identifier[1]) + if create: + self.create_cache_folder(folder_name) + return os.path.join(folder_name, filename) + + def remove_bedfile_from_cache(self, bedfile_id: str) -> NoReturn: + """ + Remove a BED file from cache + + :param bedfile_id: the identifier of BED file + :raise FileNotFoundError: if the BED set does not exist in cache + """ + + # commented due to bioc chacing removal method + # file_path = self.seek(bedfile_id) + # self._remove(file_path) + self._bedfile_cache.remove(bedfile_id) + + @staticmethod + def _remove(file_path: str) -> None: + """ + Remove a file within the cache with given path, and remove empty subfolders after removal + Structure of folders in cache: + cache_folder + bedfiles + a/b/ab1234xyz.bed.gz + bedsets + c/d/cd123hij.txt + + :param file_path: the path to the file + :return: None + """ + # the subfolder that matches the second digit of the 
def build_subparser_cache_bed(parser):
    """
    Builds argument parser to support to cache a BED file from local file or BEDbase.

    :param parser: the argparse parser to add the arguments to
    :return: the same parser, with ``identifier`` and ``--cache-folder`` added
    """
    parser.add_argument("identifier", nargs=1, help="BED file identifier, url, or file path")
    parser.add_argument(
        "--cache-folder",
        default=DEFAULT_CACHE_FOLDER,
        # %(default)s lets argparse print the real default instead of a stale literal
        help="Cache folder path (default: %(default)s)",
    )

    return parser
def build_subparser_cache_tokens(parser):
    """
    Builds argument parser to support to cache tokens from local file or BEDbase.

    :param parser: the argparse parser to add the arguments to
    :return: the same parser, with ``--bed-id``, ``--universe-id`` and ``--cache-folder`` added
    """
    parser.add_argument(
        "--bed-id", dest="bed_id", nargs=1, help="Token file identifier, url, or file path"
    )
    parser.add_argument(
        "--universe-id",
        dest="universe_id",
        nargs=1,
        help="Unique identifier for the universe of the token file",
    )
    parser.add_argument(
        "--cache-folder",
        default=DEFAULT_CACHE_FOLDER,
        # %(default)s lets argparse print the real default instead of a stale literal
        help="Cache folder path (default: %(default)s)",
    )

    return parser
+ """ + sp = parser.add_subparsers(dest="subcommand") + msg_by_cmd = { + "cache-bed": "Cache a BED file from local file or BEDbase", + "cache-tokens": "Cache tokens from local file or BEDbase", + "cache-bedset": "Cache a BED set from local folder or BEDbase", + "seek": "Seek the BED / BEDset path by giving identifier", + "inspect-bedfiles": "Inspect the contents of bedfile cache folder", + "inspect-bedsets": "Inspect the contents of bedset cache folder", + "rm": "Remove the BED/BEDset from cache with given identifier", + } + subparsers = {} + for k, v in msg_by_cmd.items(): + subparsers[k] = sp.add_parser(k, description=v, help=v) + subparsers["cache-bed"] = build_subparser_cache_bed(subparsers["cache-bed"]) + subparsers["cache-tokens"] = build_subparser_cache_tokens(subparsers["cache-tokens"]) + subparsers["cache-bedset"] = build_subparser_cache_bedset(subparsers["cache-bedset"]) + subparsers["seek"] = build_subparser_seek(subparsers["seek"]) + subparsers["inspect"] = build_subparser_inspect(subparsers["inspect-bedfiles"]) + subparsers["inspect"] = build_subparser_inspect(subparsers["inspect-bedsets"]) + subparsers["rm"] = build_subparser_remove(subparsers["rm"]) + return parser diff --git a/geniml/bbclient/const.py b/geniml/bbclient/const.py new file mode 100644 index 00000000..93a85f06 --- /dev/null +++ b/geniml/bbclient/const.py @@ -0,0 +1,24 @@ +import os + +MODULE_NAME = "bbclient" + +DEFAULT_BEDBASE_API = os.getenv("BEDBASE_API") or "https://api.bedbase.org" +DEFAULT_BEDSET_SUBFOLDER = "bedsets" +DEFAULT_BEDFILE_SUBFOLDER = "bedfiles" +DEFAULT_BEDSET_EXT = ".txt" +DEFAULT_BEDFILE_EXT = ".bed.gz" + +BEDFILE_URL_PATTERN = "{bedbase_api}/v1/objects/bed.{bed_id}.bed_file/access/http/bytes" +BEDSET_URL_PATTERN = "{bedbase_api}/v1/bedset/{bedset_id}/bedfiles" +BED_TOKENS_PATTERN = "{bedbase_api}/v1/bed/{bed_id}/tokens/{universe_id}/info" + +BBCLIENT_CACHE_ENV = "BBCLIENT_CACHE" + +HOME_PATH = os.getenv("HOME") +if not HOME_PATH: + HOME_PATH = 
os.path.expanduser("~") +DEFAULT_CACHE_FOLDER = os.getenv(BBCLIENT_CACHE_ENV) or os.path.join(HOME_PATH, ".bbcache/") + +DEFAULT_ZARR_FOLDER = "tokens.zarr" +DEFAULT_BUCKET_NAME = "bedbase" +DEFAULT_BUCKET_FOLDER = "bed_files" diff --git a/geniml/bbclient/utils.py b/geniml/bbclient/utils.py new file mode 100644 index 00000000..b7b60ee1 --- /dev/null +++ b/geniml/bbclient/utils.py @@ -0,0 +1,84 @@ +import gzip +import os +from io import BytesIO +from pathlib import Path +from typing import Optional + +import genomicranges +import pandas as pd + +from .const import DEFAULT_CACHE_FOLDER + + +class BedCacheManager: + def __init__(self, cache_folder: str): + self.cache_folder = cache_folder + self.create_cache_folder() + + def create_cache_folder(self, subfolder_path: Optional[str] = None) -> None: + """ + Create cache folder if it doesn't exist + + :param subfolder_path: path to the subfolder + """ + if subfolder_path is None: + subfolder_path = self.cache_folder + + full_path = os.path.abspath(subfolder_path) + if not os.path.exists(full_path): + os.makedirs(full_path) + + @staticmethod + def process_local_bed_data(file_path: str) -> genomicranges.GenomicRanges: + """Process a local BED file and return the file content as bytes""" + with open(file_path, "rb") as local_file: + file_content = local_file.read() + + gr_bed_local = BedCacheManager.decompress_and_convert_to_genomic_ranges(file_content) + + return gr_bed_local + + @staticmethod + def decompress_and_convert_to_genomic_ranges( + content: bytes, + ) -> genomicranges.GenomicRanges: + """Decompress a BED file and convert it to a GenomicRanges object""" + is_gzipped = content[:2] == b"\x1f\x8b" + + if is_gzipped: + with gzip.GzipFile(fileobj=BytesIO(content), mode="rb") as f: + df = pd.read_csv(f, sep="\t", header=None, engine="pyarrow") + else: + df = pd.read_csv(BytesIO(content), sep="\t", header=None, engine="pyarrow") + + header = [ + "seqnames", + "starts", + "ends", + "name", + "score", + "strand", + 
"thickStart", + "thickEnd", + "itemRgb", + "blockCount", + ] + df.columns = header[: len(df.columns)] + gr = genomicranges.from_pandas(df) + + return gr + + +def get_abs_path(path: str = DEFAULT_CACHE_FOLDER, create_folder: bool = True) -> str: + """ + Get absolute path to the folder and create it if it doesn't exist + + :param path: path to the folder + :param create_folder: create folder if it doesn't exist + + :return: absolute path to the folder + """ + absolute_cache_folder = os.path.expandvars(path) + if create_folder: + Path(absolute_cache_folder).mkdir(parents=True, exist_ok=True) + return absolute_cache_folder diff --git a/geniml/bedshift/BedshiftYAMLHandler.py b/geniml/bedshift/BedshiftYAMLHandler.py new file mode 100644 index 00000000..36c03f46 --- /dev/null +++ b/geniml/bedshift/BedshiftYAMLHandler.py @@ -0,0 +1,193 @@ +import logging +import os +import sys + +import yaml + + +class BedshiftYAMLHandler(object): + def __init__(self, bedshifter, yaml_fp, logger=None): + """ + Handles Bedshift perturbations from yaml files + + :param bedshift.Bedshift bedshifter: a Bedshift object + :param str yaml_fp: path to yaml file + :param logging.logger logger: logger object + """ + self.bedshifter = bedshifter + self.yaml_fp = yaml_fp + if logger is not None: + self._LOGGER = logger + else: + self._LOGGER = logging.getLogger("BedshiftYAMLHandler") + + def _print_sample_config(self): + """ + bedshift_operations: + - add: + rate: 0.1 + mean: 100 + stdev: 20 + - drop_from_file: + file: tests/test.bed + rate: 0.1 + delimiter: \t + - shift_from_file: + file: bedshifted_test.bed + rate: 0.3 + mean: 100 + stdev: 200 + - add_from_file: + file: tests/small_test.bed + rate: 0.2 + - cut: + rate: 0.2 + - drop: + rate: 0.30 + - shift: + rate: 0.05 + mean: 100 + stdev: 200 + - merge: + rate: 0.15 + """ + self._LOGGER.info(self._print_sample_config.__doc__) + + def _read_from_yaml(self, fp): + with open(fp, "r") as yaml_file: + config_data = yaml.load(yaml_file, 
Loader=yaml.FullLoader) + self._LOGGER.info("Loaded configuration settings from {}".format(fp)) + return config_data + + def handle_yaml(self): + """ + Performs perturbations provided in the yaml config file in the order they were provided. + """ + data = self._read_from_yaml(self.yaml_fp) + operations = [operation for operation in data["bedshift_operations"]] + num_changed = 0 + + for operation in operations: + ##### add ##### + if set(["add", "rate", "mean", "stdev"]) == set(list(operation.keys())): + rate = operation["rate"] + mean = operation["mean"] + std = operation["stdev"] + num_added = self.bedshifter.add(rate, mean, std) + num_changed += num_added + + ##### add_from_file with no delimiter provided ##### + elif set(["add_from_file", "file", "rate"]) == set(list(operation.keys())): + fp = operation["file"] + if os.path.isfile(fp): + add_rate = operation["rate"] + num_added = self.bedshifter.add_from_file(fp, add_rate) + num_changed += num_added + else: + self._logger.error("File '{}' does not exist.".format(fp)) + sys.exit(1) + + ##### add_from_file with delimiter provided ##### + elif set(["add_from_file", "file", "rate", "delimiter"]) == set( + list(operation.keys()) + ): + fp = operation["file"] + if os.path.isfile(fp): + add_rate = operation["rate"] + delimiter = operation["delimiter"] + num_added = self.bedshifter.add_from_file(fp, add_rate, delimiter) + num_changed += num_added + else: + self._logger.error("File '{}' does not exist.".format(fp)) + sys.exit(1) + + ##### drop ##### + elif set(["drop", "rate"]) == set(list(operation.keys())): + rate = operation["rate"] + num_dropped = self.bedshifter.drop(rate) + num_changed += num_dropped + + ##### drop_from_file with no delimiter provided ##### + elif set(["drop_from_file", "file", "rate"]) == set(list(operation.keys())): + fp = operation["file"] + if os.path.isfile(fp): + drop_rate = operation["rate"] + num_dropped = self.bedshifter.drop_from_file(fp, drop_rate) + num_changed += num_dropped + else: + 
self._LOGGER.error("File '{}' does not exist.".format(fp)) + sys.exit(1) + + ##### drop_from_file with delimiter provided ##### + elif set(["drop_from_file", "file", "rate", "delimiter"]) == set( + list(operation.keys()) + ): + fp = operation["file"] + if os.path.isfile(fp): + drop_rate = operation["rate"] + delimiter = operation["delimiter"] + num_dropped = self.bedshifter.drop_from_file(fp, drop_rate, delimiter) + num_changed += num_dropped + else: + self._LOGGER.error("File '{}' does not exist.".format(fp)) + sys.exit(1) + + ##### shift ##### + elif set(["shift", "rate", "mean", "stdev"]) == set(list(operation.keys())): + rate = operation["rate"] + mean = operation["mean"] + std = operation["stdev"] + num_shifted = self.bedshifter.shift(rate, mean, std) + num_changed += num_shifted + + ##### shift_from_file ##### + elif set(["shift_from_file", "file", "rate", "mean", "stdev"]) == set( + list(operation.keys()) + ): + fp = operation["file"] + if os.path.isfile(fp): + rate = operation["rate"] + mean = operation["mean"] + std = operation["stdev"] + num_shifted = self.bedshifter.shift_from_file(fp, rate, mean, std) + num_changed += num_shifted + else: + self._LOGGER.error("File '{}' does not exist.".format(fp)) + sys.exit(1) + + ##### shift_from_file with delimiter provided ##### + elif set(["shift_from_file", "file", "rate", "mean", "stdev", "delimiter"]) == set( + list(operation.keys()) + ): + fp = operation["file"] + if os.path.isfile(fp): + rate = operation["rate"] + mean = operation["mean"] + std = operation["stdev"] + delimiter = operation["delimiter"] + num_shifted = self.bedshifter.shift_from_file(fp, rate, mean, std, delimiter) + num_changed += num_shifted + else: + self._LOGGER.error("File '{}' does not exist.".format(fp)) + sys.exit(1) + + ##### cut ##### + elif set(["cut", "rate"]) == set(list(operation.keys())): + rate = operation["rate"] + num_cut = self.bedshifter.cut(rate) + num_changed += num_cut + + ##### merge ##### + elif set(["merge", "rate"]) == 
set(list(operation.keys())): + rate = operation["rate"] + num_merged = self.bedshifter.merge(rate) + num_changed += num_merged + + else: + self._LOGGER.error( + "\n\nInvalid settings entered in the config file. Please refer to the example below.\n\n" + ) + self._print_sample_config() + sys.exit(1) + + return num_changed diff --git a/geniml/bedshift/__init__.py b/geniml/bedshift/__init__.py new file mode 100644 index 00000000..74c96d98 --- /dev/null +++ b/geniml/bedshift/__init__.py @@ -0,0 +1,11 @@ +# Project configuration, particularly for logging. + +import logmuse + +from ._version import __version__ +from .bedshift import Bedshift + +__classes__ = ["Bedshift"] +__all__ = __classes__ + [] + +logmuse.init_logger("bedshift") diff --git a/geniml/bedshift/_version.py b/geniml/bedshift/_version.py new file mode 100644 index 00000000..a82b376d --- /dev/null +++ b/geniml/bedshift/_version.py @@ -0,0 +1 @@ +__version__ = "1.1.1" diff --git a/geniml/bedshift/arguments.py b/geniml/bedshift/arguments.py new file mode 100644 index 00000000..997b453c --- /dev/null +++ b/geniml/bedshift/arguments.py @@ -0,0 +1,141 @@ +import argparse + +from geniml.bedshift._version import __version__ + + +class _VersionInHelpParser(argparse.ArgumentParser): + def format_help(self): + """Add version information to help text.""" + return ( + "version: {}\n".format(__version__) + super(_VersionInHelpParser, self).format_help() + ) + + +def build_argparser(): + """ + Builds argument parser. + + :return: argparse.ArgumentParser + """ + + banner = "%(prog)s - randomize BED files" + additional_description = "\n..." 
+ + parser = _VersionInHelpParser(description=banner, epilog=additional_description) + + parser.add_argument( + "-V", + "--version", + action="version", + version="%(prog)s {v}".format(v=__version__), + ) + + parser.add_argument("-b", "--bedfile", required=True, help="File path to bed file.") + + parser.add_argument( + "-l", + "--chrom-lengths", + type=str, + required=False, + help="TSV text file with one row per chromosomes indicating chromosome sizes", + ) + + parser.add_argument( + "-g", + "--genome", + type=str, + required=False, + help="Refgenie genome identifier (used for chrom sizes).", + ) + + parser.add_argument("-d", "--droprate", type=float, default=0.0, help="Droprate parameter") + + parser.add_argument("-a", "--addrate", type=float, default=0.0, help="Addrate parameter") + + parser.add_argument("--addmean", type=float, default=320.0, help="Mean add region length") + + parser.add_argument("--addstdev", type=float, default=30.0, help="Stdev add length") + + parser.add_argument("--addfile", type=str, help="Add regions from a bedfile") + + parser.add_argument( + "--valid-regions", + type=str, + dest="valid_regions", + help="valid regions in which regions can be randomly added", + ) + + parser.add_argument("-s", "--shiftrate", type=float, default=0.0, help="Shift probability") + + parser.add_argument("--shiftmean", type=float, default=0.0, help="Mean shift") + + parser.add_argument("--shiftstdev", type=float, default=150.0, help="Stdev shift") + + parser.add_argument("--shiftfile", type=str, help="Shift regions from a bedfile") + + parser.add_argument("-c", "--cutrate", type=float, default=0.0, help="Cut probability") + + parser.add_argument( + "-m", + "--mergerate", + type=float, + default=0.0, + help="Merge probability. 
WARNING: will likely create regions that are thousands of base pairs long", + ) + + parser.add_argument("--dropfile", type=str, help="Drop regions from a bedfile") + + parser.add_argument( + "-o", + "--outputfile", + type=str, + help="output file name (including extension). if not specified, will default to bedshifted_{originalname}.bed", + ) + + parser.add_argument( + "-r", + "--repeat", + type=int, + default=1, + help="the number of times to repeat the operation", + ) + + parser.add_argument( + "-y", + "--yaml-config", + dest="yaml_config", + type=str, + help="run yaml configuration file", + ) + + parser.add_argument( + "--seed", + default=None, + help="an integer-valued seed for allowing reproducible perturbations", + ) + + return parser + + +param_msg = """Params: + chrom.sizes file: {chromsizes} + shift: + shift rate: {shiftrate} + shift mean distance: {shiftmean} + shift stdev: {shiftstdev} + shift regions from file: {shiftfile} + add: + rate: {addrate} + add mean length: {addmean} + add stdev: {addstdev} + add regions from file: {addfile} + valid regions: {valid_regions} + cut rate: {cutrate} + drop rate: {droprate} + drop regions from file: {dropfile} + merge rate: {mergerate} + outputfile: {outputfile} + repeat: {repeat} + yaml_config: {yaml_config} + seed: {seed} +""" diff --git a/geniml/bedshift/bedshift.py b/geniml/bedshift/bedshift.py new file mode 100644 index 00000000..8668d2dd --- /dev/null +++ b/geniml/bedshift/bedshift.py @@ -0,0 +1,687 @@ +"""Perturb regions in bedfiles""" + +import logging +import math +import os +import random +import sys + +import logmuse +import numpy as np +import pandas as pd + +from geniml.bedshift import BedshiftYAMLHandler, arguments +from geniml.bedshift._version import __version__ + +_LOGGER = logging.getLogger(__name__) + +__all__ = ["Bedshift"] + + +class Bedshift(object): + """ + The bedshift object with methods to perturb regions + """ + + def __init__(self, bedfile_path, chrom_sizes=None, delimiter="\t"): + """ + 
Read in a .bed file to pandas DataFrame format + + :param str bedfile_path: the path to the BED file + :param str chrom_sizes: the path to the chrom.sizes file + :param str delimiter: the delimiter used in the BED file + """ + self.bedfile_path = bedfile_path + self.chrom_lens = {} + if chrom_sizes: + self._read_chromsizes(chrom_sizes) + df = self.read_bed(bedfile_path, delimiter=delimiter) + self.original_num_regions = df.shape[0] + self.bed = ( + df.astype({0: "object", 1: "int64", 2: "int64", 3: "object"}) + .sort_values([0, 1, 2]) + .reset_index(drop=True) + ) + self.original_bed = self.bed.copy() + + def _read_chromsizes(self, fp): + """ + Read chromosome sizes file + + :param str fp: path to the chrom sizes file + """ + try: + with open(fp) as f: + for line in f: + line = line.strip().split("\t") + chrom = str(line[0]) + size = int(line[1]) + self.chrom_lens[chrom] = size + except FileNotFoundError: + msg = "Fasta file path {} invalid".format(fp) + _LOGGER.error(msg) + raise FileNotFoundError(msg) + + total_len = sum(self.chrom_lens.values()) + self.chrom_weights = [chrom_len / total_len for chrom_len in self.chrom_lens.values()] + + def reset_bed(self): + """ + Reset the stored bedfile to the state before perturbations + """ + self.bed = self.original_bed.copy() + + def _precheck(self, rate, requiresChromLens=False, isAdd=False): + """ + Check if the rate of perturbation is too high or low + + :param float rate: the rate of perturbation + :param bool requiresChromLens: check if the perturbation requires a chromosome lengths file + :param bool isAdd: if True, do a special check for the add rate + """ + if isAdd: + if rate < 0: + msg = "Rate must be greater than 0" + _LOGGER.error(msg) + raise ValueError(msg) + else: + if rate < 0 or rate > 1: + msg = "Rate must be between 0 and 1" + _LOGGER.error(msg) + raise ValueError(msg) + if requiresChromLens: + if len(self.chrom_lens) == 0: + msg = "chrom.sizes file must be specified" + _LOGGER.error(msg) + raise 
FileNotFoundError(msg) + + def pick_random_chroms(self, n): + """ + Utility function to pick a random chromosome + + :param str n: the number of random chromosomes to pick + :return str, float chrom_str, chrom_len: chromosome number and length + """ + chrom_strs = random.choices(list(self.chrom_lens.keys()), weights=self.chrom_weights, k=n) + chrom_lens = [self.chrom_lens[chrom_str] for chrom_str in chrom_strs] + return zip(chrom_strs, chrom_lens) + + def add(self, addrate, addmean, addstdev, valid_bed=None, delimiter="\t"): + """ + Add regions + + :param float addrate: the rate to add regions + :param float addmean: the mean length of added regions + :param float addstdev: the standard deviation of the length of added regions + :param str valid_bed: the file with valid regions where new regions can be added + :param str delimiter: the delimiter used in valid_bed + :return int: the number of regions added + """ + if valid_bed: + self._precheck(addrate, requiresChromLens=False, isAdd=True) + else: + self._precheck(addrate, requiresChromLens=True, isAdd=True) + + rows = self.bed.shape[0] + num_add = int(rows * addrate) + new_regions = {0: [], 1: [], 2: [], 3: []} + if valid_bed: + valid_regions = self.read_bed(valid_bed, delimiter) + valid_regions[3] = valid_regions[2] - valid_regions[1] + total_bp = valid_regions[3].sum() + valid_regions[4] = valid_regions[3].apply(lambda x: x / total_bp) + add_rows = random.choices( + list(range(len(valid_regions))), + weights=list(valid_regions[4]), + k=num_add, + ) + for row in add_rows: + data = valid_regions.loc[row] + chrom = data[0] + start = random.randint(data[1], data[2]) + end = start + int(np.random.normal(addmean, addstdev)) + new_regions[0].append(chrom) + new_regions[1].append(start) + new_regions[2].append(end) + new_regions[3].append("A") + else: + random_chroms = self.pick_random_chroms(num_add) + for chrom_str, chrom_len in random_chroms: + start = random.randint(1, chrom_len) + # ensure chromosome length is not 
exceeded + end = min(start + int(np.random.normal(addmean, addstdev)), chrom_len) + new_regions[0].append(chrom_str) + new_regions[1].append(start) + new_regions[2].append(end) + new_regions[3].append("A") + self.bed = pd.concat([self.bed, pd.DataFrame(new_regions)], ignore_index=True) + return num_add + + def add_from_file(self, fp, addrate, delimiter="\t"): + """ + Add regions from another bedfile to this perturbed bedfile + + :param float addrate: the rate to add regions + :param str fp: the filepath to the other bedfile + :return int: the number of regions added + """ + self._precheck(addrate, requiresChromLens=False, isAdd=True) + + rows = self.bed.shape[0] + num_add = int(rows * addrate) + df = self.read_bed(fp, delimiter=delimiter) + dflen = len(df) + if num_add > dflen: + _LOGGER.warning( + "Number of regions to be added ({}) is larger than the provided bedfile size ({}). Adding {} regions.".format( + num_add, dflen, dflen + ) + ) + num_add = dflen + add_rows = random.sample(list(range(dflen)), num_add) + add_df = df.loc[add_rows].reset_index(drop=True) + add_df[3] = pd.Series(["A"] * add_df.shape[0]) + self.bed = pd.concat([self.bed, add_df], ignore_index=True) + return num_add + + def shift(self, shiftrate, shiftmean, shiftstdev, shift_rows=[]): + """ + Shift regions + + :param float shiftrate: the rate to shift regions (both the start and end are shifted by the same amount) + :param float shiftmean: the mean shift distance + :param float shiftstdev: the standard deviation of the shift distance + :return int: the number of regions shifted + """ + self._precheck(shiftrate, requiresChromLens=True) + + rows = self.bed.shape[0] + if len(shift_rows) == 0: + shift_rows = random.sample(list(range(rows)), int(rows * shiftrate)) + new_row_list = [] + to_drop = [] + num_shifted = 0 + invalid_shifted = 0 + for row in shift_rows: + drop_row, new_region = self._shift( + row, shiftmean, shiftstdev + ) # shifted rows display a 1 + if drop_row is not None and new_region: 
+ num_shifted += 1 + new_row_list.append(new_region) + to_drop.append(drop_row) + else: + invalid_shifted += 1 + self.bed = self.bed.drop(to_drop) + self.bed = pd.concat([self.bed, pd.DataFrame(new_row_list)], ignore_index=True) + self.bed = self.bed.reset_index(drop=True) + if invalid_shifted > 0: + _LOGGER.warning( + f"{invalid_shifted} regions were prevented from being shifted outside of chromosome boundaries. Reported regions shifted will be less than expected." + ) + return num_shifted + + def _shift(self, row, mean, stdev): + theshift = int(np.random.normal(mean, stdev)) + + chrom = self.bed.loc[row][0] + start = self.bed.loc[row][1] + end = self.bed.loc[row][2] + if start + theshift < 0 or end + theshift > self.chrom_lens[str(chrom)]: + # check if the region is shifted out of chromosome length bounds + return None, None + + return row, {0: chrom, 1: start + theshift, 2: end + theshift, 3: "S"} + + def shift_from_file(self, fp, shiftrate, shiftmean, shiftstdev, delimiter="\t"): + """ + Shift regions that overlap the specified file's regions + + :param str fp: the file on which to find overlaps + :param float shiftrate: the rate to shift regions (both the start and end are shifted by the same amount) + :param float shiftmean: the mean shift distance + :param float shiftstdev: the standard deviation of the shift distance + :param str delimiter: the delimiter used in fp + :return int: the number of regions shifted + """ + self._precheck(shiftrate, requiresChromLens=True) + + rows = self.bed.shape[0] + num_shift = int(rows * shiftrate) + + intersect_regions = self._find_overlap(fp) + original_colnames = self.bed.columns + intersect_regions.columns = [str(col) for col in intersect_regions.columns] + self.bed.columns = [str(col) for col in self.bed.columns] + indices_of_overlap_regions = self.bed.reset_index().merge(intersect_regions)["index"] + self.bed.columns = [int(col) for col in self.bed.columns] + + interlen = len(indices_of_overlap_regions) + if num_shift > 
interlen: + _LOGGER.warning( + "Desired regions shifted ({}) is greater than the number of overlaps found ({}). Shifting {} regions.".format( + num_shift, interlen, interlen + ) + ) + num_shift = len(indices_of_overlap_regions) + + elif interlen > num_shift: + indices_of_overlap_regions = indices_of_overlap_regions.sample(n=num_shift) + + indices_of_overlap_regions = indices_of_overlap_regions.to_list() + + return self.shift(shiftrate, shiftmean, shiftstdev, indices_of_overlap_regions) + + def cut(self, cutrate): + """ + Cut regions to create two new regions + + :param float cutrate: the rate to cut regions into two separate regions + :return int: the number of regions cut + """ + self._precheck(cutrate) + + rows = self.bed.shape[0] + cut_rows = random.sample(list(range(rows)), int(rows * cutrate)) + new_row_list = [] + to_drop = [] + for row in cut_rows: + drop_row, new_regions = self._cut(row) # cut rows display a 2 + new_row_list.extend(new_regions) + to_drop.append(drop_row) + self.bed = self.bed.drop(to_drop) + self.bed = pd.concat([self.bed, pd.DataFrame(new_row_list)], ignore_index=True) + self.bed = self.bed.reset_index(drop=True) + return len(cut_rows) + + def _cut(self, row): + chrom = self.bed.loc[row][0] + start = self.bed.loc[row][1] + end = self.bed.loc[row][2] + + # choose where to cut the region + thecut = (start + end) // 2 # int(np.random.normal((start+end)/2, (end - start)/6)) + if thecut <= start: + thecut = start + 10 + if thecut >= end: + thecut = end - 10 + + """ may add in later, this makes the api confusing! 
+ # adjust the cut regions using the shift function + new_segs = self.__shift(new_segs, 0, meanshift, stdevshift) + new_segs = self.__shift(new_segs, 1, meanshift, stdevshift) + """ + + return ( + row, + [ + {0: chrom, 1: start, 2: thecut, 3: "C"}, + {0: chrom, 1: thecut, 2: end, 3: "C"}, + ], + ) + + def merge(self, mergerate): + """ + Merge two regions into one new region + + :param float mergerate: the rate to merge two regions into one + :return int: number of regions merged + """ + self._precheck(mergerate) + + rows = self.bed.shape[0] + merge_rows = random.sample(list(range(rows)), int(rows * mergerate)) + to_add = [] + to_drop = [] + for row in merge_rows: + drop_rows, add_row = self._merge(row) + if drop_rows and add_row: + to_add.append(add_row) + to_drop.extend(drop_rows) + self.bed = self.bed.drop(to_drop) + self.bed = pd.concat([self.bed, pd.DataFrame(to_add)], ignore_index=True) + self.bed = self.bed.reset_index(drop=True) + return len(to_drop) + + def _merge(self, row): + # check if the regions being merged are on the same chromosome + if row + 1 not in self.bed.index or self.bed.loc[row][0] != self.bed.loc[row + 1][0]: + return None, None + + chrom = self.bed.loc[row][0] + start = self.bed.loc[row][1] + end = self.bed.loc[row + 1][2] + return [row, row + 1], {0: chrom, 1: start, 2: end, 3: "M"} + + def drop(self, droprate): + """ + Drop regions + + :param float droprate: the rate to drop/remove regions + :return int: the number of rows dropped + """ + self._precheck(droprate) + + rows = self.bed.shape[0] + drop_rows = random.sample(list(range(rows)), int(rows * droprate)) + self.bed = self.bed.drop(drop_rows) + self.bed = self.bed.reset_index(drop=True) + return len(drop_rows) + + def drop_from_file(self, fp, droprate, delimiter="\t"): + """ + drop regions that overlap between the reference bedfile and the provided bedfile. 
+ + :param float droprate: the rate to drop regions + :param str fp: the filepath to the other bedfile containing regions to be dropped + :return int: the number of regions dropped + """ + self._precheck(droprate) + + rows = self.bed.shape[0] + num_drop = int(rows * droprate) + drop_bed = self.read_bed(fp, delimiter=delimiter) + + intersect_regions = self._find_overlap(drop_bed) + original_colnames = self.bed.columns + intersect_regions.columns = [str(col) for col in intersect_regions.columns] + self.bed.columns = [str(col) for col in self.bed.columns] + indices_of_overlap_regions = self.bed.reset_index().merge(intersect_regions)["index"] + self.bed.columns = [int(col) for col in self.bed.columns] + + interlen = len(indices_of_overlap_regions) + if num_drop > interlen: + _LOGGER.warning( + "Desired regions dropped ({}) is greater than the number of overlaps found ({}). Dropping {} regions.".format( + num_drop, interlen, interlen + ) + ) + num_drop = len(indices_of_overlap_regions) + elif interlen > num_drop: + indices_of_overlap_regions = indices_of_overlap_regions.sample(n=num_drop) + indices_of_overlap_regions = indices_of_overlap_regions.to_list() + + self.bed = self.bed.drop(indices_of_overlap_regions) + return num_drop + + def set_seed(self, seednum): + try: + seednum = int(seednum) + random.seed(seednum) + np.random.seed(seednum) + except ValueError: + msg = "Seed should be an integer, not {}.".format(type(seednum)) + _LOGGER.error(msg) + raise ValueError(msg) + + def _find_overlap(self, fp, reference=None): + """ + Find intersecting regions between the reference bedfile and the comparison file provided in the yaml config file. + + :param str fp: path to file, or pandas DataFrame, for comparison + :param str reference: path to file, or pandas DataFrame, for reference. 
If None, then defaults to the original BED file provided to the Bedshift constructor + :return pd.DataFrame: a DataFrame of overlapping regions + """ + if reference is None: + reference_bed = self.original_bed.copy() + else: + if isinstance(reference, pd.DataFrame): + reference_bed = reference.copy() + elif isinstance(reference, str): + reference_bed = self.read_bed(reference) + else: + raise Exception("unsupported input type: {}".format(type(reference))) + if isinstance(fp, pd.DataFrame): + comparison_bed = fp.copy() + elif isinstance(fp, str): + comparison_bed = self.read_bed(fp) + else: + raise Exception("unsupported input type: {}".format(type(reference))) + reference_bed.columns = ["Chromosome", "Start", "End", "modifications"] + comparison_bed.columns = ["Chromosome", "Start", "End", "modifications"] + # TODO, switch this overlap calculation to use genomicranges + raise NotImplementedError( + "This relies on pyranges, which was removed with the switch to geniml." + ) + + # USE AILIST HERE: + reference_pr = pr.PyRanges(reference_bed) + comparison_pr = pr.PyRanges(comparison_bed) + intersection = reference_pr.overlap(comparison_pr, how="first").as_df() + if len(intersection) == 0: + raise Exception( + "no intersection found between {} and {}".format(reference_bed, comparison_bed) + ) + intersection = intersection.drop(["modifications"], axis=1) + intersection.columns = [0, 1, 2] + return intersection + + def all_perturbations( + self, + addrate=0.0, + addmean=320.0, + addstdev=30.0, + addfile=None, + valid_regions=None, + shiftrate=0.0, + shiftmean=0.0, + shiftstdev=150.0, + shiftfile=None, + cutrate=0.0, + mergerate=0.0, + droprate=0.0, + dropfile=None, + yaml=None, + seed=None, + ): + """ + Perform all five perturbations in the order of shift, add, cut, merge, drop. 
+ + :param float addrate: the rate (as a proportion of the total number of regions) to add regions + :param float addmean: the mean length of added regions + :param float addstdev: the standard deviation of the length of added regions + :param str addfile: the file containing regions to be added + :param str valid_regions: the file containing regions where new regions can be added + :param float shiftrate: the rate to shift regions (both the start and end are shifted by the same amount) + :param float shiftmean: the mean shift distance + :param float shiftstdev: the standard deviation of the shift distance + :param str shiftfile: the file containing regions to be shifted + :param float cutrate: the rate to cut regions into two separate regions + :param float mergerate: the rate to merge two regions into one + :param float droprate: the rate to drop/remove regions + :param str dropfile: the file containing regions to be dropped + :param str yaml: the yaml_config filepath + :param bedshift.Bedshift bedshifter: Bedshift instance + :param int seed: a seed for allowing reproducible perturbations + :return int: the number of total regions perturbed + """ + if seed: + self.set_seed(seed) + if yaml: + return BedshiftYAMLHandler.BedshiftYAMLHandler(self, yaml).handle_yaml() + n = 0 + if shiftrate > 0: + if shiftfile: + n += self.shift_from_file(shiftfile, shiftrate, shiftmean, shiftstdev) + else: + n += self.shift(shiftrate, shiftmean, shiftstdev) + if addrate > 0: + if addfile: + n += self.add_from_file(addfile, addrate) + else: + n += self.add(addrate, addmean, addstdev, valid_regions) + if cutrate > 0: + n += self.cut(cutrate) + if mergerate > 0: + n += self.merge(mergerate) + if droprate > 0: + if dropfile: + n += self.drop_from_file(dropfile, droprate) + else: + n += self.drop(droprate) + + return n + + def to_bed(self, outfile_name): + """ + Write a pandas dataframe back into BED file format + + :param str outfile_name: The name of the output BED file + """ + 
self.bed.sort_values([0, 1, 2], inplace=True) + self.bed.to_csv(outfile_name, sep="\t", header=False, index=False, float_format="%.0f") + + def read_bed(self, bedfile_path, delimiter="\t"): + """ + Read a BED file into pandas dataframe + + :param str bedfile_path: The path to the BED file + """ + try: + df = pd.read_csv( + bedfile_path, + sep=delimiter, + header=None, + usecols=[0, 1, 2], + engine="python", + ) + except FileNotFoundError: + msg = "BED file path {} invalid".format(bedfile_path) + _LOGGER.error(msg) + raise FileNotFoundError(msg) + except: + msg = "File {} could not be read".format(bedfile_path) + _LOGGER.error(msg) + raise Exception(msg) + + # if there is a header line in the table, remove it + if not str(df.iloc[0, 1]).isdigit(): + df = df[1:].reset_index(drop=True) + + df[3] = "-" # column indicating which modifications were made + return df + + +def main(): + """Primary workflow""" + + parser = logmuse.add_logging_options(arguments.build_argparser()) + args, remaining_args = parser.parse_known_args() + global _LOGGER + _LOGGER = logmuse.logger_via_cli(args) + + _LOGGER.info("Welcome to bedshift version {}".format(__version__)) + _LOGGER.info("Shifting file: '{}'".format(args.bedfile)) + + if not args.bedfile: + parser.print_help() + msg = "No BED file given" + _LOGGER.error(msg) + raise MissingArgumentError(msg) + + if args.chrom_lengths: + pass + elif args.genome: + try: + import refgenconf + + rgc = refgenconf.RefGenConf(refgenconf.select_genome_config()) + args.chrom_lengths = rgc.seek(args.genome, "fasta", None, "chrom_sizes") + except ModuleNotFoundError: + msg = "You must have package refgenconf installed to use a refgenie genome" + _LOGGER.error(msg) + raise ModuleNotFoundError(msg) + + msg = arguments.param_msg + + if args.repeat < 1: + msg = "Repeats must be greater than 0" + _LOGGER.error(msg) + raise ValueError(msg) + + if args.outputfile: + outfile_base = args.outputfile + else: + outfile_base = 
"bedshifted_{}".format(os.path.basename(args.bedfile)) + + _LOGGER.info( + msg.format( + bedfile=args.bedfile, + chromsizes=args.chrom_lengths, + droprate=args.droprate, + dropfile=args.dropfile, + addrate=args.addrate, + addmean=args.addmean, + addstdev=args.addstdev, + addfile=args.addfile, + valid_regions=args.valid_regions, + shiftrate=args.shiftrate, + shiftmean=args.shiftmean, + shiftstdev=args.shiftstdev, + shiftfile=args.shiftfile, + cutrate=args.cutrate, + mergerate=args.mergerate, + outputfile=outfile_base, + repeat=args.repeat, + yaml_config=args.yaml_config, + seed=args.seed, + ) + ) + + bedshifter = Bedshift(args.bedfile, args.chrom_lengths) + _LOGGER.info(f"Generating {args.repeat} repetitions...") + + pct_reports = [int(x * args.repeat / 100) for x in [5, 25, 50, 75, 100]] + + for i in range(args.repeat): + n = bedshifter.all_perturbations( + args.addrate, + args.addmean, + args.addstdev, + args.addfile, + args.valid_regions, + args.shiftrate, + args.shiftmean, + args.shiftstdev, + args.shiftfile, + args.cutrate, + args.mergerate, + args.droprate, + args.dropfile, + args.yaml_config, + args.seed, + ) + if args.repeat == 1: + bedshifter.to_bed(outfile_base) + _LOGGER.info( + "REGION COUNT | original: {}\tnew: {}\tchanged: {}\t\noutput file: {}".format( + bedshifter.original_num_regions, + bedshifter.bed.shape[0], + str(n), + outfile_base, + ) + ) + else: + basename, ext = os.path.splitext(os.path.basename(outfile_base)) + dirname = os.path.dirname(outfile_base) + digits = int(math.log10(args.repeat)) + 1 + + rep = str(i + 1).zfill(digits) + modified_outfile_path = os.path.join(dirname, f"{basename}_rep{rep}{ext}") + bedshifter.to_bed(modified_outfile_path) + + pct_finished = int((100 * (i + 1)) / args.repeat) + if i + 1 in pct_reports: + _LOGGER.info( + f"Rep {i+1}. Finished: {pct_finished}%. 
Output file: {modified_outfile_path}" + ) + + bedshifter.reset_bed() + + +if __name__ == "__main__": + try: + sys.exit(main()) + except KeyboardInterrupt: + _LOGGER.error("Program canceled by user!") + sys.exit(1) diff --git a/gitk/bedspace/__init__.py b/geniml/bedspace/__init__.py similarity index 100% rename from gitk/bedspace/__init__.py rename to geniml/bedspace/__init__.py diff --git a/gitk/bedspace/_version.py b/geniml/bedspace/_version.py similarity index 100% rename from gitk/bedspace/_version.py rename to geniml/bedspace/_version.py diff --git a/gitk/bedspace/argparsers.py b/geniml/bedspace/argparsers.py similarity index 92% rename from gitk/bedspace/argparsers.py rename to geniml/bedspace/argparsers.py index fbbc9a72..2fd44ea5 100644 --- a/gitk/bedspace/argparsers.py +++ b/geniml/bedspace/argparsers.py @@ -4,7 +4,9 @@ from .const import * -def build_preprocess_argparser(parser: VersionInHelpParser) -> VersionInHelpParser: +def build_preprocess_argparser( + parser: VersionInHelpParser, +) -> VersionInHelpParser: parser.add_argument( "-i", "--input", @@ -12,9 +14,7 @@ def build_preprocess_argparser(parser: VersionInHelpParser) -> VersionInHelpPars help="Path to input bed files", ) - parser.add_argument( - "-m", "--metadata", dest="metadata", help="Path to metadata file" - ) + parser.add_argument("-m", "--metadata", dest="metadata", help="Path to metadata file") parser.add_argument( "-u", @@ -85,7 +85,9 @@ def build_train_argparser(parser: VersionInHelpParser) -> VersionInHelpParser: ) -def build_distance_argparser(parser: VersionInHelpParser) -> VersionInHelpParser: +def build_distance_argparser( + parser: VersionInHelpParser, +) -> VersionInHelpParser: parser.add_argument( "-i", "--input", diff --git a/gitk/bedspace/cli.py b/geniml/bedspace/cli.py similarity index 100% rename from gitk/bedspace/cli.py rename to geniml/bedspace/cli.py diff --git a/gitk/bedspace/const.py b/geniml/bedspace/const.py similarity index 100% rename from gitk/bedspace/const.py 
rename to geniml/bedspace/const.py diff --git a/gitk/bedspace/pipeline/bashcode.sh b/geniml/bedspace/pipeline/bashcode.sh similarity index 100% rename from gitk/bedspace/pipeline/bashcode.sh rename to geniml/bedspace/pipeline/bashcode.sh diff --git a/gitk/bedspace/pipeline/bedspace_queryDBsim.py b/geniml/bedspace/pipeline/bedspace_queryDBsim.py similarity index 92% rename from gitk/bedspace/pipeline/bedspace_queryDBsim.py rename to geniml/bedspace/pipeline/bedspace_queryDBsim.py index b92da26f..d10191b0 100644 --- a/gitk/bedspace/pipeline/bedspace_queryDBsim.py +++ b/geniml/bedspace/pipeline/bedspace_queryDBsim.py @@ -26,8 +26,8 @@ from ubiquerg import VersionInHelpParser -def data_preprocessing(path_embeded_document): - document_embedding = pd.read_csv(path_embeded_document, header=None) +def data_preprocessing(path_embedded_document): + document_embedding = pd.read_csv(path_embedded_document, header=None) document_embedding = document_embedding[0].str.split("__label__", expand=True) document_embedding[list(document_embedding)[1:]] = document_embedding[ list(document_embedding)[1:] @@ -60,9 +60,7 @@ def calculate_distance(X_files, X_labels, y_files, y_labels): value_name="score", ) scaler = MinMaxScaler() - file_distance["score"] = scaler.fit_transform( - np.array(file_distance["score"]).reshape(-1, 1) - ) + file_distance["score"] = scaler.fit_transform(np.array(file_distance["score"]).reshape(-1, 1)) return file_distance @@ -72,7 +70,6 @@ def meta_preprocessing(meta): labels = [] for l in args.labels.split(","): if l in meta.index: - labels.append(l) labels.insert(0, "file_name") meta = meta[labels] @@ -165,13 +162,9 @@ def main(): # define file path model = os.path.join(args.output_path, "starspace_model_{}".format(assembly)) - dist = os.path.join( - args.output_path, "query_db_similarity_score_{}.csv".format(assembly) - ) + dist = os.path.join(args.output_path, "query_db_similarity_score_{}.csv".format(assembly)) - doc_embed_dB = bed2vec( - file_list_db, 
universe, model, assembly, "DB", args.output_path - ) + doc_embed_dB = bed2vec(file_list_db, universe, model, assembly, "DB", args.output_path) db_vectors = data_preprocessing(doc_embed_dB) @@ -185,7 +178,6 @@ def main(): print(query_vectors.shape) for i in range(len(query_vectors)): - query_vector = query_vectors[i] df_similarity = calculate_distance( diff --git a/gitk/bedspace/pipeline/bedspace_test.py b/geniml/bedspace/pipeline/bedspace_test_model.py similarity index 86% rename from gitk/bedspace/pipeline/bedspace_test.py rename to geniml/bedspace/pipeline/bedspace_test_model.py index 87e7dc05..c42f1e9b 100755 --- a/gitk/bedspace/pipeline/bedspace_test.py +++ b/geniml/bedspace/pipeline/bedspace_test_model.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 """ -bedfile embeding pipeline (test) +bedfile embedding pipeline (test) """ import argparse import datetime @@ -30,8 +30,8 @@ from ubiquerg import VersionInHelpParser -def data_preprocessing(path_embeded_document): - document_embedding = pd.read_csv(path_embeded_document, header=None) +def data_preprocessing(path_embedded_document): + document_embedding = pd.read_csv(path_embedded_document, header=None) document_embedding = document_embedding[0].str.split("__label__", expand=True) document_embedding[list(document_embedding)[1:]] = document_embedding[ list(document_embedding)[1:] @@ -53,9 +53,7 @@ def label_preprocessing(path_word_embedding, label_prefix): labels = [] label_vectors = [] word_embedding = pd.read_csv(path_word_embedding, sep="\t", header=None) - vectors = word_embedding[ - word_embedding[0].str.contains(label_prefix) - ] # .reset_index() + vectors = word_embedding[word_embedding[0].str.contains(label_prefix)] # .reset_index() for l in range(len(vectors)): label_vectors.append((list(vectors.iloc[l])[1:])) labels.append(list(vectors.iloc[l])[0].replace(label_prefix, "")) @@ -76,9 +74,7 @@ def calculate_distance(X_files, X_labels, y_files, y_labels): value_name="score", ) scaler = MinMaxScaler() - 
file_distance["score"] = scaler.fit_transform( - np.array(file_distance["score"]).reshape(-1, 1) - ) + file_distance["score"] = scaler.fit_transform(np.array(file_distance["score"]).reshape(-1, 1)) return file_distance @@ -87,7 +83,6 @@ def meta_preprocessing(meta): labels = [] for l in args.labels.split(","): if l in meta.index: - labels.append(l) labels.insert(0, "file_name") meta = meta[labels] @@ -158,14 +153,10 @@ def main(): # define file path model = os.path.join(args.output_path, "starspace_model_{}".format(assembly)) - label_embed = os.path.join( - args.output_path, "starspace_model_{}.tsv".format(assembly) - ) + label_embed = os.path.join(args.output_path, "starspace_model_{}.tsv".format(assembly)) docs = os.path.join(args.output_path, "documents_{}.txt".format(assembly)) # files = os.path.join(args.output_path, "filenames_{}.txt".format(assembly)) - doc_embed = os.path.join( - args.output_path, "train_starspace_embed_{}.txt".format(assembly) - ) + doc_embed = os.path.join(args.output_path, "train_starspace_embed_{}.txt".format(assembly)) dist = os.path.join(args.output_path, "similarity_score_{}.csv".format(assembly)) embedding_labels, labels = label_preprocessing(label_embed, label_prefix) @@ -173,9 +164,7 @@ def main(): # Data prepration trained_documents = [] with Pool(n_process) as p: - trained_documents = p.starmap( - data_prepration_test, [(x, universe) for x in file_list] - ) + trained_documents = p.starmap(data_prepration_test, [(x, universe) for x in file_list]) p.close() p.join() print("Reading files done") @@ -213,15 +202,11 @@ def main(): df_similarity["filename"] = [file_list[i].split(",")[0]] * len(labels) - df_similarity = df_similarity[ - ["filename", "file_label", "search_term", "score"] - ] + df_similarity = df_similarity[["filename", "file_label", "search_term", "score"]] # filter res by dist threshold thresh = 0.5 - df_similarity = df_similarity[df_similarity["score"] < thresh].reset_index( - drop=True - ) + df_similarity = 
df_similarity[df_similarity["score"] < thresh].reset_index(drop=True) if os.path.exists(dist): df_similarity.to_csv(dist, header=False, index=None, mode="a") diff --git a/gitk/bedspace/pipeline/bedspace_train.py b/geniml/bedspace/pipeline/bedspace_train.py similarity index 94% rename from gitk/bedspace/pipeline/bedspace_train.py rename to geniml/bedspace/pipeline/bedspace_train.py index ad6ab650..3c393677 100755 --- a/gitk/bedspace/pipeline/bedspace_train.py +++ b/geniml/bedspace/pipeline/bedspace_train.py @@ -1,8 +1,7 @@ """ -bedfile embeding pipeline (train) +bedfile embedding pipeline (train) """ - import argparse import itertools import os @@ -133,9 +132,7 @@ # StarSpace Parameters universe = pybedtools.BedTool(args.univ_path) # define file path -train_files = os.path.join( - args.output_path, "documents_file_{}.txt".format(args.genome) -) +train_files = os.path.join(args.output_path, "documents_file_{}.txt".format(args.genome)) model = os.path.join(args.output_path, "starspace_model_{}".format(args.genome)) @@ -156,10 +153,7 @@ def main(): with Pool(n_process) as p: trained_documents = p.starmap( data_prepration, - [ - (x, universe) - for x in file_list[args.start_line : args.start_line + args.no_files] - ], + [(x, universe) for x in file_list[args.start_line : args.start_line + args.no_files]], ) p.close() p.join() diff --git a/gitk/bedspace/pipeline/distances.py b/geniml/bedspace/pipeline/distances.py similarity index 88% rename from gitk/bedspace/pipeline/distances.py rename to geniml/bedspace/pipeline/distances.py index bdb10e11..6dd9b6e1 100644 --- a/gitk/bedspace/pipeline/distances.py +++ b/geniml/bedspace/pipeline/distances.py @@ -73,8 +73,8 @@ def bed2vec(file_list, universe, model, source, output_path, path_to_starsapce): return doc_embed -def data_preprocessing(path_embeded_document): - document_embedding = pd.read_csv(path_embeded_document, header=None) +def data_preprocessing(path_embedded_document): + document_embedding = 
pd.read_csv(path_embedded_document, header=None) document_embedding = document_embedding[0].str.split("__label__", expand=True) document_embedding[list(document_embedding)[1:]] = document_embedding[ list(document_embedding)[1:] @@ -96,9 +96,7 @@ def label_preprocessing(path_word_embedding, label_prefix): labels = [] label_vectors = [] word_embedding = pd.read_csv(path_word_embedding, sep="\t", header=None) - vectors = word_embedding[ - word_embedding[0].str.contains(label_prefix) - ] # .reset_index() + vectors = word_embedding[word_embedding[0].str.contains(label_prefix)] # .reset_index() for l in range(len(vectors)): label_vectors.append((list(vectors.iloc[l])[1:])) labels.append(list(vectors.iloc[l])[0].replace(label_prefix, "")) @@ -111,9 +109,7 @@ def calculate_distance(X_files, X_labels, y_files, y_labels): distance_matrix = distance.cdist(X_files, X_labels, "cosine") df_distance_matrix = pd.DataFrame(distance_matrix) df_distance_matrix.columns = y_labels - df_distance_matrix["file_label"] = [ - y_files[i].split(",")[1] for i in range(len(y_files)) - ] + df_distance_matrix["file_label"] = [y_files[i].split(",")[1] for i in range(len(y_files))] file_distance = pd.melt( df_distance_matrix, id_vars="file_label", @@ -121,9 +117,7 @@ def calculate_distance(X_files, X_labels, y_files, y_labels): value_name="score", ) scaler = MinMaxScaler() - file_distance["score"] = scaler.fit_transform( - np.array(file_distance["score"]).reshape(-1, 1) - ) + file_distance["score"] = scaler.fit_transform(np.array(file_distance["score"]).reshape(-1, 1)) return file_distance @@ -141,9 +135,7 @@ def calculate_distance_qc(X_files, X_labels, y_files, y_labels): value_name="score", ) scaler = MinMaxScaler() - file_distance["score"] = scaler.fit_transform( - np.array(file_distance["score"]).reshape(-1, 1) - ) + file_distance["score"] = scaler.fit_transform(np.array(file_distance["score"]).reshape(-1, 1)) return file_distance @@ -206,17 +198,15 @@ def main( df_similarity = 
calculate_distance(Xs, embedding_labels, file_list, labels_l) - df_similarity["filename"] = [ - file_list[i].split(",")[0] for i in range(len(file_list)) - ] * len(labels_l) + df_similarity["filename"] = [file_list[i].split(",")[0] for i in range(len(file_list))] * len( + labels_l + ) df_similarity = df_similarity[["filename", "file_label", "search_term", "score"]] # filter res by dist threshold - df_similarity = df_similarity[df_similarity["score"] > threshold].reset_index( - drop=True - ) + df_similarity = df_similarity[df_similarity["score"] > threshold].reset_index(drop=True) df_similarity["score"] = 1 - df_similarity["score"] @@ -229,9 +219,7 @@ def main( distance_file_path_rr = os.path.join(output, "similarity_score_rr.csv") - doc_embed_dB = bed2vec( - file_list_db, universe, input, "DB", temp_path, path_to_starsapce - ) + doc_embed_dB = bed2vec(file_list_db, universe, input, "DB", temp_path, path_to_starsapce) db_vectors = data_preprocessing(doc_embed_dB) @@ -241,9 +229,7 @@ def main( query_vectors = data_preprocessing(doc_embed_query) - df_similarity = calculate_distance_qc( - db_vectors, query_vectors, file_list_db, file_list_query - ) + df_similarity = calculate_distance_qc(db_vectors, query_vectors, file_list_db, file_list_query) df_similarity = df_similarity[["db_file", "test_file", "score"]] diff --git a/gitk/bedspace/pipeline/helpers.py b/geniml/bedspace/pipeline/helpers.py similarity index 90% rename from gitk/bedspace/pipeline/helpers.py rename to geniml/bedspace/pipeline/helpers.py index 5bae227d..07dae33b 100644 --- a/gitk/bedspace/pipeline/helpers.py +++ b/geniml/bedspace/pipeline/helpers.py @@ -10,9 +10,7 @@ def data_prepration(path_file_label: str, univ: str): path_file_label = path_file_label.split(",") path_file = path_file_label[0] - labels = " ".join( - ["__label__" + label for label in path_file_label[1:] if label != ""] - ) + labels = " ".join(["__label__" + label for label in path_file_label[1:] if label != ""]) if 
os.path.exists(path_file): try: df = pybedtools.BedTool(path_file) @@ -28,7 +26,10 @@ def data_prepration(path_file_label: str, univ: str): + "_" + file_regions["end"].astype(str) ) - return [path_file, " ".join(list(file_regions["region"])) + " " + labels] + return [ + path_file, + " ".join(list(file_regions["region"])) + " " + labels, + ] except Exception: print("Error in reading file: ", path_file) return [path_file, " "] @@ -64,12 +65,12 @@ def data_prepration_test(path_file_label, univ): def bed2vec(file_list, universe, model, assembly, source, output_path): - docs = os.path.join(output_path, "documents_{}.txt".format(assembly)) files = os.path.join(output_path, "filenames_{}.txt".format(assembly)) doc_embed = os.path.join( - output_path, "{}_starspace_embed_{}_{}.txt".format(source, assembly, source) + output_path, + "{}_starspace_embed_{}_{}.txt".format(source, assembly, source), ) documents = [] diff --git a/gitk/bedspace/pipeline/preprocess.py b/geniml/bedspace/pipeline/preprocess.py similarity index 91% rename from gitk/bedspace/pipeline/preprocess.py rename to geniml/bedspace/pipeline/preprocess.py index bf064358..c15d815e 100644 --- a/gitk/bedspace/pipeline/preprocess.py +++ b/geniml/bedspace/pipeline/preprocess.py @@ -13,9 +13,7 @@ def data_prepration(path_file_label: str, univ): path_file_label = path_file_label.split(",") path_file = path_file_label[0] - labels = " ".join( - ["__label__" + label for label in path_file_label[1:] if label != ""] - ) + labels = " ".join(["__label__" + label for label in path_file_label[1:] if label != ""]) if os.path.exists(path_file): try: @@ -33,7 +31,10 @@ def data_prepration(path_file_label: str, univ): + "_" + file_regions["end"].astype(str) ) - return [path_file, " ".join(list(file_regions["region"])) + " " + labels] + return [ + path_file, + " ".join(list(file_regions["region"])) + " " + labels, + ] except Exception: print("Error in reading file: ", path_file) return [path_file, " "] diff --git 
a/gitk/bedspace/pipeline/search.py b/geniml/bedspace/pipeline/search.py similarity index 88% rename from gitk/bedspace/pipeline/search.py rename to geniml/bedspace/pipeline/search.py index 9618dc9f..89b23cde 100644 --- a/gitk/bedspace/pipeline/search.py +++ b/geniml/bedspace/pipeline/search.py @@ -32,13 +32,17 @@ def run_scenario1( search_table = pd.pivot_table( distance, values="score", index=["filename"], columns=["search_term"] ).reset_index() - df = search_table[["filename", searchterm]].sort_values( - by=[searchterm], ascending=False - )[0:num_results] + df = search_table[["filename", searchterm]].sort_values(by=[searchterm], ascending=False)[ + 0:num_results + ] df = df.sort_values(by=[searchterm], ascending=True) df["color"] = "green" plt = df.plot.barh( - x="filename", y=searchterm, figsize=(6, 4), fontsize=10, color=list(df["color"]) + x="filename", + y=searchterm, + figsize=(6, 4), + fontsize=10, + color=list(df["color"]), ) plt.set_xlabel("Similarity", fontsize=10) plt.set_ylabel("File_name", fontsize=10) @@ -79,7 +83,11 @@ def run_scenario2( df["color"] = "green" plt = df.plot.barh( - x="search_term", y="score", figsize=(6, 4), fontsize=10, color=list(df["color"]) + x="search_term", + y="score", + figsize=(6, 4), + fontsize=10, + color=list(df["color"]), ) plt.set_xlabel("Similarity", fontsize=10) plt.set_ylabel("Ranked labels", fontsize=10) @@ -124,7 +132,11 @@ def run_scenario3( df["color"] = "green" plt = df.plot.barh( - x="db_file", y="score", figsize=(6, 4), fontsize=10, color=list(df["color"]) + x="db_file", + y="score", + figsize=(6, 4), + fontsize=10, + color=list(df["color"]), ) plt.set_xlabel("Similarity", fontsize=10) plt.set_ylabel("Files in db", fontsize=10) diff --git a/gitk/bedspace/pipeline/train.py b/geniml/bedspace/pipeline/train.py similarity index 100% rename from gitk/bedspace/pipeline/train.py rename to geniml/bedspace/pipeline/train.py diff --git a/gitk/bedspace/pipeline/Visualization.py b/geniml/bedspace/pipeline/visualization.py 
similarity index 88% rename from gitk/bedspace/pipeline/Visualization.py rename to geniml/bedspace/pipeline/visualization.py index 96fa1289..e2798e4f 100644 --- a/gitk/bedspace/pipeline/Visualization.py +++ b/geniml/bedspace/pipeline/visualization.py @@ -23,12 +23,8 @@ def label_preprocessing(path_label_embedding, label_prefix, common_labels=[]): labels = [] label_vectors = [] - label_embedding = pd.read_csv( - path_label_embedding, sep="\t", header=None, skiprows=1 - ) - vectors = label_embedding[ - label_embedding[0].str.contains(label_prefix) - ] # .reset_index() + label_embedding = pd.read_csv(path_label_embedding, sep="\t", header=None, skiprows=1) + vectors = label_embedding[label_embedding[0].str.contains(label_prefix)] # .reset_index() vectors[0] = vectors[0].str.replace(label_prefix, "") @@ -49,7 +45,6 @@ def UMAP_plot( plottitle="", output_folder="", ): - np.random.seed(3) dp = 400 @@ -108,7 +103,11 @@ def UMAP_plot( for i, txt in enumerate(list(ump_data[title])): fig.annotate( - txt, (ump_data.iloc[i]["UMAP 1"] - 0.05, ump_data.iloc[i]["UMAP 2"] + 0.05) + txt, + ( + ump_data.iloc[i]["UMAP 1"] - 0.05, + ump_data.iloc[i]["UMAP 2"] + 0.05, + ), ) # plt.legend(loc='upper right', ) @@ -203,7 +202,6 @@ def retrieve_meta_test(): def Scenario1(path_simfile): - distance = pd.read_csv(file) distance.file_label = distance.file_label.str.lower() distance.search_term = distance.search_term.str.lower() @@ -216,9 +214,7 @@ def Scenario1(path_simfile): search_table, on="filename", ).drop_duplicates() - search_table = search_table.merge( - meta_test, left_on="filename", right_on="file_name" - ) + search_table = search_table.merge(meta_test, left_on="filename", right_on="file_name") if search == "cell": ind = 0 @@ -238,19 +234,14 @@ def Scenario1(path_simfile): np.min([ind, len_targets]) ] - search_table = search_table[ - ["filename", "file_label", "original_label"] + (training_labels) - ] - search_table["predicted_label"] = search_table[list(search_table)[3:]].idxmin( - 
axis=1 - ) + search_table = search_table[["filename", "file_label", "original_label"] + (training_labels)] + search_table["predicted_label"] = search_table[list(search_table)[3:]].idxmin(axis=1) for searchterm in training_labels: - nof = len(search_table[search_table.file_label.str.contains(searchterm)]) - df = search_table[ - ["filename", "file_label", "original_label", searchterm] - ].sort_values(by=[searchterm])[0:10] + df = search_table[["filename", "file_label", "original_label", searchterm]].sort_values( + by=[searchterm] + )[0:10] df = df.sort_values(by=[searchterm], ascending=False) df["color"] = "gray" @@ -273,9 +264,7 @@ def Scenario1(path_simfile): plt.axis(xmin=0.5, xmax=1.01) plt.figure.savefig( - "../outputs/bedembed_output/figures/S1/{}_nof{}.svg".format( - searchterm, nof - ), + "../outputs/bedembed_output/figures/S1/{}_nof{}.svg".format(searchterm, nof), format="svg", bbox_inches="tight", ) @@ -292,7 +281,6 @@ def Scenario1(path_simfile): def Scenario2(path_simfile): - distance = pd.read_csv(file) distance.file_label = distance.file_label.str.lower() distance.search_term = distance.search_term.str.lower() @@ -300,9 +288,7 @@ def Scenario2(path_simfile): search_table = pd.pivot_table( distance, values="score", index=["filename"], columns=["search_term"] ).reset_index() - search_table = search_table.merge( - meta_test, left_on="filename", right_on="file_name" - ) + search_table = search_table.merge(meta_test, left_on="filename", right_on="file_name") search_table = pd.merge( distance[["filename", "file_label"]].drop_duplicates(), search_table, @@ -324,12 +310,8 @@ def Scenario2(path_simfile): np.min([ind, len_targets]) ] - search_table = search_table[ - ["filename", "file_label", "original_label"] + (training_labels) - ] - search_table["predicted_label"] = search_table[list(search_table)[3:]].idxmin( - axis=1 - ) + search_table = search_table[["filename", "file_label", "original_label"] + (training_labels)] + search_table["predicted_label"] = 
search_table[list(search_table)[3:]].idxmin(axis=1) i = 0 b = search_table @@ -353,13 +335,16 @@ def Scenario2(path_simfile): i += 1 X = pd.DataFrame(all_weights).rename( - columns={0: "Filename", 1: "Filelabel", 2: "AllLabels", 3: "Distance_score"} + columns={ + 0: "Filename", + 1: "Filelabel", + 2: "AllLabels", + 3: "Distance_score", + } ) for file in list(set(X.Filename)): - df = X[X.Filename == file].sort_values(by=["Distance_score"], ascending=False)[ - 0:10 - ] + df = X[X.Filename == file].sort_values(by=["Distance_score"], ascending=False)[0:10] df = df.sort_values(by=["Distance_score"], ascending=True) df["color"] = "green" plt = df.plot.barh( @@ -394,9 +379,7 @@ def Scenario2(path_simfile): meta_train = (pd.read_csv(path_meta + "tests/test_file_meta.csv"))[ ["file_name", "cell_type", "target"] ] -meta_train["original_label_train"] = ( - meta_train["target"] + " " + meta_train["cell_type"] -) +meta_train["original_label_train"] = meta_train["target"] + " " + meta_train["cell_type"] meta_test.file_name = "/project/shefflab/data/encode/" + meta_test.file_name meta_train.file_name = "/project/shefflab/data/encode/" + meta_train.file_name @@ -412,13 +395,15 @@ def Scenario2(path_simfile): )[[0, 1, 2]] sim = sim.merge(meta_test, left_on="test_name", right_on="file_name").merge( - meta_train, left_on="train_name", right_on="file_name", suffixes=("_test", "_train") + meta_train, + left_on="train_name", + right_on="file_name", + suffixes=("_test", "_train"), ) # sim.score = 1 - sim.score for test in list(set(sim.test_name)): - df = sim[sim.test_name == test].sort_values(by="score", ascending=False)[ [ "test_name", @@ -436,7 +421,6 @@ def Scenario2(path_simfile): df.loc[df.original_label_test == df.original_label_train, "color"] = "green" if len(df[df.color == "green"]) == nof: - df.loc[(df.color != "green"), "color"] = "gray" plt = df.plot.barh( @@ -449,9 +433,7 @@ def Scenario2(path_simfile): plt.axis(xmin=0.7, xmax=1.01) plt.figure.savefig( - 
"../outputs/bedembed_output/figures/S3/{}_nof{}.svg".format( - test.split("/")[-1], nof - ), + "../outputs/bedembed_output/figures/S3/{}_nof{}.svg".format(test.split("/")[-1], nof), format="svg", bbox_inches="tight", ) diff --git a/gitk/bedspace/tests/test_file_meta.csv b/geniml/bedspace/tests/test_file_meta.csv similarity index 100% rename from gitk/bedspace/tests/test_file_meta.csv rename to geniml/bedspace/tests/test_file_meta.csv diff --git a/geniml/cli.py b/geniml/cli.py new file mode 100644 index 00000000..04dfcb56 --- /dev/null +++ b/geniml/cli.py @@ -0,0 +1,406 @@ +import logging +import os +import sys +from typing import Dict + +import logmuse +from ubiquerg import VersionInHelpParser + +from ._version import __version__ +from .assess.cli import build_subparser as assess_subparser +from .bbclient.cli import build_subparser as bbclient_subparser +from .bedspace.cli import build_argparser as bedspace_subparser +from .eval.cli import build_subparser as eval_subparser +from .likelihood.cli import build_subparser as likelihood_subparser +from .region2vec.cli import build_subparser as region2vec_subparser +from .scembed.argparser import build_argparser as scembed_subparser +from .tokenization.cli import build_subparser as tokenization_subparser +from .universe.cli import build_mode_parser as universe_subparser + + +def print_inspect_beds(bb_cache_folder) -> None: + """ + Print the bed files in the cache folder. 
+ + :param bb_cache_folder: Cache folder path + """ + from rich.console import Console + from rich.table import Table + + from .bbclient import BBClient + + _LOGGER.info(f"Bedfiles directory:") + bbc = BBClient(cache_folder=bb_cache_folder) + result = bbc.list_beds() + + console = Console() + + # Create a Table + table = Table(title="Cached Bedfiles") + + # Add columns + table.add_column("ID", justify="center", style="cyan", no_wrap=True) + table.add_column("Path", style="magenta") + + # Add rows from the dictionary + for id, path in result.items(): + table.add_row(str(id), path) + + # Print the table + console.print(table) + + console.print(f"Number of bed files: {len(result)}") + + +def print_inspect_bedsets(bb_cache_folder) -> None: + """ + Print the bed sets in the cache folder. + + :param bb_cache_folder: Cache folder path + """ + from rich.console import Console + from rich.table import Table + + from .bbclient import BBClient + + _LOGGER.info(f"Bedsets directory:") + bbc = BBClient(cache_folder=bb_cache_folder) + result = bbc.list_bedsets() + + console = Console() + + # Create a Table + table = Table(title="Cached Bedsets") + + # Add columns + table.add_column("ID", justify="center", style="cyan", no_wrap=True) + table.add_column("Path", style="magenta") + + # Add rows from the dictionary + for id, path in result.items(): + table.add_row(str(id), path) + + # Print the table + console.print(table) + + console.print(f"Number of bed sets: {len(result)}") + + +def build_argparser(): + """ + Builds argument parser. 
+ + :return argparse.ArgumentParser: Argument parser + """ + + banner = "%(prog)s - Genomic Interval toolkit" + additional_description = "\nhttps://geniml.databio.org" + + parser = VersionInHelpParser( + prog="geniml", + version=f"{__version__}", + description=banner, + epilog=additional_description, + ) + + # Individual subcommands + msg_by_cmd = { + "assess-universe": "Assess a universe", + "bbclient": "Client for the BEDbase server", + "bedspace": "Coembed regionsets (bed files) and labels", + "build-universe": "Build a consensus peak set using one of provided model", + "eval": "Evaluate a set of region embeddings", + "lh": "Make likelihood model", + "region2vec": "Train a region2vec model", + "scembed": "Embed single-cell data as region vectors", + "tokenize": "Tokenize BED files", + } + + sp = parser.add_subparsers(dest="command") + subparsers: Dict[str, VersionInHelpParser] = {} + for k, v in msg_by_cmd.items(): + subparsers[k] = sp.add_parser(k, description=v, help=v) + + # build up subparsers for modules + subparsers["assess-universe"] = assess_subparser(subparsers["assess-universe"]) + subparsers["bbclient"] = bbclient_subparser(subparsers["bbclient"]) + subparsers["bedspace"] = bedspace_subparser(subparsers["bedspace"]) + subparsers["build-universe"] = universe_subparser(subparsers["build-universe"]) + subparsers["eval"] = eval_subparser(subparsers["eval"]) + subparsers["lh"] = likelihood_subparser(subparsers["lh"]) + subparsers["region2vec"] = region2vec_subparser(subparsers["region2vec"]) + subparsers["scembed"] = scembed_subparser(subparsers["scembed"]) + subparsers["tokenize"] = tokenization_subparser(subparsers["tokenize"]) + + return parser + + +def main(test_args=None): + parser = logmuse.add_logging_options(build_argparser()) + args, _ = parser.parse_known_args() + if test_args: + args.__dict__.update(test_args) + + global _LOGGER + _LOGGER = logmuse.logger_via_cli(args, make_root=True) + + if args.command is None: + parser.print_help(sys.stderr) 
+ sys.exit(1) + + _LOGGER.info(f"Command: {args.command}") + + if args.command == "assess-universe": + from .assess.assess import run_all_assessment_methods + + run_all_assessment_methods( + args.raw_data_folder, + args.file_list, + args.universe, + args.no_workers, + args.folder_out, + args.pref, + args.save_each, + args.overlap, + args.distance, + args.distance_flexible, + args.distance_universe_to_file, + args.distance_flexible_universe_to_file, + ) + + if args.command == "lh": + # _LOGGER.info(f"Subcommand: {args.subcommand}") + # if args.subcommand == "build_model": + from .likelihood.build_model import main + + main( + args.model_file, + args.coverage_folder, + args.coverage_prefix, + args.file_no, + args.force, + ) + + if args.command == "bbclient": + if args.subcommand in [ + "cache-bed", + "cache-tokens", + "cache-bedset", + "seek", + "inspect-bedfiles", + "inspect-bedsets", + "rm", + ]: + _LOGGER.info(f"Subcommand: {args.subcommand}") + from .bbclient import BBClient + + bbc = BBClient(cache_folder=args.cache_folder) + + else: + # if no subcommand, print help format of bbclient subparser + # from https://stackoverflow.com/a/20096044/23054783 + import argparse + + subparsers_actions = [ + action + for action in parser._actions + if isinstance(action, argparse._SubParsersAction) + ] + # there will probably only be one subparser_action, + # but better safe than sorry + for subparsers_action in subparsers_actions: + # get all subparsers and print help + for choice, subparser in subparsers_action.choices.items(): + if choice == "bbclient": + print(subparser.format_help()) + sys.exit(1) + if args.subcommand == "cache-bed": + # if input is a BED file path + if os.path.exists(args.identifier[0]): + identifier = bbc.add_bed_to_cache(args.identifier[0]) + _LOGGER.info(f"BED file {identifier} has been cached") + else: + bbc.load_bed(args.identifier[0]) + + if args.subcommand == "cache-tokens": + bbc.add_bed_tokens_to_cache(args.bed_id[0], args.universe_id[0]) + + if 
args.subcommand == "cache-bedset": + if os.path.isdir(args.identifier[0]): + from .io import BedSet + + bedset = BedSet( + [ + os.path.join(args.identifier[0], file_name) + for file_name in os.listdir(args.identifier[0]) + ] + ) + bbc.add_bedset_to_cache(bedset) + _LOGGER.info(f"BED set {bedset.compute_bedset_identifier()} has been cached") + + else: + bbc.load_bedset(args.identifier[0]) + + if args.subcommand == "seek": + handler = logging.StreamHandler(sys.stdout) + _LOGGER.addHandler(handler) + _LOGGER.info(bbc.seek(args.identifier[0])) + + if args.subcommand == "inspect-bedfiles": + print_inspect_beds(args.cache_folder) + + if args.subcommand == "inspect-bedsets": + print_inspect_bedsets(args.cache_folder) + + if args.subcommand == "rm": + file_path = bbc.seek(args.identifier[0]) + bbc._remove(file_path) + _LOGGER.info(f"{file_path} is removed") + + if args.command == "build-universe": + _LOGGER.info(f"Subcommand: {args.subcommand}") + if args.subcommand == "hmm": + from .universe.hmm_universe import hmm_universe + + hmm_universe( + coverage_folder=args.coverage_folder, + out_file=args.output_file, + prefix=args.coverage_prefix, + normalize=args.not_normalize, + save_max_cove=args.save_max_cove, + ) + + if args.subcommand == "ml": + from .universe.ml_universe import ml_universe + + ml_universe( + model_file=args.model_file, + cove_folder=args.coverage_folder, + cove_prefix=args.coverage_prefix, + file_out=args.output_file, + ) + + if args.subcommand == "cc": + from .universe.cc_universe import cc_universe + + cc_universe( + cove=args.coverage_folder, + cove_prefix=args.coverage_prefix, + file_out=args.output_file, + merge=args.merge, + filter_size=args.filter_size, + cutoff=args.cutoff, + ) + if args.subcommand == "ccf": + from .universe.ccf_universe import ccf_universe + + ccf_universe( + cove=args.coverage_folder, + cove_prefix=args.coverage_prefix, + file_out=args.output_file, + ) + + if args.command == "scembed": + _LOGGER.info("Running scembed") + pass + # 
scembed_main(test_args) + + if args.command == "region2vec": + from .region2vec import region2vec + + region2vec( + token_folder=args.token_folder, + save_dir=args.save_dir, + num_shufflings=args.num_shuffle, + num_processes=args.nworkers, + embedding_dim=args.embed_dim, + context_win_size=args.context_len, + save_freq=args.save_freq, + resume_path=args.resume, + train_alg=args.train_alg, + min_count=args.min_count, + neg_samples=args.neg_samples, + init_lr=args.init_lr, + min_lr=args.min_lr, + lr_scheduler=args.lr_mode, + milestones=args.milestones, + seed=args.seed, + ) + if args.command == "tokenize": + from .tokenization import hard_tokenization + + hard_tokenization( + src_folder=args.data_folder, + dst_folder=args.token_folder, + universe_file=args.universe, + fraction=args.fraction, + num_workers=args.nworkers, + bedtools_path=args.bedtools_path, + ) + if args.command == "eval": + if args.subcommand == "gdst": + from geniml.eval.gdst import get_gdst_score + + gdst_score = get_gdst_score( + args.model_path, args.embed_type, args.num_samples, args.seed + ) + print(gdst_score) + if args.subcommand == "npt": + from geniml.eval.npt import get_npt_score + + npt_score = get_npt_score( + args.model_path, + args.embed_type, + args.K, + args.num_samples, + args.seed, + args.K, + num_workers=args.num_workers, + ) + print(npt_score["SNPR"][0]) + if args.subcommand == "ctt": + from geniml.eval.ctt import get_ctt_score + + ctt_score = get_ctt_score( + args.model_path, + args.embed_type, + args.seed, + args.num_samples, + args.num_workers, + ) + + print(ctt_score) + if args.subcommand == "rct": + from geniml.eval.rct import get_rct_score + + rct_score = get_rct_score( + args.model_path, + args.embed_type, + args.bin_path, + args.out_dim, + args.cv_num, + args.seed, + args.num_workers, + ) + print(rct_score) + if args.subcommand == "bin-gen": + import glob + import pickle + + from geniml.eval.utils import get_bin_embeddings + + if os.path.exists(args.file_name): + 
print(f"{args.file_name} exists!") + return + token_files = glob.glob(os.path.join(args.token_folder, "*")) + bin_embed = get_bin_embeddings(args.universe, token_files) + os.makedirs(os.path.dirname(args.file_name), exist_ok=True) + with open(args.file_name, "wb") as f: + pickle.dump(bin_embed, f) + print(f"binary embeddings saved to {args.file_name}") + + return + + +if __name__ == "__main__": + main() diff --git a/geniml/const.py b/geniml/const.py new file mode 100644 index 00000000..c8f19b7b --- /dev/null +++ b/geniml/const.py @@ -0,0 +1,2 @@ +PKG_NAME = "geniml" +GTOK_EXT = "gtok" diff --git a/geniml/craft/__init__.py b/geniml/craft/__init__.py new file mode 100644 index 00000000..cb0d4256 --- /dev/null +++ b/geniml/craft/__init__.py @@ -0,0 +1,16 @@ +from ._version import VERSION + +from .modeling_craft import CraftModel, CraftForContrastiveLearning, CraftForGeneActivityPrediction +from .configuration_craft import CraftConfig +from .training_utils import DataCollatorForCraft, DataCollatorForCraftGeneActivityPrediction + +__all__ = [ + "CraftConfig", + "CraftModel", + "CraftForContrastiveLearning", + "CraftForGeneActivityPrediction", + "DataCollatorForCraft", + "DataCollatorForCraftGeneActivityPrediction" +] +__version__ = VERSION +__author__ = "Nathan LeRoy" diff --git a/geniml/craft/_version.py b/geniml/craft/_version.py new file mode 100644 index 00000000..58c8455f --- /dev/null +++ b/geniml/craft/_version.py @@ -0,0 +1 @@ +VERSION = "0.1.0" \ No newline at end of file diff --git a/geniml/craft/configuration_craft.py b/geniml/craft/configuration_craft.py new file mode 100644 index 00000000..a23a40ed --- /dev/null +++ b/geniml/craft/configuration_craft.py @@ -0,0 +1,38 @@ +from transformers import PretrainedConfig + +from atacformer.configuration_atacformer import AtacformerConfig +from geneformer.configuration_geneformer import GeneformerConfig + +class CraftConfig(PretrainedConfig): + """ + Configuration for the CRAFT model, a contrastive RNA-ATAC 
transformer that attempts + to learn leverage Geneformer and Atacformer to learn a joint representation of RNA and ATAC data. + """ + def __init__( + self, + geneformer_config: GeneformerConfig = None, + atacformer_config: AtacformerConfig = None, + projection_dim: int = 512, + logit_scale_init_value: float = 2.6592, + **kwargs, + ): + """ + Joint configuration for the CRAFT model. + + Args: + geneformer_config (GeneformerConfig): Configuration for the Geneformer model. + atacformer_config (AtacformerConfig): Configuration for the Atacformer model. + projection_dim (int): Dimension of the projection layer. + logit_scale_init_value (float): Initial value for the logit scale parameter. + """ + super().__init__(**kwargs) + + if isinstance(geneformer_config, dict): + geneformer_config = GeneformerConfig.from_dict(geneformer_config) + if isinstance(atacformer_config, dict): + atacformer_config = AtacformerConfig.from_dict(atacformer_config) + + self.geneformer_config = geneformer_config + self.atacformer_config = atacformer_config + self.projection_dim = projection_dim + self.logit_scale_init_value = logit_scale_init_value \ No newline at end of file diff --git a/geniml/craft/modeling_craft.py b/geniml/craft/modeling_craft.py new file mode 100644 index 00000000..d61185d4 --- /dev/null +++ b/geniml/craft/modeling_craft.py @@ -0,0 +1,302 @@ +from typing import Optional, Tuple, Union, Any +from dataclasses import dataclass + +from transformers import PreTrainedModel +from transformers.modeling_outputs import ModelOutput, BaseModelOutput +from transformers.utils import logging + +from atacformer import AtacformerModel +from geneformer import GeneformerModel + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .configuration_craft import CraftConfig + +logger = logging.get_logger(__name__) + +@dataclass +class CraftOutput(ModelOutput): + """ + Args: + loss (torch.FloatTensor of shape (1,), optional): + Contrastive loss measuring the similarity 
between gene and chromatin accessibility representations. + logits_per_atac (torch.FloatTensor of shape (gene_batch_size, atac_batch_size)): + Scaled dot-product scores between the gene embeddings and the ATAC embeddings, representing gene-to-ATAC similarity. + logits_per_genes (torch.FloatTensor of shape (atac_batch_size, gene_batch_size)): + Scaled dot-product scores between the ATAC embeddings and the gene embeddings, representing ATAC-to-gene similarity. + geneformer_output (BaseModelOutput, optional): + Output from the gene encoder containing hidden states and additional information. + atacformer_output (BaseModelOutput, optional): + Output from the ATAC encoder containing hidden states and additional information. + """ + + loss: Optional[torch.FloatTensor] = None + logits_per_atac: Optional[torch.FloatTensor] = None + logits_per_genes: Optional[torch.FloatTensor] = None + geneformer_output: Optional[BaseModelOutput] = None + atacformer_output: Optional[BaseModelOutput] = None + + def to_tuple(self) -> Tuple[Any]: + return tuple( + self[k] if k not in ["geneformer_output", "atacformer_output"] else getattr(self, k).to_tuple() + for k in self.keys() + ) + +@dataclass +class CraftGeneActivityOutput(ModelOutput): + """ + Args: + loss (torch.FloatTensor of shape (1,), optional): + Loss value for the gene activity prediction task. + gene_activity_predictions (torch.FloatTensor of shape (batch_size, n_genes)): + Predicted gene activity scores for each gene in the batch. + """ + loss: Optional[torch.FloatTensor] = None + gene_activity_predictions: Optional[torch.FloatTensor] = None + +class CraftModel(PreTrainedModel): + """ + CRAFT Model with a masked language modeling head. 
+ """ + config_class = CraftConfig + base_model_prefix = "craft" + + def __init__(self, config: CraftConfig): + super().__init__(config) + self.config = config + self.geneformer_config = config.geneformer_config + self.atacformer_config = config.atacformer_config + + self.gene_encoder = GeneformerModel(self.geneformer_config) + self.atac_encoder = AtacformerModel(self.atacformer_config) + + self.projection_dim = config.projection_dim + self.atac_embed_dim = self.atacformer_config.hidden_size + self.gene_embed_dim = self.geneformer_config.hidden_size + + self.gene_projection = torch.nn.Linear(self.gene_embed_dim, self.projection_dim, bias=False) + self.atac_projection = torch.nn.Linear(self.atac_embed_dim, self.projection_dim, bias=False) + self.logit_scale = torch.nn.Parameter(torch.tensor(self.config.logit_scale_init_value)) + + self.post_init() + + def _pool_embeddings(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor: + """ + Pool the embeddings using the attention mask and mean pooling. + + Args: + hidden_states (torch.Tensor): The hidden states from the transformer model. + attention_mask (torch.Tensor): The attention mask to apply. + """ + attention_mask = attention_mask.unsqueeze(-1) + sum_embeddings = (hidden_states * attention_mask).sum(1) + sum_mask = attention_mask.sum(1).clamp(min=1e-9) + return sum_embeddings / sum_mask + + def forward( + self, + gene_input_ids: torch.Tensor = None, + gene_attention_mask: torch.Tensor = None, + gene_token_type_ids: torch.Tensor = None, + atac_input_ids: torch.Tensor = None, + atac_attention_mask: torch.Tensor = None, + return_dict: bool = None, + **kwargs, + ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor], CraftOutput]: + """ + Forward pass through the CRAFT model. + + Args: + gene_input_ids (torch.Tensor): Input IDs for the gene encoder. + gene_attention_mask (torch.Tensor): Attention mask for the gene encoder. 
+ gene_token_type_ids (torch.Tensor): Token type IDs for the gene encoder. + atac_input_ids (torch.Tensor): Input IDs for the ATAC encoder. + atac_attention_mask (torch.Tensor): Attention mask for the ATAC encoder. + return_dict (bool): Whether to return a dictionary or tuple. + Returns: + torch.Tensor: The logits from the CRAFT model. + """ + # gene encoding + gene_outputs = self.gene_encoder.bert( + input_ids=gene_input_ids, + attention_mask=gene_attention_mask, + token_type_ids=gene_token_type_ids, + output_hidden_states=True, + return_dict=True + ) + gene_hidden_states = gene_outputs.last_hidden_state # last layer hidden states + gene_pooled_output = self._pool_embeddings(gene_hidden_states, gene_attention_mask) + + + # atac encoding + atac_outputs = self.atac_encoder( + input_ids=atac_input_ids, + attention_mask=atac_attention_mask, + ) + atac_pooled_output = self._pool_embeddings(atac_outputs, atac_attention_mask) + + # project into same space + gene_projs = self.gene_projection(gene_pooled_output) + atac_projs = self.atac_projection(atac_pooled_output) + + # normalize the projections + gene_projs = F.normalize(gene_projs, dim=-1) + atac_projs = F.normalize(atac_projs, dim=-1) + + # scaled pairwise cosine similarities + cos_sims = torch.matmul(gene_projs, atac_projs.T) * self.logit_scale.exp() + + n = gene_input_ids.shape[0] + labels = torch.arange(n, device=cos_sims.device, dtype=torch.long) + loss_i = F.cross_entropy(cos_sims, labels) # image→text + loss_t = F.cross_entropy(cos_sims.T, labels) # text→image + loss = (loss_i + loss_t) / 2 + + if not return_dict: + return (loss, cos_sims, cos_sims.T, gene_outputs, atac_outputs) + + return CraftOutput( + loss=loss, + logits_per_atac=cos_sims, + logits_per_genes=cos_sims.T, + geneformer_output=gene_outputs, + atacformer_output=atac_outputs, + ) + +class CraftForContrastiveLearning(PreTrainedModel): + """ + CRAFT model for contrastive learning between gene and ATAC embeddings. 
While this + looks redudant with the CraftModel, it makes it easier to use the model + for further tasks like gene activity prediction without needing to + instantiate the CraftModel directly. + + Mostly used for pre-training tasks + """ + + config_class = CraftConfig + base_model_prefix = "craft_for_contrastive_learning" + + def __init__(self, config: CraftConfig): + super().__init__(config) + self.craft = CraftModel(config) + + def forward( + self, + gene_input_ids: torch.Tensor, + gene_attention_mask: torch.Tensor, + gene_token_type_ids: torch.Tensor, + atac_input_ids: torch.Tensor, + atac_attention_mask: torch.Tensor, + ) -> CraftOutput: + """ + Forward pass through the model. + + Args: + gene_input_ids (torch.Tensor): Input IDs for the gene encoder. + gene_attention_mask (torch.Tensor): Attention mask for the gene encoder. + gene_token_type_ids (torch.Tensor): Token type IDs for the gene encoder. + atac_input_ids (torch.Tensor): Input IDs for the ATAC encoder. + atac_attention_mask (torch.Tensor): Attention mask for the ATAC encoder. + + Returns: + CraftOutput: The output of the CRAFT model containing loss and logits. + """ + return self.craft( + gene_input_ids=gene_input_ids, + gene_attention_mask=gene_attention_mask, + gene_token_type_ids=gene_token_type_ids, + atac_input_ids=atac_input_ids, + atac_attention_mask=atac_attention_mask, + return_dict=True + ) + +class GeneActivityPredictionHead(nn.Module): + """ + A head for computing gene activity scores from the shared latent space + of the CRAFT model. + + Mostly used for scATAC-seq data, where we want to predict gene activity + scores from the ATAC-seq embeddings. 
+ """ + def __init__(self, config: CraftConfig): + super().__init__() + self.projection_dim = config.projection_dim + self.n_genes = config.geneformer_config.vocab_size - 2 # Exclude and tokens + self.gene_activity_head = nn.Sequential( + nn.Linear(self.projection_dim, self.projection_dim, bias=False), + nn.ReLU(), + nn.Linear(self.projection_dim, self.n_genes, bias=False) + ) + + def forward(self, latent_embeddings: torch.Tensor) -> torch.Tensor: + """ + Forward pass to compute gene activity scores. + + Args: + latent_embeddings (torch.Tensor): The latent embeddings from the CRAFT model. + + Returns: + torch.Tensor: The computed gene activity scores. + """ + return self.gene_activity_head(latent_embeddings) + +class CraftForGeneActivityPrediction(PreTrainedModel): + """ + CRAFT model for gene activity prediction. + """ + + config_class = CraftConfig + base_model_prefix = "craft_for_gene_activity_prediction" + + def __init__(self, config: CraftConfig): + super().__init__(config) + self.craft = CraftModel(config) + self.gene_activity_head = GeneActivityPredictionHead(config) + + def forward(self, + atac_input_ids: torch.Tensor, + atac_attention_mask: torch.Tensor, + gene_activity: Optional[torch.Tensor] = None, + return_dict: bool = True + ) -> Union[Tuple[torch.Tensor | None, Any], CraftGeneActivityOutput]: + """ + Forward pass through the model. + + Args: + atac_input_ids (torch.Tensor): Input IDs for the ATAC encoder. + atac_attention_mask (torch.Tensor): Attention mask for the ATAC encoder. + gene_activity (Optional[torch.Tensor]): Optional gene activity scores for computing loss. + Returns: + torch.Tensor: The predicted gene activity scores. 
+ """ + atac_latent_embeddings = self.craft.atac_encoder( + input_ids=atac_input_ids, + attention_mask=atac_attention_mask + ) + # pool the ATAC embeddings to get cell-level representations + atac_latent_embeddings = self.craft._pool_embeddings( + atac_latent_embeddings, + atac_attention_mask + ) + # project the ATAC embeddings to the shared latent space + atac_latent_embeddings = self.craft.atac_projection(atac_latent_embeddings) + + # normalize the embeddings + atac_latent_embeddings = F.normalize(atac_latent_embeddings, dim=-1) + + # compute gene activity predictions + gene_activity_predictions = self.gene_activity_head(atac_latent_embeddings) + + loss = None + if gene_activity is not None: + loss = F.mse_loss(gene_activity_predictions, gene_activity) + + if not return_dict: + return (loss, gene_activity_predictions) + + return CraftGeneActivityOutput( + loss=loss, + gene_activity_predictions=gene_activity_predictions + ) \ No newline at end of file diff --git a/geniml/craft/training_utils.py b/geniml/craft/training_utils.py new file mode 100644 index 00000000..e9773fc3 --- /dev/null +++ b/geniml/craft/training_utils.py @@ -0,0 +1,95 @@ +from typing import List, Dict + +import torch +from torch.nn.utils.rnn import pad_sequence + +class DataCollatorForCraft: + """ + Pads + builds masks for gene/ATAC pairs used by CraftModel. 
+ """ + + def __init__( + self, + gene_pad: int, + atac_pad: int, + gene_max_len: int | None = None, + atac_max_len: int | None = None, + ): + self.gene_pad, self.atac_pad = gene_pad, atac_pad + self.gene_max_len, self.atac_max_len = gene_max_len, atac_max_len + + @staticmethod + def _truncate(seq: List[int], max_len: int | None) -> List[int]: + return seq[:max_len] if max_len is not None else seq + + def __call__(self, features: List[Dict]) -> Dict[str, torch.Tensor]: + gene_ids = [torch.tensor(self._truncate(f["gene_input_ids"], self.gene_max_len), + dtype=torch.long) + for f in features] + atac_ids = [torch.tensor(self._truncate(f["atac_input_ids"], self.atac_max_len), + dtype=torch.long) + for f in features] + + gene_batch = pad_sequence(gene_ids, batch_first=True, + padding_value=self.gene_pad) + atac_batch = pad_sequence(atac_ids, batch_first=True, + padding_value=self.atac_pad) + + gene_mask = (gene_batch != self.gene_pad).long() + atac_mask = (atac_batch != self.atac_pad).bool() # atacformer needs a bool mask (using nn.TransformerEncoder) + + if "gene_token_type_ids" in features[0]: + tt_ids = [torch.tensor(self._truncate(f["gene_token_type_ids"], + self.gene_max_len), + dtype=torch.long) + for f in features] + gene_tt = pad_sequence(tt_ids, batch_first=True, padding_value=0) + else: + gene_tt = torch.zeros_like(gene_batch) + + return { + "gene_input_ids" : gene_batch, + "gene_attention_mask" : gene_mask, + "gene_token_type_ids" : gene_tt, + "atac_input_ids" : atac_batch, + "atac_attention_mask" : atac_mask, + } + +class DataCollatorForCraftGeneActivityPrediction: + """ + Pads + builds masks for ATAC-gene pairs used by CraftForGeneActivityPrediction. + + Gene activity is always the same shape for everything, its just a set of floats + representing the activity of each gene in the genome, so we don't pad it. 
+ """ + + def __init__( + self, + atac_pad: int, + atac_max_len: int | None = None, + ): + self.atac_pad = atac_pad + self.atac_max_len = atac_max_len + + @staticmethod + def _truncate(seq: List[int], max_len: int | None) -> List[int]: + return seq[:max_len] if max_len is not None else seq + + def __call__(self, features: List[Dict]) -> Dict[str, torch.Tensor]: + atac_ids = [torch.tensor(self._truncate(f["atac_input_ids"], self.atac_max_len), + dtype=torch.long) + for f in features] + + gene_activity = torch.stack([torch.tensor(f["gene_activity"], dtype=torch.float) + for f in features]) + + atac_batch = pad_sequence(atac_ids, batch_first=True, + padding_value=self.atac_pad) + + atac_mask = (atac_batch != self.atac_pad).bool() # atacformer needs a bool mask (using nn.TransformerEncoder) + + return { + "atac_input_ids": atac_batch, + "atac_attention_mask": atac_mask, + "gene_activity": gene_activity + } \ No newline at end of file diff --git a/geniml/eval/README.md b/geniml/eval/README.md new file mode 100644 index 00000000..753ca905 --- /dev/null +++ b/geniml/eval/README.md @@ -0,0 +1,125 @@ +# Evaluation of Genomic Region Embeddings + +## Preparations +### Create a Base Embedding Object +Given a set of genomic region embeddings `embeddings` and the corresponding regions `vocab`, use `BaseEmbeddings` to create an `base` embedding object. 
+```python +from geniml.eval.utils import BaseEmbeddings + +import pickle +base_obj = BaseEmbeddings(embeddings, vocab) +with open("base_embed.pt", "wb") as f: + pickle.dump(base_obj, f) +``` +### Generate Binary Embeddings +```python +from geniml.eval.utils import get_bin_embeddings +universe_file = "/path/to/universe.bed" +token_files = ["file1.bed", "file2.bed"] +bin_embed = get_bin_embeddings(universe_file, token_files) +``` +Or use the command line: +```bash +geniml eval bin-gen --universe /path/to/universe.bed --token-folder /path/to/tokenized/folder --file-name bin_embed.pickle +``` +## Statistical Tests +### Cluster Tendency Test (CTT) +CTT analyzes how well a set of region embeddings can be clustered. The CTT score lies between 0 and 1. A larger CTT score indicates a greater tendency for the embeddings being evaluated to have clusters. When the embeddings are uniformly distributed, the score is 0.5. For evenly spaced embeddings, the score approaches 0. + +```python +from geniml.eval.ctt import get_ctt_score, ctt_eval + +path = "/path/to/a/region2vec/model/" +embed_type = "region2vec" +ctt_score = get_ctt_score(path, embed_type, seed=42, num_data=10000, num_workers=10) +print(ctt_score) + +# evaluate a batch of models and run CTT 5 times with different random seeds +batch = [(path, embed_type)] +ctt_score_arr = ctt_eval(batch, num_runs=5, num_data=10000,num_workers=10) +print(f"Model: {ctt_score_arr[0][0]}\n CTT scores:{ctt_score_arr[0][1]}") # CTT scores for the 1st model in the batch +``` + +Or use the command line +```bash +geniml eval ctt --model-path /path/to/a/region2vec/model/ --embed-type region2vec +``` +### Reconstruction Test (RCT) +RCT evaluates how well an embedding of a region preserves the region’s occurrence information in the training data. The best RCT score is 1. 
+ +```python +from geniml.eval.rct import get_rct_score, rct_eval + +path = "/path/to/a/region2vec/model/" +embed_type = "region2vec" +bin_path = "/path/to/a/binary/embedding/for/the/same/tokenized/files/" +# set out_dim to -1 to use all the dimensions of the binary embeddings. Set out_dim to a small positive number to reduce computational complexity. +rct_score = get_rct_score(path, embed_type, bin_path, out_dim=-1, cv_num=5, seed=42, num_workers=10) +print(rct_score) + +# evaluate a batch of models and run RCT 5 times with different random seeds +batch = [(path, embed_type, bin_path)] +rct_score_arr = rct_eval(batch, num_runs=5, cv_num=5, out_dim=-1, num_workers=10) +print(f"Model: {rct_score_arr[0][0]}\n RCT scores:{rct_score_arr[0][1]}") # RCT scores for the 1st model in the batch +``` + +Or use the command line +```bash +geniml eval rct --model-path /path/to/a/region2vec/model/ --bin-path /path/to/a/binary/embedding/for/the/same/tokenized/files/ --embed-type region2vec +``` +To change the learning setting, go to the definition of `get_rct_score` in `geniml/eval/rct.py` and change the constructor of `MLPRegressor`. + + +## Biological Tests +### Genome Distance Scaling Test (GDST) +GDST calculates a score measuring how much the embedding distance between two regions scales with the corresponding genome distance. + +```python +from geniml.eval.gdst import get_gdst_score, gdst_eval + +path = "/path/to/a/region2vec/model/" +embed_type = "region2vec" +gdst_score = get_gdst_score(path, embed_type, num_samples=10000,seed=42) +print(gdst_score) + +# evaluate a batch of models and run GDST 5 times with different random seeds +batch = [(path,embed_type)] + +gdst_score_arr = gdst_eval(batch, num_runs=5, num_samples=10000) +``` + +Or use the command line +```bash +geniml eval gdst --model-path /path/to/a/region2vec/model/ --embed-type region2vec +``` + + +### Neighborhood Preserving Test (NPT) + +NPT evaluates how significantly genomic region embeddings preserve their neighboring regions on the genome against random embeddings. 
The code output the NPT score for a set of region embeddings. + +```python +from geniml.eval.npt import get_npt_score, npt_eval + +path = "/path/to/a/region2vec/model/" +embed_type = "region2vec" +K = 10 +# If resolution = K gives NPT for K neighbors +# If resolution < K, gives NPT for [resolution, resolution*2, ...] neighbors + +resolution = K +npt_score = get_npt_score(path, embed_type, K, num_samples=100, seed=0, resolution=resolution,num_workers=10) + +print(npt_score['SNPR']) + +# evaluate a batch of models and run NPT for 5 times with different random seeds +batch = [(path, embed_type)] +npt_score_arr = npt_eval(batch, K, num_samples=100, num_workers=10, num_runs=5, resolution=resolution) + +print(f"Model: {npt_score_arr[0][0]}\n NPT scores: {npt_score_arr[0][1]}") # NPT scores for the 1st model in the batch +``` + +Or use the command line (the output will be the result when resolution=K) +```bash +geniml eval npt --model-path /path/to/a/region2vec/model/ --embed-type region2vec --K 50 --num-samples 1000 +``` diff --git a/gitk/__init__.py b/geniml/eval/__init__.py similarity index 100% rename from gitk/__init__.py rename to geniml/eval/__init__.py diff --git a/geniml/eval/cli.py b/geniml/eval/cli.py new file mode 100644 index 00000000..cc96c3ae --- /dev/null +++ b/geniml/eval/cli.py @@ -0,0 +1,193 @@ +def build_subparser_gdst(parser): + """ + Builds argument parser to support the gdst command line interface (under eval). 
+ """ + + parser.add_argument( + "--model-path", + required=True, + type=str, + help="path to a Region2Vec model or a Base model", + ) + parser.add_argument("--embed-type", required=True, type=str, help="region2vec or base") + parser.add_argument( + "--num-samples", + default=10000, + type=int, + help="number of samples used in calculation", + ) + parser.add_argument( + "--seed", + default=42, + type=int, + help="random seed", + ) + + return parser + + +def build_subparser_npt(parser): + """ + Builds argument parser to support the npt command line interface (under eval). + """ + parser.add_argument( + "--model-path", + required=True, + type=str, + help="path to a region2vec model or a Base model", + ) + parser.add_argument("--embed-type", required=True, type=str, help="region2vec or base") + parser.add_argument("--K", required=True, type=int, help="number of nearest regions") + parser.add_argument( + "--num-samples", + default=1000, + type=int, + help="number of samples used in calculation", + ) + parser.add_argument( + "--seed", + default=42, + type=int, + help="random seed", + ) + parser.add_argument( + "--num-workers", + default=10, + type=int, + help="number of parllel processes", + ) + return parser + + +def build_subparser_ctt(parser): + """ + Builds argument parser to support the ctt command line interface (under eval). 
+ """ + parser.add_argument( + "--model-path", + required=True, + type=str, + help="path to a region2vec model or a Base model", + ) + parser.add_argument("--embed-type", required=True, type=str, help="region2vec or base") + + parser.add_argument( + "--num-samples", + default=10000, + type=int, + help="number of samples used in calculation", + ) + parser.add_argument( + "--seed", + default=42, + type=int, + help="random seed", + ) + parser.add_argument( + "--num-workers", + default=10, + type=int, + help="number of parllel processes", + ) + + return parser + + +def build_subparser_rct(parser): + """ + Builds argument parser to support the rct command line interface (under eval). + """ + parser.add_argument( + "--model-path", + required=True, + type=str, + help="path to a region2vec model or a Base model", + ) + parser.add_argument( + "--bin-path", + required=True, + type=str, + help="path to a set of binary embedding", + ) + parser.add_argument( + "--embed-type", + required=True, + type=str, + help="region2vec or base for model-path", + ) + + parser.add_argument( + "--cv-num", + default=5, + type=int, + help="number of folds in cross-validation", + ) + parser.add_argument( + "--seed", + default=42, + type=int, + help="random seed", + ) + parser.add_argument( + "--out-dim", + default=-1, + type=int, + help="Used when the binary embeddings are very high-dimensional, i.e., there are many training files. Default -1 represents using all the dimensions", + ) + parser.add_argument( + "--num-workers", + default=10, + type=int, + help="number of parllel processes", + ) + + return parser + + +def build_subparser_bingen(parser): + """ + Builds argument parser to support the bin-gen command line interface (under eval). 
+ """ + parser.add_argument( + "--universe", + required=True, + type=str, + help="path to a universe file", + ) + parser.add_argument( + "--token-folder", + required=True, + type=str, + help="path to a folder storing tokenized files", + ) + parser.add_argument( + "--file-name", + required=True, + type=str, + help="name of the generated binary embeddings", + ) + + return parser + + +def build_subparser(parser): + """ + Builds argument parser to support the eval command line interface. + """ + sp = parser.add_subparsers(dest="subcommand") + msg_by_cmd = { + "gdst": "Genome distance scaling test", + "npt": "Neighborhood preserving test", + "ctt": "Cluster tendency test", + "rct": "Reconstruction test", + "bin-gen": "Generate binary embeddings", + } + subparsers = {} + for k, v in msg_by_cmd.items(): + subparsers[k] = sp.add_parser(k, description=v, help=v) + subparsers["gdst"] = build_subparser_gdst(subparsers["gdst"]) + subparsers["npt"] = build_subparser_npt(subparsers["npt"]) + subparsers["ctt"] = build_subparser_ctt(subparsers["ctt"]) + subparsers["rct"] = build_subparser_rct(subparsers["rct"]) + subparsers["bin-gen"] = build_subparser_bingen(subparsers["bin-gen"]) + return parser diff --git a/geniml/eval/const.py b/geniml/eval/const.py new file mode 100644 index 00000000..ed67b7d7 --- /dev/null +++ b/geniml/eval/const.py @@ -0,0 +1,4 @@ +GENOME_DIST_SCALAR = 1e10 +CTT_QUANTILE_MAX = 0.95 +CTT_QUANTILE_MIN = 0.05 +CTT_TEST_RATIO = 0.1 diff --git a/geniml/eval/ctt.py b/geniml/eval/ctt.py new file mode 100644 index 00000000..dbff5355 --- /dev/null +++ b/geniml/eval/ctt.py @@ -0,0 +1,157 @@ +import os +import pickle + +os.environ["OPENBLAS_NUM_THREADS"] = "1" + +from typing import List, Tuple + +import numpy as np +from sklearn.neighbors import NearestNeighbors + +from .const import CTT_QUANTILE_MAX, CTT_QUANTILE_MIN, CTT_TEST_RATIO +from .utils import load_genomic_embeddings + + +def get_ctt_score( + path: str, + embed_type: str, + seed: int = 42, + num_data: int = 
10000, + num_workers: int = 10, +) -> float: + """Runs the cluster tendency test (CTT) on a model. + + Args: + path (str): The path to a model. + embed_type (str): The type of the model: "region2vec" or "base". + seed (int, optional): Random seed. Defaults to 42. + num_data (int, optional): Number of embeddings used for evaluation. + Defaults to 10000. + num_workers (int, optional): Number of parallel processes used. + Defaults to 10. + + Raises: + ValueError: The number of samples is too small. + ZeroDivisionError: The denominator of the CTT score is zero. + + Returns: + float: The CTT score for the model. + """ + np.random.seed(seed) + data, vocab = load_genomic_embeddings(path, embed_type) + num_ori, dimension = data.shape + if num_data < num_ori: + num = num_data + else: + num = num_ori + data = data[np.random.choice(num_ori, num)] + if num < 100: + raise ValueError(f"Number of samples ({num}) is too small") + num_samples = int(num * CTT_TEST_RATIO) + + sel_indexes = np.random.choice(num, num_samples) + data_sample = data[sel_indexes] + neigh = NearestNeighbors(n_neighbors=2, n_jobs=num_workers).fit(data) + sample_dist, _ = neigh.kneighbors(data_sample) + sample_dist_to_nn = sample_dist[:, 1] + + max_vals = np.quantile(data, CTT_QUANTILE_MAX, axis=0) + min_vals = np.quantile(data, CTT_QUANTILE_MIN, axis=0) + random_points = np.random.uniform(min_vals, max_vals, (num_samples, dimension)) + + random_dist, _ = neigh.kneighbors(random_points, n_neighbors=1) + + random_dist_to_nn = random_dist[:, 0] + + x = sum(sample_dist_to_nn**2) + y = sum(random_dist_to_nn**2) + + if x + y == 0: + raise ZeroDivisionError("The denominator is zero") + + return y / (x + y) + + +def get_ctt_batch( + batch: List[Tuple[str, str]], + seed: int = 42, + num_data: int = 10000, + save_path: str = None, + num_workers: int = 10, +) -> List[Tuple[str, float]]: + """Runs the cluster tendency test (CTT) on a batch of models. 
+ + Args: + batch (list[tuple[str, str]]): A list of (model path, model type) + tuples. Model type could be "region2vec" or "base". + seed (int, optional): Random seed. Defaults to 42. + num_data (int, optional): Number of embeddings used for evaluation. + Defaults to 10000. + save_path (str, optional): Save the results to save_path. Defaults to + None. + num_workers (int, optional): Number of parallel processes used. + Defaults to 10. + + Returns: + List[Tuple[str, float]]: A list of (model path, CTT score) tuples. + """ + ctt_arr = [] + for path, embed_type in batch: + ctt = get_ctt_score(path, embed_type, seed, num_data, num_workers) + # print(f"{'/'.join(path.split('/')[-3:])}: {ctt:.4f}") + ctt_arr.append((path, ctt)) + if save_path: + os.makedirs(os.path.dirname(save_path), exist_ok=True) + with open(save_path, "wb") as f: + pickle.dump(ctt_arr, f) + return ctt_arr + + +def ctt_eval( + batch: List[Tuple[str, str]], + num_runs: int = 20, + num_data: int = 10000, + save_folder: str = None, + num_workers: int = 10, +) -> List[Tuple[str, list[float]]]: + """Runs the CTT on a batch of models for multiple times. + + Runs the cluster tendency test (CTT) for a batch of models for num_runs + times with different random seeds. + + + Args: + batch (list[tuple[str, str]]): A list of (model path, model type) + tuples. Model type could be "region2vec" or "base". + num_runs (int, optional): Number of runs. Defaults to 20. + num_data (int, optional): Number of embeddings used for evaluation. + Defaults to 10000. + save_folder (str, optional): Folder to save the results from each run. + Defaults to None. + num_workers (int, optional): Number of parallel processes used. + Defaults to 10. + + Returns: + list[tuple[str, list[float]]]: A list of (model path, CTT scores from + num_runs) tuples. 
+ """ + results_seeds = [] + for seed in range(num_runs): + print(f"----------------Run {seed}----------------") + save_path = os.path.join(save_folder, f"ctt_eval_seed{seed}") if save_folder else None + result_list = get_ctt_batch(batch, seed, num_data, save_path, num_workers) + results_seeds.append(result_list) + + ctt_res = [[] for i in range(len(batch))] + for results in results_seeds: + for i, res in enumerate(results): + ctt_res[i].append(res[1]) + assert res[0] == batch[i][0], "key == batch[i][0]" + + mean_ctt = [np.array(r).mean() for r in ctt_res] + std_ctt = [np.array(r).std() for r in ctt_res] + # models = [t[0] for t in batch] + for i in range(len(mean_ctt)): + print(f"{batch[i][0]}\n CTT (std): {mean_ctt[i]:.4f} ({std_ctt[i]:.4f}) \n") + ctt_arr = [(batch[i][0], ctt_res[i]) for i in range(len(batch))] + return ctt_arr diff --git a/geniml/eval/gdst.py b/geniml/eval/gdst.py new file mode 100644 index 00000000..dfcf01ea --- /dev/null +++ b/geniml/eval/gdst.py @@ -0,0 +1,238 @@ +import os +import pickle +from typing import List, Tuple, Union + +os.environ["OPENBLAS_NUM_THREADS"] = "1" +import multiprocessing as mp +from multiprocessing.queues import Queue + +import numpy as np +from sklearn.linear_model import LinearRegression + +from .const import GENOME_DIST_SCALAR +from .utils import cosine_distance, genome_distance, load_genomic_embeddings + + +def sample_from_vocab(vocab: List[str], num_samples: int, seed: int = 42) -> List[str]: + """Samples regions from vocab. + + Samples regions proportionally from each chromosome. + + Args: + vocab (list[str]): A list of regions. + num_samples (int): Number of regions to sample. + seed (int, optional): Random seed. Defaults to 42. + + Returns: + list[str]: A list of sampled regions. 
+ """ + chr_probs = {} + region_dict = {} + num_vocab = len(vocab) + # build stat from vocab + for region in vocab: + chr_str, position = region.split(":") + chr_str = chr_str.strip() + start, end = position.split("-") + start = int(start.strip()) + end = int(end.strip()) + chr_probs[chr_str] = chr_probs.get(chr_str, 0) + 1 + if chr_str in region_dict: + region_dict[chr_str].append((start, end)) + else: + region_dict[chr_str] = [(start, end)] + total = sum([chr_probs[k] for k in chr_probs]) + chr_probs = [(k, chr_probs[k] / total) for k in chr_probs] + + count = 0 + + chr_names = [t[0] for t in chr_probs] + chr_probs = [t[1] for t in chr_probs] + sampled_regions = [] + np.random.seed(seed) + while count < num_samples: + sel_chr = np.random.choice(chr_names, p=chr_probs) + regions = region_dict[sel_chr] + if len(regions) < 2: + continue + sel_indexes = np.random.choice(len(regions), 2, replace=False) + r1, r2 = regions[sel_indexes[0]], regions[sel_indexes[1]] + gdist = genome_distance(r1, r2) + sampled_regions.append( + ( + f"{sel_chr}:{r1[0]}-{r1[1]}", + f"{sel_chr}:{r2[0]}-{r2[1]}", + gdist, + ) + ) + count += 1 + return sampled_regions + + +def get_gdst_score( + path: str, + embed_type: str, + num_samples: int = 10000, + seed: int = 42, + queue: Queue = None, + worker_id: int = None, +) -> Union[float, Tuple[int, str, float]]: + """Runs the GDST on a model. + + Args: + path (str): The path to a model. + embed_type (str): The model type: "region2vec" or "base". + num_samples (int, optional): Number of embeddings used for evaluation. + Defaults to 10000. + seed (int, optional): Random seed. Defaults to 42. + queue (Queue, optional): The multiprocessing + queue used to store results. Defaults to None. + worker_id (int, optional): Worker id. Defaults to None. + + Returns: + Union[float, tuple[int, str, float]]: A GDST score when used in a single + process; or, a tuple of (worker id, model path, GDST score) when + used in multiple processes. 
+ """ + embed_rep, vocab = load_genomic_embeddings(path, embed_type) + regions = sample_from_vocab(vocab, num_samples, seed) + region2idx = {r: i for i, r in enumerate(vocab)} + gdist_arr = [r[2] / GENOME_DIST_SCALAR for r in regions] + edist_arr = np.array( + [ + cosine_distance(embed_rep[region2idx[t[0]]], embed_rep[region2idx[t[1]]]) + for t in regions + ] + ) + gd_arr = list(zip(gdist_arr, edist_arr)) + X = np.array([[g[0]] for g in gd_arr]) + y = np.array([g[1] for g in gd_arr]) + reg = LinearRegression().fit(X, y) + slope = reg.coef_[0] + if queue: + queue.put((worker_id, path, slope)) + return worker_id, path, slope + else: + return slope + + +def writer_multiprocessing(save_path: str, num: int, q: Queue) -> List[Tuple[str, float]]: + """Writes results from multiple processes to a list. + + Args: + save_path (str): The path to the saved results. + num (int): The number of results. + q (Queue): A multiprocessing queue. + + Returns: + list[tuple[str, float]]: A list of (model path, GDST score) tuples. + """ + results = ["" for i in range(num)] + while True: + m = q.get() + if m == "kill": + break + index = m[0] + results[index] = (m[1], m[2]) + if save_path: + os.makedirs(os.path.dirname(save_path), exist_ok=True) + with open(save_path, "wb") as f: + pickle.dump(results, f) + return results + + +def get_gdst_score_batch( + batch: List[Tuple[str, str]], + num_samples: int = 10000, + seed: int = 42, + save_path: str = None, + num_workers: int = 1, +) -> List[Tuple[str, float]]: + """Runs the GDST on a batch of models. + + Args: + batch (list[tuple[str, str]]): A list of (model path, model type) tuples. + num_samples (int, optional): Number of embeddings used for evaluation. + Defaults to 10000. + seed (int, optional): Random seed. Defaults to 42. + save_path (str, optional): Save the results to save_path. Defaults to + None. + num_workers (int, optional): Number of parallel processes used. + Defaults to 1. 
+ + Returns: + list[tuple[str, float]]: A list of (model path, GDST score) tuples. + """ + if num_workers <= 1: + gds_arr = [] + for path, embed_type in batch: + gds = get_gdst_score(path, embed_type, num_samples, seed) + print(path, gds) + gds_arr.append((path, gds)) + if save_path: + os.makedirs(os.path.dirname(save_path), exist_ok=True) + with open(save_path, "wb") as f: + pickle.dump(gds_arr, f) + else: + manager = mp.Manager() + queue = manager.Queue() + with mp.Pool(processes=num_workers) as pool: + writer = pool.apply_async(writer_multiprocessing, (save_path, len(batch), queue)) + all_processes = [] + for i, (path, embed_type) in enumerate(batch): + process = pool.apply_async( + get_gdst_score, + (path, embed_type, num_samples, seed, queue, i), + ) + all_processes.append(process) + + for process in all_processes: + process.get() + queue.put("kill") + gds_arr = writer.get() + return gds_arr + + +def gdst_eval( + batch: List[Tuple[str, str]], + num_runs: int = 20, + num_samples: int = 1000, + save_folder: str = None, + num_workers: int = 10, +) -> List[Tuple[str, List[float]]]: + """Runs the GDST on a batch of models for multiple times. + + Args: + batch (list[tuple[str, str]]): A list of (model path, model type) tuples. + num_runs (int, optional): Number of runs. Defaults to 20. + num_samples (int, optional): Number of embeddings used for evaluation. + Defaults to 1000. + save_folder (str, optional): Folder to save the results from each run. + Defaults to None. + num_workers (int, optional): Number of parallel processes used. + Defaults to 10. + + Returns: + list[tuple[str, list[float]]]: A list of (model path, GDST scores from + num_runs) tuples. 
+ """ + results_seeds = [] + for seed in range(num_runs): + print(f"----------------Run {seed}----------------") + save_path = os.path.join(save_folder, f"gdst_eval_seed{seed}") if save_folder else None + result_list = get_gdst_score_batch(batch, num_samples, seed, save_path, num_workers) + results_seeds.append(result_list) + + gds_res = [[] for i in range(len(batch))] + for results in results_seeds: + for i, res in enumerate(results): + gds_res[i].append(res[1]) + assert res[0] == batch[i][0], "key == batch[i][0]" + + mean_gds = [np.array(r).mean() for r in gds_res] + std_gds = [np.array(r).std() for r in gds_res] + models = [t[0] for t in batch] + for i in range(len(mean_gds)): + print(f"{batch[i][0]}\n GDST score (std): {mean_gds[i]:.4f} ({std_gds[i]:.4f}) \n") + gds_arr = [(batch[i][0], gds_res[i]) for i in range(len(batch))] + return gds_arr diff --git a/geniml/eval/npt.py b/geniml/eval/npt.py new file mode 100644 index 00000000..de726efa --- /dev/null +++ b/geniml/eval/npt.py @@ -0,0 +1,519 @@ +import os + +os.environ["OPENBLAS_NUM_THREADS"] = "1" + +import multiprocessing as mp +import pickle +from multiprocessing.queues import Queue +from typing import Dict, List, Tuple, Union + +import numpy as np + +from .utils import genome_distance, load_genomic_embeddings + + +def get_topk_embed( + i: int, K: int, embed: np.ndarray, dist: str = "cosine" +) -> Tuple[np.ndarray, np.ndarray]: + """Gets the nearest K embedding indexes to the i-th embedding. + + Args: + i (int): The index for the query embedding. + K (int): The number of nearest embeddings to select. + embed (np.ndarray): An array of embedding vectors + dist (str, optional): The distance function used. Defaults to "cosine". + + Returns: + tuple[np.ndarray, np.ndarray]: K indexes of nearest embeddings and the + corresponding similarities. 
+ """ + num = len(embed) + if dist == "cosine": + nom = np.dot(embed[i : i + 1], embed.T) + denom = np.linalg.norm(embed[i : i + 1]) * np.linalg.norm(embed, axis=1) + sims = (nom / denom)[0] + indexes = np.argsort(-sims)[1 : K + 1] + s = sims[indexes] + elif dist == "euclidean": + dist = np.linalg.norm(embed[i : i + 1] - embed, axis=1) + indexes = np.argsort(dist)[1 : K + 1] + s = -dist[indexes] + elif dist == "jaccard": + nom = np.dot(embed[i : i + 1], embed.T) + denom = ((embed[i : i + 1] + embed) > 0.0).sum(axis=1) + sims = (nom / denom)[0] + indexes = np.argsort(-sims)[1 : K + 1] + s = sims[indexes] + return indexes, s + + +def find_kneighbors(region_array: List[Tuple[str, int, int]], index: int, k: int) -> List[int]: + """Finds the indexes of the K nearest regions of a query region on genome. + + region_array must be sorted, and all regions are on the same chromosome. + + Args: + region_array (list[tuple[str, int, int]]): A list of (chromosome, start + position, end position) tuples. + index (int): The index of the query region. + k (int): Specifies the number of nearest neighbors of the query region. + + Returns: + list[int]: A list of indexes of the K nearest neighbors in + region_array. + """ + if len(region_array) < k: + k = len(region_array) + qregion = region_array[index] + left_idx = max(index - k, 0) + right_idx = min(index + k, len(region_array) - 1) + rdist_arr = [] + for idx in range(left_idx, right_idx + 1): + rdist_arr.append(genome_distance(qregion, region_array[idx])) + rdist_arr = np.array(rdist_arr) + Kneighbors_idx = np.argsort(rdist_arr)[1 : k + 1] + Kneighbors_idx = Kneighbors_idx + left_idx + return Kneighbors_idx + + +def calculate_overlap_bins( + local_idx: int, + K: int, + chromo: str, + region_array: List[Tuple[str, int, int]], + region2index: dict[str, int], + embed_rep: np.ndarray, + res: int = 10, + dist: str = "cosine", + same_chromo: bool = True, +) -> np.ndarray: + """Calculates the overlap ratios for a region. 
+ + Calculates the overlap ratios for a region between its K-nearest neighbor + set obtained using genome distance and its K-nearest neighbor set obtained + using embedding distance. If res < K, then calculates ratios for size + res*1, res*2, ..., min(res*n, K). + + Args: + local_idx (int): The local index of a region on its chromosome. + K (int): Specifies the number of nearest neighbors. + chromo (str): Chromosome. + region_array (list[tuple[str, int, int]]): A list of (chromosome, start + position, end position) tuples. + region2index (dict[str, int]): A dictionary of (region, index). + embed_rep (np.ndarray): An array of embedding vectors. + res (int, optional): Resolution. Size of neighborhood set. Defaults to + 10. + dist (str, optional): Distance function. Defaults to "cosine". + same_chromo (bool, optional): Whether to find nearest neighbors on the + same chromosome in the embedding space. Defaults to True. + + Returns: + np.ndarray: An array of overlap ratios. + """ + Kindices = find_kneighbors(region_array, local_idx, K) + if len(Kindices) == 0: + return 0 + str_kregions = [ + f"{chromo}:{region_array[k][0]}-{region_array[k][1]}" for k in Kindices + ] # sorted in ascending order + _Krdist_global_indices = np.array([region2index[r] for r in str_kregions]) + + if same_chromo: + chr_regions = [ + f"{chromo}:{region_array[k][0]}-{region_array[k][1]}" for k in range(len(region_array)) + ] + chr_global_indices = np.array([region2index[r] for r in chr_regions]) + chr_embeds = embed_rep[chr_global_indices] + _Kedist_local_indices, _ = get_topk_embed(local_idx, K, chr_embeds, dist) + _Kedist_global_indices = np.array([chr_global_indices[i] for i in _Kedist_local_indices]) + else: + idx = region2index[f"{chromo}:{region_array[local_idx][0]}-{region_array[local_idx][1]}"] + _Kedist_global_indices, _ = get_topk_embed(idx, K, embed_rep, dist) + + bin_overlaps = [] + prev = 0 + assert res < K + 1, "resolution < K + 1" + for i in range(res, K + 1, res): + set1 = 
set(_Krdist_global_indices[prev:i]) + set2 = set(_Kedist_global_indices[prev:i]) + + overlap = len(set1.intersection(set2)) / len(set1) + bin_overlaps.append(overlap) + + return np.array(bin_overlaps) + + +def cal_snpr(ratio_embed: np.ndarray, ratio_random: np.ndarray) -> np.ndarray: + """Calculates SNPR values. + + :param ratio_embed: Overlap ratios for query embeddings. + :param ratio_random: Overlap ratios for random embeddings. + + :return: SNPR values. + """ + res = np.log10((ratio_embed + 1.0e-10) / (ratio_random + 1.0e-10)) + res = np.maximum(res, 0) + return res + + +var_dict = {} + + +def worker_func( + i: int, + K: int, + chromo: str, + region_array: List[Tuple[str, int, int]], + embed_type: str, + resolution: int, + dist: str, +) -> np.ndarray: + """Wrapper for calculate_overlap_bins + + Args: + i (int): The local index of a region on its chromosome. + K (int): Specifies the number of nearest neighbors. + chromo (str): Chromosome. + region_array (list[tuple[str, int, int]]): A list of (chromosome, start + position, end position) tuples. + embed_type (str): Embedding type, "region2vec" or "base". + resolution (int): Resolution. + dist (str): Distance function. + + Returns: + np.ndarray: An array of overlap ratios. + """ + if embed_type == "embed": + embeds = var_dict["embed_rep"] + elif embed_type == "random": + embeds = var_dict["ref_embed"] + nprs = calculate_overlap_bins( + i, + K, + chromo, + region_array, + var_dict["region2vec_index"], + embeds, + resolution, + dist, + ) + return nprs + + +def init_worker( + embed_rep: np.ndarray, ref_embed: np.ndarray, region2index: Dict[str, int] +) -> None: + """Initializes data used by workers. + + Args: + embed_rep (np.ndarray): Query embeddings. + ref_embed (np.ndarray): Random embeddings. + region2index (dict[str, int]): A region to index dictionary. 
+ """ + var_dict["embed_rep"] = embed_rep + var_dict["ref_embed"] = ref_embed + var_dict["region2vec_index"] = region2index + + +def get_npt_score( + model_path: str, + embed_type: str, + K: int, + num_samples: int = 100, + seed: int = 0, + resolution: int = 10, + dist: str = "cosine", + num_workers: int = 10, +) -> Dict[str, Union[int, np.ndarray, str]]: + """Runs the NPT on a model. + + If num_samples > 0, then randomly sample num_samples regions proportional + from each chromosome. If num_samples == 0, all regions are used in the + test. If K > resolution, then returns an array of NPT scores; otherwise, + returns one NPT score. + + Args: + model_path (str): The path to a model. + embed_type (str): The model type: "region2vec" or "base". + K (int): Specifies the number of nearest neighbors. + num_samples (int, optional): Number of embeddings used for evaluation. + Defaults to 100. + seed (int, optional): Random seed. Defaults to 0. + resolution (int, optional): Resolution of a neighborhood set. Defaults + to 10. + dist (str, optional): Distance function. Defaults to "cosine". + num_workers (int, optional): Number of parallel processes used. + Defaults to 10. + + Returns: + dict[str, Union[int, np.ndarray, str]]: NPT results in a dictionary + "K": K, + "Avg_qNPR": Average NPR ratios for query embeddings, + "Avg_rNPR": Average NPR ratios for random embeddings,, + "SNPR": SNPR values, + "Resolution": Resolution, + "Path": Model path, + """ + embed_rep, regions_r2v = load_genomic_embeddings(model_path, embed_type) + + region2index = {r: i for i, r in enumerate(regions_r2v)} + # Group regions by chromosomes + chromo_regions = {} + for v in regions_r2v: + chromo, region = v.split(":") # e.g. 
chr1:100-1000 + chromo = chromo.strip() # remove possible spaces + region = region.strip() # remove possible spaces + start, end = region.split("-") + start = int(start.strip()) + end = int(end.strip()) + if chromo not in chromo_regions: + chromo_regions[chromo] = [(start, end)] + else: + chromo_regions[chromo].append((start, end)) + + # sort regions in each chromosome + chromo_ratios = {} + for chromo in chromo_regions: + region_array = chromo_regions[chromo] + chromo_regions[chromo] = sorted(region_array, key=lambda x: x[0]) + chromo_ratios[chromo] = len(region_array) / len(regions_r2v) + + num_regions, num_dim = embed_rep.shape + + np.random.seed(seed) + + ref_embed = (np.random.rand(num_regions, num_dim) - 0.5) / num_dim + + avg_ratio = 0.0 + avg_ratio_ref = 0.0 + count = 0 + + if num_workers > 1: + with mp.Pool( + processes=num_workers, + initializer=init_worker, + initargs=(embed_rep, ref_embed, region2index), + ) as pool: + all_processes = [] + for chromo in chromo_regions: + region_array = chromo_regions[chromo] + if num_samples == 0: # exhaustive + indexes = list(range(len(region_array))) + else: + num = min( + len(region_array), + round(num_samples * chromo_ratios[chromo]), + ) + indexes = np.random.permutation(len(region_array))[0:num] + for i in indexes: + process_embed = pool.apply_async( + worker_func, + (i, K, chromo, region_array, "embed", resolution, dist), + ) + process_random = pool.apply_async( + worker_func, + ( + i, + K, + chromo, + region_array, + "random", + resolution, + dist, + ), + ) + all_processes.append((process_embed, process_random)) + + for i, (process_embed, process_random) in enumerate(all_processes): + avg_ratio = (avg_ratio * count + process_embed.get()) / (count + 1) + avg_ratio_ref = (avg_ratio_ref * count + process_random.get()) / (count + 1) + count = count + 1 + else: + for chromo in chromo_regions: + region_array = chromo_regions[chromo] + if num_samples == 0: # exhaustive + indexes = list(range(len(region_array))) + else: 
+ num = min( + len(region_array), + round(num_samples * chromo_ratios[chromo]), + ) + indexes = np.random.permutation(len(region_array))[0:num] + for i in indexes: + nprs_embed = calculate_overlap_bins( + i, + K, + chromo, + region_array, + region2index, + embed_rep, + resolution, + dist, + ) + nprs_random = calculate_overlap_bins( + i, + K, + chromo, + region_array, + region2index, + ref_embed, + resolution, + dist, + ) + avg_ratio = (avg_ratio * count + nprs_embed) / (count + 1) + avg_ratio_ref = (avg_ratio_ref * count + nprs_random) / (count + 1) + count = count + 1 + snprs = cal_snpr(avg_ratio, avg_ratio_ref) + + ratio_msg = " ".join([f"{r:.6f}" for r in avg_ratio]) + ratio_ref_msg = " ".join([f"{r:.6f}" for r in avg_ratio_ref]) + snprs_msg = " ".join([f"{r:.6f}" for r in snprs]) + result = { + "K": K, + "Avg_qNPR": avg_ratio, + "Avg_rNPR": avg_ratio_ref, + "SNPR": snprs, + "Resolution": resolution, + "Path": model_path, + } + return result + + +def writer_multiprocessing(save_path: str, num: int, q: Queue) -> List[Tuple[str, float]]: + """Writes results from multiple processes to a list. + + Args: + save_path (str): The path to the saved results. + num (int): The number of results. + q (Queue): A multiprocessing queue. + + Returns: + list[tuple[str, float]]: A list of (model path, NPT score) tuples. + """ + results = [[] for i in range(num)] + while True: + m = q.get() + if m == "kill": + break + worker_id = m[0] + results[worker_id] = m[1] + if save_path: + os.makedirs(os.path.dirname(save_path), exist_ok=True) + with open(save_path, "wb") as f: + pickle.dump(results, f) + return results + + +def get_npt_score_batch( + batch: List[Tuple[str, str]], + K: int, + num_samples: int = 100, + num_workers: int = 10, + seed: int = 0, + resolution: int = 10, + dist: str = "cosine", + save_path: str = None, +) -> List[Dict[str, Union[int, np.ndarray, str]]]: + """Runs the NPT on a batch of models. 
+ + Args: + batch (list[tuple[str, str]]): A list of (model path, model type) tuples. + K (int): Specifies the number of nearest neighbors. + num_samples (int, optional): Number of embeddings used for evaluation. + Defaults to 100. + num_workers (int, optional): Number of parallel processes used. + Defaults to 10. + seed (int, optional): Random seed. Defaults to 0. + resolution (int, optional): Resolution of a neighborhood set. Defaults + to 10. + dist (str, optional): Distance function. Defaults to "cosine". + save_path (str, optional): Save the results to save_path. Defaults to + None. + + Returns: + list[dict[str, Union[int, np.ndarray, str]]]: A list of dictionaries of + NPT results. + """ + result_list = [] + for index, (path, embed_type) in enumerate(batch): + result = get_npt_score( + path, + embed_type, + K, + num_samples, + seed, + resolution, + dist, + num_workers, + ) + result_list.append(result) + if save_path: + os.makedirs(os.path.dirname(save_path), exist_ok=True) + with open(save_path, "wb") as f: + pickle.dump(result_list, f) + return result_list + + +def npt_eval( + batch: List[Tuple[str, str]], + K: int, + num_samples: int = 100, + num_workers: int = 10, + num_runs: int = 20, + resolution: int = 10, + dist: str = "cosine", + save_folder: str = None, +) -> List[Tuple[str, np.ndarray, int]]: + """Runs the NPT on a batch of models for multiple times. + + Args: + batch (list[tuple[str, str]]): A list of (model path, model type) tuples. + K (int): Specifies the number of nearest neighbors. + num_samples (int, optional): Number of embeddings used for evaluation. + Defaults to 100. + num_workers (int, optional): Number of parallel processes used. + Defaults to 10. + num_runs (int, optional): Number of runs. Defaults to 20. + resolution (int, optional): Resolution of a neighborhood set. + Defaults to 10. + dist (str, optional): Distance function. Defaults to "cosine". + save_folder (str, optional): Folder to save the results from each run. 
+ Defaults to None. + + Returns: + list[tuple[str, np.ndarray, int]]: A list of (model path, snprs from + num_runs, resolution) tuples. + """ + results_seeds = [] + assert resolution <= K, "resolution <= K" + for seed in range(num_runs): + print(f"----------------Run {seed}----------------") + save_path = os.path.join(save_folder, f"npt_eval_seed{seed}") if save_folder else None + result_list = get_npt_score_batch( + batch, + K, + num_samples=num_samples, + num_workers=num_workers, + seed=seed, + resolution=resolution, + dist=dist, + save_path=save_path, + ) + results_seeds.append(result_list) + snpr_results = [[] for i in range(len(batch))] + paths = ["" for i in range(len(batch))] + for results in results_seeds: + for i, result in enumerate(results): + key = result["Path"] + snpr_results[i].append(result["SNPR"]) + paths[i] = key + snpr_results = [np.array(v) for v in snpr_results] + for i in range(len(batch)): + snpr_arr = snpr_results[i] + avg_snprs = snpr_arr.mean(axis=0) + std_snprs = snpr_arr.std(axis=0) + msg = " ".join([f"{m:.4f}({s:.4f})" for m, s in zip(avg_snprs, std_snprs)]) + print(f"{paths[i]}\nSNPRs:{msg}\n") + snpr_results = [(paths[i], snpr_results[i], resolution) for i in range(len(batch))] + return snpr_results diff --git a/geniml/eval/rct.py b/geniml/eval/rct.py new file mode 100644 index 00000000..9a419d69 --- /dev/null +++ b/geniml/eval/rct.py @@ -0,0 +1,170 @@ +import argparse +import glob +import multiprocessing as mp +import os +import pickle +import random +import time + +import numpy as np +import sklearn.neural_network as nn +from gensim.models import Word2Vec +from sklearn.compose import TransformedTargetRegressor +from sklearn.model_selection import KFold, cross_val_score, train_test_split +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler + +from ..utils import timer_func +from .utils import cosine_distance, genome_distance, load_genomic_embeddings + + +def get_rct_score( + path: str, + 
embed_type: str, + bin_path: str, + out_dim: int = -1, + cv_num: int = 5, + seed: int = 42, + num_workers: int = 10, +) -> float: + """Runs the RCT on a model. + + Args: + path (str): The path to a model. + embed_type (str): The type of the model: "region2vec" or "base". + bin_path (str): The path to a binary embedding model. + out_dim (int, optional): Output dimension of a prediction. Defaults to + -1, and out_dim is the same as the dimension of a binary embedding. + cv_num (int, optional): Number of folds in cross-validation. Defaults + to 5. + seed (int, optional): Random seed. Defaults to 42. + num_workers (int, optional): Number of parallel processes used. + Defaults to 10. + + Returns: + float: the RCT score. + """ + embed_rep, vocab = load_genomic_embeddings(path, embed_type) + embed_bin, vocab_bin = load_genomic_embeddings(bin_path, "base") + region2idx = {r: i for i, r in enumerate(vocab)} + region2idx_bin = {r: i for i, r in enumerate(vocab_bin)} + # align embed_bin with embed_rep + if out_dim <= 0: + embed_bin = np.array([embed_bin[region2idx_bin[v]] for v in vocab]) + else: + bin_dim = embed_bin.shape[1] + out_dim = min(bin_dim, out_dim) + sel_dims = np.random.choice(bin_dim, out_dim) + embed_bin = np.array([embed_bin[region2idx_bin[v]][sel_dims] for v in vocab]) + + regressor = nn.MLPRegressor( + hidden_layer_sizes=(200), + activation="relu", + solver="adam", + alpha=0.0001, # regularizer strength + batch_size="auto", + learning_rate_init=0.001, + max_iter=200, + shuffle=True, + random_state=seed, + tol=0.0001, + verbose=False, + early_stopping=False, + validation_fraction=0.1, + n_iter_no_change=10, + ) + model_in = make_pipeline(StandardScaler(), regressor) + model = TransformedTargetRegressor(regressor=model_in, transformer=StandardScaler()) + + kf = KFold(n_splits=cv_num, shuffle=True, random_state=seed) + if num_workers > cv_num: + num_workers = cv_num + score = cross_val_score(model, embed_rep, embed_bin, cv=kf, n_jobs=num_workers, verbose=0) + 
return score.mean() + + +def reconstruction_batch( + batch: list[tuple[str, str, str]], + cv_num: int, + out_dim: int = -1, + seed: int = 42, + save_path: str = None, + num_workers: int = 10, +) -> list[tuple[str, float]]: + """Runs the RCT on a batch of models. + + Args: + batch (list[tuple[str, str, str]]): A list of (model path, model type, + binary embedding path) tuples. Model type could be "region2vec" or + "base". + cv_num (int): Number of folds in cross-validation. Defaults + to 5. + out_dim (int, optional): Output dimension of a prediction. Defaults to + -1, and out_dim is the same as the dimension of a binary embedding. + seed (int, optional): Random seed. Defaults to 42. + save_path (str, optional): Save the results to save_path. Defaults to + None. + num_workers (int, optional): Number of parallel processes used. + Defaults to 10. + + Returns: + list[tuple[str, float]]: A list of (model path, RCT score) tuples. + """ + rct_arr = [] + for path, embed_type, bin_path in batch: + score = get_rct_score(path, embed_type, bin_path, out_dim, cv_num, seed, num_workers) + rct_arr.append((path, score)) + if save_path: + os.makedirs(os.path.dirname(save_path), exist_ok=True) + with open(save_path, "wb") as f: + pickle.dump(rct_arr, f) + return rct_arr + + +def rct_eval( + batch: list[tuple[str, str, str]], + num_runs: int = 5, + cv_num: int = 5, + out_dim: int = -1, + save_folder: str = None, + num_workers: int = 10, +) -> list[tuple[str, list[float]]]: + """Runs the RCT on a batch of models for multiple times. + + Args: + batch (list[tuple[str, str, str]]): A list of (model path, model type, + binary embedding path) tuples. Model type could be "region2vec" or + "base". + num_runs (int, optional): Number of runs. Defaults to 5. + cv_num (int, optional): Number of folds in cross-validation. + Defaults to 5. + out_dim (int, optional): Output dimension of a prediction. Defaults to + -1, and out_dim is the same as the dimension of a binary embedding. 
+ save_folder (str, optional): Folder to save the results from each run. + Defaults to None. + num_workers (int, optional): Number of parallel processes used. + Defaults to 10. + + Returns: + list[tuple[str, list[float]]]: A list of (model path, RCT scores from + num_runs) tuples. + """ + results_seeds = [] + for seed in range(num_runs): + print(f"----------------Run {seed}----------------") + save_path = os.path.join(save_folder, f"rct_eval_seed{seed}") if save_folder else None + result_list = reconstruction_batch(batch, cv_num, out_dim, seed, save_path, num_workers) + results_seeds.append(result_list) + + rct_res = [[] for i in range(len(batch))] + for results in results_seeds: + for i, res in enumerate(results): + rct_res[i].append(res[1]) + assert res[0] == batch[i][0], "key == batch[i][0]" + mean_rct = [np.array(r).mean() for r in rct_res] + std_rct = [np.array(r).std() for r in rct_res] + models = [t[0] for t in batch] + for i in range(len(mean_rct)): + print(f"{batch[i][0]}\n RCT (std): {mean_rct[i]:.4f} ({std_rct[i]:.4f}) \n") + rct_arr = [(batch[i][0], rct_res[i]) for i in range(len(batch))] + return rct_arr diff --git a/geniml/eval/utils.py b/geniml/eval/utils.py new file mode 100644 index 00000000..3d2c3496 --- /dev/null +++ b/geniml/eval/utils.py @@ -0,0 +1,257 @@ +import os +import pickle +from typing import Dict, List, Tuple, Union + +import numpy as np +from gensim.models import Word2Vec + +from ..region2vec import Region2VecExModel + + +def genome_distance(u: Tuple[int, int], v: Tuple[int, int]) -> float: + """Computes the genome distance between two regions. + + Assumes that the two regions, u and v, are on the same chromosome. + + Args: + u (tuple[int, int]): A region denoted by its start and end positions. + v (tuple[int, int]): A region denoted by its start and end positions. + + Returns: + float: The genome distance between the two regions. 
+ """ + return float(u[1] < v[1]) * max(v[0] - u[1] + 1, 0) + float(u[1] >= v[1]) * max( + u[0] - v[1] + 1, 0 + ) + + +def cosine_distance(x: np.ndarray, y: np.ndarray) -> float: + """Calculates the cosine distance between two embedding vectors. + + Args: + x (np.ndarray): An embedding vector. + y (np.ndarray): An embedding vector. + + Returns: + float: The cosine distance between two embedding vectors. + """ + return (1 - ((x / np.linalg.norm(x)) * (y / np.linalg.norm(y))).sum()) / 2 + + +class BaseEmbeddings: + """Wraps embeddings and the corresponding regions in one object. + + Attributes: + embeddings (np.ndarray): Region embedding vectors. + vocab (list[str]): A list of regions in the format of chr:start-end. + """ + + def __init__(self, embeddings, vocab): + self.embeddings = embeddings + self.vocab = vocab + + +def get_bin_embeddings(universe_file: str, tokenized_files: list[str]) -> BaseEmbeddings: + """Gets a BaseEmbeddings object for binary embeddings. + + Args: + universe_file (str): The path to a universe file. + tokenized_files (list[str]): A list of tokoenized BED files (in full + paths). + + Returns: + BaseEmbeddings: A BaseEmbeddings object for binary embeddings. 
+ """ + vocab = [] + with open(universe_file, "r") as f: + for line in f: + elements = line.strip().split("\t") + region = f"{elements[0]}:{elements[1]}-{elements[2]}" + vocab.append(region) + vocab_dict = {v: i for i, v in enumerate(vocab)} + print("vocab size is", len(vocab)) + bin_embeds = np.zeros((len(vocab), len(tokenized_files))) + for i, token_file in enumerate(tokenized_files): + with open(token_file, "r") as f: + for line in f: + elements = line.strip().split("\t") + region = f"{elements[0]}:{elements[1]}-{elements[2]}" + if region in vocab_dict: + bin_embeds[vocab_dict[region]][i] = 1 + bin_embed_obj = BaseEmbeddings(bin_embeds, vocab) + return bin_embed_obj + + +def get_pca_embeddings( + bin_embed_obj: BaseEmbeddings, dim: int, kwargs: Dict[str, Union[int, float]] = {} +) -> BaseEmbeddings: + """Gets PCA embeddings from binary embeddings. + + Args: + bin_embed_obj (BaseEmbeddings): A BaseEmbeddings object for binary embeddings. + dim (int): Number of dimensions for PCA embeddings. + kwargs (dict[str, Union[int, float]], optional): Parameters passed to + PCA. Defaults to {}. + + Returns: + BaseEmbeddings: A BaseEmbeddings object for PCA embeddings. + """ + from sklearn.decomposition import PCA + + embeds = PCA(n_components=dim, **kwargs).fit_transform(bin_embed_obj.embeddings) + pca_embed_obj = BaseEmbeddings(embeds, bin_embed_obj.vocab) + return pca_embed_obj + + +def get_umap_embeddings( + bin_embed_obj: BaseEmbeddings, dim: int, kwargs: Dict[str, Union[int, float]] = {} +) -> BaseEmbeddings: + """Gets UMAP embeddings from binary embeddings. + + Args: + bin_embed_obj (BaseEmbeddings): A BaseEmbeddings object for binary embeddings. + dim (int): Number of dimensions for UMAP embeddings. + kwargs (dict[str, Union[int, float]], optional): Parameters passed to + UMAP. Defaults to {}. + + Returns: + BaseEmbeddings: A BaseEmbeddings object for UMAP embeddings. 
+ """ + import umap + + embeds = umap.UMAP(n_components=dim, **kwargs).fit_transform(bin_embed_obj.embeddings) + umap_embed_obj = BaseEmbeddings(embeds, bin_embed_obj.vocab) + return umap_embed_obj + + +def save_base_embeddings(base_embed_obj: BaseEmbeddings, file_name: str) -> None: + """Saves the BaseEmbeddings object to disk. + + Args: + base_embed_obj (BaseEmbeddings): A BaseEmbeddings object. + file_name (str): Save the BaseEmbeddings object to file_name. + """ + with open(file_name, "wb") as f: + pickle.dump(base_embed_obj, f) + + +def load_base_embeddings(path: str) -> Tuple[np.ndarray, List[str]]: + """Loads a BaseEmbeddings object. + + Args: + path (str): The path to a BaseEmbeddings object. + + Returns: + tuple[np.ndarray, list[str]]: Embedding vectors and the corresponding + region list. + """ + with open(path, "rb") as f: + base_embed_obj = pickle.load(f) + return base_embed_obj.embeddings, base_embed_obj.vocab + + +def load_genomic_embeddings( + model_path: str, embed_type: str = "region2vec" +) -> Tuple[np.ndarray, List[str]]: + """Loads genomic region embeddings based on the type. + + Args: + model_path (str): The path to a saved model, or a huggingface repo of a model. + embed_type (str, optional): The model type. Defaults to "region2vec". + Can be "region2vec", "base", or "huggingface". + + Returns: + tuple[np.ndarray, list[str]]: Embedding vectors and the corresponding + region list. 
+ """ + if os.path.exists(model_path): + # try to load local + if embed_type == "region2vec": + model = Word2Vec.load(model_path) + regions_r2v = model.wv.index_to_key + embed_rep = model.wv.vectors + return embed_rep, regions_r2v + elif embed_type == "base": + embed_rep, regions_r2v = load_base_embeddings(model_path) + return embed_rep, regions_r2v + + else: + # try to load from huggingface + exmodel = Region2VecExModel(model_path) + embed_rep = exmodel.model.projection.weight.data.numpy() + regions_r2v = [region2vocab_modify(r) for r in exmodel.tokenizer.universe.regions] + # remove embeddings representing unknown token and padding token + return embed_rep[:-2], regions_r2v + + +def region2vocab_modify(region) -> str: + """Convert a builtins.Region object to a string in the format of chr:start-end. + + Args: + region (builtins.Region): A region stored in tokenizer + + Returns: + str: region string in standardized format chr:start-end. + """ + return f"{region.chr}:{region.start}-{region.end}" + + +def sort_key(x: str) -> Tuple[int, int]: + """Extracts chromosome in number and the start position of a region. + + Args: + x (str): A region in the chr:start-end position. + + Returns: + tuple[int, int]: Chromosome in number and the start position. + """ + elements = x.split(":") + chr_idx = elements[0][3:] + try: + idx = int(chr_idx) + except ValueError: + idx = 23 + for c in chr_idx: + idx += ord(c) + start = int(elements[1].split("-")[0].strip()) + return idx, start + + +def get_vocab(model_path: str, type: str = "base", ordered: bool = True) -> List[str]: + """Gets vocab from a model. + + Args: + model_path (str): The path to a saved model. + type (str, optional): The embedding type. Defaults to "base". + ordered (bool, optional): Choose whether to sort the regions. Defaults + to True. + + Returns: + list[str]: A list of regions. 
+ """ + + if type == "region2vec": + model = Word2Vec.load(model_path) + regions_r2v = model.wv.index_to_key + elif type == "base": + _, regions_r2v = load_base_embeddings(model_path) + if ordered: + regions_r2v = sorted(regions_r2v, key=sort_key) + return regions_r2v + + +def write_vocab(vocab: List[str], file_name: str) -> None: + """Writes a list of regions to a file. + + Args: + vocab (list[str]): A list of regions in the format of chr:start-end. + file_name (str): Saves vocab as file_name. + """ + with open(file_name, "w") as f: + for v in vocab: + elements = v.split(":") + chr = elements[0].strip() + s, e = elements[1].split("-") + s = s.strip() + e = e.strip() + f.write(f"{chr}\t{s}\t{e}\n") diff --git a/geniml/exceptions.py b/geniml/exceptions.py new file mode 100644 index 00000000..e44a6345 --- /dev/null +++ b/geniml/exceptions.py @@ -0,0 +1,26 @@ +class GenimlBaseError(Exception): + """Base error type for peppy custom errors.""" + + def __init__(self, msg): + super(GenimlBaseError, self).__init__(msg) + + +class BBClientError(GenimlBaseError): + """Base error type for BBClient errors.""" + + def __init__(self, msg): + super(BBClientError, self).__init__(msg) + + +class TokenizedFileNotFoundError(BBClientError): + """Error raised when a tokenized file is not found.""" + + def __init__(self, msg): + super(TokenizedFileNotFoundError, self).__init__(msg) + + +class TokenizedFileNotFoundInCacheError(BBClientError): + """Error raised when a tokenized file is not found in cache.""" + + def __init__(self, msg): + super(TokenizedFileNotFoundInCacheError, self).__init__(msg) diff --git a/geniml/geneformer/__init__.py b/geniml/geneformer/__init__.py new file mode 100644 index 00000000..907c58e6 --- /dev/null +++ b/geniml/geneformer/__init__.py @@ -0,0 +1,18 @@ +from transformers import AutoConfig +from ._version import VERSION + +from .configuration_geneformer import GeneformerConfig +from .modeling_geneformer import ( + GeneformerModel +) +from 
.tokenization_geneformer import TranscriptomeTokenizer + +AutoConfig.register("geneformer", GeneformerConfig) + +__all__ = [ + "GeneformerConfig", + "GeneformerModel", + "TranscriptomeTokenizer" +] +__version__ = VERSION +__author__ = "Nathan LeRoy, Christina Theodoris" \ No newline at end of file diff --git a/geniml/geneformer/_version.py b/geniml/geneformer/_version.py new file mode 100644 index 00000000..58c8455f --- /dev/null +++ b/geniml/geneformer/_version.py @@ -0,0 +1 @@ +VERSION = "0.1.0" \ No newline at end of file diff --git a/geniml/geneformer/configuration_geneformer.py b/geniml/geneformer/configuration_geneformer.py new file mode 100644 index 00000000..2b781378 --- /dev/null +++ b/geniml/geneformer/configuration_geneformer.py @@ -0,0 +1,42 @@ +from transformers import BertConfig + + +class GeneformerConfig(BertConfig): + """ + Configuration for Geneformer model, a BERT-like transformer for gene tokens. + """ + model_type = "geneformer" + + def __init__( + self, + vocab_size: int = 20275, + hidden_size: int = 512, + intermediate_size: int = 1024, + num_attention_heads: int = 8, + num_hidden_layers: int = 12, + attention_probs_dropout_prob: float = 0.02, + hidden_act: str = "relu", + hidden_dropout_prob: float = 0.02, + initializer_range: float = 0.02, + layer_norm_eps: float = 1e-12, + max_position_embeddings: int = 4096, + pad_token_id: int = 0, + classifier_dropout: float = None, + **kwargs + ): + super().__init__( + vocab_size=vocab_size, + hidden_size=hidden_size, + intermediate_size=intermediate_size, + num_attention_heads=num_attention_heads, + num_hidden_layers=num_hidden_layers, + attention_probs_dropout_prob=attention_probs_dropout_prob, + hidden_act=hidden_act, + hidden_dropout_prob=hidden_dropout_prob, + initializer_range=initializer_range, + layer_norm_eps=layer_norm_eps, + max_position_embeddings=max_position_embeddings, + pad_token_id=pad_token_id, + classifier_dropout=classifier_dropout, + **kwargs + ) \ No newline at end of file diff 
--git a/geniml/geneformer/modeling_geneformer.py b/geniml/geneformer/modeling_geneformer.py new file mode 100644 index 00000000..512ec8dc --- /dev/null +++ b/geniml/geneformer/modeling_geneformer.py @@ -0,0 +1,36 @@ +# re-written after introspecting the original code: https://huggingface.co/ctheodoris/Geneformer/blob/main/examples/pretraining_new_model/pretrain_geneformer_w_deepspeed.py + +from transformers import BertForMaskedLM + +from .configuration_geneformer import GeneformerConfig + +class GeneformerModel(BertForMaskedLM): + """ + Geneformer Model with a masked language modeling head. + """ + config_class = GeneformerConfig + base_model_prefix = "geneformer" + _tied_weights_keys = [ + 'cls.predictions.bias', + 'cls.predictions.decoder.bias', + 'cls.predictions.decoder.weight', + 'bert.embeddings.word_embeddings.weight', + ] + + def __init__(self, config: GeneformerConfig): + super().__init__(config) + + def get_input_embeddings(self): + """ + Returns the input embeddings of the model. + """ + return self.bert.embeddings.word_embeddings + def set_input_embeddings(self, value): + """ + Sets the input embeddings of the model. + """ + self.bert.embeddings.word_embeddings = value + + def forward(self, *args, **kwargs): + outputs = super().forward(*args, **kwargs) + return outputs \ No newline at end of file diff --git a/geniml/geneformer/tokenization_geneformer.py b/geniml/geneformer/tokenization_geneformer.py new file mode 100644 index 00000000..b46e4e86 --- /dev/null +++ b/geniml/geneformer/tokenization_geneformer.py @@ -0,0 +1,684 @@ +# copied from https://huggingface.co/ctheodoris/Geneformer/raw/main/geneformer/tokenizer.py +""" +Geneformer tokenizer. + +**Input data:** + +| *Required format:* raw counts scRNAseq data without feature selection as .loom or anndata file. +| *Required row (gene) attribute:* "ensembl_id"; Ensembl ID for each gene. +| *Required col (cell) attribute:* "n_counts"; total read counts in that cell. 
+ +| *Optional col (cell) attribute:* "filter_pass"; binary indicator of whether cell should be tokenized based on user-defined filtering criteria. +| *Optional col (cell) attributes:* any other cell metadata can be passed on to the tokenized dataset as a custom attribute dictionary as shown below. + +**Usage:** + +.. code-block :: python + + >>> from geneformer import TranscriptomeTokenizer + >>> tk = TranscriptomeTokenizer({"cell_type": "cell_type", "organ_major": "organ"}, nproc=4) + >>> tk.tokenize_data("data_directory", "output_directory", "output_prefix") + +**Description:** + +| Input data is a directory with .loom or .h5ad files containing raw counts from single cell RNAseq data, including all genes detected in the transcriptome without feature selection. The input file type is specified by the argument file_format in the tokenize_data function. + +| The discussion below references the .loom file format, but the analogous labels are required for .h5ad files, just that they will be column instead of row attributes and vice versa due to the transposed format of the two file types. + +| Genes should be labeled with Ensembl IDs (loom row attribute "ensembl_id"), which provide a unique identifier for conversion to tokens. Other forms of gene annotations (e.g. gene names) can be converted to Ensembl IDs via Ensembl Biomart. Cells should be labeled with the total read count in the cell (loom column attribute "n_counts") to be used for normalization. + +| No cell metadata is required, but custom cell attributes may be passed onto the tokenized dataset by providing a dictionary of custom attributes to be added, which is formatted as loom_col_attr_name : desired_dataset_col_attr_name. 
For example, if the original .loom dataset has column attributes "cell_type" and "organ_major" and one would like to retain these attributes as labels in the tokenized dataset with the new names "cell_type" and "organ", respectively, the following custom attribute dictionary should be provided: {"cell_type": "cell_type", "organ_major": "organ"}. + +| Additionally, if the original .loom file contains a cell column attribute called "filter_pass", this column will be used as a binary indicator of whether to include these cells in the tokenized data. All cells with "1" in this attribute will be tokenized, whereas the others will be excluded. One may use this column to indicate QC filtering or other criteria for selection for inclusion in the final tokenized dataset. + +| If one's data is in other formats besides .loom or .h5ad, one can use the relevant tools (such as Anndata tools) to convert the file to a .loom or .h5ad format prior to running the transcriptome tokenizer. + +| OF NOTE: Take care that the correct token dictionary and gene median file is used for the correct model. + +| OF NOTE: For 95M model series, special_token should be True and model_input_size should be 4096. For 30M model series, special_token should be False and model_input_size should be 2048. + +""" + +from __future__ import annotations + +import logging +import os +import pickle +import warnings +from collections import Counter +from pathlib import Path +from typing import Literal + +import loompy as lp +import numpy as np +import pandas as pd +import scanpy as sc +import scipy.sparse as sp +from datasets import Dataset +from tqdm import tqdm + +warnings.filterwarnings("ignore", message=".*The 'nopython' keyword.*") # noqa +import loompy as lp # noqa + +logger = logging.getLogger(__name__) + +def rank_genes(gene_vector, gene_tokens): + """ + Rank gene expression vector. 
+ """ + # sort by median-scaled gene values + sorted_indices = np.argsort(-gene_vector) + return gene_tokens[sorted_indices] + + +def tokenize_cell(gene_vector, gene_tokens): + """ + Convert normalized gene expression vector to tokenized rank value encoding. + """ + # create array of gene vector with token indices + # mask undetected genes + nonzero_mask = np.nonzero(gene_vector)[0] + # rank by median-scaled gene values + return rank_genes(gene_vector[nonzero_mask], gene_tokens[nonzero_mask]) + + +def sum_ensembl_ids( + data_directory, + collapse_gene_ids, + gene_mapping_dict, + gene_token_dict, + custom_attr_name_dict, + file_format="loom", + chunk_size=512, +): + if file_format == "loom": + """ + Map Ensembl IDs from gene mapping dictionary. If duplicate Ensembl IDs are found, sum counts together. + """ + with lp.connect(data_directory) as data: + assert ( + "ensembl_id" in data.ra.keys() + ), "'ensembl_id' column missing from data.ra.keys()" + + assert ( + "ensembl_id_collapsed" not in data.ra.keys() + ), "'ensembl_id_collapsed' column already exists in data.ra.keys()" + + assert ( + "n_counts" in data.ca.keys() + ), "'n_counts' column missing from data.ca.keys()" + + if custom_attr_name_dict is not None: + for label in custom_attr_name_dict: + assert label in data.ca.keys(), f"Attribute `{label}` not present in dataset features" + + # Get the ensembl ids that exist in data + ensembl_ids = data.ra.ensembl_id + # Check for duplicate Ensembl IDs if collapse_gene_ids is False. 
+ # Comparing to gene_token_dict here, would not perform any mapping steps + if not collapse_gene_ids: + ensembl_id_check = [ + gene for gene in ensembl_ids if gene in gene_token_dict.keys() + ] + if len(ensembl_id_check) == len(set(ensembl_id_check)): + return data_directory + else: + raise ValueError("Error: data Ensembl IDs non-unique.") + + # Get the genes that exist in the mapping dictionary and the value of those genes + genes_in_map_dict = [gene for gene in ensembl_ids if gene in gene_mapping_dict.keys()] + vals_from_map_dict = [gene_mapping_dict.get(gene) for gene in genes_in_map_dict] + + # if the genes in the mapping dict and the value of those genes are of the same length, + # simply return the mapped values + if(len(set(genes_in_map_dict)) == len(set(vals_from_map_dict))): + mapped_vals = [gene_mapping_dict.get(gene.upper()) for gene in data.ra["ensembl_id"]] + data.ra["ensembl_id_collapsed"] = mapped_vals + return data_directory + # Genes need to be collapsed + else: + dedup_filename = data_directory.with_name( + data_directory.stem + "__dedup.loom" + ) + mapped_vals = [gene_mapping_dict.get(gene.upper()) for gene in data.ra["ensembl_id"]] + data.ra["ensembl_id_collapsed"] = mapped_vals + dup_genes = [ + idx + for idx, count in Counter(data.ra["ensembl_id_collapsed"]).items() + if count > 1 + ] + num_chunks = int(np.ceil(data.shape[1] / chunk_size)) + first_chunk = True + for _, _, view in tqdm( + data.scan(axis=1, batch_size=chunk_size), total=num_chunks + ): + + def process_chunk(view, duplic_genes): + data_count_view = pd.DataFrame( + view, index=data.ra["ensembl_id_collapsed"] + ) + unique_data_df = data_count_view.loc[ + ~data_count_view.index.isin(duplic_genes) + ] + dup_data_df = data_count_view.loc[ + data_count_view.index.isin( + [i for i in duplic_genes if "None" not in i] + ) + ] + summed_data = dup_data_df.groupby(dup_data_df.index).sum() + if not summed_data.index.is_unique: + raise ValueError( + "Error: Ensembl IDs in summed data frame 
non-unique." + ) + data_count_view = pd.concat( + [unique_data_df, summed_data], axis=0 + ) + if not data_count_view.index.is_unique: + raise ValueError( + "Error: Ensembl IDs in final data frame non-unique." + ) + return data_count_view + + processed_chunk = process_chunk(view[:, :], dup_genes) + processed_array = processed_chunk.to_numpy() + new_row_attrs = {"ensembl_id_collapsed": processed_chunk.index.to_numpy()} + + if "n_counts" not in view.ca.keys(): + total_count_view = np.sum(view[:, :], axis=0).astype(int) + view.ca["n_counts"] = total_count_view + + if first_chunk: # Create the Loom file with the first chunk + lp.create( + f"{dedup_filename}", + processed_array, + row_attrs=new_row_attrs, + col_attrs=view.ca, + ) + first_chunk = False + else: # Append subsequent chunks + with lp.connect(dedup_filename, mode="r+") as dsout: + dsout.add_columns(processed_array, col_attrs=view.ca) + return dedup_filename + + elif file_format == "h5ad": + """ + Map Ensembl IDs from gene mapping dictionary. If duplicate Ensembl IDs are found, sum counts together. + Returns adata object with deduplicated Ensembl IDs. + """ + + data = sc.read_h5ad(str(data_directory)) + + assert ( + "ensembl_id" in data.var.columns + ), "'ensembl_id' column missing from data.var" + + assert ( + "ensembl_id_collapsed" not in data.var.columns + ), "'ensembl_id_collapsed' column already exists in data.var" + assert ( + "n_counts" in data.obs.columns + ), "'n_counts' column missing from data.obs" + + if custom_attr_name_dict is not None: + for label in custom_attr_name_dict: + assert label in data.obs.columns, f"Attribute `{label}` not present in data.obs" + + + # Get the ensembl ids that exist in data + ensembl_ids = data.var.ensembl_id + # Check for duplicate Ensembl IDs if collapse_gene_ids is False. 
+ # Comparing to gene_token_dict here, would not perform any mapping steps + if not collapse_gene_ids: + ensembl_id_check = [ + gene for gene in ensembl_ids if gene in gene_token_dict.keys() + ] + if len(ensembl_id_check) == len(set(ensembl_id_check)): + return data_directory + else: + raise ValueError("Error: data Ensembl IDs non-unique.") + + # Get the genes that exist in the mapping dictionary and the value of those genes + genes_in_map_dict = [gene for gene in ensembl_ids if gene in gene_mapping_dict.keys()] + vals_from_map_dict = [gene_mapping_dict.get(gene) for gene in genes_in_map_dict] + + # if the genes in the mapping dict and the value of those genes are of the same length, + # simply return the mapped values + if(len(set(genes_in_map_dict)) == len(set(vals_from_map_dict))): + data.var["ensembl_id_collapsed"] = data.var.ensembl_id.str.upper().map(gene_mapping_dict) + return data + # Genes need to be collapsed + else: + data.var["ensembl_id_collapsed"] = data.var.ensembl_id.str.upper().map(gene_mapping_dict) + data.var_names = data.var["ensembl_id_collapsed"] + data = data[:, ~data.var.index.isna()] + dup_genes = [ + idx for idx, count in Counter(data.var_names).items() if count > 1 + ] + + num_chunks = int(np.ceil(data.shape[0] / chunk_size)) + + processed_genes = [] + for i in tqdm(range(num_chunks)): + start_idx = i * chunk_size + end_idx = min((i + 1) * chunk_size, data.shape[0]) + data_chunk = data[start_idx:end_idx, :] + + processed_chunks = [] + for dup_gene in dup_genes: + data_dup_gene = data_chunk[:, data_chunk.var_names == dup_gene] + df = pd.DataFrame.sparse.from_spmatrix( + data_dup_gene.X, + index=data_dup_gene.obs_names, + columns=data_dup_gene.var_names, + ) + df_sum = pd.DataFrame(df.sum(axis=1)) + df_sum.columns = [dup_gene] + df_sum.index = data_dup_gene.obs.index + processed_chunks.append(df_sum) + + processed_chunks = pd.concat(processed_chunks, axis=1) + processed_genes.append(processed_chunks) + processed_genes = 
pd.concat(processed_genes, axis=0) + var_df = pd.DataFrame({"ensembl_id_collapsed": processed_genes.columns}) + var_df.index = processed_genes.columns + processed_genes = sc.AnnData(X=processed_genes, obs=data.obs, var=var_df) + + data_dedup = data[:, ~data.var.index.isin(dup_genes)] # Deduplicated data + data_dedup = sc.concat([data_dedup, processed_genes], axis=1) + data_dedup.obs = data.obs + return data_dedup + + +class TranscriptomeTokenizer: + def __init__( + self, + custom_attr_name_dict=None, + nproc=1, + chunk_size=512, + model_input_size=4096, + special_token=True, + collapse_gene_ids=True, + gene_median_file=None, + token_dictionary_file=None, + gene_mapping_file=None, + ): + """ + Initialize tokenizer. + + **Parameters:** + + custom_attr_name_dict : None, dict + | Dictionary of custom attributes to be added to the dataset. + | Keys are the names of the attributes in the loom file. + | Values are the names of the attributes in the dataset. + nproc : int + | Number of processes to use for dataset mapping. + chunk_size : int = 512 + | Chunk size for anndata tokenizer. + model_input_size : int = 4096 + | Max input size of model to truncate input to. + | For the 30M model series, should be 2048. For the 95M model series, should be 4096. + special_token : bool = True + | Adds CLS token before and EOS token after rank value encoding. + | For the 30M model series, should be False. For the 95M model series, should be True. + collapse_gene_ids : bool = True + | Whether to collapse gene IDs based on gene mapping dictionary. + gene_median_file : Path + | Path to pickle file containing dictionary of non-zero median + | gene expression values across Genecorpus-30M. + token_dictionary_file : Path + | Path to pickle file containing token dictionary (Ensembl IDs:token). + gene_mapping_file : None, Path + | Path to pickle file containing dictionary for collapsing gene IDs. 
+ + """ + # dictionary of custom attributes {output dataset column name: input .loom column name} + self.custom_attr_name_dict = custom_attr_name_dict + + # number of processes for dataset mapping + self.nproc = nproc + + # chunk size for anndata tokenizer + self.chunk_size = chunk_size + + # input size for tokenization + self.model_input_size = model_input_size + + # add CLS and EOS tokens + self.special_token = special_token + + # load dictionary of gene normalization factors + # (non-zero median value of expression across Genecorpus-30M) + with open(gene_median_file, "rb") as f: + self.gene_median_dict = pickle.load(f) + + # load token dictionary (Ensembl IDs:token) + with open(token_dictionary_file, "rb") as f: + self.gene_token_dict = pickle.load(f) + + # check for special token in gene_token_dict + if self.special_token: + if ("" not in self.gene_token_dict.keys()) and ( + "" not in self.gene_token_dict.keys() + ): + logger.error( + " and required in gene_token_dict when special_token = True." + ) + raise + + if not self.special_token: + if ("" in self.gene_token_dict.keys()) and ( + "" in self.gene_token_dict.keys() + ): + logger.warning( + " and are in gene_token_dict but special_token = False. Please note that for 95M model series, special_token should be True." 
+ ) + + # if collapsing duplicate gene IDs + self.collapse_gene_ids = collapse_gene_ids + + # load gene mappings dictionary (Ensembl IDs:Ensembl ID) + if gene_mapping_file is not None: + with open(gene_mapping_file, "rb") as f: + self.gene_mapping_dict = pickle.load(f) + else: + self.gene_mapping_dict = {k: k for k, _ in self.gene_token_dict.items()} + + # gene keys for full vocabulary + self.gene_keys = list(self.gene_token_dict.keys()) + + # Filter gene mapping dict for items that exist in gene_token_dict + gene_keys_set = set(self.gene_token_dict.keys()) + self.gene_mapping_dict = { + k: v for k, v in self.gene_mapping_dict.items() if v in gene_keys_set + } + + # protein-coding and miRNA gene list dictionary for selecting .loom rows for tokenization + self.genelist_dict = dict(zip(self.gene_keys, [True] * len(self.gene_keys))) + + def tokenize_data( + self, + data_directory: Path | str, + output_directory: Path | str, + output_prefix: str, + file_format: Literal["loom", "h5ad"] = "loom", + use_generator: bool = False, + ): + """ + Tokenize .loom files in data_directory and save as tokenized .dataset in output_directory. + + **Parameters:** + + data_directory : Path + | Path to directory containing loom files or anndata files + output_directory : Path + | Path to directory where tokenized data will be saved as .dataset + output_prefix : str + | Prefix for output .dataset + file_format : str + | Format of input files. Can be "loom" or "h5ad". + use_generator : bool + | Whether to use generator or dict for tokenization. 
+ + """ + tokenized_cells, cell_metadata = self.tokenize_files( + Path(data_directory), file_format + ) + tokenized_dataset = self.create_dataset( + tokenized_cells, + cell_metadata, + use_generator=use_generator, + ) + + output_path = (Path(output_directory) / output_prefix).with_suffix(".dataset") + tokenized_dataset.save_to_disk(str(output_path)) + + def tokenize_files( + self, data_directory, file_format: Literal["loom", "h5ad"] = "loom" + ): + tokenized_cells = [] + if self.custom_attr_name_dict is not None: + cell_attr = [attr_key for attr_key in self.custom_attr_name_dict.keys()] + cell_metadata = { + attr_key: [] for attr_key in self.custom_attr_name_dict.values() + } + + # loops through directories to tokenize .loom files + file_found = 0 + # loops through directories to tokenize .loom or .h5ad files + tokenize_file_fn = ( + self.tokenize_loom if file_format == "loom" else self.tokenize_anndata + ) + for file_path in data_directory.glob(f"*.{file_format}"): + file_found = 1 + print(f"Tokenizing {file_path}") + file_tokenized_cells, file_cell_metadata = tokenize_file_fn(file_path) + tokenized_cells += file_tokenized_cells + if self.custom_attr_name_dict is not None: + for k in cell_attr: + cell_metadata[self.custom_attr_name_dict[k]] += file_cell_metadata[ + k + ] + else: + cell_metadata = None + + if file_found == 0: + logger.error( + f"No .{file_format} files found in directory {data_directory}." 
+ ) + raise + return tokenized_cells, cell_metadata + + def tokenize_anndata(self, adata_file_path, target_sum=10_000): + adata = sum_ensembl_ids( + adata_file_path, + self.collapse_gene_ids, + self.gene_mapping_dict, + self.gene_token_dict, + self.custom_attr_name_dict, + file_format="h5ad", + chunk_size=self.chunk_size, + ) + + if self.custom_attr_name_dict is not None: + file_cell_metadata = { + attr_key: [] for attr_key in self.custom_attr_name_dict.keys() + } + + coding_miRNA_loc = np.where( + [self.genelist_dict.get(i, False) for i in adata.var["ensembl_id_collapsed"]] + )[0] + norm_factor_vector = np.array( + [ + self.gene_median_dict[i] + for i in adata.var["ensembl_id_collapsed"][coding_miRNA_loc] + ] + ) + coding_miRNA_ids = adata.var["ensembl_id_collapsed"][coding_miRNA_loc] + coding_miRNA_tokens = np.array( + [self.gene_token_dict[i] for i in coding_miRNA_ids] + ) + + try: + _ = adata.obs["filter_pass"] + except KeyError: + var_exists = False + else: + var_exists = True + + if var_exists: + filter_pass_loc = np.where([i == 1 for i in adata.obs["filter_pass"]])[0] + elif not var_exists: + print( + f"{adata_file_path} has no column attribute 'filter_pass'; tokenizing all cells." 
+ ) + filter_pass_loc = np.array([i for i in range(adata.shape[0])]) + + tokenized_cells = [] + + for i in range(0, len(filter_pass_loc), self.chunk_size): + idx = filter_pass_loc[i : i + self.chunk_size] + + n_counts = adata[idx].obs["n_counts"].values[:, None] + X_view0 = adata[idx, :].X + X_view = X_view0[:, coding_miRNA_loc] + X_norm = X_view / n_counts * target_sum / norm_factor_vector + X_norm = sp.csr_matrix(X_norm) + + tokenized_cells += [ + rank_genes(X_norm[i].data, coding_miRNA_tokens[X_norm[i].indices]) + for i in range(X_norm.shape[0]) + ] + + # add custom attributes for subview to dict + if self.custom_attr_name_dict is not None: + for k in file_cell_metadata.keys(): + file_cell_metadata[k] += adata[idx].obs[k].tolist() + else: + file_cell_metadata = None + + return tokenized_cells, file_cell_metadata + + def tokenize_loom(self, loom_file_path, target_sum=10_000): + if self.custom_attr_name_dict is not None: + file_cell_metadata = { + attr_key: [] for attr_key in self.custom_attr_name_dict.keys() + } + loom_file_path_original = loom_file_path + + dedup_filename = loom_file_path.with_name(loom_file_path.stem + "__dedup.loom") + loom_file_path = sum_ensembl_ids( + loom_file_path, + self.collapse_gene_ids, + self.gene_mapping_dict, + self.gene_token_dict, + self.custom_attr_name_dict, + file_format="loom", + chunk_size=self.chunk_size, + ) + + with lp.connect(str(loom_file_path)) as data: + # define coordinates of detected protein-coding or miRNA genes and vector of their normalization factors + coding_miRNA_loc = np.where( + [self.genelist_dict.get(i, False) for i in data.ra["ensembl_id_collapsed"]] + )[0] + norm_factor_vector = np.array( + [ + self.gene_median_dict[i] + for i in data.ra["ensembl_id_collapsed"][coding_miRNA_loc] + ] + ) + coding_miRNA_ids = data.ra["ensembl_id_collapsed"][coding_miRNA_loc] + coding_miRNA_tokens = np.array( + [self.gene_token_dict[i] for i in coding_miRNA_ids] + ) + + # define coordinates of cells passing filters for 
inclusion (e.g. QC) + try: + data.ca["filter_pass"] + except AttributeError: + var_exists = False + else: + var_exists = True + + if var_exists: + filter_pass_loc = np.where([i == 1 for i in data.ca["filter_pass"]])[0] + elif not var_exists: + print( + f"{loom_file_path} has no column attribute 'filter_pass'; tokenizing all cells." + ) + filter_pass_loc = np.array([i for i in range(data.shape[1])]) + + # scan through .loom files and tokenize cells + tokenized_cells = [] + for _ix, _selection, view in data.scan( + items=filter_pass_loc, axis=1, batch_size=self.chunk_size + ): + # select subview with protein-coding and miRNA genes + subview = view.view[coding_miRNA_loc, :] + + # normalize by total counts per cell and multiply by 10,000 to allocate bits to precision + # and normalize by gene normalization factors + subview_norm_array = ( + subview[:, :] + / subview.ca.n_counts + * target_sum + / norm_factor_vector[:, None] + ) + # tokenize subview gene vectors + tokenized_cells += [ + tokenize_cell(subview_norm_array[:, i], coding_miRNA_tokens) + for i in range(subview_norm_array.shape[1]) + ] + + # add custom attributes for subview to dict + if self.custom_attr_name_dict is not None: + for k in file_cell_metadata.keys(): + file_cell_metadata[k] += subview.ca[k].tolist() + else: + file_cell_metadata = None + + if str(dedup_filename) == str(loom_file_path): + os.remove(str(dedup_filename)) + + with lp.connect(str(loom_file_path_original)) as data: + if "ensembl_id_collapsed" in data.ra.keys(): + del data.ra["ensembl_id_collapsed"] + + + return tokenized_cells, file_cell_metadata + + def create_dataset( + self, + tokenized_cells, + cell_metadata, + use_generator=False, + keep_uncropped_input_ids=False, + ): + print("Creating dataset.") + # create dict for dataset creation + dataset_dict = {"input_ids": tokenized_cells} + if self.custom_attr_name_dict is not None: + dataset_dict.update(cell_metadata) + + # create dataset + if use_generator: + + def dict_generator(): + 
for i in range(len(tokenized_cells)): + yield {k: dataset_dict[k][i] for k in dataset_dict.keys()} + + output_dataset = Dataset.from_generator(dict_generator, num_proc=self.nproc) + else: + output_dataset = Dataset.from_dict(dataset_dict) + + def format_cell_features(example): + # Store original uncropped input_ids in separate feature + if keep_uncropped_input_ids: + example["input_ids_uncropped"] = example["input_ids"] + example["length_uncropped"] = len(example["input_ids"]) + + # Truncate/Crop input_ids to input size + if self.special_token: + example["input_ids"] = example["input_ids"][ + 0 : self.model_input_size - 2 + ] # truncate to leave space for CLS and EOS token + example["input_ids"] = np.insert( + example["input_ids"], 0, self.gene_token_dict.get("") + ) + example["input_ids"] = np.insert( + example["input_ids"], + len(example["input_ids"]), + self.gene_token_dict.get(""), + ) + else: + # Truncate/Crop input_ids to input size + example["input_ids"] = example["input_ids"][0 : self.model_input_size] + example["length"] = len(example["input_ids"]) + + return example + + output_dataset_truncated = output_dataset.map( + format_cell_features, num_proc=self.nproc + ) + return output_dataset_truncated diff --git a/geniml/io/__init__.py b/geniml/io/__init__.py new file mode 100644 index 00000000..1393b48a --- /dev/null +++ b/geniml/io/__init__.py @@ -0,0 +1,3 @@ +from .io import BedSet, RegionSet, Region + +__all__ = ["Region", "BedSet", "RegionSet"] diff --git a/geniml/io/const.py b/geniml/io/const.py new file mode 100644 index 00000000..16c89b66 --- /dev/null +++ b/geniml/io/const.py @@ -0,0 +1,23 @@ +from typing import Literal + +MAF_HUGO_SYMBOL_COL_NAME = "Hugo_Symbol" +MAF_ENTREZ_GENE_ID_COL_NAME = "Entrez_Gene_Id" +MAF_CENTER_COL_NAME = "Center" +MAF_NCBI_BUILD_COL_NAME = "NCBI_Build" +MAF_CHROMOSOME_COL_NAME = "Chromosome" +MAF_START_COL_NAME = "Start_position" +MAF_END_COL_NAME = "End_position" +MAF_STRAND_COL_NAME = "Strand" + +MAF_COLUMN = Literal[ + 
"Hugo_Symbol", + "Entrez_Gene_Id", + "Center", + "NCBI_Build", + "Chromosome", + "Start_Position", + "End_Position", + "Strand", +] + +MAF_FILE_DELIM = "\t" diff --git a/geniml/io/exceptions.py b/geniml/io/exceptions.py new file mode 100644 index 00000000..9179593a --- /dev/null +++ b/geniml/io/exceptions.py @@ -0,0 +1,17 @@ +from typing import Optional + +from ..exceptions import GenimlBaseError + + +class BackedFileNotAvailableError(GenimlBaseError): + default_message = "File from url is not available in backed mode." + + def __init__(self, message: Optional[str] = None): + super().__init__(message or self.default_message) + + +class BEDFileReadError(GenimlBaseError): + default_message = "Error reading BED file." + + def __init__(self, message: Optional[str] = None): + super().__init__(message or self.default_message) diff --git a/geniml/io/io.py b/geniml/io/io.py new file mode 100644 index 00000000..4b2f99fb --- /dev/null +++ b/geniml/io/io.py @@ -0,0 +1,629 @@ +import gzip +import logging +import warnings +from typing_extensions import deprecated +import os +from hashlib import md5 +from typing import List, NoReturn, Union + +import genomicranges +import numpy as np +import pandas as pd +from iranges import IRanges +from ubiquerg import is_url +from gtars.models import RegionSet as GRegionSet + +from .const import ( + MAF_CENTER_COL_NAME, + MAF_CHROMOSOME_COL_NAME, + MAF_END_COL_NAME, + MAF_ENTREZ_GENE_ID_COL_NAME, + MAF_FILE_DELIM, + MAF_HUGO_SYMBOL_COL_NAME, + MAF_NCBI_BUILD_COL_NAME, + MAF_START_COL_NAME, + MAF_STRAND_COL_NAME, +) +from .exceptions import BackedFileNotAvailableError, BEDFileReadError +from .utils import compute_md5sum_bedset, extract_maf_col_positions, is_gzipped, read_bedset_file + +_LOGGER = logging.getLogger("bbclient") + + +@deprecated( + "This class is deprecated and will be removed in future versions. 
Use Region class from Gtars" +) +class Region: + def __init__(self, chr: str, start: int, stop: int): + """ + Instantiate a Region object. + + :param str chr: chromosome + :param int start: start position + :param int stop: stop position + """ + warnings.warn( + "Region is deprecated and will be removed in future versions.", DeprecationWarning + ) + + self.chr = chr + self.start = start + self.end = stop + + def __repr__(self): + return f"Region({self.chr}, {self.start}, {self.end})" + + +@deprecated("This class is deprecated and will be removed in future versions.") +class RegionSet: + def __init__(self, regions: Union[str, List[Region]], backed: bool = False): + """ + Instantiate a RegionSet object. This can be backed or not backed. It represents a set of genomic regions. + + If you specify `backed` as True, then the bed file will not be loaded into memory. This is useful for large + bed files. You can still iterate over the regions, but you cannot index into them. + + :param regions: path, or url to bed file or list of Region objects + :param backed: whether to load the bed file into memory or not [Default: False] + """ + warnings.warn( + "RegionSet is deprecated and will be removed in future versions. 
Use RegionSet class from Gtars", + DeprecationWarning, + ) + + self._df: Union[pd.DataFrame, None] = None + + if isinstance(regions, str): + self.backed = backed + self.regions: List[Region] = [] + self.path = regions + self.regions = None + self.is_gzipped = False + + if backed: + if is_url(regions): + raise BackedFileNotAvailableError() + # Open function depending on file type + if not is_gzipped(regions): + open_func = open + mode = "r" + else: + self.is_gzipped = True + open_func = gzip.open + mode = "rt" + + # https://stackoverflow.com/a/32607817/13175187 + try: + with open_func(self.path, mode) as file: + self.length = sum(1 for line in file if line.strip()) + except UnicodeDecodeError: + self.is_gzipped = True + with gzip.open(self.path, "rt") as file: + self.length = sum(1 for line in file if line.strip()) + + else: + if is_gzipped(regions): + df = self._read_gzipped_file(regions) + else: + df = self._read_file_pd( + regions, + sep="\t", + header=None, + engine="pyarrow", + ) + self._df = df + + _regions = [] + df.apply( + lambda row: _regions.append(Region(row[0], row[1], row[2])), + axis=1, + ) + + self.regions = _regions + self.length = len(self.regions) + + # load from list + elif isinstance(regions, list) and all([isinstance(region, Region) for region in regions]): + self.backed = False + self.path = None + self.regions = regions + self.length = len(self.regions) + else: + raise ValueError("regions must be a path to a bed file or a list of Region objects") + + self._identifier = None + + def to_pandas(self) -> Union[pd.DataFrame, None]: + if self._df is None: + seqnames, starts, ends = zip( + *[(region.chr, region.start, region.end) for region in self] + ) + return pd.DataFrame([seqnames, starts, ends]) + + return self._df + + def _read_gzipped_file(self, file_path: str) -> pd.DataFrame: + """ + Read a gzipped file into a pandas dataframe + + :param file_path: path to gzipped file + :return: pandas dataframe + """ + return self._read_file_pd( + 
file_path, + sep="\t", + compression="gzip", + header=None, + # engine="pyarrow", + ) + + def _read_file_pd(self, *args, **kwargs) -> pd.DataFrame: + """ + Read bed file into a pandas DataFrame, and skip header rows if needed + + :return: pandas dataframe + """ + max_rows = 5 + row_count = 0 + while row_count <= max_rows: + try: + df = pd.read_csv(*args, **kwargs, skiprows=row_count) + if row_count > 0: + _LOGGER.info(f"Skipped {row_count} rows while standardization. File: '{args}'") + df = df.dropna(axis=1) + for index, row in df.iterrows(): + if ( + isinstance(row[0], str) + and isinstance(row[1], int) + and isinstance(row[2], int) + ): + return df + else: + if isinstance(row[1], str): + try: + _ = int(row[1]) + df[1] = pd.to_numeric(df[1]) + except ValueError: + row_count += 1 + break + if isinstance(row[2], str): + try: + _ = int(row[2]) + df[2] = pd.to_numeric(df[2]) + except ValueError: + row_count += 1 + break + return df + except (pd.errors.ParserError, pd.errors.EmptyDataError) as _: + if row_count <= max_rows: + row_count += 1 + raise BEDFileReadError("Cannot read bed file.") + + def __len__(self): + return self.length + + def __getitem__(self, key): + if self.backed: + raise NotImplementedError("Backed RegionSets do not currently support indexing.") + else: + return self.regions[key] + + def __repr__(self): + if self.path: + if self.backed: + return f"RegionSet({self.path}, backed=True)" + else: + return f"RegionSet({self.path})" + else: + return f"RegionSet(n={self.length})" + + def __iter__(self): + if self.backed: + # Open function depending on file type + if self.is_gzipped: + open_func = gzip.open + mode = "rt" + else: + open_func = open + mode = "r" + + with open_func(self.path, mode) as f: + skipped_lines = 0 + max_skipped_lines = 5 + for line in f: + try: + chr, start, stop = line.split("\t")[:3] + except ValueError as _: + if skipped_lines < max_skipped_lines: + skipped_lines += 1 + continue + else: + raise BEDFileReadError("Could not read line 
bed file") + if skipped_lines > 0: + _LOGGER.info( + f"Skipped {skipped_lines} lines while opening file. File: '{self.path}'" + ) + yield Region(chr, int(start), int(stop)) + else: + for region in self.regions: + yield region + + @property + def identifier(self) -> str: + return self.compute_bed_identifier() + + def to_granges(self) -> genomicranges.GenomicRanges: + """ + Return GenomicRanges contained in this BED file + + :return: GenomicRanges object + """ + + seqnames, starts, ends = zip( + *[(region.chr, region.start, region.end) for region in self.regions] + ) + width_list = [] + for start, end in zip(starts, ends): + width_list.append(end - start) + ir = IRanges(start=starts, width=width_list) + + return genomicranges.GenomicRanges(seqnames, ir) + + def compute_bed_identifier(self) -> str: + """ + Return bed file identifier. If it is not set, compute one + + :return: the identifier of BED file (str) + """ + if self._identifier is not None: + return self._identifier + else: + if not self.backed: + # concate column values + chrs = ",".join([str(region.chr) for region in self.regions]) + starts = ",".join([str(region.start) for region in self.regions]) + ends = ",".join([str(region.end) for region in self.regions]) + + else: + open_func = open if not is_gzipped(self.path) else gzip.open + mode = "r" if not is_gzipped(self.path) else "rt" + with open_func(self.path, mode) as f: + # concate column values + chrs = [] + starts = [] + ends = [] + for row in f: + chrs.append(row.split("\t")[0]) + starts.append(row.split("\t")[1]) + ends.append(row.split("\t")[2].replace("\n", "")) + chrs = ",".join(chrs) + starts = ",".join(starts) + ends = ",".join(ends) + + # hash column values + chr_digest = md5(chrs.encode("utf-8")).hexdigest() + start_digest = md5(starts.encode("utf-8")).hexdigest() + end_digest = md5(ends.encode("utf-8")).hexdigest() + # hash column digests + bed_digest = md5( + ",".join([chr_digest, start_digest, end_digest]).encode("utf-8") + ).hexdigest() + + 
self._identifier = bed_digest + + return self._identifier + + +class BedSet: + """ + BedSet object + """ + + def __init__( + self, + region_sets: Union[List[RegionSet], List[str], List[List[Region]], None] = None, + file_path: str = None, + identifier: str = None, + ): + """ + :param region_sets: list of BED file paths, RegionSet, or 2-dimension list of Region [Default: None - empty BedSet] + :param file_path: path to the .txt file with identifier of all BED files in it + :param identifier: the identifier of the BED set + """ + + if isinstance(region_sets, list): + # init with a list of BED files + if all(isinstance(region_set, RegionSet) for region_set in region_sets): + self.region_sets = region_sets + # init with a list of file paths or a 2d list of Region + else: + self.region_sets = [] + for r in region_sets: + self.region_sets.append(GRegionSet(r)) + + elif file_path is not None: + if os.path.isfile(file_path): + self.region_sets = [GRegionSet(r) for r in read_bedset_file(file_path)] + else: + raise FileNotFoundError(f"The specified file '{file_path}' does not exist.") + else: + # create empty regionSet + self.region_sets = [] + + self._bedset_identifier = identifier + + def __len__(self): + return len(self.region_sets) + + def __iter__(self): + for region_set in self.region_sets: + yield region_set + + def __getitem__(self, indx: int): + return self.region_sets[indx] + + @property + def identifier(self) -> str: + return self._bedset_identifier or self.compute_bedset_identifier() + + def add(self, bedfile: RegionSet) -> NoReturn: + """ + Add a BED file to the BED set + + !Warning: if new bedfile will be added, bedSet identifier will be changed! 
+ + :param bedfile: RegionSet instance, that should be added to the bedSet + :return: NoReturn + """ + self.region_sets.append(bedfile) + + self._bedset_identifier = self.compute_bedset_identifier() + + def to_granges_list(self) -> genomicranges.GenomicRangesList: + """ + Process a list of BED set identifiers and returns a GenomicRangesList object + """ + gr_list = [] + for regionset in self.region_sets: + gr_list.append(regionset.to_granges()) + + return genomicranges.GenomicRangesList(ranges=gr_list) + + def compute_bedset_identifier(self) -> str: + """ + Return the identifier. If it is not set, compute one + + :param bedset: BedSet object + :return: the identifier of BED set + """ + if self._bedset_identifier is not None: + return self._bedset_identifier + + elif self._bedset_identifier is None: + bedfile_ids = [] + for bedfile in self.region_sets: + bedfile_ids.append(bedfile.identifier) + self._bedset_identifier = compute_md5sum_bedset(bedfile_ids) + + return self._bedset_identifier + + +class SNP: + """ + Python representation of a SNP + """ + + def __init__( + self, + hugo_symbol: str = None, + entrez_gene_id: str = None, + center: str = None, + ncbi_build: str = None, + chromosome: str = None, + start_position: int = None, + end_position: int = None, + strand: str = None, + ): + self.hugo_symbol = hugo_symbol + self.entrez_gene_id = entrez_gene_id + self.center = center + self.ncbi_build = ncbi_build + self.chromosome = chromosome + self.start_position = start_position + self.end_position = end_position + self.strand = strand + + @property + def start(self): + return self.start_position + + @property + def end(self): + return self.end_position + + @property + def chr(self): + return self.chromosome + + def to_region(self): + chr = self.chromosome + start = int(self.start_position) + end = int(self.end_position) + # bump end position by 1 if needed + if start == end: + end += 1 + + return Region(chr, start, end) + + def __len__(self): + return self.end - 
self.start + + def __repr__(self): + return f"SNP({self.chromosome}, {self.start_position}, {self.end_position}, {self.strand})" + + +class Maf: + """ + Python representation of a MAF file, only supports some columns for now + """ + + def _extract_value_from_col(self, col_name: str, line: str) -> any: + """ + Extract a value from a column in a line of a MAF file. + + :param col_name: name of column + :param line: line from MAF file + :return: value of column + """ + return line[self.col_positions[col_name]] if self.col_positions[col_name] else None + + def __init__( + self, + maf_file: str, + backed: bool = False, + bump_end_position: bool = False, + chr_rep_as_int: bool = False, + ): + """ + :param maf_file: path to maf file + :param backed: whether to load the maf file into memory or not + :param bump_end_position: whether to bump the end position by 1 or not (this is useful for interval trees and interval lists) + :param chr_rep_as_int: whether to represent the chromosome as an int or not (this is useful for interval trees and interval lists) + """ + # load from file + if isinstance(maf_file, str): + self.maf_file = maf_file + self.col_positions = extract_maf_col_positions(maf_file) + self.backed = backed + self.mafs: List[SNP] = [] + + # Open function depending on file type + open_func = gzip.open if is_gzipped(maf_file) else open + mode = "rt" if is_gzipped(maf_file) else "r" + + if backed: + self.mafs = None + # https://stackoverflow.com/a/32607817/13175187 + with open_func(self.maf_file, mode) as file: + self.length = ( + sum(1 for line in file if line.strip()) - 1 + ) # subtract 1 for header + else: + with open_func(maf_file, mode) as f: + # skip header + lines = f.readlines()[1:] + for line in lines: + # some bed files have more than 3 columns, so we just take the first 3 + line = line.strip().split(MAF_FILE_DELIM) + self.mafs.append( + SNP( + hugo_symbol=self._extract_value_from_col( + MAF_HUGO_SYMBOL_COL_NAME, line + ), + 
entrez_gene_id=self._extract_value_from_col( + MAF_ENTREZ_GENE_ID_COL_NAME, line + ), + center=self._extract_value_from_col(MAF_CENTER_COL_NAME, line), + ncbi_build=self._extract_value_from_col( + MAF_NCBI_BUILD_COL_NAME, line + ), + chromosome=self._extract_value_from_col( + MAF_CHROMOSOME_COL_NAME, line + ), + start_position=int( + self._extract_value_from_col(MAF_START_COL_NAME, line) + ), + end_position=int( + self._extract_value_from_col(MAF_END_COL_NAME, line) + ), + strand=self._extract_value_from_col(MAF_STRAND_COL_NAME, line), + ) + ) + self.length = len(self.mafs) + + # post process according to flags + for maf in self.mafs: + if bump_end_position: + maf.end_position += 1 + if not chr_rep_as_int: + maf.chromosome = "chr" + str(maf.chromosome) + else: + raise ValueError("mafs must be a path to a maf file") + + def __len__(self): + return self.length + + def __getitem__(self, key): + if self.backed: + raise NotImplementedError("Backed MAFs do not currently support indexing.") + else: + return self.mafs[key] + + def __iter__(self): + if self.backed: + # Open function depending on file type + open_func = gzip.open if is_gzipped(self.maf_file) else open + mode = "rt" if is_gzipped(self.maf_file) else "r" + + with open_func(self.maf_file, mode) as f: + # skip header + for i, line in enumerate(f): + if i == 0: + continue + line = line.strip().split(MAF_FILE_DELIM) + yield SNP( + hugo_symbol=self._extract_value_from_col(MAF_HUGO_SYMBOL_COL_NAME, line), + entrez_gene_id=self._extract_value_from_col( + MAF_ENTREZ_GENE_ID_COL_NAME, line + ), + center=self._extract_value_from_col(MAF_CENTER_COL_NAME, line), + ncbi_build=self._extract_value_from_col(MAF_NCBI_BUILD_COL_NAME, line), + chromosome=self._extract_value_from_col(MAF_CHROMOSOME_COL_NAME, line), + start_position=self._extract_value_from_col(MAF_START_COL_NAME, line), + end_position=self._extract_value_from_col(MAF_END_COL_NAME, line), + strand=self._extract_value_from_col(MAF_STRAND_COL_NAME, line), + ) + + 
else: + for maf in self.mafs: + yield maf + + def __repr__(self): + return f"MAF({self.maf_file})" + + +# TODO: This belongs somewhere else; does it even make sense? +class TokenizedRegionSet(object): + """Represents a tokenized region set""" + + def __init__(self, tokens: np.ndarray, universe: RegionSet): + self.tokens = tokens + self.universe = universe + + +# Write a class representing a collection of RegionSets +# TODO: This shouldn't read in the actual files, it should just represent the files and use lazy loading +class RegionSetCollection(object): + """Represents a collection of RegionSets""" + + def __init__(self, region_sets: List[RegionSet] = None, file_globs: List[str] = None): + if region_sets: + self.region_sets = region_sets + elif file_globs: + self.region_sets = [] + for glob in file_globs: + self.region_sets.extend([RegionSet(path) for path in glob.glob(glob)]) + + def __getitem__(self, key): + return self.region_sets[key] + + def __len__(self): + return len(self.region_sets) + + +# Do we need an EmbeddingSet class? +class EmbeddingSet(object): + """Represents embeddings and labels""" + + embeddings: np.ndarray + labels: list diff --git a/geniml/io/utils.py b/geniml/io/utils.py new file mode 100644 index 00000000..e6f504e3 --- /dev/null +++ b/geniml/io/utils.py @@ -0,0 +1,97 @@ +import gzip +import os +from hashlib import md5 +from typing import Dict, List, Union + +from .const import ( + MAF_CENTER_COL_NAME, + MAF_CHROMOSOME_COL_NAME, + MAF_COLUMN, + MAF_END_COL_NAME, + MAF_ENTREZ_GENE_ID_COL_NAME, + MAF_FILE_DELIM, + MAF_HUGO_SYMBOL_COL_NAME, + MAF_NCBI_BUILD_COL_NAME, + MAF_START_COL_NAME, + MAF_STRAND_COL_NAME, +) + + +def is_gzipped(file: str) -> bool: + """ + Check if a file is gzipped. 
+ + :param file: path to file + :return: True if file is gzipped, else False + """ + _, file_extension = os.path.splitext(file) + return file_extension == ".gz" + + +def extract_maf_col_positions(file: str) -> Dict[MAF_COLUMN, Union[int, None]]: + """ + Extract the column positions of the MAF file. + + :param file: path to .maf file + :return: dictionary of column positions + """ + + def get_index_from_header(header: List[str], col_name: str) -> Union[int, None]: + """ + Get the index of a column from a header. + + :param header: list of column names + :param col_name: column name + :return: index of column + """ + try: + return header.index(col_name) + except ValueError: + return None + + # detect open function + open_func = open if not is_gzipped(file) else gzip.open + mode = "r" if not is_gzipped(file) else "rt" + + with open_func(file, mode) as f: + header = f.readline().strip().split(MAF_FILE_DELIM) + col_positions = { + MAF_HUGO_SYMBOL_COL_NAME: get_index_from_header(header, MAF_HUGO_SYMBOL_COL_NAME), + MAF_ENTREZ_GENE_ID_COL_NAME: get_index_from_header( + header, MAF_ENTREZ_GENE_ID_COL_NAME + ), + MAF_CENTER_COL_NAME: get_index_from_header(header, MAF_CENTER_COL_NAME), + MAF_NCBI_BUILD_COL_NAME: get_index_from_header(header, MAF_NCBI_BUILD_COL_NAME), + MAF_CHROMOSOME_COL_NAME: get_index_from_header(header, MAF_CHROMOSOME_COL_NAME), + MAF_START_COL_NAME: get_index_from_header(header, MAF_START_COL_NAME), + MAF_END_COL_NAME: get_index_from_header(header, MAF_END_COL_NAME), + MAF_STRAND_COL_NAME: get_index_from_header(header, MAF_STRAND_COL_NAME), + } + return col_positions + + +def read_bedset_file(file_path: str) -> List[str]: + """ + Load a bedset from a text file + + :param file_path: path to the file + + :return: list of bed identifiers + """ + bed_identifiers = [] + + with open(file_path, "r") as f: + for line in f: + bed_identifiers.append(line.strip()) + return bed_identifiers + + +def compute_md5sum_bedset(bedset: List[str]) -> str: + """ + Compute the 
md5sum of a bedset + + :param bedset: list of bed identifiers + + :return: md5sum of the bedset + """ + return md5("".join(bedset).encode()).hexdigest() diff --git a/gitk/likelihood/README.md b/geniml/likelihood/README.md similarity index 61% rename from gitk/likelihood/README.md rename to geniml/likelihood/README.md index 880ce65c..b37548dd 100644 --- a/gitk/likelihood/README.md +++ b/geniml/likelihood/README.md @@ -8,19 +8,19 @@ ``` -gitk lh build_model --model_folder tests/consesnus/lh_model \ +geniml lh build_model --model_folder tests/consesnus/lh_model \ --file_list tests/consesnus/file_list.txt \ --coverage_folder tests/consesnus/coverage/ ``` ``` - gitk lh universe_hard --coverage_file tests/consesnus/coverage/all_core.bw \ + geniml lh universe_hard --coverage_file tests/consesnus/coverage/all_core.bw \ --fout tests/consesnus/universe/ML_hard.bed ``` ``` -gitk lh universe_flexible --model_folder test/data/lh_model \ +geniml lh universe_flexible --model_folder test/data/lh_model \ --output_file test/results/universe/ML_flexible.bed ``` \ No newline at end of file diff --git a/gitk/hmm/__init__.py b/geniml/likelihood/__init__.py similarity index 100% rename from gitk/hmm/__init__.py rename to geniml/likelihood/__init__.py diff --git a/gitk/likelihood/build_model.py b/geniml/likelihood/build_model.py similarity index 51% rename from gitk/likelihood/build_model.py rename to geniml/likelihood/build_model.py index c20e0089..8f535dea 100644 --- a/gitk/likelihood/build_model.py +++ b/geniml/likelihood/build_model.py @@ -1,43 +1,32 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -import numpy as np import os -from ..utils import timer_func -import pyBigWig import tarfile import tempfile -WINDOW_SIZE = 25 -WRONG_UNIWIG = False +import numpy as np +import pyBigWig + +from ..utils import read_chromosome_from_bw, timer_func -def model_binomial(folder_in, in_file, chrom, file_out, file_no=None, start=0): - """ "Create binomial likelihood model - First column likelihood 
of background - Second column likelihood of coverage""" +def model_binomial(folder_in, in_file, chrom, file_out, file_no=None): + """Create binomial likelihood model + first column - likelihood of background + second column - likelihood of coverage""" in_file = os.path.join(folder_in, in_file) - bw = pyBigWig.open(in_file) - chrom_size = bw.chroms(chrom) - if pyBigWig.numpy: - distr_cov = bw.values(chrom, start, chrom_size, numpy=True) - else: - distr_cov = bw.values(chrom, start, chrom_size) - distr_cov = np.array(distr_cov) - distr_cov[np.isnan(distr_cov)] = 0 - if WRONG_UNIWIG and ("cove" not in in_file): - distr_cov = np.pad(distr_cov[WINDOW_SIZE:], (0, WINDOW_SIZE)) + distr_cov = read_chromosome_from_bw(in_file, chrom) + chrom_size = len(distr_cov) no_possible = file_no * len(distr_cov) # number of possible spots covered no_cov = np.sum(distr_cov) # number of spots covered - no_ncov = np.subtract(no_possible, no_cov) # number of spots uncovered - distr_ncov = np.subtract( - file_no, distr_cov - ) # for each position in how many files is empty - cov = distr_cov / no_cov - ncov = distr_ncov / no_ncov - p_cov = np.log10(cov + 1e-10) - p_ncov = np.log10(ncov + 1e-10) - prob_array = np.vstack((p_ncov, p_cov)).T + cov_uniq = np.unique(distr_cov).astype(np.uint16) + prob_array = np.zeros((int(np.max(cov_uniq)) + 1, 2)) + for i in cov_uniq: + prob_array[i] = [ + np.log10((file_no - i) / (no_possible - no_cov) + 1e-10), + np.log10((i / no_cov) + 1e-10), + ] header = f"{chrom}_{chrom_size}" r = {header: prob_array} np.savez_compressed(file_out, **r) @@ -45,6 +34,10 @@ def model_binomial(folder_in, in_file, chrom, file_out, file_no=None, start=0): class ChromosomeModel: def __init__(self, folder, chrom): + """ + :param str folder: file with the model + :param str chrom: of which chromosome is the model + """ self.folder = folder self.chromosome = chrom self.start_file = f"{self.chromosome}_start" @@ -57,32 +50,42 @@ def __init__(self, folder, chrom): } self.models = {} - 
def make_model( - self, coverage_folder, coverage_start, coverage_end, coverage_core, file_no - ): + def __getitem__(self, item): + return self.models[item] + + def make_model(self, coverage_folder, coverage_prefix, file_no): + """ + Make a lh model of given chromosome from coverage files + :param str coverage_folder: path to name with coverage files + :param str coverage_prefix: prefixed used for making coverage files + :param int file_no: number of files from which model is being created + """ model_binomial( coverage_folder, - coverage_start, + f"{coverage_prefix}_start.bw", self.chromosome, os.path.join(self.folder, self.start_file), file_no, ) model_binomial( coverage_folder, - coverage_core, + f"{coverage_prefix}_core.bw", self.chromosome, os.path.join(self.folder, self.core_file), file_no, ) model_binomial( coverage_folder, - coverage_end, + f"{coverage_prefix}_end.bw", self.chromosome, os.path.join(self.folder, self.end_file), file_no, ) def read(self): + """ + Read model + """ model_folder = tarfile.open(self.folder, "r") for f in self.files: file = model_folder.extractfile(self.files[f]) @@ -91,6 +94,9 @@ def read(self): model_folder.close() def read_track(self, track): + """ + Read specific track from model + """ model_folder = tarfile.open(self.folder, "r") file = model_folder.extractfile(self.files[track]) values = np.load(file) @@ -99,22 +105,40 @@ def read_track(self, track): class ModelLH: - def __init__(self, folder): - self.folder = folder + def __init__(self, file): + """ + Likelihood model class + :param str file: file containing the model + """ + self.name = file self.chromosomes_list = [] self.chromosomes_models = {} - if os.path.exists(self.folder): - if tarfile.is_tarfile(self.folder): - files = tarfile.open(self.folder, "r") + if os.path.exists(self.name): + if tarfile.is_tarfile(self.name): + files = tarfile.open(self.name, "r") chroms = files.getnames() self.chromosomes_list = list(set([i.split("_")[0] for i in chroms])) - def make(self, 
coverage_folder, coverage_prefix, file_no): - tar_arch = tarfile.open(self.folder, "w") + def __getitem__(self, item): + return self.chromosomes_models[item] + + def make(self, coverage_folder, coverage_prefix, file_no, force=False): + """ + Make lh model for all chromosomes + :param str coverage_folder: folder with coverage files + :param str coverage_prefix: prefixed used for making coverage files + :param int file_no: number of file from which model is being made + :param bool force: if overwrite an existing model + """ + if os.path.exists(self.name): + if not force: + print("Model already exists. If you want to overwrite it use force argument") + return + else: + print("Overwriting existing model") + tar_arch = tarfile.open(self.name, "w") temp_dir = tempfile.TemporaryDirectory() - bw_start = pyBigWig.open( - os.path.join(coverage_folder, f"{coverage_prefix}_start.bw") - ) + bw_start = pyBigWig.open(os.path.join(coverage_folder, f"{coverage_prefix}_start.bw")) chroms = bw_start.chroms() bw_start.close() self.chromosomes_list = [i for i in chroms if chroms[i] != 0] @@ -122,9 +146,7 @@ def make(self, coverage_folder, coverage_prefix, file_no): chrom_model = ChromosomeModel(temp_dir.name, c) chrom_model.make_model( coverage_folder, - f"{coverage_prefix}_start.bw", - f"{coverage_prefix}_core.bw", - f"{coverage_prefix}_end.bw", + coverage_prefix, file_no, ) for f in chrom_model.files: @@ -137,32 +159,34 @@ def make(self, coverage_folder, coverage_prefix, file_no): tar_arch.close() def read_chrom(self, chrom): - self.chromosomes_models[chrom] = ChromosomeModel(self.folder, chrom) + """ + Read into model specific chromosome + """ + self.chromosomes_models[chrom] = ChromosomeModel(self.name, chrom) self.chromosomes_models[chrom].read() def read_chrom_track(self, chrom, track): - self.chromosomes_models[chrom] = ChromosomeModel(self.folder, chrom) + """ + Read into model specific track for chromosome + """ + self.chromosomes_models[chrom] = ChromosomeModel(self.name, 
chrom) self.chromosomes_models[chrom].read_track(track) def clear_chrom(self, chrom): + """ + Clear model for given chromosome + """ self.chromosomes_models[chrom] = None @timer_func -def main( - model_folder, - coverage_folder, - coverage_prefix, - file_no=None, -): +def main(model_file, coverage_folder, coverage_prefix, file_no=None, force=False): """ Crate likelihood models for all chromosomes - :param str model_folder: output folder + :param str model_file: output name :param str coverage_folder: folder with coverage files - :param str coverage_start: file with coverage of start without extension - :param str coverage_end: file with coverage of end without extension - :param str coverage_core: file with coverage of core without extension + :param str coverage_prefix: prefix used for making coverage files :param int file_no: number of files used for making coverage tracks """ - model = ModelLH(model_folder) - model.make(coverage_folder, coverage_prefix, file_no) + model = ModelLH(model_file) + model.make(coverage_folder, coverage_prefix, file_no, force) diff --git a/geniml/likelihood/cli.py b/geniml/likelihood/cli.py new file mode 100644 index 00000000..9f5ca2b9 --- /dev/null +++ b/geniml/likelihood/cli.py @@ -0,0 +1,30 @@ +def build_subparser(parser): + """ + Builds argument parser. 
+ """ + + parser.add_argument( + "--model-file", + help="path to file with lh model", + required=True, + type=str, + ) + parser.add_argument("--file-no", help="number of files used to make the model", type=int) + parser.add_argument( + "--coverage-folder", + help="path to coverage folder", + required=True, + type=str, + ) + parser.add_argument( + "--coverage-prefix", + help="prefix used when making coverage files", + default="all", + type=str, + ) + parser.add_argument( + "--force", + help="if overwrite existing model", + action="store_true", + ) + return parser diff --git a/geniml/models/__init__.py b/geniml/models/__init__.py new file mode 100644 index 00000000..a7dc64ed --- /dev/null +++ b/geniml/models/__init__.py @@ -0,0 +1 @@ +from .main import ExModel diff --git a/geniml/models/main.py b/geniml/models/main.py new file mode 100644 index 00000000..486ae6a1 --- /dev/null +++ b/geniml/models/main.py @@ -0,0 +1,39 @@ +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING + +import torch.nn as nn + +from ..io import RegionSet + +if TYPE_CHECKING: + from ..tokenization import Tokenizer + + +class Model(nn.Module): + """Class representing an *actual* model, that is, weights, etc""" + + pass + + +class ExModel(ABC): + """ + An Extended Model is a 3-part object consisting of a tokenizer (T), a + universe/vocabulary (U), and a model (M). The tokenizer is used to tokenize + region sets into the universe. The model is defined on the universe. + """ + + model: Model + universe: RegionSet + tokenizer: "Tokenizer" + + @abstractmethod + def __init__(self, model_path: str = None, tokenizer: "Tokenizer" = None, device: str = None): + """ + Initialize the model. If model_path is not None, load the model from + huggingface. + + :param str model_path: Path to the model on huggingface to load + :param Tokenizer tokenizer: Tokenizer to use + :param str device: Device to use (e.g. 
"cpu", "cuda:0", etc) + """ + pass diff --git a/geniml/nn/__init__.py b/geniml/nn/__init__.py new file mode 100644 index 00000000..b23a4405 --- /dev/null +++ b/geniml/nn/__init__.py @@ -0,0 +1 @@ +from .main import Attention, GradientReversal diff --git a/geniml/nn/main.py b/geniml/nn/main.py new file mode 100644 index 00000000..eeb5a278 --- /dev/null +++ b/geniml/nn/main.py @@ -0,0 +1,42 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class Attention(nn.Module): + def __init__(self, embed_dim): + super(Attention, self).__init__() + self.embed_dim = embed_dim + self.attention = nn.Linear(embed_dim, 1) + + def forward(self, x): + # x: [batch_size, seq_len, embed_dim] + attention_weights = F.softmax(self.attention(x), dim=1) + # attention_weights: [batch_size, seq_len, 1] + weighted_sum = torch.sum(x * attention_weights, dim=1) / torch.sum( + attention_weights, dim=1 + ) + # weighted_sum: [batch_size, embed_dim] + return weighted_sum + + +# https://github.com/tadeephuy/GradientReversal +class GradientReversalLayer(torch.autograd.Function): + @staticmethod + def forward(ctx, x, alpha): + ctx.alpha = alpha + return x.view_as(x) + + @staticmethod + def backward(ctx, grad_output): + output = grad_output.neg() * ctx.alpha + return output, None + + +class GradientReversal(nn.Module): + def __init__(self, alpha): + super().__init__() + self.alpha = torch.tensor(alpha, requires_grad=False) + + def forward(self, x): + return GradientReversalLayer.apply(x, self.alpha) diff --git a/geniml/region2vec/__init__.py b/geniml/region2vec/__init__.py new file mode 100644 index 00000000..22a7f78e --- /dev/null +++ b/geniml/region2vec/__init__.py @@ -0,0 +1,3 @@ +# from .main import Region2Vec, Region2VecExModel +# from .main_legacy import region2vec +# diff --git a/geniml/region2vec/cli.py b/geniml/region2vec/cli.py new file mode 100644 index 00000000..623bbd3a --- /dev/null +++ b/geniml/region2vec/cli.py @@ -0,0 +1,61 @@ +def build_subparser(parser): + 
"""Builds an argument parser to support the region2vec command line interface.""" + parser.add_argument("--token-folder", type=str, help="path to tokenized files") + parser.add_argument("--num-shuffle", type=int, help="number of shufflings/training epochs") + parser.add_argument("--embed-dim", type=int, help="embedding dimension") + parser.add_argument("--context-len", type=int, help="Context window size (half)") + parser.add_argument("--nworkers", type=int, default=10, help="number of workers") + parser.add_argument( + "--save-freq", + type=int, + default=-1, + help="Save a model after the given number of training epochs. If -1, then only save the best and latest models", + ) + parser.add_argument( + "--save-dir", + type=str, + help="path to the folder that saves the training result", + ) + parser.add_argument( + "--resume", + type=str, + default="", + help="path to a trained model. If specified, the model will be used to initialize the region2vec embeddings", + ) + parser.add_argument( + "--train-alg", + type=str, + default="cbow", + help="training algorithm, select from [cbow, skip-gram]", + ) + parser.add_argument( + "--min-count", + type=int, + default=5, + help="threshold for filtering out regions with low frequency in the internal vocabulary", + ) + parser.add_argument( + "--neg-samples", + type=int, + default=5, + help="number of noise words in negative sampling, usually between 5-20", + ) + parser.add_argument("--init-lr", type=float, default=0.1, help="initial learning rate") + parser.add_argument("--milestones", nargs="+", type=int, default=[100, 200]) + parser.add_argument( + "--lr-mode", + type=str, + default="linear", + choices=["milestone", "linear"], + help="type of learning rate scheduler, milestone or linear", + ) + parser.add_argument( + "--update-vocab", + type=str, + default="once", + help="[every] update at every epoch; [once] Update once since the vocabulary does not change", + ) + parser.add_argument("--min-lr", type=float, default=1.0e-6, 
help="minimum learning rate") + parser.add_argument("--seed", type=int, default=42, help="random seed") + + return parser diff --git a/geniml/region2vec/const.py b/geniml/region2vec/const.py new file mode 100644 index 00000000..9a60ea94 --- /dev/null +++ b/geniml/region2vec/const.py @@ -0,0 +1,26 @@ +from typing import Literal + +MODULE_NAME = "region2vec" +LR_TYPES = Literal["constant", "exponential", "step"] +POOLING_TYPES = Literal["mean", "max"] +MAX_WAIT_TIME = 10800 + +DEFAULT_EPOCHS = 100 +DEFAULT_GENSIM_EPOCHS = 1 +DEFAULT_MIN_COUNT = 10 +DEFAULT_N_SHUFFLES = 1 # 1 is sufficient for most cases +DEFAULT_WINDOW_SIZE = 5 +DEFAULT_EMBEDDING_DIM = 100 +DEFAULT_EPOCHS = 10 +DEFAULT_INIT_LR = 0.1 # https://github.com/databio/gitk/issues/6#issuecomment-1476273162 +DEFAULT_MIN_LR = 0.0001 # gensim default +DEFAULT_NS_POWER = 0.75 + +CONFIG_FILE_NAME = "config.yaml" +MODEL_FILE_NAME = "checkpoint.pt" +UNIVERSE_FILE_NAME = "universe.bed" + +POOLING_METHOD_KEY = "pooling_method" +EMBEDDING_DIM_KEY = "embedding_dim" +EMBEDDING_DIM_KEY_OLD = "embedding_size" +VOCAB_SIZE_KEY = "vocab_size" diff --git a/geniml/region2vec/experimental.py b/geniml/region2vec/experimental.py new file mode 100644 index 00000000..60d92dd8 --- /dev/null +++ b/geniml/region2vec/experimental.py @@ -0,0 +1,273 @@ +import logging +from typing import List, Tuple + +try: + import torch + import torch.nn as nn + from rich.progress import track + from torch.utils.data import Dataset +except ImportError: + raise ImportError( + "Please install Machine Learning dependencies by running 'pip install geniml[ml]'" + ) + +from .const import DEFAULT_N_SHUFFLES, DEFAULT_NS_POWER, DEFAULT_WINDOW_SIZE, MODULE_NAME +from .utils import shuffle_documents + +_LOGGER = logging.getLogger(MODULE_NAME) + + +class Region2VecDataset(Dataset): + def __init__(self, samples: List[Tuple[List[any], any]]): + self.samples = samples + + def __len__(self): + return len(self.samples) + + def __getitem__(self, idx) -> Tuple[List[any], 
any]: + # we need to return things as a tensor for proper batching + return self.samples[idx] + + +def generate_window_training_data( + data: List[List[any]], + window_size: int = DEFAULT_WINDOW_SIZE, + n_shuffles: int = DEFAULT_N_SHUFFLES, + threads: int = None, + padding_value: any = 0, + return_tensor: bool = True, +) -> List[Tuple[List[any], any]]: + """ + Generates the windowed training data by sliding across the region sets. This is for the CBOW model. + + :param List[any] data: The data to generate the training data from. + :param int window_size: The window size to use. + :param int n_shuffles: The number of shuffles to perform. + :param int threads: The number of threads to use. + :param any padding_value: The padding value to use. + :param bool return_tensor: Whether or not to return the data as a tensor. + + :return Tuple[List[List[any]], List[any]]: The contexts and targets. + """ + _LOGGER.info("Generating windowed training data.") + + # shuffle the documents + documents = shuffle_documents( + [[t for t in tokens] for tokens in data], n_shuffles=n_shuffles, threads=threads + ) + + # compute the context length (inputs) + context_len_req = 2 * window_size + # contexts = [] + # targets = [] + samples = [] + for document in track(documents, total=len(documents), description="Generating training data"): + for i, target in enumerate(document): + context = document[max(0, i - window_size) : i] + document[i + 1 : i + window_size + 1] + + # pad the context if necessary + if len(context) < context_len_req: + context = context + [padding_value] * (context_len_req - len(context)) + + # contexts.append(context) + # targets.append(target) + if return_tensor: + samples.append( + ( + torch.tensor(context, dtype=torch.long), + torch.tensor(target, dtype=torch.long), + ) + ) + else: + samples.append((context, target)) + + # return contexts, targets + return samples + + +def generate_window_training_data_wrap( + data: List[List[any]], + window_size: int = 
DEFAULT_WINDOW_SIZE, + n_shuffles: int = DEFAULT_N_SHUFFLES, + threads: int = None, + padding_value: any = 0, + return_tensor: bool = True, +) -> List[Tuple[List[any], any]]: + """ + Generates the windowed training data by sliding across the region sets. When the sliding window runs into the bounds of the list, it wraps around to the start or end of the array. + + :param List[any] data: The data to generate the training data from. + :param int window_size: The window size to use. + :param int n_shuffles: The number of shuffles to perform. + :param int threads: The number of threads to use. + :param any padding_value: The padding value to use. + :param bool return_tensor: Whether or not to return the data as a tensor. + + :return Tuple[List[List[any]], List[any]]: The contexts and targets. + """ + _LOGGER.info("Generating windowed training data.") + + # shuffle the documents + documents = shuffle_documents( + [[t for t in tokens] for tokens in data], n_shuffles=n_shuffles, threads=threads + ) + + samples = [] + + for document in track(documents, total=len(documents), description="Generating training data"): + for i in range(0, window_size): + target = document[i] + context = ( + document[i - window_size :] + document[0:i] + document[i + 1 : i + 1 + window_size] + ) + if return_tensor: + samples.append( + ( + torch.tensor(context, dtype=torch.long), + torch.tensor(target, dtype=torch.long), + ) + ) + else: + samples.append((context, target)) + for i in range(window_size, len(document) - window_size): + target = document[i] + context = document[i - window_size : i] + document[i + 1 : i + 1 + window_size] + if return_tensor: + samples.append( + ( + torch.tensor(context, dtype=torch.long), + torch.tensor(target, dtype=torch.long), + ) + ) + else: + samples.append((context, target)) + for i in range(len(document) - window_size, len(document)): + target = document[i] + context = ( + document[i - window_size : i] + + document[i + 1 :] + + document[0 : i - len(document) + 
window_size + 1] + ) + if return_tensor: + samples.append( + ( + torch.tensor(context, dtype=torch.long), + torch.tensor(target, dtype=torch.long), + ) + ) + else: + samples.append((context, target)) + + return samples + + +def generate_frequency_distribution(tokens: List[List[int]], vocab_length: int) -> torch.Tensor: + """ + Generate the frequency distribution of the tokens. + + :param List[List[int]] tokens: The tokens to generate the frequency distribution from. + """ + tokens_flat = [t for tokens in tokens for t in tokens] + + # create a tensor of all zeros with the length of the vocabulary + freq_dist = torch.zeros(vocab_length, dtype=torch.float) + + # count the number of times each token appears + for token in track( + tokens_flat, + total=len(tokens_flat), + description="Generating frequency distribution", + ): + freq_dist[token] += 1 + + # normalize the frequency distribution + freq_dist /= freq_dist.sum() + + return freq_dist + + +class NegativeSampler: + def __init__( + self, + freq_dist: torch.Tensor, + power: float = DEFAULT_NS_POWER, + batch_size: int = None, + ): + """ + Initialize the negative sampler. + + :param torch.Tensor freq_dist: List of frequencies for each token. Must be normalized. + :param float power: The power to use for the negative sampling. It is not recommended to change this. + """ + self.dist = freq_dist**power + self.dist /= self.dist.sum() + self.power = power + self.batch_size = batch_size + + def sample(self, k: int = 5, batch_size: int = None) -> torch.Tensor: + """ + Sample from the negative sampler. + + :param int k: The number of samples to draw. + """ + batch_size = batch_size or self.batch_size + if batch_size is None: + raise ValueError( + "Must provide batch_size to sample from negative sampler. This can be set in the constructor or in the sample method." 
+ ) + negative_samples = torch.multinomial(self.dist, batch_size * k, replacement=True) + return negative_samples.view(batch_size, k) + + +class NegativeSampleDataset(Dataset): + def __init__(self, samples: torch.Tensor): + self.samples = samples + + def __len__(self): + return len(self.samples) + + def __getitem__(self, idx) -> torch.Tensor: + return self.samples[idx] + + +# negative sampling loss +class NSLoss(nn.Module): + def __init__(self): + super(NSLoss, self).__init__() + + def forward( + self, + context: torch.Tensor, + negative_samples: torch.Tensor, + target: torch.Tensor, + ): + """ + :param torch.Tensor context: The context vectors. + :param torch.Tensor negative_samples: The negative sample vectors. + :param torch.Tensor target: The target vectors. + """ + # there is one target that gets mapped to each context + target_v_context = target.unsqueeze(1).expand( + context.shape[0], context.shape[1], context.shape[2] + ) + target_v_neg = target.unsqueeze(1).expand( + negative_samples.shape[0], + negative_samples.shape[1], + negative_samples.shape[2], + ) + + # target is now of shape (batch_size, num_context_vectors, embedding_size) + # negative_samples is of shape (batch_size, num_negative_samples, embedding_size) + # context is of shape (batch_size, num_context_vectors, embedding_size) + + # compute the dot product between the context and target + pos_loss = torch.sum( + torch.nn.functional.logsigmoid(torch.bmm(context, target_v_context.transpose(1, 2))) + ) + neg_loss = torch.sum( + torch.nn.functional.logsigmoid( + torch.bmm(-negative_samples, target_v_neg.transpose(1, 2)) + ) + ) + + return -(pos_loss + neg_loss) diff --git a/geniml/region2vec/main.py b/geniml/region2vec/main.py new file mode 100644 index 00000000..604ac376 --- /dev/null +++ b/geniml/region2vec/main.py @@ -0,0 +1,375 @@ +import os +from logging import getLogger +from typing import List, Union, Sequence, Optional + +import numpy as np +from torch.nn.utils.rnn import pad_sequence + +try: 
+ import torch +except ImportError: + raise ImportError( + "Please install Machine Learning dependencies by running 'pip install geniml[ml]'" + ) + +from gtars.tokenizers import Tokenizer +from gtars.models import Region as GRegion +from gtars.models import RegionSet as GRegionSet +from huggingface_hub import hf_hub_download +from rich.progress import track + +from ..io import Region, RegionSet +from ..models import ExModel +from .const import ( + CONFIG_FILE_NAME, + DEFAULT_EMBEDDING_DIM, + DEFAULT_EPOCHS, + DEFAULT_MIN_COUNT, + DEFAULT_WINDOW_SIZE, + MODEL_FILE_NAME, + POOLING_METHOD_KEY, + POOLING_TYPES, + UNIVERSE_FILE_NAME, +) +from .models import Region2Vec +from .utils import ( + Region2VecDataset, + export_region2vec_model, + load_local_region2vec_model, + train_region2vec_model, +) + +_GENSIM_LOGGER = getLogger("gensim") + +# demote gensim logger to warning +_GENSIM_LOGGER.setLevel("WARNING") + + +class Region2VecExModel(ExModel): + def __init__( + self, + model_path: str = None, + tokenizer: Tokenizer = None, + device: str = None, + pooling_method: POOLING_TYPES = "mean", + **kwargs, + ): + """ + Initialize Region2VecExModel. + + :param str model_path: Path to the pre-trained model on huggingface. + :param embedding_dim: Dimension of the embedding. + :param kwargs: Additional keyword arguments to pass to the model. + """ + super().__init__() + self.model_path: str = model_path + self.tokenizer: Tokenizer + self.trained: bool = False + self._model: Region2Vec = None + self.pooling_method = pooling_method + + if model_path is not None: + self._init_from_huggingface(model_path) + self.trained = True + + elif tokenizer is not None: + self._init_model(tokenizer, **kwargs) + + # set the device + self._target_device = torch.device( + device if device else ("cuda" if torch.cuda.is_available() else "cpu") + ) + + def _init_tokenizer(self, tokenizer: Union[Tokenizer, str]): + """ + Initialize the tokenizer. + + :param tokenizer: Tokenizer to initialize. 
+ """ + if isinstance(tokenizer, str): + if os.path.exists(tokenizer): + self.tokenizer = Tokenizer(tokenizer) + else: + raise ValueError( + f"tokenizer path {tokenizer} does not exist. Please provide a valid path." + ) + elif isinstance(tokenizer, Tokenizer): + self.tokenizer = tokenizer + else: + raise TypeError("tokenizer must be a path to a bed file or an Tokenizer object.") + + def _init_model(self, tokenizer, **kwargs): + """ + Initialize the core model. This will initialize the model from scratch. + + :param kwargs: Additional keyword arguments to pass to the model. + """ + self._init_tokenizer(tokenizer) + padding_idx = self.tokenizer.pad_token_id + self._model = Region2Vec( + len(self.tokenizer), + embedding_dim=kwargs.get("embedding_dim", DEFAULT_EMBEDDING_DIM), + padding_idx=padding_idx, + ) + + @property + def model(self): + """ + Get the core Region2Vec model. + """ + return self._model + + def add_tokenizer(self, tokenizer: Tokenizer, **kwargs): + """ + Add a tokenizer to the model. This should be use when the model + is not initialized with a tokenizer. + + :param tokenizer: Tokenizer to add to the model. + :param kwargs: Additional keyword arguments to pass to the model. + """ + if self._model is not None: + raise RuntimeError("Cannot add a tokenizer to a model that is already initialized.") + + self.tokenizer = tokenizer + if not self.trained: + self._init_model(**kwargs) + + def _load_local_model(self, model_path: str, vocab_path: str, config_path: str): + """ + Load the model from a checkpoint. + + :param str model_path: Path to the model checkpoint. + :param str vocab_path: Path to the vocabulary file. 
+ """ + tokenizer = Tokenizer(vocab_path) + + # read id of padding token from tokenizer + padding_idx = tokenizer.pad_token_id + + _model, config = load_local_region2vec_model( + model_path, config_path, padding_idx=padding_idx + ) + + self._model = _model + self.tokenizer = tokenizer + + self.trained = True + if POOLING_METHOD_KEY in config: + self.pooling_method = config[POOLING_METHOD_KEY] + + def _init_from_huggingface( + self, + model_path: str, + model_file_name: str = MODEL_FILE_NAME, + universe_file_name: str = UNIVERSE_FILE_NAME, + config_file_name: str = CONFIG_FILE_NAME, + **kwargs, + ): + """ + Initialize the model from a huggingface model. This uses the model path + to download the necessary files and then "build itself up" from those. This + includes both the actual model and the tokenizer. + + :param str model_path: Path to the pre-trained model on huggingface. + :param str model_file_name: Name of the model file. + :param str universe_file_name: Name of the universe file. + :param kwargs: Additional keyword arguments to pass to the hf download function. + """ + model_file_path = hf_hub_download(model_path, model_file_name, **kwargs) + universe_path = hf_hub_download(model_path, universe_file_name, **kwargs) + config_path = hf_hub_download(model_path, config_file_name, **kwargs) + + self._load_local_model(model_file_path, universe_path, config_path) + + @classmethod + def from_pretrained( + cls, + path_to_files: str, + model_file_name: str = MODEL_FILE_NAME, + universe_file_name: str = UNIVERSE_FILE_NAME, + config_file_name: str = CONFIG_FILE_NAME, + ) -> "Region2VecExModel": + """ + Load the model from a set of files that were exported using the export function. + + :param str path_to_files: Path to the directory containing the files. + :param str model_file_name: Name of the model file. + :param str universe_file_name: Name of the universe file. 
+ """ + model_file_path = os.path.join(path_to_files, model_file_name) + universe_file_path = os.path.join(path_to_files, universe_file_name) + config_file_path = os.path.join(path_to_files, config_file_name) + + instance = cls() + instance._load_local_model(model_file_path, universe_file_path, config_file_path) + instance.trained = True + + return instance + + def _validate_data_for_training( + self, data: Union[List[RegionSet], List[str], List[List[Region]]] + ) -> List[RegionSet]: + """ + Validate the data for training. This will return a list of RegionSets if the data is valid. + + :param Union[List[RegionSet], List[str]] data: List of data to train on. This is either + a list of RegionSets or a list of paths to bed files. + :return: List of RegionSets. + """ + if not isinstance(data, list): + raise TypeError("data must be a list or RegionSets or a list of paths to bed files.") + if len(data) == 0: + raise ValueError("data must not be empty.") + + # check if the data is a list of RegionSets + if isinstance(data[0], RegionSet): + return data + elif isinstance(data[0], str): + return [RegionSet(f) for f in data] + elif isinstance(data[0], list) and isinstance(data[0][0], Region): + return [RegionSet([r for r in region_list]) for region_list in data] + + def train( + self, + dataset: Region2VecDataset, + window_size: int = DEFAULT_WINDOW_SIZE, + epochs: int = DEFAULT_EPOCHS, + min_count: int = DEFAULT_MIN_COUNT, + num_cpus: int = 1, + seed: int = 42, + save_checkpoint_path: str = None, + gensim_params: dict = {}, + load_from_checkpoint: str = None, + ) -> bool: + """ + Train the model. + + :param dataset Region2VecDataset: Dataset to train on. + :param int window_size: Window size for the model. + :param int epochs: Number of epochs to train for. + :param int min_count: Minimum count for a region to be included in the vocabulary. + :param int num_cpus: Number of cpus to use for training. + :param int seed: Seed to use for training. 
+ :param str save_checkpoint_path: Path to save the model checkpoints to. + :param dict gensim_params: Additional parameters to pass to the gensim model. + :param str load_from_checkpoint: Path to a checkpoint to load from. + + :return np.ndarray: Loss values for each epoch. + """ + # validate a model exists + if self._model is None: + raise RuntimeError( + "Cannot train a model that has not been initialized. Please initialize the model first using a tokenizer or from a huggingface model." + ) + + gensim_model = train_region2vec_model( + dataset, + embedding_dim=self._model.embedding_dim, + window_size=window_size, + epochs=epochs, + min_count=min_count, + num_cpus=num_cpus, + seed=seed, + save_checkpoint_path=save_checkpoint_path, + gensim_params=gensim_params, + load_from_checkpoint=load_from_checkpoint, + ) + + # once done training, set the weights of the pytorch model in self._model + for id in track( + gensim_model.wv.key_to_index, + total=len(gensim_model.wv.key_to_index), + description="Setting weights.", + ): + self._model.projection.weight.data[int(id)] = torch.tensor(gensim_model.wv[id]) + + # set the model as trained + self.trained = True + + return True + + def export( + self, + path: str, + checkpoint_file: str = MODEL_FILE_NAME, + universe_file: str = UNIVERSE_FILE_NAME, + config_file: str = CONFIG_FILE_NAME, + ): + """ + Function to facilitate exporting the model in a way that can + be directly uploaded to huggingface. This exports the model + weights and the vocabulary. + + :param str path: Path to export the model to. + """ + + export_region2vec_model( + self._model, + self.tokenizer, + path, + checkpoint_file=checkpoint_file, + universe_file=universe_file, + config_file=config_file, + ) + + def encode( + self, + regions: Union[str, Region, Sequence[Region], RegionSet, GRegionSet], + pooling: POOLING_TYPES = None, + batch_size: Optional[int] = 64, # <-- new arg + ) -> np.ndarray: + """ + Vectorise one or many regions. 
+ + :param regions: Region(s) to encode. + :param pooling: "mean" or "max" token-pooling. + :param batch_size: How many regions to pad/encode at once + (None or 0 ➜ process all in one go). + """ + # ---------- input normalisation ---------- + pooling = pooling or self.pooling_method + + if isinstance(regions, Region): + regions = [regions] + elif isinstance(regions, str): + regions = RegionSet(regions) + + if not isinstance(regions[0], (Region, GRegion)): + raise TypeError("regions must be a list of Region or GRegion objects.") + if pooling not in {"mean", "max"}: + raise ValueError(f"pooling must be one of {POOLING_TYPES}") + + # tokenize + token_sets = [self.tokenizer([r]) for r in regions] + token_ids = [ts["input_ids"] for ts in token_sets] + + # ---------- batched padding / projection ---------- + pad_id = self._model.padding_idx + outputs = [] + + n = len(token_ids) + bs = n if not batch_size or batch_size <= 0 else batch_size + for start in range(0, n, bs): + chunk = token_ids[start : start + bs] + + tensors = pad_sequence( + [torch.tensor(t, dtype=torch.long) for t in chunk], + batch_first=True, + padding_value=pad_id, + ) + + reg_emb = self._model.projection(tensors) # (B, T, D) + mask = tensors.ne(self._model.projection.padding_idx).unsqueeze(-1) + + if pooling == "mean": + masked = reg_emb * mask + summed = masked.sum(dim=1) + counts = mask.sum(dim=1).clamp(min=1) + chunk_out = summed / counts + else: # pooling == "max" + reg_emb.masked_fill_(~mask, float("-inf")) + chunk_out = reg_emb.max(dim=1).values + + outputs.append(chunk_out.detach()) + + return torch.cat(outputs, dim=0).cpu().numpy() diff --git a/geniml/region2vec/main_legacy.py b/geniml/region2vec/main_legacy.py new file mode 100644 index 00000000..e9beec3d --- /dev/null +++ b/geniml/region2vec/main_legacy.py @@ -0,0 +1,165 @@ +import multiprocessing +import os +from typing import List + +import numpy as np + +from . 
import utils +from .region2vec_train import main as region2_train +from .region_shuffling import main as sent_gen + + +class Namespace: + def __init__(self, **kwargs): + self.__dict__.update(kwargs) + + +def region2vec( + token_folder: str, + save_dir: str, + file_list: List[str] = None, + data_type: str = "files", + mat_path: str = None, + num_shufflings: int = 1000, + num_processes: int = 10, + tokenization_mode: str = "hard", + embedding_dim: int = 100, + context_win_size: int = 5, + save_freq: int = -1, + resume_path: str = "", + train_alg: str = "cbow", + min_count: int = 5, + neg_samples: int = 5, + init_lr: float = 0.025, + min_lr: float = 1e-4, + lr_scheduler: str = "linear", + milestones: List[int] = [], + hier_softmax: bool = False, + seed: int = 0, + update_vocab: str = "once", +): + """Trains a Region2Vec model. + + Starts two subprocesses: one that generates shuffled datasets, and the + other consumes the shuffled datasets to train a Region2Vec model. + + Args: + token_folder (str): The path to the folder of tokenized files. + save_dir (str): The folder that stores the training results. + file_list (list[str], optional): Specifies which files from + token_folder are used for training. When None, uses all the files + in token_folder. Defaults to None. + data_type (str, optional): "files" or "matrix". Defaults to "files". + mat_path (str, optional): Used only when data_type = "matrix". Defaults + to None. + num_shufflings (int, optional): Number of shuffled datasets to + generate. Defaults to 1000. + num_processes (int, optional): Number of processes used. Defaults to 10. + tokenization_mode (str, optional): Tokenization mode. Defaults to + "hard", i.e., concatenating all regions in a BED files in a random order. + embedding_dim (int, optional): Dimension of embedding vectors. Defaults + to 100. + context_win_size (int, optional): Context window size. Defaults to 5. + save_freq (int, optional): Save frequency. Defaults to -1. 
+ resume_path (str, optional): Starts with a previously trained model. + Defaults to "". + train_alg (str, optional): Training algorithm. Defaults to "cbow". + min_count (int, optional): Minimum frequency required to keep a region. + Defaults to 5. + neg_samples (int, optional): Number of negative samples used in + training. Defaults to 5. + init_lr (float, optional): Initial learning rate. Defaults to 0.025. + min_lr (float, optional): Minimum learning rate. Defaults to 1e-4. + lr_scheduler (str, optional): Type of the learning rate scheduler. + Defaults to "linear". + milestones (list[int], optional): Used only when + lr_scheduler="milestones". Defaults to []. + hier_softmax (bool, optional): Whether to use hierarchical softmax + during training. Defaults to False. + seed (int, optional): Random seed. Defaults to 0. + update_vocab (str, optional): If "every", then updates the vocabulary + for each shuffled dataset. Defaults to "once" assuming no new + regions occur in shuffled datasets. 
+ """ + timer = utils.Timer() + start_time = timer.t() + if file_list is None: + files = os.listdir(token_folder) + else: + files = file_list + os.makedirs(save_dir, exist_ok=True) + file_list_path = os.path.join(save_dir, "file_list.txt") + utils.set_log_path(save_dir) + with open(file_list_path, "w") as f: + for file in files: + f.write(file) + f.write("\n") + + training_processes = [] + num_sent_processes = min(int(np.ceil(num_processes / 2)), 4) + nworkers = min(num_shufflings, num_sent_processes) + utils.log(f"num_sent_processes: {nworkers}") + if nworkers <= 1: + sent_gen_args = Namespace( + tokenization_folder=token_folder, + save_dir=save_dir, + file_list=file_list_path, + tokenization_mode=tokenization_mode, + pool=1, # maximum number of unused shuffled datasets generated at a time + worker_id=0, + number=num_shufflings, + ) + p = multiprocessing.Process(target=sent_gen, args=(sent_gen_args,)) + p.start() + training_processes.append(p) + else: + num_arrs = [num_shufflings // nworkers] * (nworkers - 1) + + num_arrs.append(num_shufflings - np.array(num_arrs).sum()) + sent_gen_args_arr = [] + for n in range(nworkers): + sent_gen_args = Namespace( + tokenization_folder=token_folder, + data_type=data_type, + mat_path=mat_path, + save_dir=save_dir, + file_list=file_list_path, + tokenization_mode=tokenization_mode, + pool=1, # maximum number of unused shuffled datasets generated at a time + worker_id=n, + number=num_arrs[n], + ) + sent_gen_args_arr.append(sent_gen_args) + for n in range(nworkers): + p = multiprocessing.Process(target=sent_gen, args=(sent_gen_args_arr[n],)) + p.start() + training_processes.append(p) + + num_region2vec_processes = max(num_processes - nworkers, 1) + region2vec_args = Namespace( + num_shuffle=num_shufflings, + embed_dim=embedding_dim, + context_len=context_win_size, + nworkers=num_region2vec_processes, + save_freq=save_freq, + save_dir=save_dir, + resume=resume_path, + train_alg=train_alg, + min_count=min_count, + 
neg_samples=neg_samples, + init_lr=init_lr, + min_lr=min_lr, + lr_mode=lr_scheduler, + milestones=milestones, + hier_softmax=hier_softmax, + update_vocab=update_vocab, + seed=seed, + ) + p = multiprocessing.Process(target=region2_train, args=(region2vec_args,)) + p.start() + training_processes.append(p) + for p in training_processes: + p.join() + os.remove(file_list_path) + elapsed_time = timer.t() - start_time + print(f"[Training] {utils.time_str(elapsed_time)}/{utils.time_str(timer.t())}") diff --git a/geniml/region2vec/models.py b/geniml/region2vec/models.py new file mode 100644 index 00000000..3c560fb2 --- /dev/null +++ b/geniml/region2vec/models.py @@ -0,0 +1,74 @@ +try: + import torch + import torch.nn as nn +except ImportError: + raise ImportError( + "Please install Machine Learning dependencies by running 'pip install geniml[ml]'" + ) + +from .const import DEFAULT_EMBEDDING_DIM, POOLING_TYPES + + +class Word2Vec(nn.Module): + """ + Word2Vec model. + """ + + def __init__(self, vocab_size: int, embedding_dim: int = DEFAULT_EMBEDDING_DIM, **kwargs): + super().__init__() + self.vocab_size = vocab_size + self.embedding_dim = embedding_dim + self.padding_idx = kwargs.get("padding_idx", None) + self.projection = nn.Embedding(vocab_size, embedding_dim, padding_idx=self.padding_idx) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.projection(x) + return x + + +class Region2Vec(Word2Vec): + def __init__(self, vocab_size: int, embedding_dim: int = DEFAULT_EMBEDDING_DIM, **kwargs): + super().__init__(vocab_size, embedding_dim, **kwargs) + + +class MeanPooling(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return torch.mean(x, dim=1) + + +class MaxPooling(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return torch.max(x, dim=1) + + +class RegionSet2Vec(nn.Module): + def __init__(self, region2vec: Region2Vec, pooling: 
POOLING_TYPES = "mean"): + """ + Initialize the RegionSet2Vec. RegionSet2Vec is a wrapper around the Region2Vec model that allows + pooling over a set of regions. This is useful for classification tasks where the input is a set of + regions, such as classifying a cell type based on the set of regions that are accessible. + + :param Union[Region2Vec, str] region2vec: Either a Region2Vec instance or a path to a huggingface model. + :param POOLING_TYPES pooling: The pooling type to use. Either "mean" or "max". + """ + super().__init__() + + self.region2vec: Region2Vec = region2vec + # assign the pooling layer based on mean or max + if pooling == "mean": + self.pooling = MeanPooling() + elif pooling == "max": + self.pooling = MaxPooling() + else: + raise ValueError(f"Invalid pooling type {pooling} passed.") + + def forward(self, x) -> torch.Tensor: + x = self.region2vec(x) + x = self.pooling(x) + return x diff --git a/geniml/region2vec/pooling.py b/geniml/region2vec/pooling.py new file mode 100644 index 00000000..41d228cd --- /dev/null +++ b/geniml/region2vec/pooling.py @@ -0,0 +1,37 @@ +from typing import List, Union + +import numpy as np + + +def mean_pooling(region_vectors: List[Union[np.ndarray, None]]) -> np.ndarray: + """ + Mean pooling of region vectors. + + The function first filters out None values and then computes the mean of the remaining vectors. + + :param region_vectors | None: region vectors + :return: mean pooled vector + """ + region_vectors = [rv for rv in region_vectors if rv is not None] + if len(region_vectors) == 0: + return None + + region_vectors = np.array(region_vectors) + return np.mean(region_vectors, axis=0) + + +def max_pooling(region_vectors: List[Union[np.ndarray, None]]) -> np.ndarray: + """ + Max pooling of region vectors. + + The function first filters out None values and then computes the max of the remaining vectors. 
+ + :param region_vectors | None: region vectors + :return: max pooled vector + """ + region_vectors = [rv for rv in region_vectors if rv is not None] + if len(region_vectors) == 0: + return None + + region_vectors = np.array(region_vectors) + return np.max(region_vectors, axis=0) diff --git a/gitk/region2vec/region2vec_train.py b/geniml/region2vec/region2vec_train.py similarity index 55% rename from gitk/region2vec/region2vec_train.py rename to geniml/region2vec/region2vec_train.py index efc7441d..0b3f8712 100644 --- a/gitk/region2vec/region2vec_train.py +++ b/geniml/region2vec/region2vec_train.py @@ -1,39 +1,60 @@ -import os -import glob -import time -import datetime import argparse +import datetime +import glob +import logging +import os import pickle +import random +import time +from typing import Union + from gensim.models import Word2Vec from gensim.models.word2vec import LineSentence -from gitk.region2vec import utils -import logging -import random -logging.basicConfig( - format="%(asctime)s : %(levelname)s : %(message)s", level=logging.ERROR -) +from . import utils +from .const import * + + +def find_dataset(data_folder: str) -> Union[str, int]: + """Finds an available dataset in data_folder. + + Finds an available dataset from a folder specified by data_folder. + Args: + data_folder (str): The path to the folder where datasets are generated. -def find_dataset(data_folder): + Returns: + Union[str, int]: -1 when no datasets are found after MAX_WAIT_TIME; + otherwise, a path to the found dataset. 
+ """ train_pattern = os.path.join(data_folder, "pool*[0-9]") + count = 0 while True: dsets = glob.glob(train_pattern) if len(dsets) == 0: print("No available dataset, waiting for generation...", end="\r") time.sleep(1) + count += 1 + if count == MAX_WAIT_TIME: + print("Wait time exceeds MAX_WAIT_TIME, exit") + return -1 else: return dsets[random.randint(0, len(dsets) - 1)] -def main(args): +def main(args: argparse.Namespace) -> None: + """Trains a Region2Vec model using the arguments in args. + + Called internally as a subprocess by region2vec in main.py. + + Args: + args (argparse.Namespace): See the definition of ArgumentParser. + """ save_dir = args.save_dir data_folder = os.path.join( save_dir, "shuffled_datasets" ) # shuffled datasets are stored in the shuffled_datasets folder - model_dir = os.path.join( - save_dir, "models" - ) # model snapshots are stored in the models folder + model_dir = os.path.join(save_dir, "models") # model snapshots are stored in the models folder os.makedirs(save_dir, exist_ok=True) os.makedirs(model_dir, exist_ok=True) utils.set_log_path(save_dir) # specify the path to the log file @@ -50,14 +71,9 @@ def main(args): msg_model += "hierarchical softmax\033[00m" else: hs = 0 - msg_model += "negative sampling with {} negative samples\033[00m".format( - args.neg_samples - ) + msg_model += f"negative sampling with {args.neg_samples} negative samples\033[00m" if not os.path.exists(args.resume): vocab_update = False - # model = Word2Vec(size=args.embed_dim, alpha=args.init_lr, window=args.context_len, min_count=args.min_count, - # seed=args.seed, workers=args.nworkers, sg=train_alg, negative=args.neg_samples, hs=hs) - # lasest model = Word2Vec( vector_size=args.embed_dim, alpha=args.init_lr, @@ -72,9 +88,7 @@ def main(args): utils.log(msg_model) else: utils.log( - "\033[91mResuming {}, make sure the model configurations are consistent\033[00m".format( - args.resume - ) + f"\033[91mResuming {args.resume}, make sure the model configurations 
are consistent\033[00m" ) model = Word2Vec.load(args.resume) vocab_update = True @@ -87,30 +101,34 @@ def main(args): lr_info = {"freq": 1} lr_scheduler = utils.lr_scheduler( - args.init_lr, args.min_lr, args.num_shuffle, lr_info=lr_info, mode=args.lr_mode + args.init_lr, + args.min_lr, + args.num_shuffle, + lr_info=lr_info, + mode=args.lr_mode, ) run_timer = utils.Timer() - utils.log("[{}] Start training".format(datetime.datetime.now().strftime("%x-%X"))) - utils.log( - "[{}] Building vocabulary".format(datetime.datetime.now().strftime("%x-%X")) - ) + cur_time = datetime.datetime.now().strftime("%x-%X") + utils.log(f"[{cur_time}] Start training") + utils.log(f"[{cur_time}] Building vocabulary") dset = find_dataset(data_folder) + if dset == -1: + return sentences = LineSentence(dset) # create sentence iterator model.build_vocab(sentences, update=vocab_update) # prepare the model vocabulary - utils.log( - "[{}]\033[93m Vocabulary size is {}\033[00m".format( - datetime.datetime.now().strftime("%x-%X"), len(model.wv.index_to_key) - ) - ) + cur_time = datetime.datetime.now().strftime("%x-%X") + utils.log(f"[{cur_time}] Vocabulary size is {len(model.wv.index_to_key)}") build_vocab_time = run_timer.t() min_loss = 1.0e100 # start training for sidx in range(args.num_shuffle): epoch_timer = utils.Timer() - msg = "[Shuffling {:>4d}] ".format(sidx + 1) + msg = f"[Shuffling {sidx + 1:>4d}] " dset = find_dataset(data_folder) + if dset == -1: + return dname = dset.split("/")[-1] dst_name = os.path.join(data_folder, dname + "using") os.rename(dset, dst_name) # change to file name to pool%dusing @@ -133,77 +151,77 @@ def main(args): if loss < min_loss: min_loss = loss - model.save(os.path.join(model_dir, "word2vec_best.pt")) - model.save(os.path.join(model_dir, "word2vec_latest.pt")) + model.save(os.path.join(model_dir, "region2vec_best.pt")) + model.save(os.path.join(model_dir, "region2vec_latest.pt")) if args.save_freq > 0 and (sidx + 1) % args.save_freq == 0: - 
model.save(os.path.join(model_dir, "word2vec_{}.pt".format(sidx + 1))) + model.save(os.path.join(model_dir, f"region2vec_{sidx + 1}.pt")) est_time = (run_timer.t() - build_vocab_time) / ( sidx + 1 ) * args.num_shuffle + build_vocab_time - msg += "loss {:>12.4f} lr {:>5.4f} vocab_size {:>12d} ({}/{})".format( - loss, - lr_scheduler.lr, - len(model.wv.index_to_key), - utils.time_str(epoch_timer.t()), - utils.time_str(est_time), - ) + msg += f"loss {loss:>12.4f} lr {lr_scheduler.lr:>5.4f} vocab_size {len(model.wv.index_to_key):>12d} ({utils.time_str(epoch_timer.t())}/{utils.time_str(est_time)})" utils.log(msg) lr_scheduler.step() with open(os.path.join(model_dir, "loss_all.pickle"), "wb") as f: pickle.dump(loss_all, f) - elasped_time = run_timer.t() - utils.log( - "[{}] Training finished, training Time {}".format( - datetime.datetime.now().strftime("%x-%X"), utils.time_str(elasped_time) - ) - ) + elapsed_time = run_timer.t() + cur_time = datetime.datetime.now().strftime("%x-%X") + utils.log(f"[{cur_time}] Training finished, training Time {utils.time_str(elapsed_time)}") # remove intermediate datasets - os.system("rm -rf {}".format(data_folder)) # remove the generated shuffled datasets + os.system(f"rm -rf {data_folder}") # remove the generated shuffled datasets if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Gene Embedding") - parser.add_argument("--num_shuffle", type=int, help="number of shuffled datasets") - parser.add_argument("--embed_dim", type=int, help="embedding dimension") - parser.add_argument("--context_len", type=int, help="window size") - parser.add_argument("--nworkers", type=int, help="number of workers") - parser.add_argument("--save_freq", type=int, default=0, help="save frequency") - parser.add_argument("--save_dir", help="path to save the training result") - parser.add_argument("--resume", help="path to a saved model") + parser = argparse.ArgumentParser(description="Region2Vec training") + parser.add_argument( + 
"--num-shuffle", type=int, help="number of shuffled datasets used for training" + ) + parser.add_argument("--embed-dim", type=int, help="dimension of embedding vectors") + parser.add_argument("--context-len", type=int, help="length of the moving context window") parser.add_argument( - "--train_alg", help="training algorithm, select from [cbow, skip-gram]" + "--nworkers", type=int, help="number of parallel processes used in training" ) parser.add_argument( - "--min_count", type=int, help="threshold of pruning the internal vocabulary" + "--save-freq", + type=int, + default=0, + help="save the Region2Vec model after consuming save_freq datasets; use a nonpositive number to disable", + ) + parser.add_argument("--save-dir", help="the folder that saves the Region2Vec model") + parser.add_argument("--resume", help="path to a previously trained model") + parser.add_argument("--train-alg", help="training algorithm, select from [cbow, skip-gram]") + parser.add_argument( + "--min-count", + type=int, + help="threshold of pruning the internal vocabulary", ) parser.add_argument( - "--neg_samples", + "--neg-samples", type=int, help="number of noise words in negative sampling, usually between 5-20", ) parser.add_argument( - "--hier_softmax", + "--hier-softmax", default=False, action="store_true", help="if given, hierarchical softmax will be used", ) - parser.add_argument("--init_lr", type=float, help="initial learning rate") + parser.add_argument("--init-lr", type=float, help="initial learning rate") parser.add_argument("--milestones", nargs="+", type=int, default=[100, 200]) parser.add_argument( - "--lr_mode", + "--lr-mode", type=str, choices=["milestone", "linear"], - help="milestone or linear", + help="type of learning rate scheduler, milestone or linear", ) parser.add_argument( - "--update_vocab", + "--update-vocab", type=str, default="once", help="[every] update at every epoch; [once] Update once since the vocabulary does not change", ) - parser.add_argument("--min_lr", 
type=float, help="minimum learning rate") + parser.add_argument("--min-lr", type=float, help="minimum learning rate") parser.add_argument("--seed", type=int, help="random seed") args = parser.parse_args() main(args) diff --git a/geniml/region2vec/region_shuffling.py b/geniml/region2vec/region_shuffling.py new file mode 100644 index 00000000..d38dace7 --- /dev/null +++ b/geniml/region2vec/region_shuffling.py @@ -0,0 +1,246 @@ +import argparse +import glob +import os +import pickle +import random +import time +from typing import List + +import numpy as np +from geniml.region2vec import utils + + +class BEDDataset: + """Wraps a set of BED files in a BEDDataset object. + + Stores the information of a set of BED files. + Generates a new dataset with regions shuffled in BED files. + + Attributes: + filename_list: A list of BED file names. + nfiles: The number of BED files in the BEDdataset object. + """ + + def __init__(self, file_list: str) -> None: + """Initializes a BEDDataset object. + + Args: + file_list (str): A file storing a list of BED file names that + should be included in the dataset. + """ + self.filename_list = [] + with open(file_list, "r") as f: + for idx, line in enumerate(f): + filename = line.strip() + self.filename_list.append(filename) + + self.nfiles = len(self.filename_list) + + def regions2sentences_sampling(self, src_path: str, dst_path: str) -> None: + """Constructs a sentence by sampling regions from a BED file. + + Each region in a BED file has a probability. Constructs a sentence by + sampling regions in a BED file based on their probabilities. + + Args: + src_path (str): The folder where BED files reside. + dst_path (str): The destination file that stores all the generated + BED files; each line has regions sampled from a BED file. 
+ """ + with open(dst_fname, "w") as fout: + for fname in self.filename_list: + src_fname = os.path.join(src_path, fname) + sentence = [] + probs = [] + with open(src_fname, "r") as f: + for line in f: + elements = line.strip().split("\t") + word = elements[0].strip() + sentence.append(word) + probs.append(float(elements[-2].strip())) + probs = np.array(probs) + probs = probs / probs.sum() + sentence = np.array(sentence) + + sampled_sentence = np.random.choice(sentence, len(probs), p=probs) + # sampled_sentence = list(set(sampled_sentence)) + sampled_sentence = sampled_sentence.tolist() + str_sent = " ".join(sampled_sentence) + + fout.write(str_sent) + fout.write("\n") + + def regions2sentences(self, src_path: str, dst_path: str) -> None: + """Concatenates all regions in a BED file randomly into a sentence. + + This functions is called in the hard tokenization mode. + + Args: + src_path (str): The folder where BED files reside. + dst_path (str): The destination file that stores all the generated + BED files; each line has all the regions from a BED file. + """ + with open(dst_path, "w") as f_out: + for fname in self.filename_list: + src_fname = os.path.join(src_path, fname) + sentence = [] + with open(src_fname, "r") as f: + for line in f: + elements = line.strip().split("\t")[0:3] + chr_name = elements[0].strip() + start = elements[1].strip() + end = elements[2].strip() + word = chr_name + ":" + start + "-" + end + sentence.append(word) + random.shuffle(sentence) # shuffle the regions in the sentence + str_sent = " ".join(sentence) + f_out.write(str_sent) + f_out.write("\n") + + +class MatrixDataset: + """Wraps the binary representation of BED files into a MatrixDataset. + + Stores the information of a set of BED files. + Generates a new dataset with regions shuffled in BED files. + """ + + def __init__(self, matrix: List[List[int]]): + """Initializes a MatrixDataset object with matrix. + + Args: + matrix (list[list[int]]): The binary representation of BED files. 
+ Each row represents a BED file. Each column denotes a region. + Each element denotes the presence (1) or absence (0) of a + region in a BED file. + """ + self.mat = [[] for i in range(len(matrix))] + for i in range(len(matrix)): + for j in range(len(matrix[i])): + if matrix[i][j] != 0: + self.mat[i].append(j) + + def regions2sentences(self, dst_path: str) -> None: + """Concatenates all regions in a BED file randomly into a sentence. + + This functions is called in the hard tokenization mode. + + Args: + dst_path (str): The destination file that stores all the generated + BED files; each line has all the regions from a BED file. + """ + with open(dst_path, "w") as f_out: + for i in range(len(self.mat)): + sentence = [] + for j in range(len(self.mat[i])): + sentence.append(self.mat[i][j]) + if len(sentence) > 0: + random.shuffle(sentence) # shuffle the regions in the sentence + str_sent = " ".join(sentence) + f_out.write(str_sent) + f_out.write("\n") + + +def main(args: argparse.Namespace) -> None: + """Generates shuffled datasets. + + Called internally as a subprocess by region2vec in main.py. + + Args: + args (argparse.Namespace): See the definition of ArgumentParser. 
+ """ + DATA_FOLDER = os.path.join(args.save_dir, "shuffled_datasets") + os.makedirs(DATA_FOLDER, exist_ok=True) + src_path = args.tokenization_folder + worker_id = args.worker_id + random.seed(worker_id) + np.random.seed(worker_id) + if args.data_type == "files": + dataset = BEDDataset(args.file_list) + else: + with open(args.mat_path, "rb") as f: + matrix = pickle.load(f) + dataset = MatrixDataset(matrix) + pool = args.pool + utils.log(f"[{worker_id}] Creating shuffled datasets in \033[93m{DATA_FOLDER}\033[00m") + + for i in range(pool): + name_used = os.path.join(DATA_FOLDER, f"pool{worker_id}-{i}used") + name_using = os.path.join(DATA_FOLDER, f"pool{worker_id}-{i}using") + name_creating = os.path.join(DATA_FOLDER, f"pool{worker_id}-{i}creating") + name = os.path.join(DATA_FOLDER, f"pool{worker_id}-{i}") + if os.path.exists(name_using): + print("File exists") + return + if os.path.exists(name_used): + print("File exists") + return + if os.path.exists(name): + print("File exists") + return + if os.path.exists(name_creating): + print("File exists") + return + # create an empty file + with open(name_used, "w") as f: + pass + + num_created = 0 + while True: + if num_created == args.number: + break + # determine whether to create a new dataset + files = glob.glob(os.path.join(DATA_FOLDER, f"pool{worker_id}*used")) + if len(files) == 0: + time.sleep(1) # wait for 10 seconds + # print('Waiting for the data to be consumed',end="\r") + else: + # delete the used dataset and generate a new dataset in the same folder + sel_file = files[random.randint(0, len(files) - 1)] + fname = sel_file.split("/")[-1][:-4] + os.system(f"rm -f {sel_file}") # delete the dataset + dpath = os.path.join(DATA_FOLDER, fname + "creating") + with open(dpath, "w") as f: + pass + if args.tokenization_mode == "hard": + dataset.regions2sentences(src_path, dpath) + else: + dataset.regions2sentences_sampling(src_path, dpath) + + num_created += 1 + # print('[',datetime.datetime.now(),']',' Created %dth 
dataset' % num_created) + dst_name = os.path.join(DATA_FOLDER, fname) + os.rename(dpath, dst_name) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Sentence Generation") + parser.add_argument("--file-list", help="path to a file list") + parser.add_argument("--tokenization-mode", help="tokenization mode") + parser.add_argument( + "--tokenization-folder", + help="path to the folder that saves tokenized regions", + ) + parser.add_argument("--save-dir", help="parent folder to generated shuffled datasets") + parser.add_argument( + "--pool", + type=int, + default=3, + help="maximum number of shuffled datasets before consuming one", + ) + parser.add_argument( + "--worker-id", + type=int, + default=0, + help="used in the parallel mode", + ) + parser.add_argument( + "--number", + type=int, + default=1000, + help="number of shuffling the whole dataset", + ) + + args = parser.parse_args() + + main(args) diff --git a/geniml/region2vec/utils.py b/geniml/region2vec/utils.py new file mode 100644 index 00000000..a3e0dc02 --- /dev/null +++ b/geniml/region2vec/utils.py @@ -0,0 +1,538 @@ +import glob +import logging +import os +import random +import select +import shutil +import sys +import time + +from concurrent.futures import ThreadPoolExecutor +from typing import TYPE_CHECKING, Dict, List, Tuple, Union + +import numpy as np +import pyarrow as pa +import pyarrow.parquet as pq + +try: + import torch +except ImportError: + raise ImportError( + "Please install Machine Learning dependencies by running 'pip install geniml[ml]'" + ) + +from gtars.utils import read_tokens_from_gtok +from gtars.tokenizers import Tokenizer +from yaml import safe_dump, safe_load + +if TYPE_CHECKING: + from gensim.models import Word2Vec as GensimWord2Vec + +from ..const import GTOK_EXT +from .const import ( + CONFIG_FILE_NAME, + DEFAULT_EMBEDDING_DIM, + DEFAULT_EPOCHS, + DEFAULT_INIT_LR, + DEFAULT_MIN_COUNT, + DEFAULT_MIN_LR, + DEFAULT_WINDOW_SIZE, + EMBEDDING_DIM_KEY, + 
EMBEDDING_DIM_KEY_OLD, + LR_TYPES, + MODEL_FILE_NAME, + MODULE_NAME, + UNIVERSE_FILE_NAME, + VOCAB_SIZE_KEY, +) +from .models import Region2Vec + +_LOGGER = logging.getLogger(MODULE_NAME) + +_log_path = None + + +def set_log_path(path): + global _log_path + _log_path = path + + +class Timer: + """Records the running time. + + Uses Timer.s() or Timer() to record the start time. Then, calls Timer.t() to get the + elapsed time in seconds. + """ + + def __init__(self): + """Initializes a Timer object and starts the timer.""" + self.v = time.time() + + def s(self): + """Restarts the timer.""" + self.v = time.time() + + def t(self): + """Gives the elapsed time. + + Returns: + float: The elapsed time in seconds. + """ + return time.time() - self.v + + +def time_str(t: float) -> str: + """Converts time in float to a readable format. + + Converts time in float to hours, minutes, or seconds based on the value of + t. + + Args: + t (float): Time in seconds. + + Returns: + str: Time in readable time. + """ + if t >= 3600: + return f"{t / 3600:.2f}h" + if t >= 60: + return f"{t / 60:.2f}m" + return f"{t:.2f}s" + + +def timed_response(prompt: str, wait_time: int, default: str): + """Prints prompt and waits for response. + + Args: + prompt (str): The question asks for a response. + wait_time (int): The number of seconds for waiting. + default (str): If no response received, uses default as the response. + + Returns: + str: a response given by the user or the default one. + """ + print(prompt, end="", flush=True) + i, o, e = select.select([sys.stdin], [], [], wait_time) + if i: + ans = sys.stdin.readline().strip() + if ans not in ["y", "n"]: + print(f"\033[91m{default}\033[00m") + return default + else: + return ans + else: + print(f"\033[91m{default}\033[00m") + return default + + +def log(obj: str, filename: str = "log.txt") -> None: + """Adds information in obj to a file specified by filename. + + Adds information in obj to a file (default: log.txt) and prints obj. 
+ + Args: + obj (str): A string. + filename (str, optional): The log file name. Defaults to "log.txt". + """ + print(obj) + if _log_path is not None: + with open(os.path.join(_log_path, filename), "a") as f: + f.write(obj) + f.write("\n") + + +class lr_scheduler: + """Changes the learning rate. + + Changes the learning rate using the mode of linear or milestones. + If mode = "linear", then the learning rate linearly decreases after certain + epochs. + If mode = "milestones", then the learning rate decreases at specified + epochs. + """ + + def __init__( + self, + init_lr: float, + end_lr: float, + epochs: int, + lr_info: Dict[str, Union[int, float, list]], + mode: str = "linear", + ): + """Initializes the learning rate scheduler. + + Args: + init_lr (float): The initial learning rate. + end_lr (float): The last learning rate. + epochs (int): The number of training epochs. + lr_info (dict[str,Union[int,list]]): Dictionary storing information + for learning rate scheduling. + mode (str, optional): The mode of learning rate scheduling. + Defaults to "linear". + """ + self.lr = init_lr + self.end_lr = end_lr + self.init_lr = init_lr + self.mode = mode + self.epochs = epochs + self.lr_info = lr_info + self.count = 0 + if mode == "linear": + self.freq = lr_info["freq"] + + def step(self): + """Updates the learning rate. + + Returns: + float: Current learning rate. + """ + self.count += 1 + if self.mode == "linear": + if self.count % self.freq == 0: + self.lr = self.init_lr - (self.init_lr - self.end_lr) / self.epochs * self.count + elif self.mode == "milestone": + milestones = np.array(self.lr_info["milestones"]) + power = (milestones <= self.count).sum() + self.lr = self.init_lr * np.power(self.lr_info["ratio"], float(power)) + if self.lr < self.end_lr: + self.lr = self.end_lr + return self.lr + + +def ensure_dir(folder: str, default: str = "y") -> None: + """Makes sure the folder exists. + + Makes sure the folder exists. 
If the folder exists, then asks the user to + keep [n] or delete [y] it. If no response received after 5 secs, then + deletes the folder and create a new one. + + Args: + folder (str): The folder to be created. + default (str, optional): Choose whether to delete [y] or keep [n] the + folder. Defaults to y. + """ + if os.path.exists(folder): + if default == "y": + prompt = f"\033[91m{folder} exists,remove?([y]/n):\033[00m " + else: + prompt = f"\033[91m{folder} exists,remove?(y/[n]):\033[00m " + ans = timed_response(prompt, 5, default) + if ans != "n": + shutil.rmtree(folder) + else: + return + os.makedirs(folder, exist_ok=True) + + +class LearningRateScheduler: + """ + Simple class to track learning rates of the training procedure + + Based off of: https://machinelearningmastery.com/using-learning-rate-schedules-deep-learning-models-python-keras/ + """ + + def __init__( + self, + init_lr: float = DEFAULT_INIT_LR, + min_lr: float = DEFAULT_MIN_LR, + type: LR_TYPES = "exponential", + decay: float = None, + n_epochs: int = None, + ): + """ + :param float init_lr: The initial learning rate + :param float min_lr: The minimum learning rate + :param str type: The type of learning rate schedule to use. Must be one of ['linear', 'exponential']. + :param float decay: The decay rate to use. If None, this will be calculated from init_lr and n_epochs. + :param int n_epochs: The number of epochs to train for. Only used if decay is None. + """ + self.init_lr = init_lr + self.min_lr = min_lr + self.n_epochs = n_epochs + + # convert type to learning rate if necessary + if type not in ["constant", "linear", "exponential"]: + raise ValueError( + f"Unknown schedule type: {type}. Must be one of ['constant', 'linear', 'exponential']." + ) + + self.type = type + + # init the current lr and iteration + self._current_lr = init_lr + self._iter = 1 + + # init decay rate + if decay is None: + _LOGGER.warning( + "No decay rate provided. Calculating decay rate from init_lr and n_epochs." 
+ ) + self.decay = init_lr / n_epochs + else: + self.decay = decay + + def _update_linear(self, epoch: int): + """ + Update the learning rate using a linear schedule. + + :param int epoch: The current epoch + """ + + lr = self.init_lr - (self.decay * epoch) + return max(lr, self.min_lr) + + def _update_exponential(self, epoch: int): + """ + Update the learning rate using an exponential schedule. + + :param int epoch: The current epoch + """ + lr = self.get_lr() * (1 / (1 + self.decay * epoch)) + return max(lr, self.min_lr) + + def update(self): + # update the learning rate according to the type + if self.type == "linear": + self._current_lr = self._update_linear(self._iter) + self._iter += 1 + elif self.type == "exponential": + self._current_lr = self._update_exponential(self._iter) + self._iter += 1 + elif self.type == "constant": + pass # do nothing + else: + raise ValueError(f"Unknown schedule type: {self.type}") + + def get_lr(self): + return self._current_lr + + +def shuffle_documents( + documents: List[List[any]], + n_shuffles: int = 1, + threads: int = None, +) -> List[List[any]]: + """ + Shuffle around the genomic regions for each cell to generate a "context". + + :param List[List[str]] documents: the document list to shuffle. + :param int n_shuffles: The number of shuffles to conduct. 
+ """ + + def shuffle_list(list: List[any], n: int) -> List[any]: + for _ in range(n): + random.shuffle(list) + return list + + _LOGGER.debug(f"Shuffling documents {n_shuffles} times.") + shuffled_documents = documents.copy() + with ThreadPoolExecutor(max_workers=threads) as executor: + shuffled_documents = list( + executor.map( + shuffle_list, + shuffled_documents, + [n_shuffles] * len(documents), + ), + ) + return shuffled_documents + + +def export_region2vec_model( + model: torch.nn.Module, + tokenizer: Tokenizer, + path: str, + checkpoint_file: str = MODEL_FILE_NAME, + universe_file: str = UNIVERSE_FILE_NAME, + config_file: str = CONFIG_FILE_NAME, + **kwargs: Dict[str, any], +): + """ + Export the region2vec model to a folder + + :param torch.nn.Module model: The model to export + :param Tokenizer tokenizer: The tokenizer to export + :param str path: The path to export the model to + :param str checkpoint_file: The name of the checkpoint file to export + :param str universe_file: The name of the universe file to export + :param str config_file: The name of the config file to export + :param Dict[str, any] kwargs: Any additional arguments to pass to the config file + """ + # make sure the path exists + if not os.path.exists(path): + os.makedirs(path) + + # export the model weights + torch.save(model.state_dict(), os.path.join(path, checkpoint_file)) + + # export the config (vocab size, embedding size) + config = { + VOCAB_SIZE_KEY: len(tokenizer), + EMBEDDING_DIM_KEY: model.embedding_dim, + } + if kwargs: + config.update(kwargs) + + with open(os.path.join(path, config_file), "w") as f: + safe_dump(config, f) + + +def load_local_region2vec_model( + model_path: str, config_path: str, **kwargs +) -> Tuple[Region2Vec, dict]: + """ + Load a region2vec model from a local directory + + :param str model_path: The path to the model checkpoint file + :param str config_path: The path to the model config file + :param kwargs: include id of padding token + """ + + # load the 
model state dict (weights) + params = torch.load(model_path, weights_only=True) + + # get the model config (vocab size, embedding size) + with open(config_path, "r") as f: + config = safe_load(f) + + # try with new key first, then old key for backwards compatibility + embedding_dim = config.get(EMBEDDING_DIM_KEY, config.get(EMBEDDING_DIM_KEY_OLD)) + if embedding_dim is None: + raise KeyError( + f"Could not find embedding dimension in config file. Expected key {EMBEDDING_DIM_KEY} or {EMBEDDING_DIM_KEY_OLD}." + ) + else: + if EMBEDDING_DIM_KEY_OLD in config: + _LOGGER.warning( + f"Found old key {EMBEDDING_DIM_KEY_OLD} in config file. This key will be deprecated in future versions. Please notify this models maintainer." + ) + + model = Region2Vec( + config[VOCAB_SIZE_KEY], + embedding_dim=embedding_dim, + padding_idx=kwargs.get("padding_idx", None), + ) + + model.load_state_dict(params) + + return model, config + + +class Region2VecDataset: + def __init__( + self, + path: str, + shuffle: bool = True, + convert_to_str: bool = False, + ): + """ + Initialize a Region2VecDataset. + + The regions are stored in a parquet file, with one document (cell, BED file, etc) per row. + + :param str path: Path to the parquet file containing the tokens. + :param bool shuffle: Whether to shuffle the tokens in each document. + :param bool convert_to_str: Whether to convert the tokens to strings. 
+ """ + self.table = pq.read_table(path) + self.data = self.table["tokens"].to_pylist() + self.shuffle = shuffle + self.convert_to_str = convert_to_str + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + tokens = self.data[idx] + if self.shuffle: + random.shuffle(tokens) + return [str(t) for t in tokens] if self.convert_to_str else tokens + + def __iter__(self): + for i in range(len(self)): + yield self[i] + + def __repr__(self): + return f"Region2VecDataset(data={self.data}, shuffle={self.shuffle})" + + +def train_region2vec_model( + dataset: Region2VecDataset, + embedding_dim: int = DEFAULT_EMBEDDING_DIM, + window_size: int = DEFAULT_WINDOW_SIZE, + epochs: int = DEFAULT_EPOCHS, + min_count: int = DEFAULT_MIN_COUNT, + num_cpus: int = 1, + seed: int = 42, + save_checkpoint_path: str = None, + gensim_params: dict = {}, + load_from_checkpoint: str = None, +) -> "GensimWord2Vec": + """ + Train a gensim Word2Vewc model on the given dataset. + + :param Region2VecDataset data: Data to train on. This is a dataset of tokens. + :param int embedding_dim: Embedding dimension for the model. + :param int window_size: Window size for the model. + :param int epochs: Number of epochs to train for. + :param int min_count: Minimum count for a region to be included in the vocabulary. + :param int num_cpus: Number of cpus to use for training. + :param int seed: Seed to use for training. + :param str save_checkpoint_path: Path to save the model checkpoints to. + :param dict gensim_params: Additional parameters to pass to the gensim model. + :param str load_from_checkpoint: Path to a checkpoint to load from. + + :return GensimWord2Vec: The gensim model that was trained. 
+ """ + # we only need gensim if we are training + from gensim.models import Word2Vec as GensimWord2Vec + from gensim.models.callbacks import CallbackAny2Vec + + class TrainingCallback(CallbackAny2Vec): + """Callback to print loss after each epoch.""" + + def __init__(self): + self.epoch = 0 + + def on_epoch_end(self, model: GensimWord2Vec): + # log the loss + loss = model.get_latest_training_loss() + print("Loss after epoch {}: {}".format(self.epoch, loss)) + _LOGGER.info(f"EPOCH {self.epoch} COMPLETE.") + self.epoch += 1 + + # save the model + if save_checkpoint_path is not None: + # make sure the path exists + if not os.path.exists(save_checkpoint_path): + os.makedirs(save_checkpoint_path) + # save the model + model.save(os.path.join(save_checkpoint_path, f"epoch_{self.epoch}.model")) + + # create gensim model that will be used to train + if load_from_checkpoint is not None: + _LOGGER.info(f"Loading model from checkpoint: {load_from_checkpoint}") + gensim_model = GensimWord2Vec.load(load_from_checkpoint) + else: + _LOGGER.info("Creating new gensim model.") + gensim_model = GensimWord2Vec( + vector_size=embedding_dim, + window=window_size, + min_count=min_count, + workers=num_cpus, + seed=seed, + **gensim_params, + ) + _LOGGER.info("Building vocabulary.") + gensim_model.build_vocab(dataset) + + _LOGGER.info("Training model.") + gensim_model.train( + dataset, + epochs=epochs, # train for 1 epoch at a time, shuffle data each time + compute_loss=True, + total_words=gensim_model.corpus_total_words, + callbacks=[TrainingCallback()], + ) + + _LOGGER.info("Training complete. 
Moving weights to pytorch model.") + return gensim_model diff --git a/geniml/scembed/__init__.py b/geniml/scembed/__init__.py new file mode 100644 index 00000000..02ad2ece --- /dev/null +++ b/geniml/scembed/__init__.py @@ -0,0 +1,4 @@ +# from .annotation import * +# from .const import * +# from .main import * +# from .utils import * diff --git a/gitk/scembed/_version.py b/geniml/scembed/_version.py similarity index 100% rename from gitk/scembed/_version.py rename to geniml/scembed/_version.py diff --git a/geniml/scembed/annotation.py b/geniml/scembed/annotation.py new file mode 100644 index 00000000..2493685f --- /dev/null +++ b/geniml/scembed/annotation.py @@ -0,0 +1,142 @@ +from collections import Counter + +import scanpy as sc +from qdrant_client import QdrantClient +from rich.progress import track + + +class AnnotationServer(QdrantClient): + def __init__( + self, + location: str = None, + url: str = None, + port: int = None, + api_key: str = None, + collection_name: str = None, + timeout: float = 10, + ): + """ + A class for querying a Qdrant server for cell type predictions. This class requires that you have + a Qdrant server running with a collection of cell type embeddings. You can create this collection using + the `geniml/examples/scembed/load_qdrant.ipynb` script. + + :param str collection_name: The name of the collection to query. + :param str location: The location of the Qdrant server. This should be in the format `host:port`. + :param str url: The URL of the Qdrant server. + :param int port: The port of the Qdrant server. + :param str api_key: The API key for the Qdrant server. + :param float timeout: The timeout for the Qdrant server. 
+ """ + super().__init__(location, url=url, port=port, api_key=api_key, timeout=timeout) + self.collection_name = collection_name + + +class Annotator: + def __init__( + self, + collection_name: str = None, + location: str = None, + url: str = None, + port: int = None, + timeout: float = 10, + ): + """ + A class for annotating single cell data with cell type predictions. This class requires that you have + a Qdrant server running with a collection of cell type embeddings. You can create this collection using + the `geniml/examples/scembed/load_qdrant.ipynb` script. + + :param str collection_name: The name of the collection to query. + :param str location: The location of the Qdrant server. This should be in the format `host:port`. + :param str url: The URL of the Qdrant server. + :param int port: The port of the Qdrant server. + :param float timeout: The timeout for the Qdrant server. + + """ + self.collection_name = collection_name + self.url = url + self.port = port + self.timeout = timeout + self._annotation_server = AnnotationServer( + location=location, + url=self.url, + port=self.port, + collection_name=self.collection_name, + timeout=self.timeout, + ) + + def annotate( + self, + adata: sc.AnnData, + embedding_key: str = "embedding", + key_added: str = "pred_celltype", + cluter_key: str = "leiden", + knn: int = 3, + score_threshold: float = 0.5, + ): + """ + Annotate a sc.AnnData object with cell type predictions for each cluster. This functions requires + that you have 1) clustered your data and 2) embedded your data using some pretrained model (databio/multiome). + + You can cluster your data using the `scanpy` + + It is *imperative* that the model used to embed your single cells is the same model used to produce the + embeddings in the database. Otherwise, the predictions will not be accurate; in fact they will be meaningless. + + :param sc.AnnData adata: The annotated data. 
+ :param str embedding_key: The key in `adata.obsm` where the embeddings are stored. + :param str key_added: The key in `adata.obs` where the cell type predictions will be stored. + :param str cluter_key: The key in `adata.obs` where the cluster labels are stored. + :param int knn: The number of nearest neighbors to use when querying the database. + :param float score_threshold: The score threshold to use when querying the database. + """ + + _temp_key = "putative_cell_type" + + # check that the embedding key exists + if embedding_key not in adata.obsm.keys(): + raise ValueError( + f"Embedding key '{embedding_key}' not found in adata.obsm. Please embed your data first." + ) + if cluter_key not in adata.obs.keys(): + raise ValueError( + f"Cluster key '{cluter_key}' not found in adata.obs. Please cluster your data first." + ) + + # init list + scembed_cell_type_preds = [] + + # use qdrant and a simple KNN approach to attach cell types to the embeddings + for embedding in track(adata.obsm["embedding"], total=len(adata.obsm["embedding"])): + results = self._annotation_server.search( + collection_name=self.collection_name, + query_vector=embedding, + limit=knn, + score_threshold=score_threshold, + ) + + result_dicts = [result.dict() for result in results] + + # count "cell_type" in all dicts + c = Counter([result["payload"]["cell_type"] for result in result_dicts]) + + # simply get the name of the top most common and thats it + try: + cell_type = c.most_common(1)[0][0] + except IndexError: + cell_type = "Unknown" + + scembed_cell_type_preds.append(cell_type) + + # add to adata, these are just putative cell types + # we will take a consensus vote later using clusters + adata.obs[_temp_key] = scembed_cell_type_preds + + # now take a consensus vote for each cluster + cluster_celltypes = {} + for cluster in adata.obs["leiden"].unique(): + cluster_celltypes[cluster] = Counter( + adata.obs[adata.obs["leiden"] == cluster][_temp_key] + ).most_common(1)[0][0] + + # map the 
cluster_to_cell_type dictionary to the leiden column + adata.obs[key_added] = adata.obs[cluter_key].map(cluster_celltypes) diff --git a/geniml/scembed/argparser.py b/geniml/scembed/argparser.py new file mode 100644 index 00000000..d60cd974 --- /dev/null +++ b/geniml/scembed/argparser.py @@ -0,0 +1,75 @@ +from ubiquerg import VersionInHelpParser + +from ._version import __version__ +from .const import * + + +def build_argparser(parser: VersionInHelpParser = None) -> VersionInHelpParser: + """ + Parse command-line arguments passed to the pipeline. + + :param VersionInHelpParser parser: an argument parser object (argparse.ArgumentParser) + :return: the argument parser object + """ + # Argument Parsing + ########################################################################### + if parser is None: + parser = VersionInHelpParser( + prog=PKG_NAME, + version=__version__, + description="%(prog)s - embed single-cell data as region vectors", + ) + + # Pipeline-specific arguments + parser.add_argument( + "-i", + "--input", + default=None, + type=str, + required=True, + help="Path to MarketMatrix format count matrix.", + ) + + parser.add_argument( + "--nothreads", + dest="nothreads", + default=1, + help="Number of available processors for " "Word2Vec training.", + ) + + parser.add_argument( + "--noreads", + dest="noreads", + default=2, + help="Minimum number of reads that overlap a region " "for that region to be included.", + ) + + parser.add_argument( + "--window-size", + dest="window_size", + default=100, + help="Word2Vec window size.", + ) + + parser.add_argument( + "--epochs", + dest="epochs", + default=30, + help="Number of epochs for training", + ) + + parser.add_argument( + "--dimension", + dest="dimension", + default=100, + help="Number of dimensions to train the word2vec " "model.", + ) + + parser.add_argument( + "--min-count", + dest="min_count", + default=10, + help="Minimum count for Word2Vec model.", + ) + + return parser diff --git a/gitk/scembed/cli.py 
b/geniml/scembed/cli.py similarity index 95% rename from gitk/scembed/cli.py rename to geniml/scembed/cli.py index 8a07d47d..29eb21cc 100644 --- a/gitk/scembed/cli.py +++ b/geniml/scembed/cli.py @@ -1,17 +1,13 @@ -import sys import os +import sys + import logmuse from gensim.models import Word2Vec -from .argparser import build_argparser from ._version import __version__ +from .argparser import build_argparser from .const import * -from .scembed import ( - convert_anndata_to_documents, - load_scanpy_data, - shuffle_documents, - train, -) +from .main import convert_anndata_to_documents, load_scanpy_data, shuffle_documents, train def main(): diff --git a/geniml/scembed/const.py b/geniml/scembed/const.py new file mode 100644 index 00000000..affb3c8e --- /dev/null +++ b/geniml/scembed/const.py @@ -0,0 +1,8 @@ +LOGGING_LEVEL = "INFO" +MODULE_NAME = "scembed" + +CHR_KEY = "chr" +START_KEY = "start" +END_KEY = "end" + +DEFAULT_CHUNK_SIZE = 1000 diff --git a/geniml/scembed/exceptions.py b/geniml/scembed/exceptions.py new file mode 100644 index 00000000..1570e065 --- /dev/null +++ b/geniml/scembed/exceptions.py @@ -0,0 +1,6 @@ +class ScembedException(Exception): + pass + + +class ModelNotTrainedError(ScembedException): + pass diff --git a/geniml/scembed/main.py b/geniml/scembed/main.py new file mode 100755 index 00000000..7f6de5b7 --- /dev/null +++ b/geniml/scembed/main.py @@ -0,0 +1,321 @@ +import os +from logging import getLogger +from typing import Union + +import numpy as np +import scanpy as sc +import torch +from huggingface_hub import hf_hub_download +from rich.progress import track +from gtars.tokenizers import Tokenizer + +from ..region2vec.const import ( + CONFIG_FILE_NAME, + DEFAULT_EMBEDDING_DIM, + DEFAULT_EPOCHS, + DEFAULT_MIN_COUNT, + DEFAULT_WINDOW_SIZE, + MODEL_FILE_NAME, + POOLING_METHOD_KEY, + POOLING_TYPES, + UNIVERSE_FILE_NAME, +) +from ..region2vec.main import Region2Vec +from ..region2vec.utils import ( + Region2VecDataset, + export_region2vec_model, + 
load_local_region2vec_model, + train_region2vec_model, +) +from ..tokenization.utils import tokenize_anndata +from .const import MODULE_NAME + +_GENSIM_LOGGER = getLogger("gensim") +_LOGGER = getLogger(MODULE_NAME) + +# demote gensim logger to warning +_GENSIM_LOGGER.setLevel("WARNING") + + +class ScEmbed: + def __init__( + self, + model_path: str = None, + tokenizer: Tokenizer = None, + device: str = None, + pooling_method: POOLING_TYPES = "mean", + **kwargs, + ): + """ + Initialize ScEmbed. + + :param str model_path: Path to the pre-trained model on huggingface. + :param embedding_dim: Dimension of the embedding. + :param kwargs: Additional keyword arguments to pass to the model. + """ + super().__init__() + self.model_path: str = model_path + self.tokenizer: Tokenizer + self.trained: bool = False + self._model: Region2Vec = None + self.pooling_method: POOLING_TYPES = pooling_method + + if model_path is not None: + self._init_from_huggingface(model_path) + self.trained = True + + elif tokenizer is not None: + self._init_model(tokenizer, **kwargs) + + # set the device + self._target_device = torch.device( + device if device else ("cuda" if torch.cuda.is_available() else "cpu") + ) + + def _init_tokenizer(self, tokenizer: Union[Tokenizer, str]): + """ + Initialize the tokenizer. + + :param tokenizer: Tokenizer to add to the model. + """ + if isinstance(tokenizer, str): + if os.path.exists(tokenizer): + self.tokenizer = Tokenizer(tokenizer) + else: + raise FileNotFoundError(f"Tokenizer file {tokenizer} not found.") + elif isinstance(tokenizer, Tokenizer): + self.tokenizer = tokenizer + else: + raise TypeError("tokenizer must be of type Tokenizer or str.") + + def _init_model(self, tokenizer, **kwargs): + """ + Initialize the core model. This will initialize the model from scratch. + + :param kwargs: Additional keyword arguments to pass to the model. 
+ """ + self._init_tokenizer(tokenizer) + + self._model = Region2Vec( + len(self.tokenizer), + embedding_dim=kwargs.get("embedding_dim", DEFAULT_EMBEDDING_DIM), + ) + + @property + def model(self): + """ + Get the core Region2Vec model. + """ + return self._model + + def add_tokenizer(self, tokenizer: Tokenizer, **kwargs): + """ + Add a tokenizer to the model. This should be use when the model + is not initialized with a tokenizer. + + :param tokenizer: Tokenizer to add to the model. + :param kwargs: Additional keyword arguments to pass to the model. + """ + if self._model is not None: + raise RuntimeError("Cannot add a tokenizer to a model that is already initialized.") + + self.tokenizer = tokenizer + if not self.trained: + self._init_model(**kwargs) + + def _load_local_model(self, model_path: str, vocab_path: str, config_path: str): + """ + Load the model from a checkpoint. + + :param str model_path: Path to the model checkpoint. + :param str vocab_path: Path to the vocabulary file. + """ + _model, config = load_local_region2vec_model(model_path, config_path) + tokenizer = Tokenizer(vocab_path) + + self._model = _model + self.tokenizer = tokenizer + + if POOLING_METHOD_KEY in config: + self.pooling_method = config[POOLING_METHOD_KEY] + + def _init_from_huggingface( + self, + model_path: str, + model_file_name: str = MODEL_FILE_NAME, + universe_file_name: str = UNIVERSE_FILE_NAME, + config_file_name: str = CONFIG_FILE_NAME, + **kwargs, + ): + """ + Initialize the model from a huggingface model. This uses the model path + to download the necessary files and then "build itself up" from those. This + includes both the actual model and the tokenizer. + + :param str model_path: Path to the pre-trained model on huggingface. + :param str model_file_name: Name of the model file. + :param str universe_file_name: Name of the universe file. + :param kwargs: Additional keyword arguments to pass to the hf download function. 
+ """ + model_file_path = hf_hub_download(model_path, model_file_name, **kwargs) + universe_path = hf_hub_download(model_path, universe_file_name, **kwargs) + config_path = hf_hub_download(model_path, config_file_name, **kwargs) + + self._load_local_model(model_file_path, universe_path, config_path) + + @classmethod + def from_pretrained( + cls, + path_to_files: str, + model_file_name: str = MODEL_FILE_NAME, + universe_file_name: str = UNIVERSE_FILE_NAME, + config_file_name: str = CONFIG_FILE_NAME, + ) -> "ScEmbed": + """ + Load the model from a set of files that were exported using the export function. + + :param str path_to_files: Path to the directory containing the files. + :param str model_file_name: Name of the model file. + :param str universe_file_name: Name of the universe file. + :param str config_file_name: Name of the config file. + + :return: The loaded model. + """ + model_file_path = os.path.join(path_to_files, model_file_name) + universe_file_path = os.path.join(path_to_files, universe_file_name) + config_file_path = os.path.join(path_to_files, config_file_name) + + instance = cls() + instance._load_local_model(model_file_path, universe_file_path, config_file_path) + instance.trained = True + + return instance + + def train( + self, + dataset: Region2VecDataset, + window_size: int = DEFAULT_WINDOW_SIZE, + epochs: int = DEFAULT_EPOCHS, + min_count: int = DEFAULT_MIN_COUNT, + num_cpus: int = 1, + seed: int = 42, + save_checkpoint_path: str = None, + gensim_params: dict = {}, + load_from_checkpoint: str = None, + ) -> bool: + """ + Train the model. + + :param Region2VecDataset data: Data to train on. This is a dataset of tokens. + :param int window_size: Window size for the model. + :param int epochs: Number of epochs to train for. + :param int min_count: Minimum count for a region to be included in the vocabulary. + :param int num_cpus: Number of cpus to use for training. + :param int seed: Seed to use for training. 
+ :param str save_checkpoint_path: Path to save the model checkpoints to. + :param dict gensim_params: Additional parameters to pass to the gensim model. + :param str load_from_checkpoint: Path to a checkpoint to load from. + + :return bool: Whether or not the model was trained. + """ + + # validate a model exists + if self._model is None: + raise RuntimeError( + "Cannot train a model that has not been initialized. Please initialize the model first using a tokenizer or from a huggingface model." + ) + + gensim_model = train_region2vec_model( + dataset, + embedding_dim=self._model.embedding_dim, + window_size=window_size, + epochs=epochs, + min_count=min_count, + num_cpus=num_cpus, + seed=seed, + save_checkpoint_path=save_checkpoint_path, + gensim_params=gensim_params, + load_from_checkpoint=load_from_checkpoint, + ) + + # once done training, set the weights of the pytorch model in self._model + for id in track( + gensim_model.wv.key_to_index, + total=len(gensim_model.wv.key_to_index), + description="Setting weights.", + ): + self._model.projection.weight.data[int(id)] = torch.tensor(gensim_model.wv[id]) + + # set the model as trained + self.trained = True + + return True + + def export( + self, + path: str, + checkpoint_file: str = MODEL_FILE_NAME, + universe_file: str = UNIVERSE_FILE_NAME, + config_file: str = CONFIG_FILE_NAME, + ): + """ + Function to facilitate exporting the model in a way that can + be directly uploaded to huggingface. This exports the model + weights and the vocabulary. + + :param str path: Path to export the model to. + """ + # make sure the model is trained + if not self.trained: + raise RuntimeError("Cannot export an untrained model.") + + export_region2vec_model( + self._model, + self.tokenizer, + path, + checkpoint_file=checkpoint_file, + universe_file=universe_file, + config_file=config_file, + ) + + def encode(self, regions: Union[sc.AnnData, str], pooling: POOLING_TYPES = None) -> np.ndarray: + """ + Get the vector for a region. 
+ + :param Region region: Region to get the vector for. + :param str pooling: Pooling type to use. + + :return np.ndarray: Vector for the region. + """ + # allow the user to override the pooling method + pooling = pooling or self.pooling_method + + # data validation + if not (isinstance(regions, sc.AnnData) or isinstance(regions, str)): + raise TypeError( + f"Regions must be of type AnnData or str, not {type(regions).__name__}" + ) + if isinstance(regions, str): + regions = sc.read_h5ad(regions) + + if pooling not in ["mean", "max"]: + raise ValueError(f"pooling must be one of {POOLING_TYPES}") + + # tokenize the region + tokens = tokenize_anndata(regions, self.tokenizer) + tokens = [[t["input_ids"] for t in sublist] for sublist in tokens] + + # get the vector + embeddings = [] + for token_set in track(tokens, total=len(tokens), description="Getting embeddings"): + region_embeddings = self._model.projection(torch.tensor(token_set)) + if pooling == "mean": + region_embeddings = torch.mean(region_embeddings, axis=0).detach().numpy() + elif pooling == "max": + region_embeddings = torch.max(region_embeddings, axis=0).values.detach().numpy() + else: + # this should be unreachable + raise ValueError(f"pooling must be one of {POOLING_TYPES}") + embeddings.append(region_embeddings) + + return np.vstack(embeddings) diff --git a/geniml/scembed/models.py b/geniml/scembed/models.py new file mode 100644 index 00000000..e69de29b diff --git a/geniml/search/__init__.py b/geniml/search/__init__.py new file mode 100644 index 00000000..ea603813 --- /dev/null +++ b/geniml/search/__init__.py @@ -0,0 +1,6 @@ +from .backends import HNSWBackend, QdrantBackend +from .filebackend_tools import merge_backends +from .interfaces import BED2BEDSearchInterface, Text2BEDSearchInterface +from .query2vec import BED2Vec, Text2Vec +from .search_eval import anecdotal_search_from_hf_data +from .utils import rand_eval diff --git a/geniml/search/backends/__init__.py b/geniml/search/backends/__init__.py 
new file mode 100644 index 00000000..41343e65 --- /dev/null +++ b/geniml/search/backends/__init__.py @@ -0,0 +1,3 @@ +from .bivecbackend import BiVectorBackend +from .dbbackend import QdrantBackend +from .filebackend import HNSWBackend diff --git a/geniml/search/backends/abstract.py b/geniml/search/backends/abstract.py new file mode 100644 index 00000000..f4aaa5ed --- /dev/null +++ b/geniml/search/backends/abstract.py @@ -0,0 +1,57 @@ +from abc import ABC, abstractmethod +from typing import Dict, List, Union + +import numpy as np + + +class EmSearchBackend(ABC): + """ + An abstract class representing Embedding Search Backends. This allows + backends to be either a qdrant server or a local in-memory NMS index, or + anything, really. This allows us to use the same interface for both. + """ + + def __init__(self, embeddings: np.ndarray = None, labels: list = None) -> None: + if embeddings: + self.load(embeddings, labels) + + @abstractmethod + def load( + self, + vectors: np.ndarray, + ids: Union[np.ndarray, None] = None, + payloads: Union[List[Dict[str, str]], None] = None, + ) -> None: + raise NotImplementedError + + @abstractmethod + def search( + self, + query: np.ndarray, + limit: int, + with_payload: bool = True, + with_vectors: bool = True, + offset: int = 0, + ) -> List[Dict]: + """ + Search for the nearest neighbors of the given embedding + + :param query: the embedding to search for + :param k: the number of results to return + :return: a list of (id, score) pairs + """ + raise NotImplementedError() + + @abstractmethod + def __len__(self) -> int: + """ + Return the number of embeddings in the backend + """ + raise NotImplementedError() + + @abstractmethod + def retrieve_info(self, key) -> List[Dict]: + """ + with a list of storage ids, return matching vectors and their information + """ + raise NotImplementedError() diff --git a/geniml/search/backends/bivecbackend.py b/geniml/search/backends/bivecbackend.py new file mode 100644 index 00000000..68d47b4e --- 
/dev/null +++ b/geniml/search/backends/bivecbackend.py @@ -0,0 +1,289 @@ +import logging +import math +from typing import Dict, List, Tuple, Union + +import numpy as np + +from ...const import PKG_NAME +from .abstract import EmSearchBackend + +_LOGGER = logging.getLogger(PKG_NAME) + + +def batch_for_request( + ids: Union[List[str], List[int]], ranks_scores: List[float], batch_size: int = 100 +) -> List[Tuple[List, List]]: # used ChatGPT + """ + + :param ids: collected ids of BED files matching retrieved metadata tags + :param ranks_scores: keep track of ranks or scores from metadata tag embedding vector search + :param batch_size: size of batch, > 100 may crash qdrant server + :return: batched BED ids and matching text-search scores/ranks + """ + # Check if the lists are the same length + if len(ids) != len(ranks_scores): + raise ValueError("The lists must have the same length.") + + # Create batches + batches = [] + for i in range(0, len(ids), batch_size): + batch1 = ids[i : i + batch_size] + batch2 = ranks_scores[i : i + batch_size] + batches.append((batch1, batch2)) + + return batches + + +class BiVectorBackend: + """ + Search backend that connects the embeddings of metadata tags and bed files + """ + + def __init__( + self, + metadata_backend: EmSearchBackend, + bed_backend: EmSearchBackend, + metadata_payload_matches: str = "matched_files", + ): + """ + :param metadata_backend: search backend where embedding vectors of metadata tags are stored + :param bed_backend: search backend where embedding vectors of BED files are stored + :param metadata_payload_matches: the key in metadata backend payloads to files matching to that metadata tag + """ + self.metadata_backend = metadata_backend + self.bed_backend = bed_backend + self.metadata_payload_matches = metadata_payload_matches + + def search( + self, + query: np.ndarray, + limit: int, + with_payload: bool = True, + with_vectors: bool = True, + offset: int = 0, + p: float = 1.0, + q: float = 1.0, + distance: bool 
def _rank_search(
    self,
    metadata_results: List[Dict],
    limit: int,
    with_payload: bool = True,
    with_vectors: bool = True,
    offset: int = 0,
) -> List[Dict[str, Union[int, float, Dict[str, str], List[float]]]]:
    """
    Search based on maximum rank in results of metadata embedding and results of BED embedding

    For every metadata hit (in rank order) the matching BED ids are read from its payload,
    the stored vectors of those BED files are retrieved, and each vector is used as a query
    against the BED backend. Each retrieved BED file is scored with
    max(rank of the metadata hit, rank within the BED search).

    :param metadata_results: result of metadata search; each item needs
        result["payload"][self.metadata_payload_matches]
    :param limit: number of neighbors requested per BED vector query, and size of the output
    :param with_payload: whether payload is included in the result
    :param with_vectors: whether the stored vector is included in the result
    :param offset: the offset applied to the final, merged result list
    :return: the search result ranked based on maximum rank
    """
    text_rank = []
    ids_to_retrieve = []
    seen_bed_ids = set()

    for i, result in enumerate(metadata_results):
        # all bed files matching the retrieved metadata tag
        for id_ in result["payload"][self.metadata_payload_matches]:
            # keep only the first (best-ranked) occurrence of each BED id,
            # including repeats inside a single tag's list
            if id_ in seen_bed_ids:
                continue
            seen_bed_ids.add(id_)
            text_rank.append(i)
            ids_to_retrieve.append(id_)

    bed_results = []
    max_rank = []

    for ids, ranks in batch_for_request(ids_to_retrieve, text_rank, 100):
        query_beds = self.bed_backend.retrieve_info(ids, with_vectors=True)
        if isinstance(query_beds, dict):
            query_beds = [query_beds]

        bed_vecs = [b["vector"] for b in query_beds]
        if not bed_vecs:
            # nothing could be retrieved for this batch, so there is nothing to query with
            continue

        # NOTE(review): ranks[i] below assumes retrieve_info returned one entry per
        # requested id, in order; QdrantBackend.retrieve_info skips ids it cannot find,
        # which would shift the alignment -- confirm against the backends in use.

        # search request once
        retrieved_batch = self.bed_backend.search(
            np.array(bed_vecs),
            limit=limit,
            with_payload=with_payload,
            with_vectors=with_vectors,
            offset=0,
        )

        # HNSWBackend.search returns a flat list of dicts when there is a single query
        # vector; wrap it so the loop below always sees one result list per query
        if retrieved_batch and isinstance(retrieved_batch[0], dict):
            retrieved_batch = [retrieved_batch]

        for i, retrieved_beds in enumerate(retrieved_batch):
            # j: rank for each bed vector query search
            for j, retrieval in enumerate(retrieved_beds):
                bed_results.append(retrieval)
                # collect maximum rank
                max_rank.append(max(ranks[i], j))

    return self._top_k(max_rank, bed_results, limit, offset=offset, rank=True)
def _top_k(
    self,
    scales: List[Union[int, float]],
    results: List[Dict[str, Union[int, float, Dict[str, str], List[float]]]],
    limit: int = 10,
    offset: int = 0,
    rank: bool = True,
):
    """
    Order the retrieval results by their scale, drop repeated ids, and return one page

    :param scales: list of weighted scores or maximum ranks, aligned with results
    :param results: retrieval results; each dictionary is updated in place
    :param limit: number of results to return
    :param offset: the offset of the search results
    :param rank: True if scales are maximum ranks (smaller is better),
        False if they are scores (larger is better)
    :return: the selected results; each carries "max_rank" (rank mode) or "score" (score mode)
    """
    # ranks sort ascending, scores descending; the sort is stable either way
    ordered = sorted(zip(scales, results), key=lambda pair: pair[0], reverse=not rank)

    kept = {}
    for value, hit in ordered:
        hit_id = hit["id"]
        if hit_id in kept:
            # a better-placed entry for this id was already kept
            continue

        if rank:
            # replace the backend's own score/distance with the maximum rank
            try:
                del hit["score"]
            except KeyError:
                del hit["distance"]
            hit["max_rank"] = value
        else:
            # replace the backend's own score/distance with the weighted score
            if self.score_key == "distance":
                del hit[self.score_key]
            hit["score"] = value

        kept[hit_id] = hit

    return list(kept.values())[offset : limit + offset]
def results_processing(search_results, with_payload: bool, with_vectors: bool) -> List[Dict]:
    """
    Convert qdrant client search hits into plain dictionaries

    :param search_results: iterable of hits exposing .id, .score, .payload and .vector
    :param with_payload: whether each dictionary gets a "payload" entry
    :param with_vectors: whether each dictionary gets a "vector" entry
    :return: one dictionary per hit, in the same order as search_results
    """

    def _as_dict(hit) -> Dict:
        # "id" and "score" are always present; the rest is opt-in
        entry = {"id": hit.id, "score": hit.score}
        if with_payload:
            entry["payload"] = hit.payload
        if with_vectors:
            entry["vector"] = hit.vector
        return entry

    return [_as_dict(hit) for hit in search_results]
def load(
    self,
    vectors: np.ndarray,
    ids: Union[List[str], None] = None,
    payloads: Union[List[Dict[str, str]], None] = None,
):
    """
    Upload vectors and their labels into qdrant storage.

    :param vectors: embedding vectors, one row per point (each row is stored via .tolist())
    :param ids: list of n point ids, or None to generate consecutive integer ids
        starting at the current size of the collection
    :param payloads: optional list of n dictionaries that contain vector metadata;
        when omitted, points are stored without a payload
    :return:
    """
    # payloads is optional, so fall back to the number of vectors when it is missing
    # (the previous code called len(payloads) unconditionally and failed on None)
    n_points = len(vectors) if payloads is None else len(payloads)

    if ids is None or len(ids) == 0:
        start = len(self)
        ids = list(range(start, start + n_points))

    # NOTE(review): verify_load_inputs is defined elsewhere; HNSWBackend.load also passes
    # payloads=None to it, so it presumably accepts None -- confirm.
    verify_load_inputs(vectors, ids, payloads)

    points = [
        PointStruct(
            id=ids[i],
            vector=vectors[i].tolist(),
            payload=None if payloads is None else payloads[i],
        )
        for i in range(n_points)
    ]
    self.qd_client.upsert(
        collection_name=self.collection,
        points=points,
    )
def retrieve_info(
    self, ids: Union[List[int], int, List[str], str], with_vectors: bool = False
) -> Union[
    Dict[str, Union[int, str, List[float], Dict[str, str]]],
    List[Dict[str, Union[int, str, List[float], Dict[str, str]]]],
]:
    """
    With a given list of storage ids, return the information of these vectors

    String ids that contain no hyphen are reformatted into the 8-4-4-4-12 UUID layout
    before the lookup. The caller's list is left untouched.

    :param ids: list of ids, or a single id
    :param with_vectors: whether the vectors themselves will also be returned in the output
    :return: a list of dictionaries ("id", "payload", optionally "vector") in the order of ids,
        skipping ids that are not stored; if exactly one entry was found, that single
        dictionary is returned instead of a list
    """
    # retrieve() only takes iterable input; copy so the caller's list is never mutated
    requested = list(ids) if isinstance(ids, list) else [ids]

    # add hyphen to uuid if missing
    normalized = [
        f"{id_[:8]}-{id_[8:12]}-{id_[12:16]}-{id_[16:20]}-{id_[20:]}"
        if isinstance(id_, str) and "-" not in id_
        else id_
        for id_ in requested
    ]

    retrievals = self.qd_client.retrieve(
        collection_name=self.collection,
        ids=normalized,
        with_payload=True,
        with_vectors=with_vectors,
    )

    retrieval_dict = {result.id: result for result in retrievals}

    # retrieve() of qd client does not return result in the order of ids in the list
    # get the retrieval result in output by id order
    output_list = []
    for id_ in normalized:
        result = retrieval_dict.get(id_)
        if result is None:
            _LOGGER.warning(f"Warning: no id stored in backend matches {id_}.")
            continue
        result_dict = {"id": result.id, "payload": result.payload}
        if with_vectors:
            result_dict["vector"] = result.vector
        output_list.append(result_dict)

    # with just one id, only the dictionary instead of the list will be returned
    if len(output_list) == 1:
        return output_list[0]
    return output_list
def __init__(
    self,
    local_index_path: str = DEFAULT_INDEX_PATH,
    payloads: Union[dict, str, None] = None,
    space: str = DEFAULT_HNSW_SPACE,
    dim: int = DEFAULT_DIM,
    ef: int = DEFAULT_EF,
    m: int = DEFAULT_M,
):
    """
    Initiate the backend

    If local_index_path exists, the index is loaded from it and payloads are taken from
    the payloads argument; otherwise an empty index is saved to that path and the
    backend starts with no payloads (the payloads argument is ignored in that case).

    :param local_index_path: local path where the index is saved to
    :param payloads: payload dictionary, or path to a .json / .pkl / .yaml file holding it;
        None (default) gives every instance its own empty dictionary
    :param space: possible options are l2, cosine or ip
    :param dim: dimension of vectors that will be stored
    :param ef: defines a construction time/accuracy trade-off, higher ef -> more accurate but slower
    :param m: connected with internal dimensionality of the data, higher M -> higher accuracy/run_time
    when ef is fixed
    :raises ValueError: if payloads is a path with an unsupported extension
    """
    # initiate the index
    self.idx = hnswlib.Index(space=space, dim=dim)  # possible options are l2, cosine or ip
    self.idx.init_index(max_elements=0, ef_construction=ef, M=m)

    if os.path.exists(local_index_path):
        # load from local index that already store vectors
        self.idx.load_index(local_index_path)
        _LOGGER.info(f"Using index {local_index_path} with {self.idx.element_count} points.")

        # load payloads:
        if isinstance(payloads, str):
            # NOTE(review): JSON object keys are always strings, while load() stores
            # payloads under numeric ids -- confirm lookups work for JSON payload files.
            if payloads.endswith(".json"):
                with open(payloads, "r") as f:
                    self.payloads = json.load(f)
            elif payloads.endswith(".pkl"):
                # NOTE: unpickling executes arbitrary code; only load trusted files
                with open(payloads, "rb") as f:
                    self.payloads = pickle.load(f)
            elif payloads.endswith(".yaml"):
                with open(payloads, "r") as f:
                    self.payloads = yaml.load(f, Loader=yaml.SafeLoader)
            else:
                raise ValueError(
                    f"payload should be either a json, pickle, or yaml file. you supplied: {payloads.split('.')[-1]}"
                )
        else:
            # a fresh dict per instance; a shared default dict would leak payloads
            # between backends
            self.payloads = {} if payloads is None else payloads
    else:
        # save the index to local file path
        _LOGGER.info(f"Index {local_index_path} does not exist, creating it.")
        self.idx.save_index(local_index_path)
        self.payloads = {}

    self.idx_path = local_index_path
def search(
    self,
    query: np.ndarray,
    limit: int,
    with_payload: bool = True,
    with_vectors: bool = True,
    offset: int = 0,
) -> Union[
    List[Dict[str, Union[int, float, Dict[str, str], np.ndarray]]],
    List[List[Dict[str, Union[int, float, Dict[str, str], np.ndarray]]]],
]:
    """
    With query vector(s), get the limit nearest neighbors.

    :param query: the query vector(s) passed to the index's knn_query
    :param limit: number of nearest neighbors to return for each query vector
    :param with_payload: whether payload is included in the result
    :param with_vectors: whether the stored vector is included in the result
    :param offset: number of best hits to skip before the returned page starts
    :return: for a single query vector, a list of dictionaries in the format:
        {
            "id": <id>
            "distance": <distance>
            "payload": {<payload>}   (if with_payload)
            "vector": <vector>       (if with_vectors)
        }
        for several query vectors, a list with one such list per query
    """
    # fetch enough neighbors to cover the skipped hits plus the requested page
    ids, distances = self.idx.knn_query(query, k=limit + offset)
    # ids and distances are 2d array
    ids = ids.tolist()
    distances = distances.tolist()

    output_list = []
    for row_ids, row_distances in zip(ids, distances):
        # apply the offset: previously the first `limit` hits were always returned
        page_ids = row_ids[offset : offset + limit]
        page_distances = row_distances[offset : offset + limit]

        page_vectors = None
        if with_vectors and page_ids:
            page_vectors = self.idx.get_items(page_ids, return_type="numpy")

        search_list = []
        for j, (id_, distance) in enumerate(zip(page_ids, page_distances)):
            output_dict = {"id": id_, "distance": distance}
            if with_payload:
                output_dict["payload"] = self.payloads[id_]
            if with_vectors:
                output_dict["vector"] = page_vectors[j]
            search_list.append(output_dict)
        output_list.append(search_list)

    # a single query returns its result list directly
    if len(output_list) == 1:
        return output_list[0]
    return output_list
with_vectors: bool = False) -> Union[ + Dict[str, Union[int, List[float], Dict[str, str]]], + List[Dict[str, Union[int, List[float], Dict[str, str]]]], + ]: + """ + With an id or a list of storage ids, return the information of these vectors + :param ids: storage id, or a list of ids + :param with_vectors: whether the stored vector is included in the result + :return: + """ + if not isinstance(ids, list): + # retrieve() only takes iterable input + ids = [ids] + output_list = [] + for id_ in ids: + output_dict = {"id": id_, "payload": self.payloads[id_]} + output_list.append(output_dict) + + if with_vectors: + vecs = self.idx.get_items(ids, return_type="numpy") + for i in range(len(vecs)): + output_list[i]["vector"] = vecs[i] + + # with just one id, only the dictionary instead of the list will be returned + if len(output_list) == 1: + return output_list[0] + else: + return output_list + + def __str__(self): + return "HNSWBackend with {} items".format(len(self)) + + def __repr__(self): + return "HNSWBackend with {} items".format(len(self)) diff --git a/geniml/search/const.py b/geniml/search/const.py new file mode 100644 index 00000000..ced9e24d --- /dev/null +++ b/geniml/search/const.py @@ -0,0 +1,44 @@ +from qdrant_client.http import models +from qdrant_client.models import Distance + +DEFAULT_QDRANT_HOST = "localhost" +DEFAULT_QDRANT_PORT = 6333 + +DEFAULT_COLLECTION_NAME = "embeddings" + +DEFAULT_QDRANT_DIST = Distance.COSINE + +DEFAULT_INDEX_PATH = "./current_index.bin" + +DEFAULT_HNSW_SPACE = "cosine" + +DEFAULT_DIM = 100 + +DEFAULT_TEXT_DIM = 384 + + +# the size of the dynamic list for the nearest neighbors +# Higher ef leads to more accurate but slower search +# cannot be set lower than the number of queried nearest neighbors k +DEFAULT_EF = 200 + +# the number of bi-directional links created for every new element during construction +# Higher M work better on datasets with high intrinsic dimensionality and/or high recall +# low M work better for datasets with 
def merge_backends(
    backends_to_merge: List[HNSWBackend], local_index_path: str, dim: int
) -> HNSWBackend:
    """
    Merge multiple backends into one

    Items are read from each input backend by position 0 .. len(backend) - 1 and appended
    to the output backend one input backend at a time.

    :param backends_to_merge: a list of [HNSWBackend]
    :param local_index_path: the path to the local index file of the merged output HNSWBackend
    :param dim: the dimension of vectors stored in the HNSWBackend

    :return: a HNSWBackend holding the vectors and payloads of every backend in backends_to_merge
    """
    merged = HNSWBackend(
        local_index_path=local_index_path,
        payloads={},
        dim=dim,
    )

    for source in backends_to_merge:
        positions = range(len(source))
        # one vector per stored position, fetched individually
        vectors = [source.idx.get_items([pos], return_type="numpy")[0] for pos in positions]
        payloads = [source.payloads[pos] for pos in positions]
        merged.load(vectors=np.array(vectors), payloads=payloads)

    return merged
def load_vectors(npz_path, vec_key="vectors") -> np.ndarray:
    """
    Load vectors stored in .npz file

    :param npz_path: path to the npz file
    :param vec_key: storage key of vector in the npz file
    :return: the stored vectors
    """
    # the archive object keeps the file open; close it once the array has been read
    with np.load(npz_path) as data:
        return data[vec_key]
file_id_dict = load_json(file_id_path) + metadata_dict = load_json(metadata_path) + metadata_match_dict = load_json(metadata_match_path) + + bed_data = np.load(bed_embeddings_path) + bed_embeddings = bed_data["vectors"] + bed_names = list(bed_data["names"]) + + bed_name_idx = {value: index for index, value in enumerate(bed_names)} + + text_data = np.load(text_embeddings_path) + + text_embeddings = text_data["vectors"] + text_annotations = list(text_data["texts"]) + + bed_payloads = [] + bed_vecs = [] + + # vectors and payloads for metadata backend + for i in range(len(file_id_dict)): + bed_embedding_id = bed_name_idx[file_id_dict[str(i)]] + bed_vecs.append(bed_embeddings[bed_embedding_id]) + bed_payloads.append( + {"name": file_id_dict[str(i)], "metadata": metadata_dict[file_id_dict[str(i)]]} + ) + + # payloads for bed file backend + text_payloads = [] + for annotation in text_annotations: + text_payloads.append( + {"term": annotation, "matched_files": metadata_match_dict[annotation]} + ) + + # backends in temporary directory + with tempfile.TemporaryDirectory() as temp_dir: + # backend for BED file embedding vectors + bed_backend = HNSWBackend(local_index_path=os.path.join(temp_dir, "bed.bin"), dim=100) + bed_backend.load(vectors=np.array(bed_vecs), payloads=bed_payloads) + + # backend for metadata embedding vectors + text_backend = HNSWBackend(local_index_path=os.path.join(temp_dir, "text.bin"), dim=384) + text_backend.load(vectors=np.array(text_embeddings), payloads=text_payloads) + + # combined bi-vector search backend + search_backend = BiVectorBackend(text_backend, bed_backend) + + # search interface + search_interface = BiVectorSearchInterface( + backend=search_backend, query2vec="sentence-transformers/all-MiniLM-L6-v2" + ) + + result = search_interface.query_search( + query=query, + limit=limit, + with_payload=True, + p=p, + q=q, + with_vectors=False, + distance=True, # HNSWBackend returns result by distance instead of similarity + rank=rank, + ) + + return 
result diff --git a/geniml/search/interfaces/__init__.py b/geniml/search/interfaces/__init__.py new file mode 100644 index 00000000..ff2707ca --- /dev/null +++ b/geniml/search/interfaces/__init__.py @@ -0,0 +1,3 @@ +from .bed2bed import BED2BEDSearchInterface +from .mlfree import BiVectorSearchInterface +from .text2bed import Text2BEDSearchInterface diff --git a/geniml/search/interfaces/abstract.py b/geniml/search/interfaces/abstract.py new file mode 100644 index 00000000..3a8fb1e8 --- /dev/null +++ b/geniml/search/interfaces/abstract.py @@ -0,0 +1,32 @@ +from abc import ABC, abstractmethod +from typing import Dict, List, Union + +import numpy as np + +from ...io import RegionSet +from ..backends.abstract import EmSearchBackend +from ..query2vec.abstract import Query2Vec + + +class BEDSearchInterface(ABC): + """ + An abstract class representing BED files search interface. + The query will be embedded by one of the subclass of Query2Vec, + and the embedding is used to do KNN search in the backend + where BED embeddings are stored. 
+ """ + + def __init__(self, backend: EmSearchBackend, query2vec: Query2Vec) -> None: + self.backend = backend + self.query2vec = query2vec + + @abstractmethod + def query_search( + self, + query: Union[str, RegionSet, np.ndarray], + limit: int, + with_payload: bool = True, + with_vectors: bool = True, + offset: int = 0, + ) -> List[Dict]: + raise NotImplementedError diff --git a/geniml/search/interfaces/bed2bed.py b/geniml/search/interfaces/bed2bed.py new file mode 100644 index 00000000..51b7d450 --- /dev/null +++ b/geniml/search/interfaces/bed2bed.py @@ -0,0 +1,50 @@ +from typing import Dict, List, Union + +import numpy as np + +from ...io import RegionSet +from ..backends import HNSWBackend, QdrantBackend +from ..query2vec import BED2Vec +from .abstract import BEDSearchInterface + + +class BED2BEDSearchInterface(BEDSearchInterface): + """Search interface for the query that is a region set""" + + def __init__( + self, + backend: Union[QdrantBackend, HNSWBackend], + query2vec: Union[str, BED2Vec], + ): + """ + + :param backend: the backend where vectors are stored + + :param query2vec: a BED2Vec model, or a hugging face model repository of geniml.region2vec.Region2VecExModel + """ + if isinstance(query2vec, str): + self.query2vec = BED2Vec(query2vec) + else: + self.query2vec = query2vec + + self.backend = backend + + def query_search( + self, + query: Union[str, RegionSet, np.ndarray], + limit: int, + with_payload: bool = True, + with_vectors: bool = True, + offset: int = 0, + ) -> List[Dict]: + """ + :param query: a region set, s path to a BED file in disk, or a region set embedding vector + + for rest of the parameters, check the docstring of QdrantBackend.search() or HNSWBackend.search() + """ + if isinstance(query, np.ndarray): + search_vec = query + else: + search_vec = self.query2vec.forward(query) + + return self.backend.search(search_vec, limit, with_payload, with_vectors, offset) diff --git a/geniml/search/interfaces/mlfree.py 
class BiVectorSearchInterface(BEDSearchInterface):
    """Search interface for ML free bi-vectors searching backend"""

    def __init__(self, backend: BiVectorBackend, query2vec: Union[str, Text2Vec]) -> None:
        """
        Initiate the search interface

        :param backend: the backend where vectors are stored
        :param query2vec: a Text2Vec, or a string used to build one via Text2Vec(query2vec, v2v=None)
        """
        # a string is turned into a Text2Vec without a v2v model
        self.query2vec = Text2Vec(query2vec, v2v=None) if isinstance(query2vec, str) else query2vec
        self.backend = backend

    def query_search(
        self,
        query: Union[str, np.ndarray],
        limit: int,
        with_payload: bool = True,
        with_vectors: bool = True,
        offset: int = 0,
        p: float = 1.0,
        q: float = 1.0,
        distance: bool = False,
        rank: bool = False,
    ) -> List[Dict]:
        """
        :param query: the natural language query string, or an already embedded query vector

        the remaining parameters are forwarded unchanged to the backend's search()
        """
        # an ndarray is used as the search vector directly; text is embedded first
        search_vec = query if isinstance(query, np.ndarray) else self.query2vec.forward(query)

        search_kwargs = {
            "query": search_vec,
            "limit": limit,
            "with_payload": with_payload,
            "with_vectors": with_vectors,
            "offset": offset,
            "p": p,
            "q": q,
            "distance": distance,
            "rank": rank,
        }
        return self.backend.search(**search_kwargs)
class Text2BEDSearchInterface(BEDSearchInterface):
    """Search interface whose query is a natural language string (or its embedding)."""

    def __init__(
        self,
        backend: Union[QdrantBackend, HNSWBackend],
        query2vec: Text2Vec,
    ):
        """
        Keep the backend and the text embedder.

        :param backend: the backend where vectors are stored
        :param query2vec: a Text2Vec, for details, see docstrings in geniml.search.query2vec.text2vec
        """
        self.backend = backend
        self.query2vec = query2vec

    def query_search(
        self,
        query: Union[str, np.ndarray],
        limit: int,
        with_payload: bool = True,
        with_vectors: bool = True,
        offset: int = 0,
    ) -> List[Dict]:
        """
        Embed the query when needed and run the search on the backend.

        :param query: the natural language query string, or a vector in the embedding space of region sets

        The remaining parameters are passed positionally to the backend's search();
        see the docstring of QdrantBackend.search() or HNSWBackend.search().
        """
        # an ndarray is treated as an already computed embedding
        if isinstance(query, np.ndarray):
            return self.backend.search(query, limit, with_payload, with_vectors, offset)

        embedded_query = self.query2vec.forward(query)
        return self.backend.search(embedded_query, limit, with_payload, with_vectors, offset)

    def eval(self, query_dict: Dict[str, List[Union[int, np.int64]]]) -> Dict[str, float]:
        """
        Evaluate retrieval over a set of queries with known relevant results.

        :param query_dict: maps each query string to the list of ids of its relevant results

        :return: a dictionary with the keys "Mean Average Precision", "Mean AUC-ROC" and
            "Average R-Precision"; all three are 0.0 when no query could be searched
        """
        # every stored item is ranked, so the number of neighbours equals the backend size
        total = len(self.backend)

        # HNSW: ef cannot be set lower than the number of queried nearest neighbors
        if isinstance(self.backend, HNSWBackend):
            self.backend.idx.set_ef(total)

        ap_total = 0
        auc_total = 0
        rp_total = 0
        evaluated = 0  # queries whose search succeeded

        for query_str, relevant_results in query_dict.items():
            try:
                search_results = self.query_search(
                    query=query_str, limit=total, with_vectors=False, with_payload=False
                )
                evaluated += 1
            except Exception as e:
                # a failing query is reported and left out of the averages
                _LOGGER.error(f"{query_str} caused {e}")
                continue

            ap, auc, rp = single_query_eval(search_results, relevant_results)
            ap_total += ap
            auc_total += auc
            rp_total += rp

        if evaluated <= 0:
            return {"Mean Average Precision": 0.0, "Mean AUC-ROC": 0.0, "Average R-Precision": 0.0}

        return {
            "Mean Average Precision": ap_total / evaluated,
            "Mean AUC-ROC": auc_total / evaluated,
            "Average R-Precision": rp_total / evaluated,
        }
class BED2Vec(Query2Vec):
    """Embed a query region set into a vector"""

    def __init__(self, model: Union[str, Region2VecExModel]):
        """
        Set the Region2VecExModel

        :param model: a Region2VecExModel, or a string passed to Region2VecExModel()
            (a model repository on Hugging Face)
        :raises TypeError: if model is neither of the two
        """
        if isinstance(model, str):
            self.model = Region2VecExModel(model)
        elif isinstance(model, Region2VecExModel):
            self.model = model
        else:
            # Fail at construction: only logging here would leave the instance
            # without a `model` attribute and defer the failure to forward().
            raise TypeError(
                "Please give a Region2VecExModel or a model repository on Hugging Face"
            )

    def forward(self, query: Union[str, RegionSet]) -> np.ndarray:
        """
        Embed the query region set

        :param query: a RegionSet, or the path to a BED file in disk

        :return: the region set embedding (mean of the model's region embeddings)
        """
        # if query is a BED file name, read it as a RegionSet class
        if isinstance(query, str):
            query = RegionSet(query)

        region_embeddings = self.model.encode(query)
        # BED embedding: averaging region embeddings
        return np.mean(region_embeddings, axis=0)
class Text2Vec(Query2Vec):
    """Map a query string into a vector in the embedding space of region sets"""

    def __init__(self, hf_repo: str, v2v: Union[str, Vec2VecFNN, None] = None):
        """
        :param hf_repo: model name given to fastembed's TextEmbedding (a model repository on Hugging Face)
        :param v2v: a Vec2VecFNN (see geniml/text2bednn/text2bednn.py), a string passed to
            Vec2VecFNN() (a model repository on Hugging Face), or None to use the text
            embedding as is
        """
        # Set model that embed natural language
        self.text_embedder = TextEmbedding(model_name=hf_repo)
        # Set model that maps natural language embeddings into the embedding space of region sets
        if isinstance(v2v, Vec2VecFNN):
            self.v2v = v2v
        elif isinstance(v2v, str):
            self.v2v = Vec2VecFNN(v2v)
        else:
            # no mapping model: for bivec search (ML free)
            self.v2v = None

    def forward(self, query: str) -> np.ndarray:
        """
        Embed the query natural language string

        :param query: a natural language string

        :return: the embedding vector of query; mapped through the vec2vec model when one is set
        """
        # embed query string (embed() yields one vector per input text)
        query_embedding = list(self.text_embedder.embed(query))[0]
        if self.v2v is None:
            return query_embedding

        # map the query string embedding into the embedding space of region sets;
        # Vec2VecFNN exposes this as embedding_to_embedding() (it has no encode())
        return self.v2v.embedding_to_embedding(query_embedding)
def anecdotal_search_from_hf_data(
    query: str, dataset_repo: str, search_model_repo: str, text_embed_model_repo: str, k: int = 10
) -> List[Dict[str, Union[float, int, Dict[str, str]]]]:
    """
    Test the retrieval performance of a trained search model on a dataset on huggingface

    :param query: user input search term
    :param dataset_repo: huggingface repository of the dataset
    :param search_model_repo: huggingface repository of the search model
    :param text_embed_model_repo: huggingface repository of the text encoder model
    :param k: number of returned result
    :return: the list of search results; each result's payload is extended with the
        metadata attributes found for its file
    """

    # download embedding vector backends and metadata from huggingface
    index_path = hf_hub_download(dataset_repo, HF_INDEX, repo_type="dataset")
    payloads_path = hf_hub_download(dataset_repo, HF_PAYLOADS, repo_type="dataset")
    metadata_path = hf_hub_download(dataset_repo, HF_METADATA, repo_type="dataset")

    # evaluation backend
    eval_backend = HNSWBackend(local_index_path=index_path, payloads=payloads_path)

    # load metadata
    with open(metadata_path, "r") as f:
        metadata_dict = json.load(f)

    text2vec = Text2Vec(text_embed_model_repo, search_model_repo)
    search_interface = Text2BEDSearchInterface(eval_backend, text2vec)

    search_results = search_interface.query_search(query, k, with_payload=True, with_vectors=False)

    # position of every returned file in the result list
    position_by_file = {hit["payload"]["file"]: pos for pos, hit in enumerate(search_results)}

    # metadata layout read here: {attribute: {value: [file, ...]}}
    for attribute, files_by_value in metadata_dict.items():
        for value, files in files_by_value.items():
            for file in files:
                try:
                    hit = search_results[position_by_file[file]]
                except (KeyError, TypeError):
                    # the file is not among the returned results (or cannot be a key)
                    continue
                hit["payload"][attribute] = value

    return search_results
def verify_load_inputs(
    vectors: np.ndarray,
    ids: Union[List[Union[str, int]], np.ndarray],
    payloads: List[Dict[str, str]],
):
    """
    Check that ids, vectors and payloads describe the same number of items

    :param vectors: array whose first dimension is the number of vectors
    :param ids: ids of the vectors
    :param payloads: payloads of the vectors
    :raises ValueError: if the three counts are not all equal
    """
    n_ids = len(ids)
    n_vectors = vectors.shape[0]
    n_payloads = len(payloads)
    if n_ids != n_vectors or n_ids != n_payloads:
        raise ValueError(
            f"The number of ids ({n_ids}), vectors ({n_vectors}), and payloads ({n_payloads}) must match"
        )


def single_query_eval(search_results: List, relevant_results: List) -> Tuple[float, float, float]:
    """
    Evaluate a single query

    The ROC curve treats `search_results` as a ranking of the whole collection:
    the number of irrelevant items is taken as len(search_results) - len(relevant_results).

    :param search_results: List of results (each with an "id" key), by the order of similarity in search
    :param relevant_results: List of store ids which are relevant search results

    :return: a Tuple of (Average Precision, AUC-ROC, R-precision); all 0.0 when either
        list is empty. AUC-ROC is 0.0 when there is no irrelevant item to rank against.
    """
    num_relevant = len(relevant_results)
    k = len(search_results)
    if num_relevant == 0 or k == 0:
        # nothing to retrieve or nothing retrieved: the metrics are undefined
        return 0.0, 0.0, 0.0

    try:
        # constant-time membership tests
        relevant_lookup = set(relevant_results)
    except TypeError:
        # unhashable ids: keep the list semantics
        relevant_lookup = relevant_results

    num_irrelevant = k - num_relevant
    true_positive = 0
    false_positive = 0
    hits_in_top_r = 0  # relevant results among the first num_relevant ranks
    sum_precision = 0.0
    x = [0.0]  # false positive rate
    y = [0.0]  # recall (true positive rate)

    for rank, result in enumerate(search_results, start=1):
        if result["id"] in relevant_lookup:  # one relevant is retrieved
            true_positive += 1
            sum_precision += true_positive / rank
            if rank <= num_relevant:
                hits_in_top_r += 1
        else:  # one irrelevant is retrieved
            false_positive += 1
        x.append(false_positive / num_irrelevant if num_irrelevant > 0 else 0.0)
        y.append(true_positive / num_relevant)

    # np.trapz was renamed to np.trapezoid in NumPy 2.0
    trapezoid = getattr(np, "trapezoid", None) or np.trapz
    auc = float(trapezoid(y, x))

    average_precision = sum_precision / num_relevant
    r_precision = hits_in_top_r / num_relevant
    return average_precision, auc, r_precision


def rand_eval(n: int, query_dict: Dict) -> Tuple[float, float, float]:
    """
    Evaluation results if the retrieval is completely random

    :param n: total number of results; ids 0..n-1 are ranked in random order
    :param query_dict: maps each query to the list of ids of its relevant results

    :return: a Tuple of (Average Precision, AUC-ROC, R-precision) averaged over the
        queries, or zeros when query_dict is empty
    """
    sum_ap = 0.0
    sum_auc = 0.0
    sum_rp = 0.0
    query_count = 0

    for relevant_results in query_dict.values():
        shuffled_ids = list(range(n))
        random.shuffle(shuffled_ids)
        search_results = [{"id": result_id} for result_id in shuffled_ids]

        ap, auc, rp = single_query_eval(search_results, relevant_results)
        sum_ap += ap
        sum_auc += auc
        sum_rp += rp
        query_count += 1

    if query_count == 0:
        return 0.0, 0.0, 0.0

    return sum_ap / query_count, sum_auc / query_count, sum_rp / query_count
class Vec2Vec(Sequential):
    """Feedforward network: Linear + ReLU per hidden layer, then a final Linear layer."""

    def __init__(
        self,
        input_dim: int,
        output_dim: int,
        num_units: Union[int, List[int]],
    ):
        """
        :param input_dim: number of input features
        :param output_dim: number of output features
        :param num_units: width of the hidden layer, or a list with one width per hidden layer
        """
        widths = num_units if isinstance(num_units, list) else [num_units]

        # consecutive pairs of this list are the (in, out) sizes of the hidden layers;
        # indexing widths[0] keeps at least one hidden layer mandatory
        sizes = [input_dim, widths[0], *widths[1:]]

        stack = []
        for fan_in, fan_out in zip(sizes, sizes[1:]):
            stack.append(Linear(in_features=fan_in, out_features=fan_out))
            stack.append(ReLU())

        # output layer
        stack.append(Linear(in_features=sizes[-1], out_features=output_dim))

        super().__init__(*stack)
+ :param config_file_name: name of the config file + """ + model_file_path = hf_hub_download(model_path, model_file_name, **kwargs) + config_path = hf_hub_download(model_path, config_file_name, **kwargs) + + self.load_from_disk(model_file_path, config_path) + + def load_from_disk(self, model_path: str, config_path: str): + """ + Load model from local files + + :param model_path: path of saved model file (usually in format of .pt) + :param config_path: path of saved config file (in format of yaml) + """ + # get the model config (layer structure) + with open(config_path, "r") as f: + config = safe_load(f) + + self.config = config + # reinitiate the self.model + self.model = Vec2Vec( + config["input_dim"], + config["output_dim"], + config["num_units"], + ) + + # load the Sequential model from saved files + self.model.load_state_dict(torch.load(model_path)) + + def export( + self, + path: str, + checkpoint_file: str, + config_file: str = CONFIG_FILE_NAME, + must_trained: bool = DEFAULT_MUST_TRAINED, + ): + """ + Save model weights and config + + :param path: path to export the model to + :param checkpoint_file: name of model checkpoint file + :param config_file: name of model config file + :param must_trained: whether the model needs training to be exported + """ + # whether the model must be finished training to export + if must_trained and not self.trained: + raise RuntimeError("Cannot export an untrained model.") + + if not os.path.exists(path): + os.makedirs(path) + + # export the model weights + torch.save(self.model.state_dict(), os.path.join(path, checkpoint_file)) + + # export the model config + with open(os.path.join(path, config_file), "w") as f: + safe_dump(self.config, f) + + def embedding_to_embedding(self, input_vecs: np.ndarray) -> np.ndarray: + """ + Predict the region set embedding from embedding of natural language strings + + :param input_vecs: input embedding vectors + :return: the output of the neural network model + """ + # pytorch tensor's 
default dtype is float 32 + return self.model(torch.from_numpy(dtype_check(input_vecs))).detach().numpy() + + def compile( + self, + optimizer: str, + loss: str, + learning_rate: float, + margin: Union[float, None] = DEFAULT_MARGIN, + ): + """ + Configure the model for training. This includes setting the optimizer and loss function. + + :param optimizer: the name of optimizer + :param loss: the name of loss function + :param learning_rate: the learning rate of model backpropagation + :param margin: should be a number from −1−1 to 1, 0 to 0.5 is suggested, only for CosineEmbeddingLoss + """ + + # set optimizer + if optimizer == "Adam": + self.optimizer = torch.optim.Adam(self.model.parameters(), learning_rate) + + elif optimizer == "SGD": + self.optimizer = torch.optim.SGD(self.model.parameters(), learning_rate) + + else: + raise ValueError("Please give a valid name of optimizer") + + # set loss function + if loss == "cosine_embedding_loss": + self.loss_fn = CosineEmbeddingLoss(margin=margin) + elif loss == "cosine_similarity": + self.loss_fn = CosineSimilarity() + elif loss == "mean_squared_error": + self.loss_fn = MSELoss() + else: + raise ValueError("Please give a valid name of loss function") + + # add information to model config + self.config["optimizer"] = optimizer + self.config["loss"] = loss + + def train( + self, + training_X: np.ndarray, + training_Y: np.ndarray, + validating_data: Union[Tuple[np.ndarray, np.ndarray], None] = None, + save_best: bool = False, + folder_path: Union[str, None] = None, + best_model_file_name: Union[str, None] = None, + early_stop: bool = False, + patience: float = DEFAULT_PATIENCE, + opt_name: str = DEFAULT_OPTIMIZER_NAME, + loss_func: str = DEFAULT_LOSS_NAME, + num_epochs: int = DEFAULT_NUM_EPOCHS, + batch_size: int = DEFAULT_BATCH_SIZE, + learning_rate: float = DEFAULT_LEARNING_RATE, + training_target: Union[np.ndarray, None] = None, + validating_target: Union[np.ndarray, None] = None, + **kwargs, + ): + """ + Based on 
https://pytorch.org/tutorials/beginner/introyt/trainingyt.html + Fit the feedforward neural network + + :param training_X: embedding vectors of metadata, np.ndarray with shape of (n, ) + :param training_Y: embedding vectors of region set, np.ndarray with shape of (n, ) + :param validating_data: validating data, which contains validating X and validating Y + :param save_best: whether the best performance model is saved after each epoch (based on validation loss) + :param folder_path: the path to the folder to save the model and config + :param best_model_file_name: the name of the file of saved best model + :param early_stop: whether the training should be stopped early to prevent overfitting + :param patience: the percentage of epoches to stop training if no validation loss improvement + :param opt_name: name of optimizer + :param loss_func: name of loss function + :param num_epochs: number of training epoches + :param batch_size: size of batch for training + :param learning_rate: learning rate of optimizer + :param training_target: + :param validating_target: + :param kwargs: see units and layers in reinit_model() + """ + # if current model is empty, add layers + if self.model is None: + # dimensions of input and output + input_dim = training_X.shape[1] + output_dim = training_Y.shape[1] + + self.config["input_dim"] = input_dim + self.config["output_dim"] = output_dim + self.config["num_units"] = kwargs.get("num_units") or DEFAULT_NUM_UNITS + self.model = Vec2Vec( + input_dim=input_dim, + output_dim=output_dim, + num_units=self.config["num_units"], + ) + + if training_target is None: + training_target = np.repeat(1, training_X.shape[0]) + # raise the error if validating data is needed but not provided + if validating_data is not None: + validating_X, validating_Y = validating_data + if validating_target is None: + validating_target = np.repeat(1, validating_X.shape[0]) + validating_data = arrays_to_torch_dataloader( + validating_X, + validating_Y, + 
validating_target, + batch_size=batch_size, + shuffle=False, + ) + + self.most_recent_train["val_loss"] = [] + elif save_best or early_stop: + raise ValueError("Validating data is not provided") + if save_best and folder_path is None: + raise ValueError( + "ValueError: Path to folder where the best performance model will be saved is required" + ) + + # compile the model + self.compile(optimizer=opt_name, loss=loss_func, learning_rate=learning_rate) + + # convert training data from np.ndarray to DataLoader + training_data = arrays_to_torch_dataloader( + training_X, training_Y, training_target, batch_size + ) + + best_val_loss = 1_000_000.0 + patience_count = 0 + self.most_recent_train["loss"] = [] + + for epoch in range(num_epochs): + # gradient tracking is on + self.model.train(True) + avg_loss = self.train_one_epoch(training_data) + self.most_recent_train["loss"].append(avg_loss) + # set model to evaluation mode, disabling dropout and using population + # statistics for batch normalization. 
+ self.model.eval() + + if validating_data is not None: + running_val_loss = 0.0 + # disable gradient computation + with torch.no_grad(): + for i, (val_x, val_y, val_target) in enumerate(validating_data): + val_output = self.model(val_x) + val_loss = self.calc_loss(val_output, val_y, val_target) + running_val_loss += val_loss + + avg_val_loss = running_val_loss / (i + 1) + self.most_recent_train["val_loss"].append(avg_val_loss) + # logging training and validating loss + _LOGGER.info(f"EPOCH {epoch + 1}: loss: -{avg_loss} - val_loss: -{avg_val_loss}") + + # save the best-performing model + if avg_val_loss < best_val_loss: + # reset the patience count + patience_count = 0 + best_val_loss = avg_val_loss + if save_best: + self.export( + folder_path, + best_model_file_name + or TORCH_MODEL_FILE_NAME_PATTERN.format( + callback="best", checkpoint=str(epoch) + ), + must_trained=False, + ) + + # early stop to prevent overfitting + if early_stop: + if avg_val_loss > avg_loss: + patience_count += 1 + if patience_count > int(math.ceil(patience * num_epochs)): + break + + else: + _LOGGER.info(f"EPOCH {epoch + 1}: loss: -{avg_loss}") + self.trained = True + + def train_one_epoch(self, training_data) -> torch.Tensor: + """ + Based on https://pytorch.org/tutorials/beginner/introyt/trainingyt.html + + One epoch's training loop + + :return: the average training loss of one epoch + """ + epoch_loss = 0.0 + + # train on each batch + for i, (x, y, target) in enumerate(training_data): + # zero gradients for every batch + self.optimizer.zero_grad() + + # batch prediction + outputs = self.model(x) + + # compute loss and gradients + batch_loss = self.calc_loss(outputs, y, target) + batch_loss.backward() + + # adjust learning weights + self.optimizer.step() + + # gather loss and report + epoch_loss += batch_loss.item() + return epoch_loss / (i + 1) + + def calc_loss( + self, outputs: torch.Tensor, y: torch.Tensor, target: torch.Tensor + ) -> torch.Tensor: + """ + Calculating loss when 
different loss function is given + + :param outputs: the output of model + :param y: the correct label + :param target: + :return: the loss + + """ + + if not self.config["loss"]: + raise ValueError("Please compile the model first") + + # when all targets are 1 + # loss = 1 - cos(output, y) + # https://pytorch.org/docs/stable/generated/torch.nn.CosineEmbeddingLoss.html + elif self.config["loss"] == "cosine_embedding_loss": + return self.loss_fn(outputs, y, target) + else: + return self.loss_fn(outputs, y) + + def plot_training_hist( + self, + save_path: Union[str, None] = None, + plot_file_name: Union[str, None] = DEFAULT_PLOT_FILE_NAME, + title: Union[str, None] = DEFAULT_PLOT_TITLE, + ) -> None: + """ + Plot the training & validating loss of the most recent training + + :param save_path: the path of folder where image will be saved + :param plot_file_name: the file name of the png file + :param title: the title in the image + :return: None + """ + + epoch_range = range(1, len(self.most_recent_train["loss"]) + 1) + train_loss = self.most_recent_train["loss"] + plt.figure() + plt.plot(epoch_range, train_loss, "r", label="Training loss") + try: + valid_loss = self.most_recent_train["val_loss"] + plt.plot(epoch_range, valid_loss, "b", label="Validation loss") + except: + pass + plt.title(title) + plt.legend() + if save_path: + plt.savefig(os.path.join(save_path, plot_file_name)) + else: + plt.show() + plt.close() + + def __repr__(self): + return f"Vec2Vec(input_dimension={self.config['input_dim']}, output_dimension={self.config['output_dim']}, trained={self.trained})" diff --git a/geniml/text2bednn/utils.py b/geniml/text2bednn/utils.py new file mode 100644 index 00000000..88e91beb --- /dev/null +++ b/geniml/text2bednn/utils.py @@ -0,0 +1,177 @@ +import logging +from typing import Dict, List, Set, Union + +import numpy as np +import pandas as pd + +try: + import torch + from torch.utils.data import DataLoader, TensorDataset +except ImportError: + raise ImportError( + 
def dtype_check(vecs: np.ndarray) -> np.ndarray:
    """
    Since the default float in np is float64, but in pytorch tensor it's float32,
    to avoid errors, the dtype will be switched

    :param vecs: input np.ndarray

    :return: np.ndarray with dtype of float32; the input itself when it already is float32
    """
    # Compare the dtype directly: this does not depend on NumPy's per-dtype classes
    # and also converts float32 data stored in a non-native byte order.
    if vecs.dtype != np.float32:
        vecs = vecs.astype(np.float32)

    return vecs
DEFAULT_SERIES_KEY, + chunk_size: Union[int, None] = None, +) -> Dict[str, Union[str, Dict[str, Union[str, List[str]]]]]: + """ + Read selected columns from a metadata csv and return metadata dictionary, + can filter genomes with given list of genomes and the column name of genome + + :param csv_path: path to the csv file that contain metadata + :param col_names: set of csv columns that contain informative metadata + :param file_key: name of column of file names + :param genomes: set of genomes + :param genomes_key: name of column of sample genomes + :param series_key: name of column of series + :param chunk_size: size of chunk to read when the csv file is large + + :return: a metadata dictionary in this format: + if series information is in the csv, the dictionary format will be: + { + :[ + { + "name": + "metadata": { + : , + ... + } + }, + ... + ], + ... + } + + else, the dictionary format will be: + { + : { + : , + ... + }, + ... + } + """ + + # dictionary to store data + output_dict = dict() + # count number of series, files, and csv chunks + series_count = 0 + bed_count = 0 + text_count = 0 + empty_count = 0 + read_chunk = True + # read csv + for chunk in pd.read_csv(csv_path, chunksize=chunk_size): + # if chunk size is None + if isinstance(chunk, str): + read_chunk = False + rows_to_ite = pd.read_csv(csv_path) + else: + rows_to_ite = chunk + + for index, row in rows_to_ite.iterrows(): + genome_filter = True + # select genome if list of genomes and genome key are given + if genomes is not None and genomes_key is not None: + if row[genomes_key].strip() not in genomes: + genome_filter = False + + if genome_filter: + # collect metadata + metadata_dict = dict() + + for col in col_names: + if isinstance(row[col], str): # + text_count += 1 + metadata_dict[col] = row[col] + + if len(metadata_dict) == 0: + empty_count += 1 + # add the metadata into output dictionary + else: + if series_key is None or not series_key in rows_to_ite.columns: + output_dict[row[file_key]] 
= metadata_dict + + else: + payload = { + "name": row[file_key], + "metadata": metadata_dict, + } + try: + output_dict[row[series_key]].append(payload) + except: + output_dict[row[series_key]] = [payload] + series_count += 1 + bed_count += 1 + if not read_chunk: + break + + # output of summary statistics + if series_key is not None: + _LOGGER.info(f"Number of series: {series_count}") + + _LOGGER.info(f"Number of files: {bed_count}") + _LOGGER.info(f"Number of metadata strings: {text_count}") + _LOGGER.info(f"Number of files with 0 metadata strings: {empty_count}") + + return output_dict diff --git a/geniml/tokenization/__init__.py b/geniml/tokenization/__init__.py new file mode 100644 index 00000000..498a6d70 --- /dev/null +++ b/geniml/tokenization/__init__.py @@ -0,0 +1,2 @@ +# from .main import Tokenizer, AnnDataTokenizer, TreeTokenizer +# from .main import hard_tokenization_main as hard_tokenization diff --git a/geniml/tokenization/bedtools_tokenizer.py b/geniml/tokenization/bedtools_tokenizer.py new file mode 100644 index 00000000..e8cc0c90 --- /dev/null +++ b/geniml/tokenization/bedtools_tokenizer.py @@ -0,0 +1,56 @@ +from . import FileTokenizer + + +class BEDToolsTokenizer(FileTokenizer): + """A tokenizer that uses bedtools to tokenize BED files""" + + def __init__(self, bedtools_path: str, universe_path: str = None): + """Initialize a BEDToolsTokenizer + + Args: + bedtools_path (str): Path to a bedtools binary. + universe_path (str): Path to a universe BED file. 
+ """ + self.bedtools_path = bedtools_path + self.universe_path = universe_path + + def tokenize(self, input_globs: list[str], universe_path: str = None) -> RegionSet: + """Tokenize a RegionSet using bedtools""" + + universe_path = universe_path or self.universe_path + + # loop through globs and tokenize each file + for glob in input_globs: + for path in glob.glob(glob): + _tokenize_one(path, universe_path) + + def _tokenize_one(self, input_path: str, universe_path: str): + output_path = os.path.join(input_path, "tokenized.bed") + bedtools_path = self.bedtools_path + # bedtools can't actually read from stdin, so we have to use a temporary file... + + # sort_process = subprocess.Popen(shlex.split(f"sort -k1,1V -k2,2n {input_path}"), stdout=subprocess.PIPE) + # bedtools_process = subprocess.Popen( + # shlex.split(f"{bedtools_path} intersect -a {universe} -b -u -f {fraction}"), + # stdin = sort_process.stdout, + # stdout = output_file, + # ) + # bedtools_process.communicate() + + # get a temporary file path using tempfile + import templfile + + with tempfile.NamedTemporaryFile() as temp_path, open(output_path, "w") as output_file: + # sort the input file + sort_process = subprocess.Popen( + shlex.split(f"sort -k1,1V -k2,2n {input_path}"), stdout=temp_path + ) + sort_process.communicate() + # tokenize the sorted file + bedtools_process = subprocess.Popen( + shlex.split( + f"{bedtools_path} intersect -a {universe} -b {temp_path} -u -f {fraction}" + ), + stdout=output_file, + ) + bedtools_process.communicate() diff --git a/geniml/tokenization/cli.py b/geniml/tokenization/cli.py new file mode 100644 index 00000000..aa729587 --- /dev/null +++ b/geniml/tokenization/cli.py @@ -0,0 +1,25 @@ +def build_subparser(parser): + """Builds an argument parser to support the tokenize command line interface.""" + parser.add_argument( + "--data-folder", + type=str, + help="Path to the folder that stores BED files", + ) + parser.add_argument("--token-folder", type=str, help="Folder that 
stores tokenized files") + # parameters for hard tokenization + parser.add_argument("--universe", type=str, help="Path to a universe file") + parser.add_argument("--nworkers", type=int, default=10, help="number of workers") + parser.add_argument( + "--bedtools-path", + type=str, + default="bedtools", + help="Path to the bedtools binary. Default: bedtools. If bedtools does not exists, an exception will be raised", + ) + parser.add_argument( + "--fraction", + type=float, + default=1.0e-9, + help="A parameter for bedtools.intersect", + ) + + return parser diff --git a/geniml/tokenization/hard_tokenization_batch.py b/geniml/tokenization/hard_tokenization_batch.py new file mode 100644 index 00000000..56636285 --- /dev/null +++ b/geniml/tokenization/hard_tokenization_batch.py @@ -0,0 +1,162 @@ +import argparse +import os +import shlex +import subprocess + +from .utils import Timer, time_str + + +def bedtools_tokenization( + f: str, + bedtools_path: str, + data_folder: str, + target_folder: str, + universe: str, + fraction: float, +) -> None: + """Uses bedtools to tokenize a raw BED file. + + Args: + f (str): File name. + bedtools_path (str): Path to a bedtools binary. + data_folder (str): The folder where raw BED files reside. + target_folder (str): The folder that stores tokenized BED files. + universe (str): Path to a universe file. + fraction (float): A parameter for bedtools.intersect. 
+ """ + fname = os.path.join(data_folder, f) + temp = os.path.join(target_folder, f + "_sorted") + target = os.path.join(target_folder, f) + with open(temp, "w") as f_temp: + subprocess.run(shlex.split(f"sort -k1,1V -k2,2n {fname}"), stdout=f_temp) + with open(target, "w") as f_target: + subprocess.run( + shlex.split(f"{bedtools_path} intersect -a {universe} -b {temp} -u -f {fraction}"), + stdout=f_target, + ) + os.remove(temp) + + +def generate_tokens( + raw_data_folder: str, + token_folder: str, + universe: str, + file_list: str, + bedtools: str, + fraction: float, +) -> None: + """Tokenizes raw BED files specified by file_list. + + Tokenizes raw BED files specified by file_list. First, checks existing files + in token_folder. If token_folder has all the tokenized BED files, then does + nothing. Otherwise, tokenizes raw BED files that are in file_list but not + in token_folder. + + Args: + raw_data_folder (str): The folder where raw BED files reside. + token_folder (str): The folder to store tokenized BED files. + universe (str): The path to a universe file. + file_list (str): The path to a file which contains selected BED files per row. + bedtools (str): The path to a bedtools binary. + fraction (float): A parameter for bedtools.intersect. + """ + usize = 0 + with open(universe, "r") as f: + for _ in f: + usize += 1 + print(f"\033[93mUniverse size is {usize}\033[00m") + + all_set = [] + with open(file_list, "r") as fin: + for fname in fin: + name = fname.strip() + all_set.append(name) + all_set = set(all_set) + + if os.path.exists(token_folder): + files = os.listdir(token_folder) + existing_set = set([f.strip() for f in files]) + not_covered = all_set - existing_set + number = len(not_covered) + if number == 0: + print(f"Use the existing folder {token_folder}", flush=True) + return + else: + print( + f"Folder {token_folder} exists with {number} files not processed. 
Continue...", + flush=True, + ) + else: + os.makedirs(token_folder) + not_covered = all_set + for f in not_covered: + bedtools_tokenization(f, bedtools, raw_data_folder, token_folder, universe, fraction) + + +def main(args: argparse.Namespace): + """Generates tokenized BED files. + + Calls generate_tokens using the arguments in args. Prints status + information. + + Args: + args (argparse.Namespace): See the definition of the ArgumentParser. + """ + local_timer = utils.Timer() + print(f"Entering hard tokenization. Results stored in {args.token_folder}") + generate_tokens( + args.data_folder, + args.token_folder, + args.universe, + args.file_list, + args.bedtools_path, + args.fraction, + ) + tokenization_time = local_timer.t() + print(f"Hard tokenization takes {utils.time_str(tokenization_time)}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--data-folder", + type=str, + default="/scratch/gz5hp/encode3/cell/datasets", + help="path to the folder that stores BED files", + ) + parser.add_argument( + "--file-list", + type=str, + default="/home/gz5hp/encode3_proj/all_file_list.txt", + help="list of BED files that will be tokenized", + ) + + parser.add_argument( + "--token-folder", + type=str, + default="/scratch/gz5hp/encode3/cell/tokens", + help="folder that stores tokenized files", + ) + # parameters for hard tokenization + parser.add_argument( + "--universe", + type=str, + default="/home/gz5hp/encode3_proj/GRCh38-universe.bed", + help="path to a universe file", + ) + parser.add_argument( + "--bedtools-path", + type=str, + default="/scratch/gz5hp/genomes/bedtools", + help="path to the bedtools binary", + ) + parser.add_argument( + "--fraction", + type=float, + default=1.0e-9, + help="a parameter for bedtools.intersect", + ) + + args = parser.parse_args() + if os.path.exists(args.file_list): + main(args) diff --git a/geniml/tokenization/main.py b/geniml/tokenization/main.py new file mode 100644 index 00000000..e5f85b64 
--- /dev/null +++ b/geniml/tokenization/main.py @@ -0,0 +1,146 @@ +import multiprocessing +import os +import shutil +import subprocess +from typing import List + +import numpy as np +from geniml.tokenization.split_file import split_file + +from .hard_tokenization_batch import main as hard_tokenization +from .utils import Timer, time_str + + +def hard_tokenization_main( + src_folder: str, + dst_folder: str, + universe_file: str, + fraction: float = 1e-9, + file_list: List[str] = None, + num_workers: int = 10, + bedtools_path: str = "bedtools", +) -> int: + """Tokenizes raw BED files in parallel. + + This is the main function for hard tokenization. It uses multiple processes to + speed up the tokenization process. + + Args: + src_folder (str): The folder where raw BED files reside. + dst_folder (str): The folder to store tokenized BED files. + universe_file (str): The path to a universe file. + fraction (float, optional): A parameter for bedtools.intersect. + Defaults to 1e-9. + file_list (list[str], optional): A list of files (just names not full + paths) that need to be tokenized. Defaults to None and uses all BED + files in src_folder. + num_workers (int, optional): Number of processes used. Defaults to 10. + bedtools_path (str, optional): The path to a bedtools binary. Defaults + to "bedtools". + + Raises: + Exception: No bedtools executable found + + Returns: + int: 0 when the dst_folder folder has incomplete list of tokenized BED + files which should be removed first; 1 when the dst_folder folder + has the complete tokenized BED files or the tokenization process + succeeds. 
+ """ + timer = Timer() + start_time = timer.t() + + file_list_path = os.path.join(dst_folder, "file_list.txt") + files = os.listdir(src_folder) + file_count = len(files) + if file_count == 0: + print(f"No files in {src_folder}") + return 0 + + os.makedirs(dst_folder, exist_ok=True) + if file_list is None: # use all bed files in data_folder + # generate a file list + file_list = files + print(f"Use all ({file_count}) bed files in {src_folder}") + else: + file_number = len(file_list) + print(f"{file_count} bed files in total, use {file_number} of them") + + # check whether all files in file_list are tokenized + number = -1 + if os.path.exists(dst_folder): + all_set = set([f.strip() for f in file_list]) + existing_set = set(os.listdir(dst_folder)) + not_covered = all_set - existing_set + number = len(not_covered) + if number == 0 and len(existing_set) == len(all_set): + print("Skip tokenization. Using the existing tokenization files") + return 1 + elif len(existing_set) > 0: + print( + f"Folder {dst_folder} exists with incomplete tokenized files. 
Please empty/delete the folder first" + ) + return 0 + + with open(file_list_path, "w") as f: + for file in file_list: + f.write(file) + f.write("\n") + + if bedtools_path == "bedtools": + try: + rval = subprocess.call([bedtools_path, "--version"]) + except Exception: + raise Exception("No bedtools executable found") + if rval != 0: + raise Exception("No bedtools executable found") + + print(f"Tokenizing {len(file_list)} bed files ...") + + file_count = len(file_list) + # split the file_list into several subsets for each worker to process in parallel + nworkers = min(int(np.ceil(file_count / 20)), num_workers) + if nworkers <= 1: + tokenization_args = Namespace( + data_folder=src_folder, + file_list=file_list_path, + token_folder=dst_folder, + universe=universe_file, + bedtools_path=bedtools_path, + fraction=fraction, + ) + hard_tokenization(tokenization_args) + + else: # multiprocessing + dest_folder = os.path.join(dst_folder, "splits") + split_file(file_list_path, dest_folder, nworkers) + args_arr = [] + for n in range(nworkers): + temp_token_folder = os.path.join(dst_folder, f"batch_{n}") + tokenization_args = Namespace( + data_folder=src_folder, + file_list=os.path.join(dest_folder, f"split_{n}.txt"), + token_folder=temp_token_folder, + universe=universe_file, + bedtools_path=bedtools_path, + fraction=fraction, + ) + args_arr.append(tokenization_args) + with multiprocessing.Pool(nworkers) as pool: + processes = [pool.apply_async(hard_tokenization, args=(param,)) for param in args_arr] + _ = [r.get() for r in processes] + # move tokenized files in different folders to expr_tokens + shutil.rmtree(dest_folder) + for param in args_arr: + allfiles = os.listdir(param.token_folder) + for f in allfiles: + shutil.move( + os.path.join(param.token_folder, f), + os.path.join(dst_folder, f), + ) + shutil.rmtree(param.token_folder) + os.remove(file_list_path) + print(f"Tokenization complete {len(os.listdir(dst_folder))}/{file_count} bed files") + elapsed_time = timer.t() - 
start_time + print(f"[Tokenization] {time_str(elapsed_time)}/{time_str(timer.t())}") + return 1 diff --git a/gitk/tokenization/split_file.py b/geniml/tokenization/split_file.py similarity index 52% rename from gitk/tokenization/split_file.py rename to geniml/tokenization/split_file.py index 66661c33..4d964cae 100644 --- a/gitk/tokenization/split_file.py +++ b/geniml/tokenization/split_file.py @@ -1,10 +1,15 @@ -import os import argparse +import os -def get_file_rows(file_path): - """ - Count how many files are included +def get_file_rows(file_path: str) -> int: + """Counts how many files are included. + + Args: + file_path (str): The path to a file. + + Returns: + int: The number of rows in the file. """ count = 0 with open(file_path, "r") as f: @@ -13,13 +18,16 @@ def get_file_rows(file_path): return count -def split_file(file_path, dest_folder, num_parts): - """ - Split a list of files into a specified non-overlapping batches +def split_file(file_path: str, dest_folder: str, num_parts: int) -> None: + """Splits a list of files into num_parts non-overlapping batches. + + This is a helper function for tokenization in parallel. The dest_folder + must be empty. - file_path: path to a list of files - dest_folder: folder to store file splits - num_parts: number of parts needed + Args: + file_path (str): The path to a file with BED file names per row. + dest_folder (str): The folder to store split file lists. + num_parts (int): Number of batches to split. 
""" if os.path.exists(dest_folder): print("Folder exists") @@ -41,7 +49,7 @@ def split_file(file_path, dest_folder, num_parts): num_arr = [num_per_file] * (num_parts - 1) + [last_file_num] pos = 0 for index in range(num_parts): - fname = os.path.join(dest_folder, "split_{}.txt".format(index)) + fname = os.path.join(dest_folder, f"split_{index}.txt") with open(fname, "w") as fout: for _ in range(num_arr[index]): fout.write(list_arr[pos]) @@ -52,9 +60,14 @@ def split_file(file_path, dest_folder, num_parts): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--file_path", default="") - parser.add_argument("--dest_folder", default="") - parser.add_argument("--num_parts", type=int, default=5) + parser.add_argument("--file-path", default="", help="path to a file list") + parser.add_argument("--dest-folder", default="", help="where to store the split files") + parser.add_argument( + "--num-parts", + type=int, + default=5, + help="split the original file list to the specified parts", + ) args = parser.parse_args() split_file(args.file_path, args.dest_folder, args.num_parts) diff --git a/geniml/tokenization/utils.py b/geniml/tokenization/utils.py new file mode 100644 index 00000000..042483fa --- /dev/null +++ b/geniml/tokenization/utils.py @@ -0,0 +1,92 @@ +import time + +import numpy as np +import scanpy as sc + +from tqdm import tqdm +from gtars.tokenizers import Tokenizer +from gtars.models import Region + + +class Timer: + """Records the running time. + + Uses Timer.s() or Timer() to record the start time. Then, calls Timer.t() to get the + elapsed time in seconds. + """ + + def __init__(self): + """Initializes a Timer object and starts the timer.""" + self.v = time.time() + + def s(self): + """Restarts the timer.""" + self.v = time.time() + + def t(self): + """Gives the elapsed time. + + Returns: + float: The elapsed time in seconds. 
+ """ + return time.time() - self.v + + +def time_str(t: float) -> str: + """Converts time in float to a readable format. + + Converts time in float to hours, minutes, or seconds based on the value of + t. + + Args: + t (float): Time in seconds. + + Returns: + str: Time in readable time. + """ + if t >= 3600: + return f"{t / 3600:.2f}h" + if t >= 60: + return f"{t / 60:.2f}m" + return f"{t:.2f}s" + + +def tokenize_anndata(adata: sc.AnnData, tokenizer: Tokenizer): + """ + Tokenize an AnnData object. This is more involved, so it gets its own function. + Args: + adata (sc.AnnData): The AnnData object to tokenize. + tokenizer (Tokenizer): The tokenizer to use. + """ + # extract regions from AnnData + # its weird because of how numpy handle Intervals, the parent class of Region, + # see here: + # https://stackoverflow.com/a/43722306/13175187 + adata_features = [ + Region(chr, int(start), int(end)) + for chr, start, end in tqdm( + zip(adata.var["chr"], adata.var["start"], adata.var["end"]), + total=adata.var.shape[0], + desc="Extracting regions from AnnData", + ) + ] + + features = np.ndarray(len(adata_features), dtype=object) + for i, region in enumerate(adata_features): + features[i] = region + + del adata_features + + # tokenize + tokenized = [] + x = adata.X + for row in tqdm( + range(adata.shape[0]), + total=adata.shape[0], + desc="Tokenizing", + ): + _, non_zeros = x[row].nonzero() + regions = features[non_zeros] + tokenized.append(tokenizer(regions)) + + return tokenized diff --git a/gitk/likelihood/__init__.py b/geniml/universe/__init__.py similarity index 100% rename from gitk/likelihood/__init__.py rename to geniml/universe/__init__.py diff --git a/gitk/likelihood/universe_hard.py b/geniml/universe/cc_universe.py similarity index 66% rename from gitk/likelihood/universe_hard.py rename to geniml/universe/cc_universe.py index 91812745..31cd62b4 100644 --- a/gitk/likelihood/universe_hard.py +++ b/geniml/universe/cc_universe.py @@ -1,22 +1,23 @@ #!/usr/bin/env 
python3 # -*- coding: utf-8 -*- -import numpy as np import os -from time import time from functools import cmp_to_key + +import numpy as np import pyBigWig -from ..utils import natural_chr_sort, timer_func + +from geniml.utils import natural_chr_sort, timer_func -def get_uni(file, chrom, cut_off=None): +def get_uni(file, chrom, cutoff=None): """For each position check if coverage is bigger than cut-off; if cut-off not provided calculate value that gives maximum likelihood universe :param str file: coverage file :param str chrom: chromosome to analyse - :param int cut_off: base pairs with values grater of equal to cut-off can be included in universe - :return:""" + :param int cutoff: base pairs with values grater of equal to cut-off can be included in universe + :return ndarray: vector with universes states; 0 - background, 1- universe""" file = pyBigWig.open(file) if pyBigWig.numpy: track = file.values(chrom, 0, file.chroms(chrom), numpy=True) @@ -25,23 +26,23 @@ def get_uni(file, chrom, cut_off=None): track = np.array(track) track[np.isnan(track)] = 0 track = track.astype(np.uint16) - if cut_off is None: - cut_off = np.sum(track) / len(track) - inter_pos = track >= cut_off + if cutoff is None: + cutoff = np.sum(track) / len(track) + inter_pos = track >= cutoff file.close() return inter_pos -def save_simple(fout, inter_pos, chrom): +def save_simple(file_out, inter_pos, chrom): """ Save cut-off universe to a file without any processing - :param str fout: output file + :param str file_out: output file :param bool vector inter_pos: whether each position should be included in universe :param str chrom: chromosome to analyse """ inter_pos_uni = np.argwhere(inter_pos) start = inter_pos_uni[0][0] - with open(fout, "a") as f: + with open(file_out, "a") as f: for i in range(1, len(inter_pos_uni)): if inter_pos_uni[i] - inter_pos_uni[i - 1] != 1: end = inter_pos_uni[i - 1][0] + 1 @@ -51,10 +52,10 @@ def save_simple(fout, inter_pos, chrom): 
f.write(f"{chrom}\t{start}\t{end}\n") -def marge_filter(fout, inter_pos, chrom, merge_dist=100, size_flt=1000): +def marge_filter(file_out, inter_pos, chrom, merge_dist=100, size_flt=1000): """ Save cut-off universe to a file with filtering region size and merging close regions - :param fout: output file + :param file_out: output file :param bool vector inter_pos: whether each position should be included in universe :param str chrom: chromosome to analyse :param int merge_dist: regions closer than merge_dist will be merged into one @@ -62,7 +63,7 @@ def marge_filter(fout, inter_pos, chrom, merge_dist=100, size_flt=1000): """ inter_pos_uni = np.argwhere(inter_pos) start = inter_pos_uni[0][0] - with open(fout, "a") as f: + with open(file_out, "a") as f: for i in range(1, len(inter_pos_uni)): if inter_pos_uni[i] - inter_pos_uni[i - 1] >= merge_dist: end = inter_pos_uni[i - 1][0] + 1 @@ -74,17 +75,19 @@ def marge_filter(fout, inter_pos, chrom, merge_dist=100, size_flt=1000): f.write(f"{chrom}\t{start}\t{end}\n") -def main(file, fout, merge=0, filter_size=0, cut_off=None): +def cc_universe(cove, file_out, cove_prefix="all", merge=0, filter_size=0, cutoff=None): """ - Creat cut-off universe based on coverage track - :param str file: path to coverage file without extension + Create cut-off coverage universe based on coverage track + :param str cove: path to coverage folder + :param str cove_prefix: prefix of the coverage file :param int merge: regions closer than this value will be merged into one :param int filter_size: regions smaller than this value will not be reported - :param str fout: output file - :param int cut_off: base pairs with coverage equal to or greater than this value will be included in the universe + :param str file_out: output file + :param int cutoff: base pairs with coverage equal to or greater than this value will be included in the universe """ - if os.path.isfile(fout): - raise Exception(f"File : {fout} exists") + if os.path.isfile(file_out): + 
raise Exception(f"File : {file_out} exists") + file = os.path.join(cove, f"{cove_prefix}_core.bw") bw_start = pyBigWig.open(file) chroms = bw_start.chroms() bw_start.close() @@ -93,8 +96,8 @@ def main(file, fout, merge=0, filter_size=0, cut_off=None): chroms = {i: chroms[i] for i in chroms_key} for chrom in chroms: if chroms[chrom] > 0: - inter_pos = get_uni(file, chrom, cut_off) + inter_pos = get_uni(file, chrom, cutoff) if merge == 0 and filter_size == 0: - save_simple(fout, inter_pos, chrom) + save_simple(file_out, inter_pos, chrom) else: - marge_filter(fout, inter_pos, chrom, merge, filter_size) + marge_filter(file_out, inter_pos, chrom, merge, filter_size) diff --git a/geniml/universe/ccf_universe.py b/geniml/universe/ccf_universe.py new file mode 100644 index 00000000..303e1b25 --- /dev/null +++ b/geniml/universe/ccf_universe.py @@ -0,0 +1,155 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import os +from functools import cmp_to_key + +import numpy as np +import pyBigWig + +from geniml.utils import natural_chr_sort, timer_func + + +def ana_region(reg, start_s, starts, ends, track_val): + """Check how many regions given part of universe contains + :param ndarray reg: vector with universes states; 0 - background, 1- boundary, 2- core + :param int start_s: start of the part of the universe + :param list starts: list of region starts in given part of universe + :param list ends: list of region ends in given part of universe + :param ndarray track_val: genome coverage by the collection for given part of universe + """ + core_s, core_e = [], [] + core_pos = np.argwhere(reg == 2).flatten() + if len(core_pos) == 0: + return "empty" + else: + core_s.append(core_pos[0] + start_s) + if core_pos[0] == 0: + core_s[0] += 1 + for i in range(1, len(core_pos)): + if core_pos[i] - core_pos[i - 1] >= 50: + core_e.append(core_pos[i - 1] + start_s) + min_point = np.argmin(track_val[core_pos[i - 1] : core_pos[i]]) + min_point = min_point + core_pos[i - 1] + 
ends.append(int(min_point) + start_s) + starts.append(ends[-1] + 1) + core_s.append(core_pos[i] + start_s) + if core_s[-1] == starts[-1]: + core_s[-1] += 1 + core_e.append(core_pos[-1] + start_s) + if core_pos[-1] == len(reg) - 1: + core_e[-1] -= 1 + return core_s, core_e, starts, ends + + +def save_regions(inter_pos, chrom, bedname, track): + """Save regions from universes to file + :param ndarray inter_pos: vector with universes states; 0 - background, 1- boundary, 2- core + :param str chrom: chromosome to analyse + :param str bedname: output file + :param ndarray track: vector with coverage values + """ + ind = np.argwhere(inter_pos != 0) + ind = ind.flatten() + start_s = ind[0] + to_file = [] + line = chrom + "\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n" + for i in range(1, len(ind)): + if ind[i] - ind[i - 1] != 1: + end_e = ind[i - 1] + region = inter_pos[start_s : end_e + 1] + res = ana_region(region, start_s, [start_s], [], track[start_s : end_e + 1]) + if res != "empty": + save_start_e, save_end_s, save_start_s, save_end_e = res + save_end_e = save_end_e + [end_e] + for a, b, c, d in zip(save_start_e, save_end_s, save_start_s, save_end_e): + if a != b: + val = 0 + li = line.format( + int(c), + int(d) + 1, + "universe", + val, + ".", + int(a), + int(b), + "0,0,255", + ) + to_file.append(li) + start_s = ind[i] + end_e = ind[-1] + region = inter_pos[start_s : end_e + 1] + res = ana_region(region, start_s, [start_s], [], track[start_s : end_e + 1]) + if res != "empty": + save_start_e, save_end_s, save_start_s, save_end_e = res + save_end_e = save_end_e + [end_e] + for a, b, c, d in zip(save_start_e, save_end_s, save_start_s, save_end_e): + val = 0 + li = line.format( + int(c), + int(d) + 1, + "universe", + val, + ".", + int(a), + int(b), + "0,0,255", + ) + to_file.append(li) + with open(bedname, "a") as f: + f.writelines(to_file) + + +def get_uni(file, chrom, bedname): + """Build cut-off coverage flexible universes from coverage track + :param str file: coverage file + 
:param str chrom: chromosome to analyse + :param str bedname: output file + """ + file = pyBigWig.open(file) + if pyBigWig.numpy: + track = file.values(chrom, 0, file.chroms(chrom), numpy=True) + else: + track = file.values(chrom, 0, file.chroms(chrom)) + track = np.array(track) + track[np.isnan(track)] = 0 + track = track.astype(np.uint16) + cutoff = np.sum(track) / len(track) + track_non_zero_sort = np.sort(track[track != 0]) + + cutoff = max([1, np.round(cutoff)]) + pos = np.where(track_non_zero_sort == cutoff)[0] + if len(pos) == 0: + uniq_val = np.unique(track_non_zero_sort) + dist = np.absolute(uniq_val - cutoff) + cutoff = uniq_val[dist.argmin()] + pos = np.where(track_non_zero_sort == cutoff)[0] + f, l = pos[0] / len(track_non_zero_sort), pos[-1] / len(track_non_zero_sort) + q_cutoff = np.mean([f, l]) + lower = np.quantile(track_non_zero_sort, max(0, q_cutoff - 0.2)) + upper = np.quantile(track_non_zero_sort, min(1, q_cutoff + 0.2)) + inter_pos = np.zeros(len(track), dtype=np.uint8) + inter_pos[track >= lower] = 1 + inter_pos[track > upper] = 2 + save_regions(inter_pos, chrom, bedname, track) + + +def ccf_universe(cove, file_out, cove_prefix="all"): + """ + Create cut-off flexible universe based on coverage track + :param str cove: path to coverage folder + :param str file_out: output file + :param str cove_prefix: prefixed used for creating signal tracks + """ + if os.path.isfile(file_out): + raise Exception(f"File : {file_out} exists") + file = os.path.join(cove, f"{cove_prefix}_core.bw") + bw = pyBigWig.open(file) + chroms = bw.chroms() + bw.close() + chroms_key = list(chroms.keys()) + chroms_key = sorted(chroms_key, key=cmp_to_key(natural_chr_sort)) + chroms = {i: chroms[i] for i in chroms_key} + for chrom in chroms: + if chroms[chrom] > 0: + get_uni(file, chrom, file_out) diff --git a/geniml/universe/cli.py b/geniml/universe/cli.py new file mode 100644 index 00000000..2b853df6 --- /dev/null +++ b/geniml/universe/cli.py @@ -0,0 +1,84 @@ +def 
build_subparser_hmm(parser): + parser = build_subparser(parser) + + parser.add_argument( + "--not-normalize", + help="if not to normalize coverage signal before using HMM", + action="store_false", + ) + parser.add_argument( + "--save-max-cove", + help="if present saves maximum coverage for each peak", + action="store_true", + ) + + return parser + + +def build_subparser_ml(parser): + parser = build_subparser(parser) + parser.add_argument("--model-file", help="path to lh model file", required=True, type=str) + return parser + + +def build_subparser_cc(parser): + parser = build_subparser(parser) + parser.add_argument( + "--merge", + help="distance between output peaks that should be merged into one in output universe", + default=0, + type=int, + ) + parser.add_argument( + "--filter-size", + help="minimal size of the region in the universe", + default=0, + type=int, + ) + parser.add_argument("--cutoff", help="cutoff value used for making universe", type=int) + + return parser + + +def build_subparser(parser): + """ + Parse command-line arguments passed to the pipeline. 
+ """ + parser.add_argument( + "--output-file", + help="path to output, universe file", + required=True, + type=str, + ) + parser.add_argument( + "--coverage-folder", + help="path to core coverage folder", + required=True, + type=str, + ) + parser.add_argument( + "--coverage-prefix", + help="prefixed used for making coverage files", + default="all", + type=str, + ) + + return parser + + +def build_mode_parser(parser): + sp = parser.add_subparsers(dest="subcommand") + msg_by_cmd = { + "cc": "Making coverage cut-off universe", + "ccf": "Making coverage cut-off flexible universe", + "ml": "Making ML universe", + "hmm": "Making HMM universe", + } + subparsers = {} + for k, v in msg_by_cmd.items(): + subparsers[k] = sp.add_parser(k, description=v, help=v) + subparsers["cc"] = build_subparser_cc(subparsers["cc"]) + subparsers["ccf"] = build_subparser(subparsers["ccf"]) + subparsers["ml"] = build_subparser_ml(subparsers["ml"]) + subparsers["hmm"] = build_subparser_hmm(subparsers["hmm"]) + return parser diff --git a/geniml/universe/const.py b/geniml/universe/const.py new file mode 100644 index 00000000..f1bed631 --- /dev/null +++ b/geniml/universe/const.py @@ -0,0 +1,13 @@ +TRANSMAT = [ + [1 - 1e-10, 1e-10, 0, 0], + [0, 1 - 1e-6, 1e-6, 0], + [0, 0, 1 - 1e-10, 1e-10], + [0.1, 0, 0, 0.9], +] + +LAMBDAS = [ + [5, 3, 0.0001], + [0.05, 5, 0.05], + [0.0001, 3, 5], + [0.0001, 0.001, 0.0001], +] diff --git a/gitk/hmm/custom_distribution.py b/geniml/universe/custom_distribution.py similarity index 96% rename from gitk/hmm/custom_distribution.py rename to geniml/universe/custom_distribution.py index ffd9d80a..c11adb7c 100644 --- a/gitk/hmm/custom_distribution.py +++ b/geniml/universe/custom_distribution.py @@ -3,9 +3,9 @@ import numpy as np -from sklearn.utils import check_random_state -from scipy.stats import nbinom, beta from hmmlearn.base import BaseHMM +from scipy.stats import beta, nbinom +from sklearn.utils import check_random_state def _check_and_set_n_features(model, X): @@ 
-208,9 +208,7 @@ def _check(self): if self.beta_.shape != (self.n_components, n_features): raise ValueError("beta_ must have shape (n_components, n_features)") if self.alfa_.shape != self.beta_.shape: - raise ValueError( - "alfa_ and beta_ must have the same shape (n_components, n_features)" - ) + raise ValueError("alfa_ and beta_ must have the same shape (n_components, n_features)") self.n_features = n_features def _generate_sample_from_state(self, state, random_state): @@ -218,10 +216,7 @@ def _generate_sample_from_state(self, state, random_state): def _compute_log_likelihood(self, X): return np.array( - [ - np.sum(beta.logpdf(X, a, b), axis=1) - for a, b in zip(self.alfa_, self.beta_) - ] + [np.sum(beta.logpdf(X, a, b), axis=1) for a, b in zip(self.alfa_, self.beta_)] ).T def _compute_likelihood(self, X): diff --git a/geniml/universe/hmm_universe.py b/geniml/universe/hmm_universe.py new file mode 100644 index 00000000..df8f6adc --- /dev/null +++ b/geniml/universe/hmm_universe.py @@ -0,0 +1,148 @@ +import os +from functools import cmp_to_key +from logging import getLogger + +import numpy as np +import pyBigWig +from scipy.stats import nbinom + +from ..const import PKG_NAME +from ..utils import natural_chr_sort +from .const import LAMBDAS, TRANSMAT +from .models import PoissonModel +from .utils import find_full, predictions_to_bed + +_LOGGER = getLogger(PKG_NAME) + +""" States legend +0 -> start +1 -> core +2 -> end +3 -> background""" + + +def norm(track, mode): + """Normalize the coverage track depending on track type. 
+ For each unique value in the track calculates the corresponding + quantile taking into account that values occur different number of times.""" + important_val = track[track != 0] + important_val_unique, counts = np.unique(important_val, return_counts=True) + uniq_dict = {i: j for i, j in zip(important_val_unique, counts)} + # how many times each value is present in the track + important_val_unique_sort = np.sort(important_val_unique) + if mode == "ends": + n = 0.1 + if mode == "core": + n = 0.2 + bs = 0 # what fraction of the distribution was used for normalization + val = {} # for each unique value in track holds the corresponding quantile + for i in important_val_unique_sort: + move_val = (uniq_dict[i] / len(important_val)) / 2 + # how far from last quantile is te next one + val[i] = nbinom.ppf(bs + move_val, 1, n) + bs = bs + move_val * 2 + track[track != 0] = [val[i] for i in important_val] + + +def process_bigwig(file, seq, p, chrom, chrom_size, normalize=True, mode=None): + """Preprocess bigWig file""" + if pyBigWig.numpy: + track = file.values(chrom, 0, chrom_size, numpy=True) + else: + track = file.values(chrom, 0, chrom_size) + track = np.array(track) + track[np.isnan(track)] = 0 + track = track.astype(np.uint16) + if normalize: + norm(track, mode) + seq[:, p] = track + + +def read_data(start, core, end, chrom, normalize=True): + """ + Read in and preprocess data + :param str start: path to file with start coverage + :param str end: path to file with end coverage + :param str core: path to file with core coverage + :param str chrom: chromosome to analyse + :param bool normalize: whether to normalize the coverage + :return: chromosome size, coverage matrix + """ + start = pyBigWig.open(start + ".bw") + chroms = start.chroms() + chrom_size = chroms[chrom] + seq = np.zeros((chrom_size, 3), dtype=np.uint16) + process_bigwig(start, seq, 0, chrom, chrom_size, normalize, mode="ends") + start.close() + core = pyBigWig.open(core + ".bw") + process_bigwig(core, 
seq, 1, chrom, chrom_size, normalize, mode="core") + core.close() + end = pyBigWig.open(end + ".bw") + process_bigwig(end, seq, 2, chrom, chrom_size, normalize, mode="ends") + end.close() + return chrom_size, seq + + +def split_predict(seq, empty_starts, empty_ends, model): + """Make model prediction only for regions containing + nonzero positions""" + hmm_predictions = np.full(len(seq), 3, dtype=np.uint8) + for s, e in zip(empty_starts, empty_ends): + res = model.predict(seq[s:e]) + hmm_predictions[s:e] = res + return hmm_predictions + + +def run_hmm(start, core, end, chrom, normalize=True): + """Make HMM prediction for given chromosome""" + chrom_size, seq = read_data(start, core, end, chrom, normalize=normalize) + empty_starts, empty_ends = find_full(seq) + model = PoissonModel(TRANSMAT, LAMBDAS, save_matrix=False).model + hmm_predictions = split_predict(seq, empty_starts, empty_ends, model) + return hmm_predictions, model + + +def hmm_universe( + coverage_folder, + out_file, + prefix="all", + normalize=True, + save_max_cove=False, +): + """ + Create HMM based universe from coverage + :param str coverage_folder: path to folder with coverage files + :param str start: start coverage file name + :param str end: end coverage file name + :param str core: core coverage file name + :param str out_file: path to the output file with universe + :param bool normalize: whether to normalize file + :param bool save_max_cove: whether to save the maximum + peak coverage + """ + if os.path.isfile(out_file): + raise Exception(f"File : {out_file} exists") + start = os.path.join(coverage_folder, f"{prefix}_start") + core = os.path.join(coverage_folder, f"{prefix}_core") + end = os.path.join(coverage_folder, f"{prefix}_end") + bw_start = pyBigWig.open(start + ".bw") + chroms = bw_start.chroms() + bw_start.close() + chroms_key = list(chroms.keys()) + chroms_key = sorted(chroms_key, key=cmp_to_key(natural_chr_sort)) + chroms = {i: chroms[i] for i in chroms_key} + for C in chroms: + if 
chroms[C] > 0: + pred, m = run_hmm(start, core, end, C, normalize=normalize) + predictions_to_bed( + pred, + C, + out_file, + save_max_cove=save_max_cove, + cove_file=core + ".bw", + ) + + +def test_hmm(message): + """Just prints a test message""" + _LOGGER.info(message) diff --git a/geniml/universe/ml_universe.py b/geniml/universe/ml_universe.py new file mode 100644 index 00000000..cb2e85ec --- /dev/null +++ b/geniml/universe/ml_universe.py @@ -0,0 +1,149 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import importlib.util +import os +from functools import cmp_to_key + +import numpy as np + +from geniml.likelihood.build_model import ModelLH + +from ..utils import natural_chr_sort, read_chromosome_from_bw, timer_func +from .utils import find_full, predictions_to_bed + +package_name = "numba" + +if importlib.util.find_spec(package_name) is None: + + def process_part( + cove, + model_start=np.array([[]]), + model_core=np.array([[]]), + model_end=np.array([[]]), + ): + """ + Finding ML path through matrix using dynamic programing without numba + :param ndarray cove: coverage tracks + :param ndarray model_start: lh model for starts + :param ndarray model_core: lh model for core + :param ndarray model_end: lh model for ends + :return ndarray: ML path through matrix + """ + mat = np.zeros((len(cove), 4)) + (N, M) = mat.shape + start_b = model_start[cove[:, 0], 0] + core_b = model_core[cove[:, 1], 0] + end_b = model_end[cove[:, 2], 0] + start = model_start[cove[:, 0], 1] + core = model_core[cove[:, 1], 1] + end = model_end[cove[:, 2], 1] + mat[:, 0] = start + core_b + end_b + mat[:, 1] = start_b + core + end_b + mat[:, 2] = start_b + core_b + end + mat[:, 3] = start_b + core_b + end_b + + for i in range(1, N): + for j in range(M): + mat[i, j] += max(mat[i - 1, j], mat[i - 1, j - 1]) + path = np.zeros(len(mat), dtype=np.int8) + path[-1] = np.argmax(mat[-1]) + for i in range(len(mat) - 2, -1, -1): + prev_index = path[i + 1] + new_index = prev_index - (mat[i, 
prev_index - 1] > mat[i, prev_index]) + if new_index == -1: + new_index = 3 + path[i] = new_index + return path + +else: + from numba import njit + + @njit + def process_part( + cove, + model_start=np.array([[]]), + model_core=np.array([[]]), + model_end=np.array([[]]), + ): + """ + Finding ML path through matrix using dynamic programing with numba + :param ndarray cove: coverage tracks + :param ndarray model_start: lh model for starts + :param ndarray model_core: lh model for core + :param ndarray model_end: lh model for ends + :return ndarray: ML path through matrix + """ + mat = np.zeros((len(cove), 4)) + (N, M) = mat.shape + for i in range(N): + start_b = model_start[cove[i, 0], 0] + core_b = model_core[cove[i, 1], 0] + end_b = model_end[cove[i, 2], 0] + start = model_start[cove[i, 0], 1] + core = model_core[cove[i, 1], 1] + end = model_end[cove[i, 2], 1] + mat[i, 0] = start + core_b + end_b + mat[i, 1] = start_b + core + end_b + mat[i, 2] = start_b + core_b + end + mat[i, 3] = start_b + core_b + end_b + for i in range(1, N): + for j in range(M): + mat[i, j] += max(mat[i - 1, j], mat[i - 1, j - 1]) + path = np.zeros(len(mat), dtype=np.int8) + path[-1] = np.argmax(mat[-1]) + for i in range(len(mat) - 2, -1, -1): + prev_index = path[i + 1] + new_index = prev_index - (mat[i, prev_index - 1] > mat[i, prev_index]) + if new_index == -1: + new_index = 3 + path[i] = new_index + return path + + +def make_ml_flexible_universe(model_lh, cove_folder, cove_prefix, chrom, file_out): + """ + Make ML flexible universe per chromosome + :param ModelLH model_lh: lh model + :param str cove_folder: path to a folder with genome coverage by tracks + :param str cove_prefix: prefix used in uniwig for creating coverage + :param str chrom: chromosome to be processed + :param str file_out: output file with the universe + """ + model_lh.read_chrom(chrom) + chrom_model = model_lh[chrom] + start = read_chromosome_from_bw(os.path.join(cove_folder, f"{cove_prefix}_start.bw"), chrom) + core = 
read_chromosome_from_bw(os.path.join(cove_folder, f"{cove_prefix}_core.bw"), chrom) + end = read_chromosome_from_bw(os.path.join(cove_folder, f"{cove_prefix}_end.bw"), chrom) + cove = np.zeros((len(start), 3), dtype=np.uint16) + cove[:, 0] = start + cove[:, 1] = core + cove[:, 2] = end + full_start, full_end = find_full(cove) + path = np.full(len(cove), 3, dtype=np.uint8) + for s, e in zip(full_start, full_end): + res = process_part( + cove[s:e], + chrom_model["start"], + chrom_model["core"], + chrom_model["end"], + ) + path[s:e] = res + + predictions_to_bed(path, chrom, file_out) + + +def ml_universe(model_file, cove_folder, cove_prefix, file_out): + """ + Make ML flexible universe + :param str model_file: input name with likelihood models + :param str file_out: output file with the universe + :param str cove_folder: path to a folder with genome coverage by tracks + :param str cove_prefix: prefix used in uniwig for creating coverage + """ + if os.path.isfile(file_out): + raise Exception(f"File : {file_out} exists") + lh_model = ModelLH(model_file) + chroms = sorted(lh_model.chromosomes_list, key=cmp_to_key(natural_chr_sort)) + for C in chroms: + make_ml_flexible_universe(lh_model, cove_folder, cove_prefix, C, file_out) diff --git a/gitk/hmm/models.py b/geniml/universe/models.py similarity index 78% rename from gitk/hmm/models.py rename to geniml/universe/models.py index 596b7f53..16a123b4 100644 --- a/gitk/hmm/models.py +++ b/geniml/universe/models.py @@ -1,11 +1,13 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -from hmmlearn import hmm -import numpy as np import os +from abc import ABC + +import numpy as np +from hmmlearn import hmm + from .custom_distribution import NBHMM, BetaHMM -from abc import ABC, abstractmethod class Model(ABC): @@ -26,11 +28,6 @@ def __init__(self, trans_matrix, init_para, para, save_matrix): self.trans_matrix = trans_matrix self.start_matrix = np.array([0.01, 0.01, 0.97, 0.01]) - @abstractmethod - def make(self): - """Abstract method 
for defining the model""" - pass - def save_tras(self, out_folder): np.savetxt(os.path.join(out_folder, "trans_matrix.csv"), self.trans_matrix) @@ -63,12 +60,11 @@ def __init__( if save_matrix: super().save_tras(out_folder) np.savetxt( - os.path.join(out_folder, "lambdas_matrix.csv"), self.lambdas_matrix + os.path.join(out_folder, "lambdas_matrix.csv"), + self.lambdas_matrix, ) - def make(self): - """Initialize HMM model""" - model = hmm.PoissonHMM( + self.model = hmm.PoissonHMM( n_components=self.state_no, verbose=True, init_params=self.init_para, @@ -76,11 +72,10 @@ def make(self): ) if "t" not in self.init_para: - model.transmat_ = self.trans_matrix + self.model.transmat_ = self.trans_matrix if "l" not in self.init_para: - model.lambdas_ = self.lambdas_matrix - model.startprob_ = self.start_matrix - return model + self.model.lambdas_ = self.lambdas_matrix + self.model.startprob_ = self.start_matrix class GaussianModel(Model): @@ -112,13 +107,12 @@ def __init__( if save_matrix: super().save_tras(out_folder) np.savetxt( - os.path.join(out_folder, "covars_matrix.csv"), self.covars_matrix + os.path.join(out_folder, "covars_matrix.csv"), + self.covars_matrix, ) np.savetxt(os.path.join(out_folder, "means_matrix.csv"), self.means_matrix) - def make(self): - """Initialize HMM model""" - model = hmm.GaussianHMM( + self.model = hmm.GaussianHMM( n_components=self.state_no, verbose=True, init_params=self.init_para, @@ -126,13 +120,12 @@ def make(self): ) if "t" not in self.init_para: - model.transmat_ = self.trans_matrix + self.model.transmat_ = self.trans_matrix if "m" not in self.init_para: - model.means_ = self.means_matrix + self.model.means_ = self.means_matrix if "c" not in self.init_para: - model.covars_ = self.covars_matrix - model.startprob_ = self.start_matrix - return model + self.model.covars_ = self.covars_matrix + self.model.startprob_ = self.start_matrix class NBModel(Model): @@ -164,24 +157,22 @@ def __init__( if save_matrix: super().save_tras(out_folder) 
np.savetxt( - os.path.join(failures_matrix, "failures_matrix.csv"), self.covars_matrix + os.path.join(failures_matrix, "failures_matrix.csv"), + self.covars_matrix, ) np.savetxt(os.path.join(out_folder, "prob_matrix.csv"), self.prob_matrix) - def make(self): - """Initialize HMM model""" - model = NBHMM( + self.model = NBHMM( n_components=self.state_no, verbose=True, init_params=self.init_para, params=self.para, ) - model.transmat_ = self.trans_matrix - model.failures_ = self.failures_matrix - model.prob_ = self.prob_matrix - model.startprob_ = self.start_matrix - return model + self.model.transmat_ = self.trans_matrix + self.model.failures_ = self.failures_matrix + self.model.prob_ = self.prob_matrix + self.model.startprob_ = self.start_matrix class BetaModel(Model): @@ -215,17 +206,14 @@ def __init__( np.savetxt(os.path.join(out_folder, "alfa_matrix.csv"), self.alfa_matrix) np.savetxt(os.path.join(out_folder, "beta_matrix.csv"), self.beta_matrix) - def make(self): - """Initialize HMM model""" - model = BetaHMM( + self.model = BetaHMM( n_components=self.state_no, verbose=True, init_params=self.init_para, params=self.para, ) - model.transmat_ = self.trans_matrix - model.alfa_ = self.alfa_matrix - model.beta_ = self.beta_matrix - model.startprob_ = self.start_matrix - return model + self.model.transmat_ = self.trans_matrix + self.model.alfa_ = self.alfa_matrix + self.model.beta_ = self.beta_matrix + self.model.startprob_ = self.start_matrix diff --git a/geniml/universe/utils.py b/geniml/universe/utils.py new file mode 100644 index 00000000..43c2ddee --- /dev/null +++ b/geniml/universe/utils.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import numpy as np +import pyBigWig + + +def ana_region(region, start_s): + """Helper for saving HMM prediction into a file""" + start_e = start_s + np.where(region == 1)[0][0] + end_s = start_s + np.where(region == 2)[0][0] + return start_e, end_s + + +def predictions_to_bed(states, chrom, bedname, 
save_max_cove=False, cove_file=None): + """ + Save HMM prediction into a file + :param ndarray states: result of HMM prediction + :param str chrom: which chromosome is being analyzed + :param str bedname: path to the output file + :param bool save_max_cove: whether to save the maximum peak coverage to output + file, can result in nonstandard bed file + :param str cove_file: file with core coverage, require for saving maximum peak coverage + """ + ind = np.argwhere(states != 3) + ind = ind.flatten() + start_s = ind[0] + to_file = [] + line = chrom + "\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n" + if save_max_cove: + coverage = pyBigWig.open(cove_file) + for i in range(1, len(ind)): + if ind[i] - ind[i - 1] != 1: + end_e = ind[i - 1] + region = states[start_s : end_e + 1] + res = ana_region(region, start_s) + save_start_e, save_end_s = res + val = 0 + if save_max_cove: + val = coverage.stats(chrom, int(start_s), int(end_e) + 1, type="max") + val = int(val[0]) + to_file.append( + line.format( + start_s, + end_e + 1, + "universe", + val, + ".", + save_start_e, + save_end_s, + "0,0,255", + ) + ) + start_s = ind[i] + if states[ind[-1]] == 2: + region = states[start_s : ind[-1] + 1] + res = ana_region(region, start_s) + save_start_e, save_end_s = res + val = 0 + if save_max_cove: + val = coverage.stats(chrom, int(start_s), int(ind[-1]) + 1, type="max") + val = int(val[0]) + to_file.append( + line.format( + start_s, + ind[-1] + 1, + "universe", + val, + ".", + save_start_e, + save_end_s, + "0,0,255", + ) + ) + with open(bedname, "a") as f: + f.writelines(to_file) + + +def find_full_full_pos(seq, gap_size=1000, area_size=500): + """Look for nonzero positions in coverage matrix, when most of the positions are zero + :param ndarray seq: vector with information about non-zero positions + :param int gap_size: size of minimum gap between non-zero positions that are separated + :param int area_size: size of the area around non-zero positions to be included in the result + :return list: 
list of starts of non-zero regions and list of ends of non-zero regions + """ + size = len(seq) + seq = np.argwhere(seq >= 1).flatten() + starts, ends = [], [] + if seq[0] > gap_size: + starts.append(int(seq[0] - area_size)) + else: + starts.append(0) + for e in range(1, len(seq)): + if seq[e] - seq[e - 1] > gap_size: + ends.append(int(seq[e - 1] + area_size)) + starts.append(int(seq[e] - area_size)) + ends.append(min(int(seq[-1] + area_size), size)) + return starts, ends + + +def find_full_empty_pos(seq, gap_size=10000, area_size=1000): + """Look for nonzero positions in coverage matrix, when most of the positions are nonzero + :param ndarray seq: vector with information about non-zero positions + :param int gap_size: size of minimum gap between non-zero positions that are separated + :param int area_size: size of the area around non-zero positions to be included in the result + :return list: list of starts of non-zero regions and list of ends of non-zero regions + """ + size = len(seq) + seq = np.argwhere(seq == 0).flatten() + starts, ends = [], [] + gap_len = 0 + gap_start = 0 + looking_for_first = True + for e in range(1, len(seq)): + if seq[e] - seq[e - 1] == 1: + gap_len += 1 + else: + if gap_len >= gap_size: + starts.append(gap_start) + ends.append(seq[e - 1]) + looking_for_first = False + elif looking_for_first: + starts.append(gap_start) + ends.append(seq[e - 1]) + looking_for_first = False + gap_len = 1 + gap_start = seq[e] + starts_res = [max(0, i - area_size) for i in ends] + end_res = [i + area_size for i in starts[1:]] + [size] + if not starts_res: + starts_res = [0] + return starts_res, end_res + + +def find_full(seq): + """Look for nonzero positions in coverage matrix""" + seq = np.sum(seq, axis=1, dtype=np.uint8) + full_pos_no = np.sum(seq >= 1) + if full_pos_no < len(seq) - full_pos_no: + return find_full_full_pos(seq) + else: + return find_full_empty_pos(seq) diff --git a/geniml/utils.py b/geniml/utils.py new file mode 100644 index 
00000000..f7feccf2 --- /dev/null +++ b/geniml/utils.py @@ -0,0 +1,114 @@ +from time import time +from typing import Dict, List, Optional + +import numpy as np +import pyBigWig + + +def natural_chr_sort(a, b): + ac = a.replace("chr", "") + ac = ac.split("_")[0] + bc = b.replace("chr", "") + bc = bc.split("_")[0] + if bc.isnumeric() and ac.isnumeric() and bc != ac: + if int(bc) < int(ac): + return 1 + elif int(bc) > int(ac): + return -1 + else: + return 0 + else: + if b < a: + return 1 + elif a < b: + return -1 + else: + return 0 + + +def timer_func(func): + def wrap_func(*args, **kwargs): + t1 = time() + result = func(*args, **kwargs) + t2 = time() + print(f"Function {func.__name__!r} executed in {(t2-t1)/60:.4f}min") + return result + + return wrap_func + + +def read_chromosome_from_bw(file, chrom): + bw = pyBigWig.open(file) + chrom_size = bw.chroms(chrom) + if pyBigWig.numpy: + cove = bw.values(chrom, 0, chrom_size, numpy=True) + else: + cove = bw.values(chrom, 0, chrom_size) + cove = np.array(cove) + cove[np.isnan(cove)] = 0 + return cove.astype(np.uint16) + + +def find_path(hierarchy: Dict[str, Dict], path: List[str], cell_type: str) -> Optional[List[str]]: + """ + Find the path from the root to a given cell type in a hierarchy. + + :param hierarchy: A dictionary representing the hierarchy. + :param path: The current path. + :param cell_type: The cell type to find. + + :return: The path from the root to the cell type. (a list of strings, ... or None) + """ + if cell_type in hierarchy: + return path + [cell_type] + + for key in hierarchy: + sub_path = find_path(hierarchy[key], path + [key], cell_type) + if sub_path: + return sub_path + + return None + + +def find_lca(path1: List[str], path2: List[str]) -> int: + """ + Find the lowest common ancestor (LCA) of two paths. + + :param path1: The first path. + :param path2: The second path. 
+ """ + min_length = min(len(path1), len(path2)) + for i in range(min_length): + if path1[i] != path2[i]: + return i - 1 + return min_length - 1 + + +def compute_cell_hierarchy_distance( + hierarchy: Dict[str, Dict], cell1: str, cell2: str +) -> Optional[int]: + """ + Compute the distance between two cell types in a hierarchy. + + The distance is the number of edges between the two cells in the hierarchy. + + :param hierarchy: A dictionary representing the hierarchy. + :param cell1: The first cell type. + :param cell2: The second cell type. + + :return: The distance between the two cell types. (an integer, ... or None) + """ + # Find paths from root to both cells + path1 = find_path(hierarchy, [], cell1) + path2 = find_path(hierarchy, [], cell2) + + if not path1 or not path2: + return None # One of the cells doesn't exist in the hierarchy + + # Find the lowest common ancestor (LCA) + lca_index = find_lca(path1, path2) + + # Distance is the sum of the lengths from LCA to both nodes + distance = (len(path1) - lca_index - 1) + (len(path2) - lca_index - 1) + + return distance diff --git a/gitk/README.md b/gitk/README.md deleted file mode 100644 index ec72fe12..00000000 --- a/gitk/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# gitk module - -The `gitk` module is the parent module in the package, which controls the general argument parser (in `cli.py`), constants (`const.py`), and any other utility or general-purpose code that is used across modules. diff --git a/gitk/_version.py b/gitk/_version.py deleted file mode 100644 index 6bc801c3..00000000 --- a/gitk/_version.py +++ /dev/null @@ -1 +0,0 @@ -__version__ = "0.0.1-dev" diff --git a/gitk/assess/cli.py b/gitk/assess/cli.py deleted file mode 100644 index 6b3ee7db..00000000 --- a/gitk/assess/cli.py +++ /dev/null @@ -1,41 +0,0 @@ -def build_subparser(parser): - """ - Builds argument parser. 
- - :return argparse.ArgumentParser - """ - - parser.add_argument("--raw_data_folder", type=str, required=True) - parser.add_argument("--file_list", type=str, required=True) - parser.add_argument("--universe", type=str, required=True) - parser.add_argument("--npool", default=4, type=int) - parser.add_argument("--save_to_file", action="store_true") - parser.add_argument("--folder_out", type=str) - parser.add_argument("--pref", type=str) - - return parser - - -def build_subparser_distance(parser): - parser = build_subparser(parser) - parser.add_argument("--flexible", action="store_true") - parser.add_argument("--save_each", action="store_true") - - return parser - - -def build_mode_parser(parser): - sp = parser.add_subparsers(dest="subcommand") - msg_by_cmd = { - "distance": "Asses based on distance", - "intersection": "Asses based on coverage", - "recovered": "Asses based on percent of recovered starts, ends", - } - subparsers = {} - for k, v in msg_by_cmd.items(): - subparsers[k] = sp.add_parser(k, description=v, help=v) - subparsers["distance"] = build_subparser_distance(subparsers["distance"]) - subparsers["intersection"] = build_subparser(subparsers["intersection"]) - subparsers["recovered"] = build_subparser(subparsers["recovered"]) - - return parser diff --git a/gitk/assess/distance.py b/gitk/assess/distance.py deleted file mode 100644 index d0e4071b..00000000 --- a/gitk/assess/distance.py +++ /dev/null @@ -1,273 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -import os -from typing import List, Any - -import numpy as np -import argparse -from multiprocessing import Pool -from .utils import process_db_line, chrom_cmp_bigger, prep_data, check_if_uni_sorted -from ..utils import natural_chr_sort -import tempfile - - -def flexible_distance(r, q): - """Calculate region distance for univers""" - if r[0] <= q <= r[1]: - return 0 - else: - return min(abs(r[0] - q), abs(r[1] - q)) - - -def distance(r, q): - """Calculate distance for hard universe""" - 
return abs(r[0] - q) - - -def asses(db, db_que, i, current_chrom, unused_db, pos_index, flexible): - """ - Calculate distance from given peak to the closest region in universe - :param file db: universe file - :param list db_que: que of three last positions in universe - :param int i: analysed position from the query - :param str current_chrom: current analysed chromosome from query - :param list unused_db: list of positions from universe that were not compared to query - :param list pos_index: which indexes from universe region use to calculate distance - :param bool flexible: whether the universe if flexible - :return int: peak distance to universe - """ - if flexible: - dist_to_db_que = [flexible_distance(j, i) for j in db_que] - else: - dist_to_db_que = [distance(j, i) for j in db_que] - min_pos = np.argmin(dist_to_db_que) - while min_pos == 2: - d = db.readline().strip("\n") - if d == "": - return dist_to_db_que[min_pos] - pos, pos_chrom = process_db_line(d, pos_index) - if pos_chrom != current_chrom: - unused_db.append([pos, pos_chrom]) - return dist_to_db_que[min_pos] - db_que[:-1] = db_que[1:] - db_que[-1] = pos - if flexible: - dist_to_db_que = [flexible_distance(j, i) for j in db_que] - else: - dist_to_db_que = [distance(j, i) for j in db_que] - min_pos = np.argmin(dist_to_db_que) - return dist_to_db_que[min_pos] - - -def process_line( - db, - q_chrom, - current_chrom, - unused_db, - db_que, - dist, - waiting, - start, - pos_index, - flexible, -): - """ - Calculate distance from new peak to universe - :param file db: universe file - :param str q_chrom: on which chromosome id the new peak - :param str current_chrom: chromosome that was analysed so far - :param list unused_db: list of positions from universe that were not compared to query - :param list db_que: que of three last positions in universe - :param list dist: list of all calculated distances - :param bool waiting: whether iterating through file, without calculating - distance, if present 
chromosome not present in universe - :param start: analysed position from the query - :param pos_index: which indexes from universe region use to calculate distance - :param flexible: whether the universe if flexible - :return: if iterating through chromosome not present in universe; current chromosome in query - """ - if q_chrom != current_chrom: - # change chromosome - db_que.clear() - # clean up the que - if len(unused_db) == 0: - d = db.readline().strip("\n") - if d == "": - waiting = True - return waiting, current_chrom - d_start, d_start_chrom = process_db_line(d, pos_index) - while current_chrom == d_start_chrom: - # finish reading old chromosome in DB file - d = db.readline().strip("\n") - if d == "": - break - d_start, d_start_chrom = process_db_line(d, pos_index) - unused_db.append([d_start, d_start_chrom]) - current_chrom = q_chrom - if current_chrom == unused_db[-1][1]: - waiting = False - db_que.append(unused_db[-1][0]) - unused_db.clear() - elif natural_chr_sort(unused_db[-1][1], current_chrom) == 1: - # chrom present in file not in DB - waiting = True - return waiting, current_chrom - while len(db_que) < 3: - d = db.readline().strip("\n") - if d == "": - break - d_start, d_start_chrom = process_db_line(d, pos_index) - if d_start_chrom == current_chrom: - db_que.append(d_start) - elif natural_chr_sort(d_start_chrom, current_chrom) == 1: - unused_db.append([d_start, d_start_chrom]) - waiting = True - return waiting, current_chrom - if len(db_que) == 0: - waiting = True - if not waiting: - res = asses(db, db_que, start, current_chrom, unused_db, pos_index, flexible) - dist.append(res) - return waiting, current_chrom - - -def calc_distance( - db_file, - q_folder, - q_file, - flexible=False, - save_each=False, - folder_out=None, - pref=None, -): - """ - For given file calculate distance to the nearst region from universe - :param str db_file: path to universe - :param str q_folder: path to folder containing query files - :param str q_file: query file - 
:param boolean flexible: whether the universe if flexible - :param bool save_each: whether to save calculated distances for each file - :param str folder_out: output folder - :param str pref: prefix used as the name of the folder - containing calculated distance for each file - :return str, int, int: file name; median od distance of starts to - starts in universe; median od distance of ends to ends in universe - """ - q = tempfile.NamedTemporaryFile() - prep_data(q_folder, q_file, q) - db_start = open(db_file) - db_que_start = [] - current_chrom_start = "chr0" - dist_start = [] - unused_db_start = [] - waiting_start = False - db_end = open(db_file) - db_que_end = [] - current_chrom_end = "chr0" - dist_end = [] - unused_db_end = [] - waiting_end = False - pos_start = [1] - pos_end = [2] - if flexible: - pos_start = [1, 6] - pos_end = [7, 2] - for i in q: - i = i.decode("utf-8").split("\t") - start = int(i[1]) - end = int(i[2]) - q_chrom = i[0] - res_start = process_line( - db_start, - q_chrom, - current_chrom_start, - unused_db_start, - db_que_start, - dist_start, - waiting_start, - start, - pos_start, - flexible, - ) - (waiting_start, current_chrom_start) = res_start - res_end = process_line( - db_end, - q_chrom, - current_chrom_end, - unused_db_end, - db_que_end, - dist_end, - waiting_end, - end, - pos_end, - flexible, - ) - (waiting_end, current_chrom_end) = res_end - tmp_file.close() - if save_each: - with open(os.path.join(folder_out, pref, q_file), "w") as f: - for i, j in zip(dist_start, dist_end): - f.write(f"{i}\t{j}\n") - if not dist_start: - print(f"File {q_file} doesn't contain any chromosomes present in universe") - return q_file, None, None - return q_file, np.median(dist_start), np.median(dist_end) - - -def run_distance( - folder, - file_list, - universe, - npool, - flexible=False, - save_to_file=False, - folder_out=None, - pref=None, - save_each=False, -): - """ - For group of files calculate distance to the nearest region in universe - :param str 
folder: path to folder containing query files - :param str file_list: path to file containing list of query files - :param str universe: path to universe file - :param int npool: number of parallel processes - :param bool flexible: whether the universe if flexible - :param bool save_to_file: whether to save median of calculated distances for each file - :param str folder_out: output folder - :param str pref: prefix used for saving - :param bool save_each: whether to save calculated distances for each file - :return float; float: mean of median distances from starts in query to the nearest starts in universe; - mean of median distances from ends in query to the nearest ends in universe - """ - check_if_uni_sorted(universe) - files = open(file_list).read().split("\n")[:-1] - res = [] - if folder_out: - os.makedirs(folder_out, exist_ok=True) - if save_each: - os.makedirs(os.path.join(folder_out, pref)) - if npool <= 1: - for i in files: - r = calc_distance( - universe, folder, i, flexible, save_each, folder_out, pref - ) - res.append(r) - else: - with Pool(npool) as p: - args = [ - (universe, folder, f, flexible, save_each, folder_out, pref) - for f in files - ] - res = p.starmap(calc_distance, args) - if save_to_file: - fout = os.path.join(folder_out, pref + "_data.tsv") - with open(fout, "w") as o: - o.write("file\tmedian_dist_start\tmedian_dist_end\n") - for r in res: - o.write(f"{r[0]}\t{r[1]}\t{r[2]}\n") - else: - res = np.array(res) - res = res[:, 1:] - res = res.astype("float") - return np.mean([np.nanmedian(res[:, 0]), np.nanmedian(res[:, 1])]) diff --git a/gitk/assess/likelihood.py b/gitk/assess/likelihood.py deleted file mode 100644 index ec1bc754..00000000 --- a/gitk/assess/likelihood.py +++ /dev/null @@ -1,209 +0,0 @@ -import numpy as np -import os -from .utils import check_if_uni_sorted -from ..likelihood.build_model import ModelLH - - -def calc_likelihood_hard(universe, chroms, model_lh, name, s_index, e_index=None): - """ - Calculate likelihood of 
universe for given type of model - To be used with binomial model - :param str universe: path to universe file - :param list chroms: list of chromosomes present in likelihood model - :param str model_folder: path to folder with model - :param str name: suffix of model file name, which contains information - about model type - :param int s_index: from which position in univers line take assess region - start position - :param int e_index: from which position in univers line take assess region - end position - :return float: likelihood of univers for given model - """ - curent_chrom = "" - missing_chrom = "" - empty_start = 0 - res = 0 - e = 0 - prob_array = None - with open(universe) as uni: - for i in uni: - e += 1 - i = i.split("\t") - i[1], i[2] = int(i[1]), int(i[2]) - if i[0] == missing_chrom: - pass - else: - if i[0] != curent_chrom: - if i[0] in chroms: - model_lh.clear_chrom(curent_chrom) - if e != 1: - res += np.sum(prob_array[empty_start:, 0]) - - curent_chrom = i[0] - model_lh.read_chrom_track(curent_chrom, name) - prob_array = model_lh.chromosomes_models[curent_chrom].models[ - name - ] - empty_start = 0 - else: - print(f"Chromosome {i[0]} missing from model") - missing_chrom = i[0] - start = i[s_index] - if e_index is None: - end = i[s_index] + 1 - else: - end = i[e_index] - r1 = np.sum(prob_array[start:end, 1]) - r2 = np.sum(prob_array[empty_start:start, 0]) - res += r1 - res += r2 - empty_start = end - res += np.sum(prob_array[empty_start:, 0]) - return res - - -def hard_universe_likelihood(model_folder, universe): - """ - Calculate likelihood of hard universe based on core, start, - end coverage model - :param str model_folder: path to folder containing model - :param str universe: path to universe - :return float: likelihood - """ - check_if_uni_sorted(universe) - model_lh = ModelLH(model_folder) - chroms = model_lh.chromosomes_list - s = calc_likelihood_hard(universe, chroms, model_lh, "start", 1) - e = calc_likelihood_hard(universe, chroms, 
model_lh, "end", 2) - c = calc_likelihood_hard(universe, chroms, model_lh, "core", 1, 2) - return sum([s, e, c]) - - -def likelihood_only_core(model_folder, universe, core="core"): - """ - Calculate likelihood of universe based on core coverage model - :param str model_folder: path to folder containing model - :param str universe: path to universe - :param str core: model file name - :return float: likelihood - """ - check_if_uni_sorted(universe) - model_lh = ModelLH(model_folder) - chroms = model_lh.chromosomes_list - c = calc_likelihood_hard(universe, chroms, model_folder, core, 1, 2) - return c - - -def background_likelihood(start, end, model_start, model_cove, model_end): - res = np.sum(model_start[start:end, 0]) - res += np.sum(model_cove[start:end, 0]) - res += np.sum(model_end[start:end, 0]) - return res - - -def weigh_livelihood(start, end, model_process, model_cove, model_out, reverse): - e_w = 1 / (end - start) # weights for processed model - c_w = np.linspace( - start=e_w, stop=1, num=(end - start) - ) # weights for core in processed region - if reverse: - c_w = c_w[::-1] - res = e_w * np.sum(model_process[start:end, 1]) - res += np.sum(c_w * model_cove[start:end, 1]) - res += (1 - e_w) * np.sum(model_process[start:end, 0]) - res += np.sum((1 - c_w) * model_cove[start:end, 0]) - res += np.sum(model_out[start:end, 0]) - return res - - -def flexible_peak_likelihood( - startS, startE, endS, endE, model_start, model_cove, model_end -): - # core part of the peak - res = np.sum(model_cove[startE:endS, 1]) - res += np.sum(model_start[startE:endS, 0]) - res += np.sum(model_end[startE:endS, 0]) - # start part of the peak - res += weigh_livelihood(startS, startE, model_start, model_cove, model_end, False) - # end part of the peak - res += weigh_livelihood(endS, endE, model_end, model_cove, model_start, True) - return res - - -def likelihood_flexible_universe(model_folder, universe, save_peak_input=False): - curent_chrom = "" - missing_chrom = "" - empty_start = 0 
- res = 0 - check_if_uni_sorted(universe) - model_lh = ModelLH(model_folder) - chroms = model_lh.chromosomes_list - if save_peak_input: - output = [] - e = 0 # number of processed chromosomes - with open(universe) as uni: - for line in uni: - i = line.split("\t") - peak_start_s, peak_end_e = int(i[1]), int(i[2]) - peak_start_e, peak_end_s = int(i[6]), int(i[7]) - if i[0] == missing_chrom: - pass - else: - if i[0] != curent_chrom: - if i[0] in chroms: - model_lh.clear_chrom(curent_chrom) - if e != 0: - # if we read any chromosomes add to result background - # likelihood of part of the genome after the last region - res += background_likelihood( - empty_start, - len(model_start), - model_start, - model_core, - model_end, - ) - curent_chrom = i[0] - e += 1 - model_lh.read_chrom(curent_chrom) - model_start = model_lh.chromosomes_models[curent_chrom].models[ - "start" - ] - model_core = model_lh.chromosomes_models[curent_chrom].models[ - "core" - ] - model_end = model_lh.chromosomes_models[curent_chrom].models[ - "end" - ] - - else: - print(f"Chromosome {i[0]} missing from model") - missing_chrom = i[0] - res += background_likelihood( - empty_start, peak_start_s, model_start, model_core, model_end - ) - peak_likelihood = flexible_peak_likelihood( - peak_start_s, - peak_start_e, - peak_end_s, - peak_end_e, - model_start, - model_core, - model_end, - ) - res += peak_likelihood - if save_peak_input: - backgroung = background_likelihood( - peak_start_s, peak_end_e, model_start, model_core, model_end - ) - contribution = peak_likelihood - backgroung - output.append("{}\t{}\n".format(line.strip("\n"), contribution)) - empty_start = peak_end_e - - res += background_likelihood( - empty_start, len(model_start), model_start, model_core, model_end - ) - if save_peak_input: - print("saving") - with open(universe + "_peak_likelihood", "w") as f: - f.writelines(output) - return res diff --git a/gitk/assess/recovered.py b/gitk/assess/recovered.py deleted file mode 100644 index 
d2e6da67..00000000 --- a/gitk/assess/recovered.py +++ /dev/null @@ -1,151 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -import os -from .utils import chrom_cmp_bigger, process_db_line, prep_data, check_if_uni_sorted -import numpy as np -from multiprocessing import Pool - - -def process_region( - start, start_region, q_chrom, start_region_chrom, db, b_index, e_index -): - """ - For given position check if it is covered by universe flexible region - :param int start: position to analyse from query peak - :param list start_region: last analysed region from universe - :param str q_chrom: chromosome of query peak - :param str start_region_chrom: chromosome of universe region - :param file db: universe file - :param int b_index: index of region beginning in line from universe - :param int e_index: index of region ending in line from universe - :return: whether peak is covered; current region from universe; current chromosome of region from universe - """ - while chrom_cmp_bigger(q_chrom, start_region_chrom): - dn = db.readline().strip("\n") - if dn == "": - break - start_region, start_region_chrom = process_db_line(dn, [b_index, e_index]) - while start > start_region[1] and q_chrom == start_region_chrom: - dn = db.readline().strip("\n") - if dn == "": - break - start_region, start_region_chrom = process_db_line(dn, [b_index, e_index]) - if start_region[0] <= start < start_region[1] and q_chrom == start_region_chrom: - return True, start_region, start_region_chrom - return False, start_region, start_region_chrom - - -def calc_no_retrieve(db_file, q_folder, q_file): - """ - Calculate percent of strats and ends covered by flexible universe for given file - :param str db_file: path to universe file - :param str q_folder: path to folder containing query files - :param str q_file: file name - :return: file name; number of peaks in file; number of peaks with start covered by universe; - percent of peaks with start covered by universe; number of peaks with end 
covered by universe; - percent of peaks with end covered by universe; number of peaks with at least on end covered by universe; - percent of peaks with at least on end covered by universe; number of peaks with both ends covered by universe; - percent of peaks with both ends covered by universe; - """ - prep_data(q_folder, q_file) - q = open(os.path.join("tmp", q_file + "_sorted"), "r") - db_start = open(db_file) - d_start = db_start.readline().strip("\n") - start_region, start_region_chrom = process_db_line(d_start, [1, 6]) - db_end = open(db_file) - d_end = db_end.readline().strip("\n") - end_region, end_region_chrom = process_db_line(d_end, [1, 6]) - res_start = 0 - res_end = 0 - res_or = 0 - res_and = 0 - q_len = 0 - for i in q: - q_len += 1 - i = i.split("\t") - start = int(i[1]) - end = int(i[2]) - q_chrom = i[0] - start_out = process_region( - start, start_region, q_chrom, start_region_chrom, db_start, 1, 6 - ) - (found_start, start_region, start_region_chrom) = start_out - end_out = process_region( - end, end_region, q_chrom, end_region_chrom, db_end, 7, 2 - ) - (found_end, end_region, end_region_chrom) = end_out - if found_start: - res_start += 1 - if found_end: - res_end += 1 - if found_start or found_end: - res_or += 1 - if found_start and found_end: - res_and += 1 - os.remove(os.path.join("tmp", q_file + "_sorted")) - return ( - q_file, - q_len, - res_start, - res_start / q_len * 100, - res_end, - res_end / q_len * 100, - res_or, - res_or / q_len * 100, - res_and, - res_and / q_len * 100, - ) - - -def run_recovered( - q_folder, file_list, db_file, npool, save_to_file=False, folder_out=None, pref=None -): - """ - Calculate percent of strats and ends covered by flexible universe for set of files - :param str q_folder: path to folder containing query files - :param str file_list: path to file containing list of query files - :param str db_file: path to universe file - :param int npool: number of parallel processes - :param bool save_to_file: whether to save 
median of calculated distances for each file - :param str folder_out: output folder - :param str pref: prefix used for saving - :return: mean of percent of strats covered by flexible universe; - mean of percent of ends covered by flexible universe; - mean of percent of regions with at least one end covered by flexible universe; - mean of percent of regions with both ends covered by flexible universe - """ - check_if_uni_sorted(db_file) - if folder_out: - os.makedirs(folder_out, exist_ok=True) - os.mkdir("tmp") - files = open(file_list).read().split("\n")[:-1] - res = [] - if npool <= 1: - for i in files: - r = calc_no_retrieve(db_file, q_folder, i) - res.append(r) - else: - with Pool(npool) as p: - args = [(db_file, q_folder, f) for f in files] - res = p.starmap(calc_no_retrieve, args) - os.rmdir("tmp") - if save_to_file: - fout = os.path.join(folder_out, pref + "_data.tsv") - with open(fout, "w") as o: - header = [ - "file", - "peak_no", - "peak_start_percent", - "peak_end_percent", - "peak_or_percent", - "peak_and_percent", - ] - o.write("\t".join(header) + "\n") - for r in res: - o.write(f"{r[0]}\t{r[1]}\t{r[3]}\t{r[5]}\t{r[7]}\t{r[9]}\n") - else: - res = np.array(res) - res = res[:, [1, 3, 5, 7, 9]] - res = res.astype("float") - return np.mean(res, axis=0) diff --git a/gitk/bedspace/README.md b/gitk/bedspace/README.md deleted file mode 100644 index c3637418..00000000 --- a/gitk/bedspace/README.md +++ /dev/null @@ -1,84 +0,0 @@ -# BEDSpace -## Overview -`bedspace` uses the StarSpace method (Wu et al., 2018) to jointly embed genomic interval regions sets with associated metadata into a shared latent embedding space. This facilitates fast search and retrieval of similar region sets and their associated metadata. - -## Installation -`bedspace` is a part of the `gitk` package, which can be installed from PyPI using `pip install gitk`. 
To ensure that everything is working correctly, run the following command: - -``` -python -c "from gitk import bedspace" -``` - -## Usage -There are four main commands in `bedspace`: -1. `bedspace preprocess`: preprocesses a set of genomic interval regions and their associated metadata into a format that can be used by `bedspace train`. -2. `bedspace train`: trains a StarSpace model on the preprocessed data. -3. `bedspace distances`: computes the distances between all of the trained model's region sets and meta data labels. -4. `bedspace search` searches for the most similar region sets and metadata labels to a given query. There are three scenarios in this command, which will be described in turn. - -### `bedspace preprocess` -The `preprocess` command will prepare a set of region sets and metadata labels for training. This includes things like adding the `__label__` prefix to metadata labels, and converting the region sets into a format that can be used by StarSpace. The command takes in a set of region sets and metadata labels, and outputs a set of preprocessed region sets and metadata labels. The command can be run as follows: - -``` -gitk bedspace preprocess \ - --input \ - --metadata \ - --universe \ - --labels \ - --output -``` -### `bedspace train` -The `train` command will train a StarSpace model on the preprocessed region sets and metadata labels. It requires that you have ran the `preprocess` command first. The `train` command takes in a set of preprocessed region sets and metadata labels, and outputs a trained StarSpace model. The command can be run as follows: - -``` -gitk bedspace train \ - --path-to-starspace \ - --input \ - --output \ - --dim \ - --epochs \ - --lr -``` - -### `bedspace distances` -The `distances` command will compute the distances between all of the region sets and metadata labels in the trained model. It requires that you have ran the `train` command first. 
The `distances` command takes in a trained StarSpace model, and outputs a set of distances between all of the region sets and metadata labels in the model. The command can be run as follows: - -``` -gitk bedspace distances \ - --input \ - --metadata \ - --universe \ - --labels \ - --files \ - --output -``` - -### `bedspace search` -There are three scenarios when using the `search` command. 1) You have a query region set and want to find the most similar metadata labels, 2) You have a query metadata label and want to find the most similar region sets, and 3) You have a query region set and want to find the most similar region sets. These are labeled `r2l`, ``l2r``, and `r2r` respectively. The `search` command requires that you have ran the `distances` command first. The `search` command requires you to specify the search type so it knows which scenario you are using. It also requires a query. Example usages for each type are given below: - -#### `r2l` -``` -gitk bedspace search \ - -t lr2 - -d \ - -n \ - path/to/regions.bed -``` - -#### `l2r` -``` -gitk bedspace search \ - -t rl2 - -d \ - -n \ - K562 -``` - -#### `r2r` -``` -gitk bedspace search \ - -t rr2 - -d \ - -n \ - path/to/regions.bed -``` \ No newline at end of file diff --git a/gitk/cli.py b/gitk/cli.py deleted file mode 100644 index c6e4d627..00000000 --- a/gitk/cli.py +++ /dev/null @@ -1,238 +0,0 @@ -from typing import Dict -import logmuse -import sys - -from ubiquerg import VersionInHelpParser - -from .assess.cli import build_mode_parser as assess_subparser -from .eval.cli import build_subparser as eval_subparser -from .hmm.cli import build_subparser as hmm_subparser -from .likelihood.cli import build_subparser as likelihood_subparser -from .scembed.argparser import build_argparser as scembed_subparser -from .bedspace.cli import build_argparser as bedspace_subparser - -from ._version import __version__ - - -def build_argparser(): - """ - Builds argument parser. 
- - :return argparse.ArgumentParser - """ - - banner = "%(prog)s - Genomic Interval toolkit" - additional_description = "\nhttps://gitk.databio.org" - - parser = VersionInHelpParser( - prog="gitk", - version=f"{__version__}", - description=banner, - epilog=additional_description, - ) - - # Individual subcommands - msg_by_cmd = { - "hmm": "Use an HMM to build a consensus peak set.", - "lh": "Compute likelihood", - "assess": "Assess a universe", - "scembed": "Embed single-cell data as region vectors", - "bedspace": "Coembed regionsets (bed files) and labels", - } - - sp = parser.add_subparsers(dest="command") - subparsers: Dict[str, VersionInHelpParser] = {} - for k, v in msg_by_cmd.items(): - subparsers[k] = sp.add_parser(k, description=v, help=v) - - # build up subparsers for modules - subparsers["hmm"] = hmm_subparser(subparsers["hmm"]) - subparsers["assess"] = assess_subparser(subparsers["assess"]) - subparsers["lh"] = likelihood_subparser(subparsers["lh"]) - subparsers["scembed"] = scembed_subparser(subparsers["scembed"]) - subparsers["bedspace"] = bedspace_subparser(subparsers["bedspace"]) - - return parser - - -def main(test_args=None): - parser = logmuse.add_logging_options(build_argparser()) - args, _ = parser.parse_known_args() - if test_args: - args.__dict__.update(test_args) - - global _LOGGER - _LOGGER = logmuse.logger_via_cli(args, make_root=True) - - if args.command is None: - parser.print_help(sys.stderr) - sys.exit(1) - - _LOGGER.info(f"Command was: {args.command}") - - if args.command == "assess": - _LOGGER.info(f"Subcommand: {args.subcommand}") - if args.subcommand == "distance": - from .assess.distance import run_distance - - run_distance( - args.raw_data_folder, - args.file_list, - args.universe, - args.npool, - args.flexible, - args.save_to_file, - args.folder_out, - args.pref, - args.save_each, - ) - - if args.subcommand == "intersection": - from .assess.intersection import run_intersection - - run_intersection( - args.raw_data_folder, - 
args.file_list, - args.universe, - args.npool, - args.save_to_file, - args.folder_out, - args.pref, - ) - - if args.subcommand == "recovered": - from .assess.recovered import run_recovered - - run_recovered( - args.raw_data_folder, - args.file_list, - args.universe, - args.npool, - args.save_to_file, - args.folder_out, - args.pref, - ) - - if args.command == "lh": - _LOGGER.info(f"Subcommand: {args.subcommand}") - if args.subcommand == "build_model": - from .likelihood.build_model import main - - main( - args.model_folder, - args.coverage_folder, - args.coverage_prefix, - args.file_no, - ) - - if args.subcommand == "universe_hard": - from .likelihood.universe_hard import main - - main( - args.coverage_file, - args.fout, - args.merge, - args.filter_size, - args.cut_off, - ) - - if args.subcommand == "universe_flexible": - from .likelihood.universe_flexible import main - - main(args.model_folder, args.output_file) - - if args.command == "hmm": - from .hmm.hmm import run_hmm_save_bed - - run_hmm_save_bed( - coverage_folder=args.cov_folder, - out_file=args.out_file, - prefix=args.coverage_prefix, - normalize=args.normalize, - save_max_cove=args.save_max_cove, - ) - - if args.command == "scembed": - from .scembed.scembed import main as scembed_main - - _LOGGER.info("Running scembed") - pass - # scembed_main(test_args) - - return - - if args.command == "bedspace": - from .bedspace.const import PREPROCESS_CMD, TRAIN_CMD, DISTANCES_CMD, SEARCH_CMD - - _LOGGER.info(f"Subcommand: {args.subcommand}") - - if args.subcommand == PREPROCESS_CMD: - from .bedspace.pipeline.preprocess import main as preprocess_main - - _LOGGER.info("Running bedspace preprocess") - preprocess_main( - args.input, args.metadata, args.universe, args.output, args.labels - ) - - elif args.subcommand == TRAIN_CMD: - from .bedspace.pipeline.train import main as train_main - - _LOGGER.info("Running bedspace train") - train_main( - args.path_to_starspace, - args.input, - args.output, - args.num_epochs, - 
args.dim, - args.learning_rate, - ) - - elif args.subcommand == DISTANCES_CMD: - from .bedspace.pipeline.distances import main as distances_main - - _LOGGER.info("Running bedspace distances") - distances_main( - args.input, - args.metadata, - args.universe, - args.output, - args.labels, - args.files, - args.threshold, - ) - - elif args.subcommand == SEARCH_CMD: - from .bedspace.const import SearchType - from .bedspace.pipeline.search import run_scenario1 as scenario1 - from .bedspace.pipeline.search import run_scenario2 as scenario2 - from .bedspace.pipeline.search import run_scenario3 as scenario3 - - if args.type == SearchType.l2r: - _LOGGER.info("Running bedspace search (scenario 1)") - scenario1( - args.query, - args.distances, - args.num_results, - ) - elif args.type == SearchType.r2l: - _LOGGER.info("Running bedspace search (scenario 2)") - scenario2( - args.query, - args.distances, - args.num_results, - ) - elif args.type == SearchType.l2l: - _LOGGER.info("Running bedspace search (scenario 3)") - scenario3( - args.query, - args.distances, - args.num_results, - ) - - else: - # print help for this subcommand - _LOGGER.info("Running bedspace help") - - -if __name__ == "__main__": - main() diff --git a/gitk/const.py b/gitk/const.py deleted file mode 100644 index deb91e45..00000000 --- a/gitk/const.py +++ /dev/null @@ -1 +0,0 @@ -PKG_NAME = "gitk" diff --git a/gitk/eval/README.md b/gitk/eval/README.md deleted file mode 100644 index d5ab8652..00000000 --- a/gitk/eval/README.md +++ /dev/null @@ -1,105 +0,0 @@ -# Evaluation of Genomic Region Embeddings - -## Genome Distance Test -Evaluate how well genomic region embeddings preserve the structure (relative closeness) of genomic regions on the genome. 
- -``` -from gitk.eval.genome_distance import * -import numpy as np - -model_path = '/path/to/a/region2vec/model/' -boundaries = np.linspace(0, 1e8, 5) # four bins -result = genome_distance_test(model_path, 'region2vec', boundaries, num_samples=1000, seed=0) - -avgGD = result['AvgGD'] -avgED = result['AvgED'] -gdt_plot_fitted(avgGD, avgED, 'result.png') - - -# process a batch of two models -model_path1 = '/path/to/the/region2vec/model1/' -model_path2 = '/path/to/the/region2vec/model2/' -batch = [(model_path1, 'region2vec'), (model_path2, 'base')] # (model_path, embed_type) -result_list = genome_distance_test_batch(batch, boundaries, num_samples=1000, seed=0) - -slope1 = result_list[0]['Slope'] -error1 = result_list[0]['AvgED'] -AvgGD1 = result_list[0]['AvgGD'] -AvgED1 = result_list[0]['AvgED'] -model_path1 = result_list[0]['Path'] - -slope2 = result_list[1]['Slope'] -error2 = result_list[1]['Error'] -AvgGD2 = result_list[1]['AvgGD'] -AvgED2 = result_list[1]['AvgED'] -model_path2 = result_list[1]['Path'] - -# Run the genome distance test 20 times for the two models -row_labels = ['model1-region2vec', 'model2-base'] -slope_list, approx_err_list = gdt_eval(batch, boundaries, num_runs=20, num_samples=1000, save_folder=None) - -# plot the genome distance test figure -gdt_box_plot(slope_list, approx_err_list, row_labels, filename='gdt_result.png') -``` - -## Neighborhood Preserving Test -Evaluate how significant genomic region embeddings preserve their neighboring regions on the genome against random embeddings. 
- -``` -from gitk.eval.neighborhood_preserving import * -model_path = '/path/to/a/region2vec/model/' -embed_type = 'region2vec' -K = 50 -result = neighborhood_preserving_test(model_path, embed_type, K, num_samples=1000, seed=0) -print(result['SNPR'][0]) - -# process a batch of two models -model_path1 = '/path/to/the/region2vec/model1/' -model_path2 = '/path/to/the/region2vec/model2/' -batch = [(model_path1, 'region2vec'), (model_path2, 'base')] # (model_path, embed_type) -result_list = neighborhood_preserving_test_batch(batch, K, num_samples=1000, seed=0) -print(result_list[0]['SNPR'][0]) # SNPR for model1 -print(result_list[1]['SNPR'][0]) # SNPR for model2 - -# Run the genome distance test 20 times for the two models, setting save_folder will save the result for each run -snpr_results = npt_eval(batch, K, num_samples=1000, num_runs=20, save_folder=None) - -# plot the neighborhood preserving test figure -row_labels = ['model1-region2vec', 'model2-base'] -snpr_plot(snpr_results, row_labels, filename='snpr_result.png') -``` - -## Clustering Significance Test -Evaluate how well the genomic region embeddings can form biologically meaningful clusters. The metric we use for a set of genomic region embeddings reflects how well these embeddings can separate clusters that are related to transcription start sites from those are not related to transcription start sites. -Since we do not know a priori the true number of clusters for a set of region embeddings, we specify `K_arr` to include several possible numbers of clusters. - -The functions require running R scripts with the `GenomicDistributions` and `optparse` packages. 
-``` -from gitk.eval.clustering_significance_test import * - -# process a single model (a set of genomic region embeddings) -model_path = '/path/to/a/region2vec/model/' -embed_type = 'region2vec' -save_folder = '/path/to/cst/results/' -Rscript_path = '/path/to/Rscript/' -assembly = 'hg19' -num_samples = 1000 -K_arr = [5, 20, 40, 60, 100] -threshold = 0.0001 # significance threshold -scores = clustering_significance_test(model_path, embed_type, save_folder, Rscript_path, assembly, K_arr, num_samples, threshold) - -# process a batch of two models -model_path1 = '/path/to/the/region2vec/model1/' -model_path2 = '/path/to/the/region2vec/model2/' -batch = [(model_path1, 'region2vec'), (model_path2, 'base')] # (model_path, embed_type) - -# since we have more than one models, we can rank them based on the average scores over different Ks -scores_batch, avg_ranks = clustering_significance_test_batch(batch, save_folder, Rscript_path, assembly, K_arr, num_samples, threshold) - -# average ranks after running clustering_significance_test_batch num_runs times -avg_ranks_arr = cst_eval(batch, K, save_folder, Rscript_path, assembly, K_arr, num_samples, threshold, num_runs=20) - -# plot the average ranks for the two models -row_labels = ['model1-region2vec', 'model2-base'] -cst_plot(avg_ranks_arr, row_labels, filename='cst_result.png') -``` \ No newline at end of file diff --git a/gitk/eval/__init__.py b/gitk/eval/__init__.py deleted file mode 100644 index 5555f0f7..00000000 --- a/gitk/eval/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .get_base_embeddings import save_base_embeddings, load_base_embeddings -from .genome_distance import load_genomic_embeddings diff --git a/gitk/eval/cli.py b/gitk/eval/cli.py deleted file mode 100644 index 57d6401b..00000000 --- a/gitk/eval/cli.py +++ /dev/null @@ -1,2 +0,0 @@ -def build_subparser(parser): - return False diff --git a/gitk/eval/clustering_significance.py b/gitk/eval/clustering_significance.py deleted file mode 100644 index 
fbd81fd0..00000000 --- a/gitk/eval/clustering_significance.py +++ /dev/null @@ -1,276 +0,0 @@ -import pickle -import numpy as np -import glob -import os -import time -from sklearn.cluster import KMeans -from tqdm import tqdm -import shutil -from scipy.stats import rankdata -import matplotlib.pyplot as plt -from matplotlib.lines import Line2D -import multiprocessing as mp -from gitk.eval import load_genomic_embeddings -import subprocess - - -def random_annotate_points( - labels, num_per_cluster=10, min_num_per_cluster=1, max_font_size=20, min_font_size=8 -): - cluster_labels = np.unique(labels) # sorted - positions = np.arange(len(labels)) - if cluster_labels[0] == -1: - print("Number of clusters: {}".format(len(cluster_labels) - 1)) - else: - print("Number of clusters: {}".format(len(cluster_labels))) - clusters = [positions[labels == c] for c in cluster_labels] - ratios = np.array([len(clusters[i]) / len(labels) for i in range(len(clusters))]) - annotate_arr = [] - for i, c in enumerate(cluster_labels): - num = len(clusters[i]) - annotate_num = max( - min(int(ratios[i] / ratios.max() * num_per_cluster), num), - min(num, min_num_per_cluster), - ) - fsize = max(int(ratios[i] / ratios.max() * max_font_size), min_font_size) - indices = np.random.permutation(num)[0:annotate_num] - pos = clusters[i][indices] - annotate_arr.extend([(p, c, fsize) for p in pos]) - return annotate_arr - - -def assign_color_by_size(cmap_name, labels): - cluster_labels = np.unique(labels) # sorted - cluster_sizes = [(c, (labels == c).sum()) for c in cluster_labels] - cluster_sizes = sorted(cluster_sizes, key=lambda x: -x[1]) - color_mapping = {c: i for i, (c, s) in enumerate(cluster_sizes) if c != -1} - cmap = plt.get_cmap(cmap_name) - colors = cmap(np.linspace(0, 1, len(color_mapping))) - # add outlier color - color_mapping[-1] = len(color_mapping) - colors = np.vstack([colors, [0.0, 0, 0, 1.0]]) - label_colors = [colors[color_mapping[l]] for l in labels] - return label_colors - - -def 
get_cluster_regions(cluster_idx, labels, vocab, path): - def region2tuple(x): - eles = x.split(":") - chr_name = eles[0].strip() - start, end = eles[1].split("-") - start, end = int(start.strip()), int(end.strip()) - return chr_name, start, end - - positions = np.arange(len(labels)) - indices = positions[labels == cluster_idx] - regions = [region2tuple(vocab[i]) for i in indices] - os.makedirs(path, exist_ok=True) - with open(os.path.join(path, "cluster_{}.bed".format(cluster_idx)), "w") as f: - for chr_name, start, end in regions: - f.write("{}\t{}\t{}\n".format(chr_name, start, end)) - - -def clustering(model_path, embed_type, K, save_folder, seed=0): - np.random.seed(seed) - embeds, vocab = load_genomic_embeddings(model_path, embed_type) - - clustering = KMeans(n_clusters=K, random_state=seed).fit(embeds) - labels = clustering.labels_ - cluster_idxes = np.sort(np.unique(labels)) - for c in cluster_idxes: - get_cluster_regions(c, labels, vocab, save_folder) - with open(os.path.join(save_folder, "labels.pickle"), "wb") as f: - pickle.dump(labels, f) - - -def clustering_batch(batch, K, save_folder, seed=0, num_workers=10): - worker_func = clustering - with mp.Pool(processes=num_workers) as pool: - all_processes = [] - for i, (path, embed_type) in enumerate(enumerate(batch)): - folder = os.path.join(save_folder, "model_{}".format(i)) - os.makedirs(folder, exist_ok=True) - process = pool.apply_async(worker_func, (path, embed_type, K, folder)) - all_processes.append(process) - for process in all_processes: - process.get() - - -def cal_significance_val(pvals, threshold): - num = (pvals < threshold).sum() + (pvals > 1 - threshold).sum() - return num / len(pvals) - - -def clustering_significance_test( - model_path, - embed_type, - save_folder, - Rscript_path, - assembly, - K_arr=[5, 20, 40], - num_samples=1000, - threshold=0.0001, - num_workers=10, - seed=0, -): - for K in K_arr: - target_folder = os.path.join(save_folder, "Kmeans_{}".format(K)) - clustering(model_path, 
embed_type, K, target_folder, seed=0) - curr_folder = os.path.dirname(os.path.abspath(__file__)) - subprocess.call( - [ - Rscript_path, - "{}/permutation_test.R".format(curr_folder), - "--assembly", - assembly, - "--num_workers", - str(num_workers), - "--path", - save_folder, - "--num_samples", - str(num_samples), - ] - ) - scores = [] - for K in K_arr: - target_path = os.path.join(save_folder, "Kmeans_{}".format(K), "pvals.txt") - with open(target_path, "r") as f: - pvals = f.readlines() - pvals = np.array([float(p.strip()) for p in pvals]) - score = cal_significance_val(pvals, threshold) - scores.append(score) - print(model_path) - print( - "(K, CCSI): " - + " ".join( - ["({},{:.6f})".format(K_arr[i], scores[i]) for i in range(len(K_arr))] - ) - ) - print("\n") - return scores - - -def clustering_significance_test_batch( - batch, - save_folder, - Rscript_path, - assembly, - K_arr=[5, 20, 40, 60, 100], - num_samples=1000, - threshold=0.0001, - num_workers=10, - seed=0, -): - scores_batch = [] - for i, (model_path, embed_type) in enumerate(batch): - target_folder = os.path.join(save_folder, "model_{}".format(i)) - scores = clustering_significance_test( - model_path, - embed_type, - target_folder, - Rscript_path, - assembly, - K_arr, - num_samples, - threshold, - num_workers, - seed, - ) - scores_batch.append(scores) - scores_batch = np.array(scores_batch) - avg_ranks = rankdata(-scores_batch, method="average", axis=0).mean(axis=1) - return scores_batch, avg_ranks - - -def cst_eval( - batch, - save_folder, - Rscript_path, - assembly, - K_arr=[5, 20, 40, 60, 100], - num_samples=1000, - threshold=0.0001, - num_runs=20, - num_workers=10, -): - avg_ranks_arr = [] - for seed in range(num_runs): - target_folder = os.path.join(save_folder, "cst_seed{}".format(seed)) - scores_batch, avg_ranks = clustering_significance_test_batch( - batch, - target_folder, - Rscript_path, - assembly, - K_arr, - num_samples, - threshold, - num_workers, - seed, - ) - 
avg_ranks_arr.append(avg_ranks) - avg_ranks_arr = np.vstack(avg_ranks_arr) - avg_ranks_arr = [(batch[i][0], avg_ranks_arr[:, i]) for i in range(num_runs)] - return avg_ranks_arr - - -def cst_plot(avg_ranks_arr, row_labels=None, legend_pos=(0.25, 0.6), filename=None): - mean_rank = [t[1].mean() for t in avg_ranks_arr] - std_rank = [t[1].std() for t in avg_ranks_arr] - mean_rank_tuple = [(i, r) for i, r in enumerate(mean_rank)] - mean_rank_tuple = sorted(mean_rank_tuple, key=lambda x: x[1]) - indexes = [t[0] for t in mean_rank_tuple] - - if row_labels is None: - row_labels = [t[0] for t in avg_ranks_arr] - - mean_rank = [mean_rank[i] for i in indexes] - std_rank = [std_rank[i] for i in indexes] - row_labels = [row_labels[i] for i in indexes] - - cmap = plt.get_cmap("Set1") - cmaplist = [cmap(i) for i in range(9)] - fig, ax = plt.subplots(figsize=(10, 6)) - ax.set_xticks(list(range(1, len(mean_rank) + 1))) - - ax.errorbar( - range(1, len(mean_rank) + 1), - mean_rank, - yerr=std_rank, - fmt="o", - ms=10, - mfc=cmaplist[1], - mec=cmaplist[8], - ecolor=cmaplist[2], - elinewidth=3, - capsize=5, - ) - ax.set_xticklabels(row_labels) - ax.set_ylabel("CCSI Rank") - _ = plt.setp( - ax.get_xticklabels(), rotation=-15, ha="left", va="top", rotation_mode="anchor" - ) - patches = [ - Line2D( - [0], - [0], - marker="o", - linestyle="", - color=cmaplist[1], - markersize=12, - mec=cmaplist[8], - ), - Line2D([0], [0], color=cmaplist[2], lw=4), - ] - legend = ax.legend( - labels=["CCSI average rank", "CCSI rank standard deviation"], - handles=patches, - bbox_to_anchor=legend_pos, - loc="center left", - borderaxespad=0, - fontsize=12, - frameon=True, - ) - ax.grid("on") - ax.set_ylim(ax.get_ylim()[::-1]) - if filename: - fig.savefig(filename, bbox_inches="tight") diff --git a/gitk/eval/genome_distance.py b/gitk/eval/genome_distance.py deleted file mode 100644 index 087e0633..00000000 --- a/gitk/eval/genome_distance.py +++ /dev/null @@ -1,714 +0,0 @@ -import pickle -import os - 
-os.environ["OPENBLAS_NUM_THREADS"] = "1" -import numpy as np -import random -import glob -import time -import time -import multiprocessing as mp -import argparse -from gensim.models import Word2Vec -import matplotlib.pyplot as plt -import matplotlib -from matplotlib.lines import Line2D -from matplotlib.patches import Patch -from sklearn.metrics import r2_score -from gitk.eval import load_base_embeddings - -_log_path = None - - -def set_log_path(path): - global _log_path - _log_path = path - - -def log(obj, filename="log.txt"): - print(obj) - if _log_path is not None: - with open(os.path.join(_log_path, filename), "a") as f: - f.write(obj) - f.write("\n") - - -def load_genomic_embeddings(model_path, embed_type="region2vec"): - if embed_type == "region2vec": - model = Word2Vec.load(model_path) - regions_r2v = model.wv.index_to_key - embed_rep = model.wv.vectors - return embed_rep, regions_r2v - elif embed_type == "base": - embed_rep, regions_r2v = load_base_embeddings(model_path) - return embed_rep, regions_r2v - - -class Timer: - def __init__(self): - self.o = time.time() - - def measure(self, p=1): - x = (time.time() - self.o) / float(p) - x = int(x) - if x >= 3600: - return "{:.1f}h".format(x / 3600) - if x >= 60: - return "{}m".format(round(x / 60)) - return "{}s".format(x) - - -# function calculating the chromosome distance between two regions -func_gdist = lambda u, v: float(u[1] < v[1]) * max(v[0] - u[1] + 1, 0) + float( - u[1] >= v[1] -) * max(u[0] - v[1] + 1, 0) - - -def embed_distance(x1, x2, metric): - if metric == "cosine": - n1 = np.linalg.norm(x1) - n2 = np.linalg.norm(x2) - dist = 1 - np.dot(x1 / n1, x2 / n2) - elif metric == "euclidean": - dist = np.linalg.norm(x1 - x2) - else: - raise ("Invalid metric function") - return dist - - -def sample_pair(chromo_regions, chromo_ratios): - chromo_arr = [t[0] for t in chromo_ratios] - probs = [t[1] for t in chromo_ratios] - chromo = np.random.choice(chromo_arr, p=probs) - region_arr = chromo_regions[chromo] - 
- idx1 = np.random.randint(len(region_arr)) - idx2 = np.random.randint(len(region_arr)) - while idx1 == idx2: - idx2 = np.random.randint(len(region_arr)) - gdist = func_gdist(region_arr[idx1], region_arr[idx2]) - return chromo, idx1, idx2, gdist - - -def bin_search(boundaries, val): - left = 0 - right = len(boundaries) - 1 - if val < boundaries[left] or val > boundaries[right]: - return -1 - while left < right: - mid = int((left + right) / 2) - if boundaries[mid] == val: - return mid - 1 - elif boundaries[mid] > val: - right = mid - else: - left = mid + 1 - return left - 1 - - -def fill_bins_via_sampling( - embed_rep, - embed_bin, - regions_vocab, - boundaries, - num_per_bin, - dist_metric, - sum_statistic, - seed, -): - np.random.seed(seed) - num, dim = embed_rep.shape - - embed_rep_ref = (np.random.rand(num, dim) - 0.5) / dim - region2index = {r: i for i, r in enumerate(regions_vocab)} - # Group regions by chromosomes - chromo_regions = {} - embed_dict = {} - for i, v in enumerate(regions_vocab): - chromo, region = v.split(":") # e.g. 
chr1:100-1000 - chromo = chromo.strip() # remove possible spaces - region = region.strip() # remove possible spaces - start, end = region.split("-") - start = int(start.strip()) - end = int(end.strip()) - if chromo_regions.get(chromo, None) is None: - chromo_regions[chromo] = [(start, end)] - embed_dict[chromo] = [i] - else: - chromo_regions[chromo].append((start, end)) - embed_dict[chromo].append(i) - - chromo_ratios = [] - for i, chromo in enumerate(chromo_regions): - chromo_ratios.append((chromo, len(chromo_regions[chromo]) / len(regions_vocab))) - - num_bins = len(boundaries) - 1 - groups = [[] for i in range(num_bins)] - counts = np.array([0 for i in range(num_bins)]) - overlaps = np.array([0 for i in range(num_bins)]) - total_samples = num_per_bin * num_bins - num_try = 0 - MAX_TRY_NUMBER = 1e7 - while counts.sum() < total_samples: - while True: - num_try += 1 - chromo, idx1, idx2, gdist = sample_pair(chromo_regions, chromo_ratios) - bin_idx = bin_search(boundaries, gdist) - if bin_idx == -1: - continue - if counts[bin_idx] < num_per_bin: - break - if num_try >= MAX_TRY_NUMBER: - break - if num_try >= MAX_TRY_NUMBER: - break - emb_arr = embed_dict[chromo] - eidx1, eidx2 = emb_arr[idx1], emb_arr[idx2] - edist = embed_distance(embed_rep[eidx1], embed_rep[eidx2], dist_metric) - overlap_ratio = ( - embed_bin[eidx1] * embed_bin[eidx2] - ).sum() # /embed_bin.shape[1] - edist_ref = embed_distance( - embed_rep_ref[eidx1], embed_rep_ref[eidx2], dist_metric - ) - groups[bin_idx].append((gdist, edist, edist_ref)) - counts[bin_idx] += 1 - overlaps[bin_idx] += overlap_ratio - records = [] - for i in range(num_bins): - if counts[i] == 0: - avg_gd = -1 - avg_ed = -1 - avg_ed_ref = -1 - else: - if sum_statistic == "mean": - avg_gd = np.array([t[0] for t in groups[i]]).mean() - avg_ed = np.array([t[1] for t in groups[i]]).mean() - avg_ed_ref = np.array([t[2] for t in groups[i]]).mean() - elif sum_statistic == "median": - avg_gd = np.median(np.array([t[0] for t in groups[i]])) 
- avg_ed = np.median(np.array([t[1] for t in groups[i]])) - avg_ed_ref = np.median(np.array([t[2] for t in groups[i]])) - records.append((avg_gd, avg_ed, avg_ed_ref, counts[i], overlaps[i] / counts[i])) - return records - - -def convert_position(pos): - if pos // 1e6 > 0: - return "{:.4f} MB".format(pos / 1e6) - elif pos // 1e3 > 0: - return "{:.4f} KB".format(pos / 1e3) - else: - return "{:.4f} B".format(pos) - - -def get_slope(avgGD, avgED, log_xscale=False): - x = avgGD - x1 = x[x > 0] / 1e8 - y = avgED - y1 = y[x > 0] - if log_xscale: - x1 = np.log10(x1) - A = np.vstack([x1, np.ones(len(x1))]).T - lin_res = np.linalg.lstsq(A, y1, rcond=None) - m, c = lin_res[0] # slope, bias - r = lin_res[1][0] # approximation error - r2 = r2_score(y1, m * x1 + c) - return m, c, r2, x1, y1 - - -def genome_distance_test( - path, - embed_type, - boundaries, - num_samples=100, - metric="euclidean", - sum_statistic="mean", - seed=0, - queue=None, - worker_id=None, -): - embed_rep, regions_vocab = load_genomic_embeddings(path, embed_type) - bin_embed_path = os.path.join("/".join(path.split("/")[0:-3]), "bin_embed.pickle") - embed_bin, regions_bin = load_base_embeddings(bin_embed_path) - r2i = {r: i for i, r in enumerate(regions_bin)} - embed_bin = np.array([embed_bin[r2i[r]] for r in regions_vocab]) - res = fill_bins_via_sampling( - embed_rep, - embed_bin, - regions_vocab, - boundaries, - num_samples, - metric, - sum_statistic, - seed, - ) - msg1 = " ".join(["{:.4f}".format(r[0]) for r in res]) - msg2 = " ".join(["{:.4f}".format(r[1]) for r in res]) - msg3 = " ".join(["{:.4f}".format(r[2]) for r in res]) - msg4 = " ".join(["{:d}".format(r[3]) for r in res]) - msg5 = " ".join(["{:.4f}".format(r[4]) for r in res]) - - res_dict = { - "AvgGD": np.array([r[0] for r in res]), - "AvgED": np.array([r[1] for r in res]), - "AvgED_rand": np.array([r[2] for r in res]), - "num_samples": np.array([r[3] for r in res]), - "overlaps": np.array([r[4] for r in res]), - } - slope, bias, r2, x, y = 
get_slope(res_dict["AvgGD"], res_dict["AvgED"]) - res_dict["Slope"] = slope - res_dict["R2"] = r2 - res_dict["Path"] = path - msg = "[seed {}]: {}\n".format(seed, path) - msg += ( - "AvgGD: " - + msg1 - + "\n" - + "AvgED: " - + msg2 - + "\n" - + "Slope: {:.4f} R2: {:.4f}\n".format(slope, r2) - + "AvgED(random): " - + msg3 - + "\n" - + "Num Samples:" - + msg4 - + "\n" - + "Overlaped files:" - + msg5 - + "\n" - ) - print(msg) - if queue: - queue.put((worker_id, res_dict)) - return worker_id, res_dict, msg - else: - return res_dict - - -def genome_distance_test_batch( - batch, - boundaries, - num_samples=100, - metric="euclidean", - sum_statistic="mean", - seed=0, - num_workers=5, - save_path=None, -): - timer = Timer() - if num_workers <= 1: - res_list = [] - for path, embed_type in batch: - _, res, msg = genome_distance_test( - path, embed_type, boundaries, num_samples, metric, sum_statistic, seed - ) - res_list.append(res) - else: ## Multi-processing - manager = mp.Manager() - queue = manager.Queue() - with mp.Pool(processes=num_workers) as pool: - writer = pool.apply_async( - writer_multiprocessing, (save_path, len(batch), queue) - ) - all_processes = [] - for i, (path, embed_type) in enumerate(batch): - process = pool.apply_async( - genome_distance_test, - ( - path, - embed_type, - boundaries, - num_samples, - metric, - sum_statistic, - seed, - queue, - i, - ), - ) - all_processes.append(process) - - for process in all_processes: - process.get() - queue.put("kill") - res_list = writer.get() - if save_path: - os.makedirs(os.path.dirname(save_path), exist_ok=True) - with open(save_path, "wb") as f: - pickle.dump(res_list, f) - time_str = timer.measure() - print("Finished. 
Elasped time: " + time_str) - return res_list - - -def gdt_plot_fitted(avgGD, avgED, filename=None): - # plt.rcParams['text.usetex'] = True - fig, ax = plt.subplots(figsize=(5, 2.5)) - ratio, bias, r2, x, y = get_slope(avgGD, avgED) - ax.plot(x, y, "-^") - ax.plot(x, np.array(x) * ratio + bias, "r--") - t = ax.text( - 0.48, - 0.85, - "AvgGD={:.4f}*AvgED+{:.4f}".format(ratio, bias), - ha="center", - va="center", - size=15, - transform=ax.transAxes, - ) - ax.set_xlabel(r"AvgGD ($10^8$)") - ax.set_ylabel(r"AvgED") - if filename: - fig.savefig(filename, bbox_inches="tight") - - -def heatmap( - data, row_labels, col_labels, ax=None, cbar_kw=None, cbarlabel="", **kwargs -): - """ - Create a heatmap from a numpy array and two lists of labels. - - Parameters - ---------- - data - A 2D numpy array of shape (M, N). - row_labels - A list or array of length M with the labels for the rows. - col_labels - A list or array of length N with the labels for the columns. - ax - A `matplotlib.axes.Axes` instance to which the heatmap is plotted. If - not provided, use current axes or create a new one. Optional. - cbar_kw - A dictionary with arguments to `matplotlib.Figure.colorbar`. Optional. - cbarlabel - The label for the colorbar. Optional. - **kwargs - All other arguments are forwarded to `imshow`. - """ - - if ax is None: - ax = plt.gca() - - if cbar_kw is None: - cbar_kw = {} - - # Plot the heatmap - im = ax.imshow(data, **kwargs) - - # Create colorbar - cbar = ax.figure.colorbar(im, ax=ax, **cbar_kw) - cbar.ax.set_ylabel(cbarlabel, rotation=-90, va="bottom", fontsize=12) - - # Show all ticks and label them with the respective list entries. - ax.set_xticks(np.arange(data.shape[1])) - ax.set_xticklabels(col_labels, fontsize=12) - - ax.set_yticks(np.arange(data.shape[0])) - ax.set_yticklabels(row_labels, fontsize=12) - - # Let the horizontal axes labeling appear on top. 
- ax.tick_params(top=True, bottom=False, labeltop=True, labelbottom=False) - - # Rotate the tick labels and set their alignment. - plt.setp(ax.get_xticklabels(), rotation=-30, ha="right", rotation_mode="anchor") - - # Turn spines off and create white grid. - # ax.spines[:].set_visible(False) - - ax.set_xticks(np.arange(data.shape[1] + 1) - 0.5, minor=True) - ax.set_yticks(np.arange(data.shape[0] + 1) - 0.5, minor=True) - ax.grid(which="minor", color="w", linestyle="-", linewidth=3) - ax.tick_params(which="minor", bottom=False, left=False) - - return im, cbar - - -def annotate_heatmap( - im, - data=None, - valfmt="{x:.2f}", - textcolors=("black", "white"), - threshold=None, - fontsize=10, - **textkw -): - """ - A function to annotate a heatmap. - - Parameters - ---------- - im - The AxesImage to be labeled. - data - Data used to annotate. If None, the image's data is used. Optional. - valfmt - The format of the annotations inside the heatmap. This should either - use the string format method, e.g. "$ {x:.2f}", or be a - `matplotlib.ticker.Formatter`. Optional. - textcolors - A pair of colors. The first is used for values below a threshold, - the second for those above. Optional. - threshold - Value in data units according to which the colors from textcolors are - applied. If None (the default) uses the middle of the colormap as - separation. Optional. - **kwargs - All other arguments are forwarded to each call to `text` used to create - the text labels. - """ - - if not isinstance(data, (list, np.ndarray)): - data = im.get_array() - - # Normalize the threshold to the images color range. - if threshold is not None: - threshold = im.norm(threshold) - else: - threshold = im.norm(data.max()) / 2.0 - - # Set default alignment to center, but allow it to be - # overwritten by textkw. 
- kw = dict( - horizontalalignment="center", verticalalignment="center", fontsize=fontsize - ) - kw.update(textkw) - - # Get the formatter in case a string is supplied - if isinstance(valfmt, str): - valfmt = matplotlib.ticker.StrMethodFormatter(valfmt) - - # Loop over the data and create a `Text` for each "pixel". - # Change the text's color depending on the data. - texts = [] - for i in range(data.shape[0]): - for j in range(data.shape[1]): - if im.norm(data[i, j]) > 0.9 or im.norm(data[i, j]) < 0.1: - # if im.norm(data[i, j]) > 0.5: - index = 1 - else: - index = 0 - kw.update(color=textcolors[index]) - text = im.axes.text(j, i, valfmt(data[i, j], None), **kw) - texts.append(text) - - return texts - - -def get_gdt_results(save_paths): - with open(save_paths[0], "rb") as f: - results = pickle.load(f) - num = len(results) - r2_res = [[] for i in range(num)] - ratio_res = [[] for i in range(num)] - avg_gd_res = [[] for i in range(num)] - avg_ed_res = [[] for i in range(num)] - row_labels = ["" for i in range(num)] - for path in save_paths: - with open(path, "rb") as f: - results = pickle.load(f) - for i, res in enumerate(results): - key = res["Path"] - slope = res["Slope"] - r2 = res["R2"] - r2_res[i].append(r2) - ratio_res[i].append(slope) - avg_gd_res[i].append(res["AvgGD"]) - avg_ed_res[i].append(res["AvgED"]) - row_labels[i] = key.split("/")[-3] - avg_gd_res = np.array(avg_gd_res).mean(axis=1) - avg_ed_res = np.array(avg_ed_res).mean(axis=1) - mean_ratios = np.array(ratio_res).mean(axis=1) - mean_tuple = sorted( - [(i, m) for i, m in enumerate(mean_ratios)], key=lambda x: -x[1] - ) - indexes = [t[0] for t in mean_tuple] - mean_ratios = [t[1] for t in mean_tuple] - avg_ed_res = np.array([avg_ed_res[i] for i in indexes]) - row_labels = [row_labels[i] for i in indexes] - fig, ax = plt.subplots(figsize=(10, 20)) - im, cbar = heatmap( - avg_ed_res, - row_labels, - ["Group1", "Group2", "Group3", "Group4"], - ax=ax, - cmap="RdYlBu", - cbarlabel="AvgED", - ) - texts = 
annotate_heatmap(im, valfmt="{x:.2f}", fontsize=12) - fig.savefig("avged_heatmap.png") - return ratio_res, r2_res - - -def gdt_eval(batch, boundaries, num_runs=20, num_samples=1000, save_folder=None): - results_seeds = [] - for seed in range(num_runs): - print("----------------Run {}----------------".format(seed)) - save_path = ( - os.path.join(save_folder, "gdt_eval_seed{}".format(seed)) - if save_folder - else None - ) - result_list = genome_distance_test_batch( - batch, boundaries, num_samples=num_samples, seed=seed, save_path=save_path - ) - results_seeds.append(result_list) - - # get average slopes and R2 values for the two models - r2_res = [[] for i in range(len(batch))] - ratio_res = [[] for i in range(len(batch))] - for results in results_seeds: - for i, res in enumerate(results): - key = res["Path"] - slope = res["Slope"] - r2 = res["R2"] - r2_res[i].append(r2) - ratio_res[i].append(slope) - assert key == batch[i][0], "key == batch[i][0]" - - mean_ratio = [np.array(r).mean() for r in ratio_res] - std_ratio = [np.array(r).std() for r in ratio_res] - - mean_r2 = [np.array(e).mean() for e in r2_res] - std_r2 = [np.array(e).std() for e in r2_res] - - for i in range(len(mean_ratio)): - print( - "{}\n Slope (std): {:.4f} ({:.4f}) | R2 (std): {:.4f} ({:.4f}) \n".format( - batch[i][0], mean_ratio[i], std_ratio[i], mean_r2[i], std_r2[i] - ) - ) - return ratio_res, r2_res - - -def writer_multiprocessing(save_path, num, q): - results = [[] for i in range(num)] - while True: - m = q.get() - if m == "kill": - break - index = m[0] - results[index] = m[1] - if save_path: - os.makedirs(os.path.dirname(save_path), exist_ok=True) - with open(save_path, "wb") as f: - pickle.dump(results, f) - return results - - -def gdt_box_plot( - ratio_data, r2_data, row_labels=None, legend_pos=(0.25, 0.6), filename=None -): - cmap = plt.get_cmap("Set1") - cmaplist = [cmap(i) for i in range(9)] - # sort based on the mean slope values - ratio_data = [ - np.array(r) / np.sqrt(1 + 
np.array(r) * np.array(r)) for r in ratio_data - ] - mean_ratio = [(i, np.array(r).mean()) for i, r in enumerate(ratio_data)] - mean_ratio = sorted(mean_ratio, key=lambda x: -x[1]) - - indexes = [m[0] for m in mean_ratio] - mean_ratio = np.array([m[1] for m in mean_ratio]) - pos_slope_indexes = np.array([i for i in indexes if mean_ratio[i] > 0]) - neg_slope_indexes = np.array([i for i in indexes if mean_ratio[i] < 0]) - std_ratio = np.array([np.array(ratio_data[i]).std() for i in indexes]) - row_labels = [row_labels[i] for i in indexes] - r2_data = [r2_data[i] for i in indexes] - mean_r2s = np.array([np.array(e).mean() for e in r2_data]) - std_r2s = np.array([np.array(e).std() for e in r2_data]) - fig, ax = plt.subplots(figsize=(10, 6)) - - ax.plot( - range(len(mean_r2s)), mean_r2s, color=cmaplist[2], marker="o", linestyle="solid" - ) - ax.fill_between( - range(len(mean_r2s)), - mean_r2s - std_r2s, - mean_r2s + std_r2s, - color=cmaplist[2], - alpha=0.1, - ) - ax.set_xticks(list(range(len(mean_r2s))), labels=row_labels) - ax.set_ylabel(r"$R^2$") - _ = plt.setp( - ax.get_xticklabels(), rotation=-15, ha="left", va="top", rotation_mode="anchor" - ) - - ax1 = ax.twinx() - # ax1.fill_between(range(len(mean_r2s)), min(mean_ratio-std_ratio), 0.0, where=(mean_ratio<0), color=cmaplist[0],alpha=0.1) - ax1.plot( - range(len(mean_ratio)), - mean_ratio, - color=cmaplist[1], - marker="o", - linestyle="solid", - ) - ax1.fill_between( - range(len(mean_ratio)), - mean_ratio - std_ratio, - mean_ratio + std_ratio, - color=cmaplist[1], - alpha=0.1, - ) - ax1.set_ylabel(r"$sin(\alpha)$") - - shaded_range = np.arange(len(mean_r2s))[mean_ratio < 0] - if len(shaded_range) > 0: - ax1.axvspan( - shaded_range.min(), shaded_range.max(), color=cmaplist[0], alpha=0.1 - ) - patches = [ - Line2D( - [0], [0], marker="o", linestyle="", color=cmaplist[1], markersize=10 - ), - Patch(color=cmaplist[1], alpha=0.1), - Line2D( - [0], [0], marker="o", linestyle="", color=cmaplist[2], markersize=10 - ), - 
Patch(color=cmaplist[2], alpha=0.1), - Patch(color=cmaplist[0], alpha=0.1), - ] - legend = ax1.legend( - labels=[ - r"$sin(\alpha)$", - r"std($sin(\alpha)$)", - r"$R^2$", - r"std($R^2$)", - r"$sin(\alpha) < 0$", - ], - handles=patches, - bbox_to_anchor=legend_pos, - loc="center left", - borderaxespad=0, - fontsize=12, - frameon=True, - ) - - else: - patches = [ - Line2D( - [0], [0], marker="o", linestyle="", color=cmaplist[1], markersize=10 - ), - Patch(color=cmaplist[1], alpha=0.1), - Line2D( - [0], [0], marker="o", linestyle="", color=cmaplist[2], markersize=10 - ), - Patch(color=cmaplist[2], alpha=0.1), - ] - legend = ax1.legend( - labels=[r"$sin(\alpha)$", r"std($sin(\alpha)$)", r"$R^2$", r"std($R^2$)"], - handles=patches, - bbox_to_anchor=legend_pos, - loc="center left", - borderaxespad=0, - fontsize=12, - frameon=True, - ) - - ax.grid("on") - if filename: - fig.savefig(filename, bbox_inches="tight") - return row_labels, mean_ratio, mean_r2s diff --git a/gitk/eval/get_base_embeddings.py b/gitk/eval/get_base_embeddings.py deleted file mode 100644 index 0088c018..00000000 --- a/gitk/eval/get_base_embeddings.py +++ /dev/null @@ -1,64 +0,0 @@ -import pickle -import os -import numpy as np -import time -import argparse -import time -import multiprocessing as mp -import glob - - -class BaseEmbeddings: - def __init__(self, embeddings, vocab): - self.embeddings = embeddings - self.vocab = vocab - - -def get_bin_embeddings(universe_file, tokenized_files): - vocab = [] - with open(universe_file, "r") as f: - for line in f: - eles = line.strip().split("\t") - region = "{}:{}-{}".format(*eles) - vocab.append(region) - vocab_dict = {v: i for i, v in enumerate(vocab)} - print("vocab size is", len(vocab)) - bin_embeds = np.zeros((len(vocab), len(tokenized_files))) - for i, token_file in enumerate(tokenized_files): - with open(token_file, "r") as f: - for line in f: - eles = line.strip().split("\t") - region = "{}:{}-{}".format(*eles) - if region in vocab_dict: - 
bin_embeds[vocab_dict[region]][i] = 1 - bin_embed_obj = BaseEmbeddings(bin_embeds, vocab) - return bin_embed_obj - - -def get_pca_embeddings(bin_embed_obj, dim, kwargs={}): - from sklearn.decomposition import PCA - - embeds = PCA(n_components=dim, **kwargs).fit_transform(bin_embed_obj.embeddings) - pca_embed_obj = BaseEmbeddings(embeds, bin_embed_obj.vocab) - return pca_embed_obj - - -def get_umap_embeddings(bin_embed_obj, dim, kwargs={}): - import umap - - embeds = umap.UMAP(n_components=dim, **kwargs).fit_transform( - bin_embed_obj.embeddings - ) - umap_embed_obj = BaseEmbeddings(embeds, bin_embed_obj.vocab) - return umap_embed_obj - - -def save_base_embeddings(base_embed_obj, file_name): - with open(file_name, "wb") as f: - pickle.dump(base_embed_obj, f) - - -def load_base_embeddings(path): - with open(path, "rb") as f: - base_embed_obj = pickle.load(f) - return base_embed_obj.embeddings, base_embed_obj.vocab diff --git a/gitk/eval/neighborhood_preserving.py b/gitk/eval/neighborhood_preserving.py deleted file mode 100644 index 6cf024a5..00000000 --- a/gitk/eval/neighborhood_preserving.py +++ /dev/null @@ -1,401 +0,0 @@ -import pickle -import os - -os.environ["OPENBLAS_NUM_THREADS"] = "1" -import numpy as np -import time -import argparse -from gensim.models import Word2Vec -import time -import multiprocessing as mp -from gitk.eval import load_genomic_embeddings -import matplotlib.pyplot as plt -from matplotlib.lines import Line2D - - -class Timer: - def __init__(self): - self.o = time.time() - - def measure(self, p=1): - x = (time.time() - self.o) / float(p) - x = int(x) - if x >= 3600: - return "{:.1f}h".format(x / 3600) - if x >= 60: - return "{}m".format(round(x / 60)) - return "{}s".format(x) - - -# function calculating the chromosome distance between two regions -func_rdist = lambda u, v: float(u[1] < v[1]) * max(v[0] - u[1] + 1, 0) + float( - u[1] >= v[1] -) * max(u[0] - v[1] + 1, 0) - - -def get_topk_embed(i, K, embed, dist="euclidean"): - """ - Return the 
indices for the most similar K regions to the i-th region - embed is the embedding matrix for all the regions in the vocabulary of a region2vec model - """ - num = len(embed) - if dist == "cosine": - nom = np.dot(embed[i : i + 1], embed.T) - denom = np.linalg.norm(embed[i : i + 1]) * np.linalg.norm(embed, axis=1) - sims = (nom / denom)[0] - indexes = np.argsort(-sims)[1 : K + 1] - s = sims[indexes] - elif dist == "euclidean": - dist = np.linalg.norm(embed[i : i + 1] - embed, axis=1) - indexes = np.argsort(dist)[1 : K + 1] - s = -dist[indexes] - elif dist == "jaccard": - nom = np.dot(embed[i : i + 1], embed.T) - denom = ((embed[i : i + 1] + embed) > 0.0).sum(axis=1) - sims = (nom / denom)[0] - indexes = np.argsort(-sims)[1 : K + 1] - s = sims[indexes] - return indexes, s - - -def find_Kneighbors(region_array, index, K): - """ - region_array must be sorted; all regions are on the same chromosome - index is the index for the query region region_array[index] - K is the number of nearest neighbors of the query region - - return: indices of the K nearest neighbors in region_array - """ - if len(region_array) < K: - K = len(region_array) - qregion = region_array[index] - left_idx = max(index - K, 0) - right_idx = min(index + K, len(region_array) - 1) - rdist_arr = [] - for idx in range(left_idx, right_idx + 1): - rdist_arr.append(func_rdist(qregion, region_array[idx])) - rdist_arr = np.array(rdist_arr) - Kneighbors_idx = np.argsort(rdist_arr)[1 : K + 1] - Kneighbors_idx = Kneighbors_idx + left_idx - return Kneighbors_idx - - -def calculate_overlap(i, K, chromo, region_array, region2index, embed_rep): - Kindices = find_Kneighbors(region_array, i, K) - if len(Kindices) == 0: - return 0 - str_kregions = [ - "{}:{}-{}".format(chromo, *region_array[k]) for k in Kindices - ] # sorted in ascending order - _Krdist_global_indices = np.array([region2index[r] for r in str_kregions]) - - idx = region2index["{}:{}-{}".format(chromo, *region_array[i])] - _Kedist_global_indices, _ = 
get_topk_embed( - idx, K, embed_rep - ) # sorted in ascending order - - overlap = len(set(_Krdist_global_indices).intersection(set(_Kedist_global_indices))) - return overlap - - -def calculate_overlap_bins(i, K, chromo, region_array, region2index, embed_rep, res=50): - Kindices = find_Kneighbors(region_array, i, K) - if len(Kindices) == 0: - return 0 - str_kregions = [ - "{}:{}-{}".format(chromo, *region_array[k]) for k in Kindices - ] # sorted in ascending order - _Krdist_global_indices = np.array([region2index[r] for r in str_kregions]) - - idx = region2index["{}:{}-{}".format(chromo, *region_array[i])] - _Kedist_global_indices, _ = get_topk_embed( - idx, K, embed_rep - ) # sorted in ascending order - - bin_overlaps = [] - prev = 0 - assert res < K + 1, "resolution < K + 1" - for i in range(res, K + 1, res): - set1 = set(_Krdist_global_indices[prev:i]) - set2 = set(_Kedist_global_indices[prev:i]) - - overlap = len(set1.intersection(set2)) / min(i, len(set1)) - bin_overlaps.append(overlap) - - return np.array(bin_overlaps) - - -def calculate_overlap_same_chromosome(i, K, chromo, region_array, embed_rep, dist): - _Krindices = find_Kneighbors(region_array, i, K) - if len(_Krindices) == 0: - return np.zeros(K) - Krindices = np.ones(K) * (-1) - Krindices[0 : len(_Krindices)] = _Krindices - - _Keindices, _ = get_topk_embed(i, K, embed_rep, dist) - Keindices = np.ones(K) * (-2) - Keindices[0 : len(_Keindices)] = _Keindices - - # overlap = set(Krindices).intersection(set(Keindices)) - overlap = (Krindices == Keindices).astype(np.float) - return overlap - - -def cal_snpr(ratio_embed, ratio_random): - return np.log10((ratio_embed + 1.0e-10) / (ratio_random + 1.0e-10)) - - -var_dict = {} - - -def worker_func(i, K, chromo, region_array, embed_type, resolution): - if embed_type == "embed": - embeds = var_dict["embed_rep"] - elif embed_type == "random": - embeds = var_dict["ref_embed"] - nprs = calculate_overlap_bins( - i, K, chromo, region_array, var_dict["region2vec_index"], 
embeds, resolution - ) - return nprs - - -def init_worker(embed_rep, ref_embed, region2index): - var_dict["embed_rep"] = embed_rep - var_dict["ref_embed"] = ref_embed - var_dict["region2vec_index"] = region2index - - -def neighborhood_preserving_test( - model_path, embed_type, K, num_samples=100, seed=0, resolution=None, num_workers=10 -): - """ - If sampling > 0, then randomly sample num_samples regions in total (proportional for each chromosome) - - If num_samples == 0, all regions are used in calculation - """ - embed_rep, regions_r2v = load_genomic_embeddings(model_path, embed_type) - timer = Timer() - if resolution is None: - resolution = K - - region2index = {r: i for i, r in enumerate(regions_r2v)} - # Group regions by chromosomes - chromo_regions = {} - for v in regions_r2v: - chromo, region = v.split(":") # e.g. chr1:100-1000 - chromo = chromo.strip() # remove possible spaces - region = region.strip() # remove possible spaces - start, end = region.split("-") - start = int(start.strip()) - end = int(end.strip()) - if chromo not in chromo_regions: - chromo_regions[chromo] = [(start, end)] - else: - chromo_regions[chromo].append((start, end)) - - # sort regions in each chromosome - chromo_ratios = {} - for chromo in chromo_regions: - region_array = chromo_regions[chromo] - chromo_regions[chromo] = sorted(region_array, key=lambda x: x[0]) - chromo_ratios[chromo] = len(region_array) / len(regions_r2v) - - num_regions, num_dim = embed_rep.shape - - np.random.seed(seed) - - ref_embed = (np.random.rand(num_regions, num_dim) - 0.5) / num_dim - - avg_ratio = 0.0 - avg_ratio_ref = 0.0 - count = 0 - - with mp.Pool( - processes=num_workers, - initializer=init_worker, - initargs=(embed_rep, ref_embed, region2index), - ) as pool: - all_processes = [] - for chromo in chromo_regions: - region_array = chromo_regions[chromo] - if num_samples == 0: # exhaustive - indexes = list(range(len(region_array))) - else: - num = min(len(region_array), round(num_samples * 
chromo_ratios[chromo])) - indexes = np.random.permutation(len(region_array))[0:num] - for i in indexes: - process_embed = pool.apply_async( - worker_func, (i, K, chromo, region_array, "embed", resolution) - ) - process_random = pool.apply_async( - worker_func, (i, K, chromo, region_array, "random", resolution) - ) - all_processes.append((process_embed, process_random)) - - for i, (process_embed, process_random) in enumerate(all_processes): - avg_ratio = (avg_ratio * count + process_embed.get()) / (count + 1) - avg_ratio_ref = (avg_ratio_ref * count + process_random.get()) / (count + 1) - count = count + 1 - snprs = cal_snpr(avg_ratio, avg_ratio_ref) - - ratio_msg = " ".join(["{:.6f}".format(r) for r in avg_ratio]) - ratio_ref_msg = " ".join(["{:.6f}".format(r) for r in avg_ratio_ref]) - snprs_msg = " ".join(["{:.6f}".format(r) for r in snprs]) - print(model_path) - - print( - "[seed={}] K={}\n[{}]: {}\n[Random]: {}\n[SNPR] {}".format( - seed, K, embed_type, ratio_msg, ratio_ref_msg, snprs_msg - ) - ) - result = { - "K": K, - "AvgENPR": avg_ratio, - "AvgRNPR": avg_ratio_ref, - "SNPR": snprs, - "Path": model_path, - } - elapsed_time = timer.measure() - print("Elapsed time:", elapsed_time) - return result - - -def writer_multiprocessing(save_path, num, q): - results = [[] for i in range(num)] - while True: - m = q.get() - if m == "kill": - break - worker_id = m[0] - results[worker_id] = m[1] - if save_path: - os.makedirs(os.path.dirname(save_path), exist_ok=True) - with open(save_path, "wb") as f: - pickle.dump(results, f) - return results - - -def neighborhood_preserving_test_batch( - batch, K, num_samples=100, num_workers=10, seed=0, save_path=None -): - print("Total number of models: {}".format(len(batch))) - result_list = [] - for index, (path, embed_type) in enumerate(batch): - result = neighborhood_preserving_test( - path, embed_type, K, num_samples, seed, K, num_workers - ) - result_list.append(result) - if save_path: - os.makedirs(os.path.dirname(save_path), 
exist_ok=True) - with open(save_path, "wb") as f: - pickle.dump(result_list, f) - return result_list - - -def npt_eval(batch, K, num_samples=100, num_workers=10, num_runs=20, save_folder=None): - results_seeds = [] - for seed in range(num_runs): - print("----------------Run {}----------------".format(seed)) - save_path = ( - os.path.join(save_folder, "npt_eval_seed{}".format(seed)) - if save_folder - else None - ) - result_list = neighborhood_preserving_test_batch( - batch, - K, - num_samples=num_samples, - num_workers=num_workers, - seed=seed, - save_path=save_path, - ) - results_seeds.append(result_list) - snpr_results = [[] for i in range(len(batch))] - paths = ["" for i in range(len(batch))] - for results in results_seeds: - for i, result in enumerate(results): - key = result["Path"] - snpr_results[i].append(result["SNPR"]) - paths[i] = key - snpr_results = [np.array(v) for v in snpr_results] - print(snpr_results[0].shape) - for i in range(len(batch)): - print( - "{}\nSNPR_Avg (std):{:.6f} ({:.6f})".format( - paths[i], snpr_results[i][:, 0].mean(), snpr_results[i][:, 0].std() - ) - ) - snpr_results = [(paths[i], snpr_results[i]) for i in range(len(batch))] - return snpr_results - - -def get_npt_results(save_paths): - snpr_results = {} - for save_path in save_paths: - with open(save_path, "rb") as f: - results = pickle.load(f) - for result in results: - key = result["Path"] - if key in snpr_results: - snpr_results[key].append(result["SNPR"]) - else: - snpr_results[key] = [result["SNPR"]] - snpr_results = [(k, np.array(v)) for k, v in snpr_results.items()] - return snpr_results - - -def snpr_plot(snpr_data, row_labels=None, legend_pos=(0.25, 0.6), filename=None): - snpr_vals = [(k, v[:, 0].mean(), v[:, 0].std()) for k, v in snpr_data] - cmap = plt.get_cmap("Set1") - cmaplist = [cmap(i) for i in range(9)] - if row_labels is None: - row_labels = [k for k, v, s in snpr_vals] - fig, ax = plt.subplots(figsize=(10, 6)) - mean_snpr_tuple = [(i, r[1]) for i, r in 
enumerate(snpr_vals)] - mean_snpr_tuple = sorted(mean_snpr_tuple, key=lambda x: -x[1]) - mean_snpr = [t[1] for t in mean_snpr_tuple] - indexes = [t[0] for t in mean_snpr_tuple] - std_snpr = [snpr_vals[i][2] for i in indexes] - row_labels = [row_labels[i] for i in indexes] - ax.set_xticks(list(range(1, len(mean_snpr) + 1))) - ax.set_xticklabels(row_labels) - ax.errorbar( - range(1, len(mean_snpr) + 1), - mean_snpr, - yerr=std_snpr, - fmt="o", - ms=10, - mfc=cmaplist[1], - mec=cmaplist[8], - ecolor=cmaplist[2], - elinewidth=3, - capsize=5, - ) - ax.set_ylabel("SNPR") - _ = plt.setp( - ax.get_xticklabels(), rotation=-15, ha="left", va="top", rotation_mode="anchor" - ) - patches = [ - Line2D( - [0], - [0], - marker="o", - linestyle="", - color=cmaplist[1], - markersize=12, - mec=cmaplist[8], - ), - Line2D([0], [0], color=cmaplist[2], lw=4), - ] - legend = ax.legend( - labels=["SNPR", "SNPR standard deviation"], - handles=patches, - bbox_to_anchor=legend_pos, - loc="center left", - borderaxespad=0, - fontsize=12, - frameon=True, - ) - ax.grid("on") - if filename: - fig.savefig(filename, bbox_inches="tight") diff --git a/gitk/eval/permutation.R b/gitk/eval/permutation.R deleted file mode 100644 index 6045f7ce..00000000 --- a/gitk/eval/permutation.R +++ /dev/null @@ -1,109 +0,0 @@ -library("optparse") -library("GenomicDistributions") -library(foreach) -library(doParallel) - -get_cluster_median <- function(fpath){ - query_data = rtracklayer::import(fpath) - queryList = GRangesList(cluster=query_data) - TSSdist = calcFeatureDistRefTSS(queryList, "hg19") - abs_dist = lapply(TSSdist,abs)[['cluster']] - return (c(median(abs_dist), length(query_data))) -} -lappend <- function (lst, ...){ -lst <- c(lst, list(...)) - return(lst) -} -get_cluster_tss <- function(cluster_folder, assembly="hg19"){ - cluster_files = list.files(cluster_folder, pattern="cluster.*bed$") - num_clusters = length(cluster_files) - tss_arr = list() - - for (i in 1:num_clusters){ - queryFile = 
file.path(cluster_folder,cluster_files[i]) - query_data = rtracklayer::import(queryFile) - queryList = GRangesList(cluster=query_data) - TSSdist = calcFeatureDistRefTSS(queryList, assembly) - abs_dist = lapply(TSSdist,abs)[['cluster']] - tss_arr <- lappend(tss_arr,abs_dist) - } - return (tss_arr) -} -random_median <- function(array, sample_size){ - sample_arr = sample(array,sample_size,FALSE) - return (median(sample_arr)) -} - -perm_test <- function(perm_num, array, size, pos){ - median_arr = replicate(perm_num,random_median(array,size)) - pval = sum(median_arr < pos)/perm_num - return (pval) -} - -find_peak <- function(tss_arr, ldist, rdist){ - a = density(unlist(tss_arr)) - x = unlist(a[1]) - y = unlist(a[2]) - idx = which.max(y) - peak_pos = x[idx] - if (peak_pos > ldist & peak_pos < rdist){ - return (TRUE) - } else { - return (FALSE) - } -} - - -get_significance_vals <- function(path, assembly, num_replicates=10000){ - # calculate the tss of all regions in all the clusters - cluster_tss = get_cluster_tss(path, assembly) - num_clusters <- length(cluster_tss) - # get cluster size - csize_arr = rep(0,times=num_clusters) - for (i in 1:num_clusters){ - s = length(unlist(cluster_tss[i])) - csize_arr[i] <- s - } - # merge tss from all clusters - all_tss = c() - for (i in 1:num_clusters){ - c_tss = unlist(cluster_tss[i]) - all_tss = c(all_tss,c_tss) - } - pval_arr = rep(0, num_clusters) - for (i in 1:num_clusters){ - median_tss = median(unlist(cluster_tss[i])) - pval = perm_test(num_replicates, all_tss, csize_arr[i], median_tss) - pval_arr[i] = pval - } - return (pval_arr) -} - -option_list = list( - make_option("--path", type="character", default="/bigtemp/gz5hp/genomes/tfbs_experiments/tbfs_clustering_results/KMeans_20K_seed0", - help="dataset file name", metavar="character"), - make_option("--assembly", type="character", default="hg19", - help="hg19 or hg38", metavar="character"), - make_option("--num_workers", type="integer",default=10, - help="number of parallel 
processes", metavar="number of processes"), - make_option("--num_samples", type="integer",default=1000, - help="number of samples", metavar="number of samples") -); - -opt_parser = OptionParser(option_list=option_list) -opt = parse_args(opt_parser) - -registerDoParallel(opt$num_workers) - -pattern = file.path(opt$path, 'Kmeans_*') -paths = Sys.glob(pattern) -num_path <- length(paths) -foreach (i=1:num_path) %dopar% { - folder <- paths[i] - save_path <- file.path(folder, 'pvals.txt') - if (!file.exists(save_path)){ - pvals <- get_significance_vals(folder, opt$assembly, opt$num_samples) - cat(format(pvals, nsmall=6),sep='\n',file=save_path) - } -} - diff --git a/gitk/hmm/README.md b/gitk/hmm/README.md deleted file mode 100644 index 8e7646cb..00000000 --- a/gitk/hmm/README.md +++ /dev/null @@ -1,18 +0,0 @@ -# hmm module - -## Introduction - -This module will use an HMM to create a flexible segment universe, given an input of several bed files. - -## How to use - -Where can you find a very small example dataset? - -``` -gitk hmm --out_file tests/consesnus/universe/hmm_norm.bed --cov_folder tests/consesnus/coverage/ -``` - - -``` -gitk hmm --out_file tests/consesnus/universe/hmm_norm.bed --cov_folder tests/consesnus/coverage/ --normalize -``` \ No newline at end of file diff --git a/gitk/hmm/cli.py b/gitk/hmm/cli.py deleted file mode 100644 index cc1b2a2f..00000000 --- a/gitk/hmm/cli.py +++ /dev/null @@ -1,25 +0,0 @@ -def build_subparser(parser): - """ - Builds argument parser. 
- - :return argparse.ArgumentParser - """ - - parser.add_argument( - "--out_file", type=str, help="path to result file", required=True - ) - parser.add_argument( - "--cov_folder", type=str, help="path to coverage folder", required=True - ) - parser.add_argument("--normalize", action="store_true") - parser.add_argument( - "--save_max_cove", - help="if present saves maximum coverage for each peak", - action="store_true", - ) - parser.add_argument( - "--lambdas", type=str, help="lambdas matrix used to set emissions" - ) - parser.add_argument("--coverage_prefix", default="all", type=str) - - return parser diff --git a/gitk/hmm/hmm.py b/gitk/hmm/hmm.py deleted file mode 100644 index ed9e5dfc..00000000 --- a/gitk/hmm/hmm.py +++ /dev/null @@ -1,281 +0,0 @@ -import numpy as np -import os -from .models import PoissonModel -import pyBigWig -from scipy.stats import nbinom -from functools import cmp_to_key -from ..utils import natural_chr_sort - -from logging import getLogger -from ..const import PKG_NAME - -_LOGGER = getLogger(PKG_NAME) - -""" States legend -0 -> start -1 -> core -2 -> end -3 -> background""" - -transmat = [ - [1 - 1e-10, 1e-10, 0, 0], - [0, 1 - 1e-6, 1e-6, 0], - [0, 0, 1 - 1e-6, 1e-6], - [0.1, 0, 0, 0.9], -] - -lambdas = [[3, 1, 0.0001], [0.05, 2, 0.05], [0.0001, 1, 3], [1e-4, 1e-3, 1e-4]] - - -def norm(track, mode): - """Normalize the coverage track depending on track type. 
- For each unique value in the track calculates the corresponding - quantile taking into account that values occur different number of times.""" - important_val = track[track != 0] - important_val_unique, counts = np.unique(important_val, return_counts=True) - uniq_dict = {i: j for i, j in zip(important_val_unique, counts)} - # how many times each value is present in the track - important_val_unique_sort = np.sort(important_val_unique) - if mode == "ends": - n = 0.1 - if mode == "core": - n = 0.085 - bs = 0 # what fraction of the distribution was used for normalization - val = {} # for each unique value in track holds the corresponding quantile - for i in important_val_unique_sort: - move_val = (uniq_dict[i] / len(important_val)) / 2 - # how far from last quantile is te next one - val[i] = nbinom.ppf(bs + move_val, 1, n) - bs = bs + move_val * 2 - track[track != 0] = [val[i] for i in important_val] - - -def process_bigwig(file, seq, p, chrom, chrom_size, normalize=False, mode=None): - """Preprocess bigWig file""" - if pyBigWig.numpy: - track = file.values(chrom, 0, chrom_size, numpy=True) - else: - track = file.values(chrom, 0, chrom_size) - track = np.array(track) - track[np.isnan(track)] = 0 - track = track.astype(np.uint16) - if normalize: - norm(track, mode) - seq[:, p] = track - - -def read_data(start, core, end, chrom, normalize=False): - """ - Read in and preprocess data - :param str start: path to file with start coverage - :param str end: path to file with end coverage - :param str core: path to file with core coverage - :param str chrom: chromosome to analyse - :param bool normalize: whether to normalize the coverage - :return: chromosome size, coverage matrix - """ - start = pyBigWig.open(start + ".bw") - chroms = start.chroms() - chrom_size = chroms[chrom] - seq = np.zeros((chrom_size, 3), dtype=np.uint16) - process_bigwig(start, seq, 0, chrom, chrom_size, normalize, mode="ends") - start.close() - core = pyBigWig.open(core + ".bw") - 
process_bigwig(core, seq, 1, chrom, chrom_size, normalize, mode="core") - core.close() - end = pyBigWig.open(end + ".bw") - process_bigwig(end, seq, 2, chrom, chrom_size, normalize, mode="ends") - end.close() - return chrom_size, seq - - -def find_full_full_pos(seq, gap_size=1000, area_size=500): - """Look for nonzero positions in coverage matrix, - when most of the positions are zero""" - size = len(seq) - seq = np.argwhere(seq >= 1).flatten() - starts, ends = [], [] - if seq[0] > gap_size: - starts.append(int(seq[0] - area_size)) - else: - starts.append(0) - for e in range(1, len(seq)): - if seq[e] - seq[e - 1] > gap_size: - ends.append(int(seq[e - 1] + area_size)) - starts.append(int(seq[e] - area_size)) - ends.append(min(int(seq[-1] + area_size), size)) - return starts, ends - - -def find_full_empty_pos(seq, gap_size=10000, area_size=1000): - """Look for nonzero positions in coverage matrix, - when most of the positions are nonzero""" - size = len(seq) - seq = np.argwhere(seq == 0).flatten() - starts, ends = [], [] - gap_len = 0 - gap_start = 0 - looking_for_first = True - for e in range(1, len(seq)): - if seq[e] - seq[e - 1] == 1: - gap_len += 1 - else: - if gap_len >= gap_size: - starts.append(gap_start) - ends.append(seq[e - 1]) - looking_for_first = False - elif looking_for_first: - starts.append(gap_start) - ends.append(seq[e - 1]) - looking_for_first = False - gap_len = 1 - gap_start = seq[e] - starts_res = [max(0, i - area_size) for i in ends] - end_res = [i + area_size for i in starts[1:]] + [size] - if not starts_res: - starts_res = [0] - return starts_res, end_res - - -def find_full(seq): - """Look for nonzero positions in coverage matrix""" - seq = np.sum(seq, axis=1, dtype=np.uint8) - full_pos_no = np.sum(seq >= 1) - if full_pos_no < len(seq) - full_pos_no: - return find_full_full_pos(seq) - else: - return find_full_empty_pos(seq) - - -def ana_region(region, start_s): - """Helper for saving HMM prediction into a file""" - start_e = start_s + 
np.where(region == 1)[0][0] - end_s = start_s + np.where(region == 2)[0][0] - return start_e, end_s - - -def predictions_to_bed(states, chrom, bedname, save_max_cove=False, cove_file=None): - """ - Save HMM prediction into a file - :param array states: result of HMM prediction - :param str chrom: which chromosome is being analysed - :param str bedname: path to the output file - :param bool save_max_cove: whether to save the maximum peak - coverage to output file, can result in nonstandard bed file - :param str cove_file: file with core coverage, require for - saving maximum peak coverage - """ - ind = np.argwhere(states != 3) - ind = ind.flatten() - start_s = ind[0] - to_file = [] - line = chrom + "\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n" - if save_max_cove: - coverage = pyBigWig.open(cove_file) - for i in range(1, len(ind)): - if ind[i] - ind[i - 1] != 1: - end_e = ind[i - 1] - region = states[start_s : end_e + 1] - res = ana_region(region, start_s) - save_start_e, save_end_s = res - val = 0 - if save_max_cove: - val = coverage.stats(chrom, int(start_s), int(end_e) + 1, type="max") - val = int(val[0]) - to_file.append( - line.format( - start_s, - end_e + 1, - "universe", - val, - ".", - save_start_e, - save_end_s, - "0,0,255", - ) - ) - start_s = ind[i] - if states[ind[-1]] == 2: - region = states[start_s : ind[-1] + 1] - res = ana_region(region, start_s) - save_start_e, save_end_s = res - val = 0 - if save_max_cove: - val = coverage.stats(chrom, int(start_s), int(ind[-1]) + 1, type="max") - val = int(val[0]) - to_file.append( - line.format( - start_s, - ind[-1] + 1, - "universe", - val, - ".", - save_start_e, - save_end_s, - "0,0,255", - ) - ) - with open(bedname, "a") as f: - f.writelines(to_file) - - -def split_predict(seq, empty_starts, empty_ends, model): - """Make model prediction only for regions containing - nonzero positions""" - hmm_predictions = np.full(len(seq), 3, dtype=np.uint8) - for s, e in zip(empty_starts, empty_ends): - res = model.predict(seq[s:e]) 
- hmm_predictions[s:e] = res - return hmm_predictions - - -def run_hmm(start, core, end, chrom, normalize=False): - """Make HMM prediction for given chromosome""" - chrom_size, seq = read_data(start, core, end, chrom, normalize=normalize) - empty_starts, empty_ends = find_full(seq) - model = PoissonModel(transmat, lambdas, save_matrix=False) - model = model.make() - hmm_predictions = split_predict(seq, empty_starts, empty_ends, model) - return hmm_predictions, model - - -def run_hmm_save_bed( - coverage_folder, - out_file, - prefix="all", - normalize=False, - save_max_cove=False, -): - """ - Create HMM based univers from coverage - :param coverage_folder: path to folder with coverage files - :param str start: start coverage file name - :param str end: end coverage file name - :param str core: core coverage file name - :param str out_file: path to the output file with universe - :param bool normalize: whether to normalize file - :param bool save_max_cove: whether to save the maximum - peak coverage - """ - if os.path.isfile(out_file): - raise Exception(f"File : {out_file} exists") - start = os.path.join(coverage_folder, f"{prefix}_start") - core = os.path.join(coverage_folder, f"{prefix}_core") - end = os.path.join(coverage_folder, f"{prefix}_end") - bw_start = pyBigWig.open(start + ".bw") - chroms = bw_start.chroms() - bw_start.close() - chroms_key = list(chroms.keys()) - chroms_key = sorted(chroms_key, key=cmp_to_key(natural_chr_sort)) - chroms = {i: chroms[i] for i in chroms_key} - for C in chroms: - if chroms[C] > 0: - pred, m = run_hmm(start, core, end, C, normalize=normalize) - predictions_to_bed( - pred, C, out_file, save_max_cove=save_max_cove, cove_file=core + ".bw" - ) - - -def test_hmm(message): - """Just prints a test message""" - _LOGGER.info(message) diff --git a/gitk/likelihood/cli.py b/gitk/likelihood/cli.py deleted file mode 100644 index 577da807..00000000 --- a/gitk/likelihood/cli.py +++ /dev/null @@ -1,51 +0,0 @@ -def 
build_subparser_model(parser): - """ - Builds argument parser. - - :return argparse.ArgumentParser - """ - - parser.add_argument("--model_folder", required=True, type=str) - parser.add_argument("--file_no", type=int) - parser.add_argument("--coverage_folder", required=True, type=str) - parser.add_argument("--coverage_prefix", default="all", type=str) - - return parser - - -def build_subparser_universe_hard(parser): - parser.add_argument("--merge", default=0, type=int) - parser.add_argument("--filter_size", default=0, type=int) - parser.add_argument("--fout", required=True, type=str) - parser.add_argument("--coverage_file", required=True, type=str) - parser.add_argument("--cut_off", type=int) - - return parser - - -def build_subparser_universe_flexible(parser): - parser.add_argument("--output_file", required=True, type=str) - parser.add_argument("--model_folder", required=True, type=str) - - return parser - - -def build_subparser(parser): - sp = parser.add_subparsers(dest="subcommand") - msg_by_cmd = { - "build_model": "Asses based on distance", - "universe_hard": "Making cut-off universe", - "universe_flexible": "Making ML flexible universe", - } - subparsers = {} - for k, v in msg_by_cmd.items(): - subparsers[k] = sp.add_parser(k, description=v, help=v) - subparsers["build_model"] = build_subparser_model(subparsers["build_model"]) - subparsers["universe_hard"] = build_subparser_universe_hard( - subparsers["universe_hard"] - ) - subparsers["universe_flexible"] = build_subparser_universe_flexible( - subparsers["universe_flexible"] - ) - - return parser diff --git a/gitk/likelihood/universe_flexible.py b/gitk/likelihood/universe_flexible.py deleted file mode 100644 index 29953262..00000000 --- a/gitk/likelihood/universe_flexible.py +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -import numpy as np -import os -from functools import cmp_to_key -from ..utils import natural_chr_sort, timer_func -from ..hmm.hmm import predictions_to_bed, 
find_full_full_pos, find_full_empty_pos -from .build_model import ModelLH -from numba import njit - - -@njit -def process_part(model): - """ - Finding ML path through matrix using dynamic programing - :param array mat: fragment of likelihood model to be processed - :return array: ML path through matrix - """ - mat = np.zeros((len(model), 4)) - (N, M) = mat.shape - background = [0, 2, 4] - for i in range(N): - for k in background: - mat[i, 3] += model[i, k] - for j in range(M - 1): - back = background[:] - back.remove(2 * j) - mat[i, j] = model[i, 2 * j + 1] - for k in back: - mat[i, j] += model[i, k] - - for i in range(1, N): - for j in range(M): - mat[i, j] += max(mat[i - 1, j], mat[i - 1, j - 1]) - path = np.zeros(len(mat), dtype=np.int8) - path[-1] = np.argmax(mat[-1]) - for i in range(len(mat) - 2, -1, -1): - prev_index = path[i + 1] - new_index = prev_index - (mat[i, prev_index - 1] > mat[i, prev_index]) - if new_index == -1: - new_index = 3 - path[i] = new_index - return path - - -def make_ml_flexible_universe(model_lh, chrom, fout): - """ - Make ML flexible universe per chromosome - :param str folderin: input folder with likelihood models - :param str chrom: chromosome to be processed - :param str fout: output file with the universe - """ - model_lh.read_chrom(chrom) - chrom_model = model_lh.chromosomes_models[chrom] - model = np.hstack( - ( - chrom_model.models["start"], - chrom_model.models["core"], - chrom_model.models["end"], - ) - ) - model_lh.clear_chrom(chrom) - seq = np.where(np.sum(model[:, [1, 3, 5]], axis=1) > -30, 1, 0).astype(np.uint8) - full_pos_no = np.sum(seq) - if full_pos_no < len(seq) - full_pos_no: - full_start, full_end = find_full_full_pos(seq) - else: - full_start, full_end = find_full_empty_pos(seq) - path = np.full(len(model), 3, dtype=np.uint8) - for s, e in zip(full_start, full_end): - res = process_part(model[s:e]) - path[s:e] = res - predictions_to_bed(path, chrom, fout) - - -@timer_func -def main(folderin, fout): - """ - Make ML 
flexible universe - :param str folderin: input folder with likelihood models - :param str fout: output file with the universe - """ - if os.path.isfile(fout): - raise Exception(f"File : {fout} exists") - lh_model = ModelLH(folderin) - chroms = sorted(lh_model.chromosomes_list, key=cmp_to_key(natural_chr_sort)) - for C in chroms: - make_ml_flexible_universe(lh_model, C, fout) diff --git a/gitk/region2vec/README.md b/gitk/region2vec/README.md deleted file mode 100644 index b0ce0d59..00000000 --- a/gitk/region2vec/README.md +++ /dev/null @@ -1,35 +0,0 @@ -# region2vec -`region2vec` will generate embedding vectors for a given region set (universe) from a set of raw bed files. The program will first map all raw regions to the given region set. Then, it will concatenate all regions in a bed file in random orders into a sentence. The generated sentences will be used for word2vec training. - - -## Requirements -- python>=3.6 -All other required packages are listed in `requirements.txt` - - - -## Usage -1. Prepare a set of bed files in `src_folder`. [Optional] If only a subset of files will be used, specify a list of those files as `file_list`. By default, the program will use all the files in the folder to train a region2vec model. -2. Prepare a universe file `universe_file`. -3. Create a token folder which will be used to store tokenized files `dst_folder`. -5. Run the following command -``` -from gitk.tokenization import hard_tokenization -from gitk.region2vec import region2vec - -src_folder = '/path/to/raw/bed/files' -dst_folder = '/path/to/tokenized_files' -universe_file = '/path/to/universe_file' - -# must run tokenization first -status = hard_tokenization(src_folder, dst_folder, universe_file, 1e-9) - -if status: # if hard_tokenization is successful, then run region2vec training - save_dir = '/path/to/training/results' - region2vec(dst_folder, save_dir, num_shufflings=1000) - -``` -For customized settings, please go and check the parameters used in `main.py`. 
-For training a region2vec model, the parameters, `init_lr`, `window_size`, `num_shufflings`, `embedding_dim`, are frequently tuned in experiments. - - diff --git a/gitk/region2vec/__init__.py b/gitk/region2vec/__init__.py deleted file mode 100644 index 489ce2d6..00000000 --- a/gitk/region2vec/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .main import region2vec diff --git a/gitk/region2vec/main.py b/gitk/region2vec/main.py deleted file mode 100644 index 6e4fafb1..00000000 --- a/gitk/region2vec/main.py +++ /dev/null @@ -1,120 +0,0 @@ -import numpy as np - -import multiprocessing -import os - -from gitk.region2vec import utils -from gitk.region2vec.region2vec_train import main as region2_train -from gitk.region2vec.region_shuffling import main as sent_gen - - -class Namespace: - def __init__(self, **kwargs): - self.__dict__.update(kwargs) - - -def region2vec( - token_folder, # path to the folder of tokenized files - save_dir, # folder to save the training results - file_list=None, # specifies which files from token_folder are used for training - num_shufflings=1000, # Number of shuffled datasets or number of training epochs - num_processes=10, # Maximum number of parallel processes - tokenization_mode="hard", # tokenization mode - embedding_dim=100, # Dimension of region2vec embeddings - context_win_size=5, # Context window size (half) - save_freq=-1, # Save a model after the given number of training epochs. If -1, then only save the best and latest models - resume_path="", # path to a trained model. If specified, the model will be used to initialize the region2vec embeddings - train_alg="cbow", # select training algorithms ['cbow','skip-gram'] - min_count=5, # Threshold for filtering out regions with low frequency - neg_samples=5, # Number of negative samples - init_lr=0.025, # Initial learning rate - min_lr=1e-4, # Minimum learning rate - lr_scheduler="linear", # How to decay the learning rate. 
Select from linear and milestone - milestones=[], # Specify only when lr_scheduler=milestone. At each given epoch, the learning rate will be multiplied by 0.1 - hier_softmax=False, # Whether to hierarchical softmax - seed=0, # random seed -): - timer = utils.Timer() - start_time = timer.t() - if file_list is None: - files = os.listdir(token_folder) - else: - files = file_list - os.makedirs(save_dir, exist_ok=True) - file_list_path = os.path.join(save_dir, "file_list.txt") - utils.set_log_path(save_dir) - with open(file_list_path, "w") as f: - for file in files: - f.write(file) - f.write("\n") - - training_processes = [] - num_sent_processes = min(int(np.ceil(num_processes / 2)), 4) - nworkers = min(num_shufflings, num_sent_processes) - utils.log("num_sent_processes: {}".format(nworkers)) - if nworkers <= 1: - sent_gen_args = Namespace( - tokenization_folder=token_folder, - save_dir=save_dir, - file_list=file_list_path, - tokenization_mode=tokenization_mode, - pool=1, # maximum number of unused shuffled datasets generated at a time - worker_id=0, - number=num_shufflings, - ) - p = multiprocessing.Process(target=sent_gen, args=(sent_gen_args,)) - p.start() - training_processes.append(p) - else: - num_arrs = [num_shufflings // nworkers] * (nworkers - 1) - - num_arrs.append(num_shufflings - np.array(num_arrs).sum()) - sent_gen_args_arr = [] - for n in range(nworkers): - sent_gen_args = Namespace( - tokenization_folder=token_folder, - save_dir=save_dir, - file_list=file_list_path, - tokenization_mode=tokenization_mode, - pool=1, # maximum number of unused shuffled datasets generated at a time - worker_id=n, - number=num_arrs[n], - ) - sent_gen_args_arr.append(sent_gen_args) - for n in range(nworkers): - p = multiprocessing.Process(target=sent_gen, args=(sent_gen_args_arr[n],)) - p.start() - training_processes.append(p) - - num_region2vec_processes = max(num_processes - nworkers, 1) - region2vec_args = Namespace( - num_shuffle=num_shufflings, - embed_dim=embedding_dim, - 
context_len=context_win_size, - nworkers=num_region2vec_processes, - save_freq=save_freq, - save_dir=save_dir, - resume=resume_path, - train_alg=train_alg, - min_count=min_count, - neg_samples=neg_samples, - init_lr=init_lr, - min_lr=min_lr, - lr_mode=lr_scheduler, - milestones=milestones, - hier_softmax=hier_softmax, - update_vocab="once", - seed=seed, - ) - p = multiprocessing.Process(target=region2_train, args=(region2vec_args,)) - p.start() - training_processes.append(p) - for p in training_processes: - p.join() - os.remove(file_list_path) - elapsed_time = timer.t() - start_time - print( - "[Training] {}/{}".format( - utils.time_str(elapsed_time), utils.time_str(timer.t()) - ) - ) diff --git a/gitk/region2vec/region_shuffling.py b/gitk/region2vec/region_shuffling.py deleted file mode 100644 index 4b603b2d..00000000 --- a/gitk/region2vec/region_shuffling.py +++ /dev/null @@ -1,160 +0,0 @@ -import os -import numpy as np -import random -import glob -import time -import datetime -import argparse -import os -from gitk.region2vec import utils - - -class BEDDataset: - def __init__(self, args, file_list): - self.links = [] - self.args = args - self.meta_data = dict() - self.file2idx = dict() - with open(file_list, "r") as f: - for idx, line in enumerate(f): - filename = line.strip() - self.links.append(filename) - self.file2idx[filename] = idx - - self.nfiles = len(self.links) - - def regions2sentences_sampling(self, src_path, dst_path): - with open(dst_fname, "w") as fout: - for fname in self.links: - src_fname = os.path.join(src_path, fname) - sentence = [] - probs = [] - with open(src_fname, "r") as f: - for line in f: - elements = line.strip().split("\t") - word = elements[0].strip() - sentence.append(word) - probs.append(float(elements[-2].strip())) - probs = np.array(probs) - probs = probs / probs.sum() - sentence = np.array(sentence) - - sampled_sentence = np.random.choice(sentence, len(probs), p=probs) - # sampled_sentence = list(set(sampled_sentence)) - 
sampled_sentence = sampled_sentence.tolist() - str_sent = " ".join(sampled_sentence) - - fout.write(str_sent) - fout.write("\n") - - def regions2sentences(self, src_path, dst_path): - with open(dst_path, "w") as f_out: - for fname in self.links: - src_fname = os.path.join(src_path, fname) - sentence = [] - with open(src_fname, "r") as f: - for line in f: - elements = line.strip().split("\t")[0:3] - chr_name = elements[0].strip() - start = elements[1].strip() - end = elements[2].strip() - word = chr_name + ":" + start + "-" + end - sentence.append(word) - random.shuffle(sentence) # shuffle the regions in the sentence - str_sent = " ".join(sentence) - f_out.write(str_sent) - f_out.write("\n") - - -def main(args): - DATA_FOLDER = os.path.join(args.save_dir, "shuffled_datasets") - os.makedirs(DATA_FOLDER, exist_ok=True) - src_path = args.tokenization_folder - worker_id = args.worker_id - random.seed(worker_id) - np.random.seed(worker_id) - dataset = BEDDataset(args, args.file_list) - pool = args.pool - utils.log( - "[{}] Creating shuffled datasets in \033[93m{}\033[00m (at most {} datasets coexist)".format( - worker_id, DATA_FOLDER, pool - ) - ) - - for i in range(pool): - name_used = os.path.join(DATA_FOLDER, f"pool{worker_id}-{i}used") - name_using = os.path.join(DATA_FOLDER, f"pool{worker_id}-{i}using") - name_creating = os.path.join(DATA_FOLDER, f"pool{worker_id}-{i}creating") - name = os.path.join(DATA_FOLDER, f"pool{worker_id}-{i}") - if os.path.exists(name_using): - print("File exists") - return - if os.path.exists(name_used): - print("File exists") - return - if os.path.exists(name): - print("File exists") - return - if os.path.exists(name_creating): - print("File exists") - return - # create an empty file - with open(name_used, "w") as f: - pass - - num_created = 0 - while True: - if num_created == args.number: - break - # determine whether to create a new dataset - files = glob.glob(os.path.join(DATA_FOLDER, f"pool{worker_id}*used")) - if len(files) == 0: - 
time.sleep(1) # wait for 10 seconds - # print('Waiting for the data to be consumed',end="\r") - else: - # delete the used dataset and generate a new dataset in the same foler - sel_file = files[random.randint(0, len(files) - 1)] - fname = sel_file.split("/")[-1][:-4] - # print('[',datetime.datetime.now(),']','Find used dataset {}'.format(fname)) - os.system("rm -f {}".format(sel_file)) # delete the dataset - dpath = os.path.join(DATA_FOLDER, fname + "creating") - with open(dpath, "w") as f: - pass - if args.tokenization_mode == "hard": - dataset.regions2sentences(src_path, dpath) - else: - dataset.regions2sentences_sampling(src_path, dpath) - - num_created += 1 - # print('[',datetime.datetime.now(),']',' Created %dth dataset' % num_created) - dst_name = os.path.join(DATA_FOLDER, fname) - os.rename(dpath, dst_name) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Sentence Generation") - parser.add_argument("--file_list", help="path to a file list") - parser.add_argument("--tokenization_mode", help="tokenization mode") - parser.add_argument("--tokenization_folder", help="path to the tokenized regions") - parser.add_argument( - "--save_dir", help="parent folder to generated shuffled datasets" - ) - parser.add_argument( - "--pool", - type=int, - default=3, - help="maximum number of shuffled datasets before consuming one", - ) - parser.add_argument( - "--worker_id", - type=int, - default=0, - help="maximum number of shuffled datasets before consuming one", - ) - parser.add_argument( - "--number", type=int, default=1000, help="total number of shuffled datasets" - ) - - args = parser.parse_args() - - main(args) diff --git a/gitk/region2vec/requirements.txt b/gitk/region2vec/requirements.txt deleted file mode 100644 index dfda40fc..00000000 --- a/gitk/region2vec/requirements.txt +++ /dev/null @@ -1,10 +0,0 @@ -gensim -scikit - learn -seaborn -matplotlib -pyyaml -requests -argparse -tqdm -pandas -umap - learn diff --git 
a/gitk/region2vec/utils.py b/gitk/region2vec/utils.py deleted file mode 100644 index 579dfcd9..00000000 --- a/gitk/region2vec/utils.py +++ /dev/null @@ -1,131 +0,0 @@ -import time -import sys -import select -import os -import numpy as np -import shutil - - -def prRed(skk): - print("\033[91m{}\033[00m".format(skk)) - - -def prGreen(skk): - print("\033[92m{}\033[00m".format(skk)) - - -def prYellow(skk): - print("\033[93m{}\033[00m".format(skk)) - - -def prLightPurple(skk): - print("\033[94m{}\033[00m".format(skk)) - - -def prPurple(skk): - print("\033[95m{}\033[00m".format(skk)) - - -def prCyan(skk): - print("\033[96m{}\033[00m".format(skk)) - - -def prLightGray(skk): - print("\033[97m{}\033[00m".format(skk)) - - -def prBlack(skk): - print("\033[98m{}\033[00m".format(skk)) - - -_log_path = None - - -def set_log_path(path): - global _log_path - _log_path = path - - -class Timer: - def __init__(self): - self.v = time.time() - - def s(self): - self.v = time.time() - - def t(self): - return time.time() - self.v - - -def time_str(t): - if t >= 3600: - return "{:.2f}h".format(t / 3600) - if t >= 60: - return "{:.2f}m".format(t / 60) - return "{:.2f}s".format(t) - - -def timed_response(prompt, wait_time, default): - print(prompt, end="", flush=True) - i, o, e = select.select([sys.stdin], [], [], wait_time) - if i: - ans = sys.stdin.readline().strip() - if ans not in ["y", "n"]: - print("\033[91m{}\033[00m".format(default)) - return default - else: - return ans - else: - print("\033[91m{}\033[00m".format(default)) - return default - - -def log(obj, filename="log.txt"): - print(obj) - if _log_path is not None: - with open(os.path.join(_log_path, filename), "a") as f: - f.write(obj) - f.write("\n") - - -class lr_scheduler: - def __init__(self, init_lr, end_lr, epochs, lr_info=None, mode="linear"): - self.lr = init_lr - self.end_lr = end_lr - self.init_lr = init_lr - self.mode = mode - self.epochs = epochs - self.lr_info = lr_info - self.count = 0 - if mode == "linear": - 
self.freq = lr_info["freq"] - - def step(self): - self.count += 1 - if self.mode == "linear": - if self.count % self.freq == 0: - self.lr = ( - self.init_lr - - (self.init_lr - self.end_lr) / self.epochs * self.count - ) - elif self.mode == "milestone": - milestones = np.array(self.lr_info["milestones"]) - power = (milestones <= self.count).sum() - self.lr = self.init_lr * np.power(self.lr_info["ratio"], float(power)) - if self.lr < self.end_lr: - self.lr = self.end_lr - return self.lr - - -def ensure_dir(path, default="y"): - if os.path.exists(path): - if default == "y": - prompt = "\033[91m{} exists,remove?([y]/n):\033[00m ".format(path) - else: - prompt = "\033[91m{} exists,remove?(y/[n]):\033[00m ".format(path) - ans = timed_response(prompt, 5, default) - if ans != "n": - shutil.rmtree(path) - else: - return - os.makedirs(path, exist_ok=True) diff --git a/gitk/scembed/__init__.py b/gitk/scembed/__init__.py deleted file mode 100644 index d3f345dd..00000000 --- a/gitk/scembed/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .scembed import * -from .utils import * -from .const import * diff --git a/gitk/scembed/argparser.py b/gitk/scembed/argparser.py deleted file mode 100644 index 2981bcfe..00000000 --- a/gitk/scembed/argparser.py +++ /dev/null @@ -1,147 +0,0 @@ -from ubiquerg import VersionInHelpParser - -from ._version import __version__ -from .const import * - - -def build_argparser(parser: VersionInHelpParser = None): - """ - Parse command-line arguments passed to the pipeline. 
- """ - # Argument Parsing - ########################################################################### - if parser is None: - parser = VersionInHelpParser( - prog=PKG_NAME, - version=__version__, - description="%(prog)s - embed single-cell data as region vectors", - ) - - # Pipeline-specific arguments - parser.add_argument( - "-i", - "--input", - default=None, - type=str, - required=True, - help="Path to MarketMatrix format count matrix.", - ) - - parser.add_argument( - "-n", - "--names", - default=None, - type=str, - required=False, - help="Path to sample/barcodes names in a single " - "column tab-delimited format.", - ) - - parser.add_argument( - "-c", - "--coords", - default=None, - type=str, - required=False, - help="Path to sample/barcodes coordinates in a " - "chr, start, end tab-delimited format.", - ) - - parser.add_argument( - "-o", - "--output", - default=None, - type=str, - required=True, - help="Path to output directory to store results.", - ) - - parser.add_argument( - "-t", - "--title", - default="scembed", - type=str, - required=False, - help="Project/run title for naming output.", - ) - - parser.add_argument( - "--docs", - dest="docs", - default=None, - help="Path to documents dictionary.", - ) - - parser.add_argument( - "--model", - dest="model", - default=None, - help="Path to Word2Vec model.", - ) - - parser.add_argument( - "--label-delimiter", - dest="label_delimiter", - default="_", - help="Delimiter used to split cell names.", - ) - - parser.add_argument( - "--nothreads", - dest="nothreads", - default=1, - help="Number of available processors for " "Word2Vec training.", - ) - - parser.add_argument( - "--nocells", - dest="nocells", - default=5, - help="Minimum number of cells with a shared region " - "for that region to be included.", - ) - - parser.add_argument( - "--noreads", - dest="noreads", - default=2, - help="Minimum number of reads that overlap a region " - "for that region to be included.", - ) - - parser.add_argument( - 
"--window-size", - dest="window_size", - default=100, - help="Word2Vec window size.", - ) - - parser.add_argument( - "--epochs", - dest="epochs", - default=30, - help="Number of epochs for training", - ) - - parser.add_argument( - "--dimension", - dest="dimension", - default=100, - help="Number of dimensions to train the word2vec " "model.", - ) - - parser.add_argument( - "--min-count", - dest="min_count", - default=10, - help="Minimum count for Word2Vec model.", - ) - - parser.add_argument( - "--shuffle-repeat", - dest="shuffle_repeat", - default=5, - help="Number of times to shuffle the document to " - "generate date for Word2Vec.", - ) - return parser diff --git a/gitk/scembed/const.py b/gitk/scembed/const.py deleted file mode 100644 index 4a0c490d..00000000 --- a/gitk/scembed/const.py +++ /dev/null @@ -1,17 +0,0 @@ -""" Constants for scembed """ - -__author__ = ["Nathan LeRoy", "Jason Smith", "Erfaneh Gharavi"] -__email__ = "nleroy@virginia.edu" - -LOGGING_LEVEL = "INFO" -PKG_NAME = "scembed" - -DEAFULT_N_SHUFFLES = 1000 -DEFAULT_WINDOW_SIZE = 5 -DEFAULT_EMBEDDING_SIZE = 100 -DEFAULT_EPOCHS = 10 -DEFAULT_INIT_LR = ( - 0.1 # https://github.com/databio/gitk/issues/6#issuecomment-1476273162 -) -DEFAULT_MIN_LR = 0.0001 # gensim default -DEFAULT_DECAY_RATE = 0.95 diff --git a/gitk/scembed/scembed.py b/gitk/scembed/scembed.py deleted file mode 100755 index dccf48c0..00000000 --- a/gitk/scembed/scembed.py +++ /dev/null @@ -1,208 +0,0 @@ -import scanpy as sc -import pandas as pd - -from typing import Dict, List, Union -from concurrent.futures import ThreadPoolExecutor -from random import shuffle -from gensim.models import Word2Vec -from gensim.models.callbacks import CallbackAny2Vec -from numba import config -from logging import getLogger -from tqdm import tqdm - -from .const import * -from .utils import LearningRateScheduler, ScheduleType - -_LOGGER = getLogger(PKG_NAME) - -# set the threading layer before any parallel target compilation -config.THREADING_LAYER = 
"threadsafe" - - -# shuffle the document to generate data for word2vec -def shuffle_documents( - documents: List[List[str]], - n_shuffles: int, - threads: int = None, -) -> List[List[str]]: - """ - Shuffle around the genomic regions for each cell to generate a "context". - - :param List[List[str]] documents: the document list to shuffle. - :param int n_shuffles: The number of shuffles to conduct. - :param int threads: The number of threads to use for shuffling. - """ - - def shuffle_list(l: List[str], n: int) -> List[str]: - for _ in range(n): - shuffle(l) - return l - - _LOGGER.debug(f"Shuffling documents {n_shuffles} times.") - shuffled_documents = documents.copy() - with ThreadPoolExecutor(max_workers=threads) as executor: - shuffled_documents = list( - tqdm( - executor.map( - shuffle_list, shuffled_documents, [n_shuffles] * len(documents) - ), - total=len(documents), - ) - ) - return shuffled_documents - - -class ReportLossCallback(CallbackAny2Vec): - """ - Callback to report loss after each epoch. - """ - - def __init__(self): - self.epoch = 0 - - def on_epoch_end(self, model: Word2Vec): - loss = model.get_latest_training_loss() - _LOGGER.info(f"Epoch {self.epoch} complete. Loss: {loss}") - self.epoch += 1 - - -class Region2Vec(Word2Vec): - """ - Region2Vec model that extends the Word2Vec model from gensim. 
- """ - - def __init__( - self, - data: sc.AnnData, - window_size: int = DEFAULT_WINDOW_SIZE, - vector_size: int = DEFAULT_EMBEDDING_SIZE, - min_count: int = 10, - threads: int = 1, - seed: int = 42, - n_shuffles: int = DEAFULT_N_SHUFFLES, - callbacks: List[CallbackAny2Vec] = [], - ): - # convert the data to the - _LOGGER.info("Converting data to documents.") - self.region_sets = convert_anndata_to_documents(data) - self.n_shuffles = n_shuffles - self.callbacks = callbacks - - # instantiate the Word2Vec model - super().__init__( - window=window_size, - vector_size=vector_size, - min_count=min_count, - workers=threads, - seed=seed, - callbacks=callbacks, - ) - - def train( - self, - epochs: Union[int, None] = None, - report_loss: bool = True, - lr: float = DEFAULT_INIT_LR, - min_lr: float = DEFAULT_MIN_LR, - lr_schedule: Union[str, ScheduleType] = "linear", - ): - """ - Train the model. This is done in two steps: First, we shuffle the documents. - Second, we train the model. - """ - if report_loss: - self.callbacks.append(ReportLossCallback()) - - lr_scheduler = LearningRateScheduler( - init_lr=lr, min_lr=min_lr, schedule=lr_schedule, epochs=self.n_shuffles - ) - - # train the model using these shuffled documents - _LOGGER.info("Training starting.") - - for shuffle_num in range(self.n_shuffles): - # update current values - current_lr = lr_scheduler.get_lr() - current_loss = self.get_latest_training_loss() - - # update user - _LOGGER.info( - f"SHUFFLE {shuffle_num} - lr: {current_lr}, loss: {current_loss}" - ) - _LOGGER.info("Shuffling documents.") - - # shuffle regions - self.region_sets = shuffle_documents(self.region_sets, 10) - - # update vocab and train - super().build_vocab(self.region_sets, update=True) - super().train( - self.region_sets, - total_examples=len(self.region_sets), - epochs=epochs or 1, # use the epochs passed in or just one - callbacks=self.callbacks, - compute_loss=report_loss, - start_alpha=current_lr, - ) - - -def 
load_scanpy_data(path_to_h5ad: str) -> sc.AnnData: - """ - Load in the h5ad file that holds all of the information - for our single-cell data with scanpy. - - :param str path_to_h5ad: the path to the h5ad file made with scanpy - """ - return sc.read_h5ad(path_to_h5ad) - - -def extract_region_list(region_df: pd.DataFrame) -> List[str]: - """ - Parse the `var` attribute of the scanpy.AnnData object and - return a list of regions from the matrix - - :param pandas.DataFrame region_df: the regions dataframe to parse - """ - _LOGGER.info("Extracting region list from matrix.") - regions_parsed = [] - for r in tqdm(region_df.iterrows(), total=region_df.shape[0]): - r_dict = r[1].to_dict() - regions_parsed.append( - " ".join([r_dict["chr"], str(r_dict["start"]), str(r_dict["end"])]) - ) - return regions_parsed - - -def remove_zero_regions(cell_dict: Dict[str, int]) -> Dict[str, int]: - """ - Removes any key-value pairs in a dictionary where the value (copy number) - is equal to zero (no signal). This is done using dictionary comprehension - as it is much faster. - - :param cell_dict Dict[str, int]: the cell dictionary with region index keys and copy number values - """ - return {k: v for k, v in cell_dict.items() if v > 0} - - -def convert_anndata_to_documents(anndata: sc.AnnData) -> List[List[str]]: - """ - Parses the scanpy.AnnData object to create the required "documents" object for - training the Word2Vec model. Each row (or cell) is treated as a "document". That - is, each region is a "word", and the total collection of regions is the "document". - - :param scanpy.AnnData anndata: the AnnData object to parse. 
- """ - regions_parsed = extract_region_list(anndata.var) - sc_df = anndata.to_df() - docs = [] - _LOGGER.info("Generating documents.") - - for row in tqdm(sc_df.iterrows(), total=sc_df.shape[0]): - row_dict = row[1].to_dict() - row_dict = remove_zero_regions(row_dict) - new_doc = [] - for region_indx in row_dict: - region_str = regions_parsed[int(region_indx)] - new_doc.append(region_str) - docs.append(new_doc) - return docs diff --git a/gitk/scembed/utils.py b/gitk/scembed/utils.py deleted file mode 100644 index e0e9f83c..00000000 --- a/gitk/scembed/utils.py +++ /dev/null @@ -1,78 +0,0 @@ -from typing import Union -from enum import Enum -from logging import getLogger - -from .const import * - -_LOGGER = getLogger(PKG_NAME) - - -class ScheduleType(Enum): - """Learning rate schedule types""" - - LINEAR = "linear" - EXPONENTIAL = "exponential" - - -class LearningRateScheduler: - """ - Simple class to track learning rates of the training procedure - - Based off of: https://machinelearningmastery.com/using-learning-rate-schedules-deep-learning-models-python-keras/ - """ - - def __init__( - self, - init_lr: float = DEFAULT_INIT_LR, - min_lr: float = DEFAULT_MIN_LR, - type: Union[str, ScheduleType] = ScheduleType.EXPONENTIAL, - decay: float = None, - n_epochs: int = None, - ): - self.init_lr = init_lr - self.min_lr = min_lr - self.n_epochs = n_epochs - - # convert type to learning rate if necessary - if isinstance(type, str): - try: - self.type = ScheduleType[type.upper()] - except KeyError: - raise ValueError( - f"Unknown schedule type: {type}. Must be one of ['linear', 'exponential']." - ) - - # init the current lr and iteration - self._current_lr = init_lr - self._iter = 1 - - # init decay rate - if decay is None: - _LOGGER.warning( - "No decay rate provided. Calculating decay rate from init_lr and n_epochs." 
- ) - self.decay = init_lr / n_epochs - else: - self.decay = decay - - def _update_linear(self, epoch: int): - lr = self.init_lr - (self.decay * epoch) - return max(lr, self.min_lr) - - def _update_exponential(self, epoch: int): - lr = self.get_lr() * (1 / (1 + self.decay * epoch)) - return max(lr, self.min_lr) - - def update(self): - # update the learning rate according to the type - if self.type == ScheduleType.LINEAR: - self._current_lr = self._update_linear(self._iter) - self._iter += 1 - elif self.type == ScheduleType.EXPONENTIAL: - self._current_lr = self._update_exponential(self._iter) - self._iter += 1 - else: - raise ValueError(f"Unknown schedule type: {self.type}") - - def get_lr(self): - return self._current_lr diff --git a/gitk/tokenization/README.md b/gitk/tokenization/README.md deleted file mode 100644 index edaf4ea7..00000000 --- a/gitk/tokenization/README.md +++ /dev/null @@ -1,22 +0,0 @@ -# Tokenization -## Introduction -In training word embeddings, we need to first tokenize each word such that words in different forms are represented by one word. For example, "orange", "oranges" and "Orange" are all mapped to "orange" since they essentially convey the same meaning. This can reduce the vocabulary size and also improve the quality of learned embeddings.
-Similary, before running region2vec, we need to run tokenization on regions. First, we need to provide a universe, the "vocabulary" in the setting of genomic regions. The universe is a BED file, containing representative regions. With the given universe, we represent (tokenize) raw regions with the regions in the universe. - -For hard tokenization, if the overlap between a raw region in a bed file and a region in the universe exceeds a certain amount, then we use the region in the universe to represent this raw region; otherwise, we ignore this raw region. This is a "zeor or one" process. After hard tokenization, each bed file will contain regions all from the universe, and the number of regions will be smaller or equal to the original number. - - -## Usage -For hard tokenization, run -``` -from gitk.tokenization import hard_tokenization - -src_folder = '/path/to/raw/bed/files' -dst_folder = '/path/to/tokenized_files' -universe_file = '/path/to/universe_file' -hard_tokenization(src_folder, dst_folder, universe_file, 1e-9) - -``` -Note that we use the `intersect` function of `bedtools` to do tokenization. If you want to switch to different tools, you can override the `bedtool_tokenization` function in `hard_tokenization_batch.py` and provide the path to your tool by specifying the input argument `bedtools_path`. The `fraction` argument specifies the minimum overlap required as a fraction of some region in the universe (default: 1E-9,i.e. 1bp; maximum 1.0). A raw region will be mapped into a universe region when an overlap is above the threshold. - -The bedtools (version 2.30.0) will be automatically downloaded from https://github.com/arq5x/bedtools2/releases to the `bedtools` folder. 
diff --git a/gitk/tokenization/__init__.py b/gitk/tokenization/__init__.py deleted file mode 100644 index 36891984..00000000 --- a/gitk/tokenization/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .main import hard_tokenization_main as hard_tokenization diff --git a/gitk/tokenization/hard_tokenization_batch.py b/gitk/tokenization/hard_tokenization_batch.py deleted file mode 100644 index 54c0bb2a..00000000 --- a/gitk/tokenization/hard_tokenization_batch.py +++ /dev/null @@ -1,112 +0,0 @@ -import argparse -import os -from gitk.tokenization import utils -import subprocess -import shlex - - -def bedtool_tokenization( - f, bedtool_path, data_folder, target_folder, universe, fraction -): - fname = os.path.join(data_folder, f) - temp = os.path.join(target_folder, f + "_sorted") - target = os.path.join(target_folder, f) - with open(temp, "w") as f_temp: - subprocess.run(shlex.split("sort -k1,1 -k2,2n {}".format(fname)), stdout=f_temp) - with open(target, "w") as f_target: - subprocess.run( - shlex.split( - "{} intersect -a {} -b {} -u -f {} ".format( - bedtool_path, universe, temp, fraction - ) - ), - stdout=f_target, - ) - os.remove(temp) - - -def generate_tokens( - raw_data_folder, token_folder, universe, file_list, bedtool, fraction -): - """ - Perform hard tokenization on the bed files - """ - usize = 0 - with open(universe, "r") as f: - for _ in f: - usize += 1 - print("\033[93mUniverse size is {}\033[00m".format(usize)) - - all_set = [] - with open(file_list, "r") as fin: - for fname in fin: - name = fname.strip() - all_set.append(name) - all_set = set(all_set) - - if os.path.exists(token_folder): - files = os.listdir(token_folder) - existing_set = set([f.strip() for f in files]) - not_covered = all_set - existing_set - number = len(not_covered) - if number == 0: - print("Use the existing folder {}".format(token_folder), flush=True) - return 0 - else: - print( - "Folder {} exists with {} files not processed. 
Continue...".format( - token_folder, number - ), - flush=True, - ) - else: - os.makedirs(token_folder) - not_covered = all_set - for f in not_covered: - bedtool_tokenization( - f, bedtool, raw_data_folder, token_folder, universe, fraction - ) - return 0 - - -def main(args): - local_timer = utils.Timer() - print("Entering hard tokenization. Results stored in {}".format(args.token_folder)) - status = generate_tokens( - args.data_folder, - args.token_folder, - args.universe, - args.file_list, - args.bedtools_path, - args.fraction, - ) - if status < 0: - return - tokenization_time = local_timer.t() - print("Hard tokenization takes {}".format(utils.time_str(tokenization_time))) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--data_folder", type=str, default="/scratch/gz5hp/encode3/cell/datasets" - ) - parser.add_argument( - "--file_list", type=str, default="/home/gz5hp/encode3_proj/all_file_list.txt" - ) - - parser.add_argument( - "--token_folder", type=str, default="/scratch/gz5hp/encode3/cell/tokens" - ) - # parameters for hard tokenization - parser.add_argument( - "--universe", type=str, default="/home/gz5hp/encode3_proj/GRCh38-universe.bed" - ) - parser.add_argument( - "--bedtools_path", type=str, default="/scratch/gz5hp/genomes/bedtools" - ) - parser.add_argument("--fraction", type=float, default=1.0e-9) - - args = parser.parse_args() - if os.path.exists(args.file_list): - main(args) diff --git a/gitk/tokenization/main.py b/gitk/tokenization/main.py deleted file mode 100644 index e7b8075a..00000000 --- a/gitk/tokenization/main.py +++ /dev/null @@ -1,145 +0,0 @@ -import numpy as np -import shutil -import argparse -import subprocess -import json - -import multiprocessing -import shlex -import os -from queue import Queue -import yaml -import random -import sys -import requests -import glob -from .hard_tokenization_batch import main as hard_tokenization -from gitk.tokenization.split_file import split_file -import 
gitk.region2vec.utils as utils - - -class Namespace: - def __init__(self, **kwargs): - self.__dict__.update(kwargs) - - -def hard_tokenization_main( - src_folder, dst_folder, universe_file, fraction=1e-9, file_list=None, num_workers=10 -): - timer = utils.Timer() - start_time = timer.t() - - file_list_path = os.path.join(dst_folder, "file_list.txt") - files = os.listdir(src_folder) - file_count = len(files) - if file_count == 0: - print("No files in {}".format(src_folder)) - return 0 - - os.makedirs(dst_folder, exist_ok=True) - if file_list is None: # use all bed files in data_folder - # generate a file list - file_list = files - print("Use all ({}) bed files in {}".format(file_count, src_folder)) - else: - file_number = len(file_list) - print("{} bed files in total, use {} of them".format(file_count, file_number)) - - # check whether all files in file_list are tokenized - number = -1 - if os.path.exists(dst_folder): - all_set = set([f.strip() for f in file_list]) - existing_set = set(os.listdir(dst_folder)) - not_covered = all_set - existing_set - number = len(not_covered) - if number == 0 and len(existing_set) == len(all_set): - print("Skip tokenization. Using the existing tokenization files") - return 1 - elif len(existing_set) > 0: - print( - "Folder {} exists with incomplete tokenized files. 
Please empty/delete the folder first".format( - dst_folder - ) - ) - return 0 - - with open(file_list_path, "w") as f: - for file in file_list: - f.write(file) - f.write("\n") - - # Download bedtools for tokenization - bedtools_folder = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "bedtools" - ) - os.makedirs(bedtools_folder, exist_ok=True) - bedtools_path = os.path.join(bedtools_folder, "bedtools") - if not os.path.isfile(bedtools_path): - # Download bedtools - bedtool_url = "https://github.com/arq5x/bedtools2/releases/download/v2.30.0/bedtools.static.binary" - print("Downloading bedtools from \n{}".format(bedtool_url)) - response = requests.get(bedtool_url) - with open(bedtools_path, "wb") as f: - f.write(response.content) - print("bedtools is saved in \n{}".format(bedtools_path)) - subprocess.run(shlex.split("chmod +x {}".format(bedtools_path))) - subprocess.run(shlex.split("{} --version".format(bedtools_path))) - - print("Tokenizing {} bed files ...".format(len(file_list))) - - file_count = len(file_list) - # split the file_list into several subsets for each worker to process in parallel - nworkers = min(int(np.ceil(file_count / 20)), num_workers) - if nworkers <= 1: - tokenization_args = Namespace( - data_folder=src_folder, - file_list=file_list_path, - token_folder=dst_folder, - universe=universe_file, - bedtools_path=bedtools_path, - fraction=fraction, - ) - hard_tokenization(tokenization_args) - - else: # multiprocessing - dest_folder = os.path.join(dst_folder, "splits") - split_file(file_list_path, dest_folder, nworkers) - args_arr = [] - for n in range(nworkers): - temp_token_folder = os.path.join(dst_folder, "batch_{}".format(n)) - tokenization_args = Namespace( - data_folder=src_folder, - file_list=os.path.join(dest_folder, "split_{}.txt".format(n)), - token_folder=temp_token_folder, - universe=universe_file, - bedtools_path=bedtools_path, - fraction=fraction, - ) - args_arr.append(tokenization_args) - with multiprocessing.Pool(nworkers) 
as pool: - processes = [ - pool.apply_async(hard_tokenization, args=(param,)) for param in args_arr - ] - results = [r.get() for r in processes] - # move tokenized files in different folders to expr_tokens - shutil.rmtree(dest_folder) - for param in args_arr: - allfiles = os.listdir(param.token_folder) - for f in allfiles: - shutil.move( - os.path.join(param.token_folder, f), os.path.join(dst_folder, f) - ) - shutil.rmtree(param.token_folder) - os.remove(file_list_path) - print( - "Tokenization complete {}/{} bed files".format( - len(os.listdir(dst_folder)), file_count - ) - ) - elapsed_time = timer.t() - start_time - print( - "[Tokenization] {}/{}".format( - utils.time_str(elapsed_time), utils.time_str(timer.t()) - ) - ) - return 1 diff --git a/gitk/tokenization/utils.py b/gitk/tokenization/utils.py deleted file mode 100644 index f3c0e740..00000000 --- a/gitk/tokenization/utils.py +++ /dev/null @@ -1,130 +0,0 @@ -import time -import sys -import select -import os -import numpy as np -import shutil - - -def prRed(skk): - print("\033[91m{}\033[00m".format(skk)) - - -def prGreen(skk): - print("\033[92m{}\033[00m".format(skk)) - - -def prYellow(skk): - print("\033[93m{}\033[00m".format(skk)) - - -def prLightPurple(skk): - print("\033[94m{}\033[00m".format(skk)) - - -def prPurple(skk): - print("\033[95m{}\033[00m".format(skk)) - - -def prCyan(skk): - print("\033[96m{}\033[00m".format(skk)) - - -def prLightGray(skk): - print("\033[97m{}\033[00m".format(skk)) - - -def prBlack(skk): - print("\033[98m{}\033[00m".format(skk)) - - -_log_path = None - - -def set_log_path(path): - global _log_path - _log_path = path - - -class Timer: - def __init__(self): - self.v = time.time() - - def s(self): - self.v = time.time() - - def t(self): - return time.time() - self.v - - -def time_str(t): - if t >= 3600: - return "{:.2f}h".format(t / 3600) - if t >= 60: - return "{:.2f}m".format(t / 60) - return "{:.2f}s".format(t) - - -def timed_response(prompt, wait_time, default): - 
print(prompt, end="", flush=True) - i, o, e = select.select([sys.stdin], [], [], wait_time) - if i: - ans = sys.stdin.readline().strip() - if ans not in ["y", "n"]: - print("\033[91m{}\033[00m".format(default)) - return default - else: - return ans - else: - print("\033[91m{}\033[00m".format(default)) - return default - - -def log(obj, filename="log.txt"): - print(obj, flush=True) - if _log_path is not None: - with open(os.path.join(_log_path, filename), "a") as f: - print(obj, file=f) - - -class lr_scheduler: - def __init__(self, init_lr, end_lr, epochs, lr_info=None, mode="linear"): - self.lr = init_lr - self.end_lr = end_lr - self.init_lr = init_lr - self.mode = mode - self.epochs = epochs - self.lr_info = lr_info - self.count = 0 - if mode == "linear": - self.freq = lr_info["freq"] - - def step(self): - self.count += 1 - if self.mode == "linear": - if self.count % self.freq == 0: - self.lr = ( - self.init_lr - - (self.init_lr - self.end_lr) / self.epochs * self.count - ) - elif self.mode == "milestone": - milestones = np.array(self.lr_info["milestones"]) - power = (milestones <= self.count).sum() - self.lr = self.init_lr * np.power(self.lr_info["ratio"], float(power)) - if self.lr < self.end_lr: - self.lr = self.end_lr - return self.lr - - -def ensure_dir(path, default="y"): - if os.path.exists(path): - if default == "y": - prompt = "\033[91m{} exists,remove?([y]/n):\033[00m ".format(path) - else: - prompt = "\033[91m{} exists,remove?(y/[n]):\033[00m ".format(path) - ans = timed_response(prompt, 5, default) - if ans != "n": - shutil.rmtree(path) - else: - return - os.makedirs(path, exist_ok=True) diff --git a/gitk/utils.py b/gitk/utils.py deleted file mode 100644 index 9ac5176a..00000000 --- a/gitk/utils.py +++ /dev/null @@ -1,33 +0,0 @@ -from time import time - - -def natural_chr_sort(a, b): - ac = a.replace("chr", "") - ac = ac.split("_")[0] - bc = b.replace("chr", "") - bc = bc.split("_")[0] - if bc.isnumeric() and ac.isnumeric() and bc != ac: - if int(bc) < 
int(ac): - return 1 - elif int(bc) > int(ac): - return -1 - else: - return 0 - else: - if b < a: - return 1 - elif a < b: - return -1 - else: - return 0 - - -def timer_func(func): - def wrap_func(*args, **kwargs): - t1 = time() - result = func(*args, **kwargs) - t2 = time() - print(f"Function {func.__name__!r} executed in {(t2-t1)/60:.4f}min") - return result - - return wrap_func diff --git a/mkdocs.yml b/mkdocs.yml index 3af949a7..07bc68b6 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,16 +1,30 @@ -site_name: gitk -site_logo: img/gitk_logo.svg -site_url: http://gitk.databio.org/ -repo_url: http://github.com/databio/gitk -pypi_name: gitk +site_name: geniml +site_logo: img/geniml_logo.svg +site_url: http://geniml.databio.org/ +repo_url: http://github.com/databio/geniml +pypi_name: geniml +papers: https://www.biorxiv.org/content/10.1101/2020.11.02.364145v1 nav: - Getting Started: - Introduction: README.md + - Module overviews: modules.md - How-to guides: - - Create consensus peaks: likelihood/consensus-peaks.md + - Assess universe fit: tutorials/assess-universe.md + - Search intervals with BEDSpace: tutorials/bedspace.md + - Evaluate embeddings: tutorials/evaluation.md + - Train region2vec embeddings: tutorials/region2vec.md + - Train single-cell embeddings: tutorials/train-scembed-model.md + - Load vector database with embeddings: tutorials/load-qdrant-with-cell-embeddings.md + - Cell-type prediction using KNN: tutorials/cell-type-annotation-with-knn.md + - Tokenization: tutorials/tokenization.md + - Tokenize a BED file on the command line: tutorials/cli-tokenization.md + - Create consensus peaks: tutorials/create-consensus-peaks.md + - Fine-tune embeddings: tutorials/fine-tune-region2vec-model.md + - Randomize bed files: tutorials/bedshift.md + - Create evaluation dataset with bedshift: tutorials/bedshift-evaluation-guide.md - Reference: - - API: autodoc_build/gitk.md + - API: autodoc_build/geniml.md - Support: support.md - Contributing: contributing.md - 
Changelog: changelog.md @@ -22,6 +36,13 @@ plugins: autodoc_build: "docs/autodoc_build" jupyter_source: "docs_jupyter" jupyter_build: "docs_jupyter/build" - autodoc_package: "gitk" - no_top_level: true + # autodoc_package: "geniml" + no_top_level: false - search + + +navbar: + right: + - text: Manuscripts + icon: fa-file-alt + href: manuscripts \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..c5370b45 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,9 @@ + +[tool.black] +line-length = 99 +target-version = ['py38', 'py310'] +include = '\.pyi?$' + +[tool.ruff] +line-length = 99 +target-version = 'py311' diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt deleted file mode 100644 index 9b7ba6b1..00000000 --- a/requirements/requirements-all.txt +++ /dev/null @@ -1,13 +0,0 @@ -gensim -logmuse -matplotlib -numba -numpy >= 1.23.0 -paramiko >= 3.0.0 -pyBigWig -pyyaml -scanpy -seaborn -tqdm -umap-learn -ubiquerg diff --git a/requirements/requirements-basic.txt b/requirements/requirements-basic.txt new file mode 100644 index 00000000..57245ba7 --- /dev/null +++ b/requirements/requirements-basic.txt @@ -0,0 +1,18 @@ +logmuse >=0.2.8 +ubiquerg >= 0.6.3 +peppy >= 0.40.7 +requests >= 2.31.0 +botocore >= 1.34.54 +boto3 >= 1.34.54 +genomicranges >= 0.4.1 + +gtars >= 0.2.5 +pybiocfilecache == 0.6.1 +zarr >= 2.17.2, < 3.0.0 +pyyaml >= 6.0.1 # for s3fs because of the errors +s3fs >= 2024.3.1 +pyarrow >= 17.0.0 +iranges >= 0.2.11 +numpy >= 1.24.0 +rich >= 13.9.4 +safetensors diff --git a/requirements/requirements-ml.txt b/requirements/requirements-ml.txt new file mode 100644 index 00000000..ce18c13b --- /dev/null +++ b/requirements/requirements-ml.txt @@ -0,0 +1,14 @@ +anndata > 0.9.0 +fastembed >= 0.2.5 +gensim >= 4.3.3 +huggingface_hub >= 0.25.1 +qdrant_client >= 1.11.2 +hnswlib >= 0.8.0 +paramiko >= 3.0.0 +pyBigWig >= 0.3.23 +scanpy >= 1.10.3 +torch >= 2.3.0 +langchain-huggingface==0.0.2 
+hmmlearn >=0.3.2 +scipy >= 1.13.1 +transformers >= 4.52.4 \ No newline at end of file diff --git a/requirements/requirements-test.txt b/requirements/requirements-test.txt new file mode 100644 index 00000000..b60a8b05 --- /dev/null +++ b/requirements/requirements-test.txt @@ -0,0 +1,6 @@ +pytest +pytest-remotedata +pytest-mock +black +ruff +isort \ No newline at end of file diff --git a/requirements/requirements-train.txt b/requirements/requirements-train.txt new file mode 100644 index 00000000..271de0f2 --- /dev/null +++ b/requirements/requirements-train.txt @@ -0,0 +1,4 @@ +numba +hnswlib >= 0.8.0 +pickle +matplotlib \ No newline at end of file diff --git a/scripts/bedshift/bedplot.R b/scripts/bedshift/bedplot.R new file mode 100644 index 00000000..c4762298 --- /dev/null +++ b/scripts/bedshift/bedplot.R @@ -0,0 +1,45 @@ +library("data.table") +library("ggplot2") + +# This script produces visualizations of bedshift results (perturbed bed files) using R. + +# Load in the files and process them, returning a table with all the regions. 
+bedshiftread = function(startfile, randfiles){ + files = c(randfiles, startfile) + nfiles = length(files) + regionslist = lapply (files, fread) + rowsperfile = sapply(regionslist, NROW) + regionstable = rbindlist(regionslist, fill=TRUE) + regionstable[,fileid:=rep(seq_len(nfiles), rowsperfile)] + regionstable[,file:="random"] + + starfileregions = seq(from=NROW(regionstable)+1-rowsperfile[length(rowsperfile)], to=NROW(regionstable)) + regionstable[starfileregions,file:="original"] + return(regionstable) +} + +# Plot the results of the bedshiftread function +bedshiftplot = function(regionstable) { + ggplot(regionstable, + aes(xmin=V2, xmax=V3, ymin=fileid, ymax=fileid+0.75, fill=file)) + + geom_rect() + + theme_classic() + + scale_fill_manual(values=c("black", "gray")) + + xlab("Genome") + + ylab("Files") + + theme(axis.text.y = element_blank(), axis.ticks.y = element_blank()) +} + +# Provide the original file (the one that's being perturbed) +# and the filenames of all randomized files. + +# Run the randomization with a command like this: +# bedshift --verbosity 5 -b tests/simple_1.bed -d .3 -l tests/chrom_sizes_1 -r 10 + +startfile = "tests/simple_1.bed" +randfiles = paste0("rep", 1:10, "_bedshifted_simple_1.bed") + +pdf("drop_H.pdf", width=6, height=2) +regionstable = bedshiftread(startfile, randfiles) +bedshiftplot(regionstable) +dev.off() diff --git a/scripts/bedshift/bedshift.sh b/scripts/bedshift/bedshift.sh new file mode 100644 index 00000000..51afe7a6 --- /dev/null +++ b/scripts/bedshift/bedshift.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +# Change NUM_BEDFILES, BEDFILE, and OUTPUT_FILE. +# Other parameters are set to the default values. Edit as needed. +# To run, execute ./bedscript.sh in Terminal. 
+ +NUM_BEDFILES=NUMBER +for (( c=1; c<=$NUM_BEDFILES; c++ )) +do + BEDFILE=PATH/TO/ORIGINAL/FILE$c.bed + DROP_RATE=0.0 + + ADD_RATE=0.0 + ADD_MEAN=320.0 + ADD_STDEV=30.0 + + SHIFT_RATE=0.0 + SHIFT_MEAN=0.0 + SHIFT_STDEV=150.0 + + CUT_RATE=0.0 + MERGE_RATE=0.0 + OUTPUT_FILE=PATH/TO/PERTURBED/FILE$c.bed + + bedshift --bedfile $BEDFILE --droprate $DROP_RATE --addrate $ADD_RATE --addmean $ADD_MEAN --addstdev $ADD_STDEV --shiftrate $SHIFT_RATE --shiftmean $SHIFT_MEAN --shiftstdev $SHIFT_STDEV --cutrate $CUT_RATE --mergerate $MERGE_RATE --outputfile $OUTPUT_FILE +done diff --git a/scripts/bedshift/bedshift2.sh b/scripts/bedshift/bedshift2.sh new file mode 100755 index 00000000..ff1adc4f --- /dev/null +++ b/scripts/bedshift/bedshift2.sh @@ -0,0 +1,19 @@ +#!/bin/bash +for filename in *.bed; do + CHROM_LENGTHS=../tests/hg38.chrom.sizes + BEDFILE=$filename + DROP_RATE=0.3 + + ADD_RATE=0.2 + ADD_MEAN=320.0 + ADD_STDEV=30.0 + + SHIFT_RATE=0.2 + SHIFT_MEAN=0.0 + SHIFT_STDEV=150.0 + + CUT_RATE=0.0 + MERGE_RATE=0.0 + + bedshift --bedfile $BEDFILE --chrom-lengths $CHROM_LENGTHS --droprate $DROP_RATE --addrate $ADD_RATE --addmean $ADD_MEAN --addstdev $ADD_STDEV --shiftrate $SHIFT_RATE --shiftmean $SHIFT_MEAN --shiftstdev $SHIFT_STDEV --cutrate $CUT_RATE --mergerate $MERGE_RATE +done \ No newline at end of file diff --git a/scripts/bedshift/bedshift_files.py b/scripts/bedshift/bedshift_files.py new file mode 100644 index 00000000..6beaf43d --- /dev/null +++ b/scripts/bedshift/bedshift_files.py @@ -0,0 +1,14 @@ +import os + +import bedshift + +datafolder = "." 
+ +files = os.listdir() +for file in files: + if file.endswith(".bed"): + # you may also pass in a chrom.sizes file as the + # second argument if you are adding or shifting regions + b = bedshift.Bedshift(file) + b.all_perturbations(cutrate=0.3, droprate=0.2) + b.to_bed("bedshifted_" + file) diff --git a/setup.py b/setup.py index 940b78ce..f8cac393 100755 --- a/setup.py +++ b/setup.py @@ -1,13 +1,12 @@ import os -import sys from setuptools import setup -PACKAGE_NAME = "gitk" +PACKAGE_NAME = "geniml" # Ordinary dependencies DEPENDENCIES = [] -with open("requirements/requirements-all.txt", "r") as reqs_file: +with open("requirements/requirements-basic.txt", "r") as reqs_file: for line in reqs_file: if not line.strip(): continue @@ -19,6 +18,29 @@ with open(PACKAGE_NAME + "/_version.py", "r") as versionfile: version = versionfile.readline().split()[-1].strip("\"'\n") +# Optional dependencies +# extras_require takes a dict mapping each extra's name to its list of requirements +with open("requirements/requirements-ml.txt", "r") as reqs_file: + ml_dep = [] + for line in reqs_file: + if not line.strip(): + continue + ml_dep.append(line.strip()) + +with open("requirements/requirements-test.txt", "r") as reqs_file: + test_dep = [] + for line in reqs_file: + if not line.strip(): + continue + test_dep.append(line.strip()) + +extra["install_requires"] = DEPENDENCIES +extra["extras_require"] = { + "ml": ml_dep, + "test": test_dep, +} + + with open("README.md") as f: long_description = f.read() @@ -26,12 +48,27 @@ name=PACKAGE_NAME, packages=[ PACKAGE_NAME, - "gitk.assess", - "gitk.eval", - "gitk.hmm", - "gitk.likelihood", - "gitk.scembed", - "gitk.bedspace", + "geniml.atacformer", + "geniml.craft", + "geniml.geneformer", + "geniml.assess", + "geniml.bedspace", + "geniml.bedshift", + "geniml.eval", + "geniml.likelihood", + "geniml.models", + "geniml.region2vec", + "geniml.scembed", + "geniml.tokenization", + "geniml.universe", + "geniml.io", + "geniml.text2bednn", + "geniml.bbclient", + "geniml.search", 
"geniml.search.backends", + "geniml.search.interfaces", + "geniml.search.query2vec", + "geniml.nn", ], version=version, long_description=long_description, @@ -44,18 +81,20 @@ "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Topic :: Scientific/Engineering :: Bio-Informatics", ], license="BSD2", entry_points={ "console_scripts": [ - "gitk = gitk.cli:main", + "geniml = geniml.cli:main", + "bedshift = geniml.bedshift.bedshift:main", ], }, keywords="bioinformatics, sequencing, ngs", - package_data={"refgenie": [os.path.join("refgenie", "*")]}, + package_data={"geniml": [os.path.join("geniml", "*")]}, include_package_data=True, - url="http://giss.databio.org", + url="https://docs.bedbase.org/geniml/", author="Nathan Sheffield", - **extra + **extra, ) diff --git a/tests/bedshift/add_valid_test.bed b/tests/bedshift/add_valid_test.bed new file mode 100644 index 00000000..cf53bc77 --- /dev/null +++ b/tests/bedshift/add_valid_test.bed @@ -0,0 +1,15000 @@ +chr1 1057472 1057792 - +chr1 1280014 1280334 - +chr1 1307559 1307676 - +chr1 1376423 1376511 - +chr1 1837836 1837925 - +chr1 1875443 1875763 - +chr1 1891508 1891828 - +chr1 1976320 1976640 - +chr1 2105754 2106074 - +chr1 2126604 2126924 - +chr1 2313240 2313560 - +chr1 2345847 2346167 - +chr1 2479476 2479796 - +chr1 3236554 3236874 - +chr1 3341278 3341598 - +chr1 3369791 3369911 - +chr1 3400447 3400767 - +chr1 3407762 3408082 - +chr1 3481804 3482124 - +chr1 3531706 3532026 - +chr1 3612197 3612517 - +chr1 3641077 3641230 - +chr1 4712650 4712970 - +chr1 5570053 5570151 - +chr1 5574724 5574874 - +chr1 5805886 5806206 - +chr1 6035268 6035588 - +chr1 6094534 6094854 - +chr1 6265241 6265561 - +chr1 6305157 6305477 - +chr1 6306791 6307111 - +chr1 6403604 6403752 - +chr1 6474376 6474696 - +chr1 6498212 6498318 - +chr1 6535684 6535802 - +chr1 6546650 6546787 - +chr1 6549954 6550274 - +chr1 6614549 6614869 - +chr1 
6780068 6780388 - +chr1 6787553 6787873 - +chr1 6949982 6950302 - +chr1 7130577 7130681 - +chr1 7267960 7268280 - +chr1 7707403 7707510 - +chr1 7727680 7727859 - +chr1 7729513 7729833 - +chr1 7962830 7963150 - +chr1 8035356 8035676 - +chr1 8042378 8042698 - +chr1 8374403 8374723 - +chr1 8621138 8621458 - +chr1 8731402 8731722 - +chr1 8908970 8909290 - +chr1 9048127 9048447 - +chr1 9065284 9065439 - +chr1 9161623 9161943 - +chr1 9170816 9171136 - +chr1 9202320 9202640 - +chr1 9241446 9241531 - +chr1 9294467 9294787 - +chr1 9310128 9310448 - +chr1 9327184 9327504 - +chr1 9349194 9349316 - +chr1 9515864 9516184 - +chr1 9551165 9551485 - +chr1 9555299 9555619 - +chr1 9675665 9675985 - +chr1 9687084 9687319 - +chr1 9785072 9785392 - +chr1 9927645 9927774 - +chr1 9934944 9935264 - +chr1 10010395 10010715 - +chr1 10213023 10213343 - +chr1 10270229 10270549 - +chr1 10437490 10437810 - +chr1 10488143 10488463 - +chr1 10490278 10490598 - +chr1 10554829 10554944 - +chr1 10567649 10567734 - +chr1 10590338 10590658 - +chr1 11002157 11002269 - +chr1 11021089 11021409 - +chr1 11097796 11097882 - +chr1 11702594 11702741 - +chr1 11705776 11706096 - +chr1 11779943 11780100 - +chr1 11845249 11845569 - +chr1 11898793 11898994 - +chr1 11954029 11954349 - +chr1 12178786 12178894 - +chr1 12184855 12185175 - +chr1 12268937 12269257 - +chr1 12289935 12290043 - +chr1 12336353 12336673 - +chr1 12680026 12680121 - +chr1 12719011 12719331 - +chr1 14140147 14140467 - +chr1 14287024 14287120 - +chr1 15058444 15058764 - +chr1 15105665 15105985 - +chr1 15241998 15242085 - +chr1 15354035 15354355 - +chr1 15373154 15373234 - +chr1 15426891 15427211 - +chr1 15455043 15455363 - +chr1 15478825 15479145 - +chr1 15480585 15480905 - +chr1 15631267 15631587 - +chr1 15650045 15650365 - +chr1 15757740 15757903 - +chr1 15827928 15828248 - +chr1 16067883 16068203 - +chr1 16118638 16118774 - +chr1 16142041 16142361 - +chr1 16321147 16321284 - +chr1 16335623 16335732 - +chr1 16359416 16359607 - +chr1 16382676 
16382872 - +chr1 16405379 16405699 - +chr1 16408838 16409158 - +chr1 16481168 16481488 - +chr1 16514357 16514474 - +chr1 16570904 16571224 - +chr1 16947628 16947785 - +chr1 16951519 16951697 - +chr1 17029773 17029894 - +chr1 17036345 17036564 - +chr1 17278179 17278499 - +chr1 17287163 17287483 - +chr1 17386162 17386482 - +chr1 17452266 17452586 - +chr1 17600826 17600924 - +chr1 17662733 17662818 - +chr1 17732692 17733012 - +chr1 17746353 17746673 - +chr1 17803867 17804187 - +chr1 17820738 17821058 - +chr1 17828933 17829067 - +chr1 17933444 17933574 - +chr1 17980283 17980603 - +chr1 17988079 17988399 - +chr1 18152631 18152951 - +chr1 18175756 18176076 - +chr1 18923886 18924206 - +chr1 19147363 19147522 - +chr1 19201766 19202086 - +chr1 19210752 19210865 - +chr1 19233506 19233826 - +chr1 19239633 19239837 - +chr1 19567851 19568005 - +chr1 19667992 19668312 - +chr1 19670868 19671188 - +chr1 19717845 19718165 - +chr1 19980191 19980511 - +chr1 19986343 19986663 - +chr1 20031163 20031286 - +chr1 20042118 20042438 - +chr1 20088703 20089023 - +chr1 20208581 20208901 - +chr1 20244511 20244831 - +chr1 20301305 20301625 - +chr1 20521695 20522015 - +chr1 20539364 20539684 - +chr1 20688314 20688634 - +chr1 20693659 20693979 - +chr1 20703403 20703723 - +chr1 20756949 20757269 - +chr1 20820431 20820529 - +chr1 20848927 20849033 - +chr1 20901736 20902056 - +chr1 20959795 20959956 - +chr1 21543375 21543695 - +chr1 21581710 21582030 - +chr1 21596798 21596941 - +chr1 21661333 21661456 - +chr1 21695062 21695245 - +chr1 21697127 21697301 - +chr1 21817331 21817651 - +chr1 21922049 21922369 - +chr1 21985597 21985917 - +chr1 22290489 22290809 - +chr1 22432159 22432479 - +chr1 22458418 22458738 - +chr1 22602444 22602589 - +chr1 22743108 22743428 - +chr1 22746922 22747242 - +chr1 22968584 22968904 - +chr1 23000652 23000972 - +chr1 23030957 23031277 - +chr1 23188685 23188822 - +chr1 23215745 23216065 - +chr1 23256930 23257047 - +chr1 23290715 23291035 - +chr1 23302947 23303267 - +chr1 
23425345 23425459 - +chr1 23730478 23730798 - +chr1 23823570 23823890 - +chr1 23866327 23866647 - +chr1 23871211 23871531 - +chr1 23961468 23961788 - +chr1 23963945 23964265 - +chr1 23984319 23984403 - +chr1 24017998 24018318 - +chr1 24077823 24078143 - +chr1 24098737 24098862 - +chr1 24136223 24136543 - +chr1 24137274 24137594 - +chr1 24194781 24195101 - +chr1 24397364 24397684 - +chr1 24476969 24477289 - +chr1 24525781 24526101 - +chr1 24528384 24528704 - +chr1 24716580 24716900 - +chr1 24717784 24718104 - +chr1 25015050 25015370 - +chr1 25174555 25174644 - +chr1 25224718 25224798 - +chr1 25363677 25363997 - +chr1 25428050 25428165 - +chr1 25899274 25899594 - +chr1 25945582 25945902 - +chr1 26137358 26137678 - +chr1 26221995 26222077 - +chr1 26324659 26324979 - +chr1 26362560 26362696 - +chr1 26431473 26431793 - +chr1 26496141 26496461 - +chr1 26690327 26690647 - +chr1 26718097 26718231 - +chr1 26743553 26743873 - +chr1 26826692 26827012 - +chr1 26946723 26947043 - +chr1 27115679 27115999 - +chr1 27136664 27136745 - +chr1 27179611 27179931 - +chr1 27293514 27293668 - +chr1 27324143 27324463 - +chr1 27371106 27371287 - +chr1 27372423 27372743 - +chr1 27423927 27424247 - +chr1 27429296 27429442 - +chr1 27481537 27481857 - +chr1 27524425 27524745 - +chr1 27627293 27627613 - +chr1 27659292 27659612 - +chr1 27714914 27715234 - +chr1 27721918 27722238 - +chr1 27883257 27883577 - +chr1 27932891 27933211 - +chr1 27935136 27935456 - +chr1 27970540 27970621 - +chr1 27989687 27990007 - +chr1 28184660 28184804 - +chr1 28417744 28418064 - +chr1 28535400 28535720 - +chr1 28575110 28575221 - +chr1 28844619 28844749 - +chr1 29063078 29063398 - +chr1 29203724 29204044 - +chr1 29208580 29208900 - +chr1 29529406 29529726 - +chr1 29706020 29706340 - +chr1 30170858 30171178 - +chr1 30176533 30176613 - +chr1 30265877 30266197 - +chr1 30558832 30559152 - +chr1 30853261 30853581 - +chr1 31180976 31181296 - +chr1 31192073 31192393 - +chr1 31252460 31252780 - +chr1 31255873 31255961 - 
+chr1 31274718 31274807 - +chr1 31384243 31384563 - +chr1 31635541 31635861 - +chr1 31654291 31654611 - +chr1 31656800 31657120 - +chr1 31895733 31895813 - +chr1 32014508 32014828 - +chr1 32029685 32030005 - +chr1 32059623 32059943 - +chr1 32083786 32084106 - +chr1 32129547 32129680 - +chr1 32181384 32181496 - +chr1 32211343 32211467 - +chr1 32254849 32255169 - +chr1 32429773 32429986 - +chr1 32469185 32469505 - +chr1 32681361 32681446 - +chr1 32713815 32714135 - +chr1 32800739 32801059 - +chr1 32866557 32866646 - +chr1 32879579 32879899 - +chr1 32913935 32914255 - +chr1 32986122 32986278 - +chr1 33177887 33178015 - +chr1 33238612 33238932 - +chr1 33338256 33338576 - +chr1 33342226 33342546 - +chr1 33520764 33521084 - +chr1 33551982 33552302 - +chr1 33560719 33560865 - +chr1 33592812 33593132 - +chr1 33603433 33603753 - +chr1 33761095 33761415 - +chr1 33764365 33764546 - +chr1 33833492 33833599 - +chr1 33921447 33921767 - +chr1 34078510 34078830 - +chr1 34098743 34098883 - +chr1 35208305 35208625 - +chr1 35252081 35252401 - +chr1 35318309 35318629 - +chr1 35325333 35325653 - +chr1 35332044 35332179 - +chr1 35667721 35668041 - +chr1 35916679 35916999 - +chr1 36030272 36030592 - +chr1 36034550 36034870 - +chr1 36181130 36181450 - +chr1 36192476 36192796 - +chr1 36240305 36240625 - +chr1 36348531 36348851 - +chr1 36554305 36554436 - +chr1 36565213 36565533 - +chr1 36600369 36600689 - +chr1 36615125 36615445 - +chr1 36653786 36654106 - +chr1 36765510 36765830 - +chr1 36834754 36834838 - +chr1 36886438 36886758 - +chr1 36981968 36982288 - +chr1 37041685 37042005 - +chr1 37116182 37116327 - +chr1 37169338 37169658 - +chr1 37789413 37789733 - +chr1 37900000 37900150 - +chr1 37920550 37920697 - +chr1 37953081 37953272 - +chr1 38068740 38069060 - +chr1 38276854 38276939 - +chr1 38325784 38326104 - +chr1 38455773 38455893 - +chr1 39052759 39053079 - +chr1 39876614 39876934 - +chr1 40004898 40005218 - +chr1 40119233 40119553 - +chr1 40123727 40124047 - +chr1 40265362 40265682 
- +chr1 40282482 40282802 - +chr1 40439184 40439504 - +chr1 40544230 40544550 - +chr1 40609685 40610005 - +chr1 40781078 40781208 - +chr1 40804480 40804645 - +chr1 41077960 41078280 - +chr1 41134667 41134832 - +chr1 41239878 41240198 - +chr1 41314299 41314471 - +chr1 41315712 41316032 - +chr1 41418180 41418500 - +chr1 41733847 41734167 - +chr1 41748382 41748702 - +chr1 41882782 41883102 - +chr1 41898437 41898757 - +chr1 41981785 41982105 - +chr1 42039890 42040029 - +chr1 42612545 42612865 - +chr1 42639144 42639245 - +chr1 42642647 42642967 - +chr1 43222686 43223006 - +chr1 43243558 43243878 - +chr1 43379059 43379379 - +chr1 43539325 43539645 - +chr1 43692949 43693269 - +chr1 43703790 43704110 - +chr1 43994792 43995112 - +chr1 44020113 44020433 - +chr1 44229134 44229454 - +chr1 44400431 44400751 - +chr1 44471407 44471576 - +chr1 44502286 44502606 - +chr1 44513403 44513723 - +chr1 44683495 44683636 - +chr1 44784002 44784322 - +chr1 45060834 45061154 - +chr1 45118654 45118816 - +chr1 45121230 45121550 - +chr1 45180769 45180849 - +chr1 45194954 45195274 - +chr1 45196985 45197305 - +chr1 45251830 45252026 - +chr1 45265358 45265678 - +chr1 45301500 45301820 - +chr1 45480152 45480472 - +chr1 45484321 45484641 - +chr1 46035850 46035956 - +chr1 46502740 46503060 - +chr1 46645233 46645553 - +chr1 46689188 46689508 - +chr1 46813789 46814109 - +chr1 46872021 46872341 - +chr1 46908451 46908531 - +chr1 46912525 46912845 - +chr1 46943119 46943439 - +chr1 46993709 46994029 - +chr1 47190961 47191281 - +chr1 47641095 47641281 - +chr1 47644933 47645056 - +chr1 47658562 47658882 - +chr1 47854584 47854665 - +chr1 47890097 47890417 - +chr1 47898057 47898377 - +chr1 47902216 47902372 - +chr1 47903421 47903741 - +chr1 48176289 48176410 - +chr1 48191968 48192288 - +chr1 48463396 48463528 - +chr1 50814408 50814728 - +chr1 51655126 51655446 - +chr1 51657890 51658210 - +chr1 51762574 51762737 - +chr1 52234676 52234996 - +chr1 52467993 52468313 - +chr1 53151917 53152237 - +chr1 53743918 
53744238 - +chr1 53859044 53859364 - +chr1 53935743 53936063 - +chr1 53951149 53951469 - +chr1 54181156 54181476 - +chr1 54326570 54326890 - +chr1 54547367 54547687 - +chr1 54738606 54738926 - +chr1 54955089 54955409 - +chr1 55137815 55137929 - +chr1 55276169 55276489 - +chr1 55410777 55411097 - +chr1 55487746 55488066 - +chr1 55743231 55743551 - +chr1 55841355 55841675 - +chr1 55844195 55844515 - +chr1 57286828 57286982 - +chr1 57297566 57297886 - +chr1 57890187 57890507 - +chr1 59012500 59012615 - +chr1 59234980 59235300 - +chr1 60164951 60165271 - +chr1 60225267 60225379 - +chr1 60413017 60413337 - +chr1 60437080 60437400 - +chr1 60442499 60442819 - +chr1 61515573 61515893 - +chr1 62054357 62054677 - +chr1 62548433 62548753 - +chr1 63377712 63378032 - +chr1 63793285 63793605 - +chr1 63907337 63907657 - +chr1 64140115 64140435 - +chr1 64602799 64603119 - +chr1 65152449 65152769 - +chr1 66153487 66153807 - +chr1 66708482 66708623 - +chr1 66969579 66969899 - +chr1 67191764 67192084 - +chr1 67461771 67462091 - +chr1 67614891 67615211 - +chr1 67813381 67813701 - +chr1 67966260 67966365 - +chr1 68001189 68001334 - +chr1 68027346 68027666 - +chr1 68225996 68226316 - +chr1 68319445 68319553 - +chr1 68808510 68808711 - +chr1 70598688 70599008 - +chr1 76457236 76457556 - +chr1 76619942 76620046 - +chr1 76796688 76797008 - +chr1 77366903 77367223 - +chr1 78149094 78149414 - +chr1 78610787 78611107 - +chr1 78799143 78799463 - +chr1 79962892 79963212 - +chr1 85086578 85086898 - +chr1 85219341 85219484 - +chr1 85238049 85238369 - +chr1 85401928 85402248 - +chr1 85410648 85410968 - +chr1 85575104 85575424 - +chr1 85725294 85725614 - +chr1 85773837 85774157 - +chr1 85809936 85810256 - +chr1 86484036 86484356 - +chr1 87170010 87170330 - +chr1 87597440 87597760 - +chr1 88419486 88419806 - +chr1 89823888 89824208 - +chr1 90228641 90228765 - +chr1 90270239 90270559 - +chr1 91186999 91187319 - +chr1 91630938 91631059 - +chr1 91631709 91632029 - +chr1 93283938 93284258 - +chr1 
93297308 93297628 - +chr1 93324417 93324737 - +chr1 94279657 94279816 - +chr1 94505088 94505408 - +chr1 94527149 94527232 - +chr1 94947028 94947348 - +chr1 94979690 94980010 - +chr1 95271491 95271626 - +chr1 95319969 95320289 - +chr1 95507131 95507451 - +chr1 95509070 95509215 - +chr1 97432138 97432458 - +chr1 98516670 98516990 - +chr1 99259875 99260195 - +chr1 100925096 100925416 - +chr1 100929293 100929613 - +chr1 101704418 101704535 - +chr1 101754009 101754329 - +chr1 101774902 101775029 - +chr1 108668278 108668387 - +chr1 109288994 109289314 - +chr1 109642503 109642823 - +chr1 109655449 109655529 - +chr1 109743755 109744075 - +chr1 109782069 109782389 - +chr1 109806290 109806448 - +chr1 109820080 109820400 - +chr1 110033256 110033576 - +chr1 110036468 110036788 - +chr1 110074936 110075256 - +chr1 110075500 110075820 - +chr1 110088889 110089209 - +chr1 110182967 110183066 - +chr1 110185957 110186277 - +chr1 110316319 110316639 - +chr1 110319480 110319800 - +chr1 110334665 110334985 - +chr1 110527060 110527380 - +chr1 110572218 110572538 - +chr1 110596584 110596688 - +chr1 110602757 110603077 - +chr1 110648768 110649088 - +chr1 110659193 110659287 - +chr1 110739642 110739768 - +chr1 110745639 110745959 - +chr1 110750393 110750713 - +chr1 110855359 110855471 - +chr1 110955157 110955477 - +chr1 111002930 111003250 - +chr1 111038644 111038964 - +chr1 111161994 111162314 - +chr1 111326053 111326160 - +chr1 111573148 111573468 - +chr1 111765491 111765811 - +chr1 112024275 112024595 - +chr1 112050945 112051045 - +chr1 112298163 112298483 - +chr1 112879600 112879920 - +chr1 112901433 112901753 - +chr1 112912446 112912530 - +chr1 112933466 112933786 - +chr1 113044447 113044767 - +chr1 113249953 113250273 - +chr1 113258900 113259220 - +chr1 113261304 113261624 - +chr1 113346828 113347148 - +chr1 113351859 113352179 - +chr1 113423586 113423906 - +chr1 113683157 113683477 - +chr1 114825576 114825682 - +chr1 114889180 114889434 - +chr1 115595153 115595473 - +chr1 115655523 
115655674 - +chr1 115681526 115681846 - +chr1 116012433 116012525 - +chr1 116138487 116138807 - +chr1 116563316 116563636 - +chr1 116681703 116682023 - +chr1 116694750 116694843 - +chr1 117027894 117028214 - +chr1 117041706 117042026 - +chr1 117117261 117117581 - +chr1 117664698 117665018 - +chr1 117855131 117855451 - +chr1 118301494 118301814 - +chr1 118693977 118694297 - +chr1 119818055 119818375 - +chr1 119910157 119910477 - +chr1 120154333 120154479 - +chr1 120156414 120156734 - +chr1 120286727 120287047 - +chr1 120351606 120351926 - +chr1 120413638 120413788 - +chr1 120428809 120429129 - +chr1 120491259 120491579 - +chr1 144918099 144918257 - +chr1 145014486 145014806 - +chr1 145059083 145059230 - +chr1 145542889 145543209 - +chr1 145588569 145588889 - +chr1 145589312 145589632 - +chr1 145713590 145713910 - +chr1 146644156 146644476 - +chr1 146714075 146714395 - +chr1 146966913 146967040 - +chr1 147099318 147099638 - +chr1 147112673 147112766 - +chr1 147113827 147114147 - +chr1 147172163 147172293 - +chr1 147364935 147365097 - +chr1 147727183 147727503 - +chr1 149204286 149204606 - +chr1 149297691 149298011 - +chr1 150138225 150138545 - +chr1 150436845 150437165 - +chr1 150505057 150505377 - +chr1 150532511 150532644 - +chr1 150533395 150533715 - +chr1 150785714 150785804 - +chr1 150894988 150895308 - +chr1 150952019 150952241 - +chr1 150997057 150997377 - +chr1 151032008 151032328 - +chr1 151043539 151043859 - +chr1 151128530 151128850 - +chr1 151148455 151148775 - +chr1 151224211 151224531 - +chr1 151273497 151273578 - +chr1 151507184 151507504 - +chr1 151566631 151566951 - +chr1 151707551 151707871 - +chr1 151906514 151906834 - +chr1 151940470 151940790 - +chr1 151969313 151969633 - +chr1 151974318 151974462 - +chr1 152007049 152007369 - +chr1 152025059 152025379 - +chr1 152162320 152162640 - +chr1 152430941 152431261 - +chr1 152432751 152433071 - +chr1 152842307 152842627 - +chr1 153324925 153325245 - +chr1 153467432 153467752 - +chr1 153483401 153483721 - 
+chr1 153505627 153505947 - +chr1 153541128 153541287 - +chr1 153544984 153545304 - +chr1 153600010 153600330 - +chr1 153650267 153650390 - +chr1 153721918 153722238 - +chr1 153769717 153770037 - +chr1 153919071 153919391 - +chr1 154244684 154245004 - +chr1 154297176 154297291 - +chr1 154298703 154299023 - +chr1 154307564 154307884 - +chr1 154317737 154318057 - +chr1 154452458 154452538 - +chr1 154518913 154519233 - +chr1 154532119 154532199 - +chr1 154836221 154836348 - +chr1 154850986 154851306 - +chr1 154916272 154916592 - +chr1 154971601 154971921 - +chr1 154990098 154990252 - +chr1 155017391 155017711 - +chr1 155022946 155023266 - +chr1 155034070 155034258 - +chr1 155057175 155057495 - +chr1 155058671 155058751 - +chr1 155064029 155064162 - +chr1 155140370 155140690 - +chr1 155145693 155146013 - +chr1 155149094 155149414 - +chr1 155163371 155163497 - +chr1 155176630 155176950 - +chr1 155220562 155220757 - +chr1 155293097 155293417 - +chr1 155698948 155699268 - +chr1 155959274 155959418 - +chr1 156036235 156036367 - +chr1 156066423 156066743 - +chr1 156100279 156100367 - +chr1 156195354 156195434 - +chr1 156352879 156352990 - +chr1 156355395 156355715 - +chr1 156415735 156415863 - +chr1 156426230 156426550 - +chr1 156450520 156450840 - +chr1 156553973 156554064 - +chr1 156577898 156578218 - +chr1 156595648 156595968 - +chr1 156627354 156627502 - +chr1 156629838 156629981 - +chr1 156647838 156648158 - +chr1 156659299 156659619 - +chr1 156675756 156676076 - +chr1 156695606 156695926 - +chr1 156767106 156767426 - +chr1 156808798 156808882 - +chr1 156829893 156830213 - +chr1 156859920 156860240 - +chr1 156862056 156862376 - +chr1 156880038 156880358 - +chr1 156890202 156890522 - +chr1 156897632 156897952 - +chr1 157059951 157060271 - +chr1 157125835 157126155 - +chr1 157133454 157133774 - +chr1 157163452 157163772 - +chr1 157339748 157340068 - +chr1 157459195 157459515 - +chr1 157465882 157466202 - +chr1 157933448 157933768 - +chr1 157939579 157939899 - +chr1 
157961836 157962013 - +chr1 158057233 158057370 - +chr1 158065574 158065894 - +chr1 158087588 158087908 - +chr1 158094792 158095112 - +chr1 158968512 158968832 - +chr1 159102092 159102412 - +chr1 159130922 159131015 - +chr1 159167804 159168124 - +chr1 159181341 159181661 - +chr1 159184104 159184200 - +chr1 159750319 159750639 - +chr1 159861471 159861616 - +chr1 159906289 159906458 - +chr1 159976152 159976472 - +chr1 160048429 160048749 - +chr1 160055423 160055743 - +chr1 160070887 160070988 - +chr1 160162897 160163217 - +chr1 160189396 160189716 - +chr1 160261130 160261450 - +chr1 160346472 160346559 - +chr1 160921192 160921512 - +chr1 160924018 160924338 - +chr1 160967128 160967448 - +chr1 161011273 161011593 - +chr1 161067465 161067785 - +chr1 161162303 161162623 - +chr1 161179722 161179842 - +chr1 161197163 161197483 - +chr1 162025900 162025993 - +chr1 162361922 162362242 - +chr1 162583475 162583795 - +chr1 162817635 162817955 - +chr1 162847827 162848147 - +chr1 163144637 163144957 - +chr1 163379796 163380116 - +chr1 164756574 164756894 - +chr1 165083556 165083876 - +chr1 165336650 165336970 - +chr1 165338521 165338841 - +chr1 165362230 165362348 - +chr1 165562211 165562531 - +chr1 165893942 165894262 - +chr1 167028506 167028607 - +chr1 167035871 167036191 - +chr1 167237088 167237408 - +chr1 167411431 167411751 - +chr1 167503164 167503277 - +chr1 167657179 167657499 - +chr1 167684815 167684900 - +chr1 167772590 167772910 - +chr1 168051608 168051737 - +chr1 168054001 168054321 - +chr1 168115113 168115318 - +chr1 169681202 169681522 - +chr1 171157841 171158161 - +chr1 171214622 171214942 - +chr1 171614089 171614409 - +chr1 171624647 171624967 - +chr1 171770569 171770889 - +chr1 171954053 171954373 - +chr1 172413159 172413479 - +chr1 172419181 172419262 - +chr1 173224379 173224699 - +chr1 173400353 173400673 - +chr1 174084011 174084331 - +chr1 174998896 174999216 - +chr1 175094442 175094762 - +chr1 175115729 175116049 - +chr1 175121195 175121515 - +chr1 175124028 
175124348 - +chr1 175857117 175857437 - +chr1 176344140 176344460 - +chr1 176866946 176867266 - +chr1 177150852 177151172 - +chr1 177882862 177883182 - +chr1 177915117 177915437 - +chr1 177917512 177917832 - +chr1 178000946 178001266 - +chr1 178269074 178269394 - +chr1 178470994 178471115 - +chr1 178501471 178501791 - +chr1 178511643 178511963 - +chr1 178737502 178737822 - +chr1 178971927 178972062 - +chr1 179163012 179163332 - +chr1 179675500 179675820 - +chr1 179779090 179779188 - +chr1 179783921 179784241 - +chr1 179836906 179837226 - +chr1 180148088 180148195 - +chr1 180221764 180222084 - +chr1 180502658 180502978 - +chr1 181007575 181007895 - +chr1 181111641 181111961 - +chr1 181135751 181136071 - +chr1 181159086 181159268 - +chr1 181394128 181394320 - +chr1 182112328 182112648 - +chr1 182365886 182366206 - +chr1 182564817 182565137 - +chr1 182755836 182756156 - +chr1 182931635 182931955 - +chr1 183197140 183197460 - +chr1 183204151 183204471 - +chr1 183250609 183250929 - +chr1 183287674 183287994 - +chr1 183512755 183513075 - +chr1 183578404 183578724 - +chr1 184191739 184192059 - +chr1 184633131 184633451 - +chr1 184660889 184661209 - +chr1 184951427 184951514 - +chr1 185565242 185565562 - +chr1 186181796 186182116 - +chr1 186344476 186344796 - +chr1 186451943 186452035 - +chr1 193090907 193091227 - +chr1 197884951 197885271 - +chr1 198125252 198125572 - +chr1 199082426 199082746 - +chr1 200276650 200276970 - +chr1 200772148 200772280 - +chr1 201012070 201012390 - +chr1 201083269 201083589 - +chr1 201192133 201192286 - +chr1 201223789 201223939 - +chr1 201315762 201316082 - +chr1 201317264 201317584 - +chr1 201330785 201331105 - +chr1 201346005 201346325 - +chr1 201354471 201354791 - +chr1 201377434 201377605 - +chr1 201404842 201405162 - +chr1 201482197 201482356 - +chr1 201499885 201500205 - +chr1 201528795 201529115 - +chr1 201673365 201673685 - +chr1 201762271 201762419 - +chr1 201798007 201798327 - +chr1 201865039 201865359 - +chr1 201968781 201969101 - 
+chr1 202088130 202088450 - +chr1 202104342 202104662 - +chr1 202153833 202154153 - +chr1 202161941 202162261 - +chr1 202205556 202205724 - +chr1 202284884 202284986 - +chr1 202541016 202541103 - +chr1 202569287 202569607 - +chr1 202594561 202594881 - +chr1 202938113 202938433 - +chr1 203008470 203008790 - +chr1 203236419 203236739 - +chr1 203238205 203238525 - +chr1 203242217 203242537 - +chr1 203307673 203307812 - +chr1 203309206 203309526 - +chr1 203311621 203311941 - +chr1 203456622 203456712 - +chr1 203456919 203457239 - +chr1 203830408 203830728 - +chr1 203840189 203840509 - +chr1 204096880 204097037 - +chr1 204098171 204098491 - +chr1 204131710 204131879 - +chr1 204176234 204176554 - +chr1 204183349 204183669 - +chr1 204227537 204227857 - +chr1 204332823 204333143 - +chr1 204400591 204400671 - +chr1 204716088 204716239 - +chr1 204760910 204761230 - +chr1 204776439 204776597 - +chr1 204911008 204911102 - +chr1 204944783 204945103 - +chr1 204958574 204958694 - +chr1 205030723 205031043 - +chr1 205091340 205091660 - +chr1 205110785 205111105 - +chr1 205225458 205225548 - +chr1 205242738 205242844 - +chr1 205304826 205304988 - +chr1 205323434 205323754 - +chr1 205399955 205400275 - +chr1 205454753 205454895 - +chr1 205498098 205498418 - +chr1 205561047 205561202 - +chr1 205626794 205626965 - +chr1 205628043 205628363 - +chr1 205680254 205680574 - +chr1 205760734 205761054 - +chr1 205888081 205888401 - +chr1 206234408 206234728 - +chr1 206310515 206310595 - +chr1 206652869 206653189 - +chr1 206663521 206663664 - +chr1 206718124 206718444 - +chr1 206757030 206757350 - +chr1 206785829 206785928 - +chr1 206831842 206832162 - +chr1 206837565 206837885 - +chr1 206907975 206908134 - +chr1 207038402 207038483 - +chr1 207084496 207084816 - +chr1 207178374 207178694 - +chr1 207251157 207251477 - +chr1 207793688 207794008 - +chr1 207970729 207971049 - +chr1 208002063 208002149 - +chr1 208058420 208058740 - +chr1 208136900 208137220 - +chr1 208171160 208171480 - +chr1 
208195156 208195476 - +chr1 208267138 208267458 - +chr1 208305558 208305878 - +chr1 208412390 208412710 - +chr1 209422034 209422354 - +chr1 209508478 209508798 - +chr1 209527680 209528000 - +chr1 209609163 209609255 - +chr1 209739440 209739760 - +chr1 209742811 209743131 - +chr1 209750188 209750307 - +chr1 209801115 209801261 - +chr1 209866262 209866582 - +chr1 209921183 209921503 - +chr1 210033664 210033984 - +chr1 210484461 210484781 - +chr1 210547845 210547929 - +chr1 210573976 210574296 - +chr1 210580711 210580797 - +chr1 211307512 211307832 - +chr1 211496820 211497140 - +chr1 211526268 211526588 - +chr1 211643413 211643733 - +chr1 211649605 211649925 - +chr1 211687373 211687693 - +chr1 211751050 211751370 - +chr1 211828577 211828897 - +chr1 211871970 211872053 - +chr1 212185425 212185560 - +chr1 212541468 212541548 - +chr1 212629336 212629464 - +chr1 212650523 212650843 - +chr1 212803526 212803636 - +chr1 212838600 212838920 - +chr1 213141147 213141467 - +chr1 213547666 213547986 - +chr1 214434962 214435056 - +chr1 214502213 214502533 - +chr1 214505198 214505518 - +chr1 214506429 214506527 - +chr1 214801505 214801825 - +chr1 214834255 214834575 - +chr1 219559194 219559514 - +chr1 219834518 219834598 - +chr1 220469603 220469923 - +chr1 220853442 220853762 - +chr1 220863452 220863772 - +chr1 220906442 220906762 - +chr1 220948619 220948939 - +chr1 220950077 220950397 - +chr1 220977022 220977342 - +chr1 220983585 220983743 - +chr1 221376832 221377152 - +chr1 221608798 221609118 - +chr1 221651408 221651728 - +chr1 222269536 222269856 - +chr1 222628516 222628836 - +chr1 222638767 222638924 - +chr1 223254771 223255091 - +chr1 223263514 223263669 - +chr1 223297073 223297393 - +chr1 223302657 223302977 - +chr1 223307023 223307343 - +chr1 223316225 223316545 - +chr1 223705117 223705437 - +chr1 223884165 223884485 - +chr1 223936875 223937195 - +chr1 224701442 224701762 - +chr1 224832289 224832609 - +chr1 224839948 224840268 - +chr1 225039213 225039533 - +chr1 225506710 
225507030 - +chr1 225662708 225662915 - +chr1 225668630 225668950 - +chr1 225960026 225960346 - +chr1 225965190 225965510 - +chr1 226033522 226033842 - +chr1 226163704 226164024 - +chr1 226313008 226313105 - +chr1 226384792 226384927 - +chr1 226735482 226735663 - +chr1 226796448 226796768 - +chr1 226814179 226814499 - +chr1 226821820 226822140 - +chr1 226898967 226899287 - +chr1 227015799 227016119 - +chr1 227669382 227669482 - +chr1 227730265 227730585 - +chr1 227918145 227918287 - +chr1 227950149 227950469 - +chr1 228187085 228187405 - +chr1 228318693 228319013 - +chr1 228424517 228424837 - +chr1 228646931 228647251 - +chr1 228688777 228688893 - +chr1 229363212 229363532 - +chr1 229650462 229650782 - +chr1 229694524 229694844 - +chr1 229797123 229797223 - +chr1 230303148 230303468 - +chr1 230404662 230404982 - +chr1 230446447 230446585 - +chr1 230778012 230778186 - +chr1 230933630 230933950 - +chr1 231347088 231347408 - +chr1 231726396 231726716 - +chr1 231827448 231827768 - +chr1 231892642 231892786 - +chr1 232002412 232002732 - +chr1 233376863 233377183 - +chr1 233926872 233927192 - +chr1 234445731 234446051 - +chr1 234462386 234462706 - +chr1 234634828 234635148 - +chr1 235247159 235247260 - +chr1 235257166 235257486 - +chr1 235267513 235267833 - +chr1 235416523 235416843 - +chr1 235578668 235578988 - +chr1 235580453 235580773 - +chr1 235667862 235668182 - +chr1 235677101 235677421 - +chr1 236135056 236135376 - +chr1 237032214 237032534 - +chr1 238027590 238027910 - +chr1 240061447 240061767 - +chr1 240114858 240115178 - +chr1 240116283 240116603 - +chr1 240139695 240140015 - +chr1 240167433 240167753 - +chr1 240190387 240190471 - +chr1 240855429 240855749 - +chr1 241051374 241051694 - +chr1 244436461 244436558 - +chr1 244462792 244463112 - +chr1 245206001 245206321 - +chr1 246166479 246166799 - +chr1 246655962 246656282 - +chr1 246852819 246853139 - +chr1 246859794 246860114 - +chr1 246862499 246862819 - +chr1 246917235 246917555 - +chr1 247134699 247135019 - 
+chr1 247536025 247536345 - +chr1 247553526 247553613 - +chr10 120002 120322 - +chr10 556979 557299 - +chr10 2970463 2970783 - +chr10 3085772 3086092 - +chr10 3235047 3235367 - +chr10 3239559 3239879 - +chr10 3823662 3823982 - +chr10 4698815 4699135 - +chr10 4891974 4892110 - +chr10 5665155 5665475 - +chr10 5671828 5672148 - +chr10 6128393 6128713 - +chr10 6130696 6130937 - +chr10 6193391 6193711 - +chr10 6244555 6244875 - +chr10 6487674 6487994 - +chr10 6689943 6690263 - +chr10 6711250 6711570 - +chr10 6961969 6962289 - +chr10 8095352 8095483 - +chr10 9251187 9251507 - +chr10 11312578 11312898 - +chr10 11466525 11466845 - +chr10 11767226 11767546 - +chr10 11800056 11800186 - +chr10 12028632 12028952 - +chr10 12306160 12306480 - +chr10 12753208 12753308 - +chr10 13116853 13117173 - +chr10 13141643 13141963 - +chr10 13203160 13203480 - +chr10 13344360 13344680 - +chr10 13413919 13414239 - +chr10 13482520 13482840 - +chr10 13578788 13579108 - +chr10 13628781 13628949 - +chr10 13702014 13702334 - +chr10 13998950 13999270 - +chr10 14238945 14239265 - +chr10 14865192 14865512 - +chr10 15148433 15148753 - +chr10 15251839 15252159 - +chr10 15461340 15461660 - +chr10 15627297 15627617 - +chr10 15762083 15762403 - +chr10 17472894 17472994 - +chr10 18503923 18504243 - +chr10 21367648 21367968 - +chr10 21559029 21559140 - +chr10 21784628 21784948 - +chr10 21964134 21964454 - +chr10 22021133 22021453 - +chr10 23277832 23278152 - +chr10 23357146 23357466 - +chr10 23447437 23447523 - +chr10 24485488 24485599 - +chr10 24649478 24649798 - +chr10 24792598 24792918 - +chr10 24807973 24808293 - +chr10 24906713 24907033 - +chr10 25369919 25370239 - +chr10 25402097 25402417 - +chr10 27649519 27649839 - +chr10 28031046 28031366 - +chr10 28530747 28531067 - +chr10 28670836 28671156 - +chr10 28682915 28683235 - +chr10 28721107 28721427 - +chr10 28917081 28917401 - +chr10 28952904 28953021 - +chr10 29280707 29281027 - +chr10 29593534 29593854 - +chr10 29666315 29666635 - +chr10 29675307 
29675627 - +chr10 29681360 29681680 - +chr10 29785429 29785575 - +chr10 29824469 29824789 - +chr10 30024747 30025067 - +chr10 30147403 30147723 - +chr10 30287338 30287539 - +chr10 30291414 30291734 - +chr10 30648611 30648931 - +chr10 30681447 30681767 - +chr10 30823496 30823816 - +chr10 30870751 30871071 - +chr10 30880159 30880479 - +chr10 30928881 30929201 - +chr10 30976623 30976943 - +chr10 31072803 31073123 - +chr10 32125772 32126092 - +chr10 32545681 32545804 - +chr10 32635231 32635333 - +chr10 33297753 33297908 - +chr10 33482270 33482590 - +chr10 33619435 33619755 - +chr10 34926524 34926681 - +chr10 34927413 34927733 - +chr10 35415676 35415996 - +chr10 35484997 35485317 - +chr10 35852305 35852523 - +chr10 35924450 35924770 - +chr10 38265562 38265882 - +chr10 43248746 43248839 - +chr10 43362125 43362445 - +chr10 43601048 43601184 - +chr10 43843924 43844244 - +chr10 43951138 43951458 - +chr10 44185500 44185820 - +chr10 44882038 44882358 - +chr10 45353289 45353492 - +chr10 45360300 45360620 - +chr10 45428253 45428395 - +chr10 45470541 45470621 - +chr10 45960534 45960617 - +chr10 46951331 46951536 - +chr10 46962552 46962872 - +chr10 46983727 46983962 - +chr10 47004244 47004564 - +chr10 47081063 47081222 - +chr10 47599893 47600036 - +chr10 47600519 47600839 - +chr10 48377814 48378134 - +chr10 48398546 48398680 - +chr10 48426893 48427213 - +chr10 49423535 49423855 - +chr10 49497726 49498046 - +chr10 49816212 49816532 - +chr10 49864315 49864635 - +chr10 49866330 49866650 - +chr10 50189380 50189700 - +chr10 50615610 50615930 - +chr10 50801198 50801375 - +chr10 50802601 50802921 - +chr10 51564940 51565260 - +chr10 52181063 52181383 - +chr10 52770709 52771029 - +chr10 53182392 53182712 - +chr10 54074180 54074500 - +chr10 54500620 54500940 - +chr10 54539991 54540134 - +chr10 60290882 60291202 - +chr10 61569164 61569484 - +chr10 61590545 61590627 - +chr10 63510710 63511030 - +chr10 63798417 63798737 - +chr10 64353885 64354022 - +chr10 64370423 64370743 - +chr10 65389171 
65389491 - +chr10 65479780 65480100 - +chr10 65579547 65579867 - +chr10 65626201 65626521 - +chr10 65632125 65632445 - +chr10 65800973 65801293 - +chr10 69615379 69615699 - +chr10 69916164 69916484 - +chr10 69968234 69968554 - +chr10 70108231 70108551 - +chr10 70310617 70310764 - +chr10 70997456 70997537 - +chr10 71152418 71152550 - +chr10 71173032 71173352 - +chr10 71267703 71267890 - +chr10 71275402 71275722 - +chr10 71389781 71390101 - +chr10 71399009 71399329 - +chr10 71404325 71404645 - +chr10 71476754 71477074 - +chr10 71548985 71549089 - +chr10 71559898 71560218 - +chr10 71661397 71661717 - +chr10 71674515 71674630 - +chr10 71718097 71718417 - +chr10 71875954 71876274 - +chr10 71981293 71981613 - +chr10 72139372 72139692 - +chr10 72221157 72221477 - +chr10 72300274 72300594 - +chr10 72309649 72309969 - +chr10 72575506 72575635 - +chr10 72639810 72640130 - +chr10 72663169 72663316 - +chr10 72685457 72685777 - +chr10 72968943 72969087 - +chr10 73030111 73030431 - +chr10 73044843 73045163 - +chr10 73074037 73074357 - +chr10 73143997 73144317 - +chr10 73329781 73330101 - +chr10 73368964 73369284 - +chr10 73456574 73456738 - +chr10 73533194 73533514 - +chr10 73571744 73572064 - +chr10 73656292 73656446 - +chr10 73715847 73716167 - +chr10 73808405 73808556 - +chr10 74096933 74097253 - +chr10 74100298 74100439 - +chr10 74124052 74124372 - +chr10 75403935 75404255 - +chr10 75608910 75609046 - +chr10 75639463 75639783 - +chr10 75645842 75646162 - +chr10 75700393 75700713 - +chr10 76782069 76782389 - +chr10 76859716 76860036 - +chr10 76947676 76947996 - +chr10 76990363 76990683 - +chr10 76995732 76995979 - +chr10 77155088 77155408 - +chr10 77796157 77796477 - +chr10 78744618 78744938 - +chr10 78764733 78765053 - +chr10 78933659 78933979 - +chr10 79011031 79011351 - +chr10 79151659 79151979 - +chr10 79188509 79188829 - +chr10 79471288 79471448 - +chr10 79479414 79479560 - +chr10 79516257 79516577 - +chr10 79606535 79606637 - +chr10 79807062 79807382 - +chr10 79971227 
79971547 - +chr10 80734534 80734854 - +chr10 80955258 80955578 - +chr10 80956644 80956964 - +chr10 81106942 81107262 - +chr10 81134341 81134661 - +chr10 81227191 81227511 - +chr10 81321998 81322136 - +chr10 81689191 81689511 - +chr10 81806673 81806993 - +chr10 82020411 82020731 - +chr10 82204538 82204643 - +chr10 82265418 82265597 - +chr10 82356546 82356683 - +chr10 82359195 82359515 - +chr10 82422428 82422748 - +chr10 82423862 82424182 - +chr10 82459288 82459608 - +chr10 85907796 85908116 - +chr10 85965389 85965709 - +chr10 86016849 86017169 - +chr10 86024478 86024563 - +chr10 86112027 86112347 - +chr10 86300266 86300586 - +chr10 88137113 88137254 - +chr10 88159577 88159897 - +chr10 88345357 88345444 - +chr10 88448082 88448402 - +chr10 88453533 88453853 - +chr10 88470851 88471171 - +chr10 88699223 88699443 - +chr10 88720293 88720435 - +chr10 88726326 88726646 - +chr10 88963320 88963465 - +chr10 90342931 90343251 - +chr10 90570056 90570376 - +chr10 90692446 90692766 - +chr10 90694584 90694904 - +chr10 91020124 91020444 - +chr10 92547694 92548014 - +chr10 92671348 92671668 - +chr10 92766150 92766253 - +chr10 93011238 93011558 - +chr10 93058034 93058354 - +chr10 93311622 93311942 - +chr10 93395176 93395496 - +chr10 94334135 94334455 - +chr10 94602871 94603191 - +chr10 94840048 94840368 - +chr10 95136157 95136477 - +chr10 95301892 95302032 - +chr10 95314554 95314874 - +chr10 95575951 95576271 - +chr10 96039977 96040100 - +chr10 96122622 96122942 - +chr10 96892709 96893029 - +chr10 96943357 96943677 - +chr10 97134396 97134716 - +chr10 97190263 97190357 - +chr10 97205662 97205982 - +chr10 97360923 97361243 - +chr10 97453877 97454020 - +chr10 97989325 97989645 - +chr10 98094687 98095007 - +chr10 98130363 98130683 - +chr10 98135088 98135408 - +chr10 98155689 98156009 - +chr10 98402739 98403059 - +chr10 98525550 98525650 - +chr10 98590318 98590638 - +chr10 98797130 98797450 - +chr10 98832726 98832867 - +chr10 98841402 98841722 - +chr10 98956359 98956679 - +chr10 98970278 
98970598 - +chr10 99223430 99223750 - +chr10 99313370 99313492 - +chr10 99332021 99332341 - +chr10 99436448 99436768 - +chr10 99461302 99461390 - +chr10 99469272 99469592 - +chr10 99491523 99491843 - +chr10 99519273 99519445 - +chr10 99551762 99552082 - +chr10 99674526 99674846 - +chr10 99805552 99805872 - +chr10 100185459 100185779 - +chr10 100227205 100227525 - +chr10 100424205 100424525 - +chr10 101152304 101152459 - +chr10 101614615 101614935 - +chr10 101805806 101806126 - +chr10 102098323 102098416 - +chr10 102102106 102102426 - +chr10 102193566 102193886 - +chr10 102242371 102242691 - +chr10 102295481 102295590 - +chr10 102321739 102322059 - +chr10 102638250 102638570 - +chr10 102757449 102757769 - +chr10 102758863 102759183 - +chr10 102772730 102772816 - +chr10 102774147 102774308 - +chr10 102778735 102779055 - +chr10 102801077 102801190 - +chr10 102810033 102810353 - +chr10 102819747 102820067 - +chr10 102827630 102827950 - +chr10 102855393 102855713 - +chr10 102902109 102902229 - +chr10 102921610 102921728 - +chr10 102973405 102973725 - +chr10 102974501 102974821 - +chr10 103032022 103032342 - +chr10 103227097 103227417 - +chr10 103362674 103362754 - +chr10 103540965 103541090 - +chr10 103595748 103596068 - +chr10 103600543 103600633 - +chr10 103649408 103649728 - +chr10 103769701 103769859 - +chr10 103832825 103832913 - +chr10 103892202 103892522 - +chr10 103929276 103929401 - +chr10 103989658 103989761 - +chr10 104111899 104112219 - +chr10 104153646 104153966 - +chr10 104159293 104159613 - +chr10 104163580 104163900 - +chr10 104182471 104182791 - +chr10 104238828 104238942 - +chr10 104263499 104263602 - +chr10 104343619 104343939 - +chr10 104356273 104356593 - +chr10 104438537 104438857 - +chr10 104489027 104489347 - +chr10 104594733 104595053 - +chr10 104677174 104677494 - +chr10 105004466 105004786 - +chr10 105245546 105245700 - +chr10 105324571 105324891 - +chr10 105352531 105352851 - +chr10 105365475 105365795 - +chr10 105445975 105446295 - +chr10 
105472682 105473002 - +chr10 105512476 105512796 - +chr10 105644477 105644797 - +chr10 105805254 105805574 - +chr10 106034451 106034771 - +chr10 106051253 106051435 - +chr10 106057558 106057878 - +chr10 106065333 106065653 - +chr10 106340566 106340886 - +chr10 108384616 108384936 - +chr10 110304304 110304624 - +chr10 111612246 111612566 - +chr10 111643951 111644271 - +chr10 111738185 111738505 - +chr10 112114090 112114410 - +chr10 112370069 112370389 - +chr10 112431916 112432097 - +chr10 112440829 112441149 - +chr10 112506158 112506478 - +chr10 113875566 113875886 - +chr10 113897186 113897506 - +chr10 113899923 113900243 - +chr10 114074651 114074971 - +chr10 115144938 115145258 - +chr10 115281050 115281197 - +chr10 115323324 115323495 - +chr10 115438764 115439084 - +chr10 116063691 116064011 - +chr10 116231124 116231444 - +chr10 116463429 116463749 - +chr10 116636823 116637143 - +chr10 116639349 116639669 - +chr10 118047442 118047762 - +chr10 118204068 118204388 - +chr10 118441860 118442180 - +chr10 118547686 118547849 - +chr10 118556562 118556712 - +chr10 118934400 118934720 - +chr10 118935489 118935809 - +chr10 118976186 118976410 - +chr10 119202424 119202744 - +chr10 119256051 119256371 - +chr10 120000809 120001129 - +chr10 120183395 120183715 - +chr10 120898998 120899318 - +chr10 120958963 120959283 - +chr10 120969017 120969337 - +chr10 121176426 121176746 - +chr10 121204899 121205219 - +chr10 121249390 121249522 - +chr10 121362030 121362350 - +chr10 121510259 121510379 - +chr10 122702349 122702669 - +chr10 122708419 122708739 - +chr10 123646554 123646874 - +chr10 123902414 123902510 - +chr10 124027699 124027847 - +chr10 124058296 124058616 - +chr10 124326647 124326967 - +chr10 124739891 124739977 - +chr10 124802859 124803179 - +chr10 124892503 124892823 - +chr10 125318893 125319213 - +chr10 125341670 125341990 - +chr10 125754149 125754469 - +chr10 126077829 126078149 - +chr10 126218713 126219033 - +chr10 126228721 126229041 - +chr10 126429893 126430213 - 
+chr10 126736662 126736982 - +chr10 126749687 126750007 - +chr10 127622861 127623181 - +chr10 128075572 128075892 - +chr10 128286609 128286929 - +chr10 128437686 128438006 - +chr10 129041795 129042115 - +chr10 129809704 129809866 - +chr10 129833775 129834095 - +chr10 129867647 129867967 - +chr10 129876759 129877079 - +chr10 129948298 129948618 - +chr10 130010180 130010500 - +chr10 131926089 131926409 - +chr10 131988373 131988693 - +chr10 133836581 133836901 - +chr10 133955970 133956290 - +chr10 134036702 134036857 - +chr10 134266663 134266983 - +chr10 134272027 134272347 - +chr10 134468360 134468447 - +chr10 134650404 134650724 - +chr10 134717758 134718078 - +chr10 134753824 134754144 - +chr10 134957409 134957729 - +chr10 135143262 135143582 - +chr10 135251284 135251604 - +chr10 135333655 135333975 - +chr11 189881 190201 - +chr11 192082 192268 - +chr11 304958 305278 - +chr11 448266 448586 - +chr11 518681 519001 - +chr11 728170 728252 - +chr11 783689 783799 - +chr11 848406 848533 - +chr11 1051091 1051411 - +chr11 1108747 1108898 - +chr11 1224328 1224470 - +chr11 1536850 1536958 - +chr11 1616800 1616922 - +chr11 1650060 1650380 - +chr11 1661361 1661681 - +chr11 1674029 1674175 - +chr11 1714455 1714775 - +chr11 1715203 1715523 - +chr11 1853809 1853955 - +chr11 1858940 1859260 - +chr11 1903667 1903987 - +chr11 1970218 1970538 - +chr11 1974759 1975079 - +chr11 2058122 2058442 - +chr11 2171444 2171764 - +chr11 2172716 2173036 - +chr11 2173217 2173537 - +chr11 2177701 2178021 - +chr11 2216259 2216395 - +chr11 2218467 2218787 - +chr11 2326686 2326792 - +chr11 2421481 2421801 - +chr11 2440913 2441233 - +chr11 2441964 2442284 - +chr11 2449445 2449765 - +chr11 2552744 2552859 - +chr11 2554118 2554254 - +chr11 2909734 2909827 - +chr11 2913337 2913657 - +chr11 3058529 3058849 - +chr11 3113144 3113271 - +chr11 3238397 3238717 - +chr11 3630933 3631253 - +chr11 3829086 3829406 - +chr11 3862813 3863133 - +chr11 3876134 3876454 - +chr11 4208796 4209116 - +chr11 4647426 4647746 - 
+chr11 4658143 4658463 - +chr11 5145308 5145628 - +chr11 5402009 5402329 - +chr11 5497479 5497799 - +chr11 5621522 5621842 - +chr11 5829712 5830032 - +chr11 6272376 6272696 - +chr11 6337079 6337399 - +chr11 6374772 6374928 - +chr11 6384456 6384563 - +chr11 6460235 6460555 - +chr11 6464673 6464993 - +chr11 6704613 6704933 - +chr11 6727191 6727511 - +chr11 6776591 6776911 - +chr11 6833640 6833960 - +chr11 7592918 7593238 - +chr11 7709507 7709827 - +chr11 8008458 8008778 - +chr11 8027938 8028258 - +chr11 8054734 8055054 - +chr11 8114205 8114525 - +chr11 8120899 8121219 - +chr11 8214255 8214335 - +chr11 8240839 8241159 - +chr11 8247131 8247451 - +chr11 8285400 8285720 - +chr11 8350332 8350652 - +chr11 8835164 8835484 - +chr11 9037171 9037491 - +chr11 9159401 9159543 - +chr11 9528323 9528643 - +chr11 9556684 9557004 - +chr11 9634749 9635069 - +chr11 9635622 9635942 - +chr11 10326026 10326346 - +chr11 10329448 10329573 - +chr11 10388234 10388554 - +chr11 10562710 10563030 - +chr11 10757493 10757601 - +chr11 10814032 10814352 - +chr11 10900210 10900530 - +chr11 10955380 10955501 - +chr11 11049669 11049989 - +chr11 11809348 11809668 - +chr11 11811307 11811627 - +chr11 11988436 11988756 - +chr11 12065812 12066132 - +chr11 12068916 12069236 - +chr11 12303423 12303743 - +chr11 12310284 12310392 - +chr11 12433848 12434168 - +chr11 12915667 12915987 - +chr11 12985632 12985952 - +chr11 13260598 13260716 - +chr11 13356840 13357160 - +chr11 13977457 13977777 - +chr11 14270911 14271231 - +chr11 14322274 14322594 - +chr11 14558617 14558937 - +chr11 14926397 14926717 - +chr11 15122658 15122978 - +chr11 15230200 15230520 - +chr11 15349255 15349575 - +chr11 15671649 15671786 - +chr11 15792391 15792711 - +chr11 15841244 15841564 - +chr11 16627905 16628012 - +chr11 16635222 16635542 - +chr11 16807074 16807394 - +chr11 16810020 16810155 - +chr11 16841848 16842168 - +chr11 17084983 17085303 - +chr11 17405582 17405707 - +chr11 17526149 17526469 - +chr11 17620599 17620761 - +chr11 17743611 
17743931 - +chr11 17752695 17752829 - +chr11 17994631 17994951 - +chr11 18000293 18000613 - +chr11 18001434 18001597 - +chr11 18013392 18013712 - +chr11 18137077 18137397 - +chr11 18192942 18193262 - +chr11 18265353 18265673 - +chr11 18743632 18743952 - +chr11 18749308 18749417 - +chr11 18753621 18753798 - +chr11 18785838 18786158 - +chr11 19092875 19092994 - +chr11 19098613 19098791 - +chr11 19281382 19281544 - +chr11 19332981 19333104 - +chr11 19575323 19575643 - +chr11 19579230 19579550 - +chr11 19585583 19585903 - +chr11 19764270 19764590 - +chr11 19959825 19960145 - +chr11 19975489 19975809 - +chr11 20132445 20132765 - +chr11 20149793 20150113 - +chr11 20153999 20154319 - +chr11 20375412 20375732 - +chr11 20408760 20408863 - +chr11 20631761 20631926 - +chr11 20669545 20669865 - +chr11 22942511 22942831 - +chr11 26298409 26298729 - +chr11 27180360 27180680 - +chr11 28719507 28719827 - +chr11 29181168 29181488 - +chr11 29350548 29350868 - +chr11 29365290 29365610 - +chr11 31892641 31892961 - +chr11 31979692 31980012 - +chr11 32195012 32195332 - +chr11 32197775 32198095 - +chr11 32814488 32814808 - +chr11 32830795 32831115 - +chr11 33087168 33087488 - +chr11 33829807 33830127 - +chr11 33851113 33851433 - +chr11 33879391 33879711 - +chr11 34009556 34009876 - +chr11 34018553 34018873 - +chr11 34195990 34196140 - +chr11 34380821 34381141 - +chr11 34534542 34534654 - +chr11 35507283 35507603 - +chr11 35856997 35857317 - +chr11 36287631 36287951 - +chr11 36398863 36399183 - +chr11 36766161 36766481 - +chr11 36768729 36768902 - +chr11 40315655 40315975 - +chr11 40328515 40328835 - +chr11 40743800 40744120 - +chr11 43304523 43304843 - +chr11 43873623 43873943 - +chr11 43911784 43912104 - +chr11 43966340 43966660 - +chr11 44058940 44059260 - +chr11 44115846 44116166 - +chr11 44117004 44117324 - +chr11 44131798 44132118 - +chr11 44307999 44308114 - +chr11 44545068 44545388 - +chr11 44560065 44560385 - +chr11 44578226 44578546 - +chr11 44638441 44638761 - +chr11 44642795 
44642964 - +chr11 44749409 44749729 - +chr11 44915730 44916050 - +chr11 44948643 44948963 - +chr11 44958107 44958223 - +chr11 45124800 45124890 - +chr11 45127083 45127403 - +chr11 45255022 45255342 - +chr11 45392766 45393086 - +chr11 45676950 45677083 - +chr11 45792874 45793026 - +chr11 45793524 45793844 - +chr11 45864700 45865020 - +chr11 45894293 45894613 - +chr11 45986452 45986772 - +chr11 46260814 46260927 - +chr11 46300008 46300328 - +chr11 46318562 46318882 - +chr11 46330131 46330451 - +chr11 46336719 46337039 - +chr11 46373674 46373994 - +chr11 46391093 46391256 - +chr11 46410962 46411282 - +chr11 46550871 46551191 - +chr11 46559539 46559619 - +chr11 46639331 46639651 - +chr11 46911183 46911503 - +chr11 46941458 46941778 - +chr11 47174755 47174911 - +chr11 47182621 47182941 - +chr11 47376897 47377036 - +chr11 47395489 47395809 - +chr11 47433751 47434071 - +chr11 47475781 47476101 - +chr11 47638627 47638947 - +chr11 48194499 48194662 - +chr11 48329344 48329664 - +chr11 57016583 57016903 - +chr11 57017230 57017550 - +chr11 57045353 57045673 - +chr11 57117324 57117644 - +chr11 57224927 57225059 - +chr11 57226120 57226440 - +chr11 57247809 57248129 - +chr11 57260915 57261235 - +chr11 57294646 57294966 - +chr11 57335760 57335892 - +chr11 57412314 57412634 - +chr11 58053770 58054090 - +chr11 58291731 58291852 - +chr11 58292101 58292421 - +chr11 58335399 58335719 - +chr11 58421489 58421809 - +chr11 58443141 58443284 - +chr11 58555935 58556255 - +chr11 58826011 58826125 - +chr11 58903468 58903624 - +chr11 58925973 58926293 - +chr11 58953768 58954088 - +chr11 58974159 58974479 - +chr11 59448063 59448383 - +chr11 59521461 59521781 - +chr11 60542835 60543155 - +chr11 60598509 60598829 - +chr11 60655056 60655376 - +chr11 60666924 60667079 - +chr11 60737800 60738120 - +chr11 60897347 60897518 - +chr11 60955133 60955276 - +chr11 61021931 61022251 - +chr11 61103787 61103897 - +chr11 61153066 61153386 - +chr11 61203010 61203330 - +chr11 61213673 61213993 - +chr11 61245115 
61245435 - +chr11 61284540 61284620 - +chr11 61299640 61299960 - +chr11 61395628 61395948 - +chr11 61405998 61406141 - +chr11 61447591 61447911 - +chr11 61500988 61501147 - +chr11 61515153 61515473 - +chr11 61543987 61544307 - +chr11 61638615 61638935 - +chr11 61665144 61665464 - +chr11 61666672 61666838 - +chr11 61715223 61715359 - +chr11 61849545 61849865 - +chr11 62078727 62078874 - +chr11 62192211 62192531 - +chr11 62310334 62310497 - +chr11 62328038 62328358 - +chr11 62379985 62380142 - +chr11 62462213 62462533 - +chr11 62554078 62554237 - +chr11 62554553 62554873 - +chr11 62630130 62630450 - +chr11 62673514 62673720 - +chr11 62791635 62791849 - +chr11 63246302 63246622 - +chr11 63360603 63360923 - +chr11 63403357 63403677 - +chr11 63506392 63506475 - +chr11 63604712 63605032 - +chr11 63913585 63913905 - +chr11 63916240 63916320 - +chr11 63920641 63920831 - +chr11 64068620 64068702 - +chr11 64071785 64072105 - +chr11 64099341 64099661 - +chr11 64122549 64122869 - +chr11 64216300 64216620 - +chr11 64217040 64217360 - +chr11 64321074 64321192 - +chr11 64420585 64420905 - +chr11 64455874 64456194 - +chr11 64491054 64491184 - +chr11 64510360 64510680 - +chr11 64535245 64535565 - +chr11 64555082 64555402 - +chr11 64614701 64614806 - +chr11 64655370 64655690 - +chr11 64661763 64662083 - +chr11 64781027 64781347 - +chr11 64781626 64781946 - +chr11 64877856 64878022 - +chr11 64879972 64880292 - +chr11 64885257 64885577 - +chr11 64972231 64972551 - +chr11 65042498 65042818 - +chr11 65083095 65083415 - +chr11 65133734 65134054 - +chr11 65183826 65184146 - +chr11 65278255 65278575 - +chr11 65308333 65308653 - +chr11 65479320 65479475 - +chr11 65547370 65547690 - +chr11 65585770 65585925 - +chr11 65627676 65627996 - +chr11 65652213 65652385 - +chr11 65655825 65656145 - +chr11 65756568 65756716 - +chr11 65789933 65790253 - +chr11 65812615 65812935 - +chr11 65835736 65836056 - +chr11 65990236 65990436 - +chr11 66025726 66026046 - +chr11 66049995 66050315 - +chr11 66079268 
66079588 - +chr11 66095826 66096146 - +chr11 66131357 66131677 - +chr11 66153639 66153959 - +chr11 66156613 66156729 - +chr11 66176010 66176188 - +chr11 66197510 66197830 - +chr11 66246428 66246748 - +chr11 66311199 66311519 - +chr11 66346480 66346800 - +chr11 66462299 66462483 - +chr11 66821345 66821665 - +chr11 66839214 66839534 - +chr11 66848935 66849255 - +chr11 67030166 67030486 - +chr11 67085047 67085367 - +chr11 67113334 67113654 - +chr11 67118659 67118979 - +chr11 67124341 67124661 - +chr11 67150115 67150275 - +chr11 67273693 67273814 - +chr11 67396701 67397021 - +chr11 67398092 67398412 - +chr11 67426919 67427239 - +chr11 67976764 67976846 - +chr11 68024866 68025186 - +chr11 68065220 68065540 - +chr11 68151508 68151649 - +chr11 68214704 68215024 - +chr11 68780852 68781172 - +chr11 68811707 68812027 - +chr11 68856189 68856509 - +chr11 68868973 68869114 - +chr11 68899041 68899361 - +chr11 69059826 69060146 - +chr11 69485006 69485326 - +chr11 69500742 69500875 - +chr11 69616437 69616586 - +chr11 69703355 69703675 - +chr11 70037681 70038001 - +chr11 70042331 70042452 - +chr11 70313623 70313943 - +chr11 70601658 70601978 - +chr11 70661090 70661220 - +chr11 71131499 71131819 - +chr11 71291221 71291370 - +chr11 71511435 71511755 - +chr11 71524811 71525131 - +chr11 71736271 71736351 - +chr11 71802344 71802664 - +chr11 71855065 71855385 - +chr11 71894610 71894697 - +chr11 71949239 71949559 - +chr11 72125231 72125551 - +chr11 72142154 72142255 - +chr11 72145255 72145459 - +chr11 72182462 72182591 - +chr11 72193347 72193667 - +chr11 72296749 72297069 - +chr11 72354087 72354407 - +chr11 72394893 72395026 - +chr11 72395577 72395897 - +chr11 72436275 72436595 - +chr11 72504858 72504965 - +chr11 72542598 72542734 - +chr11 72853345 72853665 - +chr11 72889131 72889451 - +chr11 72896269 72896589 - +chr11 72953697 72954017 - +chr11 73000616 73000765 - +chr11 73006635 73006716 - +chr11 73018331 73018651 - +chr11 73046606 73046773 - +chr11 73062256 73062576 - +chr11 73087081 
73087401 - +chr11 73104657 73104977 - +chr11 73115130 73115450 - +chr11 73189804 73190124 - +chr11 73336760 73337080 - +chr11 73372129 73372229 - +chr11 73675379 73675699 - +chr11 73966849 73967169 - +chr11 74413523 74413672 - +chr11 74437737 74437877 - +chr11 74450851 74451171 - +chr11 74734069 74734187 - +chr11 74743051 74743371 - +chr11 74851593 74851913 - +chr11 74916005 74916325 - +chr11 74959769 74959970 - +chr11 74991319 74991456 - +chr11 75039763 75040083 - +chr11 75095661 75095771 - +chr11 75156469 75156604 - +chr11 75243441 75243761 - +chr11 75258613 75258933 - +chr11 75275101 75275421 - +chr11 75417308 75417628 - +chr11 75434223 75434543 - +chr11 75514300 75514620 - +chr11 75863226 75863546 - +chr11 75918625 75918945 - +chr11 75919849 75920169 - +chr11 76029860 76030180 - +chr11 76391902 76392222 - +chr11 76432812 76433132 - +chr11 76510304 76510624 - +chr11 76518483 76518803 - +chr11 76630922 76631242 - +chr11 76757126 76757446 - +chr11 76784301 76784621 - +chr11 76802323 76802416 - +chr11 76824586 76824906 - +chr11 76838250 76838481 - +chr11 76899428 76899748 - +chr11 77757353 77757498 - +chr11 78356355 78356507 - +chr11 78357656 78357976 - +chr11 78426798 78427118 - +chr11 78509650 78509970 - +chr11 78787618 78787938 - +chr11 78811174 78811494 - +chr11 79111781 79112101 - +chr11 79204722 79204840 - +chr11 82429276 82429596 - +chr11 82486807 82487127 - +chr11 82952419 82952739 - +chr11 83004005 83004325 - +chr11 83547840 83548160 - +chr11 84269155 84269475 - +chr11 85904601 85904921 - +chr11 85929649 85929748 - +chr11 86167143 86167463 - +chr11 86319422 86319742 - +chr11 87063901 87064035 - +chr11 87433662 87433982 - +chr11 88277661 88277787 - +chr11 92967914 92968005 - +chr11 93628563 93628883 - +chr11 93654884 93655204 - +chr11 93904888 93905208 - +chr11 93917090 93917174 - +chr11 94336281 94336601 - +chr11 94630481 94630801 - +chr11 94802864 94803184 - +chr11 95438835 95438947 - +chr11 95569191 95569511 - +chr11 96076356 96076676 - +chr11 98886617 
98886937 - +chr11 101731476 101731796 - +chr11 102217737 102218057 - +chr11 102261538 102261623 - +chr11 102537120 102537440 - +chr11 102604001 102604321 - +chr11 102785519 102785839 - +chr11 103472018 103472338 - +chr11 103480212 103480532 - +chr11 103553048 103553368 - +chr11 104479960 104480280 - +chr11 105958589 105958909 - +chr11 107419985 107420305 - +chr11 107670083 107670262 - +chr11 107879386 107879706 - +chr11 108093593 108093913 - +chr11 108519055 108519375 - +chr11 108568757 108568840 - +chr11 109567989 109568309 - +chr11 109723945 109724265 - +chr11 109816958 109817278 - +chr11 110044894 110045214 - +chr11 110939919 110940239 - +chr11 111101154 111101474 - +chr11 111293767 111294087 - +chr11 111299823 111300143 - +chr11 111317309 111317629 - +chr11 111472696 111473016 - +chr11 111749735 111749902 - +chr11 112045803 112046123 - +chr11 112151326 112151646 - +chr11 112426456 112426776 - +chr11 113160443 113160536 - +chr11 113176546 113176677 - +chr11 113403469 113403789 - +chr11 113644640 113644960 - +chr11 113767035 113767355 - +chr11 114131858 114131997 - +chr11 114210963 114211283 - +chr11 115040958 115041278 - +chr11 116562443 116562763 - +chr11 116603533 116603681 - +chr11 116613534 116613854 - +chr11 116661975 116662295 - +chr11 117068621 117068941 - +chr11 117297038 117297189 - +chr11 117492006 117492326 - +chr11 117678222 117678542 - +chr11 117688547 117688725 - +chr11 117689837 117690157 - +chr11 117714785 117714967 - +chr11 117817732 117817868 - +chr11 117924249 117924367 - +chr11 118016035 118016355 - +chr11 118042368 118042688 - +chr11 118069001 118069321 - +chr11 118359259 118359579 - +chr11 118436680 118437000 - +chr11 118481711 118482031 - +chr11 118530281 118530423 - +chr11 118560582 118560712 - +chr11 118560925 118561043 - +chr11 118777724 118778044 - +chr11 118796628 118796948 - +chr11 118798786 118799106 - +chr11 118800993 118801313 - +chr11 118827293 118827613 - +chr11 118901583 118901903 - +chr11 119015601 119015921 - +chr11 119039238 
119039558 - +chr11 119076495 119076815 - +chr11 119192342 119192662 - +chr11 119209052 119209372 - +chr11 119210618 119210938 - +chr11 119227148 119227268 - +chr11 119244975 119245098 - +chr11 119252209 119252529 - +chr11 119345548 119345675 - +chr11 119352060 119352380 - +chr11 119404655 119404767 - +chr11 119537214 119537338 - +chr11 119600057 119600377 - +chr11 119612018 119612338 - +chr11 119979005 119979203 - +chr11 120042947 120043267 - +chr11 120105755 120106075 - +chr11 120173890 120174036 - +chr11 120177359 120177679 - +chr11 120381785 120382105 - +chr11 120673121 120673215 - +chr11 120823537 120823857 - +chr11 121119919 121120239 - +chr11 121526189 121526509 - +chr11 121526713 121527033 - +chr11 122073772 122073924 - +chr11 122086130 122086450 - +chr11 122376497 122376817 - +chr11 122451206 122451526 - +chr11 122451794 122452114 - +chr11 122499803 122499886 - +chr11 122652257 122652352 - +chr11 122666866 122667186 - +chr11 122734453 122734773 - +chr11 122893859 122894179 - +chr11 123036707 123037027 - +chr11 123118037 123118156 - +chr11 123132241 123132561 - +chr11 123172295 123172615 - +chr11 123278109 123278429 - +chr11 123381967 123382287 - +chr11 123447815 123447936 - +chr11 123451389 123451517 - +chr11 123511678 123511998 - +chr11 123578354 123578674 - +chr11 123582124 123582237 - +chr11 123940227 123940309 - +chr11 124272239 124272559 - +chr11 124514282 124514602 - +chr11 124628262 124628582 - +chr11 124707393 124707713 - +chr11 124736208 124736528 - +chr11 125011748 125011895 - +chr11 125215671 125215991 - +chr11 125218779 125218889 - +chr11 125274881 125275034 - +chr11 125299035 125299355 - +chr11 125303393 125303713 - +chr11 125322576 125322896 - +chr11 125744460 125744780 - +chr11 125792949 125793269 - +chr11 126180402 126180722 - +chr11 126310834 126310960 - +chr11 126350664 126350984 - +chr11 126860393 126860513 - +chr11 127215756 127216076 - +chr11 128027816 128028136 - +chr11 128500290 128500610 - +chr11 128502080 128502400 - +chr11 
128701083 128701403 - +chr11 128718739 128719059 - +chr11 128729755 128730075 - +chr11 128987515 128987835 - +chr11 129201726 129202046 - +chr11 129234101 129234306 - +chr11 129751508 129751828 - +chr11 130002168 130002488 - +chr11 130082972 130083292 - +chr11 130301179 130301499 - +chr11 130304381 130304701 - +chr11 130731965 130732105 - +chr11 130764348 130764668 - +chr11 130786312 130786632 - +chr11 130793214 130793534 - +chr11 130894961 130895281 - +chr11 130934310 130934630 - +chr11 131538705 131539025 - +chr11 131544021 131544101 - +chr11 131557897 131558217 - +chr11 131564730 131565050 - +chr11 131618065 131618385 - +chr11 131737592 131737912 - +chr11 132182529 132182849 - +chr11 132810974 132811294 - +chr11 132948489 132948809 - +chr11 133797515 133797835 - +chr11 133920360 133920680 - +chr11 134009718 134010038 - +chr12 262381 262701 - +chr12 329079 329399 - +chr12 551459 551779 - +chr12 656129 656449 - +chr12 678932 679252 - +chr12 685424 685504 - +chr12 860439 860759 - +chr12 1058992 1059312 - +chr12 1202372 1202692 - +chr12 1227170 1227490 - +chr12 1692665 1692985 - +chr12 1743787 1744107 - +chr12 1762105 1762221 - +chr12 1905199 1905519 - +chr12 2049196 2049516 - +chr12 2166425 2166745 - +chr12 2394012 2394332 - +chr12 2504571 2504891 - +chr12 2692242 2692562 - +chr12 2893196 2893516 - +chr12 2962789 2963109 - +chr12 2994433 2994753 - +chr12 3143388 3143708 - +chr12 3182685 3182796 - +chr12 3244647 3244967 - +chr12 3312305 3312387 - +chr12 3384723 3384853 - +chr12 3409076 3409396 - +chr12 3409669 3409989 - +chr12 3913076 3913396 - +chr12 4386124 4386444 - +chr12 4416993 4417313 - +chr12 4713885 4714081 - +chr12 4958213 4958533 - +chr12 4964625 4964945 - +chr12 4966974 4967112 - +chr12 5112290 5112476 - +chr12 5131642 5131962 - +chr12 5854464 5854784 - +chr12 5905692 5906012 - +chr12 6075159 6075239 - +chr12 6097865 6097984 - +chr12 6147295 6147615 - +chr12 6149324 6149433 - +chr12 6172435 6172564 - +chr12 6304859 6305179 - +chr12 6394206 6394526 - 
+chr12 6574760 6574897 - +chr12 6647054 6647374 - +chr12 6672640 6672960 - +chr12 6726882 6727202 - +chr12 6862749 6863069 - +chr12 6888116 6888436 - +chr12 6934873 6934985 - +chr12 6944968 6945068 - +chr12 6978553 6978873 - +chr12 6981647 6981783 - +chr12 7033436 7033756 - +chr12 7047195 7047515 - +chr12 7053754 7054074 - +chr12 7074261 7074581 - +chr12 7167779 7168099 - +chr12 7245063 7245383 - +chr12 7261874 7261995 - +chr12 7265932 7266252 - +chr12 7294683 7295003 - +chr12 7372265 7372585 - +chr12 7527145 7527465 - +chr12 7613342 7613473 - +chr12 7798025 7798345 - +chr12 7959574 7959693 - +chr12 8198624 8198944 - +chr12 8261428 8261748 - +chr12 8696884 8697204 - +chr12 9013100 9013420 - +chr12 9043175 9043495 - +chr12 9483825 9484008 - +chr12 9860804 9861124 - +chr12 9966813 9967133 - +chr12 10305849 10306169 - +chr12 10343520 10343615 - +chr12 10826773 10827093 - +chr12 10902761 10903081 - +chr12 11381184 11381504 - +chr12 11639550 11639870 - +chr12 12186293 12186613 - +chr12 12717068 12717228 - +chr12 12857690 12857815 - +chr12 12966033 12966353 - +chr12 13069272 13069401 - +chr12 13387126 13387217 - +chr12 13408808 13408897 - +chr12 13431185 13431365 - +chr12 13515203 13515523 - +chr12 13687739 13688059 - +chr12 14320003 14320323 - +chr12 14409483 14409803 - +chr12 14783125 14783315 - +chr12 14876634 14876954 - +chr12 14922817 14923137 - +chr12 15374381 15374503 - +chr12 15421908 15422228 - +chr12 15743305 15743625 - +chr12 16132079 16132399 - +chr12 16512856 16513176 - +chr12 17794990 17795310 - +chr12 21449737 21450057 - +chr12 22488512 22488832 - +chr12 23358153 23358473 - +chr12 25113205 25113525 - +chr12 26250003 26250323 - +chr12 26587284 26587604 - +chr12 27006704 27007024 - +chr12 27037909 27038229 - +chr12 27167356 27167468 - +chr12 27295074 27295394 - +chr12 27536963 27537283 - +chr12 27856646 27856966 - +chr12 27886031 27886351 - +chr12 28093758 28094078 - +chr12 29025642 29025962 - +chr12 29705298 29705618 - +chr12 31004045 31004365 - +chr12 
31138805 31139125 - +chr12 31391743 31392063 - +chr12 31424878 31425198 - +chr12 31556078 31556398 - +chr12 31789605 31789925 - +chr12 32518426 32518746 - +chr12 32908099 32908419 - +chr12 34276143 34276290 - +chr12 38532537 38532857 - +chr12 40384327 40384647 - +chr12 40501359 40501679 - +chr12 40546584 40546904 - +chr12 42418047 42418367 - +chr12 42876595 42876915 - +chr12 42982739 42983059 - +chr12 43732755 43733075 - +chr12 43838786 43839106 - +chr12 44394193 44394513 - +chr12 45342546 45342866 - +chr12 45564263 45564583 - +chr12 45937647 45937967 - +chr12 46122728 46122826 - +chr12 46384373 46384693 - +chr12 46465981 46466196 - +chr12 47055189 47055509 - +chr12 47377796 47378116 - +chr12 48111302 48111622 - +chr12 48136064 48136258 - +chr12 48173119 48173439 - +chr12 48231415 48231735 - +chr12 48343387 48343467 - +chr12 48357101 48357421 - +chr12 48558925 48559245 - +chr12 48583332 48583652 - +chr12 48936109 48936429 - +chr12 49012357 49012522 - +chr12 49189738 49189829 - +chr12 49208454 49208658 - +chr12 49247024 49247344 - +chr12 49284488 49284808 - +chr12 49330201 49330521 - +chr12 49381873 49382193 - +chr12 49389221 49389541 - +chr12 49390656 49390749 - +chr12 49457713 49458033 - +chr12 49491804 49492124 - +chr12 49514910 49515230 - +chr12 49524053 49524373 - +chr12 49628958 49629278 - +chr12 49725414 49725734 - +chr12 49730870 49731190 - +chr12 49736337 49736657 - +chr12 49741396 49741716 - +chr12 49752448 49752768 - +chr12 49974421 49974543 - +chr12 50030701 50031021 - +chr12 50038425 50038556 - +chr12 50101343 50101663 - +chr12 50134901 50135221 - +chr12 50156810 50157130 - +chr12 50170146 50170466 - +chr12 50171947 50172267 - +chr12 50232830 50233150 - +chr12 50260991 50261159 - +chr12 50339354 50339674 - +chr12 50360898 50360982 - +chr12 50361214 50361534 - +chr12 50367969 50368289 - +chr12 50433020 50433340 - +chr12 50449848 50450014 - +chr12 50464389 50464529 - +chr12 50468981 50469301 - +chr12 50482388 50482708 - +chr12 50505354 50505674 - +chr12 
51441527 51441847 - +chr12 51735294 51735468 - +chr12 51745730 51746050 - +chr12 51818792 51819112 - +chr12 51925720 51926040 - +chr12 52071917 52072237 - +chr12 52269186 52269270 - +chr12 52317206 52317526 - +chr12 52364623 52364943 - +chr12 52374135 52374455 - +chr12 52386914 52387234 - +chr12 52397876 52398196 - +chr12 52414650 52414760 - +chr12 52417510 52417830 - +chr12 52419457 52419777 - +chr12 52477294 52477454 - +chr12 52545849 52546169 - +chr12 52550670 52550789 - +chr12 52557710 52558030 - +chr12 52607378 52607572 - +chr12 52672858 52673023 - +chr12 52721305 52721625 - +chr12 52976795 52977115 - +chr12 53259216 53259369 - +chr12 53273270 53273590 - +chr12 53273868 53273959 - +chr12 53278606 53278926 - +chr12 53374743 53374873 - +chr12 53447429 53447749 - +chr12 53459708 53460028 - +chr12 53493262 53493582 - +chr12 53553239 53553431 - +chr12 53583932 53584252 - +chr12 53607093 53607413 - +chr12 53715398 53715544 - +chr12 53739014 53739334 - +chr12 53845479 53845799 - +chr12 53902364 53902684 - +chr12 53937242 53937562 - +chr12 54069859 54070179 - +chr12 54095573 54095893 - +chr12 54137544 54137632 - +chr12 54140483 54140670 - +chr12 54151050 54151192 - +chr12 54426369 54426689 - +chr12 54585381 54585701 - +chr12 54595200 54595361 - +chr12 54601677 54601834 - +chr12 54608618 54608805 - +chr12 54610834 54611154 - +chr12 54688014 54688334 - +chr12 54762481 54762801 - +chr12 54764040 54764360 - +chr12 54773504 54773736 - +chr12 54793170 54793335 - +chr12 54798318 54798433 - +chr12 54973589 54973775 - +chr12 54990251 54990571 - +chr12 54997020 54997340 - +chr12 55247765 55248085 - +chr12 55462237 55462557 - +chr12 55466861 55467181 - +chr12 56131011 56131331 - +chr12 56320497 56320817 - +chr12 56334054 56334374 - +chr12 56423522 56423842 - +chr12 56440921 56441241 - +chr12 56472817 56472900 - +chr12 56519960 56520280 - +chr12 56660455 56660775 - +chr12 56727959 56728279 - +chr12 56858110 56858430 - +chr12 56988006 56988117 - +chr12 57156886 57157206 - +chr12 
57381339 57381659 - +chr12 57395234 57395554 - +chr12 57401917 57402237 - +chr12 57463237 57463557 - +chr12 57506113 57506433 - +chr12 57518175 57518293 - +chr12 57566821 57567141 - +chr12 57569660 57569980 - +chr12 57576775 57577095 - +chr12 57607163 57607302 - +chr12 57608086 57608406 - +chr12 57632931 57633148 - +chr12 57637470 57637790 - +chr12 57846818 57846952 - +chr12 57856469 57856789 - +chr12 57977105 57977425 - +chr12 58015542 58015862 - +chr12 58026928 58027079 - +chr12 58087419 58087739 - +chr12 58119850 58120170 - +chr12 58176304 58176624 - +chr12 58197104 58197424 - +chr12 58299186 58299434 - +chr12 58934295 58934615 - +chr12 59507639 59507790 - +chr12 59797089 59797409 - +chr12 60202135 60202455 - +chr12 62517498 62517818 - +chr12 62604238 62604342 - +chr12 63187792 63188112 - +chr12 63193044 63193162 - +chr12 63207078 63207398 - +chr12 64237636 64237956 - +chr12 65091854 65092174 - +chr12 65196020 65196340 - +chr12 66672949 66673269 - +chr12 67254959 67255279 - +chr12 67835597 67835917 - +chr12 68434263 68434583 - +chr12 68616928 68617248 - +chr12 69201787 69202107 - +chr12 69224027 69224347 - +chr12 69464139 69464459 - +chr12 69548414 69548734 - +chr12 69550933 69551099 - +chr12 69684872 69685047 - +chr12 69751279 69751599 - +chr12 69809524 69809844 - +chr12 70033228 70033548 - +chr12 70273175 70273495 - +chr12 71113619 71113939 - +chr12 71786349 71786669 - +chr12 72332557 72332877 - +chr12 76552541 76552861 - +chr12 77015290 77015610 - +chr12 77404846 77404992 - +chr12 78319402 78319584 - +chr12 79767730 79768050 - +chr12 81231165 81231485 - +chr12 81522753 81523073 - +chr12 82752377 82752697 - +chr12 88389409 88389729 - +chr12 89413351 89413671 - +chr12 89444857 89445177 - +chr12 89918491 89918811 - +chr12 89936722 89937042 - +chr12 91302887 91303207 - +chr12 91749206 91749526 - +chr12 91775817 91776137 - +chr12 92205486 92205806 - +chr12 92537824 92538144 - +chr12 93221623 93221943 - +chr12 94469101 94469421 - +chr12 94495188 94495508 - +chr12 
94496064 94496384 - +chr12 94939966 94940286 - +chr12 95467346 95467666 - +chr12 95730378 95730512 - +chr12 96017367 96017687 - +chr12 96312297 96312617 - +chr12 96608997 96609317 - +chr12 96883124 96883444 - +chr12 98791335 98791655 - +chr12 98884776 98885096 - +chr12 99437131 99437451 - +chr12 100548659 100548792 - +chr12 100948391 100948711 - +chr12 101062714 101063034 - +chr12 101065844 101066164 - +chr12 101221729 101222049 - +chr12 102153259 102153579 - +chr12 102167386 102167706 - +chr12 102233605 102233925 - +chr12 102333588 102333908 - +chr12 102878300 102878620 - +chr12 103163383 103163703 - +chr12 103342651 103342971 - +chr12 103942457 103942777 - +chr12 103969062 103969382 - +chr12 104026483 104026615 - +chr12 104165237 104165557 - +chr12 104451485 104451805 - +chr12 104502989 104503309 - +chr12 104594967 104595287 - +chr12 104609349 104609669 - +chr12 104752205 104752525 - +chr12 104775736 104775843 - +chr12 105184329 105184649 - +chr12 105446854 105447174 - +chr12 105814460 105814780 - +chr12 106099494 106099814 - +chr12 106617927 106618247 - +chr12 106751270 106751590 - +chr12 107796253 107796573 - +chr12 108001427 108001515 - +chr12 108004088 108004408 - +chr12 108064857 108065177 - +chr12 108278889 108279209 - +chr12 108905766 108906086 - +chr12 108957352 108957672 - +chr12 109011901 109012221 - +chr12 109058612 109058932 - +chr12 109125349 109125669 - +chr12 109197318 109197638 - +chr12 109344936 109345256 - +chr12 109531230 109531550 - +chr12 109549716 109550036 - +chr12 109831669 109831803 - +chr12 109865958 109866050 - +chr12 109870680 109870783 - +chr12 109956570 109956890 - +chr12 110035520 110035840 - +chr12 110213192 110213306 - +chr12 110243175 110243331 - +chr12 110283122 110283442 - +chr12 110365385 110365705 - +chr12 110538004 110538324 - +chr12 110869168 110869488 - +chr12 110888071 110888391 - +chr12 110906915 110907235 - +chr12 111325896 111326107 - +chr12 111332265 111332448 - +chr12 111374896 111375216 - +chr12 111736670 111736990 
- +chr12 111742383 111742703 - +chr12 111751600 111751920 - +chr12 111834823 111834973 - +chr12 112191001 112191321 - +chr12 112250976 112251296 - +chr12 112608197 112608517 - +chr12 112825285 112825389 - +chr12 113430613 113430933 - +chr12 113486505 113486825 - +chr12 113536169 113536489 - +chr12 113612308 113612466 - +chr12 113634774 113635094 - +chr12 113750317 113750637 - +chr12 113863271 113863391 - +chr12 113877362 113877682 - +chr12 113900184 113900504 - +chr12 114350639 114350794 - +chr12 114392961 114393281 - +chr12 114404146 114404466 - +chr12 114418432 114418587 - +chr12 115063042 115063362 - +chr12 115100421 115100741 - +chr12 116355040 116355360 - +chr12 117141044 117141364 - +chr12 117301038 117301358 - +chr12 117463374 117463694 - +chr12 117592986 117593306 - +chr12 117650507 117650628 - +chr12 117674301 117674621 - +chr12 117804844 117805164 - +chr12 118275156 118275298 - +chr12 118287823 118288143 - +chr12 118295524 118295844 - +chr12 118312668 118312988 - +chr12 118620095 118620415 - +chr12 118814050 118814144 - +chr12 118815721 118816041 - +chr12 119553128 119553448 - +chr12 119982866 119983186 - +chr12 120097130 120097450 - +chr12 120105360 120105680 - +chr12 120218664 120218984 - +chr12 120339877 120340197 - +chr12 120375895 120376215 - +chr12 120524856 120524948 - +chr12 120638988 120639308 - +chr12 120666157 120666477 - +chr12 120739848 120740168 - +chr12 120763281 120763601 - +chr12 120794353 120794673 - +chr12 120868337 120868524 - +chr12 120891758 120892078 - +chr12 120907565 120907661 - +chr12 121021211 121021531 - +chr12 121084737 121085057 - +chr12 121087015 121087335 - +chr12 121443147 121443467 - +chr12 121533787 121534107 - +chr12 121548150 121548470 - +chr12 121668104 121668424 - +chr12 121881063 121881383 - +chr12 121932112 121932228 - +chr12 121937794 121937930 - +chr12 121990543 121990863 - +chr12 122211153 122211473 - +chr12 122214067 122214387 - +chr12 122232168 122232369 - +chr12 122232854 122233174 - +chr12 122249910 
122250230 - +chr12 122326901 122327221 - +chr12 122469521 122469841 - +chr12 122492340 122492448 - +chr12 122615568 122615888 - +chr12 122675556 122675675 - +chr12 123237074 123237394 - +chr12 123344798 123345118 - +chr12 123436887 123437207 - +chr12 123528011 123528331 - +chr12 123850254 123850574 - +chr12 124516865 124517185 - +chr12 124521159 124521479 - +chr12 124805560 124805880 - +chr12 124884220 124884540 - +chr12 125248961 125249281 - +chr12 125251564 125251884 - +chr12 125256492 125256641 - +chr12 125391419 125391552 - +chr12 125425199 125425519 - +chr12 125667780 125667968 - +chr12 127778087 127778407 - +chr12 127865647 127865967 - +chr12 128044356 128044676 - +chr12 129266853 129267173 - +chr12 131232816 131232922 - +chr12 131400751 131401071 - +chr12 132093592 132093912 - +chr12 132486140 132486460 - +chr12 132638555 132638639 - +chr12 132991516 132991836 - +chr12 133100456 133100776 - +chr12 133177035 133177355 - +chr12 133191475 133191795 - +chr12 133215354 133215674 - +chr12 133481376 133481696 - +chr12 133757806 133758126 - +chr13 20193081 20193401 - +chr13 20534268 20534588 - +chr13 20737652 20737792 - +chr13 20756430 20756750 - +chr13 20771516 20771836 - +chr13 21069254 21069574 - +chr13 21286753 21287073 - +chr13 21402695 21403015 - +chr13 21517708 21518028 - +chr13 21684944 21685264 - +chr13 21872008 21872328 - +chr13 22051258 22051418 - +chr13 22423515 22423835 - +chr13 23594426 23594746 - +chr13 24534720 24535040 - +chr13 24594481 24594801 - +chr13 24627597 24627917 - +chr13 24803013 24803333 - +chr13 25257986 25258306 - +chr13 25302166 25302486 - +chr13 25569381 25569701 - +chr13 25611417 25611737 - +chr13 25688345 25688665 - +chr13 25861539 25861859 - +chr13 26758871 26759191 - +chr13 27056747 27057067 - +chr13 27070755 27071075 - +chr13 27272856 27273176 - +chr13 27447070 27447390 - +chr13 27504510 27504647 - +chr13 27521099 27521419 - +chr13 27845070 27845390 - +chr13 27933510 27933591 - +chr13 27936236 27936360 - +chr13 27950002 27950083 
- +chr13 28055910 28056230 - +chr13 28107235 28107555 - +chr13 28400846 28400953 - +chr13 28404094 28404414 - +chr13 28528017 28528337 - +chr13 28898368 28898688 - +chr13 29210967 29211121 - +chr13 29370826 29371146 - +chr13 29937246 29937416 - +chr13 30046597 30046769 - +chr13 30082417 30082507 - +chr13 30498342 30498662 - +chr13 30687177 30687378 - +chr13 30689160 30689480 - +chr13 31253930 31254250 - +chr13 31377046 31377366 - +chr13 31439510 31439830 - +chr13 31618554 31618874 - +chr13 31998405 31998488 - +chr13 32316280 32316600 - +chr13 32593759 32594079 - +chr13 32820554 32820874 - +chr13 33802746 33803066 - +chr13 34116808 34117128 - +chr13 36295101 36295421 - +chr13 36920898 36921028 - +chr13 36994531 36994851 - +chr13 39719107 39719427 - +chr13 40190366 40190453 - +chr13 40516936 40517256 - +chr13 42033039 42033359 - +chr13 45492356 45492676 - +chr13 45621620 45621702 - +chr13 45855221 45855541 - +chr13 45883588 45883908 - +chr13 45885218 45885538 - +chr13 45945105 45945425 - +chr13 45968136 45968456 - +chr13 46700730 46701050 - +chr13 47125627 47125947 - +chr13 47468221 47468541 - +chr13 47548067 47548387 - +chr13 48612153 48612291 - +chr13 48669281 48669439 - +chr13 49343378 49343519 - +chr13 50181216 50181536 - +chr13 50243877 50244197 - +chr13 50422148 50422468 - +chr13 50510328 50510648 - +chr13 50550819 50551139 - +chr13 50571474 50571794 - +chr13 51486202 51486522 - +chr13 51644237 51644557 - +chr13 51699894 51700214 - +chr13 51860567 51860887 - +chr13 52389272 52389592 - +chr13 52419245 52419565 - +chr13 52701918 52702238 - +chr13 53024657 53024977 - +chr13 53542320 53542506 - +chr13 53726073 53726154 - +chr13 59965793 59966113 - +chr13 60970947 60971267 - +chr13 61023064 61023384 - +chr13 61047834 61047986 - +chr13 67568377 67568697 - +chr13 67990559 67990679 - +chr13 75793888 75794208 - +chr13 76056657 76056977 - +chr13 76259505 76259825 - +chr13 76269974 76270294 - +chr13 76450549 76450869 - +chr13 77477795 77477958 - +chr13 77554376 77554456 - 
+chr13 79965451 79965771 - +chr13 80253415 80253735 - +chr13 80917003 80917323 - +chr13 88323590 88323679 - +chr13 92001184 92001504 - +chr13 95272583 95272903 - +chr13 95844022 95844342 - +chr13 96032975 96033295 - +chr13 96056562 96056882 - +chr13 96130884 96131040 - +chr13 96486463 96486783 - +chr13 96632786 96633106 - +chr13 97670310 97670630 - +chr13 98011770 98012090 - +chr13 98227212 98227532 - +chr13 98507783 98508103 - +chr13 99097720 99097849 - +chr13 99305785 99305885 - +chr13 99325307 99325627 - +chr13 99492397 99492717 - +chr13 99495163 99495483 - +chr13 99828962 99829045 - +chr13 100089228 100089411 - +chr13 100094562 100094882 - +chr13 100308388 100308708 - +chr13 100568771 100569091 - +chr13 101167229 101167313 - +chr13 101225318 101225638 - +chr13 101347316 101347636 - +chr13 101374182 101374502 - +chr13 103452526 103452846 - +chr13 103522935 103523255 - +chr13 103568373 103568693 - +chr13 105224006 105224326 - +chr13 105792388 105792708 - +chr13 107188813 107189133 - +chr13 107272312 107272632 - +chr13 107570402 107570722 - +chr13 107857790 107857870 - +chr13 107863456 107863575 - +chr13 108302484 108302804 - +chr13 108330475 108330795 - +chr13 108807897 108808217 - +chr13 109797920 109798240 - +chr13 110761279 110761599 - +chr13 110768367 110768687 - +chr13 110874586 110874906 - +chr13 111064623 111064943 - +chr13 111471813 111472133 - +chr13 111567468 111567788 - +chr13 111720122 111720442 - +chr13 111875781 111876101 - +chr13 111997137 111997457 - +chr13 113098399 113098542 - +chr13 113260728 113261048 - +chr13 113528939 113529075 - +chr13 113585793 113586113 - +chr13 113597050 113597370 - +chr13 113683821 113684141 - +chr13 113689287 113689607 - +chr13 114216774 114217094 - +chr13 114427796 114428116 - +chr13 114519921 114520241 - +chr13 114992299 114992453 - +chr13 115105314 115105634 - +chr14 20774011 20774331 - +chr14 21077634 21077741 - +chr14 21165914 21166234 - +chr14 21325811 21326131 - +chr14 21341806 21342126 - +chr14 21439148 
21439344 - +chr14 21439577 21439897 - +chr14 21482864 21483184 - +chr14 21560709 21560965 - +chr14 21572730 21572913 - +chr14 21575119 21575260 - +chr14 21945198 21945342 - +chr14 22668934 22669254 - +chr14 23025974 23026294 - +chr14 23122266 23122586 - +chr14 23291998 23292318 - +chr14 23322264 23322584 - +chr14 23341903 23341995 - +chr14 23399597 23399917 - +chr14 23455783 23456103 - +chr14 23525388 23525708 - +chr14 23527292 23527612 - +chr14 23538168 23538350 - +chr14 23578472 23578792 - +chr14 23590073 23590285 - +chr14 23656315 23656635 - +chr14 23764234 23764554 - +chr14 23849275 23849375 - +chr14 23938293 23938613 - +chr14 24028823 24028939 - +chr14 24031988 24032308 - +chr14 24058196 24058516 - +chr14 24101268 24101386 - +chr14 24379836 24380156 - +chr14 24399136 24399234 - +chr14 24423349 24423458 - +chr14 24483181 24483501 - +chr14 24505963 24506283 - +chr14 24527489 24527809 - +chr14 24559845 24559980 - +chr14 24578051 24578371 - +chr14 24630201 24630521 - +chr14 24777418 24777550 - +chr14 24801402 24801722 - +chr14 24802842 24803162 - +chr14 24834824 24835144 - +chr14 24881342 24881662 - +chr14 24901076 24901227 - +chr14 24905957 24906277 - +chr14 25148241 25148410 - +chr14 25179624 25179944 - +chr14 25275449 25275769 - +chr14 25602036 25602356 - +chr14 29220681 29221001 - +chr14 29485519 29485839 - +chr14 29691927 29692247 - +chr14 30739999 30740110 - +chr14 30753060 30753380 - +chr14 30844630 30844720 - +chr14 30991147 30991467 - +chr14 32721318 32721638 - +chr14 33029512 33029832 - +chr14 33045428 33045529 - +chr14 34261943 34262263 - +chr14 34792157 34792477 - +chr14 35876164 35876484 - +chr14 36283644 36283964 - +chr14 36291079 36291399 - +chr14 36539673 36539993 - +chr14 36843269 36843589 - +chr14 37026047 37026367 - +chr14 37074162 37074482 - +chr14 37411542 37411686 - +chr14 38063399 38063719 - +chr14 38343912 38344232 - +chr14 39336802 39337122 - +chr14 39484606 39484926 - +chr14 42693522 42693842 - +chr14 45315491 45315811 - +chr14 45737776 
45738096 - +chr14 50100920 50101240 - +chr14 50319643 50319963 - +chr14 50328828 50329070 - +chr14 50332103 50332423 - +chr14 50787757 50788077 - +chr14 50999264 50999584 - +chr14 51240410 51240730 - +chr14 52118666 52118986 - +chr14 52243045 52243365 - +chr14 52246859 52247179 - +chr14 52445860 52446180 - +chr14 54006224 54006544 - +chr14 54795876 54796196 - +chr14 54908055 54908375 - +chr14 55158386 55158522 - +chr14 55240750 55241070 - +chr14 55272643 55272765 - +chr14 55518195 55518357 - +chr14 55556306 55556626 - +chr14 55737822 55738142 - +chr14 56806850 56807170 - +chr14 56958064 56958384 - +chr14 57046055 57046375 - +chr14 57783985 57784305 - +chr14 58633144 58633464 - +chr14 59495293 59495613 - +chr14 59974154 59974331 - +chr14 60043434 60043754 - +chr14 60155193 60155513 - +chr14 60431751 60432071 - +chr14 61028880 61029200 - +chr14 61719539 61719859 - +chr14 62304346 62304666 - +chr14 62327058 62327378 - +chr14 62390103 62390423 - +chr14 63855576 63855896 - +chr14 64108113 64108433 - +chr14 64761589 64761909 - +chr14 64896741 64897061 - +chr14 65007017 65007337 - +chr14 65103441 65103761 - +chr14 65226906 65227226 - +chr14 65231582 65231735 - +chr14 65289932 65290252 - +chr14 65453318 65453638 - +chr14 65689088 65689408 - +chr14 66291453 66291773 - +chr14 67826804 67826911 - +chr14 67893859 67894179 - +chr14 68038040 68038360 - +chr14 68050782 68051102 - +chr14 69283423 69283743 - +chr14 69291589 69291909 - +chr14 69310706 69311026 - +chr14 69378413 69378584 - +chr14 69596011 69596331 - +chr14 69813490 69813810 - +chr14 70041600 70041706 - +chr14 70046889 70047209 - +chr14 70051727 70052047 - +chr14 70070407 70070490 - +chr14 70193692 70194012 - +chr14 70480121 70480261 - +chr14 70494665 70494782 - +chr14 70540712 70541032 - +chr14 71034617 71034937 - +chr14 71284480 71284631 - +chr14 71373881 71374201 - +chr14 71605710 71606030 - +chr14 71633201 71633521 - +chr14 72187679 72187999 - +chr14 72219805 72220125 - +chr14 72322168 72322488 - +chr14 72759150 
72759470 - +chr14 72887566 72887697 - +chr14 72980757 72981077 - +chr14 72983606 72983926 - +chr14 73371673 73371993 - +chr14 73415244 73415564 - +chr14 74100263 74100343 - +chr14 74195446 74195766 - +chr14 74225012 74225332 - +chr14 74292327 74292647 - +chr14 74684483 74684563 - +chr14 74718355 74718440 - +chr14 74724255 74724367 - +chr14 74760976 74761296 - +chr14 74829040 74829360 - +chr14 74926298 74926618 - +chr14 74945768 74946088 - +chr14 74967892 74968212 - +chr14 75075703 75075844 - +chr14 75327629 75327949 - +chr14 75372500 75372626 - +chr14 75380707 75381027 - +chr14 75413072 75413392 - +chr14 75555287 75555607 - +chr14 75762926 75763060 - +chr14 75774507 75774827 - +chr14 75955160 75955480 - +chr14 76027325 76027645 - +chr14 76044042 76044362 - +chr14 76120509 76120829 - +chr14 76175559 76175879 - +chr14 76177583 76177680 - +chr14 76445619 76445939 - +chr14 76853608 76853803 - +chr14 77114106 77114426 - +chr14 77251159 77251479 - +chr14 77339164 77339484 - +chr14 77342471 77342599 - +chr14 77379935 77380255 - +chr14 77383515 77383835 - +chr14 77519624 77519944 - +chr14 77534613 77534933 - +chr14 77537030 77537350 - +chr14 77561050 77561370 - +chr14 77767413 77767733 - +chr14 78121345 78121665 - +chr14 78328628 78328826 - +chr14 79064413 79064733 - +chr14 79426064 79426384 - +chr14 81421082 81421402 - +chr14 81453807 81454127 - +chr14 81636935 81637255 - +chr14 81919589 81919909 - +chr14 81930896 81931216 - +chr14 86523422 86523742 - +chr14 88082425 88082745 - +chr14 88237516 88237836 - +chr14 88480213 88480318 - +chr14 88634937 88635257 - +chr14 88958658 88958758 - +chr14 89133812 89134132 - +chr14 89506811 89506914 - +chr14 90381639 90381959 - +chr14 90405816 90406136 - +chr14 90439626 90439946 - +chr14 90701714 90702034 - +chr14 90865998 90866132 - +chr14 91164046 91164184 - +chr14 91224773 91225093 - +chr14 91639513 91639833 - +chr14 91643718 91644038 - +chr14 91752820 91753140 - +chr14 91789947 91790267 - +chr14 91885247 91885428 - +chr14 92573001 
92573321 - +chr14 92923066 92923386 - +chr14 93079438 93079758 - +chr14 93147518 93147628 - +chr14 93153356 93153462 - +chr14 93181090 93181410 - +chr14 93418388 93418483 - +chr14 93605524 93605844 - +chr14 93644422 93644742 - +chr14 94359384 94359583 - +chr14 94468151 94468471 - +chr14 94502924 94503244 - +chr14 94603160 94603318 - +chr14 94889829 94890149 - +chr14 94900259 94900579 - +chr14 94952671 94952871 - +chr14 95015775 95016095 - +chr14 95155933 95156081 - +chr14 95908849 95909169 - +chr14 96027254 96027574 - +chr14 96240712 96241032 - +chr14 96460705 96461025 - +chr14 96565213 96565326 - +chr14 96566908 96567228 - +chr14 96741877 96742197 - +chr14 97207760 97208080 - +chr14 99732998 99733161 - +chr14 99786418 99786738 - +chr14 99855578 99855898 - +chr14 99859520 99859600 - +chr14 99984883 99984986 - +chr14 100095281 100095443 - +chr14 100149077 100149397 - +chr14 100154128 100154448 - +chr14 100413364 100413466 - +chr14 100448831 100449151 - +chr14 100462204 100462335 - +chr14 100574680 100575000 - +chr14 100751978 100752298 - +chr14 100769494 100769814 - +chr14 100844203 100844523 - +chr14 100850035 100850355 - +chr14 100883792 100884112 - +chr14 101317499 101317819 - +chr14 101701609 101701929 - +chr14 102094819 102095139 - +chr14 102237298 102237618 - +chr14 102427534 102427651 - +chr14 102562940 102563260 - +chr14 102701654 102701974 - +chr14 102780590 102780910 - +chr14 102783190 102783510 - +chr14 102990833 102991153 - +chr14 103060926 103061246 - +chr14 103241239 103241559 - +chr14 103544744 103544836 - +chr14 103571392 103571526 - +chr14 103607553 103607873 - +chr14 103759259 103759371 - +chr14 103857388 103857708 - +chr14 103860364 103860684 - +chr14 103966507 103966827 - +chr14 103985015 103985123 - +chr14 104013171 104013491 - +chr14 104274776 104275096 - +chr14 104367411 104367731 - +chr14 104387820 104388140 - +chr14 104544390 104544564 - +chr14 105106440 105106526 - +chr14 105116757 105116922 - +chr14 105175074 105175216 - +chr14 105189898 
105190218 - +chr14 105234835 105235155 - +chr14 105237277 105237597 - +chr14 105363044 105363364 - +chr14 105442451 105442771 - +chr14 105512167 105512342 - +chr14 105553118 105553438 - +chr14 105556841 105557161 - +chr14 105669321 105669641 - +chr14 105748206 105748526 - +chr14 105818604 105818924 - +chr14 105837165 105837485 - +chr14 105914974 105915294 - +chr14 105947315 105947412 - +chr14 106004308 106004450 - +chr15 20561652 20561972 - +chr15 22461113 22461433 - +chr15 22836769 22837089 - +chr15 25939065 25939385 - +chr15 25955275 25955595 - +chr15 26179167 26179301 - +chr15 26505711 26506031 - +chr15 28331269 28331589 - +chr15 28368642 28368962 - +chr15 29267908 29268228 - +chr15 29382902 29383016 - +chr15 29394551 29394675 - +chr15 29396103 29396194 - +chr15 29432029 29432349 - +chr15 29960256 29960576 - +chr15 30218055 30218375 - +chr15 31421610 31421718 - +chr15 31489686 31490006 - +chr15 31507629 31507949 - +chr15 31727502 31727822 - +chr15 31780864 31781184 - +chr15 31782361 31782481 - +chr15 32943548 32943707 - +chr15 33993010 33993151 - +chr15 34029939 34030259 - +chr15 34354148 34354468 - +chr15 34502247 34502567 - +chr15 34533559 34533879 - +chr15 34635303 34635623 - +chr15 34659769 34660089 - +chr15 35000672 35000764 - +chr15 35261914 35262234 - +chr15 37175870 37176190 - +chr15 38184016 38184336 - +chr15 38295110 38295430 - +chr15 38682172 38682492 - +chr15 38706595 38706915 - +chr15 38732482 38732802 - +chr15 38856230 38856550 - +chr15 38975852 38976172 - +chr15 38991882 38992202 - +chr15 39000039 39000359 - +chr15 39110041 39110140 - +chr15 39872487 39872595 - +chr15 40322490 40322810 - +chr15 40337302 40337622 - +chr15 40397010 40397330 - +chr15 40566969 40567171 - +chr15 40571698 40571828 - +chr15 40602252 40602572 - +chr15 40613111 40613431 - +chr15 40630302 40630622 - +chr15 40636755 40637075 - +chr15 40660282 40660602 - +chr15 40714144 40714464 - +chr15 40799266 40799586 - +chr15 40811944 40812264 - +chr15 40882094 40882207 - +chr15 40971283 
40971603 - +chr15 41149937 41150113 - +chr15 41151354 41151674 - +chr15 41184061 41184381 - +chr15 41186303 41186623 - +chr15 41196276 41196465 - +chr15 41314736 41315056 - +chr15 41522868 41523188 - +chr15 41758107 41758237 - +chr15 41849780 41850100 - +chr15 41896078 41896398 - +chr15 42066650 42066970 - +chr15 42103664 42103984 - +chr15 42196752 42197072 - +chr15 42281826 42281961 - +chr15 42352045 42352125 - +chr15 42376599 42376919 - +chr15 42677731 42678051 - +chr15 43218921 43219241 - +chr15 43517687 43518007 - +chr15 43668313 43668633 - +chr15 44061845 44062165 - +chr15 44196005 44196325 - +chr15 44511903 44512223 - +chr15 44870656 44870976 - +chr15 44955835 44956155 - +chr15 45077001 45077130 - +chr15 45145686 45145785 - +chr15 45329377 45329530 - +chr15 45410376 45410584 - +chr15 45455352 45455479 - +chr15 45459596 45459916 - +chr15 45491176 45491264 - +chr15 45497356 45497676 - +chr15 45722524 45722699 - +chr15 45751014 45751334 - +chr15 45997868 45998188 - +chr15 46006738 46007058 - +chr15 47488237 47488557 - +chr15 47763677 47763997 - +chr15 49154028 49154348 - +chr15 49267937 49268257 - +chr15 49714174 49714494 - +chr15 50295986 50296306 - +chr15 51169269 51169363 - +chr15 51178184 51178330 - +chr15 51425197 51425348 - +chr15 51455175 51455306 - +chr15 51484993 51485141 - +chr15 51573574 51573894 - +chr15 51673453 51673773 - +chr15 51675019 51675339 - +chr15 52068413 52068733 - +chr15 52371120 52371440 - +chr15 52500591 52500911 - +chr15 52519605 52519925 - +chr15 53072730 53073050 - +chr15 55460700 55460784 - +chr15 55582670 55582990 - +chr15 56138358 56138678 - +chr15 56189948 56190268 - +chr15 56538296 56538511 - +chr15 56783360 56783680 - +chr15 57430512 57430832 - +chr15 57632920 57633240 - +chr15 57834429 57834749 - +chr15 57852124 57852230 - +chr15 57853206 57853526 - +chr15 58024732 58025052 - +chr15 58158380 58158700 - +chr15 58363046 58363366 - +chr15 58474798 58475118 - +chr15 58562171 58562491 - +chr15 59249589 59249909 - +chr15 59785101 
59785421 - +chr15 59825009 59825329 - +chr15 60472037 60472357 - +chr15 60619048 60619368 - +chr15 60800024 60800344 - +chr15 60812986 60813306 - +chr15 61209162 61209482 - +chr15 61343676 61343996 - +chr15 61487381 61487701 - +chr15 62126409 62126552 - +chr15 62127823 62127929 - +chr15 62546578 62546898 - +chr15 63034638 63034958 - +chr15 63340179 63340499 - +chr15 63385839 63386159 - +chr15 63779531 63779851 - +chr15 63902341 63902661 - +chr15 64183233 64183553 - +chr15 64237570 64237722 - +chr15 64359760 64360080 - +chr15 64445843 64446004 - +chr15 64973656 64973976 - +chr15 65020514 65020661 - +chr15 65023499 65023819 - +chr15 65102263 65102375 - +chr15 65127989 65128309 - +chr15 65197750 65198070 - +chr15 65325391 65325711 - +chr15 65346704 65346860 - +chr15 65393919 65394239 - +chr15 65481487 65481807 - +chr15 65563320 65563640 - +chr15 65590070 65590390 - +chr15 65596598 65596918 - +chr15 65611191 65611511 - +chr15 65677188 65677508 - +chr15 65677967 65678287 - +chr15 65693101 65693421 - +chr15 65810054 65810181 - +chr15 65930969 65931289 - +chr15 66149218 66149538 - +chr15 66273223 66273543 - +chr15 66544706 66545026 - +chr15 66790056 66790376 - +chr15 66895969 66896289 - +chr15 66911485 66911805 - +chr15 66990827 66990931 - +chr15 67066818 67066924 - +chr15 67143541 67143861 - +chr15 67145859 67145947 - +chr15 67155191 67155511 - +chr15 67196124 67196265 - +chr15 67725871 67726191 - +chr15 67749283 67749603 - +chr15 68128776 68129096 - +chr15 68155769 68156089 - +chr15 68503264 68503376 - +chr15 68545285 68545605 - +chr15 68599004 68599324 - +chr15 68842856 68843176 - +chr15 68846410 68846730 - +chr15 68851953 68852273 - +chr15 68966447 68966583 - +chr15 69168194 69168514 - +chr15 69217243 69217563 - +chr15 69351475 69351795 - +chr15 69686996 69687316 - +chr15 69755049 69755369 - +chr15 69758302 69758622 - +chr15 69768201 69768350 - +chr15 69770588 69770908 - +chr15 69815755 69816075 - +chr15 70021526 70021846 - +chr15 70759278 70759598 - +chr15 70780471 
70780639 - +chr15 70805126 70805446 - +chr15 71117476 71117796 - +chr15 71407636 71407756 - +chr15 71735825 71736145 - +chr15 71740957 71741277 - +chr15 72448490 72448630 - +chr15 72492058 72492378 - +chr15 72518999 72519319 - +chr15 72604965 72605285 - +chr15 72963216 72963536 - +chr15 73014472 73014792 - +chr15 73419688 73420008 - +chr15 73533250 73533570 - +chr15 73928855 73929015 - +chr15 73969428 73969748 - +chr15 73992621 73992941 - +chr15 74189854 74189982 - +chr15 74200671 74200991 - +chr15 74243460 74243780 - +chr15 74257107 74257427 - +chr15 74265296 74265616 - +chr15 74284599 74284919 - +chr15 74419987 74420105 - +chr15 74604126 74604446 - +chr15 74613027 74613347 - +chr15 74614941 74615261 - +chr15 74665982 74666302 - +chr15 74667686 74667777 - +chr15 74674861 74674960 - +chr15 74731093 74731413 - +chr15 74913520 74913656 - +chr15 74924328 74924648 - +chr15 74927512 74927832 - +chr15 75063525 75063647 - +chr15 75080146 75080466 - +chr15 75092617 75092746 - +chr15 75127618 75127938 - +chr15 75193957 75194277 - +chr15 75242802 75243001 - +chr15 75321639 75321959 - +chr15 75497244 75497379 - +chr15 75930649 75930969 - +chr15 76025184 76025504 - +chr15 76431961 76432070 - +chr15 76440340 76440522 - +chr15 76443305 76443625 - +chr15 76478026 76478346 - +chr15 76483107 76483427 - +chr15 76622651 76622971 - +chr15 77316761 77316872 - +chr15 77327415 77327735 - +chr15 77400004 77400324 - +chr15 77789270 77789371 - +chr15 77835806 77835917 - +chr15 77925910 77926108 - +chr15 78113833 78114153 - +chr15 78162808 78163128 - +chr15 78195227 78195547 - +chr15 78202818 78203138 - +chr15 78251852 78251948 - +chr15 78259153 78259473 - +chr15 78400839 78400920 - +chr15 78405570 78405890 - +chr15 78469721 78470041 - +chr15 78533398 78533481 - +chr15 78600861 78600957 - +chr15 78715341 78715436 - +chr15 78910801 78911121 - +chr15 79001760 79002080 - +chr15 79043432 79043752 - +chr15 79103904 79104224 - +chr15 79320732 79321052 - +chr15 79476862 79476991 - +chr15 79632387 
79632707 - +chr15 79673379 79673699 - +chr15 79888430 79888750 - +chr15 80142174 80142494 - +chr15 80145877 80146197 - +chr15 80285832 80286012 - +chr15 80465698 80466018 - +chr15 80534528 80534848 - +chr15 80711278 80711598 - +chr15 80748172 80748492 - +chr15 80860049 80860369 - +chr15 81262392 81262712 - +chr15 81558298 81558420 - +chr15 81558810 81559130 - +chr15 81567509 81567602 - +chr15 82339463 82339783 - +chr15 82400932 82401062 - +chr15 82490359 82490679 - +chr15 83518396 83518716 - +chr15 83679617 83679937 - +chr15 84107458 84107778 - +chr15 84228164 84228484 - +chr15 84811583 84811903 - +chr15 85471534 85471691 - +chr15 85498583 85498903 - +chr15 85874464 85874603 - +chr15 86374801 86375121 - +chr15 86383814 86384134 - +chr15 86397587 86397907 - +chr15 86402028 86402348 - +chr15 88215568 88215888 - +chr15 88790201 88790289 - +chr15 88817809 88818129 - +chr15 88993456 88993776 - +chr15 88998257 88998577 - +chr15 89089820 89090140 - +chr15 89157790 89158110 - +chr15 89200619 89200939 - +chr15 89421915 89422235 - +chr15 89448083 89448219 - +chr15 89560841 89561161 - +chr15 89600842 89601008 - +chr15 89672771 89673091 - +chr15 89714757 89715077 - +chr15 89721010 89721330 - +chr15 89750193 89750330 - +chr15 89901938 89902067 - +chr15 89959556 89959876 - +chr15 89973956 89974276 - +chr15 89985941 89986261 - +chr15 90190450 90190562 - +chr15 90283303 90283623 - +chr15 90304084 90304237 - +chr15 90418235 90418555 - +chr15 90514805 90515125 - +chr15 90614518 90614838 - +chr15 90630056 90630376 - +chr15 90701082 90701402 - +chr15 90762869 90763189 - +chr15 90764545 90764865 - +chr15 90836554 90836874 - +chr15 90844449 90844769 - +chr15 90863356 90863499 - +chr15 91191308 91191628 - +chr15 91429332 91429496 - +chr15 91445472 91445792 - +chr15 91475681 91476001 - +chr15 91477626 91477755 - +chr15 91499870 91499993 - +chr15 91599106 91599426 - +chr15 91790839 91791159 - +chr15 93258054 93258374 - +chr15 93573912 93574232 - +chr15 93631086 93631406 - +chr15 94663149 
94663469 - +chr15 94867470 94867790 - +chr15 95802666 95802986 - +chr15 96900088 96900408 - +chr15 97778611 97778931 - +chr15 98452552 98452872 - +chr15 98721236 98721556 - +chr15 98854844 98855164 - +chr15 98964863 98965183 - +chr15 98985642 98985962 - +chr15 99062353 99062673 - +chr15 99202471 99202791 - +chr15 99322851 99323171 - +chr15 99394167 99394487 - +chr15 99619967 99620287 - +chr15 99640751 99641071 - +chr15 99753443 99753763 - +chr15 100296060 100296380 - +chr15 100890345 100890665 - +chr15 101212213 101212533 - +chr15 101413313 101413633 - +chr15 101458291 101458371 - +chr15 101547191 101547511 - +chr15 101548628 101548948 - +chr15 101555523 101555631 - +chr15 101629101 101629218 - +chr15 102181405 102181725 - +chr15 102192504 102192824 - +chr16 115791 115885 - +chr16 147016 147175 - +chr16 157048 157368 - +chr16 167793 168113 - +chr16 231436 231756 - +chr16 412169 412489 - +chr16 441898 442043 - +chr16 629102 629422 - +chr16 761737 762057 - +chr16 790962 791282 - +chr16 835633 835953 - +chr16 1105123 1105443 - +chr16 1156301 1156621 - +chr16 1244646 1244966 - +chr16 1350903 1351223 - +chr16 1773588 1773908 - +chr16 1862659 1862979 - +chr16 1979668 1979988 - +chr16 2053911 2054068 - +chr16 2190624 2190944 - +chr16 2390800 2391120 - +chr16 2473197 2473391 - +chr16 2551293 2551440 - +chr16 2552963 2553283 - +chr16 2888884 2889204 - +chr16 2918387 2918707 - +chr16 2925006 2925089 - +chr16 2954178 2954498 - +chr16 2969684 2970004 - +chr16 2977076 2977396 - +chr16 3011838 3011965 - +chr16 3033659 3033979 - +chr16 3081786 3081880 - +chr16 3179552 3179872 - +chr16 3215972 3216292 - +chr16 3989197 3989277 - +chr16 4250253 4250573 - +chr16 4304077 4304332 - +chr16 4338425 4338745 - +chr16 4359736 4360056 - +chr16 4380396 4380555 - +chr16 4409078 4409398 - +chr16 4452678 4452998 - +chr16 4454412 4454732 - +chr16 4556431 4556751 - +chr16 4598784 4599104 - +chr16 4635464 4635784 - +chr16 4674231 4674551 - +chr16 4700520 4700840 - +chr16 5006174 5006494 - +chr16 
5142321 5142641 - +chr16 8617771 8617912 - +chr16 8754831 8755151 - +chr16 8974225 8974345 - +chr16 9102647 9102967 - +chr16 9233011 9233147 - +chr16 9254000 9254320 - +chr16 10603450 10603770 - +chr16 10615567 10615887 - +chr16 10673769 10674089 - +chr16 10830948 10831268 - +chr16 11012714 11012858 - +chr16 11047897 11048043 - +chr16 11057881 11058201 - +chr16 11349549 11349869 - +chr16 11351178 11351498 - +chr16 11450571 11450891 - +chr16 11490181 11490335 - +chr16 11499446 11499766 - +chr16 11626008 11626328 - +chr16 11735235 11735555 - +chr16 11808466 11808786 - +chr16 12161750 12162070 - +chr16 12189061 12189381 - +chr16 12322582 12322902 - +chr16 12549258 12549356 - +chr16 12913236 12913556 - +chr16 13959994 13960314 - +chr16 14013521 14013841 - +chr16 14343419 14343739 - +chr16 15859729 15860049 - +chr16 15915837 15915920 - +chr16 15968720 15969040 - +chr16 16221535 16221855 - +chr16 16288746 16289066 - +chr16 17288080 17288400 - +chr16 17561825 17562145 - +chr16 17650481 17650801 - +chr16 17673176 17673496 - +chr16 18812978 18813298 - +chr16 18980665 18980985 - +chr16 18995023 18995343 - +chr16 19083605 19083925 - +chr16 19097931 19098043 - +chr16 19234596 19234916 - +chr16 19513993 19514313 - +chr16 19595045 19595365 - +chr16 19897657 19897872 - +chr16 19910689 19911009 - +chr16 19918901 19919027 - +chr16 20394766 20395086 - +chr16 20397099 20397419 - +chr16 20786085 20786405 - +chr16 20861286 20861447 - +chr16 21095179 21095499 - +chr16 21200013 21200333 - +chr16 21289513 21289833 - +chr16 21311895 21312215 - +chr16 21566235 21566399 - +chr16 21719861 21720181 - +chr16 22012433 22012544 - +chr16 22089058 22089378 - +chr16 22199933 22200253 - +chr16 22207238 22207558 - +chr16 22308449 22308769 - +chr16 22311820 22312140 - +chr16 22332586 22332906 - +chr16 22692029 22692349 - +chr16 22910167 22910487 - +chr16 23072062 23072382 - +chr16 23446781 23447101 - +chr16 23568638 23568958 - +chr16 23607645 23607817 - +chr16 23739240 23739560 - +chr16 23812761 
23813081 - +chr16 24528348 24528668 - +chr16 24845803 24846123 - +chr16 25245058 25245378 - +chr16 25850073 25850393 - +chr16 27121092 27121187 - +chr16 27166439 27166606 - +chr16 27190824 27191144 - +chr16 27246302 27246622 - +chr16 27468560 27468643 - +chr16 27527221 27527541 - +chr16 27616867 27617187 - +chr16 27651131 27651451 - +chr16 27701957 27702277 - +chr16 27791210 27791530 - +chr16 28082180 28082296 - +chr16 28289016 28289336 - +chr16 28328420 28328740 - +chr16 28333196 28333293 - +chr16 28338232 28338552 - +chr16 28957005 28957085 - +chr16 28985266 28985586 - +chr16 28992896 28993216 - +chr16 29703303 29703623 - +chr16 29711949 29712057 - +chr16 29740583 29740903 - +chr16 29818925 29819245 - +chr16 29827542 29827862 - +chr16 29859875 29860195 - +chr16 29874783 29875103 - +chr16 30016856 30017176 - +chr16 30032976 30033085 - +chr16 30043577 30043897 - +chr16 30103245 30103565 - +chr16 30420802 30421122 - +chr16 30461110 30461430 - +chr16 30609503 30609823 - +chr16 30640201 30640284 - +chr16 30705271 30705591 - +chr16 30825609 30825929 - +chr16 30856034 30856354 - +chr16 30932946 30933266 - +chr16 30953373 30953459 - +chr16 30964971 30965291 - +chr16 30996281 30996601 - +chr16 31117070 31117185 - +chr16 31153853 31153948 - +chr16 31171018 31171338 - +chr16 31454394 31454714 - +chr16 31463126 31463446 - +chr16 31484044 31484364 - +chr16 31708263 31708583 - +chr16 31870726 31871046 - +chr16 46588401 46588512 - +chr16 46707396 46707552 - +chr16 46887877 46888197 - +chr16 46963654 46963974 - +chr16 47071697 47072017 - +chr16 47175689 47175840 - +chr16 47292799 47293119 - +chr16 48088962 48089282 - +chr16 48151443 48151763 - +chr16 48190679 48190999 - +chr16 48387518 48387838 - +chr16 48657237 48657360 - +chr16 48664438 48664567 - +chr16 48721390 48721710 - +chr16 49384442 49384762 - +chr16 50058820 50059049 - +chr16 50543525 50543845 - +chr16 50840228 50840320 - +chr16 51796051 51796371 - +chr16 51852303 51852623 - +chr16 52225565 52225707 - +chr16 52946500 
52946820 - +chr16 53088807 53088952 - +chr16 53544398 53544718 - +chr16 53979842 53980162 - +chr16 54227572 54227892 - +chr16 55499123 55499247 - +chr16 55978328 55978448 - +chr16 56352356 56352446 - +chr16 56390372 56390692 - +chr16 56610955 56611275 - +chr16 56736074 56736188 - +chr16 57118245 57118402 - +chr16 57334632 57334717 - +chr16 57451062 57451190 - +chr16 57452795 57453115 - +chr16 57456311 57456631 - +chr16 57496353 57496673 - +chr16 57553779 57553912 - +chr16 57610355 57610564 - +chr16 57625309 57625629 - +chr16 57683001 57682941 A +chr16 57683001 57682965 A +chr16 57683001 57683273 - +chr16 57683001 57683307 A +chr16 57683001 57683584 A +chr16 57683001 57683662 A +chr16 57683001 57683692 A +chr16 57683001 57683936 A +chr16 57683001 57683972 A +chr16 57683001 57684399 A +chr16 57683001 57684417 A +chr16 57683001 57684543 A +chr16 57683001 57684811 A +chr16 57683001 57684883 A +chr16 57683001 57685041 A +chr16 57683001 57685310 A +chr16 57683001 57685348 A +chr16 57683001 57685692 A +chr16 57683001 57685723 A +chr16 57683001 57686039 A +chr16 57683001 57686229 A +chr16 57683001 57686230 A +chr16 57683001 57686420 A +chr16 57683001 57686438 A +chr16 57683001 57686861 A +chr16 57683002 57682327 A +chr16 57683002 57682838 A +chr16 57683002 57683083 A +chr16 57683002 57684171 A +chr16 57683002 57684635 A +chr16 57683002 57684755 A +chr16 57683002 57684913 A +chr16 57683002 57685138 A +chr16 57683002 57685240 A +chr16 57683002 57685395 A +chr16 57683002 57686344 A +chr16 57683002 57686540 A +chr16 57683002 57688355 A +chr16 57683003 57684002 A +chr16 57683003 57684224 A +chr16 57683003 57684484 A +chr16 57683003 57684525 A +chr16 57683003 57684638 A +chr16 57683003 57685066 A +chr16 57683003 57685138 A +chr16 57683003 57685231 A +chr16 57683003 57685323 A +chr16 57683003 57685355 A +chr16 57683003 57685418 A +chr16 57683003 57685588 A +chr16 57683003 57685788 A +chr16 57683003 57686204 A +chr16 57683003 57686234 A +chr16 57683003 57687264 A +chr16 57683003 
57687559 A +chr16 57683004 57681945 A +chr16 57683004 57682489 A +chr16 57683004 57683800 A +chr16 57683004 57683896 A +chr16 57683004 57684157 A +chr16 57683004 57684253 A +chr16 57683004 57684427 A +chr16 57683004 57684616 A +chr16 57683004 57684685 A +chr16 57683004 57684835 A +chr16 57683004 57684907 A +chr16 57683004 57684972 A +chr16 57683004 57685062 A +chr16 57683004 57685090 A +chr16 57683004 57685373 A +chr16 57683004 57685489 A +chr16 57683004 57685964 A +chr16 57683004 57686248 A +chr16 57683004 57686299 A +chr16 57683004 57686974 A +chr16 57683005 57683616 A +chr16 57683005 57683849 A +chr16 57683005 57683851 A +chr16 57683005 57683907 A +chr16 57683005 57683959 A +chr16 57683005 57684016 A +chr16 57683005 57684018 A +chr16 57683005 57684042 A +chr16 57683005 57684458 A +chr16 57683005 57684521 A +chr16 57683005 57684751 A +chr16 57683005 57684869 A +chr16 57683005 57684987 A +chr16 57683005 57685285 A +chr16 57683005 57685296 A +chr16 57683005 57685672 A +chr16 57683005 57685991 A +chr16 57683005 57686067 A +chr16 57683005 57686562 A +chr16 57683006 57682251 A +chr16 57683006 57683422 A +chr16 57683006 57683573 A +chr16 57683006 57684363 A +chr16 57683006 57684469 A +chr16 57683006 57684819 A +chr16 57683006 57684983 A +chr16 57683006 57685027 A +chr16 57683006 57685104 A +chr16 57683006 57685146 A +chr16 57683006 57685148 A +chr16 57683006 57685185 A +chr16 57683006 57685427 A +chr16 57683006 57685559 A +chr16 57683006 57685753 A +chr16 57683006 57686890 A +chr16 57683006 57687131 A +chr16 57683007 57682742 A +chr16 57683007 57683461 A +chr16 57683007 57683802 A +chr16 57683007 57684026 A +chr16 57683007 57684124 A +chr16 57683007 57684183 A +chr16 57683007 57684364 A +chr16 57683007 57684410 A +chr16 57683007 57684540 A +chr16 57683007 57684549 A +chr16 57683007 57684955 A +chr16 57683007 57685135 A +chr16 57683007 57685233 A +chr16 57683007 57685235 A +chr16 57683007 57685524 A +chr16 57683007 57685591 A +chr16 57683007 57685627 A +chr16 57683007 
57685684 A +chr16 57683007 57686094 A +chr16 57683008 57683374 A +chr16 57683008 57683635 A +chr16 57683008 57683640 A +chr16 57683008 57684088 A +chr16 57683008 57684365 A +chr16 57683008 57684518 A +chr16 57683008 57684649 A +chr16 57683008 57684767 A +chr16 57683008 57685235 A +chr16 57683008 57685466 A +chr16 57683008 57685837 A +chr16 57683008 57685936 A +chr16 57683008 57685939 A +chr16 57683008 57686037 A +chr16 57683008 57686196 A +chr16 57683008 57686270 A +chr16 57683008 57686288 A +chr16 57683008 57686383 A +chr16 57683008 57686661 A +chr16 57683008 57686723 A +chr16 57683008 57686740 A +chr16 57683008 57686807 A +chr16 57683009 57683407 A +chr16 57683009 57683575 A +chr16 57683009 57684352 A +chr16 57683009 57684441 A +chr16 57683009 57684579 A +chr16 57683009 57684693 A +chr16 57683009 57684703 A +chr16 57683009 57684850 A +chr16 57683009 57685111 A +chr16 57683009 57685113 A +chr16 57683009 57685148 A +chr16 57683009 57685160 A +chr16 57683009 57685303 A +chr16 57683009 57685333 A +chr16 57683009 57685691 A +chr16 57683009 57686255 A +chr16 57683009 57686849 A +chr16 57683010 57684044 A +chr16 57683010 57684425 A +chr16 57683010 57684504 A +chr16 57683010 57684667 A +chr16 57683010 57684825 A +chr16 57683010 57684842 A +chr16 57683010 57684842 A +chr16 57683010 57684851 A +chr16 57683010 57684864 A +chr16 57683010 57685073 A +chr16 57683010 57685234 A +chr16 57683010 57685333 A +chr16 57683010 57685843 A +chr16 57683010 57685869 A +chr16 57683010 57685883 A +chr16 57683010 57685938 A +chr16 57683011 57683350 A +chr16 57683011 57683591 A +chr16 57683011 57683602 A +chr16 57683011 57683741 A +chr16 57683011 57684258 A +chr16 57683011 57684555 A +chr16 57683011 57684861 A +chr16 57683011 57685914 A +chr16 57683011 57685980 A +chr16 57683011 57686185 A +chr16 57683011 57686316 A +chr16 57683012 57683091 A +chr16 57683012 57683361 A +chr16 57683012 57683613 A +chr16 57683012 57684054 A +chr16 57683012 57684159 A +chr16 57683012 57684331 A +chr16 57683012 
57684378 A +chr16 57683012 57684379 A +chr16 57683012 57684464 A +chr16 57683012 57684545 A +chr16 57683012 57684556 A +chr16 57683012 57684712 A +chr16 57683012 57684775 A +chr16 57683012 57684840 A +chr16 57683012 57685063 A +chr16 57683012 57685071 A +chr16 57683012 57685236 A +chr16 57683012 57685256 A +chr16 57683012 57685433 A +chr16 57683012 57685645 A +chr16 57683012 57685707 A +chr16 57683012 57685721 A +chr16 57683012 57685869 A +chr16 57683012 57686114 A +chr16 57683012 57686310 A +chr16 57683012 57686311 A +chr16 57683012 57686315 A +chr16 57683012 57688095 A +chr16 57683013 57683605 A +chr16 57683013 57683632 A +chr16 57683013 57683715 A +chr16 57683013 57683890 A +chr16 57683013 57684093 A +chr16 57683013 57684310 A +chr16 57683013 57684602 A +chr16 57683013 57684731 A +chr16 57683013 57685286 A +chr16 57683013 57685301 A +chr16 57683013 57685388 A +chr16 57683013 57685654 A +chr16 57683013 57686022 A +chr16 57683013 57686156 A +chr16 57683013 57686322 A +chr16 57683013 57686643 A +chr16 57683014 57683892 A +chr16 57683014 57684046 A +chr16 57683014 57684408 A +chr16 57683014 57684447 A +chr16 57683014 57684841 A +chr16 57683014 57684855 A +chr16 57683014 57685004 A +chr16 57683014 57685044 A +chr16 57683014 57685775 A +chr16 57683014 57685815 A +chr16 57683014 57686097 A +chr16 57683014 57686903 A +chr16 57683014 57687043 A +chr16 57683015 57684041 A +chr16 57683015 57684102 A +chr16 57683015 57684114 A +chr16 57683015 57684199 A +chr16 57683015 57684322 A +chr16 57683015 57684540 A +chr16 57683015 57684739 A +chr16 57683015 57684757 A +chr16 57683015 57684782 A +chr16 57683015 57684846 A +chr16 57683015 57684981 A +chr16 57683015 57685054 A +chr16 57683015 57685291 A +chr16 57683015 57685318 A +chr16 57683015 57685377 A +chr16 57683015 57685521 A +chr16 57683015 57685567 A +chr16 57683015 57686009 A +chr16 57683015 57686355 A +chr16 57683015 57686378 A +chr16 57683016 57683346 A +chr16 57683016 57683439 A +chr16 57683016 57683665 A +chr16 57683016 
57683668 A +chr16 57683016 57684241 A +chr16 57683016 57684401 A +chr16 57683016 57684520 A +chr16 57683016 57684646 A +chr16 57683016 57684665 A +chr16 57683016 57684711 A +chr16 57683016 57684772 A +chr16 57683016 57685106 A +chr16 57683016 57685274 A +chr16 57683016 57685350 A +chr16 57683016 57686678 A +chr16 57683017 57683519 A +chr16 57683017 57683956 A +chr16 57683017 57684164 A +chr16 57683017 57684165 A +chr16 57683017 57684525 A +chr16 57683017 57684762 A +chr16 57683017 57685282 A +chr16 57683017 57685411 A +chr16 57683017 57685466 A +chr16 57683017 57685699 A +chr16 57683017 57685709 A +chr16 57683017 57685724 A +chr16 57683017 57685813 A +chr16 57683017 57685888 A +chr16 57683017 57685953 A +chr16 57683017 57686581 A +chr16 57683017 57687018 A +chr16 57683018 57683084 A +chr16 57683018 57683673 A +chr16 57683018 57683734 A +chr16 57683018 57684027 A +chr16 57683018 57684079 A +chr16 57683018 57684156 A +chr16 57683018 57684298 A +chr16 57683018 57684377 A +chr16 57683018 57684822 A +chr16 57683018 57685227 A +chr16 57683018 57685360 A +chr16 57683018 57685367 A +chr16 57683018 57685586 A +chr16 57683018 57685688 A +chr16 57683018 57685728 A +chr16 57683018 57685761 A +chr16 57683018 57685871 A +chr16 57683018 57686201 A +chr16 57683018 57686562 A +chr16 57683019 57682177 A +chr16 57683019 57683454 A +chr16 57683019 57683456 A +chr16 57683019 57683897 A +chr16 57683019 57683922 A +chr16 57683019 57683963 A +chr16 57683019 57684336 A +chr16 57683019 57684443 A +chr16 57683019 57684798 A +chr16 57683019 57685043 A +chr16 57683019 57685233 A +chr16 57683019 57685295 A +chr16 57683019 57686174 A +chr16 57683019 57686281 A +chr16 57683020 57684190 A +chr16 57683020 57684330 A +chr16 57683020 57684408 A +chr16 57683020 57684762 A +chr16 57683020 57684769 A +chr16 57683020 57684789 A +chr16 57683020 57684875 A +chr16 57683020 57685271 A +chr16 57683020 57685325 A +chr16 57683020 57685402 A +chr16 57683020 57685591 A +chr16 57683020 57685633 A +chr16 57683020 
57685839 A +chr16 57683020 57686273 A +chr16 57683020 57686413 A +chr16 57683020 57686583 A +chr16 57683020 57686717 A +chr16 57683020 57686813 A +chr16 57683021 57683642 A +chr16 57683021 57683830 A +chr16 57683021 57683834 A +chr16 57683021 57683879 A +chr16 57683021 57684157 A +chr16 57683021 57684190 A +chr16 57683021 57684212 A +chr16 57683021 57684252 A +chr16 57683021 57684996 A +chr16 57683021 57685016 A +chr16 57683021 57685078 A +chr16 57683021 57685377 A +chr16 57683021 57685530 A +chr16 57683021 57685798 A +chr16 57683021 57685837 A +chr16 57683021 57685867 A +chr16 57683021 57686157 A +chr16 57683021 57686159 A +chr16 57683021 57686307 A +chr16 57683021 57686994 A +chr16 57683021 57687416 A +chr16 57683021 57687561 A +chr16 57683021 57687616 A +chr16 57683021 57687761 A +chr16 57683022 57684065 A +chr16 57683022 57684242 A +chr16 57683022 57684344 A +chr16 57683022 57684449 A +chr16 57683022 57684453 A +chr16 57683022 57684745 A +chr16 57683022 57684938 A +chr16 57683022 57685164 A +chr16 57683022 57685291 A +chr16 57683022 57685375 A +chr16 57683022 57685698 A +chr16 57683022 57686115 A +chr16 57683022 57686191 A +chr16 57683022 57686847 A +chr16 57683022 57687446 A +chr16 57683023 57682920 A +chr16 57683023 57683944 A +chr16 57683023 57684254 A +chr16 57683023 57684453 A +chr16 57683023 57684813 A +chr16 57683023 57684897 A +chr16 57683023 57684999 A +chr16 57683023 57685018 A +chr16 57683023 57685050 A +chr16 57683023 57685160 A +chr16 57683023 57685246 A +chr16 57683023 57685343 A +chr16 57683023 57685414 A +chr16 57683023 57685420 A +chr16 57683023 57685487 A +chr16 57683023 57685627 A +chr16 57683023 57685826 A +chr16 57683023 57685840 A +chr16 57683023 57686347 A +chr16 57683024 57684026 A +chr16 57683024 57684340 A +chr16 57683024 57684356 A +chr16 57683024 57684478 A +chr16 57683024 57684990 A +chr16 57683024 57685000 A +chr16 57683024 57685005 A +chr16 57683024 57685264 A +chr16 57683024 57685478 A +chr16 57683024 57685687 A +chr16 57683024 
57685771 A +chr16 57683024 57685781 A +chr16 57683024 57686198 A +chr16 57683024 57686415 A +chr16 57683024 57687090 A +chr16 57683024 57687173 A +chr16 57683025 57682965 A +chr16 57683025 57683206 A +chr16 57683025 57683319 A +chr16 57683025 57683435 A +chr16 57683025 57683604 A +chr16 57683025 57683746 A +chr16 57683025 57684010 A +chr16 57683025 57684224 A +chr16 57683025 57684350 A +chr16 57683025 57684534 A +chr16 57683025 57684638 A +chr16 57683025 57684705 A +chr16 57683025 57684793 A +chr16 57683025 57684820 A +chr16 57683025 57685052 A +chr16 57683025 57685493 A +chr16 57683025 57685548 A +chr16 57683025 57685818 A +chr16 57683025 57685907 A +chr16 57683025 57685907 A +chr16 57683025 57686148 A +chr16 57683025 57686576 A +chr16 57683026 57683926 A +chr16 57683026 57684170 A +chr16 57683026 57684296 A +chr16 57683026 57684343 A +chr16 57683026 57684454 A +chr16 57683026 57684749 A +chr16 57683026 57684836 A +chr16 57683026 57685413 A +chr16 57683026 57685486 A +chr16 57683026 57685893 A +chr16 57683026 57686244 A +chr16 57683026 57686483 A +chr16 57683026 57686578 A +chr16 57683027 57684465 A +chr16 57683027 57684724 A +chr16 57683027 57685019 A +chr16 57683027 57685240 A +chr16 57683027 57685384 A +chr16 57683027 57685642 A +chr16 57683027 57685688 A +chr16 57683027 57685979 A +chr16 57683027 57686000 A +chr16 57683027 57686383 A +chr16 57683027 57687113 A +chr16 57683028 57683296 A +chr16 57683028 57683681 A +chr16 57683028 57683708 A +chr16 57683028 57683975 A +chr16 57683028 57683990 A +chr16 57683028 57684090 A +chr16 57683028 57684263 A +chr16 57683028 57684274 A +chr16 57683028 57684275 A +chr16 57683028 57684363 A +chr16 57683028 57684546 A +chr16 57683028 57685289 A +chr16 57683028 57685527 A +chr16 57683028 57685541 A +chr16 57683028 57685875 A +chr16 57683028 57685968 A +chr16 57683028 57686319 A +chr16 57683028 57686488 A +chr16 57683028 57686579 A +chr16 57683028 57686805 A +chr16 57683029 57682507 A +chr16 57683029 57682972 A +chr16 57683029 
57683062 A +chr16 57683029 57683596 A +chr16 57683029 57684062 A +chr16 57683029 57684589 A +chr16 57683029 57684618 A +chr16 57683029 57684632 A +chr16 57683029 57684640 A +chr16 57683029 57684709 A +chr16 57683029 57684834 A +chr16 57683029 57685083 A +chr16 57683029 57685106 A +chr16 57683029 57685190 A +chr16 57683029 57685334 A +chr16 57683029 57685355 A +chr16 57683029 57686005 A +chr16 57683029 57686094 A +chr16 57683029 57686244 A +chr16 57683029 57686719 A +chr16 57683029 57686742 A +chr16 57683029 57687398 A +chr16 57683030 57682620 A +chr16 57683030 57683014 A +chr16 57683030 57683890 A +chr16 57683030 57684278 A +chr16 57683030 57684307 A +chr16 57683030 57684383 A +chr16 57683030 57684943 A +chr16 57683030 57685051 A +chr16 57683030 57685071 A +chr16 57683030 57685271 A +chr16 57683030 57685490 A +chr16 57683030 57686912 A +chr16 57683031 57682987 A +chr16 57683031 57683687 A +chr16 57683031 57684048 A +chr16 57683031 57684223 A +chr16 57683031 57684288 A +chr16 57683031 57684302 A +chr16 57683031 57684472 A +chr16 57683031 57684653 A +chr16 57683031 57684775 A +chr16 57683031 57684816 A +chr16 57683031 57684843 A +chr16 57683031 57684987 A +chr16 57683031 57684994 A +chr16 57683031 57685056 A +chr16 57683031 57685410 A +chr16 57683031 57685466 A +chr16 57683031 57685739 A +chr16 57683031 57686065 A +chr16 57683031 57686807 A +chr16 57683031 57686945 A +chr16 57683031 57687019 A +chr16 57683032 57682931 A +chr16 57683032 57683385 A +chr16 57683032 57683487 A +chr16 57683032 57683840 A +chr16 57683032 57683985 A +chr16 57683032 57684056 A +chr16 57683032 57684613 A +chr16 57683032 57684758 A +chr16 57683032 57684805 A +chr16 57683032 57684828 A +chr16 57683032 57685138 A +chr16 57683032 57685230 A +chr16 57683032 57685276 A +chr16 57683032 57685319 A +chr16 57683032 57685367 A +chr16 57683032 57685704 A +chr16 57683032 57685727 A +chr16 57683032 57685969 A +chr16 57683032 57686024 A +chr16 57683032 57686049 A +chr16 57683032 57686085 A +chr16 57683032 
57686159 A +chr16 57683032 57686159 A +chr16 57683032 57686453 A +chr16 57683032 57686701 A +chr16 57683032 57686830 A +chr16 57683033 57681775 A +chr16 57683033 57682740 A +chr16 57683033 57683223 A +chr16 57683033 57683533 A +chr16 57683033 57683847 A +chr16 57683033 57683896 A +chr16 57683033 57684307 A +chr16 57683033 57684443 A +chr16 57683033 57684558 A +chr16 57683033 57684573 A +chr16 57683033 57684780 A +chr16 57683033 57684920 A +chr16 57683033 57685188 A +chr16 57683033 57685226 A +chr16 57683033 57685365 A +chr16 57683033 57685447 A +chr16 57683033 57687188 A +chr16 57683033 57688414 A +chr16 57683034 57682622 A +chr16 57683034 57684300 A +chr16 57683034 57684315 A +chr16 57683034 57684443 A +chr16 57683034 57685067 A +chr16 57683034 57685133 A +chr16 57683034 57685193 A +chr16 57683034 57685246 A +chr16 57683034 57685432 A +chr16 57683034 57685633 A +chr16 57683034 57685771 A +chr16 57683034 57685914 A +chr16 57683034 57685982 A +chr16 57683034 57686010 A +chr16 57683034 57686475 A +chr16 57683034 57686803 A +chr16 57683034 57686810 A +chr16 57683034 57687227 A +chr16 57683035 57683170 A +chr16 57683035 57684347 A +chr16 57683035 57684600 A +chr16 57683035 57684955 A +chr16 57683035 57685169 A +chr16 57683035 57685182 A +chr16 57683035 57685274 A +chr16 57683035 57685277 A +chr16 57683035 57685521 A +chr16 57683035 57685561 A +chr16 57683035 57685870 A +chr16 57683035 57686061 A +chr16 57683035 57686548 A +chr16 57683036 57682993 A +chr16 57683036 57683235 A +chr16 57683036 57683774 A +chr16 57683036 57683871 A +chr16 57683036 57683929 A +chr16 57683036 57684104 A +chr16 57683036 57684282 A +chr16 57683036 57684350 A +chr16 57683036 57684401 A +chr16 57683036 57684638 A +chr16 57683036 57684830 A +chr16 57683036 57685112 A +chr16 57683036 57685263 A +chr16 57683036 57685509 A +chr16 57683036 57685631 A +chr16 57683036 57685825 A +chr16 57683036 57685885 A +chr16 57683036 57685970 A +chr16 57683036 57686273 A +chr16 57683036 57686459 A +chr16 57683036 
57686704 A +chr16 57683037 57683298 A +chr16 57683037 57683704 A +chr16 57683037 57683859 A +chr16 57683037 57684061 A +chr16 57683037 57684197 A +chr16 57683037 57684364 A +chr16 57683037 57684479 A +chr16 57683037 57684501 A +chr16 57683037 57684598 A +chr16 57683037 57684656 A +chr16 57683037 57684671 A +chr16 57683037 57684978 A +chr16 57683037 57685125 A +chr16 57683037 57685209 A +chr16 57683037 57685228 A +chr16 57683037 57685872 A +chr16 57683037 57685873 A +chr16 57683038 57684044 A +chr16 57683038 57684433 A +chr16 57683038 57684503 A +chr16 57683038 57684624 A +chr16 57683038 57684809 A +chr16 57683038 57684975 A +chr16 57683038 57685167 A +chr16 57683038 57685196 A +chr16 57683038 57685350 A +chr16 57683038 57685614 A +chr16 57683038 57685855 A +chr16 57683038 57685964 A +chr16 57683038 57686087 A +chr16 57683038 57686822 A +chr16 57683038 57686971 A +chr16 57683039 57682799 A +chr16 57683039 57683505 A +chr16 57683039 57683619 A +chr16 57683039 57683680 A +chr16 57683039 57683956 A +chr16 57683039 57683965 A +chr16 57683039 57684037 A +chr16 57683039 57684309 A +chr16 57683039 57684682 A +chr16 57683039 57684972 A +chr16 57683039 57684978 A +chr16 57683039 57685143 A +chr16 57683039 57685494 A +chr16 57683039 57685530 A +chr16 57683039 57685720 A +chr16 57683039 57686872 A +chr16 57683040 57683382 A +chr16 57683040 57683726 A +chr16 57683040 57683898 A +chr16 57683040 57684299 A +chr16 57683040 57684351 A +chr16 57683040 57684530 A +chr16 57683040 57684540 A +chr16 57683040 57684596 A +chr16 57683040 57684759 A +chr16 57683040 57685052 A +chr16 57683040 57685095 A +chr16 57683040 57685139 A +chr16 57683040 57685222 A +chr16 57683040 57685339 A +chr16 57683040 57685552 A +chr16 57683040 57686045 A +chr16 57683041 57683173 A +chr16 57683041 57683792 A +chr16 57683041 57683861 A +chr16 57683041 57684170 A +chr16 57683041 57684233 A +chr16 57683041 57684408 A +chr16 57683041 57684529 A +chr16 57683041 57684561 A +chr16 57683041 57684664 A +chr16 57683041 
57684799 A +chr16 57683041 57684819 A +chr16 57683041 57684844 A +chr16 57683041 57685038 A +chr16 57683041 57685174 A +chr16 57683041 57685315 A +chr16 57683041 57685446 A +chr16 57683041 57685472 A +chr16 57683041 57685735 A +chr16 57683041 57685840 A +chr16 57683041 57685885 A +chr16 57683041 57686004 A +chr16 57683041 57686053 A +chr16 57683041 57687043 A +chr16 57683042 57682764 A +chr16 57683042 57683462 A +chr16 57683042 57683723 A +chr16 57683042 57683742 A +chr16 57683042 57683929 A +chr16 57683042 57684049 A +chr16 57683042 57684229 A +chr16 57683042 57684239 A +chr16 57683042 57684359 A +chr16 57683042 57684509 A +chr16 57683042 57684567 A +chr16 57683042 57684618 A +chr16 57683042 57684764 A +chr16 57683042 57684888 A +chr16 57683042 57684903 A +chr16 57683042 57684905 A +chr16 57683042 57684973 A +chr16 57683042 57685087 A +chr16 57683042 57685137 A +chr16 57683042 57685230 A +chr16 57683042 57685237 A +chr16 57683042 57685321 A +chr16 57683042 57685447 A +chr16 57683042 57685510 A +chr16 57683042 57685656 A +chr16 57683042 57685714 A +chr16 57683042 57685762 A +chr16 57683042 57686028 A +chr16 57683042 57686185 A +chr16 57683042 57686553 A +chr16 57683042 57686607 A +chr16 57683043 57683933 A +chr16 57683043 57683951 A +chr16 57683043 57684002 A +chr16 57683043 57684054 A +chr16 57683043 57684489 A +chr16 57683043 57684599 A +chr16 57683043 57684815 A +chr16 57683043 57684881 A +chr16 57683043 57684992 A +chr16 57683043 57685121 A +chr16 57683043 57685399 A +chr16 57683043 57685658 A +chr16 57683043 57685738 A +chr16 57683043 57685759 A +chr16 57683043 57685787 A +chr16 57683043 57685788 A +chr16 57683043 57685830 A +chr16 57683043 57685850 A +chr16 57683043 57685960 A +chr16 57683043 57686171 A +chr16 57683043 57686248 A +chr16 57683043 57686382 A +chr16 57683044 57683683 A +chr16 57683044 57683694 A +chr16 57683044 57684618 A +chr16 57683044 57684704 A +chr16 57683044 57684745 A +chr16 57683044 57684759 A +chr16 57683044 57684784 A +chr16 57683044 
57684980 A +chr16 57683044 57685227 A +chr16 57683044 57685251 A +chr16 57683044 57685296 A +chr16 57683044 57685450 A +chr16 57683044 57685524 A +chr16 57683044 57685568 A +chr16 57683044 57685739 A +chr16 57683044 57685776 A +chr16 57683044 57685888 A +chr16 57683044 57686132 A +chr16 57683044 57686271 A +chr16 57683044 57686340 A +chr16 57683045 57682492 A +chr16 57683045 57683324 A +chr16 57683045 57683612 A +chr16 57683045 57683700 A +chr16 57683045 57683844 A +chr16 57683045 57683973 A +chr16 57683045 57684818 A +chr16 57683045 57684989 A +chr16 57683045 57685148 A +chr16 57683045 57685223 A +chr16 57683045 57685235 A +chr16 57683045 57685360 A +chr16 57683045 57685385 A +chr16 57683045 57685742 A +chr16 57683045 57685944 A +chr16 57683045 57685958 A +chr16 57683045 57686069 A +chr16 57683045 57686182 A +chr16 57683045 57686317 A +chr16 57683045 57686633 A +chr16 57683045 57686683 A +chr16 57683045 57686806 A +chr16 57683046 57683208 A +chr16 57683046 57683563 A +chr16 57683046 57683671 A +chr16 57683046 57683707 A +chr16 57683046 57684165 A +chr16 57683046 57684450 A +chr16 57683046 57684664 A +chr16 57683046 57684810 A +chr16 57683046 57684959 A +chr16 57683046 57685457 A +chr16 57683046 57685606 A +chr16 57683046 57685834 A +chr16 57683046 57685865 A +chr16 57683046 57685899 A +chr16 57683046 57685973 A +chr16 57683046 57686157 A +chr16 57683046 57686306 A +chr16 57683047 57683378 A +chr16 57683047 57684201 A +chr16 57683047 57684545 A +chr16 57683047 57684621 A +chr16 57683047 57684629 A +chr16 57683047 57684745 A +chr16 57683047 57684979 A +chr16 57683047 57685306 A +chr16 57683047 57685474 A +chr16 57683047 57685714 A +chr16 57683047 57686575 A +chr16 57683048 57683746 A +chr16 57683048 57683800 A +chr16 57683048 57683851 A +chr16 57683048 57683943 A +chr16 57683048 57683966 A +chr16 57683048 57684283 A +chr16 57683048 57684813 A +chr16 57683048 57685107 A +chr16 57683048 57685525 A +chr16 57683048 57685562 A +chr16 57683048 57685801 A +chr16 57683048 
57686485 A +chr16 57683048 57686530 A +chr16 57683048 57686711 A +chr16 57683048 57687037 A +chr16 57683048 57687224 A +chr16 57683049 57683765 A +chr16 57683049 57683847 A +chr16 57683049 57683863 A +chr16 57683049 57683912 A +chr16 57683049 57684014 A +chr16 57683049 57684193 A +chr16 57683049 57684511 A +chr16 57683049 57684553 A +chr16 57683049 57684664 A +chr16 57683049 57684805 A +chr16 57683049 57684872 A +chr16 57683049 57685031 A +chr16 57683049 57685076 A +chr16 57683049 57685394 A +chr16 57683049 57685580 A +chr16 57683049 57685643 A +chr16 57683049 57685826 A +chr16 57683049 57685892 A +chr16 57683049 57685990 A +chr16 57683049 57686004 A +chr16 57683049 57686013 A +chr16 57683049 57686474 A +chr16 57683049 57687178 A +chr16 57683050 57682599 A +chr16 57683050 57683011 A +chr16 57683050 57683102 A +chr16 57683050 57683135 A +chr16 57683050 57683683 A +chr16 57683050 57684432 A +chr16 57683050 57684735 A +chr16 57683050 57684753 A +chr16 57683050 57684878 A +chr16 57683050 57684923 A +chr16 57683050 57685062 A +chr16 57683050 57685256 A +chr16 57683050 57685542 A +chr16 57683050 57685708 A +chr16 57683050 57685786 A +chr16 57683050 57687005 A +chr16 57683050 57687370 A +chr16 57683051 57683581 A +chr16 57683051 57683776 A +chr16 57683051 57684173 A +chr16 57683051 57684565 A +chr16 57683051 57684856 A +chr16 57683051 57685173 A +chr16 57683051 57685581 A +chr16 57683051 57685625 A +chr16 57683051 57685819 A +chr16 57683051 57687007 A +chr16 57683052 57683607 A +chr16 57683052 57684116 A +chr16 57683052 57684182 A +chr16 57683052 57684234 A +chr16 57683052 57684423 A +chr16 57683052 57684521 A +chr16 57683052 57684535 A +chr16 57683052 57684664 A +chr16 57683052 57684891 A +chr16 57683052 57684913 A +chr16 57683052 57685000 A +chr16 57683052 57685485 A +chr16 57683052 57685588 A +chr16 57683052 57685692 A +chr16 57683052 57685723 A +chr16 57683052 57685759 A +chr16 57683052 57685806 A +chr16 57683052 57685984 A +chr16 57683052 57686650 A +chr16 57683052 
57686666 A +chr16 57683053 57683083 A +chr16 57683053 57683182 A +chr16 57683053 57683667 A +chr16 57683053 57683688 A +chr16 57683053 57683773 A +chr16 57683053 57684256 A +chr16 57683053 57684322 A +chr16 57683053 57684350 A +chr16 57683053 57684599 A +chr16 57683053 57684736 A +chr16 57683053 57684770 A +chr16 57683053 57684847 A +chr16 57683053 57684913 A +chr16 57683053 57684957 A +chr16 57683053 57685448 A +chr16 57683053 57685462 A +chr16 57683053 57685599 A +chr16 57683053 57685897 A +chr16 57683053 57686144 A +chr16 57683053 57686622 A +chr16 57683054 57682497 A +chr16 57683054 57683855 A +chr16 57683054 57684086 A +chr16 57683054 57684137 A +chr16 57683054 57684330 A +chr16 57683054 57684342 A +chr16 57683054 57684368 A +chr16 57683054 57684479 A +chr16 57683054 57684575 A +chr16 57683054 57684970 A +chr16 57683054 57685150 A +chr16 57683054 57685176 A +chr16 57683054 57685217 A +chr16 57683054 57685217 A +chr16 57683054 57685295 A +chr16 57683054 57685305 A +chr16 57683054 57685360 A +chr16 57683054 57685401 A +chr16 57683054 57685751 A +chr16 57683054 57685941 A +chr16 57683054 57685975 A +chr16 57683054 57686042 A +chr16 57683054 57686205 A +chr16 57683055 57683312 A +chr16 57683055 57683325 A +chr16 57683055 57683530 A +chr16 57683055 57683914 A +chr16 57683055 57684058 A +chr16 57683055 57684425 A +chr16 57683055 57684679 A +chr16 57683055 57684882 A +chr16 57683055 57684887 A +chr16 57683055 57684915 A +chr16 57683055 57685402 A +chr16 57683055 57686219 A +chr16 57683055 57686340 A +chr16 57683055 57686516 A +chr16 57683056 57683854 A +chr16 57683056 57684283 A +chr16 57683056 57684479 A +chr16 57683056 57684557 A +chr16 57683056 57684669 A +chr16 57683056 57684922 A +chr16 57683056 57685072 A +chr16 57683056 57685494 A +chr16 57683056 57685538 A +chr16 57683056 57685600 A +chr16 57683056 57686028 A +chr16 57683056 57686151 A +chr16 57683056 57686181 A +chr16 57683056 57686591 A +chr16 57683056 57687432 A +chr16 57683057 57682856 A +chr16 57683057 
57683350 A +chr16 57683057 57684552 A +chr16 57683057 57684630 A +chr16 57683057 57684673 A +chr16 57683057 57684783 A +chr16 57683057 57684797 A +chr16 57683057 57684853 A +chr16 57683057 57685029 A +chr16 57683057 57685047 A +chr16 57683057 57685060 A +chr16 57683057 57685105 A +chr16 57683057 57685464 A +chr16 57683057 57685501 A +chr16 57683057 57685802 A +chr16 57683057 57685872 A +chr16 57683057 57686008 A +chr16 57683057 57686771 A +chr16 57683057 57686946 A +chr16 57683058 57682956 A +chr16 57683058 57684215 A +chr16 57683058 57684279 A +chr16 57683058 57684529 A +chr16 57683058 57684726 A +chr16 57683058 57684823 A +chr16 57683058 57685120 A +chr16 57683058 57685391 A +chr16 57683058 57685562 A +chr16 57683058 57685718 A +chr16 57683058 57685867 A +chr16 57683058 57685991 A +chr16 57683058 57686238 A +chr16 57683058 57686342 A +chr16 57683058 57686375 A +chr16 57683059 57682419 A +chr16 57683059 57684157 A +chr16 57683059 57684255 A +chr16 57683059 57684288 A +chr16 57683059 57684383 A +chr16 57683059 57684920 A +chr16 57683059 57685023 A +chr16 57683059 57685151 A +chr16 57683059 57685315 A +chr16 57683059 57685325 A +chr16 57683059 57685349 A +chr16 57683059 57685666 A +chr16 57683059 57685717 A +chr16 57683059 57685825 A +chr16 57683059 57685987 A +chr16 57683059 57685997 A +chr16 57683059 57686142 A +chr16 57683059 57686466 A +chr16 57683059 57687081 A +chr16 57683060 57683863 A +chr16 57683060 57683972 A +chr16 57683060 57684048 A +chr16 57683060 57684119 A +chr16 57683060 57684216 A +chr16 57683060 57684386 A +chr16 57683060 57684393 A +chr16 57683060 57684549 A +chr16 57683060 57684689 A +chr16 57683060 57684829 A +chr16 57683060 57684956 A +chr16 57683060 57685034 A +chr16 57683060 57685045 A +chr16 57683060 57685057 A +chr16 57683060 57685099 A +chr16 57683060 57685446 A +chr16 57683060 57685964 A +chr16 57683060 57686032 A +chr16 57683060 57686275 A +chr16 57683060 57686532 A +chr16 57683060 57687085 A +chr16 57683061 57682426 A +chr16 57683061 
57683253 A +chr16 57683061 57683901 A +chr16 57683061 57683969 A +chr16 57683061 57684569 A +chr16 57683061 57684653 A +chr16 57683061 57684670 A +chr16 57683061 57684855 A +chr16 57683061 57685027 A +chr16 57683061 57685351 A +chr16 57683061 57685763 A +chr16 57683061 57685885 A +chr16 57683061 57686343 A +chr16 57683061 57686602 A +chr16 57683061 57687605 A +chr16 57683062 57683460 A +chr16 57683062 57683593 A +chr16 57683062 57684047 A +chr16 57683062 57684142 A +chr16 57683062 57684187 A +chr16 57683062 57684298 A +chr16 57683062 57684424 A +chr16 57683062 57684546 A +chr16 57683062 57685075 A +chr16 57683062 57685120 A +chr16 57683062 57685167 A +chr16 57683062 57685294 A +chr16 57683062 57685335 A +chr16 57683062 57685468 A +chr16 57683062 57685559 A +chr16 57683062 57685662 A +chr16 57683062 57687543 A +chr16 57683063 57682992 A +chr16 57683063 57683162 A +chr16 57683063 57683325 A +chr16 57683063 57683523 A +chr16 57683063 57683700 A +chr16 57683063 57684049 A +chr16 57683063 57684211 A +chr16 57683063 57684220 A +chr16 57683063 57684290 A +chr16 57683063 57684319 A +chr16 57683063 57684669 A +chr16 57683063 57684687 A +chr16 57683063 57684811 A +chr16 57683063 57684814 A +chr16 57683063 57685086 A +chr16 57683063 57685325 A +chr16 57683063 57685380 A +chr16 57683063 57685539 A +chr16 57683063 57685783 A +chr16 57683063 57686708 A +chr16 57683063 57686758 A +chr16 57683063 57687170 A +chr16 57683064 57684133 A +chr16 57683064 57684184 A +chr16 57683064 57684306 A +chr16 57683064 57684724 A +chr16 57683064 57684937 A +chr16 57683064 57685002 A +chr16 57683064 57685054 A +chr16 57683064 57685326 A +chr16 57683064 57685539 A +chr16 57683064 57685540 A +chr16 57683064 57685625 A +chr16 57683064 57685829 A +chr16 57683064 57685891 A +chr16 57683064 57685968 A +chr16 57683064 57685978 A +chr16 57683064 57686574 A +chr16 57683064 57687607 A +chr16 57683065 57684000 A +chr16 57683065 57684365 A +chr16 57683065 57684505 A +chr16 57683065 57684688 A +chr16 57683065 
57684712 A +chr16 57683065 57684719 A +chr16 57683065 57684745 A +chr16 57683065 57684952 A +chr16 57683065 57685072 A +chr16 57683065 57685551 A +chr16 57683065 57685666 A +chr16 57683065 57685708 A +chr16 57683065 57686199 A +chr16 57683065 57686253 A +chr16 57683065 57687653 A +chr16 57683066 57683084 A +chr16 57683066 57683840 A +chr16 57683066 57684225 A +chr16 57683066 57684250 A +chr16 57683066 57684267 A +chr16 57683066 57684442 A +chr16 57683066 57684583 A +chr16 57683066 57684720 A +chr16 57683066 57684840 A +chr16 57683066 57685209 A +chr16 57683066 57685289 A +chr16 57683066 57685498 A +chr16 57683066 57685499 A +chr16 57683066 57685717 A +chr16 57683066 57685969 A +chr16 57683066 57686016 A +chr16 57683066 57686263 A +chr16 57683066 57686332 A +chr16 57683066 57686721 A +chr16 57683066 57687288 A +chr16 57683067 57683280 A +chr16 57683067 57683542 A +chr16 57683067 57684082 A +chr16 57683067 57684109 A +chr16 57683067 57684399 A +chr16 57683067 57684498 A +chr16 57683067 57684806 A +chr16 57683067 57684916 A +chr16 57683067 57685080 A +chr16 57683067 57685083 A +chr16 57683067 57685163 A +chr16 57683067 57685276 A +chr16 57683067 57685399 A +chr16 57683067 57685491 A +chr16 57683067 57685594 A +chr16 57683067 57685761 A +chr16 57683067 57685781 A +chr16 57683067 57685943 A +chr16 57683067 57686494 A +chr16 57683068 57682700 A +chr16 57683068 57683329 A +chr16 57683068 57684110 A +chr16 57683068 57684474 A +chr16 57683068 57684550 A +chr16 57683068 57684684 A +chr16 57683068 57684869 A +chr16 57683068 57685235 A +chr16 57683068 57685329 A +chr16 57683068 57685436 A +chr16 57683068 57685758 A +chr16 57683068 57685794 A +chr16 57683068 57685902 A +chr16 57683068 57686231 A +chr16 57683069 57682984 A +chr16 57683069 57683238 A +chr16 57683069 57684251 A +chr16 57683069 57684525 A +chr16 57683069 57684582 A +chr16 57683069 57684603 A +chr16 57683069 57684756 A +chr16 57683069 57684929 A +chr16 57683069 57685053 A +chr16 57683069 57685392 A +chr16 57683069 
57685701 A +chr16 57683069 57686540 A +chr16 57683069 57686566 A +chr16 57683069 57687933 A +chr16 57683070 57682966 A +chr16 57683070 57683435 A +chr16 57683070 57684316 A +chr16 57683070 57684670 A +chr16 57683070 57684780 A +chr16 57683070 57684825 A +chr16 57683070 57684923 A +chr16 57683070 57685046 A +chr16 57683070 57685152 A +chr16 57683070 57685155 A +chr16 57683070 57685586 A +chr16 57683070 57685890 A +chr16 57683070 57686076 A +chr16 57683070 57686371 A +chr16 57683070 57686433 A +chr16 57683070 57687362 A +chr16 57683071 57682596 A +chr16 57683071 57683889 A +chr16 57683071 57683926 A +chr16 57683071 57684112 A +chr16 57683071 57684145 A +chr16 57683071 57684570 A +chr16 57683071 57685030 A +chr16 57683071 57685031 A +chr16 57683071 57685242 A +chr16 57683071 57685406 A +chr16 57683071 57685431 A +chr16 57683071 57685702 A +chr16 57683071 57685813 A +chr16 57683071 57686197 A +chr16 57683071 57686270 A +chr16 57683071 57686298 A +chr16 57683071 57687306 A +chr16 57683072 57683795 A +chr16 57683072 57684057 A +chr16 57683072 57684091 A +chr16 57683072 57684117 A +chr16 57683072 57684317 A +chr16 57683072 57684317 A +chr16 57683072 57684330 A +chr16 57683072 57684453 A +chr16 57683072 57684487 A +chr16 57683072 57684599 A +chr16 57683072 57684737 A +chr16 57683072 57684909 A +chr16 57683072 57685006 A +chr16 57683072 57685026 A +chr16 57683072 57685389 A +chr16 57683072 57685435 A +chr16 57683072 57685656 A +chr16 57683072 57685777 A +chr16 57683072 57686187 A +chr16 57683072 57686471 A +chr16 57683072 57686503 A +chr16 57683072 57686627 A +chr16 57683073 57683301 A +chr16 57683073 57683739 A +chr16 57683073 57683838 A +chr16 57683073 57684125 A +chr16 57683073 57685002 A +chr16 57683073 57685138 A +chr16 57683073 57685473 A +chr16 57683073 57685696 A +chr16 57683073 57685846 A +chr16 57683073 57685957 A +chr16 57683073 57686522 A +chr16 57683073 57686530 A +chr16 57683073 57686555 A +chr16 57683073 57686555 A +chr16 57683074 57682877 A +chr16 57683074 
57683906 A +chr16 57683074 57684282 A +chr16 57683074 57684320 A +chr16 57683074 57684490 A +chr16 57683074 57684509 A +chr16 57683074 57684548 A +chr16 57683074 57684818 A +chr16 57683074 57684984 A +chr16 57683074 57685128 A +chr16 57683074 57685238 A +chr16 57683074 57685270 A +chr16 57683074 57685291 A +chr16 57683074 57685292 A +chr16 57683074 57685307 A +chr16 57683074 57685409 A +chr16 57683074 57685468 A +chr16 57683074 57685588 A +chr16 57683074 57685899 A +chr16 57683074 57686262 A +chr16 57683074 57686932 A +chr16 57683074 57687286 A +chr16 57683075 57684210 A +chr16 57683075 57684513 A +chr16 57683075 57684542 A +chr16 57683075 57684637 A +chr16 57683075 57684668 A +chr16 57683075 57684773 A +chr16 57683075 57684840 A +chr16 57683075 57684853 A +chr16 57683075 57684901 A +chr16 57683075 57685359 A +chr16 57683075 57685408 A +chr16 57683075 57685458 A +chr16 57683075 57685714 A +chr16 57683075 57685727 A +chr16 57683075 57685822 A +chr16 57683075 57686373 A +chr16 57683075 57688300 A +chr16 57683076 57683094 A +chr16 57683076 57683452 A +chr16 57683076 57683468 A +chr16 57683076 57683820 A +chr16 57683076 57684074 A +chr16 57683076 57684323 A +chr16 57683076 57685046 A +chr16 57683076 57685088 A +chr16 57683076 57685088 A +chr16 57683076 57685701 A +chr16 57683076 57685756 A +chr16 57683076 57686070 A +chr16 57683076 57686087 A +chr16 57683076 57686165 A +chr16 57683076 57686753 A +chr16 57683076 57686766 A +chr16 57683077 57682191 A +chr16 57683077 57684023 A +chr16 57683077 57684096 A +chr16 57683077 57684197 A +chr16 57683077 57684216 A +chr16 57683077 57684305 A +chr16 57683077 57684338 A +chr16 57683077 57684371 A +chr16 57683077 57684443 A +chr16 57683077 57684505 A +chr16 57683077 57684663 A +chr16 57683077 57684806 A +chr16 57683077 57684815 A +chr16 57683077 57684906 A +chr16 57683077 57685215 A +chr16 57683077 57685253 A +chr16 57683077 57685269 A +chr16 57683077 57685335 A +chr16 57683077 57685357 A +chr16 57683077 57685464 A +chr16 57683077 
57685553 A +chr16 57683077 57685809 A +chr16 57683077 57685835 A +chr16 57683077 57685874 A +chr16 57683077 57685892 A +chr16 57683077 57686161 A +chr16 57683077 57686292 A +chr16 57683077 57686428 A +chr16 57683077 57686651 A +chr16 57683077 57686721 A +chr16 57683077 57687266 A +chr16 57683077 57687452 A +chr16 57683077 57687675 A +chr16 57683078 57683292 A +chr16 57683078 57683329 A +chr16 57683078 57683554 A +chr16 57683078 57684480 A +chr16 57683078 57684853 A +chr16 57683078 57684991 A +chr16 57683078 57685111 A +chr16 57683078 57685346 A +chr16 57683078 57685417 A +chr16 57683078 57685572 A +chr16 57683078 57685672 A +chr16 57683078 57686547 A +chr16 57683079 57683453 A +chr16 57683079 57683626 A +chr16 57683079 57684007 A +chr16 57683079 57684195 A +chr16 57683079 57684209 A +chr16 57683079 57684366 A +chr16 57683079 57684402 A +chr16 57683079 57684441 A +chr16 57683079 57684445 A +chr16 57683079 57684451 A +chr16 57683079 57684629 A +chr16 57683079 57684879 A +chr16 57683079 57685094 A +chr16 57683079 57685126 A +chr16 57683079 57685150 A +chr16 57683079 57685243 A +chr16 57683079 57685628 A +chr16 57683079 57685660 A +chr16 57683079 57685746 A +chr16 57683079 57685822 A +chr16 57683079 57685830 A +chr16 57683079 57686029 A +chr16 57683079 57686124 A +chr16 57683079 57687137 A +chr16 57683079 57687332 A +chr16 57683079 57687624 A +chr16 57683080 57683544 A +chr16 57683080 57684094 A +chr16 57683080 57684500 A +chr16 57683080 57684525 A +chr16 57683080 57684554 A +chr16 57683080 57684571 A +chr16 57683080 57684773 A +chr16 57683080 57684776 A +chr16 57683080 57684778 A +chr16 57683080 57684963 A +chr16 57683080 57685134 A +chr16 57683080 57685135 A +chr16 57683080 57685589 A +chr16 57683080 57685589 A +chr16 57683080 57685770 A +chr16 57683080 57686027 A +chr16 57683081 57681667 A +chr16 57683081 57683743 A +chr16 57683081 57683757 A +chr16 57683081 57684162 A +chr16 57683081 57684199 A +chr16 57683081 57684503 A +chr16 57683081 57684535 A +chr16 57683081 
57684791 A +chr16 57683081 57684918 A +chr16 57683081 57685131 A +chr16 57683081 57685165 A +chr16 57683081 57685242 A +chr16 57683081 57685498 A +chr16 57683081 57685786 A +chr16 57683081 57686034 A +chr16 57683081 57686043 A +chr16 57683081 57686094 A +chr16 57683081 57686339 A +chr16 57683081 57686872 A +chr16 57683082 57683334 A +chr16 57683082 57683917 A +chr16 57683082 57683990 A +chr16 57683082 57684042 A +chr16 57683082 57684742 A +chr16 57683082 57684919 A +chr16 57683082 57684971 A +chr16 57683082 57685098 A +chr16 57683082 57685222 A +chr16 57683082 57685390 A +chr16 57683082 57685784 A +chr16 57683082 57685854 A +chr16 57683082 57685859 A +chr16 57683082 57685980 A +chr16 57683082 57686275 A +chr16 57683082 57686449 A +chr16 57683082 57686608 A +chr16 57683082 57686896 A +chr16 57683082 57687322 A +chr16 57683083 57683073 A +chr16 57683083 57683627 A +chr16 57683083 57684066 A +chr16 57683083 57684321 A +chr16 57683083 57684749 A +chr16 57683083 57684834 A +chr16 57683083 57684970 A +chr16 57683083 57685345 A +chr16 57683083 57685362 A +chr16 57683083 57685367 A +chr16 57683083 57685660 A +chr16 57683083 57685864 A +chr16 57683083 57686078 A +chr16 57683083 57686168 A +chr16 57683083 57686223 A +chr16 57683084 57683085 A +chr16 57683084 57683322 A +chr16 57683084 57683470 A +chr16 57683084 57683558 A +chr16 57683084 57683718 A +chr16 57683084 57683743 A +chr16 57683084 57684046 A +chr16 57683084 57684169 A +chr16 57683084 57684464 A +chr16 57683084 57684484 A +chr16 57683084 57684547 A +chr16 57683084 57684752 A +chr16 57683084 57684883 A +chr16 57683084 57685148 A +chr16 57683084 57685180 A +chr16 57683084 57685226 A +chr16 57683084 57686040 A +chr16 57683084 57686154 A +chr16 57683085 57684672 A +chr16 57683085 57684675 A +chr16 57683085 57684689 A +chr16 57683085 57684764 A +chr16 57683085 57684768 A +chr16 57683085 57685005 A +chr16 57683085 57685036 A +chr16 57683085 57685088 A +chr16 57683085 57685122 A +chr16 57683085 57685147 A +chr16 57683085 
57685557 A +chr16 57683085 57685561 A +chr16 57683085 57685565 A +chr16 57683085 57685747 A +chr16 57683085 57685811 A +chr16 57683085 57685827 A +chr16 57683085 57685850 A +chr16 57683085 57687506 A +chr16 57683086 57683858 A +chr16 57683086 57684079 A +chr16 57683086 57684346 A +chr16 57683086 57684402 A +chr16 57683086 57684529 A +chr16 57683086 57684549 A +chr16 57683086 57684594 A +chr16 57683086 57684656 A +chr16 57683086 57684878 A +chr16 57683086 57684920 A +chr16 57683086 57685192 A +chr16 57683086 57685295 A +chr16 57683086 57685389 A +chr16 57683086 57685439 A +chr16 57683086 57685496 A +chr16 57683086 57685553 A +chr16 57683086 57685579 A +chr16 57683086 57685661 A +chr16 57683086 57685745 A +chr16 57683086 57685771 A +chr16 57683086 57685778 A +chr16 57683086 57685817 A +chr16 57683086 57685922 A +chr16 57683086 57685977 A +chr16 57683086 57686290 A +chr16 57683086 57686412 A +chr16 57683087 57683017 A +chr16 57683087 57683215 A +chr16 57683087 57683615 A +chr16 57683087 57684101 A +chr16 57683087 57684141 A +chr16 57683087 57684265 A +chr16 57683087 57684337 A +chr16 57683087 57684348 A +chr16 57683087 57684418 A +chr16 57683087 57684496 A +chr16 57683087 57684677 A +chr16 57683087 57684848 A +chr16 57683087 57684911 A +chr16 57683087 57684947 A +chr16 57683087 57685133 A +chr16 57683087 57685205 A +chr16 57683087 57685447 A +chr16 57683087 57685476 A +chr16 57683087 57685717 A +chr16 57683087 57685750 A +chr16 57683087 57685820 A +chr16 57683087 57686042 A +chr16 57683087 57686100 A +chr16 57683087 57686168 A +chr16 57683087 57686203 A +chr16 57683087 57687215 A +chr16 57683088 57683198 A +chr16 57683088 57683341 A +chr16 57683088 57683683 A +chr16 57683088 57684169 A +chr16 57683088 57684294 A +chr16 57683088 57684564 A +chr16 57683088 57684763 A +chr16 57683088 57685010 A +chr16 57683088 57685102 A +chr16 57683088 57685314 A +chr16 57683088 57685343 A +chr16 57683088 57685651 A +chr16 57683088 57685682 A +chr16 57683088 57685768 A +chr16 57683088 
57685771 A +chr16 57683088 57685825 A +chr16 57683088 57686115 A +chr16 57683088 57686545 A +chr16 57683088 57686777 A +chr16 57683089 57683866 A +chr16 57683089 57684225 A +chr16 57683089 57684225 A +chr16 57683089 57684474 A +chr16 57683089 57684708 A +chr16 57683089 57684728 A +chr16 57683089 57684867 A +chr16 57683089 57684875 A +chr16 57683089 57684878 A +chr16 57683089 57684891 A +chr16 57683089 57684909 A +chr16 57683089 57685080 A +chr16 57683089 57685130 A +chr16 57683089 57685245 A +chr16 57683089 57685965 A +chr16 57683089 57686115 A +chr16 57683089 57686562 A +chr16 57683089 57687044 A +chr16 57683089 57687370 A +chr16 57683090 57683341 A +chr16 57683090 57683509 A +chr16 57683090 57684181 A +chr16 57683090 57684213 A +chr16 57683090 57684604 A +chr16 57683090 57684620 A +chr16 57683090 57684623 A +chr16 57683090 57684732 A +chr16 57683090 57684799 A +chr16 57683090 57684817 A +chr16 57683090 57684932 A +chr16 57683090 57685070 A +chr16 57683090 57685078 A +chr16 57683090 57685154 A +chr16 57683090 57685197 A +chr16 57683090 57685256 A +chr16 57683090 57685394 A +chr16 57683090 57685512 A +chr16 57683090 57685961 A +chr16 57683090 57685997 A +chr16 57683090 57686000 A +chr16 57683090 57686084 A +chr16 57683090 57686936 A +chr16 57683091 57682547 A +chr16 57683091 57682823 A +chr16 57683091 57683462 A +chr16 57683091 57683744 A +chr16 57683091 57683749 A +chr16 57683091 57684090 A +chr16 57683091 57684112 A +chr16 57683091 57684209 A +chr16 57683091 57684249 A +chr16 57683091 57684609 A +chr16 57683091 57684612 A +chr16 57683091 57684706 A +chr16 57683091 57685070 A +chr16 57683091 57685310 A +chr16 57683091 57685556 A +chr16 57683091 57685682 A +chr16 57683091 57685808 A +chr16 57683091 57685986 A +chr16 57683091 57686361 A +chr16 57683091 57686614 A +chr16 57683091 57686895 A +chr16 57683091 57686938 A +chr16 57683092 57682277 A +chr16 57683092 57683439 A +chr16 57683092 57684353 A +chr16 57683092 57684365 A +chr16 57683092 57684474 A +chr16 57683092 
57684633 A +chr16 57683092 57684665 A +chr16 57683092 57684830 A +chr16 57683092 57684890 A +chr16 57683092 57685039 A +chr16 57683092 57685100 A +chr16 57683092 57685138 A +chr16 57683092 57685338 A +chr16 57683092 57685482 A +chr16 57683092 57685581 A +chr16 57683092 57685619 A +chr16 57683092 57685843 A +chr16 57683092 57686190 A +chr16 57683092 57686265 A +chr16 57683092 57687177 A +chr16 57683092 57687642 A +chr16 57683093 57683691 A +chr16 57683093 57683900 A +chr16 57683093 57684143 A +chr16 57683093 57684216 A +chr16 57683093 57684376 A +chr16 57683093 57684640 A +chr16 57683093 57684832 A +chr16 57683093 57684937 A +chr16 57683093 57685013 A +chr16 57683093 57685123 A +chr16 57683093 57685234 A +chr16 57683093 57685240 A +chr16 57683093 57685254 A +chr16 57683093 57685369 A +chr16 57683093 57685507 A +chr16 57683093 57685538 A +chr16 57683093 57686000 A +chr16 57683093 57686888 A +chr16 57683094 57682513 A +chr16 57683094 57683502 A +chr16 57683094 57683955 A +chr16 57683094 57684184 A +chr16 57683094 57684298 A +chr16 57683094 57684362 A +chr16 57683094 57684565 A +chr16 57683094 57684609 A +chr16 57683094 57684652 A +chr16 57683094 57684662 A +chr16 57683094 57684982 A +chr16 57683094 57685103 A +chr16 57683094 57685128 A +chr16 57683094 57685129 A +chr16 57683094 57685277 A +chr16 57683094 57685322 A +chr16 57683094 57685440 A +chr16 57683094 57685685 A +chr16 57683094 57686079 A +chr16 57683094 57686254 A +chr16 57683094 57686323 A +chr16 57683094 57686395 A +chr16 57683094 57686507 A +chr16 57683094 57686964 A +chr16 57683095 57683307 A +chr16 57683095 57683827 A +chr16 57683095 57683962 A +chr16 57683095 57683965 A +chr16 57683095 57684023 A +chr16 57683095 57684466 A +chr16 57683095 57684538 A +chr16 57683095 57684708 A +chr16 57683095 57684809 A +chr16 57683095 57684851 A +chr16 57683095 57685027 A +chr16 57683095 57685040 A +chr16 57683095 57685153 A +chr16 57683095 57685248 A +chr16 57683095 57685390 A +chr16 57683095 57685399 A +chr16 57683095 
57685484 A +chr16 57683095 57685487 A +chr16 57683095 57685646 A +chr16 57683095 57685773 A +chr16 57683095 57685792 A +chr16 57683095 57685830 A +chr16 57683095 57687026 A +chr16 57683095 57687472 A +chr16 57683095 57687521 A +chr16 57683096 57683268 A +chr16 57683096 57683630 A +chr16 57683096 57683770 A +chr16 57683096 57684096 A +chr16 57683096 57684346 A +chr16 57683096 57684384 A +chr16 57683096 57685407 A +chr16 57683096 57685413 A +chr16 57683096 57685414 A +chr16 57683096 57685449 A +chr16 57683096 57685599 A +chr16 57683096 57685691 A +chr16 57683096 57685724 A +chr16 57683096 57685775 A +chr16 57683096 57685832 A +chr16 57683096 57685897 A +chr16 57683096 57685978 A +chr16 57683096 57686196 A +chr16 57683096 57686198 A +chr16 57683097 57682881 A +chr16 57683097 57683864 A +chr16 57683097 57684014 A +chr16 57683097 57684073 A +chr16 57683097 57684105 A +chr16 57683097 57684196 A +chr16 57683097 57684283 A +chr16 57683097 57684296 A +chr16 57683097 57684380 A +chr16 57683097 57684400 A +chr16 57683097 57684693 A +chr16 57683097 57684772 A +chr16 57683097 57684980 A +chr16 57683097 57685004 A +chr16 57683097 57685211 A +chr16 57683097 57685265 A +chr16 57683097 57685331 A +chr16 57683097 57685335 A +chr16 57683097 57685564 A +chr16 57683097 57685647 A +chr16 57683097 57685770 A +chr16 57683097 57685794 A +chr16 57683097 57685832 A +chr16 57683097 57686111 A +chr16 57683097 57686309 A +chr16 57683097 57686314 A +chr16 57683098 57683482 A +chr16 57683098 57684101 A +chr16 57683098 57684128 A +chr16 57683098 57684181 A +chr16 57683098 57684308 A +chr16 57683098 57684645 A +chr16 57683098 57684832 A +chr16 57683098 57684920 A +chr16 57683098 57684980 A +chr16 57683098 57685028 A +chr16 57683098 57685340 A +chr16 57683098 57685492 A +chr16 57683098 57685880 A +chr16 57683098 57686103 A +chr16 57683098 57686292 A +chr16 57683098 57686389 A +chr16 57683098 57686993 A +chr16 57683099 57683253 A +chr16 57683099 57683311 A +chr16 57683099 57683872 A +chr16 57683099 
57684347 A +chr16 57683099 57684699 A +chr16 57683099 57684919 A +chr16 57683099 57685166 A +chr16 57683099 57685311 A +chr16 57683099 57685320 A +chr16 57683099 57685662 A +chr16 57683099 57686330 A +chr16 57683099 57686378 A +chr16 57683099 57688357 A +chr16 57683100 57683375 A +chr16 57683100 57683950 A +chr16 57683100 57684159 A +chr16 57683100 57684379 A +chr16 57683100 57684420 A +chr16 57683100 57684468 A +chr16 57683100 57684587 A +chr16 57683100 57684694 A +chr16 57683100 57684697 A +chr16 57683100 57684912 A +chr16 57683100 57684991 A +chr16 57683100 57685119 A +chr16 57683100 57685238 A +chr16 57683100 57685272 A +chr16 57683100 57685491 A +chr16 57683100 57685557 A +chr16 57683100 57685770 A +chr16 57683100 57685779 A +chr16 57683100 57685861 A +chr16 57683100 57686360 A +chr16 57683100 57686417 A +chr16 57683101 57684277 A +chr16 57683101 57684559 A +chr16 57683101 57684598 A +chr16 57683101 57684815 A +chr16 57683101 57684880 A +chr16 57683101 57684894 A +chr16 57683101 57684917 A +chr16 57683101 57684927 A +chr16 57683101 57685297 A +chr16 57683101 57685661 A +chr16 57683101 57685805 A +chr16 57683101 57686002 A +chr16 57683101 57686089 A +chr16 57683101 57686172 A +chr16 57683101 57686220 A +chr16 57683101 57686473 A +chr16 57683101 57686511 A +chr16 57683102 57683319 A +chr16 57683102 57683992 A +chr16 57683102 57684074 A +chr16 57683102 57684216 A +chr16 57683102 57684261 A +chr16 57683102 57684290 A +chr16 57683102 57684415 A +chr16 57683102 57684478 A +chr16 57683102 57684916 A +chr16 57683102 57684927 A +chr16 57683102 57685039 A +chr16 57683102 57685133 A +chr16 57683102 57685159 A +chr16 57683102 57685161 A +chr16 57683102 57685221 A +chr16 57683102 57685403 A +chr16 57683102 57685557 A +chr16 57683102 57685699 A +chr16 57683102 57685853 A +chr16 57683102 57686304 A +chr16 57683102 57686344 A +chr16 57683102 57686678 A +chr16 57683102 57686783 A +chr16 57683103 57683092 A +chr16 57683103 57683483 A +chr16 57683103 57683509 A +chr16 57683103 
57683712 A +chr16 57683103 57683857 A +chr16 57683103 57683925 A +chr16 57683103 57684131 A +chr16 57683103 57684166 A +chr16 57683103 57684290 A +chr16 57683103 57684355 A +chr16 57683103 57684897 A +chr16 57683103 57684937 A +chr16 57683103 57684963 A +chr16 57683103 57685195 A +chr16 57683103 57685628 A +chr16 57683103 57685640 A +chr16 57683103 57685758 A +chr16 57683103 57685779 A +chr16 57683103 57686066 A +chr16 57683103 57686116 A +chr16 57683103 57686207 A +chr16 57683103 57686313 A +chr16 57683103 57686890 A +chr16 57683103 57687248 A +chr16 57683104 57682683 A +chr16 57683104 57682995 A +chr16 57683104 57683511 A +chr16 57683104 57684153 A +chr16 57683104 57684197 A +chr16 57683104 57684672 A +chr16 57683104 57684807 A +chr16 57683104 57684901 A +chr16 57683104 57685368 A +chr16 57683104 57685502 A +chr16 57683104 57685647 A +chr16 57683104 57685724 A +chr16 57683104 57685731 A +chr16 57683104 57685955 A +chr16 57683104 57686236 A +chr16 57683104 57686532 A +chr16 57683105 57684346 A +chr16 57683105 57684709 A +chr16 57683105 57685023 A +chr16 57683105 57685062 A +chr16 57683105 57685218 A +chr16 57683105 57685328 A +chr16 57683105 57685375 A +chr16 57683105 57685385 A +chr16 57683105 57685427 A +chr16 57683105 57685645 A +chr16 57683105 57685883 A +chr16 57683105 57686121 A +chr16 57683105 57686338 A +chr16 57683105 57686543 A +chr16 57683105 57687065 A +chr16 57683106 57682979 A +chr16 57683106 57683479 A +chr16 57683106 57684260 A +chr16 57683106 57684401 A +chr16 57683106 57684402 A +chr16 57683106 57684424 A +chr16 57683106 57684575 A +chr16 57683106 57684707 A +chr16 57683106 57684731 A +chr16 57683106 57684904 A +chr16 57683106 57684910 A +chr16 57683106 57684917 A +chr16 57683106 57685176 A +chr16 57683106 57685228 A +chr16 57683106 57685457 A +chr16 57683106 57685459 A +chr16 57683106 57685632 A +chr16 57683106 57685712 A +chr16 57683106 57685732 A +chr16 57683106 57685783 A +chr16 57683106 57685844 A +chr16 57683106 57685844 A +chr16 57683106 
57686065 A +chr16 57683106 57686257 A +chr16 57683106 57686271 A +chr16 57683106 57686637 A +chr16 57683107 57683832 A +chr16 57683107 57684224 A +chr16 57683107 57684290 A +chr16 57683107 57684418 A +chr16 57683107 57684726 A +chr16 57683107 57684907 A +chr16 57683107 57684986 A +chr16 57683107 57685085 A +chr16 57683107 57685178 A +chr16 57683107 57685184 A +chr16 57683107 57685365 A +chr16 57683107 57685532 A +chr16 57683107 57685655 A +chr16 57683107 57686172 A +chr16 57683107 57686349 A +chr16 57683107 57686467 A +chr16 57683107 57686981 A +chr16 57683107 57686982 A +chr16 57683107 57687444 A +chr16 57683108 57682353 A +chr16 57683108 57684004 A +chr16 57683108 57684231 A +chr16 57683108 57684440 A +chr16 57683108 57684495 A +chr16 57683108 57684736 A +chr16 57683108 57684840 A +chr16 57683108 57685012 A +chr16 57683108 57685070 A +chr16 57683108 57685117 A +chr16 57683108 57685246 A +chr16 57683108 57685461 A +chr16 57683108 57685521 A +chr16 57683108 57685574 A +chr16 57683108 57685835 A +chr16 57683108 57686128 A +chr16 57683108 57686190 A +chr16 57683108 57686209 A +chr16 57683108 57686432 A +chr16 57683108 57686588 A +chr16 57683108 57686912 A +chr16 57683108 57687050 A +chr16 57683109 57683954 A +chr16 57683109 57684225 A +chr16 57683109 57684339 A +chr16 57683109 57684543 A +chr16 57683109 57684772 A +chr16 57683109 57684804 A +chr16 57683109 57684810 A +chr16 57683109 57684853 A +chr16 57683109 57685114 A +chr16 57683109 57685120 A +chr16 57683109 57685130 A +chr16 57683109 57685326 A +chr16 57683109 57685381 A +chr16 57683109 57685486 A +chr16 57683109 57685535 A +chr16 57683109 57686027 A +chr16 57683109 57686189 A +chr16 57683109 57686308 A +chr16 57683109 57686471 A +chr16 57683109 57686666 A +chr16 57683109 57686793 A +chr16 57683109 57686883 A +chr16 57683110 57683534 A +chr16 57683110 57683726 A +chr16 57683110 57683791 A +chr16 57683110 57684006 A +chr16 57683110 57684946 A +chr16 57683110 57685269 A +chr16 57683110 57685340 A +chr16 57683110 
57685374 A +chr16 57683110 57685965 A +chr16 57683110 57686114 A +chr16 57683111 57684091 A +chr16 57683111 57684369 A +chr16 57683111 57684371 A +chr16 57683111 57684446 A +chr16 57683111 57684719 A +chr16 57683111 57684902 A +chr16 57683111 57685143 A +chr16 57683111 57685258 A +chr16 57683111 57685313 A +chr16 57683111 57685326 A +chr16 57683111 57685607 A +chr16 57683111 57685696 A +chr16 57683111 57685842 A +chr16 57683111 57685939 A +chr16 57683111 57685950 A +chr16 57683111 57686061 A +chr16 57683111 57686202 A +chr16 57683111 57686440 A +chr16 57683112 57683227 A +chr16 57683112 57683804 A +chr16 57683112 57683983 A +chr16 57683112 57684005 A +chr16 57683112 57684356 A +chr16 57683112 57684835 A +chr16 57683112 57684852 A +chr16 57683112 57684927 A +chr16 57683112 57685314 A +chr16 57683112 57685337 A +chr16 57683112 57685414 A +chr16 57683112 57685456 A +chr16 57683112 57685952 A +chr16 57683112 57686364 A +chr16 57683112 57687296 A +chr16 57683112 57687415 A +chr16 57683113 57683387 A +chr16 57683113 57683672 A +chr16 57683113 57684349 A +chr16 57683113 57684580 A +chr16 57683113 57684789 A +chr16 57683113 57684813 A +chr16 57683113 57684985 A +chr16 57683113 57685081 A +chr16 57683113 57685631 A +chr16 57683113 57685657 A +chr16 57683113 57685706 A +chr16 57683113 57685954 A +chr16 57683113 57685971 A +chr16 57683113 57686215 A +chr16 57683113 57686230 A +chr16 57683113 57686292 A +chr16 57683113 57686946 A +chr16 57683113 57687086 A +chr16 57683114 57684284 A +chr16 57683114 57684331 A +chr16 57683114 57684343 A +chr16 57683114 57684379 A +chr16 57683114 57684549 A +chr16 57683114 57684553 A +chr16 57683114 57684741 A +chr16 57683114 57684751 A +chr16 57683114 57684839 A +chr16 57683114 57684961 A +chr16 57683114 57685100 A +chr16 57683114 57685260 A +chr16 57683114 57685361 A +chr16 57683114 57685438 A +chr16 57683114 57685523 A +chr16 57683114 57685843 A +chr16 57683114 57686189 A +chr16 57683115 57683866 A +chr16 57683115 57684324 A +chr16 57683115 
57684392 A +chr16 57683115 57684530 A +chr16 57683115 57684550 A +chr16 57683115 57684710 A +chr16 57683115 57684759 A +chr16 57683115 57684779 A +chr16 57683115 57684853 A +chr16 57683115 57685232 A +chr16 57683115 57685442 A +chr16 57683115 57685685 A +chr16 57683115 57685732 A +chr16 57683115 57685770 A +chr16 57683115 57685852 A +chr16 57683115 57685949 A +chr16 57683115 57686376 A +chr16 57683115 57686432 A +chr16 57683115 57686462 A +chr16 57683115 57686537 A +chr16 57683115 57687130 A +chr16 57683116 57683093 A +chr16 57683116 57683641 A +chr16 57683116 57683707 A +chr16 57683116 57683840 A +chr16 57683116 57684012 A +chr16 57683116 57684176 A +chr16 57683116 57684247 A +chr16 57683116 57684371 A +chr16 57683116 57684716 A +chr16 57683116 57684894 A +chr16 57683116 57684902 A +chr16 57683116 57684978 A +chr16 57683116 57685008 A +chr16 57683116 57685120 A +chr16 57683116 57685142 A +chr16 57683116 57685267 A +chr16 57683116 57685804 A +chr16 57683116 57685820 A +chr16 57683116 57685886 A +chr16 57683116 57685912 A +chr16 57683116 57686124 A +chr16 57683116 57686163 A +chr16 57683116 57686190 A +chr16 57683116 57686472 A +chr16 57683117 57683389 A +chr16 57683117 57683624 A +chr16 57683117 57684101 A +chr16 57683117 57684149 A +chr16 57683117 57684183 A +chr16 57683117 57684240 A +chr16 57683117 57684327 A +chr16 57683117 57684386 A +chr16 57683117 57684593 A +chr16 57683117 57684682 A +chr16 57683117 57684810 A +chr16 57683117 57685051 A +chr16 57683117 57685548 A +chr16 57683117 57685846 A +chr16 57683117 57685957 A +chr16 57683117 57686228 A +chr16 57683117 57686272 A +chr16 57683117 57686624 A +chr16 57683118 57683640 A +chr16 57683118 57684474 A +chr16 57683118 57685083 A +chr16 57683118 57685308 A +chr16 57683118 57685639 A +chr16 57683118 57685957 A +chr16 57683118 57685968 A +chr16 57683118 57685999 A +chr16 57683118 57686263 A +chr16 57683118 57686295 A +chr16 57683118 57686983 A +chr16 57683119 57682823 A +chr16 57683119 57683986 A +chr16 57683119 
57684235 A +chr16 57683119 57684299 A +chr16 57683119 57684364 A +chr16 57683119 57684703 A +chr16 57683119 57684877 A +chr16 57683119 57684884 A +chr16 57683119 57684925 A +chr16 57683119 57685098 A +chr16 57683119 57685343 A +chr16 57683119 57685503 A +chr16 57683119 57685828 A +chr16 57683119 57685881 A +chr16 57683119 57686099 A +chr16 57683119 57686793 A +chr16 57683120 57683331 A +chr16 57683120 57683718 A +chr16 57683120 57684398 A +chr16 57683120 57684408 A +chr16 57683120 57684529 A +chr16 57683120 57684631 A +chr16 57683120 57684696 A +chr16 57683120 57684750 A +chr16 57683120 57684857 A +chr16 57683120 57685215 A +chr16 57683120 57685305 A +chr16 57683120 57685453 A +chr16 57683120 57685570 A +chr16 57683120 57685783 A +chr16 57683120 57686074 A +chr16 57683120 57686134 A +chr16 57683120 57686947 A +chr16 57683120 57687351 A +chr16 57683121 57683092 A +chr16 57683121 57683476 A +chr16 57683121 57683940 A +chr16 57683121 57683958 A +chr16 57683121 57684177 A +chr16 57683121 57684389 A +chr16 57683121 57684992 A +chr16 57683121 57685002 A +chr16 57683121 57685022 A +chr16 57683121 57685178 A +chr16 57683121 57685297 A +chr16 57683121 57685399 A +chr16 57683121 57685439 A +chr16 57683121 57685455 A +chr16 57683121 57685743 A +chr16 57683121 57685757 A +chr16 57683121 57686028 A +chr16 57683121 57686097 A +chr16 57683121 57686411 A +chr16 57683122 57683982 A +chr16 57683122 57684239 A +chr16 57683122 57684480 A +chr16 57683122 57684871 A +chr16 57683122 57685025 A +chr16 57683122 57685028 A +chr16 57683122 57685169 A +chr16 57683122 57685295 A +chr16 57683122 57685363 A +chr16 57683122 57685678 A +chr16 57683122 57686156 A +chr16 57683122 57686818 A +chr16 57683123 57682826 A +chr16 57683123 57683736 A +chr16 57683123 57683757 A +chr16 57683123 57684348 A +chr16 57683123 57684577 A +chr16 57683123 57684720 A +chr16 57683123 57684772 A +chr16 57683123 57684937 A +chr16 57683123 57685051 A +chr16 57683123 57685370 A +chr16 57683123 57685518 A +chr16 57683123 
57685586 A +chr16 57683123 57685611 A +chr16 57683123 57685932 A +chr16 57683123 57685949 A +chr16 57683123 57686202 A +chr16 57683123 57686238 A +chr16 57683123 57686259 A +chr16 57683123 57686505 A +chr16 57683123 57687166 A +chr16 57683124 57682465 A +chr16 57683124 57684169 A +chr16 57683124 57684199 A +chr16 57683124 57684678 A +chr16 57683124 57684813 A +chr16 57683124 57684943 A +chr16 57683124 57684951 A +chr16 57683124 57685123 A +chr16 57683124 57685208 A +chr16 57683124 57685706 A +chr16 57683124 57685759 A +chr16 57683124 57685869 A +chr16 57683124 57685956 A +chr16 57683124 57686808 A +chr16 57683124 57686934 A +chr16 57683124 57686972 A +chr16 57683124 57687589 A +chr16 57683125 57683788 A +chr16 57683125 57683903 A +chr16 57683125 57684071 A +chr16 57683125 57684168 A +chr16 57683125 57684415 A +chr16 57683125 57684518 A +chr16 57683125 57684919 A +chr16 57683125 57684923 A +chr16 57683125 57684943 A +chr16 57683125 57684983 A +chr16 57683125 57685153 A +chr16 57683125 57685236 A +chr16 57683125 57685260 A +chr16 57683125 57685546 A +chr16 57683125 57685735 A +chr16 57683125 57685747 A +chr16 57683125 57686933 A +chr16 57683126 57683549 A +chr16 57683126 57683645 A +chr16 57683126 57684203 A +chr16 57683126 57684216 A +chr16 57683126 57684378 A +chr16 57683126 57684465 A +chr16 57683126 57684570 A +chr16 57683126 57684821 A +chr16 57683126 57685096 A +chr16 57683126 57685210 A +chr16 57683126 57685374 A +chr16 57683126 57685509 A +chr16 57683126 57685761 A +chr16 57683126 57685769 A +chr16 57683126 57685944 A +chr16 57683126 57686029 A +chr16 57683126 57686171 A +chr16 57683127 57684202 A +chr16 57683127 57684310 A +chr16 57683127 57684544 A +chr16 57683127 57684628 A +chr16 57683127 57684791 A +chr16 57683127 57684825 A +chr16 57683127 57685069 A +chr16 57683127 57685593 A +chr16 57683127 57685673 A +chr16 57683127 57685784 A +chr16 57683127 57685926 A +chr16 57683127 57686063 A +chr16 57683127 57686105 A +chr16 57683127 57686275 A +chr16 57683128 
57683783 A +chr16 57683128 57683883 A +chr16 57683128 57684203 A +chr16 57683128 57684557 A +chr16 57683128 57684811 A +chr16 57683128 57684826 A +chr16 57683128 57684968 A +chr16 57683128 57685067 A +chr16 57683128 57685100 A +chr16 57683128 57685320 A +chr16 57683128 57685352 A +chr16 57683128 57685484 A +chr16 57683128 57685697 A +chr16 57683128 57685939 A +chr16 57683128 57686013 A +chr16 57683128 57686603 A +chr16 57683128 57686802 A +chr16 57683129 57683257 A +chr16 57683129 57683292 A +chr16 57683129 57684101 A +chr16 57683129 57684298 A +chr16 57683129 57684514 A +chr16 57683129 57684517 A +chr16 57683129 57684650 A +chr16 57683129 57684905 A +chr16 57683129 57685318 A +chr16 57683129 57685499 A +chr16 57683129 57685599 A +chr16 57683129 57685756 A +chr16 57683129 57685822 A +chr16 57683129 57685953 A +chr16 57683129 57685977 A +chr16 57683129 57686070 A +chr16 57683129 57686238 A +chr16 57683129 57686316 A +chr16 57683129 57686520 A +chr16 57683129 57686692 A +chr16 57683129 57686781 A +chr16 57683130 57683571 A +chr16 57683130 57684050 A +chr16 57683130 57684153 A +chr16 57683130 57684512 A +chr16 57683130 57684551 A +chr16 57683130 57684796 A +chr16 57683130 57685020 A +chr16 57683130 57685056 A +chr16 57683130 57685446 A +chr16 57683130 57685529 A +chr16 57683130 57685657 A +chr16 57683130 57685897 A +chr16 57683130 57686068 A +chr16 57683130 57686109 A +chr16 57683130 57686234 A +chr16 57683130 57686369 A +chr16 57683130 57686621 A +chr16 57683130 57686793 A +chr16 57683130 57687052 A +chr16 57683131 57683572 A +chr16 57683131 57683779 A +chr16 57683131 57683927 A +chr16 57683131 57684465 A +chr16 57683131 57684502 A +chr16 57683131 57684634 A +chr16 57683131 57685120 A +chr16 57683131 57685219 A +chr16 57683131 57685242 A +chr16 57683131 57685333 A +chr16 57683131 57685491 A +chr16 57683131 57685524 A +chr16 57683131 57685840 A +chr16 57683131 57686163 A +chr16 57683131 57686374 A +chr16 57683131 57686425 A +chr16 57683131 57686599 A +chr16 57683131 
57686696 A +chr16 57683132 57683255 A +chr16 57683132 57683517 A +chr16 57683132 57684015 A +chr16 57683132 57684646 A +chr16 57683132 57684680 A +chr16 57683132 57684782 A +chr16 57683132 57684794 A +chr16 57683132 57684815 A +chr16 57683132 57684966 A +chr16 57683132 57685204 A +chr16 57683132 57685209 A +chr16 57683132 57685588 A +chr16 57683132 57685657 A +chr16 57683132 57685961 A +chr16 57683132 57686446 A +chr16 57683132 57686587 A +chr16 57683132 57687916 A +chr16 57683133 57683690 A +chr16 57683133 57684094 A +chr16 57683133 57684398 A +chr16 57683133 57684488 A +chr16 57683133 57684825 A +chr16 57683133 57685179 A +chr16 57683133 57685475 A +chr16 57683133 57685510 A +chr16 57683133 57685607 A +chr16 57683133 57685696 A +chr16 57683133 57685727 A +chr16 57683133 57685743 A +chr16 57683133 57685927 A +chr16 57683133 57685932 A +chr16 57683133 57688477 A +chr16 57683134 57683565 A +chr16 57683134 57683738 A +chr16 57683134 57683769 A +chr16 57683134 57683799 A +chr16 57683134 57684274 A +chr16 57683134 57684870 A +chr16 57683134 57684880 A +chr16 57683134 57685098 A +chr16 57683134 57685169 A +chr16 57683134 57685354 A +chr16 57683134 57685486 A +chr16 57683134 57685492 A +chr16 57683134 57685634 A +chr16 57683134 57685969 A +chr16 57683134 57686046 A +chr16 57683134 57686325 A +chr16 57683135 57684015 A +chr16 57683135 57684118 A +chr16 57683135 57684414 A +chr16 57683135 57684876 A +chr16 57683135 57685049 A +chr16 57683135 57685230 A +chr16 57683135 57685266 A +chr16 57683135 57685324 A +chr16 57683135 57685707 A +chr16 57683135 57685936 A +chr16 57683135 57686108 A +chr16 57683135 57686234 A +chr16 57683135 57686354 A +chr16 57683135 57686585 A +chr16 57683135 57686855 A +chr16 57683135 57687391 A +chr16 57683136 57683691 A +chr16 57683136 57684044 A +chr16 57683136 57684156 A +chr16 57683136 57684156 A +chr16 57683136 57684169 A +chr16 57683136 57684238 A +chr16 57683136 57684379 A +chr16 57683136 57684401 A +chr16 57683136 57684601 A +chr16 57683136 
57684684 A +chr16 57683136 57684755 A +chr16 57683136 57685214 A +chr16 57683136 57685222 A +chr16 57683136 57685260 A +chr16 57683136 57685310 A +chr16 57683136 57685659 A +chr16 57683136 57685747 A +chr16 57683136 57685768 A +chr16 57683136 57685829 A +chr16 57683136 57685990 A +chr16 57683136 57686112 A +chr16 57683136 57686121 A +chr16 57683136 57686296 A +chr16 57683136 57686578 A +chr16 57683137 57682101 A +chr16 57683137 57683015 A +chr16 57683137 57683904 A +chr16 57683137 57683990 A +chr16 57683137 57684280 A +chr16 57683137 57684700 A +chr16 57683137 57685025 A +chr16 57683137 57685081 A +chr16 57683137 57685296 A +chr16 57683137 57685549 A +chr16 57683137 57685709 A +chr16 57683137 57685858 A +chr16 57683137 57685887 A +chr16 57683137 57686105 A +chr16 57683137 57686211 A +chr16 57683137 57686478 A +chr16 57683138 57682400 A +chr16 57683138 57684052 A +chr16 57683138 57684234 A +chr16 57683138 57684308 A +chr16 57683138 57684514 A +chr16 57683138 57684548 A +chr16 57683138 57684570 A +chr16 57683138 57684574 A +chr16 57683138 57684622 A +chr16 57683138 57684772 A +chr16 57683138 57685102 A +chr16 57683138 57685372 A +chr16 57683138 57685408 A +chr16 57683138 57685455 A +chr16 57683138 57685658 A +chr16 57683138 57685755 A +chr16 57683138 57685822 A +chr16 57683138 57685899 A +chr16 57683138 57686212 A +chr16 57683138 57686722 A +chr16 57683138 57686741 A +chr16 57683138 57687569 A +chr16 57683139 57683750 A +chr16 57683139 57683968 A +chr16 57683139 57684181 A +chr16 57683139 57684286 A +chr16 57683139 57684367 A +chr16 57683139 57684650 A +chr16 57683139 57685201 A +chr16 57683139 57685279 A +chr16 57683139 57685285 A +chr16 57683139 57685418 A +chr16 57683139 57685689 A +chr16 57683139 57686373 A +chr16 57683139 57686548 A +chr16 57683139 57686720 A +chr16 57683140 57683486 A +chr16 57683140 57683820 A +chr16 57683140 57684003 A +chr16 57683140 57684116 A +chr16 57683140 57684230 A +chr16 57683140 57684294 A +chr16 57683140 57684338 A +chr16 57683140 
57684364 A +chr16 57683140 57684544 A +chr16 57683140 57684674 A +chr16 57683140 57684833 A +chr16 57683140 57684851 A +chr16 57683140 57684881 A +chr16 57683140 57684920 A +chr16 57683140 57685035 A +chr16 57683140 57685149 A +chr16 57683140 57685463 A +chr16 57683140 57685620 A +chr16 57683140 57686015 A +chr16 57683140 57686230 A +chr16 57683140 57686343 A +chr16 57683140 57686403 A +chr16 57683140 57686525 A +chr16 57683140 57686606 A +chr16 57683140 57686944 A +chr16 57683140 57687048 A +chr16 57683141 57684197 A +chr16 57683141 57684314 A +chr16 57683141 57684381 A +chr16 57683141 57684614 A +chr16 57683141 57684781 A +chr16 57683141 57685078 A +chr16 57683141 57685396 A +chr16 57683141 57685464 A +chr16 57683141 57685519 A +chr16 57683141 57685623 A +chr16 57683141 57685918 A +chr16 57683141 57686007 A +chr16 57683141 57686044 A +chr16 57683141 57686474 A +chr16 57683141 57686530 A +chr16 57683141 57687401 A +chr16 57683142 57683390 A +chr16 57683142 57683915 A +chr16 57683142 57684002 A +chr16 57683142 57684067 A +chr16 57683142 57684287 A +chr16 57683142 57684303 A +chr16 57683142 57684445 A +chr16 57683142 57684539 A +chr16 57683142 57684812 A +chr16 57683142 57684890 A +chr16 57683142 57684923 A +chr16 57683142 57685038 A +chr16 57683142 57685080 A +chr16 57683142 57685127 A +chr16 57683142 57685145 A +chr16 57683142 57685305 A +chr16 57683142 57685450 A +chr16 57683142 57685459 A +chr16 57683142 57686035 A +chr16 57683142 57686210 A +chr16 57683142 57686585 A +chr16 57683143 57684213 A +chr16 57683143 57684219 A +chr16 57683143 57684321 A +chr16 57683143 57684651 A +chr16 57683143 57684829 A +chr16 57683143 57684954 A +chr16 57683143 57684958 A +chr16 57683143 57685125 A +chr16 57683143 57685199 A +chr16 57683143 57685272 A +chr16 57683143 57685396 A +chr16 57683143 57685573 A +chr16 57683143 57685642 A +chr16 57683143 57686286 A +chr16 57683143 57686403 A +chr16 57683143 57686932 A +chr16 57683144 57682813 A +chr16 57683144 57682903 A +chr16 57683144 
57683887 A +chr16 57683144 57683897 A +chr16 57683144 57684017 A +chr16 57683144 57684077 A +chr16 57683144 57684102 A +chr16 57683144 57684474 A +chr16 57683144 57684498 A +chr16 57683144 57684696 A +chr16 57683144 57685122 A +chr16 57683144 57685459 A +chr16 57683144 57685507 A +chr16 57683144 57685642 A +chr16 57683144 57685782 A +chr16 57683144 57686043 A +chr16 57683144 57686401 A +chr16 57683144 57687171 A +chr16 57683145 57682338 A +chr16 57683145 57682767 A +chr16 57683145 57682975 A +chr16 57683145 57683872 A +chr16 57683145 57683980 A +chr16 57683145 57684667 A +chr16 57683145 57685008 A +chr16 57683145 57685242 A +chr16 57683145 57685329 A +chr16 57683145 57685443 A +chr16 57683145 57685805 A +chr16 57683145 57685934 A +chr16 57683145 57686026 A +chr16 57683145 57686129 A +chr16 57683145 57686497 A +chr16 57683145 57687632 A +chr16 57683146 57684675 A +chr16 57683146 57685052 A +chr16 57683146 57685202 A +chr16 57683146 57685351 A +chr16 57683146 57685493 A +chr16 57683146 57685697 A +chr16 57683146 57685742 A +chr16 57683146 57685871 A +chr16 57683146 57685963 A +chr16 57683146 57686650 A +chr16 57683146 57686885 A +chr16 57683146 57687063 A +chr16 57683147 57683798 A +chr16 57683147 57683807 A +chr16 57683147 57683965 A +chr16 57683147 57684138 A +chr16 57683147 57684172 A +chr16 57683147 57684229 A +chr16 57683147 57684539 A +chr16 57683147 57684572 A +chr16 57683147 57684739 A +chr16 57683147 57685084 A +chr16 57683147 57685100 A +chr16 57683147 57685130 A +chr16 57683147 57685333 A +chr16 57683147 57685376 A +chr16 57683147 57685378 A +chr16 57683147 57685450 A +chr16 57683147 57685549 A +chr16 57683147 57686209 A +chr16 57683147 57688095 A +chr16 57683148 57684670 A +chr16 57683148 57684736 A +chr16 57683148 57684745 A +chr16 57683148 57684943 A +chr16 57683148 57684963 A +chr16 57683148 57685357 A +chr16 57683148 57685584 A +chr16 57683148 57685680 A +chr16 57683148 57685783 A +chr16 57683148 57685860 A +chr16 57683148 57685875 A +chr16 57683148 
57685944 A +chr16 57683148 57685981 A +chr16 57683148 57686085 A +chr16 57683148 57686102 A +chr16 57683148 57686826 A +chr16 57683149 57683286 A +chr16 57683149 57684415 A +chr16 57683149 57684500 A +chr16 57683149 57684501 A +chr16 57683149 57684567 A +chr16 57683149 57684745 A +chr16 57683149 57684775 A +chr16 57683149 57684829 A +chr16 57683149 57685028 A +chr16 57683149 57685062 A +chr16 57683149 57685099 A +chr16 57683149 57685190 A +chr16 57683149 57685430 A +chr16 57683149 57685658 A +chr16 57683149 57685948 A +chr16 57683149 57686562 A +chr16 57683149 57686603 A +chr16 57683149 57686610 A +chr16 57683150 57682462 A +chr16 57683150 57683930 A +chr16 57683150 57684743 A +chr16 57683150 57684794 A +chr16 57683150 57685011 A +chr16 57683150 57685446 A +chr16 57683150 57685513 A +chr16 57683150 57685584 A +chr16 57683150 57685617 A +chr16 57683150 57685767 A +chr16 57683150 57685827 A +chr16 57683150 57685981 A +chr16 57683150 57686288 A +chr16 57683150 57686505 A +chr16 57683151 57683377 A +chr16 57683151 57683392 A +chr16 57683151 57683562 A +chr16 57683151 57683936 A +chr16 57683151 57684758 A +chr16 57683151 57684765 A +chr16 57683151 57685089 A +chr16 57683151 57685181 A +chr16 57683151 57685203 A +chr16 57683151 57685519 A +chr16 57683151 57685617 A +chr16 57683151 57685624 A +chr16 57683151 57685740 A +chr16 57683151 57685855 A +chr16 57683151 57686016 A +chr16 57683151 57686370 A +chr16 57683151 57686495 A +chr16 57683151 57686926 A +chr16 57683151 57686948 A +chr16 57683152 57683256 A +chr16 57683152 57683404 A +chr16 57683152 57683589 A +chr16 57683152 57684127 A +chr16 57683152 57684356 A +chr16 57683152 57684442 A +chr16 57683152 57684498 A +chr16 57683152 57684539 A +chr16 57683152 57684607 A +chr16 57683152 57684878 A +chr16 57683152 57684907 A +chr16 57683152 57685038 A +chr16 57683152 57685171 A +chr16 57683152 57685485 A +chr16 57683152 57685548 A +chr16 57683152 57685601 A +chr16 57683152 57685841 A +chr16 57683152 57686216 A +chr16 57683152 
57686226 A +chr16 57683152 57686260 A +chr16 57683152 57686365 A +chr16 57683152 57686620 A +chr16 57683152 57686716 A +chr16 57683152 57687309 A +chr16 57683153 57683596 A +chr16 57683153 57683611 A +chr16 57683153 57683697 A +chr16 57683153 57683731 A +chr16 57683153 57684074 A +chr16 57683153 57684109 A +chr16 57683153 57684673 A +chr16 57683153 57684719 A +chr16 57683153 57684767 A +chr16 57683153 57684807 A +chr16 57683153 57684999 A +chr16 57683153 57685291 A +chr16 57683153 57685293 A +chr16 57683153 57685973 A +chr16 57683153 57685990 A +chr16 57683153 57686210 A +chr16 57683153 57686241 A +chr16 57683153 57686369 A +chr16 57683153 57686970 A +chr16 57683154 57683903 A +chr16 57683154 57684040 A +chr16 57683154 57684557 A +chr16 57683154 57684762 A +chr16 57683154 57684888 A +chr16 57683154 57684940 A +chr16 57683154 57684956 A +chr16 57683154 57684969 A +chr16 57683154 57685013 A +chr16 57683154 57685073 A +chr16 57683154 57685096 A +chr16 57683154 57685278 A +chr16 57683154 57685413 A +chr16 57683154 57685622 A +chr16 57683154 57685664 A +chr16 57683154 57685701 A +chr16 57683154 57685868 A +chr16 57683154 57685978 A +chr16 57683154 57686072 A +chr16 57683154 57686193 A +chr16 57683154 57686706 A +chr16 57683155 57682767 A +chr16 57683155 57683736 A +chr16 57683155 57684579 A +chr16 57683155 57684690 A +chr16 57683155 57684801 A +chr16 57683155 57684924 A +chr16 57683155 57684979 A +chr16 57683155 57685042 A +chr16 57683155 57685223 A +chr16 57683155 57685340 A +chr16 57683155 57685502 A +chr16 57683155 57685565 A +chr16 57683155 57685575 A +chr16 57683155 57685592 A +chr16 57683155 57685698 A +chr16 57683155 57685763 A +chr16 57683155 57685803 A +chr16 57683155 57686082 A +chr16 57683155 57686872 A +chr16 57683155 57687825 A +chr16 57683156 57682958 A +chr16 57683156 57683610 A +chr16 57683156 57683775 A +chr16 57683156 57684514 A +chr16 57683156 57684528 A +chr16 57683156 57684576 A +chr16 57683156 57684638 A +chr16 57683156 57684861 A +chr16 57683156 
57685055 A +chr16 57683156 57685102 A +chr16 57683156 57685234 A +chr16 57683156 57685529 A +chr16 57683156 57685679 A +chr16 57683156 57685733 A +chr16 57683156 57685896 A +chr16 57683156 57685947 A +chr16 57683156 57686167 A +chr16 57683156 57686275 A +chr16 57683156 57687227 A +chr16 57683156 57689211 A +chr16 57683157 57682447 A +chr16 57683157 57683229 A +chr16 57683157 57684075 A +chr16 57683157 57684282 A +chr16 57683157 57684353 A +chr16 57683157 57684613 A +chr16 57683157 57684619 A +chr16 57683157 57684643 A +chr16 57683157 57684721 A +chr16 57683157 57684812 A +chr16 57683157 57684851 A +chr16 57683157 57684854 A +chr16 57683157 57684877 A +chr16 57683157 57685036 A +chr16 57683157 57685114 A +chr16 57683157 57685161 A +chr16 57683157 57685260 A +chr16 57683157 57685624 A +chr16 57683157 57686032 A +chr16 57683157 57686441 A +chr16 57683157 57686674 A +chr16 57683157 57686689 A +chr16 57683157 57686739 A +chr16 57683157 57686962 A +chr16 57683157 57686994 A +chr16 57683157 57687405 A +chr16 57683158 57683421 A +chr16 57683158 57683541 A +chr16 57683158 57684206 A +chr16 57683158 57684501 A +chr16 57683158 57684637 A +chr16 57683158 57684785 A +chr16 57683158 57684816 A +chr16 57683158 57684894 A +chr16 57683158 57685036 A +chr16 57683158 57685116 A +chr16 57683158 57685192 A +chr16 57683158 57685225 A +chr16 57683158 57685330 A +chr16 57683158 57685540 A +chr16 57683158 57685736 A +chr16 57683158 57685915 A +chr16 57683158 57686055 A +chr16 57683158 57686185 A +chr16 57683158 57686545 A +chr16 57683159 57683437 A +chr16 57683159 57684103 A +chr16 57683159 57684247 A +chr16 57683159 57684721 A +chr16 57683159 57685023 A +chr16 57683159 57685271 A +chr16 57683159 57685306 A +chr16 57683159 57685565 A +chr16 57683159 57685593 A +chr16 57683159 57685594 A +chr16 57683159 57685644 A +chr16 57683159 57685874 A +chr16 57683159 57686187 A +chr16 57683159 57686255 A +chr16 57683159 57686276 A +chr16 57683159 57686599 A +chr16 57683160 57683646 A +chr16 57683160 
57683767 A +chr16 57683160 57684172 A +chr16 57683160 57684381 A +chr16 57683160 57684553 A +chr16 57683160 57684663 A +chr16 57683160 57684671 A +chr16 57683160 57684734 A +chr16 57683160 57684877 A +chr16 57683160 57684909 A +chr16 57683160 57684950 A +chr16 57683160 57685116 A +chr16 57683160 57685151 A +chr16 57683160 57685471 A +chr16 57683160 57685494 A +chr16 57683160 57685631 A +chr16 57683160 57685795 A +chr16 57683160 57686013 A +chr16 57683160 57686149 A +chr16 57683160 57686409 A +chr16 57683160 57686530 A +chr16 57683160 57686801 A +chr16 57683160 57687091 A +chr16 57683161 57682255 A +chr16 57683161 57682853 A +chr16 57683161 57683179 A +chr16 57683161 57683355 A +chr16 57683161 57683878 A +chr16 57683161 57683986 A +chr16 57683161 57684011 A +chr16 57683161 57684333 A +chr16 57683161 57684456 A +chr16 57683161 57684843 A +chr16 57683161 57684967 A +chr16 57683161 57685054 A +chr16 57683161 57685246 A +chr16 57683161 57685388 A +chr16 57683161 57685439 A +chr16 57683161 57686036 A +chr16 57683161 57686139 A +chr16 57683161 57687560 A +chr16 57683162 57684048 A +chr16 57683162 57684657 A +chr16 57683162 57684736 A +chr16 57683162 57684774 A +chr16 57683162 57684959 A +chr16 57683162 57684979 A +chr16 57683162 57685162 A +chr16 57683162 57685389 A +chr16 57683162 57685477 A +chr16 57683162 57685680 A +chr16 57683162 57685744 A +chr16 57683162 57685930 A +chr16 57683162 57686614 A +chr16 57683163 57683332 A +chr16 57683163 57683367 A +chr16 57683163 57683884 A +chr16 57683163 57684325 A +chr16 57683163 57684341 A +chr16 57683163 57684614 A +chr16 57683163 57684732 A +chr16 57683163 57685263 A +chr16 57683163 57685263 A +chr16 57683163 57685290 A +chr16 57683163 57685340 A +chr16 57683163 57685565 A +chr16 57683163 57685611 A +chr16 57683163 57685663 A +chr16 57683163 57685779 A +chr16 57683163 57685797 A +chr16 57683163 57686209 A +chr16 57683163 57686301 A +chr16 57683163 57686436 A +chr16 57683163 57686669 A +chr16 57683163 57686725 A +chr16 57683164 
57682916 A +chr16 57683164 57684319 A +chr16 57683164 57684431 A +chr16 57683164 57684517 A +chr16 57683164 57684851 A +chr16 57683164 57684855 A +chr16 57683164 57685106 A +chr16 57683164 57685196 A +chr16 57683164 57685294 A +chr16 57683164 57685408 A +chr16 57683164 57685649 A +chr16 57683164 57685876 A +chr16 57683164 57685924 A +chr16 57683164 57686039 A +chr16 57683164 57686101 A +chr16 57683164 57686408 A +chr16 57683164 57686913 A +chr16 57683164 57686915 A +chr16 57683164 57687021 A +chr16 57683165 57683147 A +chr16 57683165 57683882 A +chr16 57683165 57684342 A +chr16 57683165 57684433 A +chr16 57683165 57684570 A +chr16 57683165 57684795 A +chr16 57683165 57684937 A +chr16 57683165 57684968 A +chr16 57683165 57685188 A +chr16 57683165 57685264 A +chr16 57683165 57685273 A +chr16 57683165 57685331 A +chr16 57683165 57685346 A +chr16 57683165 57685401 A +chr16 57683165 57685773 A +chr16 57683165 57686274 A +chr16 57683165 57686403 A +chr16 57683165 57686671 A +chr16 57683165 57687214 A +chr16 57683166 57683796 A +chr16 57683166 57683797 A +chr16 57683166 57683930 A +chr16 57683166 57684313 A +chr16 57683166 57684498 A +chr16 57683166 57684683 A +chr16 57683166 57684869 A +chr16 57683166 57685008 A +chr16 57683166 57685141 A +chr16 57683166 57685200 A +chr16 57683166 57685240 A +chr16 57683166 57685328 A +chr16 57683166 57685358 A +chr16 57683166 57685433 A +chr16 57683166 57685464 A +chr16 57683166 57685552 A +chr16 57683166 57685809 A +chr16 57683166 57685818 A +chr16 57683166 57685865 A +chr16 57683166 57686341 A +chr16 57683166 57686512 A +chr16 57683166 57686768 A +chr16 57683167 57683328 A +chr16 57683167 57683649 A +chr16 57683167 57683779 A +chr16 57683167 57684194 A +chr16 57683167 57684343 A +chr16 57683167 57684473 A +chr16 57683167 57684659 A +chr16 57683167 57684789 A +chr16 57683167 57684928 A +chr16 57683167 57685325 A +chr16 57683167 57685337 A +chr16 57683167 57685865 A +chr16 57683167 57686364 A +chr16 57683167 57686376 A +chr16 57683167 
57686396 A +chr16 57683168 57683546 A +chr16 57683168 57683929 A +chr16 57683168 57684265 A +chr16 57683168 57684321 A +chr16 57683168 57684469 A +chr16 57683168 57684668 A +chr16 57683168 57684736 A +chr16 57683168 57685336 A +chr16 57683168 57685357 A +chr16 57683168 57685395 A +chr16 57683168 57685428 A +chr16 57683168 57685483 A +chr16 57683168 57685496 A +chr16 57683168 57685576 A +chr16 57683168 57685784 A +chr16 57683168 57685813 A +chr16 57683168 57686011 A +chr16 57683168 57686099 A +chr16 57683168 57686243 A +chr16 57683169 57683968 A +chr16 57683169 57684138 A +chr16 57683169 57684450 A +chr16 57683169 57684522 A +chr16 57683169 57684707 A +chr16 57683169 57685351 A +chr16 57683169 57685510 A +chr16 57683169 57685725 A +chr16 57683169 57685743 A +chr16 57683169 57685854 A +chr16 57683169 57686003 A +chr16 57683169 57686192 A +chr16 57683169 57686520 A +chr16 57683170 57683269 A +chr16 57683170 57683795 A +chr16 57683170 57684399 A +chr16 57683170 57684425 A +chr16 57683170 57684439 A +chr16 57683170 57684464 A +chr16 57683170 57684655 A +chr16 57683170 57684741 A +chr16 57683170 57685006 A +chr16 57683170 57685021 A +chr16 57683170 57685506 A +chr16 57683170 57685600 A +chr16 57683170 57685606 A +chr16 57683170 57685875 A +chr16 57683170 57686071 A +chr16 57683170 57686300 A +chr16 57683170 57686306 A +chr16 57683171 57682742 A +chr16 57683171 57683164 A +chr16 57683171 57683453 A +chr16 57683171 57684098 A +chr16 57683171 57684153 A +chr16 57683171 57684390 A +chr16 57683171 57684722 A +chr16 57683171 57684750 A +chr16 57683171 57684863 A +chr16 57683171 57684958 A +chr16 57683171 57685390 A +chr16 57683171 57685411 A +chr16 57683171 57685510 A +chr16 57683171 57685621 A +chr16 57683171 57685641 A +chr16 57683171 57685733 A +chr16 57683171 57685968 A +chr16 57683171 57687345 A +chr16 57683172 57682930 A +chr16 57683172 57683557 A +chr16 57683172 57683657 A +chr16 57683172 57684027 A +chr16 57683172 57684373 A +chr16 57683172 57684382 A +chr16 57683172 
57684422 A +chr16 57683172 57684547 A +chr16 57683172 57684598 A +chr16 57683172 57684747 A +chr16 57683172 57684783 A +chr16 57683172 57684875 A +chr16 57683172 57684947 A +chr16 57683172 57685717 A +chr16 57683172 57685843 A +chr16 57683172 57685893 A +chr16 57683172 57686470 A +chr16 57683172 57686602 A +chr16 57683172 57686896 A +chr16 57683173 57682892 A +chr16 57683173 57683701 A +chr16 57683173 57684499 A +chr16 57683173 57684868 A +chr16 57683173 57684929 A +chr16 57683173 57684937 A +chr16 57683173 57684998 A +chr16 57683173 57686505 A +chr16 57683174 57682976 A +chr16 57683174 57683394 A +chr16 57683174 57683839 A +chr16 57683174 57683996 A +chr16 57683174 57684053 A +chr16 57683174 57684268 A +chr16 57683174 57684467 A +chr16 57683174 57684619 A +chr16 57683174 57684747 A +chr16 57683174 57684939 A +chr16 57683174 57685095 A +chr16 57683174 57685206 A +chr16 57683174 57685329 A +chr16 57683174 57685332 A +chr16 57683174 57685739 A +chr16 57683174 57685913 A +chr16 57683174 57685926 A +chr16 57683174 57686138 A +chr16 57683175 57683469 A +chr16 57683175 57683694 A +chr16 57683175 57683932 A +chr16 57683175 57684163 A +chr16 57683175 57684302 A +chr16 57683175 57684350 A +chr16 57683175 57684817 A +chr16 57683175 57685048 A +chr16 57683175 57685386 A +chr16 57683175 57685574 A +chr16 57683175 57685845 A +chr16 57683175 57685876 A +chr16 57683175 57686580 A +chr16 57683175 57686716 A +chr16 57683176 57683247 A +chr16 57683176 57683614 A +chr16 57683176 57683788 A +chr16 57683176 57684526 A +chr16 57683176 57684897 A +chr16 57683176 57684989 A +chr16 57683176 57685039 A +chr16 57683176 57685140 A +chr16 57683176 57685328 A +chr16 57683176 57685330 A +chr16 57683176 57685366 A +chr16 57683176 57685636 A +chr16 57683176 57685671 A +chr16 57683176 57685698 A +chr16 57683176 57685863 A +chr16 57683176 57685949 A +chr16 57683176 57685988 A +chr16 57683176 57686125 A +chr16 57683176 57686170 A +chr16 57683176 57686260 A +chr16 57683176 57686723 A +chr16 57683176 
57686776 A +chr16 57683177 57682265 A +chr16 57683177 57682989 A +chr16 57683177 57683084 A +chr16 57683177 57683870 A +chr16 57683177 57684550 A +chr16 57683177 57684643 A +chr16 57683177 57684680 A +chr16 57683177 57684963 A +chr16 57683177 57685362 A +chr16 57683177 57685480 A +chr16 57683177 57685792 A +chr16 57683177 57685803 A +chr16 57683177 57685967 A +chr16 57683177 57686031 A +chr16 57683177 57686542 A +chr16 57683177 57686616 A +chr16 57683177 57686775 A +chr16 57683178 57683923 A +chr16 57683178 57684059 A +chr16 57683178 57684262 A +chr16 57683178 57684815 A +chr16 57683178 57684827 A +chr16 57683178 57684872 A +chr16 57683178 57684907 A +chr16 57683178 57685071 A +chr16 57683178 57685200 A +chr16 57683178 57685214 A +chr16 57683178 57685318 A +chr16 57683178 57685542 A +chr16 57683178 57685597 A +chr16 57683178 57685621 A +chr16 57683178 57685658 A +chr16 57683178 57685751 A +chr16 57683178 57686003 A +chr16 57683178 57686067 A +chr16 57683178 57686302 A +chr16 57683178 57686318 A +chr16 57683178 57687517 A +chr16 57683179 57683579 A +chr16 57683179 57683724 A +chr16 57683179 57683905 A +chr16 57683179 57683927 A +chr16 57683179 57684233 A +chr16 57683179 57684403 A +chr16 57683179 57684445 A +chr16 57683179 57684586 A +chr16 57683179 57684600 A +chr16 57683179 57684666 A +chr16 57683179 57685001 A +chr16 57683179 57685149 A +chr16 57683179 57685213 A +chr16 57683179 57685341 A +chr16 57683179 57685497 A +chr16 57683179 57685526 A +chr16 57683179 57685762 A +chr16 57683179 57685941 A +chr16 57683179 57685975 A +chr16 57683179 57686021 A +chr16 57683179 57686034 A +chr16 57683179 57686134 A +chr16 57683179 57686151 A +chr16 57683179 57686413 A +chr16 57683179 57687714 A +chr16 57683180 57683726 A +chr16 57683180 57683809 A +chr16 57683180 57683872 A +chr16 57683180 57684420 A +chr16 57683180 57685243 A +chr16 57683180 57685432 A +chr16 57683180 57685452 A +chr16 57683180 57685467 A +chr16 57683180 57685523 A +chr16 57683180 57685779 A +chr16 57683180 
57686496 A +chr16 57683181 57682731 A +chr16 57683181 57683231 A +chr16 57683181 57683762 A +chr16 57683181 57683932 A +chr16 57683181 57684032 A +chr16 57683181 57684172 A +chr16 57683181 57684298 A +chr16 57683181 57684576 A +chr16 57683181 57684617 A +chr16 57683181 57684760 A +chr16 57683181 57684774 A +chr16 57683181 57684809 A +chr16 57683181 57684933 A +chr16 57683181 57685152 A +chr16 57683181 57685287 A +chr16 57683181 57685386 A +chr16 57683181 57685397 A +chr16 57683181 57685649 A +chr16 57683181 57685842 A +chr16 57683181 57686006 A +chr16 57683181 57686253 A +chr16 57683181 57687489 A +chr16 57683182 57683218 A +chr16 57683182 57683387 A +chr16 57683182 57683956 A +chr16 57683182 57683990 A +chr16 57683182 57684280 A +chr16 57683182 57684290 A +chr16 57683182 57684363 A +chr16 57683182 57684498 A +chr16 57683182 57684937 A +chr16 57683182 57684975 A +chr16 57683182 57685088 A +chr16 57683182 57685233 A +chr16 57683182 57685345 A +chr16 57683182 57685506 A +chr16 57683182 57685862 A +chr16 57683182 57685901 A +chr16 57683182 57686261 A +chr16 57683183 57683276 A +chr16 57683183 57683551 A +chr16 57683183 57683673 A +chr16 57683183 57683819 A +chr16 57683183 57684389 A +chr16 57683183 57684600 A +chr16 57683183 57684840 A +chr16 57683183 57684909 A +chr16 57683183 57684977 A +chr16 57683183 57685022 A +chr16 57683183 57685489 A +chr16 57683183 57685573 A +chr16 57683183 57685949 A +chr16 57683183 57686162 A +chr16 57683183 57686846 A +chr16 57683183 57686888 A +chr16 57683184 57682972 A +chr16 57683184 57683021 A +chr16 57683184 57684114 A +chr16 57683184 57684453 A +chr16 57683184 57684556 A +chr16 57683184 57684745 A +chr16 57683184 57684862 A +chr16 57683184 57685108 A +chr16 57683184 57685158 A +chr16 57683184 57685180 A +chr16 57683184 57685417 A +chr16 57683184 57685431 A +chr16 57683184 57685556 A +chr16 57683184 57685825 A +chr16 57683184 57686032 A +chr16 57683184 57686289 A +chr16 57683184 57687047 A +chr16 57683185 57683203 A +chr16 57683185 
57683991 A +chr16 57683185 57684065 A +chr16 57683185 57684142 A +chr16 57683185 57684207 A +chr16 57683185 57684247 A +chr16 57683185 57684268 A +chr16 57683185 57684495 A +chr16 57683185 57684525 A +chr16 57683185 57684554 A +chr16 57683185 57684912 A +chr16 57683185 57685032 A +chr16 57683185 57685248 A +chr16 57683185 57685865 A +chr16 57683185 57685910 A +chr16 57683185 57685971 A +chr16 57683185 57686046 A +chr16 57683185 57686405 A +chr16 57683185 57686476 A +chr16 57683185 57686565 A +chr16 57683186 57683240 A +chr16 57683186 57683493 A +chr16 57683186 57683731 A +chr16 57683186 57684092 A +chr16 57683186 57684118 A +chr16 57683186 57684243 A +chr16 57683186 57684265 A +chr16 57683186 57684871 A +chr16 57683186 57684907 A +chr16 57683186 57685005 A +chr16 57683186 57685399 A +chr16 57683186 57685431 A +chr16 57683186 57685822 A +chr16 57683186 57685963 A +chr16 57683186 57686007 A +chr16 57683186 57686046 A +chr16 57683186 57686441 A +chr16 57683186 57686622 A +chr16 57683186 57686733 A +chr16 57683186 57686759 A +chr16 57683186 57686776 A +chr16 57683186 57686911 A +chr16 57683187 57683585 A +chr16 57683187 57683934 A +chr16 57683187 57684157 A +chr16 57683187 57684244 A +chr16 57683187 57684384 A +chr16 57683187 57684422 A +chr16 57683187 57684558 A +chr16 57683187 57684686 A +chr16 57683187 57684996 A +chr16 57683187 57685074 A +chr16 57683187 57685134 A +chr16 57683187 57685294 A +chr16 57683187 57685401 A +chr16 57683187 57685532 A +chr16 57683187 57685555 A +chr16 57683187 57685679 A +chr16 57683187 57685755 A +chr16 57683187 57685872 A +chr16 57683187 57686276 A +chr16 57683187 57686365 A +chr16 57683187 57686657 A +chr16 57683187 57687475 A +chr16 57683188 57683594 A +chr16 57683188 57683976 A +chr16 57683188 57684141 A +chr16 57683188 57684325 A +chr16 57683188 57684395 A +chr16 57683188 57684427 A +chr16 57683188 57684557 A +chr16 57683188 57684604 A +chr16 57683188 57684674 A +chr16 57683188 57684821 A +chr16 57683188 57684952 A +chr16 57683188 
57685090 A +chr16 57683188 57685447 A +chr16 57683188 57685523 A +chr16 57683188 57685757 A +chr16 57683188 57685793 A +chr16 57683188 57685942 A +chr16 57683188 57686049 A +chr16 57683188 57686352 A +chr16 57683188 57686418 A +chr16 57683188 57687953 A +chr16 57683189 57684837 A +chr16 57683189 57684877 A +chr16 57683189 57684885 A +chr16 57683189 57684914 A +chr16 57683189 57684929 A +chr16 57683189 57685304 A +chr16 57683189 57685314 A +chr16 57683189 57685480 A +chr16 57683189 57685593 A +chr16 57683189 57686106 A +chr16 57683189 57686129 A +chr16 57683189 57686186 A +chr16 57683189 57686307 A +chr16 57683189 57686428 A +chr16 57683190 57683270 A +chr16 57683190 57683461 A +chr16 57683190 57683848 A +chr16 57683190 57683901 A +chr16 57683190 57684457 A +chr16 57683190 57684558 A +chr16 57683190 57684563 A +chr16 57683190 57684927 A +chr16 57683190 57685022 A +chr16 57683190 57685093 A +chr16 57683190 57685287 A +chr16 57683190 57685360 A +chr16 57683190 57685437 A +chr16 57683190 57685802 A +chr16 57683190 57686281 A +chr16 57683190 57686289 A +chr16 57683190 57686406 A +chr16 57683191 57682287 A +chr16 57683191 57682773 A +chr16 57683191 57683440 A +chr16 57683191 57683912 A +chr16 57683191 57684344 A +chr16 57683191 57684663 A +chr16 57683191 57684978 A +chr16 57683191 57685042 A +chr16 57683191 57685058 A +chr16 57683191 57685137 A +chr16 57683191 57685210 A +chr16 57683191 57685293 A +chr16 57683191 57685516 A +chr16 57683191 57685524 A +chr16 57683191 57685955 A +chr16 57683191 57685967 A +chr16 57683191 57686008 A +chr16 57683191 57686282 A +chr16 57683191 57686488 A +chr16 57683191 57686649 A +chr16 57683191 57687117 A +chr16 57683192 57683666 A +chr16 57683192 57684368 A +chr16 57683192 57684427 A +chr16 57683192 57684650 A +chr16 57683192 57684944 A +chr16 57683192 57684971 A +chr16 57683192 57685067 A +chr16 57683192 57685276 A +chr16 57683192 57685377 A +chr16 57683192 57685392 A +chr16 57683192 57685443 A +chr16 57683192 57685689 A +chr16 57683192 
57685760 A +chr16 57683192 57685776 A +chr16 57683192 57686332 A +chr16 57683192 57686792 A +chr16 57683193 57683106 A +chr16 57683193 57683347 A +chr16 57683193 57683368 A +chr16 57683193 57683630 A +chr16 57683193 57683764 A +chr16 57683193 57684474 A +chr16 57683193 57684652 A +chr16 57683193 57684861 A +chr16 57683193 57684893 A +chr16 57683193 57684916 A +chr16 57683193 57685230 A +chr16 57683193 57685264 A +chr16 57683193 57685363 A +chr16 57683193 57685379 A +chr16 57683193 57685584 A +chr16 57683193 57685588 A +chr16 57683193 57685655 A +chr16 57683193 57685884 A +chr16 57683193 57686353 A +chr16 57683193 57686528 A +chr16 57683194 57683903 A +chr16 57683194 57684021 A +chr16 57683194 57684033 A +chr16 57683194 57684077 A +chr16 57683194 57684207 A +chr16 57683194 57684265 A +chr16 57683194 57684790 A +chr16 57683194 57684848 A +chr16 57683194 57685110 A +chr16 57683194 57685164 A +chr16 57683194 57685207 A +chr16 57683194 57685287 A +chr16 57683194 57685370 A +chr16 57683194 57685747 A +chr16 57683194 57685796 A +chr16 57683194 57685931 A +chr16 57683194 57685996 A +chr16 57683194 57686151 A +chr16 57683194 57686244 A +chr16 57683194 57686290 A +chr16 57683194 57686466 A +chr16 57683194 57686685 A +chr16 57683194 57687581 A +chr16 57683195 57684881 A +chr16 57683195 57684896 A +chr16 57683195 57684942 A +chr16 57683195 57684978 A +chr16 57683195 57685105 A +chr16 57683195 57685187 A +chr16 57683195 57685295 A +chr16 57683195 57685500 A +chr16 57683195 57685518 A +chr16 57683195 57685615 A +chr16 57683195 57685717 A +chr16 57683195 57685833 A +chr16 57683195 57685963 A +chr16 57683195 57686389 A +chr16 57683195 57686650 A +chr16 57683195 57687192 A +chr16 57683196 57683259 A +chr16 57683196 57685257 A +chr16 57683196 57685283 A +chr16 57683196 57685766 A +chr16 57683196 57686108 A +chr16 57683196 57686813 A +chr16 57683196 57687808 A +chr16 57683197 57683824 A +chr16 57683197 57683896 A +chr16 57683197 57684003 A +chr16 57683197 57684006 A +chr16 57683197 
57684132 A +chr16 57683197 57684136 A +chr16 57683197 57684247 A +chr16 57683197 57684292 A +chr16 57683197 57684524 A +chr16 57683197 57684535 A +chr16 57683197 57684590 A +chr16 57683197 57684625 A +chr16 57683197 57684904 A +chr16 57683197 57684996 A +chr16 57683197 57685155 A +chr16 57683197 57685202 A +chr16 57683197 57685434 A +chr16 57683197 57685505 A +chr16 57683197 57685653 A +chr16 57683197 57685880 A +chr16 57683197 57685903 A +chr16 57683197 57686379 A +chr16 57683197 57686400 A +chr16 57683197 57686660 A +chr16 57683197 57687040 A +chr16 57683198 57684386 A +chr16 57683198 57685262 A +chr16 57683198 57685340 A +chr16 57683198 57685344 A +chr16 57683198 57685493 A +chr16 57683198 57685535 A +chr16 57683198 57685816 A +chr16 57683198 57685838 A +chr16 57683198 57686300 A +chr16 57683198 57686374 A +chr16 57683198 57686482 A +chr16 57683198 57686716 A +chr16 57683199 57682363 A +chr16 57683199 57683038 A +chr16 57683199 57683706 A +chr16 57683199 57683715 A +chr16 57683199 57683885 A +chr16 57683199 57684033 A +chr16 57683199 57684342 A +chr16 57683199 57684347 A +chr16 57683199 57684426 A +chr16 57683199 57684631 A +chr16 57683199 57684983 A +chr16 57683199 57684992 A +chr16 57683199 57684998 A +chr16 57683199 57685086 A +chr16 57683199 57685178 A +chr16 57683199 57685526 A +chr16 57683199 57685983 A +chr16 57683199 57686005 A +chr16 57683199 57686487 A +chr16 57683199 57686964 A +chr16 57683199 57686992 A +chr16 57683200 57683760 A +chr16 57683200 57683827 A +chr16 57683200 57684645 A +chr16 57683200 57684671 A +chr16 57683200 57684687 A +chr16 57683200 57684724 A +chr16 57683200 57684782 A +chr16 57683200 57684844 A +chr16 57683200 57685173 A +chr16 57683200 57685419 A +chr16 57683200 57685610 A +chr16 57683200 57685703 A +chr16 57683200 57685958 A +chr16 57683200 57686072 A +chr16 57683200 57686085 A +chr16 57683200 57686259 A +chr16 57683200 57686392 A +chr16 57683200 57686565 A +chr16 57683201 57682770 A +chr16 57683201 57683962 A +chr16 57683201 
57684055 A +chr16 57683201 57684699 A +chr16 57683201 57684816 A +chr16 57683201 57684944 A +chr16 57683201 57685020 A +chr16 57683201 57685126 A +chr16 57683201 57685635 A +chr16 57683201 57686097 A +chr16 57683201 57686325 A +chr16 57683201 57686405 A +chr16 57683201 57686613 A +chr16 57683202 57684154 A +chr16 57683202 57684156 A +chr16 57683202 57684289 A +chr16 57683202 57684477 A +chr16 57683202 57684554 A +chr16 57683202 57684710 A +chr16 57683202 57684721 A +chr16 57683202 57684808 A +chr16 57683202 57684837 A +chr16 57683202 57685008 A +chr16 57683202 57685268 A +chr16 57683202 57685519 A +chr16 57683202 57685568 A +chr16 57683202 57685599 A +chr16 57683202 57685690 A +chr16 57683202 57685933 A +chr16 57683202 57686319 A +chr16 57683202 57686737 A +chr16 57683202 57686839 A +chr16 57683202 57686857 A +chr16 57683203 57683186 A +chr16 57683203 57683588 A +chr16 57683203 57684070 A +chr16 57683203 57684583 A +chr16 57683203 57684591 A +chr16 57683203 57684845 A +chr16 57683203 57684871 A +chr16 57683203 57684933 A +chr16 57683203 57685016 A +chr16 57683203 57685053 A +chr16 57683203 57685060 A +chr16 57683203 57685271 A +chr16 57683203 57685441 A +chr16 57683203 57685579 A +chr16 57683203 57685678 A +chr16 57683203 57685729 A +chr16 57683203 57686037 A +chr16 57683203 57686352 A +chr16 57683203 57686370 A +chr16 57683203 57686728 A +chr16 57683203 57686893 A +chr16 57683204 57683568 A +chr16 57683204 57684465 A +chr16 57683204 57684737 A +chr16 57683204 57684782 A +chr16 57683204 57684900 A +chr16 57683204 57685323 A +chr16 57683204 57685381 A +chr16 57683204 57685805 A +chr16 57683204 57685955 A +chr16 57683204 57686057 A +chr16 57683204 57686058 A +chr16 57683204 57686199 A +chr16 57683204 57686348 A +chr16 57683204 57686513 A +chr16 57683204 57686700 A +chr16 57683204 57686923 A +chr16 57683204 57687584 A +chr16 57683205 57683975 A +chr16 57683205 57684977 A +chr16 57683205 57685059 A +chr16 57683205 57685065 A +chr16 57683205 57685365 A +chr16 57683205 
57685395 A +chr16 57683205 57685436 A +chr16 57683205 57686140 A +chr16 57683205 57686146 A +chr16 57683205 57686219 A +chr16 57683205 57686326 A +chr16 57683205 57686475 A +chr16 57683205 57686483 A +chr16 57683205 57686585 A +chr16 57683205 57686879 A +chr16 57683205 57688085 A +chr16 57683206 57683221 A +chr16 57683206 57683755 A +chr16 57683206 57684003 A +chr16 57683206 57684358 A +chr16 57683206 57684367 A +chr16 57683206 57684548 A +chr16 57683206 57684649 A +chr16 57683206 57684810 A +chr16 57683206 57685042 A +chr16 57683206 57685184 A +chr16 57683206 57685211 A +chr16 57683206 57685436 A +chr16 57683206 57685534 A +chr16 57683206 57685591 A +chr16 57683206 57685908 A +chr16 57683206 57686060 A +chr16 57683206 57686213 A +chr16 57683206 57686464 A +chr16 57683206 57686736 A +chr16 57683207 57683383 A +chr16 57683207 57684378 A +chr16 57683207 57684498 A +chr16 57683207 57684737 A +chr16 57683207 57684760 A +chr16 57683207 57684767 A +chr16 57683207 57684913 A +chr16 57683207 57685096 A +chr16 57683207 57685114 A +chr16 57683207 57685119 A +chr16 57683207 57685314 A +chr16 57683207 57685624 A +chr16 57683207 57685713 A +chr16 57683207 57685809 A +chr16 57683207 57685922 A +chr16 57683207 57686014 A +chr16 57683207 57686142 A +chr16 57683207 57686151 A +chr16 57683207 57687417 A +chr16 57683207 57688011 A +chr16 57683208 57683401 A +chr16 57683208 57684278 A +chr16 57683208 57684471 A +chr16 57683208 57684649 A +chr16 57683208 57684701 A +chr16 57683208 57684859 A +chr16 57683208 57685083 A +chr16 57683208 57685180 A +chr16 57683208 57685255 A +chr16 57683208 57685502 A +chr16 57683208 57685603 A +chr16 57683208 57685625 A +chr16 57683208 57685892 A +chr16 57683208 57686145 A +chr16 57683208 57686258 A +chr16 57683208 57686339 A +chr16 57683208 57686588 A +chr16 57683208 57686746 A +chr16 57683208 57686755 A +chr16 57683208 57686937 A +chr16 57683208 57687016 A +chr16 57683209 57683990 A +chr16 57683209 57684046 A +chr16 57683209 57684080 A +chr16 57683209 
57684445 A +chr16 57683209 57684505 A +chr16 57683209 57684557 A +chr16 57683209 57684744 A +chr16 57683209 57684995 A +chr16 57683209 57685003 A +chr16 57683209 57685013 A +chr16 57683209 57685071 A +chr16 57683209 57685101 A +chr16 57683209 57685329 A +chr16 57683209 57685388 A +chr16 57683209 57685441 A +chr16 57683209 57685645 A +chr16 57683209 57685679 A +chr16 57683209 57685882 A +chr16 57683209 57686203 A +chr16 57683209 57686241 A +chr16 57683209 57686257 A +chr16 57683209 57686376 A +chr16 57683209 57686474 A +chr16 57683210 57682602 A +chr16 57683210 57682797 A +chr16 57683210 57683563 A +chr16 57683210 57683825 A +chr16 57683210 57683857 A +chr16 57683210 57684246 A +chr16 57683210 57685356 A +chr16 57683210 57685689 A +chr16 57683210 57685817 A +chr16 57683210 57686016 A +chr16 57683210 57686493 A +chr16 57683210 57686657 A +chr16 57683210 57686850 A +chr16 57683211 57684093 A +chr16 57683211 57684643 A +chr16 57683211 57684725 A +chr16 57683211 57684772 A +chr16 57683211 57685061 A +chr16 57683211 57685093 A +chr16 57683211 57685102 A +chr16 57683211 57685109 A +chr16 57683211 57685182 A +chr16 57683211 57685378 A +chr16 57683211 57685564 A +chr16 57683211 57685768 A +chr16 57683211 57685835 A +chr16 57683211 57686345 A +chr16 57683211 57687961 A +chr16 57683212 57684194 A +chr16 57683212 57684877 A +chr16 57683212 57684946 A +chr16 57683212 57684992 A +chr16 57683212 57685114 A +chr16 57683212 57685144 A +chr16 57683212 57685194 A +chr16 57683212 57685214 A +chr16 57683212 57685343 A +chr16 57683212 57685526 A +chr16 57683212 57686188 A +chr16 57683212 57686229 A +chr16 57683212 57686240 A +chr16 57683212 57686301 A +chr16 57683212 57686370 A +chr16 57683212 57686476 A +chr16 57683212 57686490 A +chr16 57683212 57686719 A +chr16 57683213 57683741 A +chr16 57683213 57683919 A +chr16 57683213 57683929 A +chr16 57683213 57684085 A +chr16 57683213 57684222 A +chr16 57683213 57684304 A +chr16 57683213 57684313 A +chr16 57683213 57684322 A +chr16 57683213 
57684390 A +chr16 57683213 57684409 A +chr16 57683213 57684602 A +chr16 57683213 57684619 A +chr16 57683213 57684658 A +chr16 57683213 57684758 A +chr16 57683213 57685004 A +chr16 57683213 57685013 A +chr16 57683213 57685336 A +chr16 57683213 57685473 A +chr16 57683213 57685525 A +chr16 57683213 57686735 A +chr16 57683213 57686770 A +chr16 57683214 57681562 A +chr16 57683214 57682516 A +chr16 57683214 57683752 A +chr16 57683214 57684048 A +chr16 57683214 57684311 A +chr16 57683214 57684442 A +chr16 57683214 57684484 A +chr16 57683214 57685145 A +chr16 57683214 57685240 A +chr16 57683214 57685374 A +chr16 57683214 57685409 A +chr16 57683214 57685528 A +chr16 57683214 57685637 A +chr16 57683214 57685692 A +chr16 57683214 57685739 A +chr16 57683214 57686003 A +chr16 57683214 57686057 A +chr16 57683214 57686194 A +chr16 57683214 57686621 A +chr16 57683214 57686764 A +chr16 57683215 57683723 A +chr16 57683215 57683848 A +chr16 57683215 57683938 A +chr16 57683215 57684127 A +chr16 57683215 57684316 A +chr16 57683215 57684599 A +chr16 57683215 57684599 A +chr16 57683215 57684910 A +chr16 57683215 57685022 A +chr16 57683215 57685093 A +chr16 57683215 57685133 A +chr16 57683215 57685396 A +chr16 57683215 57685429 A +chr16 57683215 57685435 A +chr16 57683215 57685516 A +chr16 57683215 57685567 A +chr16 57683215 57685694 A +chr16 57683215 57685969 A +chr16 57683215 57686255 A +chr16 57683216 57681909 A +chr16 57683216 57683718 A +chr16 57683216 57683720 A +chr16 57683216 57684165 A +chr16 57683216 57684210 A +chr16 57683216 57684281 A +chr16 57683216 57684539 A +chr16 57683216 57684638 A +chr16 57683216 57684669 A +chr16 57683216 57685000 A +chr16 57683216 57685087 A +chr16 57683216 57685201 A +chr16 57683216 57685384 A +chr16 57683216 57685677 A +chr16 57683216 57686189 A +chr16 57683216 57686283 A +chr16 57683216 57686331 A +chr16 57683216 57686475 A +chr16 57683216 57686562 A +chr16 57683216 57686708 A +chr16 57683216 57686888 A +chr16 57683217 57683132 A +chr16 57683217 
57683729 A +chr16 57683217 57683750 A +chr16 57683217 57683758 A +chr16 57683217 57684570 A +chr16 57683217 57684714 A +chr16 57683217 57685019 A +chr16 57683217 57685125 A +chr16 57683217 57685131 A +chr16 57683217 57685254 A +chr16 57683217 57685611 A +chr16 57683217 57685652 A +chr16 57683217 57685711 A +chr16 57683217 57685797 A +chr16 57683217 57685861 A +chr16 57683217 57686369 A +chr16 57683217 57686376 A +chr16 57683217 57686448 A +chr16 57683217 57686491 A +chr16 57683217 57686576 A +chr16 57683217 57686809 A +chr16 57683217 57687067 A +chr16 57683217 57687104 A +chr16 57683217 57687270 A +chr16 57683218 57683508 A +chr16 57683218 57684039 A +chr16 57683218 57684057 A +chr16 57683218 57684213 A +chr16 57683218 57684458 A +chr16 57683218 57684818 A +chr16 57683218 57684883 A +chr16 57683218 57684906 A +chr16 57683218 57685202 A +chr16 57683218 57685215 A +chr16 57683218 57685219 A +chr16 57683218 57685249 A +chr16 57683218 57685293 A +chr16 57683218 57685329 A +chr16 57683218 57685337 A +chr16 57683218 57685368 A +chr16 57683218 57685400 A +chr16 57683218 57685571 A +chr16 57683218 57685630 A +chr16 57683218 57686137 A +chr16 57683218 57686414 A +chr16 57683218 57686705 A +chr16 57683218 57687176 A +chr16 57683219 57684141 A +chr16 57683219 57684489 A +chr16 57683219 57684586 A +chr16 57683219 57684713 A +chr16 57683219 57685231 A +chr16 57683219 57685434 A +chr16 57683219 57685437 A +chr16 57683219 57685510 A +chr16 57683219 57685687 A +chr16 57683219 57685920 A +chr16 57683219 57685930 A +chr16 57683219 57686147 A +chr16 57683219 57686194 A +chr16 57683219 57686376 A +chr16 57683219 57687390 A +chr16 57683219 57687509 A +chr16 57683220 57683064 A +chr16 57683220 57683115 A +chr16 57683220 57683396 A +chr16 57683220 57683512 A +chr16 57683220 57683856 A +chr16 57683220 57683989 A +chr16 57683220 57684329 A +chr16 57683220 57684329 A +chr16 57683220 57684603 A +chr16 57683220 57684852 A +chr16 57683220 57684913 A +chr16 57683220 57685206 A +chr16 57683220 
57685544 A +chr16 57683220 57685723 A +chr16 57683220 57685864 A +chr16 57683220 57685965 A +chr16 57683220 57686117 A +chr16 57683220 57686164 A +chr16 57683220 57686627 A +chr16 57683220 57686850 A +chr16 57683221 57684073 A +chr16 57683221 57684160 A +chr16 57683221 57684356 A +chr16 57683221 57684481 A +chr16 57683221 57684624 A +chr16 57683221 57685017 A +chr16 57683221 57685304 A +chr16 57683221 57685540 A +chr16 57683221 57685631 A +chr16 57683221 57686414 A +chr16 57683221 57686510 A +chr16 57683221 57686577 A +chr16 57683222 57683542 A +chr16 57683222 57683853 A +chr16 57683222 57683885 A +chr16 57683222 57684139 A +chr16 57683222 57684227 A +chr16 57683222 57684285 A +chr16 57683222 57684313 A +chr16 57683222 57684365 A +chr16 57683222 57684406 A +chr16 57683222 57685241 A +chr16 57683222 57685518 A +chr16 57683222 57685630 A +chr16 57683222 57685997 A +chr16 57683222 57686052 A +chr16 57683222 57686138 A +chr16 57683222 57686601 A +chr16 57683223 57683200 A +chr16 57683223 57683433 A +chr16 57683223 57684174 A +chr16 57683223 57684386 A +chr16 57683223 57684852 A +chr16 57683223 57685070 A +chr16 57683223 57685116 A +chr16 57683223 57687341 A +chr16 57683224 57684223 A +chr16 57683224 57684433 A +chr16 57683224 57684617 A +chr16 57683224 57684789 A +chr16 57683224 57684906 A +chr16 57683224 57685064 A +chr16 57683224 57685113 A +chr16 57683224 57685139 A +chr16 57683224 57685227 A +chr16 57683224 57685254 A +chr16 57683224 57685403 A +chr16 57683224 57685421 A +chr16 57683224 57685503 A +chr16 57683224 57685615 A +chr16 57683224 57685767 A +chr16 57683224 57686087 A +chr16 57683224 57686545 A +chr16 57683224 57687511 A +chr16 57683225 57683285 A +chr16 57683225 57684249 A +chr16 57683225 57684960 A +chr16 57683225 57685643 A +chr16 57683225 57685874 A +chr16 57683225 57685891 A +chr16 57683225 57687060 A +chr16 57683226 57683018 A +chr16 57683226 57683888 A +chr16 57683226 57684310 A +chr16 57683226 57684462 A +chr16 57683226 57684556 A +chr16 57683226 
57684699 A +chr16 57683226 57684748 A +chr16 57683226 57684750 A +chr16 57683226 57684843 A +chr16 57683226 57684915 A +chr16 57683226 57684944 A +chr16 57683226 57685113 A +chr16 57683226 57685207 A +chr16 57683226 57685373 A +chr16 57683226 57685974 A +chr16 57683226 57686007 A +chr16 57683226 57686177 A +chr16 57683226 57686252 A +chr16 57683226 57686319 A +chr16 57683226 57686389 A +chr16 57683226 57686615 A +chr16 57683226 57687023 A +chr16 57683227 57683971 A +chr16 57683227 57683973 A +chr16 57683227 57684110 A +chr16 57683227 57684542 A +chr16 57683227 57684946 A +chr16 57683227 57685040 A +chr16 57683227 57685074 A +chr16 57683227 57685157 A +chr16 57683227 57685172 A +chr16 57683227 57685209 A +chr16 57683227 57685221 A +chr16 57683227 57685502 A +chr16 57683227 57685941 A +chr16 57683227 57686013 A +chr16 57683227 57686100 A +chr16 57683227 57686520 A +chr16 57683228 57683591 A +chr16 57683228 57683995 A +chr16 57683228 57684172 A +chr16 57683228 57684812 A +chr16 57683228 57684907 A +chr16 57683228 57684970 A +chr16 57683228 57684985 A +chr16 57683228 57685112 A +chr16 57683228 57685216 A +chr16 57683228 57685226 A +chr16 57683228 57685228 A +chr16 57683228 57685283 A +chr16 57683228 57685382 A +chr16 57683228 57685398 A +chr16 57683228 57685943 A +chr16 57683228 57686699 A +chr16 57683228 57686963 A +chr16 57683229 57682306 A +chr16 57683229 57682360 A +chr16 57683229 57682901 A +chr16 57683229 57683086 A +chr16 57683229 57684386 A +chr16 57683229 57684438 A +chr16 57683229 57684478 A +chr16 57683229 57684512 A +chr16 57683229 57684616 A +chr16 57683229 57684646 A +chr16 57683229 57685078 A +chr16 57683229 57685318 A +chr16 57683229 57685480 A +chr16 57683229 57685772 A +chr16 57683229 57685948 A +chr16 57683229 57686035 A +chr16 57683229 57686481 A +chr16 57683230 57684071 A +chr16 57683230 57684081 A +chr16 57683230 57684572 A +chr16 57683230 57684789 A +chr16 57683230 57684859 A +chr16 57683230 57684861 A +chr16 57683230 57684989 A +chr16 57683230 
57685043 A +chr16 57683230 57685056 A +chr16 57683230 57685102 A +chr16 57683230 57685313 A +chr16 57683230 57685354 A +chr16 57683230 57685624 A +chr16 57683230 57686351 A +chr16 57683230 57686474 A +chr16 57683230 57686724 A +chr16 57683230 57686735 A +chr16 57683231 57683836 A +chr16 57683231 57684219 A +chr16 57683231 57684522 A +chr16 57683231 57684650 A +chr16 57683231 57684746 A +chr16 57683231 57684799 A +chr16 57683231 57684944 A +chr16 57683231 57685062 A +chr16 57683231 57685454 A +chr16 57683231 57685608 A +chr16 57683231 57685769 A +chr16 57683231 57685798 A +chr16 57683231 57685979 A +chr16 57683231 57686098 A +chr16 57683231 57686542 A +chr16 57683232 57683265 A +chr16 57683232 57683614 A +chr16 57683232 57683894 A +chr16 57683232 57684501 A +chr16 57683232 57684618 A +chr16 57683232 57684681 A +chr16 57683232 57684829 A +chr16 57683232 57684918 A +chr16 57683232 57685013 A +chr16 57683232 57685058 A +chr16 57683232 57685482 A +chr16 57683232 57685620 A +chr16 57683232 57685807 A +chr16 57683232 57686145 A +chr16 57683232 57686265 A +chr16 57683232 57686285 A +chr16 57683232 57686937 A +chr16 57683233 57683472 A +chr16 57683233 57684443 A +chr16 57683233 57684490 A +chr16 57683233 57684559 A +chr16 57683233 57684592 A +chr16 57683233 57685053 A +chr16 57683233 57685061 A +chr16 57683233 57685176 A +chr16 57683233 57685249 A +chr16 57683233 57685278 A +chr16 57683233 57685367 A +chr16 57683233 57685395 A +chr16 57683233 57685449 A +chr16 57683233 57685613 A +chr16 57683233 57685644 A +chr16 57683233 57685712 A +chr16 57683233 57685769 A +chr16 57683233 57686022 A +chr16 57683233 57686621 A +chr16 57683233 57686852 A +chr16 57683234 57682550 A +chr16 57683234 57683647 A +chr16 57683234 57683946 A +chr16 57683234 57684000 A +chr16 57683234 57684215 A +chr16 57683234 57684341 A +chr16 57683234 57684363 A +chr16 57683234 57684437 A +chr16 57683234 57684468 A +chr16 57683234 57684539 A +chr16 57683234 57684541 A +chr16 57683234 57684660 A +chr16 57683234 
57684819 A +chr16 57683234 57684824 A +chr16 57683234 57684899 A +chr16 57683234 57685006 A +chr16 57683234 57685277 A +chr16 57683234 57685306 A +chr16 57683234 57685462 A +chr16 57683234 57685901 A +chr16 57683234 57686061 A +chr16 57683234 57686498 A +chr16 57683234 57686926 A +chr16 57683234 57688014 A +chr16 57683235 57682509 A +chr16 57683235 57682928 A +chr16 57683235 57683771 A +chr16 57683235 57684047 A +chr16 57683235 57684846 A +chr16 57683235 57684873 A +chr16 57683235 57684987 A +chr16 57683235 57685081 A +chr16 57683235 57685297 A +chr16 57683235 57685387 A +chr16 57683235 57685537 A +chr16 57683235 57685563 A +chr16 57683235 57685632 A +chr16 57683235 57685815 A +chr16 57683235 57686001 A +chr16 57683235 57686118 A +chr16 57683235 57686351 A +chr16 57683235 57686465 A +chr16 57683235 57686726 A +chr16 57683236 57683766 A +chr16 57683236 57683782 A +chr16 57683236 57684442 A +chr16 57683236 57684605 A +chr16 57683236 57684836 A +chr16 57683236 57685003 A +chr16 57683236 57685136 A +chr16 57683236 57685716 A +chr16 57683236 57685827 A +chr16 57683236 57685982 A +chr16 57683236 57686197 A +chr16 57683236 57686436 A +chr16 57683237 57683771 A +chr16 57683237 57684280 A +chr16 57683237 57684428 A +chr16 57683237 57684673 A +chr16 57683237 57684872 A +chr16 57683237 57684879 A +chr16 57683237 57685116 A +chr16 57683237 57685336 A +chr16 57683237 57685597 A +chr16 57683237 57685661 A +chr16 57683237 57685670 A +chr16 57683237 57685810 A +chr16 57683237 57685850 A +chr16 57683237 57686064 A +chr16 57683238 57683874 A +chr16 57683238 57684253 A +chr16 57683238 57684375 A +chr16 57683238 57684507 A +chr16 57683238 57684590 A +chr16 57683238 57684748 A +chr16 57683238 57685171 A +chr16 57683238 57685286 A +chr16 57683238 57685348 A +chr16 57683238 57685400 A +chr16 57683238 57685533 A +chr16 57683238 57685651 A +chr16 57683238 57685812 A +chr16 57683238 57685841 A +chr16 57683238 57686469 A +chr16 57683238 57686532 A +chr16 57683239 57683734 A +chr16 57683239 
57683876 A +chr16 57683239 57684047 A +chr16 57683239 57684242 A +chr16 57683239 57684447 A +chr16 57683239 57684719 A +chr16 57683239 57684738 A +chr16 57683239 57684798 A +chr16 57683239 57684838 A +chr16 57683239 57684977 A +chr16 57683239 57685087 A +chr16 57683239 57685139 A +chr16 57683239 57685229 A +chr16 57683239 57685410 A +chr16 57683239 57685435 A +chr16 57683239 57685441 A +chr16 57683239 57685641 A +chr16 57683239 57685923 A +chr16 57683239 57685949 A +chr16 57683239 57685985 A +chr16 57683239 57686013 A +chr16 57683239 57686240 A +chr16 57683239 57686572 A +chr16 57683240 57684468 A +chr16 57683240 57684481 A +chr16 57683240 57684581 A +chr16 57683240 57684607 A +chr16 57683240 57684828 A +chr16 57683240 57684923 A +chr16 57683240 57684970 A +chr16 57683240 57685868 A +chr16 57683240 57686189 A +chr16 57683240 57686647 A +chr16 57683240 57686783 A +chr16 57683240 57687577 A +chr16 57683241 57683740 A +chr16 57683241 57684044 A +chr16 57683241 57684403 A +chr16 57683241 57684442 A +chr16 57683241 57684544 A +chr16 57683241 57684710 A +chr16 57683241 57684922 A +chr16 57683241 57685037 A +chr16 57683241 57685193 A +chr16 57683241 57685242 A +chr16 57683241 57685246 A +chr16 57683241 57685333 A +chr16 57683241 57685393 A +chr16 57683241 57685425 A +chr16 57683241 57685665 A +chr16 57683241 57685819 A +chr16 57683241 57685912 A +chr16 57683241 57686299 A +chr16 57683241 57686517 A +chr16 57683241 57686923 A +chr16 57683242 57682502 A +chr16 57683242 57683800 A +chr16 57683242 57684075 A +chr16 57683242 57684162 A +chr16 57683242 57684357 A +chr16 57683242 57684372 A +chr16 57683242 57684549 A +chr16 57683242 57684752 A +chr16 57683242 57684798 A +chr16 57683242 57684829 A +chr16 57683242 57684890 A +chr16 57683242 57684925 A +chr16 57683242 57684990 A +chr16 57683242 57685281 A +chr16 57683242 57685333 A +chr16 57683242 57685486 A +chr16 57683242 57685972 A +chr16 57683242 57686115 A +chr16 57683242 57687085 A +chr16 57683243 57683656 A +chr16 57683243 
57683907 A +chr16 57683243 57684088 A +chr16 57683243 57684690 A +chr16 57683243 57684796 A +chr16 57683243 57684838 A +chr16 57683243 57684865 A +chr16 57683243 57684926 A +chr16 57683243 57685053 A +chr16 57683243 57685059 A +chr16 57683243 57685100 A +chr16 57683243 57685301 A +chr16 57683243 57685394 A +chr16 57683243 57685416 A +chr16 57683243 57685725 A +chr16 57683243 57685784 A +chr16 57683243 57685975 A +chr16 57683243 57686022 A +chr16 57683243 57686378 A +chr16 57683243 57688448 A +chr16 57683244 57682667 A +chr16 57683244 57683803 A +chr16 57683244 57684079 A +chr16 57683244 57684237 A +chr16 57683244 57684745 A +chr16 57683244 57684918 A +chr16 57683244 57685073 A +chr16 57683244 57685306 A +chr16 57683244 57685317 A +chr16 57683244 57685547 A +chr16 57683244 57685797 A +chr16 57683244 57685864 A +chr16 57683244 57685960 A +chr16 57683244 57686264 A +chr16 57683244 57686290 A +chr16 57683244 57686389 A +chr16 57683244 57686441 A +chr16 57683244 57686537 A +chr16 57683244 57686546 A +chr16 57683244 57686562 A +chr16 57683244 57687245 A +chr16 57683245 57684226 A +chr16 57683245 57684439 A +chr16 57683245 57684555 A +chr16 57683245 57684704 A +chr16 57683245 57685049 A +chr16 57683245 57685082 A +chr16 57683245 57685218 A +chr16 57683245 57685293 A +chr16 57683245 57685555 A +chr16 57683245 57685748 A +chr16 57683245 57685889 A +chr16 57683245 57685951 A +chr16 57683245 57686272 A +chr16 57683246 57683314 A +chr16 57683246 57683351 A +chr16 57683246 57683861 A +chr16 57683246 57683883 A +chr16 57683246 57684057 A +chr16 57683246 57684102 A +chr16 57683246 57684445 A +chr16 57683246 57684513 A +chr16 57683246 57684833 A +chr16 57683246 57684862 A +chr16 57683246 57685026 A +chr16 57683246 57685046 A +chr16 57683246 57685084 A +chr16 57683246 57685203 A +chr16 57683246 57685646 A +chr16 57683246 57685890 A +chr16 57683246 57686089 A +chr16 57683246 57686126 A +chr16 57683246 57686182 A +chr16 57683246 57686379 A +chr16 57683246 57686624 A +chr16 57683247 
57684159 A +chr16 57683247 57684306 A +chr16 57683247 57684618 A +chr16 57683247 57684723 A +chr16 57683247 57685082 A +chr16 57683247 57685502 A +chr16 57683247 57685638 A +chr16 57683247 57685855 A +chr16 57683247 57686201 A +chr16 57683247 57686376 A +chr16 57683248 57683829 A +chr16 57683248 57684321 A +chr16 57683248 57684387 A +chr16 57683248 57684463 A +chr16 57683248 57684696 A +chr16 57683248 57684738 A +chr16 57683248 57684789 A +chr16 57683248 57684906 A +chr16 57683248 57684924 A +chr16 57683248 57685013 A +chr16 57683248 57685523 A +chr16 57683248 57685964 A +chr16 57683248 57686141 A +chr16 57683248 57686402 A +chr16 57683248 57686717 A +chr16 57683248 57686771 A +chr16 57683249 57683863 A +chr16 57683249 57684086 A +chr16 57683249 57684483 A +chr16 57683249 57684486 A +chr16 57683249 57684493 A +chr16 57683249 57684802 A +chr16 57683249 57685013 A +chr16 57683249 57685032 A +chr16 57683249 57685061 A +chr16 57683249 57685679 A +chr16 57683249 57685723 A +chr16 57683249 57685745 A +chr16 57683249 57686180 A +chr16 57683249 57686228 A +chr16 57683249 57687957 A +chr16 57683250 57684044 A +chr16 57683250 57684169 A +chr16 57683250 57684266 A +chr16 57683250 57684368 A +chr16 57683250 57684496 A +chr16 57683250 57684943 A +chr16 57683250 57685087 A +chr16 57683250 57685095 A +chr16 57683250 57685177 A +chr16 57683250 57685289 A +chr16 57683250 57685390 A +chr16 57683250 57685391 A +chr16 57683250 57685431 A +chr16 57683250 57685661 A +chr16 57683250 57685959 A +chr16 57683250 57686300 A +chr16 57683250 57686372 A +chr16 57683250 57686734 A +chr16 57683251 57682962 A +chr16 57683251 57684326 A +chr16 57683251 57684770 A +chr16 57683251 57684828 A +chr16 57683251 57684938 A +chr16 57683251 57685134 A +chr16 57683251 57685197 A +chr16 57683251 57685214 A +chr16 57683251 57685415 A +chr16 57683251 57685645 A +chr16 57683251 57685866 A +chr16 57683251 57685899 A +chr16 57683251 57686214 A +chr16 57683251 57686313 A +chr16 57683251 57686523 A +chr16 57683252 
57682562 A +chr16 57683252 57684728 A +chr16 57683252 57684854 A +chr16 57683252 57684978 A +chr16 57683252 57685067 A +chr16 57683252 57685178 A +chr16 57683252 57685317 A +chr16 57683252 57685380 A +chr16 57683252 57685536 A +chr16 57683252 57685692 A +chr16 57683252 57686100 A +chr16 57683252 57686137 A +chr16 57683252 57686472 A +chr16 57683252 57686821 A +chr16 57683252 57686911 A +chr16 57683252 57687163 A +chr16 57683252 57687180 A +chr16 57683252 57687638 A +chr16 57683253 57683633 A +chr16 57683253 57683796 A +chr16 57683253 57683950 A +chr16 57683253 57684240 A +chr16 57683253 57684310 A +chr16 57683253 57684474 A +chr16 57683253 57684478 A +chr16 57683253 57684490 A +chr16 57683253 57684527 A +chr16 57683253 57684537 A +chr16 57683253 57685017 A +chr16 57683253 57685072 A +chr16 57683253 57685467 A +chr16 57683253 57685473 A +chr16 57683253 57685601 A +chr16 57683253 57685646 A +chr16 57683253 57685657 A +chr16 57683253 57685734 A +chr16 57683253 57685781 A +chr16 57683253 57685824 A +chr16 57683253 57686053 A +chr16 57683253 57686416 A +chr16 57683253 57686490 A +chr16 57683253 57686594 A +chr16 57683253 57687479 A +chr16 57683254 57682609 A +chr16 57683254 57683599 A +chr16 57683254 57683784 A +chr16 57683254 57683882 A +chr16 57683254 57684243 A +chr16 57683254 57684415 A +chr16 57683254 57685000 A +chr16 57683254 57685089 A +chr16 57683254 57685099 A +chr16 57683254 57685192 A +chr16 57683254 57685246 A +chr16 57683254 57685303 A +chr16 57683254 57685395 A +chr16 57683254 57685655 A +chr16 57683254 57685678 A +chr16 57683254 57685759 A +chr16 57683254 57685807 A +chr16 57683254 57685912 A +chr16 57683254 57685988 A +chr16 57683254 57685997 A +chr16 57683254 57686068 A +chr16 57683254 57686096 A +chr16 57683254 57686576 A +chr16 57683254 57687210 A +chr16 57683255 57683337 A +chr16 57683255 57683717 A +chr16 57683255 57684068 A +chr16 57683255 57684195 A +chr16 57683255 57684209 A +chr16 57683255 57684381 A +chr16 57683255 57684487 A +chr16 57683255 
57684914 A +chr16 57683255 57684940 A +chr16 57683255 57685068 A +chr16 57683255 57685251 A +chr16 57683255 57685283 A +chr16 57683255 57685323 A +chr16 57683255 57685699 A +chr16 57683255 57685793 A +chr16 57683255 57685844 A +chr16 57683255 57685947 A +chr16 57683255 57686014 A +chr16 57683255 57686131 A +chr16 57683255 57686323 A +chr16 57683255 57686342 A +chr16 57683255 57686401 A +chr16 57683255 57686443 A +chr16 57683255 57686542 A +chr16 57683255 57686862 A +chr16 57683255 57687001 A +chr16 57683256 57682911 A +chr16 57683256 57683406 A +chr16 57683256 57683515 A +chr16 57683256 57683739 A +chr16 57683256 57684180 A +chr16 57683256 57684439 A +chr16 57683256 57684909 A +chr16 57683256 57685032 A +chr16 57683256 57685261 A +chr16 57683256 57685377 A +chr16 57683256 57685437 A +chr16 57683256 57685681 A +chr16 57683256 57685705 A +chr16 57683256 57685705 A +chr16 57683256 57685944 A +chr16 57683256 57685968 A +chr16 57683256 57685976 A +chr16 57683256 57686106 A +chr16 57683256 57686317 A +chr16 57683256 57686548 A +chr16 57683256 57686566 A +chr16 57683256 57686625 A +chr16 57683256 57686707 A +chr16 57683257 57684207 A +chr16 57683257 57684279 A +chr16 57683257 57684542 A +chr16 57683257 57684581 A +chr16 57683257 57684673 A +chr16 57683257 57684748 A +chr16 57683257 57685028 A +chr16 57683257 57685165 A +chr16 57683257 57685174 A +chr16 57683257 57685187 A +chr16 57683257 57685510 A +chr16 57683257 57685687 A +chr16 57683257 57685808 A +chr16 57683257 57685825 A +chr16 57683257 57685885 A +chr16 57683257 57686018 A +chr16 57683257 57686058 A +chr16 57683257 57686362 A +chr16 57683257 57686454 A +chr16 57683257 57686496 A +chr16 57683257 57686593 A +chr16 57683257 57686647 A +chr16 57683257 57686851 A +chr16 57683258 57683102 A +chr16 57683258 57683399 A +chr16 57683258 57683851 A +chr16 57683258 57684078 A +chr16 57683258 57684082 A +chr16 57683258 57684589 A +chr16 57683258 57684646 A +chr16 57683258 57684674 A +chr16 57683258 57684799 A +chr16 57683258 
57684813 A +chr16 57683258 57684876 A +chr16 57683258 57684986 A +chr16 57683258 57685383 A +chr16 57683258 57685504 A +chr16 57683258 57685961 A +chr16 57683258 57686522 A +chr16 57683258 57686767 A +chr16 57683259 57684057 A +chr16 57683259 57684137 A +chr16 57683259 57684255 A +chr16 57683259 57684344 A +chr16 57683259 57684570 A +chr16 57683259 57684695 A +chr16 57683259 57684806 A +chr16 57683259 57684993 A +chr16 57683259 57685038 A +chr16 57683259 57685296 A +chr16 57683259 57685299 A +chr16 57683259 57685370 A +chr16 57683259 57685718 A +chr16 57683259 57685859 A +chr16 57683259 57686314 A +chr16 57683259 57686355 A +chr16 57683259 57686379 A +chr16 57683259 57686436 A +chr16 57683259 57686820 A +chr16 57683260 57683876 A +chr16 57683260 57684331 A +chr16 57683260 57684523 A +chr16 57683260 57684523 A +chr16 57683260 57684752 A +chr16 57683260 57684779 A +chr16 57683260 57684800 A +chr16 57683260 57684932 A +chr16 57683260 57684976 A +chr16 57683260 57685027 A +chr16 57683260 57685059 A +chr16 57683260 57685179 A +chr16 57683260 57685356 A +chr16 57683260 57685597 A +chr16 57683260 57685803 A +chr16 57683260 57685859 A +chr16 57683260 57686001 A +chr16 57683260 57686307 A +chr16 57683260 57686455 A +chr16 57683261 57682834 A +chr16 57683261 57683918 A +chr16 57683261 57683942 A +chr16 57683261 57684671 A +chr16 57683261 57685108 A +chr16 57683261 57685195 A +chr16 57683261 57685222 A +chr16 57683261 57685277 A +chr16 57683261 57685329 A +chr16 57683261 57685397 A +chr16 57683261 57685400 A +chr16 57683261 57685528 A +chr16 57683261 57685539 A +chr16 57683261 57685573 A +chr16 57683261 57685655 A +chr16 57683261 57685741 A +chr16 57683261 57685952 A +chr16 57683261 57686037 A +chr16 57683261 57686054 A +chr16 57683261 57686112 A +chr16 57683261 57686368 A +chr16 57683261 57686956 A +chr16 57683262 57683453 A +chr16 57683262 57683519 A +chr16 57683262 57683636 A +chr16 57683262 57684281 A +chr16 57683262 57684747 A +chr16 57683262 57684799 A +chr16 57683262 
57684885 A +chr16 57683262 57685323 A +chr16 57683262 57685359 A +chr16 57683262 57685445 A +chr16 57683262 57685629 A +chr16 57683262 57685724 A +chr16 57683262 57685919 A +chr16 57683262 57686463 A +chr16 57683262 57686480 A +chr16 57683262 57686491 A +chr16 57683262 57686854 A +chr16 57683262 57687163 A +chr16 57683263 57683271 A +chr16 57683263 57683369 A +chr16 57683263 57683463 A +chr16 57683263 57683627 A +chr16 57683263 57683980 A +chr16 57683263 57684069 A +chr16 57683263 57684393 A +chr16 57683263 57684418 A +chr16 57683263 57684923 A +chr16 57683263 57685172 A +chr16 57683263 57685188 A +chr16 57683263 57685240 A +chr16 57683263 57685378 A +chr16 57683263 57685626 A +chr16 57683263 57685678 A +chr16 57683263 57685761 A +chr16 57683263 57685836 A +chr16 57683263 57685900 A +chr16 57683263 57686146 A +chr16 57683264 57683561 A +chr16 57683264 57684482 A +chr16 57683264 57684493 A +chr16 57683264 57684539 A +chr16 57683264 57684547 A +chr16 57683264 57684768 A +chr16 57683264 57684823 A +chr16 57683264 57684932 A +chr16 57683264 57685320 A +chr16 57683264 57685453 A +chr16 57683264 57685828 A +chr16 57683264 57685833 A +chr16 57683264 57685852 A +chr16 57683264 57685856 A +chr16 57683264 57686020 A +chr16 57683264 57687447 A +chr16 57683265 57683878 A +chr16 57683265 57684043 A +chr16 57683265 57684063 A +chr16 57683265 57684353 A +chr16 57683265 57684740 A +chr16 57683265 57684993 A +chr16 57683265 57685191 A +chr16 57683265 57685359 A +chr16 57683265 57685481 A +chr16 57683265 57685730 A +chr16 57683265 57685827 A +chr16 57683265 57686185 A +chr16 57683265 57687260 A +chr16 57683266 57683688 A +chr16 57683266 57684415 A +chr16 57683266 57684534 A +chr16 57683266 57684537 A +chr16 57683266 57685267 A +chr16 57683266 57685396 A +chr16 57683266 57685557 A +chr16 57683266 57685559 A +chr16 57683266 57685718 A +chr16 57683266 57685719 A +chr16 57683266 57685912 A +chr16 57683266 57686013 A +chr16 57683266 57686175 A +chr16 57683266 57686191 A +chr16 57683266 
57686361 A +chr16 57683266 57686364 A +chr16 57683266 57686502 A +chr16 57683266 57686568 A +chr16 57683267 57683349 A +chr16 57683267 57683512 A +chr16 57683267 57683526 A +chr16 57683267 57684074 A +chr16 57683267 57684393 A +chr16 57683267 57684416 A +chr16 57683267 57684862 A +chr16 57683267 57684869 A +chr16 57683267 57684881 A +chr16 57683267 57684979 A +chr16 57683267 57685277 A +chr16 57683267 57686038 A +chr16 57683267 57686485 A +chr16 57683267 57686958 A +chr16 57683268 57683142 A +chr16 57683268 57683326 A +chr16 57683268 57683390 A +chr16 57683268 57683877 A +chr16 57683268 57684024 A +chr16 57683268 57684343 A +chr16 57683268 57684364 A +chr16 57683268 57684404 A +chr16 57683268 57684582 A +chr16 57683268 57684695 A +chr16 57683268 57684946 A +chr16 57683268 57685220 A +chr16 57683268 57685369 A +chr16 57683268 57685388 A +chr16 57683268 57685646 A +chr16 57683268 57685673 A +chr16 57683268 57685678 A +chr16 57683268 57685703 A +chr16 57683268 57685747 A +chr16 57683268 57685938 A +chr16 57683268 57686048 A +chr16 57683268 57686097 A +chr16 57683268 57686339 A +chr16 57683268 57686433 A +chr16 57683268 57686544 A +chr16 57683268 57686751 A +chr16 57683268 57687219 A +chr16 57683269 57683005 A +chr16 57683269 57683953 A +chr16 57683269 57684751 A +chr16 57683269 57684757 A +chr16 57683269 57684769 A +chr16 57683269 57684791 A +chr16 57683269 57684834 A +chr16 57683269 57684869 A +chr16 57683269 57684952 A +chr16 57683269 57684955 A +chr16 57683269 57685074 A +chr16 57683269 57685099 A +chr16 57683269 57685189 A +chr16 57683269 57685228 A +chr16 57683269 57685248 A +chr16 57683269 57685390 A +chr16 57683269 57685556 A +chr16 57683269 57685822 A +chr16 57683269 57685910 A +chr16 57683270 57683578 A +chr16 57683270 57683912 A +chr16 57683270 57684073 A +chr16 57683270 57684273 A +chr16 57683270 57684761 A +chr16 57683270 57684951 A +chr16 57683270 57684995 A +chr16 57683270 57685278 A +chr16 57683270 57685351 A +chr16 57683270 57685496 A +chr16 57683270 
57685668 A +chr16 57683270 57685773 A +chr16 57683270 57685994 A +chr16 57683270 57686067 A +chr16 57683270 57687207 A +chr16 57683271 57683082 A +chr16 57683271 57683416 A +chr16 57683271 57683511 A +chr16 57683271 57683998 A +chr16 57683271 57684307 A +chr16 57683271 57684456 A +chr16 57683271 57684589 A +chr16 57683271 57684826 A +chr16 57683271 57685346 A +chr16 57683271 57685355 A +chr16 57683271 57685551 A +chr16 57683271 57685581 A +chr16 57683271 57685642 A +chr16 57683271 57685717 A +chr16 57683271 57686026 A +chr16 57683271 57686155 A +chr16 57683271 57686645 A +chr16 57683271 57687053 A +chr16 57683271 57687298 A +chr16 57683272 57683835 A +chr16 57683272 57683975 A +chr16 57683272 57684332 A +chr16 57683272 57684930 A +chr16 57683272 57685105 A +chr16 57683272 57685268 A +chr16 57683272 57685279 A +chr16 57683272 57685353 A +chr16 57683272 57685568 A +chr16 57683272 57685604 A +chr16 57683272 57685652 A +chr16 57683272 57685706 A +chr16 57683272 57685874 A +chr16 57683272 57685969 A +chr16 57683272 57686236 A +chr16 57683272 57686647 A +chr16 57683272 57686902 A +chr16 57683272 57687046 A +chr16 57683272 57687177 A +chr16 57683272 57687241 A +chr16 57683273 57683673 A +chr16 57683273 57684143 A +chr16 57683273 57684723 A +chr16 57683273 57684739 A +chr16 57683273 57684789 A +chr16 57683273 57684842 A +chr16 57683273 57685004 A +chr16 57683273 57685131 A +chr16 57683273 57685249 A +chr16 57683273 57685296 A +chr16 57683273 57685511 A +chr16 57683273 57685609 A +chr16 57683273 57685778 A +chr16 57683273 57685937 A +chr16 57683273 57686004 A +chr16 57683273 57686072 A +chr16 57683273 57686074 A +chr16 57683273 57686311 A +chr16 57683273 57687061 A +chr16 57791126 57791446 - +chr16 57933601 57933921 - +chr16 58054590 58054910 - +chr16 58131133 58131453 - +chr16 58155848 58156168 - +chr16 58370580 58370900 - +chr16 58521649 58521969 - +chr16 58528910 58529230 - +chr16 58768130 58768450 - +chr16 58848178 58848498 - +chr16 63394947 63395267 - +chr16 65017931 
65018251 - +chr16 65151895 65152215 - +chr16 66309765 66310085 - +chr16 66349237 66349405 - +chr16 66440345 66440665 - +chr16 66512637 66512957 - +chr16 66517749 66517877 - +chr16 66623349 66623669 - +chr16 66642457 66642777 - +chr16 66684941 66685096 - +chr16 66773894 66774214 - +chr16 66864760 66865080 - +chr16 66901743 66902063 - +chr16 66907067 66907387 - +chr16 66929087 66929407 - +chr16 66981060 66981380 - +chr16 66987991 66988311 - +chr16 66994941 66995261 - +chr16 67011568 67011888 - +chr16 67051740 67052060 - +chr16 67189286 67189606 - +chr16 67294589 67294909 - +chr16 67462047 67462367 - +chr16 67499818 67499971 - +chr16 67515106 67515426 - +chr16 67527910 67528230 - +chr16 67555071 67555187 - +chr16 67555519 67555654 - +chr16 67580893 67581213 - +chr16 67678683 67678802 - +chr16 67706631 67706951 - +chr16 67850531 67850727 - +chr16 67875683 67876003 - +chr16 67917639 67917959 - +chr16 67962406 67962726 - +chr16 68027165 68027485 - +chr16 68074511 68074831 - +chr16 68098656 68098976 - +chr16 68263315 68263635 - +chr16 68295488 68295808 - +chr16 68297895 68298215 - +chr16 68362562 68362882 - +chr16 68430474 68430794 - +chr16 68544191 68544325 - +chr16 68554604 68554777 - +chr16 68738264 68738584 - +chr16 68834214 68834534 - +chr16 68936316 68936636 - +chr16 69146591 69146911 - +chr16 69345026 69345346 - +chr16 69432214 69432534 - +chr16 69788850 69788996 - +chr16 70285778 70286098 - +chr16 70287757 70288077 - +chr16 70301463 70301783 - +chr16 70429266 70429586 - +chr16 70687672 70687992 - +chr16 70772417 70772520 - +chr16 70805122 70805249 - +chr16 71345534 71345854 - +chr16 71446639 71446959 - +chr16 71644959 71645279 - +chr16 71916354 71916674 - +chr16 72158786 72159106 - +chr16 72275870 72276007 - +chr16 72434188 72434508 - +chr16 72536149 72536241 - +chr16 72866522 72866715 - +chr16 73990536 73990856 - +chr16 74743319 74743639 - +chr16 74847109 74847220 - +chr16 75145276 75145365 - +chr16 75233241 75233561 - +chr16 75267749 75268069 - +chr16 75498677 
75498904 - +chr16 75551291 75551611 - +chr16 77270023 77270207 - +chr16 77618174 77618494 - +chr16 77633744 77634064 - +chr16 77709641 77709961 - +chr16 78005509 78005829 - +chr16 78036867 78037187 - +chr16 78133844 78134164 - +chr16 80351715 80352035 - +chr16 80560359 80560679 - +chr16 81069239 81069399 - +chr16 81294672 81294992 - +chr16 81437151 81437471 - +chr16 81771622 81771942 - +chr16 81907504 81907824 - +chr16 81919635 81919955 - +chr16 81923996 81924316 - +chr16 82263751 82263877 - +chr16 82297232 82297366 - +chr16 82490242 82490562 - +chr16 82673365 82673685 - +chr16 83249149 83249231 - +chr16 83859716 83860036 - +chr16 83960355 83960489 - +chr16 83990477 83990560 - +chr16 84096382 84096702 - +chr16 84118402 84118722 - +chr16 84220709 84220792 - +chr16 84310871 84311191 - +chr16 84544973 84545293 - +chr16 84706045 84706127 - +chr16 84772107 84772427 - +chr16 84958759 84959079 - +chr16 84985839 84986159 - +chr16 85004172 85004492 - +chr16 85148541 85148861 - +chr16 85279236 85279556 - +chr16 85286906 85287014 - +chr16 85309347 85309436 - +chr16 85350141 85350239 - +chr16 85403696 85403796 - +chr16 85427311 85427631 - +chr16 85454394 85454548 - +chr16 85524652 85524972 - +chr16 85644165 85644485 - +chr16 85649575 85649895 - +chr16 85693365 85693685 - +chr16 85852811 85853131 - +chr16 85855000 85855320 - +chr16 86050027 86050347 - +chr16 86374306 86374626 - +chr16 86551322 86551642 - +chr16 86553823 86553906 - +chr16 86597452 86597736 - +chr16 86609310 86609630 - +chr16 86794845 86794987 - +chr16 86907774 86908094 - +chr16 87096317 87096637 - +chr16 87236791 87237111 - +chr16 87312252 87312572 - +chr16 87358435 87358516 - +chr16 87584015 87584335 - +chr16 87622380 87622700 - +chr16 87670017 87670337 - +chr16 87811628 87811737 - +chr16 87883669 87883787 - +chr16 87943084 87943201 - +chr16 88320890 88321210 - +chr16 88450181 88450501 - +chr16 88503833 88504153 - +chr16 88509443 88509763 - +chr16 88593895 88594215 - +chr16 88751599 88751919 - +chr16 88803730 
88804050 - +chr16 88810191 88810511 - +chr16 88851875 88852195 - +chr16 88924176 88924496 - +chr16 89233894 89234214 - +chr16 89307194 89307514 - +chr16 89355904 89356224 - +chr16 89623888 89624208 - +chr16 89641039 89641359 - +chr16 89707654 89707974 - +chr16 89927311 89927518 - +chr16 89976762 89977082 - +chr16 90148414 90148616 - +chr17 262034 262354 - +chr17 655536 655623 - +chr17 899774 900094 - +chr17 1026380 1026544 - +chr17 1090605 1090734 - +chr17 1162404 1162724 - +chr17 1179472 1179792 - +chr17 1235846 1236166 - +chr17 1268763 1269083 - +chr17 1477746 1478066 - +chr17 1552094 1552414 - +chr17 1572646 1572966 - +chr17 1624226 1624353 - +chr17 1687331 1687651 - +chr17 1812003 1812323 - +chr17 1929313 1929633 - +chr17 1961812 1962132 - +chr17 1998394 1998526 - +chr17 2148177 2148314 - +chr17 2285889 2286209 - +chr17 2319368 2319688 - +chr17 2731734 2732054 - +chr17 2752190 2752282 - +chr17 2916298 2916618 - +chr17 3586360 3586537 - +chr17 3641005 3641325 - +chr17 3749499 3749819 - +chr17 3817234 3817378 - +chr17 4103132 4103452 - +chr17 4118661 4118981 - +chr17 4316521 4316841 - +chr17 4323914 4324234 - +chr17 4468870 4469190 - +chr17 4693207 4693527 - +chr17 4785331 4785651 - +chr17 4792389 4792709 - +chr17 4812428 4812748 - +chr17 4883901 4884221 - +chr17 4934695 4935015 - +chr17 4965728 4966048 - +chr17 5342427 5342747 - +chr17 5490786 5491106 - +chr17 5610193 5610513 - +chr17 6325358 6325678 - +chr17 6339339 6339659 - +chr17 6376571 6376718 - +chr17 6569424 6569565 - +chr17 6817186 6817506 - +chr17 6925424 6925514 - +chr17 6955376 6955696 - +chr17 7039596 7039916 - +chr17 7080751 7081071 - +chr17 7183885 7184205 - +chr17 7199735 7200055 - +chr17 7227628 7227756 - +chr17 7259342 7259662 - +chr17 7340303 7340623 - +chr17 7463684 7464004 - +chr17 7464296 7464616 - +chr17 7492670 7492990 - +chr17 7591346 7591666 - +chr17 7647008 7647132 - +chr17 7726045 7726365 - +chr17 7738171 7738491 - +chr17 7755107 7755427 - +chr17 7893606 7893730 - +chr17 7977097 
7977190 - +chr17 7983239 7983559 - +chr17 7997176 7997496 - +chr17 8022066 8022386 - +chr17 8041885 8042205 - +chr17 8062109 8062429 - +chr17 8203971 8204291 - +chr17 8286506 8286826 - +chr17 8313602 8313922 - +chr17 8649288 8649608 - +chr17 8689160 8689480 - +chr17 8798291 8798611 - +chr17 8897566 8897886 - +chr17 9632471 9632604 - +chr17 9809262 9809582 - +chr17 9832195 9832515 - +chr17 9868946 9869266 - +chr17 9891147 9891467 - +chr17 10130296 10130431 - +chr17 10231009 10231141 - +chr17 10517910 10518230 - +chr17 10519908 10520104 - +chr17 10521557 10521686 - +chr17 10534410 10534730 - +chr17 10549743 10550063 - +chr17 10640426 10640746 - +chr17 10675765 10676085 - +chr17 10748496 10748816 - +chr17 11839251 11839365 - +chr17 12449257 12449577 - +chr17 12645619 12645939 - +chr17 12927859 12928052 - +chr17 12933744 12934064 - +chr17 12935149 12935469 - +chr17 13017846 13018166 - +chr17 13241143 13241463 - +chr17 13498110 13498430 - +chr17 14205894 14206214 - +chr17 15074870 15075022 - +chr17 15160677 15160997 - +chr17 15406153 15406473 - +chr17 16267201 16267317 - +chr17 16356560 16356880 - +chr17 16424614 16424934 - +chr17 16487556 16487876 - +chr17 16492763 16492918 - +chr17 16819331 16819651 - +chr17 16894746 16895066 - +chr17 16905196 16905337 - +chr17 17045195 17045515 - +chr17 17258237 17258557 - +chr17 17259432 17259658 - +chr17 17338746 17339066 - +chr17 17472050 17472370 - +chr17 17517023 17517343 - +chr17 17545150 17545470 - +chr17 17715625 17715791 - +chr17 17763643 17763797 - +chr17 17794467 17794575 - +chr17 17822525 17822845 - +chr17 18120792 18120978 - +chr17 18528961 18529281 - +chr17 18864150 18864470 - +chr17 19350438 19350758 - +chr17 19417225 19417545 - +chr17 19665848 19666168 - +chr17 19669893 19670056 - +chr17 19912570 19912662 - +chr17 20024953 20025273 - +chr17 20130081 20130401 - +chr17 20224133 20224453 - +chr17 20811632 20811952 - +chr17 20896297 20896464 - +chr17 20946881 20947201 - +chr17 21023100 21023191 - +chr17 21183414 21183572 
- +chr17 21360361 21360681 - +chr17 25563743 25564063 - +chr17 25659888 25660208 - +chr17 25928767 25929087 - +chr17 25930207 25930527 - +chr17 25978518 25978838 - +chr17 25981746 25982066 - +chr17 26083836 26084156 - +chr17 26087091 26087411 - +chr17 26132477 26132797 - +chr17 26207067 26207387 - +chr17 26220340 26220660 - +chr17 26578675 26578995 - +chr17 26743863 26744183 - +chr17 26821864 26821958 - +chr17 26867747 26868067 - +chr17 26875280 26875442 - +chr17 26940314 26940634 - +chr17 27053962 27054102 - +chr17 27077180 27077384 - +chr17 27090764 27091084 - +chr17 27230031 27230351 - +chr17 27280553 27280873 - +chr17 27292629 27292949 - +chr17 27438959 27439112 - +chr17 27444142 27444462 - +chr17 27475893 27476213 - +chr17 27507334 27507654 - +chr17 27546448 27546768 - +chr17 27907339 27907659 - +chr17 27918775 27918878 - +chr17 27940463 27940783 - +chr17 27966549 27966869 - +chr17 28258393 28258713 - +chr17 28348790 28349110 - +chr17 28431529 28431849 - +chr17 28667128 28667448 - +chr17 29024935 29025255 - +chr17 29764307 29764627 - +chr17 29852388 29852708 - +chr17 30169600 30169920 - +chr17 30334179 30334499 - +chr17 30580147 30580317 - +chr17 30646552 30646728 - +chr17 30771153 30771473 - +chr17 30986422 30986588 - +chr17 31003507 31003827 - +chr17 31204244 31204564 - +chr17 31236941 31237261 - +chr17 31263970 31264290 - +chr17 31325370 31325690 - +chr17 31376857 31377177 - +chr17 32078171 32078491 - +chr17 32106147 32106467 - +chr17 32498273 32498593 - +chr17 32527044 32527245 - +chr17 32688613 32688762 - +chr17 32747316 32747636 - +chr17 32869796 32870116 - +chr17 32965312 32965632 - +chr17 33325230 33325550 - +chr17 33446801 33447121 - +chr17 33469232 33469552 - +chr17 33639747 33640067 - +chr17 33866420 33866740 - +chr17 34074531 34074851 - +chr17 34091083 34091403 - +chr17 34131754 34132074 - +chr17 34135916 34136236 - +chr17 34274142 34274462 - +chr17 34336391 34336711 - +chr17 34476425 34476629 - +chr17 34838871 34839191 - +chr17 34959372 34959692 - 
+chr17 34985844 34985983 - +chr17 35404962 35405282 - +chr17 35730554 35730767 - +chr17 35754681 35755001 - +chr17 35887389 35887709 - +chr17 36130933 36131253 - +chr17 36204445 36204581 - +chr17 36505111 36505259 - +chr17 36571103 36571195 - +chr17 36579992 36580189 - +chr17 36600494 36600583 - +chr17 36648297 36648617 - +chr17 36665604 36665924 - +chr17 36714362 36714682 - +chr17 36740623 36740943 - +chr17 36762648 36762968 - +chr17 36819399 36819719 - +chr17 36823867 36824096 - +chr17 36833405 36833537 - +chr17 36891362 36891682 - +chr17 37309159 37309479 - +chr17 37321003 37321086 - +chr17 37331214 37331534 - +chr17 37388359 37388679 - +chr17 37391991 37392311 - +chr17 37401116 37401209 - +chr17 37704172 37704492 - +chr17 37778254 37778472 - +chr17 37779789 37779918 - +chr17 37790681 37791001 - +chr17 37810258 37810357 - +chr17 37828441 37828761 - +chr17 38169988 38170308 - +chr17 38183307 38183627 - +chr17 38191392 38191712 - +chr17 38195151 38195471 - +chr17 38231793 38232113 - +chr17 38267915 38268235 - +chr17 38269376 38269696 - +chr17 38358070 38358390 - +chr17 38375334 38375654 - +chr17 38511299 38511619 - +chr17 38514750 38515070 - +chr17 38518840 38519160 - +chr17 38597231 38597551 - +chr17 38716634 38716779 - +chr17 38776199 38776519 - +chr17 38959789 38960109 - +chr17 39065554 39065874 - +chr17 39478579 39478899 - +chr17 39485739 39485824 - +chr17 39653024 39653344 - +chr17 39705118 39705438 - +chr17 39705512 39705832 - +chr17 39735838 39736158 - +chr17 39805028 39805124 - +chr17 39818986 39819236 - +chr17 39822267 39822403 - +chr17 39823264 39823584 - +chr17 39852247 39852567 - +chr17 39894253 39894573 - +chr17 39957851 39958171 - +chr17 40114998 40115318 - +chr17 40171854 40172174 - +chr17 40176954 40177274 - +chr17 40219414 40219547 - +chr17 40253706 40253802 - +chr17 40268754 40269074 - +chr17 40374192 40374512 - +chr17 40474872 40475025 - +chr17 40580456 40580776 - +chr17 40669516 40669836 - +chr17 40679280 40679600 - +chr17 40730470 40730790 - 
+chr17 40831764 40832084 - +chr17 40915460 40915780 - +chr17 40971978 40972298 - +chr17 40976241 40976561 - +chr17 40995478 40995664 - +chr17 41089789 41090109 - +chr17 41150002 41150322 - +chr17 41176440 41176760 - +chr17 41387219 41387379 - +chr17 41411702 41411818 - +chr17 41416306 41416626 - +chr17 41561033 41561224 - +chr17 41654013 41654333 - +chr17 41751275 41751416 - +chr17 41755297 41755617 - +chr17 41771762 41772082 - +chr17 41837846 41837984 - +chr17 41843815 41844135 - +chr17 41923705 41923898 - +chr17 41956027 41956159 - +chr17 41957753 41958073 - +chr17 41987578 41987898 - +chr17 42017981 42018301 - +chr17 42021372 42021692 - +chr17 42072292 42072612 - +chr17 42193108 42193198 - +chr17 42219021 42219341 - +chr17 42245958 42246040 - +chr17 42281052 42281372 - +chr17 42287772 42287949 - +chr17 42428143 42428463 - +chr17 42600565 42600885 - +chr17 42610888 42611208 - +chr17 42852225 42852442 - +chr17 42856476 42856796 - +chr17 42942536 42942856 - +chr17 42997395 42997715 - +chr17 43176531 43176851 - +chr17 43209239 43209559 - +chr17 43210584 43210904 - +chr17 43212695 43212915 - +chr17 43226519 43226839 - +chr17 43229385 43229705 - +chr17 43275268 43275463 - +chr17 43296273 43296593 - +chr17 43313054 43313374 - +chr17 43354585 43354905 - +chr17 43462998 43463167 - +chr17 43505522 43505666 - +chr17 43717844 43717983 - +chr17 43909517 43909837 - +chr17 43930096 43930416 - +chr17 44026434 44026754 - +chr17 44079893 44080213 - +chr17 44121711 44122031 - +chr17 44290275 44290595 - +chr17 44913270 44913590 - +chr17 44920254 44920574 - +chr17 45018204 45018524 - +chr17 45020013 45020151 - +chr17 45078646 45078741 - +chr17 45307198 45307376 - +chr17 45741729 45742049 - +chr17 45776205 45776525 - +chr17 45918677 45918761 - +chr17 45920501 45920656 - +chr17 45967327 45967647 - +chr17 46024274 46024594 - +chr17 46026609 46026929 - +chr17 46048050 46048370 - +chr17 46055728 46056048 - +chr17 46095739 46096059 - +chr17 46103771 46104091 - +chr17 46114322 46114642 - 
+chr17 46208621 46208941 - +chr17 46385194 46385514 - +chr17 46534738 46534868 - +chr17 46569879 46570199 - +chr17 46662862 46662954 - +chr17 46671251 46671571 - +chr17 46679960 46680280 - +chr17 46695990 46696310 - +chr17 46749016 46749336 - +chr17 46755793 46756113 - +chr17 46801903 46802223 - +chr17 46868421 46868741 - +chr17 46882223 46882543 - +chr17 46891825 46892145 - +chr17 46956341 46956661 - +chr17 46962059 46962379 - +chr17 47109547 47109867 - +chr17 47113492 47113659 - +chr17 47257679 47257999 - +chr17 47300921 47301026 - +chr17 47392856 47393176 - +chr17 47472404 47472724 - +chr17 47485346 47485666 - +chr17 47633802 47633920 - +chr17 47634757 47635077 - +chr17 47645345 47645447 - +chr17 47647516 47647621 - +chr17 47772841 47773161 - +chr17 47785616 47785936 - +chr17 47929139 47929459 - +chr17 47938496 47938816 - +chr17 47957966 47958134 - +chr17 48029966 48030090 - +chr17 48073972 48074292 - +chr17 48075636 48075956 - +chr17 48104202 48104522 - +chr17 48164854 48165174 - +chr17 48171627 48171947 - +chr17 48238856 48239176 - +chr17 48246681 48247001 - +chr17 48277945 48278265 - +chr17 48347501 48347821 - +chr17 48439365 48439685 - +chr17 48474804 48475124 - +chr17 48555929 48556249 - +chr17 48559841 48560161 - +chr17 48624343 48624503 - +chr17 48628210 48628530 - +chr17 48703507 48703827 - +chr17 48707967 48708287 - +chr17 48758677 48758997 - +chr17 48860625 48860778 - +chr17 48919259 48919579 - +chr17 49027718 49028038 - +chr17 49034171 49034491 - +chr17 49244436 49244756 - +chr17 53373420 53373538 - +chr17 53426160 53426480 - +chr17 53434128 53434448 - +chr17 53591398 53591718 - +chr17 53603892 53603975 - +chr17 53679180 53679500 - +chr17 53719190 53719510 - +chr17 53813203 53813523 - +chr17 54203905 54204225 - +chr17 54731337 54731657 - +chr17 54857604 54857924 - +chr17 54897064 54897384 - +chr17 55038458 55038540 - +chr17 55087392 55087712 - +chr17 55191525 55191668 - +chr17 55208334 55208654 - +chr17 55320549 55320757 - +chr17 55675197 55675517 - 
+chr17 55749384 55749516 - +chr17 55824928 55825248 - +chr17 55860654 55860974 - +chr17 55910729 55911049 - +chr17 55935319 55935639 - +chr17 55952834 55953154 - +chr17 55969918 55970106 - +chr17 56190526 56190646 - +chr17 56327035 56327355 - +chr17 56381650 56381970 - +chr17 56394764 56395084 - +chr17 56494947 56495267 - +chr17 56524885 56525205 - +chr17 56655003 56655323 - +chr17 56665101 56665254 - +chr17 57082268 57082588 - +chr17 57406949 57407269 - +chr17 57598889 57599209 - +chr17 58114588 58114708 - +chr17 58498907 58499033 - +chr17 58513311 58513631 - +chr17 58678545 58678865 - +chr17 58798150 58798470 - +chr17 59467331 59467651 - +chr17 59489733 59490053 - +chr17 59494319 59494454 - +chr17 59713318 59713638 - +chr17 60143346 60143666 - +chr17 60781144 60781464 - +chr17 61502783 61503103 - +chr17 61556283 61556603 - +chr17 61689730 61689853 - +chr17 61934434 61934754 - +chr17 61996670 61996830 - +chr17 62067788 62068108 - +chr17 62103286 62103400 - +chr17 62359359 62359679 - +chr17 62487204 62487524 - +chr17 62645265 62645585 - +chr17 62773096 62773186 - +chr17 62915581 62915704 - +chr17 62935041 62935361 - +chr17 63096689 63097009 - +chr17 63133342 63133461 - +chr17 63227845 63228165 - +chr17 63290316 63290636 - +chr17 63552555 63552875 - +chr17 64196760 64197080 - +chr17 64563381 64563701 - +chr17 64902822 64903142 - +chr17 65289855 65290175 - +chr17 65370764 65371084 - +chr17 65383108 65383428 - +chr17 65471671 65471767 - +chr17 65713776 65714096 - +chr17 65775140 65775460 - +chr17 65796516 65796836 - +chr17 65821372 65821692 - +chr17 66168649 66168806 - +chr17 66197041 66197149 - +chr17 66201414 66201734 - +chr17 66395220 66395540 - +chr17 66432653 66432733 - +chr17 66462303 66462623 - +chr17 66756038 66756358 - +chr17 67522156 67522476 - +chr17 67603316 67603636 - +chr17 67605701 67606021 - +chr17 68164198 68164518 - +chr17 69469356 69469676 - +chr17 70114764 70115084 - +chr17 70536518 70536838 - +chr17 70553089 70553409 - +chr17 70556369 70556689 - 
+chr17 70721764 70721844 - +chr17 70723828 70724148 - +chr17 70982261 70982581 - +chr17 71152383 71152703 - +chr17 71187611 71187931 - +chr17 71251840 71252160 - +chr17 71307099 71307419 - +chr17 71425610 71425738 - +chr17 72172874 72173194 - +chr17 72195857 72196001 - +chr17 72199629 72199767 - +chr17 72206658 72206749 - +chr17 72364378 72364698 - +chr17 72489538 72489858 - +chr17 72740846 72740999 - +chr17 72787734 72787854 - +chr17 72858386 72858706 - +chr17 72861122 72861202 - +chr17 72954132 72954273 - +chr17 72956803 72957123 - +chr17 73101920 73102240 - +chr17 73110146 73110466 - +chr17 73127434 73127754 - +chr17 73234761 73235081 - +chr17 73400364 73400511 - +chr17 73408701 73409021 - +chr17 73422397 73422717 - +chr17 73511928 73512063 - +chr17 73597311 73597631 - +chr17 73629011 73629331 - +chr17 73649073 73649393 - +chr17 73745963 73746283 - +chr17 73844539 73844859 - +chr17 73857407 73857727 - +chr17 73901058 73901378 - +chr17 73996521 73996841 - +chr17 74001687 74002007 - +chr17 74010287 74010607 - +chr17 74117512 74117832 - +chr17 74178279 74178599 - +chr17 74275694 74275791 - +chr17 74394194 74394333 - +chr17 74404680 74405000 - +chr17 74456646 74456966 - +chr17 74462088 74462408 - +chr17 74502502 74502822 - +chr17 74519297 74519422 - +chr17 74619993 74620313 - +chr17 74677334 74677654 - +chr17 74723117 74723437 - +chr17 74812659 74812979 - +chr17 75052480 75052587 - +chr17 75062888 75063208 - +chr17 75082599 75082919 - +chr17 75243248 75243328 - +chr17 75327511 75327831 - +chr17 75426168 75426488 - +chr17 75491048 75491368 - +chr17 75538934 75539025 - +chr17 76081054 76081374 - +chr17 76108832 76108926 - +chr17 76172650 76172970 - +chr17 76247037 76247255 - +chr17 76285876 76286196 - +chr17 76356870 76357190 - +chr17 76418219 76418353 - +chr17 76704612 76704932 - +chr17 76861957 76862077 - +chr17 76967386 76967706 - +chr17 76989263 76989583 - +chr17 77038570 77038698 - +chr17 77042056 77042199 - +chr17 77078745 77079065 - +chr17 77083674 77083994 - 
+chr17 77387878 77388198 - +chr17 77389400 77389720 - +chr17 77397309 77397448 - +chr17 77682461 77682781 - +chr17 77715843 77715980 - +chr17 77719966 77720286 - +chr17 77763669 77763989 - +chr17 77813579 77813899 - +chr17 77979700 77979850 - +chr17 77982204 77982524 - +chr17 78169861 78170181 - +chr17 78400695 78401015 - +chr17 78549191 78549406 - +chr17 78840024 78840344 - +chr17 78873171 78873491 - +chr17 79075575 79075817 - +chr17 79215457 79215777 - +chr17 79265536 79265856 - +chr17 79451338 79451658 - +chr17 79700783 79701103 - +chr17 79827221 79827541 - +chr17 79977803 79978123 - +chr17 79979035 79979355 - +chr17 80111765 80112085 - +chr17 80163736 80164056 - +chr17 80194270 80194590 - +chr17 80258659 80258979 - +chr17 80279369 80279689 - +chr17 80309716 80310036 - +chr17 80382473 80382793 - +chr17 80452715 80452827 - +chr17 80509728 80510048 - +chr17 80512329 80512649 - +chr17 80732860 80733180 - +chr17 80739765 80739925 - +chr17 80921100 80921420 - +chr18 839368 839688 - +chr18 864879 865060 - +chr18 961623 961943 - +chr18 2104907 2105227 - +chr18 3062987 3063307 - +chr18 3129299 3129619 - +chr18 3151457 3151777 - +chr18 3246045 3246365 - +chr18 3594424 3594514 - +chr18 3666298 3666618 - +chr18 3703681 3704001 - +chr18 3771340 3771660 - +chr18 3995527 3995847 - +chr18 4068578 4068898 - +chr18 4106457 4106777 - +chr18 5546704 5547024 - +chr18 6219163 6219483 - +chr18 6920039 6920359 - +chr18 7221431 7221751 - +chr18 8389668 8389988 - +chr18 8478765 8478921 - +chr18 8696767 8697087 - +chr18 8988289 8988609 - +chr18 9516511 9516831 - +chr18 9575494 9575814 - +chr18 9771601 9771733 - +chr18 10520121 10520441 - +chr18 10787331 10787651 - +chr18 11310520 11310840 - +chr18 11553459 11553565 - +chr18 11655337 11655657 - +chr18 11732962 11733282 - +chr18 11995454 11995774 - +chr18 12039095 12039188 - +chr18 12237231 12237551 - +chr18 12327925 12328245 - +chr18 12391766 12391932 - +chr18 12659896 12660216 - +chr18 12727078 12727398 - +chr18 12893811 12894131 - 
+chr18 12934511 12934831 - +chr18 13159522 13159842 - +chr18 13813288 13813608 - +chr18 13823882 13824202 - +chr18 19069098 19069418 - +chr18 19192013 19192333 - +chr18 19477736 19478056 - +chr18 19536038 19536358 - +chr18 19577580 19577900 - +chr18 20510075 20510395 - +chr18 20958845 20959165 - +chr18 21078781 21079101 - +chr18 21085512 21085832 - +chr18 21096040 21096125 - +chr18 21166897 21167217 - +chr18 21202221 21202541 - +chr18 21498440 21498760 - +chr18 21756734 21757054 - +chr18 22067632 22067952 - +chr18 22068900 22069220 - +chr18 22306720 22307040 - +chr18 23412030 23412350 - +chr18 23476146 23476466 - +chr18 25179230 25179550 - +chr18 25769393 25769713 - +chr18 29738018 29738338 - +chr18 32241490 32241810 - +chr18 32446758 32447078 - +chr18 32723974 32724294 - +chr18 32810955 32811275 - +chr18 32935065 32935385 - +chr18 32966499 32966819 - +chr18 33530471 33530604 - +chr18 34268013 34268333 - +chr18 34902405 34902725 - +chr18 34923408 34923494 - +chr18 35117871 35118191 - +chr18 38971623 38971943 - +chr18 39608229 39608549 - +chr18 39610581 39610901 - +chr18 39878746 39879066 - +chr18 42792852 42792954 - +chr18 42855501 42855821 - +chr18 43201030 43201207 - +chr18 43245194 43245313 - +chr18 43271270 43271590 - +chr18 43302894 43303214 - +chr18 43419435 43419755 - +chr18 43460026 43460196 - +chr18 43490402 43490722 - +chr18 43753410 43753533 - +chr18 43861238 43861361 - +chr18 44061072 44061392 - +chr18 44626631 44626951 - +chr18 44731332 44731652 - +chr18 45275651 45275766 - +chr18 45282919 45283007 - +chr18 45299008 45299328 - +chr18 45938744 45939064 - +chr18 45971984 45972090 - +chr18 45973013 45973333 - +chr18 46002275 46002595 - +chr18 46190789 46190886 - +chr18 46201620 46201940 - +chr18 46203176 46203496 - +chr18 46303448 46303768 - +chr18 47037038 47037358 - +chr18 47385136 47385456 - +chr18 47883638 47883958 - +chr18 48346273 48346593 - +chr18 48636717 48637037 - +chr18 52901877 52902197 - +chr18 53051817 53052137 - +chr18 53082543 53082863 - 
+chr18 53448908 53449228 - +chr18 55225140 55225460 - +chr18 55549061 55549381 - +chr18 55710766 55711086 - +chr18 55789157 55789477 - +chr18 55803046 55803366 - +chr18 56161066 56161386 - +chr18 56197031 56197351 - +chr18 56431644 56431777 - +chr18 56435671 56435991 - +chr18 56448798 56449118 - +chr18 56724460 56724780 - +chr18 56737697 56738017 - +chr18 56751442 56751762 - +chr18 56916435 56916755 - +chr18 56975720 56976040 - +chr18 57069907 57069993 - +chr18 57364654 57364737 - +chr18 57490876 57491196 - +chr18 57655374 57655694 - +chr18 57663624 57663944 - +chr18 59218769 59219089 - +chr18 59249174 59249308 - +chr18 59303069 59303389 - +chr18 59568715 59568875 - +chr18 59798594 59798914 - +chr18 59850975 59851077 - +chr18 60092936 60093256 - +chr18 60251672 60251803 - +chr18 60253526 60253649 - +chr18 60683511 60683831 - +chr18 61106534 61106854 - +chr18 61178530 61178850 - +chr18 61630895 61631215 - +chr18 61646618 61646938 - +chr18 61662067 61662387 - +chr18 64771958 64772278 - +chr18 66290622 66290942 - +chr18 66403010 66403330 - +chr18 68101156 68101476 - +chr18 68121980 68122300 - +chr18 68159003 68159323 - +chr18 70211578 70211898 - +chr18 71510127 71510300 - +chr18 71707973 71708293 - +chr18 71863648 71863968 - +chr18 72143428 72143748 - +chr18 72219222 72219542 - +chr18 74061061 74061381 - +chr18 74110976 74111112 - +chr18 74421760 74422080 - +chr18 74513993 74514313 - +chr18 74515727 74516047 - +chr18 74695385 74695475 - +chr18 74712493 74712686 - +chr18 74766944 74767264 - +chr18 74923826 74924146 - +chr18 77006630 77006775 - +chr18 77138996 77139316 - +chr18 77140838 77141158 - +chr18 77288799 77289119 - +chr18 77404115 77404435 - +chr18 77915735 77915869 - +chr18 77929111 77929431 - +chr18 78015836 78015924 - +chr19 267519 267603 - +chr19 495664 495984 - +chr19 506703 507023 - +chr19 510730 510902 - +chr19 584498 584649 - +chr19 665009 665094 - +chr19 734171 734287 - +chr19 770267 770587 - +chr19 1001682 1002002 - +chr19 1064893 1065213 - +chr19 
1113238 1113558 - +chr19 1244384 1244704 - +chr19 1266995 1267315 - +chr19 1279728 1280048 - +chr19 1438135 1438455 - +chr19 1445186 1445506 - +chr19 1503478 1503798 - +chr19 1513883 1514203 - +chr19 1546290 1546456 - +chr19 1624447 1624767 - +chr19 1672248 1672568 - +chr19 1767457 1767777 - +chr19 1788821 1789141 - +chr19 1795836 1796156 - +chr19 1826142 1826462 - +chr19 1837712 1837855 - +chr19 1848690 1849010 - +chr19 1853345 1853665 - +chr19 2019591 2019911 - +chr19 2069812 2069919 - +chr19 2156087 2156234 - +chr19 2236256 2236576 - +chr19 2261845 2262165 - +chr19 2273740 2274060 - +chr19 2391302 2391622 - +chr19 2579471 2579791 - +chr19 2611209 2611529 - +chr19 2613832 2614152 - +chr19 2637007 2637327 - +chr19 2749329 2749473 - +chr19 2819717 2819827 - +chr19 2951034 2951354 - +chr19 3075976 3076296 - +chr19 3108705 3109025 - +chr19 3126025 3126345 - +chr19 3146562 3146882 - +chr19 3163375 3163526 - +chr19 3304956 3305276 - +chr19 3324457 3324777 - +chr19 3326970 3327290 - +chr19 3429605 3429780 - +chr19 3472009 3472329 - +chr19 3473043 3473363 - +chr19 3485022 3485342 - +chr19 3545085 3545405 - +chr19 3618582 3618902 - +chr19 3665844 3666164 - +chr19 3706099 3706419 - +chr19 3719113 3719433 - +chr19 3822021 3822126 - +chr19 3987800 3988120 - +chr19 4140368 4140688 - +chr19 4186433 4186753 - +chr19 4204623 4204943 - +chr19 4233376 4233696 - +chr19 4352688 4352826 - +chr19 4400587 4400907 - +chr19 4523298 4523618 - +chr19 4525150 4525470 - +chr19 4558472 4558664 - +chr19 4567685 4567810 - +chr19 4633249 4633569 - +chr19 4636263 4636583 - +chr19 4732658 4732978 - +chr19 4791248 4791335 - +chr19 4809684 4809820 - +chr19 4811699 4812019 - +chr19 4851657 4851977 - +chr19 4872640 4872960 - +chr19 5031630 5031950 - +chr19 5048402 5048536 - +chr19 5578360 5578506 - +chr19 5687138 5687458 - +chr19 5690177 5690286 - +chr19 5720637 5720755 - +chr19 5794162 5794482 - +chr19 5804112 5804298 - +chr19 5805892 5806212 - +chr19 5892570 5892890 - +chr19 5967953 5968273 - +chr19 
5987511 5987597 - +chr19 6048076 6048200 - +chr19 6048836 6049156 - +chr19 6125566 6125886 - +chr19 6165996 6166316 - +chr19 6218633 6218713 - +chr19 6516886 6517206 - +chr19 6632062 6632382 - +chr19 6633263 6633583 - +chr19 6736561 6736881 - +chr19 6862803 6863123 - +chr19 6863618 6863938 - +chr19 7406139 7406239 - +chr19 7419285 7419605 - +chr19 7717012 7717126 - +chr19 7918137 7918457 - +chr19 7990060 7990380 - +chr19 8035678 8035998 - +chr19 8067373 8067525 - +chr19 8317896 8318216 - +chr19 8333787 8334107 - +chr19 8418837 8419157 - +chr19 8571057 8571377 - +chr19 8643743 8643834 - +chr19 8645717 8645900 - +chr19 8680711 8680861 - +chr19 8764083 8764245 - +chr19 9190499 9190819 - +chr19 9247176 9247496 - +chr19 9316304 9316434 - +chr19 9542174 9542494 - +chr19 9546124 9546444 - +chr19 9697399 9697719 - +chr19 9736506 9736826 - +chr19 9900847 9901167 - +chr19 9906880 9907200 - +chr19 9957797 9958117 - +chr19 10041556 10041876 - +chr19 10044775 10045095 - +chr19 10111016 10111134 - +chr19 10138431 10138751 - +chr19 10230505 10230825 - +chr19 10360785 10361105 - +chr19 10369691 10370011 - +chr19 10380235 10380334 - +chr19 10499577 10499897 - +chr19 10501398 10501718 - +chr19 10514800 10515120 - +chr19 10527188 10527508 - +chr19 10535007 10535327 - +chr19 10616273 10616593 - +chr19 10620924 10621244 - +chr19 10652124 10652444 - +chr19 10683143 10683246 - +chr19 10691729 10691932 - +chr19 10755485 10755805 - +chr19 10968947 10969267 - +chr19 10975207 10975318 - +chr19 11052717 11053037 - +chr19 11146617 11146937 - +chr19 11236793 11237113 - +chr19 11241754 11241877 - +chr19 11248380 11248700 - +chr19 11314517 11314837 - +chr19 11404533 11404688 - +chr19 11545770 11546090 - +chr19 11557582 11557902 - +chr19 11593447 11593767 - +chr19 11607157 11607477 - +chr19 11646896 11647216 - +chr19 11698561 11698697 - +chr19 11908262 11908582 - +chr19 11992028 11992348 - +chr19 12034466 12034786 - +chr19 12099701 12100021 - +chr19 12170291 12170611 - +chr19 12251988 12252308 - 
+chr19 12476660 12476980 - +chr19 12675040 12675126 - +chr19 12782859 12783179 - +chr19 12787164 12787484 - +chr19 12868406 12868726 - +chr19 12871406 12871726 - +chr19 12886352 12886672 - +chr19 12888401 12888721 - +chr19 12907451 12907581 - +chr19 12943433 12943753 - +chr19 12958417 12958737 - +chr19 13075838 13075980 - +chr19 13076258 13076392 - +chr19 13088740 13088912 - +chr19 13094414 13094734 - +chr19 13144912 13145232 - +chr19 13172214 13172534 - +chr19 13204618 13204938 - +chr19 13345391 13345711 - +chr19 13388811 13389131 - +chr19 13541943 13542058 - +chr19 13700087 13700407 - +chr19 13832201 13832320 - +chr19 13973097 13973417 - +chr19 13990970 13991290 - +chr19 14143027 14143347 - +chr19 14159490 14159570 - +chr19 14169778 14170098 - +chr19 14185905 14186225 - +chr19 14247599 14247919 - +chr19 14259028 14259151 - +chr19 14328192 14328512 - +chr19 14459378 14459504 - +chr19 14583004 14583324 - +chr19 14590884 14591204 - +chr19 14597230 14597550 - +chr19 14616234 14616554 - +chr19 14667600 14667681 - +chr19 14682809 14683129 - +chr19 15212328 15212648 - +chr19 15288301 15288621 - +chr19 15374870 15375190 - +chr19 15667231 15667551 - +chr19 15695474 15695794 - +chr19 15740123 15740443 - +chr19 16136474 16136794 - +chr19 16152345 16152665 - +chr19 16173638 16173780 - +chr19 16222098 16222418 - +chr19 16293681 16293778 - +chr19 16358418 16358738 - +chr19 16494075 16494395 - +chr19 16497479 16497799 - +chr19 16528542 16528862 - +chr19 16825543 16825863 - +chr19 16889639 16889959 - +chr19 17178264 17178584 - +chr19 17214009 17214329 - +chr19 17371686 17372006 - +chr19 17393488 17393808 - +chr19 17403623 17403943 - +chr19 17457015 17457335 - +chr19 17496907 17497050 - +chr19 17564259 17564579 - +chr19 17577181 17577276 - +chr19 17607184 17607304 - +chr19 17685021 17685177 - +chr19 17814690 17815010 - +chr19 17877480 17877800 - +chr19 17886180 17886500 - +chr19 17895213 17895533 - +chr19 17966404 17966724 - +chr19 17970239 17970559 - +chr19 18043337 18043657 - 
+chr19 18049082 18049402 - +chr19 18057124 18057444 - +chr19 18107684 18108004 - +chr19 18117762 18118082 - +chr19 18169221 18169541 - +chr19 18223920 18224049 - +chr19 18279013 18279333 - +chr19 18362751 18362873 - +chr19 18390205 18390525 - +chr19 18439158 18439478 - +chr19 18460372 18460692 - +chr19 18496898 18497218 - +chr19 18499221 18499541 - +chr19 18521442 18521762 - +chr19 18561833 18561982 - +chr19 18572696 18573016 - +chr19 18580525 18580611 - +chr19 18699365 18699685 - +chr19 18958058 18958378 - +chr19 18977428 18977567 - +chr19 19097588 19097908 - +chr19 19106566 19106886 - +chr19 19164633 19164953 - +chr19 19250473 19250793 - +chr19 19256966 19257077 - +chr19 19285892 19286130 - +chr19 19335943 19336263 - +chr19 19453563 19453883 - +chr19 19471866 19472186 - +chr19 19477764 19477844 - +chr19 19643023 19643343 - +chr19 19663573 19663893 - +chr19 19754680 19755000 - +chr19 21385698 21386018 - +chr19 23466769 23467089 - +chr19 30019235 30019555 - +chr19 30101510 30101830 - +chr19 30185512 30185832 - +chr19 30207546 30207721 - +chr19 30250989 30251309 - +chr19 30262665 30262985 - +chr19 31140955 31141275 - +chr19 31157320 31157493 - +chr19 31160353 31160673 - +chr19 31173865 31174185 - +chr19 31545387 31545707 - +chr19 31601286 31601606 - +chr19 31828993 31829313 - +chr19 32715613 32715782 - +chr19 32831181 32831501 - +chr19 33061181 33061501 - +chr19 33467140 33467460 - +chr19 33616818 33617138 - +chr19 33634911 33635036 - +chr19 33726160 33726480 - +chr19 33732573 33732893 - +chr19 33897766 33898086 - +chr19 33921602 33921922 - +chr19 34262495 34262815 - +chr19 34269009 34269175 - +chr19 34532864 34533184 - +chr19 34611969 34612289 - +chr19 34711418 34711738 - +chr19 34836794 34837114 - +chr19 34992503 34992731 - +chr19 35130424 35130744 - +chr19 35464176 35464304 - +chr19 35503979 35504118 - +chr19 35535370 35535501 - +chr19 35558009 35558329 - +chr19 35605599 35605796 - +chr19 35626318 35626638 - +chr19 35843681 35844001 - +chr19 35986382 35986541 - 
+chr19 36001273 36001593 - +chr19 36184755 36184917 - +chr19 36207841 36208161 - +chr19 36422556 36422876 - +chr19 36438315 36438635 - +chr19 36618640 36618960 - +chr19 36661645 36661965 - +chr19 38393239 38393559 - +chr19 38625245 38625565 - +chr19 38633093 38633413 - +chr19 38724430 38724750 - +chr19 38826148 38826468 - +chr19 39005733 39006053 - +chr19 39125281 39125601 - +chr19 39127481 39127801 - +chr19 39220344 39220664 - +chr19 39360467 39360787 - +chr19 39378135 39378455 - +chr19 39421046 39421220 - +chr19 39452770 39453090 - +chr19 39467233 39467553 - +chr19 39542923 39543243 - +chr19 39601022 39601342 - +chr19 39612711 39613031 - +chr19 39678002 39678100 - +chr19 39818767 39819087 - +chr19 39888407 39888517 - +chr19 39902462 39902782 - +chr19 39903330 39903650 - +chr19 39964799 39965119 - +chr19 40023533 40023694 - +chr19 40030796 40031116 - +chr19 40042088 40042408 - +chr19 40256136 40256456 - +chr19 40283740 40283832 - +chr19 40727799 40728119 - +chr19 40926654 40926734 - +chr19 40970379 40970459 - +chr19 41035005 41035325 - +chr19 41104515 41104835 - +chr19 41108055 41108375 - +chr19 41140470 41140697 - +chr19 41168679 41168999 - +chr19 41312923 41313019 - +chr19 41480476 41480796 - +chr19 41650351 41650671 - +chr19 41710291 41710443 - +chr19 41768189 41768509 - +chr19 41814441 41814761 - +chr19 41827684 41828004 - +chr19 41859897 41860217 - +chr19 41955374 41955502 - +chr19 42069791 42070111 - +chr19 42077140 42077229 - +chr19 42103931 42104251 - +chr19 42148295 42148615 - +chr19 42257152 42257472 - +chr19 42257873 42258193 - +chr19 42289503 42289823 - +chr19 42340094 42340414 - +chr19 42377192 42377512 - +chr19 42439159 42439283 - +chr19 42455886 42456066 - +chr19 42502656 42502976 - +chr19 42538905 42539225 - +chr19 42585744 42586064 - +chr19 42612745 42612951 - +chr19 42637248 42637568 - +chr19 42688510 42688830 - +chr19 42784045 42784158 - +chr19 42800302 42800479 - +chr19 42829298 42829618 - +chr19 42905458 42905778 - +chr19 42924178 42924498 - 
+chr19 42942731 42942813 - +chr19 42944633 42944713 - +chr19 43033436 43033756 - +chr19 43071119 43071439 - +chr19 43100662 43100982 - +chr19 43281210 43281530 - +chr19 43427312 43427632 - +chr19 43937137 43937262 - +chr19 43951179 43951499 - +chr19 44023666 44023986 - +chr19 44059467 44059580 - +chr19 44098921 44099076 - +chr19 44156475 44156795 - +chr19 44249446 44249766 - +chr19 44263638 44263958 - +chr19 44278566 44278886 - +chr19 44289546 44289729 - +chr19 45017012 45017332 - +chr19 45055553 45055873 - +chr19 45086184 45086504 - +chr19 45160537 45160857 - +chr19 45192552 45192680 - +chr19 45227148 45227468 - +chr19 45287640 45287960 - +chr19 45303837 45303994 - +chr19 45348028 45348348 - +chr19 45348807 45349127 - +chr19 45406969 45407079 - +chr19 45454856 45455176 - +chr19 45562757 45563077 - +chr19 45629356 45629676 - +chr19 45665317 45665637 - +chr19 45752029 45752135 - +chr19 45930852 45931172 - +chr19 45942802 45942953 - +chr19 45982346 45982484 - +chr19 45988517 45988837 - +chr19 45989114 45989232 - +chr19 46026920 46027240 - +chr19 46110646 46110729 - +chr19 46145405 46145725 - +chr19 46146461 46146637 - +chr19 46184851 46185007 - +chr19 46196706 46196797 - +chr19 46273429 46273749 - +chr19 46302335 46302655 - +chr19 46320503 46320636 - +chr19 46386753 46386863 - +chr19 46513014 46513334 - +chr19 46532249 46532394 - +chr19 46580950 46581126 - +chr19 46692741 46693061 - +chr19 46768517 46768837 - +chr19 46850685 46850809 - +chr19 46898805 46899125 - +chr19 46991232 46991356 - +chr19 47016829 47016955 - +chr19 47032930 47033054 - +chr19 47040864 47041027 - +chr19 47045965 47046285 - +chr19 47117414 47117734 - +chr19 47141823 47142143 - +chr19 47165311 47165631 - +chr19 47207843 47207991 - +chr19 47232145 47232248 - +chr19 47251793 47252113 - +chr19 47260581 47260901 - +chr19 47269953 47270273 - +chr19 47343856 47343975 - +chr19 47359379 47359699 - +chr19 47494438 47494758 - +chr19 47516356 47516453 - +chr19 47535497 47535817 - +chr19 47539152 47539289 - 
+chr19 47582572 47582892 - +chr19 47747527 47747847 - +chr19 47750662 47750982 - +chr19 47787568 47787888 - +chr19 47808551 47808871 - +chr19 47852258 47852578 - +chr19 47894709 47895029 - +chr19 47930656 47930758 - +chr19 47939884 47940204 - +chr19 47987102 47987244 - +chr19 48018806 48018915 - +chr19 48201460 48201644 - +chr19 48202606 48202926 - +chr19 48210582 48210902 - +chr19 48216405 48216725 - +chr19 48269144 48269464 - +chr19 48292067 48292387 - +chr19 48293614 48293934 - +chr19 48366760 48366921 - +chr19 48372157 48372477 - +chr19 48606347 48606667 - +chr19 48610597 48610917 - +chr19 48763401 48763721 - +chr19 48774423 48774743 - +chr19 48794521 48794841 - +chr19 48823287 48823607 - +chr19 48854553 48854873 - +chr19 48893863 48894183 - +chr19 48894829 48894935 - +chr19 48948571 48948891 - +chr19 48993138 48993260 - +chr19 49087562 49087882 - +chr19 49092302 49092383 - +chr19 49149772 49150092 - +chr19 49178655 49178975 - +chr19 49298387 49298707 - +chr19 49339012 49339332 - +chr19 49474212 49474532 - +chr19 49501284 49501604 - +chr19 49503022 49503342 - +chr19 49601164 49601306 - +chr19 49609951 49610271 - +chr19 49649114 49649434 - +chr19 49702151 49702471 - +chr19 49711586 49711906 - +chr19 49729328 49729648 - +chr19 49804552 49804872 - +chr19 49895161 49895481 - +chr19 49977255 49977575 - +chr19 50009838 50010158 - +chr19 50073513 50073704 - +chr19 50175617 50175937 - +chr19 50184056 50184239 - +chr19 50187690 50188010 - +chr19 50220742 50221062 - +chr19 50440938 50441258 - +chr19 50486900 50487220 - +chr19 50755591 50755911 - +chr19 50860330 50860650 - +chr19 50871829 50872149 - +chr19 50918045 50918365 - +chr19 50964505 50964825 - +chr19 51040972 51041292 - +chr19 51059826 51060146 - +chr19 51172637 51172957 - +chr19 51208717 51209037 - +chr19 51227777 51227882 - +chr19 51286859 51287179 - +chr19 51425528 51425848 - +chr19 51433103 51433423 - +chr19 51672606 51672926 - +chr19 51841975 51842295 - +chr19 51925299 51925379 - +chr19 52018172 52018492 - 
+chr19 52111199 52111519 - +chr19 52160225 52160545 - +chr19 52164295 52164615 - +chr19 52642479 52642799 - +chr19 52741093 52741413 - +chr19 53698566 53698886 - +chr19 53928284 53928604 - +chr19 54015190 54015510 - +chr19 54249585 54249905 - +chr19 54296278 54296470 - +chr19 54356203 54356523 - +chr19 54410114 54410217 - +chr19 54527167 54527487 - +chr19 54672244 54672564 - +chr19 54735574 54735894 - +chr19 54836410 54836730 - +chr19 54902606 54902926 - +chr19 54989880 54990200 - +chr19 54992494 54992814 - +chr19 55652618 55652938 - +chr19 55671922 55672242 - +chr19 55813198 55813518 - +chr19 55929560 55929880 - +chr19 55966072 55966392 - +chr19 55979295 55979400 - +chr19 56015414 56015734 - +chr19 56028243 56028563 - +chr19 56117562 56117882 - +chr19 56143778 56144098 - +chr19 56165060 56165274 - +chr19 56182936 56183256 - +chr19 56577811 56578131 - +chr19 56587825 56588145 - +chr19 56683380 56683520 - +chr19 56764300 56764457 - +chr19 56820979 56821299 - +chr19 56861860 56862180 - +chr19 56904471 56904791 - +chr19 57615398 57615718 - +chr19 57917939 57918259 - +chr19 57977105 57977425 - +chr19 58666540 58666620 - +chr19 59030309 59030629 - +chr19 59030881 59031201 - +chr19 59092617 59092788 - +chr2 290155 290475 - +chr2 714308 714628 - +chr2 1136192 1136372 - +chr2 1549799 1550119 - +chr2 1554512 1554832 - +chr2 1595693 1596013 - +chr2 1679529 1679849 - +chr2 1754697 1754824 - +chr2 1773547 1773867 - +chr2 1821417 1821573 - +chr2 1890195 1890515 - +chr2 3307409 3307729 - +chr2 3427769 3427924 - +chr2 3466035 3466355 - +chr2 6913298 6913618 - +chr2 7237480 7237617 - +chr2 8825916 8826236 - +chr2 9279864 9280184 - +chr2 9282887 9283207 - +chr2 9346295 9346615 - +chr2 9473778 9474098 - +chr2 9910541 9910861 - +chr2 9939916 9940038 - +chr2 9953968 9954288 - +chr2 10212719 10213039 - +chr2 10237525 10237845 - +chr2 10549385 10549501 - +chr2 10575088 10575408 - +chr2 10800131 10800451 - +chr2 10972470 10972790 - +chr2 10975448 10975768 - +chr2 11213407 11213727 - 
+chr2 11264771 11264869 - +chr2 11272725 11273045 - +chr2 11552157 11552477 - +chr2 11586856 11587176 - +chr2 11724389 11724709 - +chr2 11822285 11822400 - +chr2 11828553 11828873 - +chr2 11833060 11833380 - +chr2 11919358 11919678 - +chr2 12003852 12003989 - +chr2 13071382 13071702 - +chr2 13147038 13147358 - +chr2 14774537 14774857 - +chr2 15717247 15717567 - +chr2 15777421 15777741 - +chr2 16790338 16790658 - +chr2 17900863 17901183 - +chr2 18011825 18012145 - +chr2 18033022 18033342 - +chr2 18360023 18360343 - +chr2 18559346 18559666 - +chr2 18569047 18569192 - +chr2 18701819 18702139 - +chr2 20063167 20063487 - +chr2 20293415 20293735 - +chr2 20297949 20298269 - +chr2 20335503 20335618 - +chr2 20525954 20526274 - +chr2 20578801 20579121 - +chr2 20611211 20611531 - +chr2 20835593 20835913 - +chr2 20842249 20842414 - +chr2 20861367 20861448 - +chr2 20881966 20882286 - +chr2 21185864 21186184 - +chr2 21265748 21266068 - +chr2 23535997 23536113 - +chr2 23574408 23574728 - +chr2 23683130 23683450 - +chr2 24149849 24150169 - +chr2 24307194 24307514 - +chr2 24632825 24633145 - +chr2 25050507 25050827 - +chr2 25419285 25419605 - +chr2 25451670 25451990 - +chr2 25565357 25565510 - +chr2 25584833 25585153 - +chr2 25599599 25599919 - +chr2 25887588 25887908 - +chr2 26174018 26174107 - +chr2 26552922 26553242 - +chr2 26800798 26800963 - +chr2 26828322 26828642 - +chr2 26968636 26968956 - +chr2 27165494 27165814 - +chr2 27184687 27184794 - +chr2 27321592 27321725 - +chr2 27392841 27392954 - +chr2 27473023 27473343 - +chr2 27502768 27503088 - +chr2 27906384 27906704 - +chr2 27926636 27926956 - +chr2 27946274 27946401 - +chr2 27977153 27977242 - +chr2 28675475 28675627 - +chr2 28789721 28789883 - +chr2 28817178 28817498 - +chr2 29419736 29420056 - +chr2 29461409 29461729 - +chr2 29531990 29532310 - +chr2 30064398 30064551 - +chr2 30145991 30146311 - +chr2 30295148 30295468 - +chr2 30338668 30338817 - +chr2 30387900 30388220 - +chr2 30431260 30431580 - +chr2 30462239 30462402 
- +chr2 30929899 30930219 - +chr2 30934552 30934712 - +chr2 31393004 31393324 - +chr2 31556572 31556733 - +chr2 33171333 33171653 - +chr2 33620768 33621088 - +chr2 33666334 33666654 - +chr2 33700756 33701076 - +chr2 33808337 33808657 - +chr2 36734525 36734845 - +chr2 36997871 36998191 - +chr2 37012466 37012589 - +chr2 37623030 37623350 - +chr2 37641579 37641899 - +chr2 38059751 38060071 - +chr2 38118112 38118432 - +chr2 38342484 38342660 - +chr2 38763229 38763549 - +chr2 38830569 38830889 - +chr2 39820909 39821229 - +chr2 39955999 39956319 - +chr2 40065109 40065249 - +chr2 40781860 40782180 - +chr2 40987216 40987536 - +chr2 42067890 42068047 - +chr2 42096877 42097197 - +chr2 42722279 42722367 - +chr2 42740394 42740714 - +chr2 42890367 42890548 - +chr2 43054773 43055093 - +chr2 43055993 43056313 - +chr2 43086117 43086437 - +chr2 43173993 43174313 - +chr2 43202318 43202415 - +chr2 43251738 43251859 - +chr2 43839630 43839950 - +chr2 44105155 44105276 - +chr2 44271438 44271536 - +chr2 44314345 44314665 - +chr2 44706632 44706800 - +chr2 45182792 45183112 - +chr2 45870811 45871131 - +chr2 46220554 46220874 - +chr2 46290093 46290413 - +chr2 46688651 46688774 - +chr2 46691645 46691965 - +chr2 46877700 46878020 - +chr2 47214257 47214411 - +chr2 47261252 47261572 - +chr2 47499529 47499849 - +chr2 47550375 47550493 - +chr2 47995161 47995278 - +chr2 48333695 48334015 - +chr2 48338298 48338618 - +chr2 48647905 48648225 - +chr2 54334095 54334415 - +chr2 55065726 55066046 - +chr2 55276123 55276443 - +chr2 56357610 56357930 - +chr2 57986978 57987298 - +chr2 58133309 58133629 - +chr2 59252317 59252637 - +chr2 60808982 60809302 - +chr2 60901739 60902059 - +chr2 62684076 62684396 - +chr2 63416207 63416318 - +chr2 63715280 63715600 - +chr2 64276858 64277178 - +chr2 64558018 64558338 - +chr2 64715144 64715464 - +chr2 64873466 64873786 - +chr2 65058982 65059116 - +chr2 65144774 65145094 - +chr2 65267918 65268074 - +chr2 65414418 65414738 - +chr2 65780419 65780739 - +chr2 66656155 
66656475 - +chr2 67877978 67878298 - +chr2 68067657 68067977 - +chr2 68073626 68073946 - +chr2 68349539 68349859 - +chr2 68668898 68669218 - +chr2 68822345 68822665 - +chr2 68978894 68979214 - +chr2 69137071 69137189 - +chr2 69152561 69152665 - +chr2 69170650 69170970 - +chr2 69239977 69240297 - +chr2 69951787 69952107 - +chr2 70044120 70044440 - +chr2 70298109 70298206 - +chr2 70369818 70370138 - +chr2 70529117 70529437 - +chr2 70568341 70568661 - +chr2 70781293 70781613 - +chr2 70844737 70845057 - +chr2 70875117 70875287 - +chr2 71089084 71089225 - +chr2 71146992 71147312 - +chr2 71175699 71175797 - +chr2 71192043 71192363 - +chr2 71211979 71212299 - +chr2 71221976 71222296 - +chr2 71228268 71228588 - +chr2 71737390 71737710 - +chr2 71775796 71776116 - +chr2 72025483 72025803 - +chr2 72098737 72099057 - +chr2 72364723 72364864 - +chr2 72377498 72377818 - +chr2 72553010 72553330 - +chr2 73089791 73089917 - +chr2 73115057 73115219 - +chr2 73119844 73120164 - +chr2 73120873 73120974 - +chr2 73200870 73200993 - +chr2 73312849 73312985 - +chr2 73383754 73384074 - +chr2 73390929 73391249 - +chr2 73404522 73404842 - +chr2 73439650 73439970 - +chr2 74007171 74007491 - +chr2 74426154 74426474 - +chr2 74477259 74477579 - +chr2 74562081 74562219 - +chr2 74619206 74619301 - +chr2 74710315 74710635 - +chr2 74942159 74942479 - +chr2 75292099 75292419 - +chr2 75426699 75427019 - +chr2 75836365 75836464 - +chr2 75855152 75855270 - +chr2 75867740 75867954 - +chr2 75904549 75904665 - +chr2 80163467 80163787 - +chr2 80465858 80466178 - +chr2 84530076 84530396 - +chr2 84541960 84542280 - +chr2 85152899 85153219 - +chr2 85154153 85154473 - +chr2 85293950 85294270 - +chr2 85359971 85360291 - +chr2 85413338 85413658 - +chr2 85493615 85493935 - +chr2 85512613 85512933 - +chr2 85527488 85527808 - +chr2 85618380 85618700 - +chr2 85647753 85648073 - +chr2 85765728 85766048 - +chr2 85779224 85779544 - +chr2 85906423 85906529 - +chr2 85934376 85934696 - +chr2 85956194 85956346 - +chr2 
85977637 85977957 - +chr2 86041350 86041670 - +chr2 86054539 86054859 - +chr2 86116172 86116492 - +chr2 86307044 86307364 - +chr2 87035493 87035813 - +chr2 88301471 88301614 - +chr2 88359919 88360239 - +chr2 89039655 89039749 - +chr2 89179910 89180230 - +chr2 91815094 91815295 - +chr2 91816629 91816949 - +chr2 91818658 91818978 - +chr2 95622568 95622888 - +chr2 95652163 95652483 - +chr2 95688125 95688316 - +chr2 95742492 95742812 - +chr2 95781105 95781425 - +chr2 95897404 95897724 - +chr2 96011284 96011604 - +chr2 96055024 96055169 - +chr2 96079771 96080091 - +chr2 96688468 96688788 - +chr2 96840298 96840618 - +chr2 97073079 97073399 - +chr2 97173888 97174023 - +chr2 97175263 97175349 - +chr2 97220139 97220459 - +chr2 97466758 97467078 - +chr2 97523788 97524108 - +chr2 97534689 97535009 - +chr2 97616967 97617287 - +chr2 97626155 97626475 - +chr2 98207267 98207587 - +chr2 98313794 98314114 - +chr2 98316654 98316974 - +chr2 98326474 98326585 - +chr2 98356984 98357304 - +chr2 98423212 98423532 - +chr2 99001646 99001820 - +chr2 99185692 99186012 - +chr2 99266000 99266156 - +chr2 100175635 100175762 - +chr2 100859442 100859762 - +chr2 100879341 100879661 - +chr2 100997662 100997982 - +chr2 101038818 101039138 - +chr2 101601351 101601671 - +chr2 101769298 101769618 - +chr2 101805175 101805495 - +chr2 102622713 102623033 - +chr2 102637088 102637408 - +chr2 102857403 102857530 - +chr2 102972065 102972385 - +chr2 103089420 103089740 - +chr2 103232406 103232581 - +chr2 103422745 103423065 - +chr2 105852121 105852441 - +chr2 106162220 106162540 - +chr2 106720929 106721069 - +chr2 107159327 107159415 - +chr2 107199042 107199362 - +chr2 108966479 108966799 - +chr2 109252906 109253041 - +chr2 109544863 109545183 - +chr2 109744440 109744760 - +chr2 110267823 110267959 - +chr2 110303791 110303938 - +chr2 110914787 110915107 - +chr2 111874608 111874928 - +chr2 113017113 113017275 - +chr2 113238615 113238935 - +chr2 113357617 113357937 - +chr2 113384123 113384443 - +chr2 113484328 
113484648 - +chr2 113507353 113507456 - +chr2 113642512 113642832 - +chr2 113699435 113699567 - +chr2 113716432 113716752 - +chr2 113837658 113837978 - +chr2 113956688 113957008 - +chr2 113961621 113961709 - +chr2 114006762 114007082 - +chr2 114147889 114148209 - +chr2 114361714 114361888 - +chr2 118572016 118572195 - +chr2 118593495 118593815 - +chr2 118960315 118960635 - +chr2 119613366 119613446 - +chr2 119639150 119639470 - +chr2 119640216 119640536 - +chr2 119932037 119932357 - +chr2 120022765 120023085 - +chr2 120233935 120234026 - +chr2 120275057 120275377 - +chr2 120399164 120399484 - +chr2 120417649 120417969 - +chr2 120460533 120460663 - +chr2 121041215 121041535 - +chr2 121070850 121071003 - +chr2 121100950 121101270 - +chr2 121278913 121279069 - +chr2 121433143 121433463 - +chr2 121828705 121828787 - +chr2 121834563 121834883 - +chr2 121837783 121838103 - +chr2 121852885 121853205 - +chr2 122055075 122055395 - +chr2 122113672 122113992 - +chr2 122159922 122160242 - +chr2 127534981 127535301 - +chr2 127729689 127729819 - +chr2 127807837 127807940 - +chr2 127825742 127825827 - +chr2 127955409 127955729 - +chr2 128349147 128349304 - +chr2 128407334 128407654 - +chr2 128821092 128821229 - +chr2 128936785 128937105 - +chr2 128991114 128991434 - +chr2 129494239 129494411 - +chr2 131090150 131090470 - +chr2 131130061 131130381 - +chr2 131555007 131555327 - +chr2 131631321 131631641 - +chr2 131678028 131678348 - +chr2 131911575 131911895 - +chr2 132250149 132250469 - +chr2 132431511 132431831 - +chr2 133106687 133107007 - +chr2 133206420 133206740 - +chr2 134537744 134538064 - +chr2 134601883 134602203 - +chr2 134965206 134965526 - +chr2 135002989 135003309 - +chr2 135169608 135169928 - +chr2 135208523 135208714 - +chr2 135761827 135762147 - +chr2 136467491 136467811 - +chr2 136555078 136555398 - +chr2 137181035 137181355 - +chr2 138847995 138848315 - +chr2 141344000 141344320 - +chr2 143791940 143792260 - +chr2 144041686 144042006 - +chr2 144270764 144270889 - 
+chr2 145277649 145277969 - +chr2 149267112 149267280 - +chr2 149873924 149874244 - +chr2 150169984 150170304 - +chr2 151344183 151344503 - +chr2 152855539 152855859 - +chr2 152882463 152882579 - +chr2 152893399 152893719 - +chr2 153032403 153032723 - +chr2 153046440 153046760 - +chr2 153492205 153492525 - +chr2 154335292 154335612 - +chr2 158184215 158184303 - +chr2 158880842 158881162 - +chr2 159610555 159610875 - +chr2 159975122 159975442 - +chr2 161180708 161181028 - +chr2 161920253 161920573 - +chr2 163077510 163077830 - +chr2 165698542 165698862 - +chr2 165871559 165871879 - +chr2 165873361 165873681 - +chr2 169998312 169998632 - +chr2 170384761 170385081 - +chr2 170681310 170681630 - +chr2 171265585 171265905 - +chr2 171729842 171729962 - +chr2 171785303 171785623 - +chr2 171829923 171830243 - +chr2 172272618 172272938 - +chr2 172430612 172430932 - +chr2 172968607 172968927 - +chr2 172992019 172992339 - +chr2 173037176 173037496 - +chr2 173326790 173327110 - +chr2 173330467 173330787 - +chr2 173671172 173671492 - +chr2 173728986 173729306 - +chr2 173836767 173837087 - +chr2 174075348 174075668 - +chr2 174267152 174267472 - +chr2 174313947 174314267 - +chr2 174904662 174904982 - +chr2 175206322 175206481 - +chr2 175561939 175562259 - +chr2 175636302 175636622 - +chr2 176193850 176194170 - +chr2 176944558 176944878 - +chr2 176953051 176953371 - +chr2 176962331 176962651 - +chr2 176975966 176976286 - +chr2 177005189 177005509 - +chr2 177319673 177319993 - +chr2 177517079 177517399 - +chr2 177518644 177518964 - +chr2 178013806 178013889 - +chr2 178029373 178029587 - +chr2 178128206 178128526 - +chr2 178199711 178200031 - +chr2 178589417 178589737 - +chr2 178628432 178628752 - +chr2 178770933 178771070 - +chr2 178847938 178848258 - +chr2 178976916 178977236 - +chr2 179171118 179171438 - +chr2 179279418 179279738 - +chr2 179387644 179387964 - +chr2 179565605 179565925 - +chr2 179756269 179756397 - +chr2 179779018 179779338 - +chr2 180083730 180084050 - +chr2 
180275642 180275962 - +chr2 180286783 180287103 - +chr2 180942065 180942385 - +chr2 181969278 181969598 - +chr2 182321868 182322188 - +chr2 182331401 182331721 - +chr2 188523660 188523980 - +chr2 189156391 189156711 - +chr2 190525823 190526143 - +chr2 190539028 190539348 - +chr2 191184640 191184960 - +chr2 191237145 191237465 - +chr2 191861099 191861419 - +chr2 192431007 192431327 - +chr2 196933117 196933222 - +chr2 197876792 197877112 - +chr2 198033414 198033734 - +chr2 198677922 198678242 - +chr2 199174422 199174742 - +chr2 200328143 200328463 - +chr2 200336175 200336495 - +chr2 200350717 200351037 - +chr2 200353554 200353874 - +chr2 200604569 200604889 - +chr2 200844785 200845105 - +chr2 200850373 200850693 - +chr2 201374678 201374998 - +chr2 201577703 201578023 - +chr2 201766209 201766529 - +chr2 202096975 202097295 - +chr2 202228696 202229016 - +chr2 202660641 202660961 - +chr2 202725306 202725626 - +chr2 202968997 202969317 - +chr2 203438588 203438908 - +chr2 203453376 203453696 - +chr2 204647049 204647369 - +chr2 205009967 205010287 - +chr2 206631506 206631826 - +chr2 206750377 206750697 - +chr2 206790592 206790912 - +chr2 206812830 206813150 - +chr2 207646314 207646634 - +chr2 207846605 207846688 - +chr2 207854094 207854414 - +chr2 208337888 208338208 - +chr2 208610804 208611124 - +chr2 208905653 208905741 - +chr2 209040178 209040272 - +chr2 209119018 209119338 - +chr2 209238231 209238551 - +chr2 211367759 211368079 - +chr2 214018434 214018754 - +chr2 214111444 214111764 - +chr2 216796027 216796347 - +chr2 216878185 216878505 - +chr2 216898706 216899026 - +chr2 216979394 216979714 - +chr2 217268893 217268977 - +chr2 217531443 217531619 - +chr2 217561428 217561748 - +chr2 217642435 217642755 - +chr2 217971441 217971761 - +chr2 218515501 218515821 - +chr2 218578313 218578465 - +chr2 218621298 218621618 - +chr2 218650195 218650515 - +chr2 218654038 218654160 - +chr2 218751475 218751795 - +chr2 218764326 218764646 - +chr2 218853279 218853599 - +chr2 218858609 
218858929 - +chr2 218933539 218933859 - +chr2 219051025 219051345 - +chr2 219122440 219122760 - +chr2 219137398 219137718 - +chr2 219166115 219166208 - +chr2 219260651 219260971 - +chr2 219271238 219271558 - +chr2 219513245 219513565 - +chr2 219552196 219552516 - +chr2 219575444 219575764 - +chr2 219712505 219712657 - +chr2 219717578 219717898 - +chr2 219741457 219741608 - +chr2 219762978 219763298 - +chr2 219844627 219844947 - +chr2 219894501 219894650 - +chr2 219922732 219923052 - +chr2 219938125 219938445 - +chr2 219968889 219969209 - +chr2 220024896 220025035 - +chr2 220035888 220036208 - +chr2 220142494 220142662 - +chr2 220159692 220159804 - +chr2 220163497 220163675 - +chr2 220267638 220267958 - +chr2 220324208 220324385 - +chr2 220341001 220341321 - +chr2 220360119 220360439 - +chr2 220374509 220374611 - +chr2 220386057 220386377 - +chr2 220395019 220395127 - +chr2 220406296 220406616 - +chr2 220425096 220425416 - +chr2 220507287 220507483 - +chr2 221551986 221552306 - +chr2 222438264 222438584 - +chr2 223478496 223478816 - +chr2 223555396 223555584 - +chr2 223572997 223573317 - +chr2 223803985 223804305 - +chr2 223836452 223836772 - +chr2 223846706 223847026 - +chr2 224539734 224540054 - +chr2 224557097 224557417 - +chr2 226147611 226147931 - +chr2 226525198 226525518 - +chr2 226533335 226533655 - +chr2 227080662 227080982 - +chr2 227661614 227661934 - +chr2 227898124 227898444 - +chr2 227971841 227972161 - +chr2 227994572 227994892 - +chr2 228103131 228103211 - +chr2 228336635 228336955 - +chr2 228631064 228631384 - +chr2 228749158 228749478 - +chr2 230138479 230138799 - +chr2 230451848 230452168 - +chr2 230645496 230645816 - +chr2 230872918 230873238 - +chr2 230931718 230932038 - +chr2 231024891 231025211 - +chr2 231476106 231476426 - +chr2 231712633 231712953 - +chr2 231714909 231715037 - +chr2 231766875 231767195 - +chr2 231851956 231852276 - +chr2 231917693 231917828 - +chr2 231921124 231921444 - +chr2 231968863 231969183 - +chr2 232057170 232057490 - 
+chr2 232092048 232092368 - +chr2 232161769 232162089 - +chr2 232277203 232277335 - +chr2 232347459 232347779 - +chr2 232428085 232428405 - +chr2 232466367 232466687 - +chr2 232477275 232477595 - +chr2 232539557 232539877 - +chr2 232542357 232542677 - +chr2 232571725 232572045 - +chr2 232578545 232578865 - +chr2 232580009 232580329 - +chr2 232765329 232765414 - +chr2 232770796 232771116 - +chr2 233157355 233157675 - +chr2 233191843 233192163 - +chr2 233476352 233476672 - +chr2 233918692 233919012 - +chr2 234160007 234160327 - +chr2 234312904 234313224 - +chr2 234352206 234352526 - +chr2 234357861 234357977 - +chr2 234620097 234620417 - +chr2 235044330 235044650 - +chr2 235166797 235167117 - +chr2 235197581 235197901 - +chr2 235453607 235453927 - +chr2 235459196 235459516 - +chr2 235790381 235790701 - +chr2 236015792 236016112 - +chr2 236044630 236044950 - +chr2 236504091 236504411 - +chr2 236654996 236655316 - +chr2 236674870 236675190 - +chr2 236688074 236688394 - +chr2 237085122 237085442 - +chr2 237086709 237087029 - +chr2 237218210 237218530 - +chr2 237252217 237252537 - +chr2 237421629 237421949 - +chr2 237936109 237936429 - +chr2 237981009 237981329 - +chr2 238062724 238063044 - +chr2 238330177 238330497 - +chr2 238384091 238384411 - +chr2 238391988 238392308 - +chr2 238421444 238421764 - +chr2 238480671 238480991 - +chr2 238653692 238654012 - +chr2 238767122 238767213 - +chr2 238832384 238832551 - +chr2 239279422 239279742 - +chr2 239693664 239693984 - +chr2 239867457 239867777 - +chr2 239892279 239892437 - +chr2 240065593 240065913 - +chr2 240960934 240961254 - +chr2 241083225 241083545 - +chr2 241089623 241089943 - +chr2 241172315 241172635 - +chr2 241377797 241377896 - +chr2 241453375 241453504 - +chr2 241790771 241790869 - +chr2 241853767 241854087 - +chr2 242598751 242599071 - +chr2 242604837 242605157 - +chr2 242892974 242893294 - +chr20 247640 247960 - +chr20 311262 311582 - +chr20 335119 335439 - +chr20 542435 542615 - +chr20 626256 626382 - +chr20 
648089 648409 - +chr20 719794 719921 - +chr20 796755 796942 - +chr20 937186 937363 - +chr20 1089137 1089457 - +chr20 1193586 1193906 - +chr20 1294248 1294378 - +chr20 1417954 1418274 - +chr20 1447597 1447715 - +chr20 1450740 1451060 - +chr20 1484373 1484547 - +chr20 1745081 1745401 - +chr20 1757899 1758219 - +chr20 1946308 1946628 - +chr20 1987330 1987650 - +chr20 2153223 2153543 - +chr20 2432800 2433120 - +chr20 2502374 2502694 - +chr20 2505129 2505233 - +chr20 2723754 2724074 - +chr20 2733403 2733723 - +chr20 2736687 2737007 - +chr20 3034600 3034721 - +chr20 3096211 3096531 - +chr20 3199163 3199483 - +chr20 3657293 3657613 - +chr20 3691559 3691879 - +chr20 3696909 3697229 - +chr20 3733712 3734032 - +chr20 3800870 3800957 - +chr20 3835774 3836094 - +chr20 4052726 4053046 - +chr20 4065928 4066248 - +chr20 4072542 4072862 - +chr20 4089361 4089553 - +chr20 4153154 4153474 - +chr20 4230161 4230481 - +chr20 4573239 4573559 - +chr20 4587891 4588211 - +chr20 4721485 4721805 - +chr20 4741217 4741537 - +chr20 5186050 5186370 - +chr20 5203642 5203962 - +chr20 5510410 5510536 - +chr20 5769027 5769347 - +chr20 6058034 6058354 - +chr20 7266595 7266915 - +chr20 8112678 8112758 - +chr20 8701826 8702146 - +chr20 8771227 8771383 - +chr20 9373966 9374286 - +chr20 9893525 9893845 - +chr20 10407779 10408099 - +chr20 10534033 10534353 - +chr20 10677279 10677599 - +chr20 11185832 11186152 - +chr20 11657210 11657530 - +chr20 11696817 11697137 - +chr20 11993006 11993165 - +chr20 12188512 12188832 - +chr20 12905820 12906140 - +chr20 13100182 13100502 - +chr20 13439533 13439853 - +chr20 16414744 16415064 - +chr20 16782596 16782916 - +chr20 16784627 16784947 - +chr20 16802235 16802396 - +chr20 17412421 17412741 - +chr20 17517747 17517829 - +chr20 17663287 17663607 - +chr20 17879192 17879512 - +chr20 17919783 17919913 - +chr20 17976743 17977063 - +chr20 18257426 18257746 - +chr20 18612879 18613199 - +chr20 19186613 19186933 - +chr20 19564487 19564807 - +chr20 19643154 19643294 - +chr20 
19669021 19669341 - +chr20 19673288 19673608 - +chr20 20218053 20218373 - +chr20 20341047 20341367 - +chr20 20840839 20841159 - +chr20 21216308 21216628 - +chr20 21485933 21486130 - +chr20 21552730 21553050 - +chr20 22664664 22664984 - +chr20 22672255 22672575 - +chr20 23216089 23216409 - +chr20 23422486 23422806 - +chr20 23587326 23587473 - +chr20 23649014 23649334 - +chr20 24762052 24762372 - +chr20 24898995 24899315 - +chr20 24975466 24975786 - +chr20 25159836 25160156 - +chr20 25275197 25275311 - +chr20 25291046 25291366 - +chr20 29591561 29591881 - +chr20 30090941 30091261 - +chr20 30101976 30102296 - +chr20 30147904 30148224 - +chr20 30150735 30150903 - +chr20 30157331 30157651 - +chr20 30200745 30200970 - +chr20 30222035 30222355 - +chr20 30310592 30310912 - +chr20 30311686 30312006 - +chr20 30410705 30410842 - +chr20 30421325 30421441 - +chr20 30434461 30434597 - +chr20 30440627 30440730 - +chr20 30539810 30540130 - +chr20 30619242 30619389 - +chr20 30679723 30680043 - +chr20 31044787 31045107 - +chr20 31073021 31073341 - +chr20 31109514 31109834 - +chr20 31240740 31241060 - +chr20 31262160 31262480 - +chr20 31366376 31366508 - +chr20 31377913 31378129 - +chr20 31481958 31482278 - +chr20 31490435 31490515 - +chr20 31879767 31879879 - +chr20 31887056 31887189 - +chr20 32045006 32045326 - +chr20 32241076 32241396 - +chr20 32255729 32256049 - +chr20 32380084 32380176 - +chr20 32657677 32657997 - +chr20 32778428 32778748 - +chr20 32900544 32900788 - +chr20 33126487 33126807 - +chr20 33166902 33167222 - +chr20 33461858 33462178 - +chr20 33680630 33680726 - +chr20 33857108 33857428 - +chr20 33891650 33891970 - +chr20 33894788 33895108 - +chr20 34026842 34027162 - +chr20 34035127 34035268 - +chr20 34111171 34111491 - +chr20 34542337 34542657 - +chr20 34580678 34580998 - +chr20 34583028 34583175 - +chr20 34744697 34744806 - +chr20 34787685 34788005 - +chr20 34790860 34791012 - +chr20 34802640 34802960 - +chr20 35067898 35068218 - +chr20 35170425 35170745 - +chr20 
35185031 35185171 - +chr20 35233813 35234133 - +chr20 35402645 35402965 - +chr20 35436980 35437300 - +chr20 35515700 35515856 - +chr20 35548149 35548469 - +chr20 35820780 35820909 - +chr20 35823945 35824265 - +chr20 35830803 35831123 - +chr20 35898098 35898418 - +chr20 36013112 36013432 - +chr20 36192869 36193189 - +chr20 36685921 36686241 - +chr20 36701868 36702188 - +chr20 36793725 36794045 - +chr20 36849517 36849837 - +chr20 36851246 36851344 - +chr20 36984995 36985315 - +chr20 37037335 37037479 - +chr20 37063986 37064190 - +chr20 37075078 37075398 - +chr20 37262410 37262509 - +chr20 37274924 37275018 - +chr20 37360138 37360458 - +chr20 37522468 37522788 - +chr20 37554619 37554939 - +chr20 37676864 37676947 - +chr20 37678818 37678924 - +chr20 37709835 37710155 - +chr20 39318685 39319005 - +chr20 39370164 39370484 - +chr20 39591647 39591967 - +chr20 39597389 39597709 - +chr20 39619177 39619497 - +chr20 39724043 39724363 - +chr20 39801895 39801995 - +chr20 39968669 39968989 - +chr20 40326472 40326792 - +chr20 40363166 40363327 - +chr20 40367257 40367577 - +chr20 41816426 41816746 - +chr20 42037976 42038296 - +chr20 42166743 42167063 - +chr20 42320614 42320934 - +chr20 42378492 42378634 - +chr20 42709496 42709638 - +chr20 42745321 42745641 - +chr20 42818888 42819208 - +chr20 42843714 42844034 - +chr20 42846011 42846331 - +chr20 42939808 42940128 - +chr20 43013031 43013351 - +chr20 43017165 43017485 - +chr20 43238728 43238902 - +chr20 43313851 43314171 - +chr20 43362037 43362357 - +chr20 43376174 43376494 - +chr20 43672723 43673043 - +chr20 43724484 43724804 - +chr20 43767457 43767777 - +chr20 43919659 43919783 - +chr20 43922517 43922613 - +chr20 43935163 43935483 - +chr20 43940271 43940591 - +chr20 43991491 43991811 - +chr20 44002511 44002831 - +chr20 44065547 44065867 - +chr20 44074651 44074971 - +chr20 44258182 44258502 - +chr20 44399822 44400142 - +chr20 44427839 44428159 - +chr20 44518660 44518980 - +chr20 44615924 44616244 - +chr20 44642764 44643084 - +chr20 
44651529 44651849 - +chr20 44761274 44761594 - +chr20 44935705 44936025 - +chr20 44977635 44977955 - +chr20 45191142 45191462 - +chr20 45310025 45310345 - +chr20 45376593 45376913 - +chr20 45386858 45386993 - +chr20 45825108 45825224 - +chr20 45901593 45901913 - +chr20 46012840 46013006 - +chr20 46292626 46292754 - +chr20 46354142 46354462 - +chr20 47237042 47237362 - +chr20 47237638 47237784 - +chr20 47252341 47252661 - +chr20 47274372 47274452 - +chr20 47521599 47521919 - +chr20 47833681 47834001 - +chr20 47950555 47950714 - +chr20 48150810 48151130 - +chr20 48158300 48158620 - +chr20 48225581 48225703 - +chr20 48227719 48227887 - +chr20 48505045 48505365 - +chr20 48545223 48545543 - +chr20 48595623 48595770 - +chr20 48687944 48688049 - +chr20 48702011 48702331 - +chr20 48737385 48737705 - +chr20 48786020 48786121 - +chr20 48802917 48803000 - +chr20 48806827 48807147 - +chr20 49077321 49077641 - +chr20 49080833 49081153 - +chr20 49204828 49204933 - +chr20 49252550 49252675 - +chr20 49407187 49407507 - +chr20 49434939 49435259 - +chr20 49546021 49546154 - +chr20 49598027 49598347 - +chr20 49604282 49604602 - +chr20 49631675 49631995 - +chr20 49836759 49837079 - +chr20 49923339 49923659 - +chr20 49988830 49989150 - +chr20 50211581 50211901 - +chr20 50255779 50256099 - +chr20 50416712 50417032 - +chr20 50582905 50583225 - +chr20 50745057 50745377 - +chr20 50928725 50929045 - +chr20 50996304 50996624 - +chr20 52164328 52164648 - +chr20 52486353 52486673 - +chr20 52532314 52532634 - +chr20 52719988 52720308 - +chr20 52816206 52816526 - +chr20 52824197 52824517 - +chr20 53265955 53266275 - +chr20 55125750 55125927 - +chr20 55956853 55957173 - +chr20 56070316 56070636 - +chr20 56169414 56169734 - +chr20 56171987 56172307 - +chr20 56229343 56229663 - +chr20 56662345 56662665 - +chr20 56754626 56754946 - +chr20 56767666 56767986 - +chr20 56784973 56785186 - +chr20 56808983 56809303 - +chr20 56812152 56812472 - +chr20 57022866 57023186 - +chr20 57445446 57445579 - +chr20 
57662884 57663204 - +chr20 57797673 57797806 - +chr20 58070983 58071303 - +chr20 58094521 58094841 - +chr20 58100577 58100897 - +chr20 58146606 58146926 - +chr20 58509062 58509382 - +chr20 58632466 58632786 - +chr20 58786306 58786386 - +chr20 59804312 59804469 - +chr20 59838274 59838426 - +chr20 60462297 60462617 - +chr20 60471345 60471665 - +chr20 60711911 60712231 - +chr20 60732008 60732328 - +chr20 60876981 60877096 - +chr20 60877638 60877958 - +chr20 60957906 60958001 - +chr20 61075244 61075564 - +chr20 61299979 61300086 - +chr20 61422907 61423030 - +chr20 61439649 61439969 - +chr20 61531622 61531942 - +chr20 61552622 61552942 - +chr20 61603756 61603836 - +chr20 61640561 61640647 - +chr20 61687740 61688060 - +chr20 61814680 61815000 - +chr20 61846443 61846763 - +chr20 61854731 61855051 - +chr20 61940888 61941208 - +chr20 62013304 62013461 - +chr20 62127441 62127761 - +chr20 62169277 62169597 - +chr20 62406522 62406842 - +chr20 62417728 62417862 - +chr20 62495600 62495920 - +chr20 62605362 62605484 - +chr20 62612153 62612473 - +chr20 62689146 62689466 - +chr20 62701102 62701422 - +chr20 62751805 62752125 - +chr21 9909516 9909836 - +chr21 16236967 16237287 - +chr21 16855536 16855856 - +chr21 19297355 19297675 - +chr21 22371907 22372227 - +chr21 26979754 26980074 - +chr21 26980595 26980915 - +chr21 27173934 27174148 - +chr21 27234858 27235178 - +chr21 27254245 27254333 - +chr21 28088246 28088566 - +chr21 29889313 29889633 - +chr21 30117396 30117716 - +chr21 30118052 30118372 - +chr21 31222062 31222382 - +chr21 32470893 32471213 - +chr21 32953116 32953436 - +chr21 32976929 32977249 - +chr21 33238370 33238690 - +chr21 33630686 33631006 - +chr21 33942106 33942426 - +chr21 33973440 33973555 - +chr21 34499662 34499982 - +chr21 34733558 34733878 - +chr21 35267108 35267428 - +chr21 35296321 35296641 - +chr21 35574276 35574596 - +chr21 35734248 35734400 - +chr21 35763107 35763427 - +chr21 35802019 35802339 - +chr21 35815852 35816172 - +chr21 35882851 35883171 - +chr21 
35975390 35975710 - +chr21 36117813 36118133 - +chr21 37400890 37400995 - +chr21 37485104 37485424 - +chr21 37497853 37498173 - +chr21 37792272 37792592 - +chr21 37794536 37794856 - +chr21 37876552 37876872 - +chr21 38003059 38003379 - +chr21 38289889 38290209 - +chr21 38382199 38382519 - +chr21 39047524 39047844 - +chr21 39072967 39073287 - +chr21 39084573 39084724 - +chr21 39131276 39131596 - +chr21 39288616 39288936 - +chr21 39498799 39499119 - +chr21 39531394 39531714 - +chr21 39590984 39591304 - +chr21 39724585 39724905 - +chr21 39728060 39728186 - +chr21 40105832 40106152 - +chr21 40510096 40510416 - +chr21 40760519 40760839 - +chr21 40813460 40813780 - +chr21 40890230 40890550 - +chr21 43030637 43030778 - +chr21 43198530 43198613 - +chr21 43236932 43237252 - +chr21 43237621 43237941 - +chr21 43537578 43537898 - +chr21 43565629 43565949 - +chr21 43678726 43679046 - +chr21 44238378 44238698 - +chr21 44343961 44344281 - +chr21 44527677 44527997 - +chr21 44559119 44559439 - +chr21 44596871 44596967 - +chr21 45031754 45032074 - +chr21 45066561 45066881 - +chr21 45209199 45209519 - +chr21 45230721 45230874 - +chr21 45359020 45359206 - +chr21 45395370 45395690 - +chr21 45567034 45567354 - +chr21 45717554 45717874 - +chr21 45907729 45908049 - +chr21 46213239 46213559 - +chr21 46312809 46312950 - +chr21 46369695 46370015 - +chr21 46548578 46548898 - +chr21 46557031 46557351 - +chr21 46560283 46560603 - +chr21 46712474 46712630 - +chr21 46728987 46729307 - +chr21 46859335 46859655 - +chr21 46898004 46898324 - +chr21 47016663 47016983 - +chr21 47240508 47240828 - +chr21 47300374 47300694 - +chr21 47343208 47343365 - +chr21 47390582 47390902 - +chr21 47393528 47393848 - +chr21 47563548 47563647 - +chr21 47573147 47573467 - +chr21 48081109 48081429 - +chr22 17539196 17539340 - +chr22 17555841 17556161 - +chr22 17652727 17653047 - +chr22 18038242 18038362 - +chr22 18225215 18225535 - +chr22 18312816 18313026 - +chr22 18639918 18640238 - +chr22 18941909 18942229 - +chr22 
19132219 19132539 - +chr22 19617489 19617809 - +chr22 19667115 19667435 - +chr22 19718182 19718502 - +chr22 19897487 19897807 - +chr22 19935293 19935613 - +chr22 19958012 19958332 - +chr22 20000978 20001298 - +chr22 20144600 20144769 - +chr22 20273292 20273612 - +chr22 20772445 20772765 - +chr22 20810566 20810886 - +chr22 20850759 20851079 - +chr22 20918660 20918889 - +chr22 20925806 20926126 - +chr22 20960515 20960835 - +chr22 21158229 21158549 - +chr22 21239799 21240119 - +chr22 21271345 21271665 - +chr22 21316146 21316466 - +chr22 21984303 21984623 - +chr22 22001136 22001239 - +chr22 22067468 22067788 - +chr22 22292531 22292641 - +chr22 22293007 22293145 - +chr22 22326674 22326994 - +chr22 22415004 22415324 - +chr22 22465326 22465646 - +chr22 22511131 22511451 - +chr22 22549750 22550070 - +chr22 22555520 22555840 - +chr22 22562210 22562530 - +chr22 22676377 22676697 - +chr22 22697354 22697439 - +chr22 22777107 22777187 - +chr22 22918799 22919119 - +chr22 23053221 23053541 - +chr22 23090781 23091101 - +chr22 23112693 23113013 - +chr22 23130704 23131024 - +chr22 23190566 23190886 - +chr22 23298135 23298455 - +chr22 23470640 23470960 - +chr22 23516222 23516349 - +chr22 23520747 23520868 - +chr22 23624034 23624354 - +chr22 23632698 23633018 - +chr22 23728181 23728349 - +chr22 23881800 23882120 - +chr22 23933324 23933644 - +chr22 24110514 24110834 - +chr22 24191741 24192061 - +chr22 24298790 24299110 - +chr22 24372319 24372639 - +chr22 24553010 24553133 - +chr22 24571186 24571506 - +chr22 24585727 24585834 - +chr22 24643549 24643869 - +chr22 24899102 24899422 - +chr22 24921438 24921758 - +chr22 25199125 25199233 - +chr22 25246289 25246609 - +chr22 25598335 25598655 - +chr22 25758116 25758436 - +chr22 25843803 25844123 - +chr22 25858589 25858752 - +chr22 26168938 26169258 - +chr22 26727022 26727342 - +chr22 26756396 26756716 - +chr22 26763608 26763928 - +chr22 26857221 26857541 - +chr22 27061477 27061797 - +chr22 27075080 27075235 - +chr22 27152563 27152883 - +chr22 
27801884 27802204 - +chr22 28282558 28282878 - +chr22 28405339 28405659 - +chr22 28416138 28416235 - +chr22 28447729 28448049 - +chr22 28838258 28838578 - +chr22 29225496 29225624 - +chr22 29251620 29251940 - +chr22 29436698 29437018 - +chr22 29550567 29550683 - +chr22 29641626 29641797 - +chr22 29790983 29791303 - +chr22 30021592 30021912 - +chr22 30085810 30086130 - +chr22 30401331 30401493 - +chr22 30651927 30652074 - +chr22 30652069 30652389 - +chr22 30658850 30659170 - +chr22 30682830 30683150 - +chr22 30684036 30684356 - +chr22 30787110 30787430 - +chr22 30881394 30881714 - +chr22 31001509 31001829 - +chr22 31030415 31030735 - +chr22 31286629 31286949 - +chr22 31316415 31316501 - +chr22 31481208 31481528 - +chr22 31485123 31485443 - +chr22 31503653 31503973 - +chr22 31545086 31545263 - +chr22 31607946 31608266 - +chr22 31627174 31627494 - +chr22 31709385 31709493 - +chr22 31741785 31742105 - +chr22 31957278 31957598 - +chr22 32029917 32030076 - +chr22 32366646 32366966 - +chr22 32373172 32373492 - +chr22 32662182 32662502 - +chr22 32924913 32925233 - +chr22 33056211 33056531 - +chr22 33192999 33193319 - +chr22 33382038 33382358 - +chr22 33396704 33397024 - +chr22 33414277 33414597 - +chr22 33445265 33445370 - +chr22 33698104 33698250 - +chr22 33975623 33975718 - +chr22 34319711 34320031 - +chr22 35420058 35420378 - +chr22 35546536 35546856 - +chr22 35550926 35551246 - +chr22 35627637 35627724 - +chr22 35826916 35827032 - +chr22 35934893 35935213 - +chr22 35979135 35979455 - +chr22 36019357 36019677 - +chr22 36462233 36462433 - +chr22 36520067 36520387 - +chr22 36541130 36541450 - +chr22 36590960 36591280 - +chr22 36682560 36682880 - +chr22 36697371 36697691 - +chr22 36866415 36866544 - +chr22 36882695 36883015 - +chr22 36907060 36907380 - +chr22 37153656 37153976 - +chr22 37178086 37178406 - +chr22 37252468 37252660 - +chr22 37374126 37374446 - +chr22 37382304 37382624 - +chr22 37418124 37418210 - +chr22 37473950 37474103 - +chr22 37562582 37562902 - +chr22 
37595510 37595830 - +chr22 37655620 37655940 - +chr22 37656923 37657243 - +chr22 37663177 37663327 - +chr22 37713017 37713128 - +chr22 37825029 37825349 - +chr22 37853391 37853711 - +chr22 37934858 37935178 - +chr22 37941160 37941480 - +chr22 37957758 37958078 - +chr22 37963371 37963691 - +chr22 38003167 38003262 - +chr22 38039311 38039631 - +chr22 38077607 38077927 - +chr22 38187892 38188212 - +chr22 38213636 38213956 - +chr22 38283794 38284114 - +chr22 38329963 38330283 - +chr22 38470229 38470549 - +chr22 38510512 38510832 - +chr22 38749277 38749357 - +chr22 39114466 39114786 - +chr22 39129306 39129626 - +chr22 39162170 39162490 - +chr22 39164839 39165159 - +chr22 39264873 39265193 - +chr22 39266362 39266480 - +chr22 39369626 39369946 - +chr22 39399098 39399418 - +chr22 39566802 39567122 - +chr22 39706910 39707230 - +chr22 39712432 39712752 - +chr22 39739527 39739847 - +chr22 39795521 39795841 - +chr22 39892414 39892734 - +chr22 39925327 39925487 - +chr22 39930797 39931117 - +chr22 40328841 40329161 - +chr22 40420834 40421154 - +chr22 40744110 40744430 - +chr22 40921321 40921641 - +chr22 41042260 41042580 - +chr22 41079127 41079447 - +chr22 41799824 41800144 - +chr22 41913906 41913990 - +chr22 41922763 41923083 - +chr22 41929253 41929573 - +chr22 41939791 41940111 - +chr22 41993312 41993632 - +chr22 42062756 42062882 - +chr22 42093248 42093350 - +chr22 42142609 42142929 - +chr22 42337303 42337623 - +chr22 42353742 42353845 - +chr22 42486859 42487179 - +chr22 42565007 42565327 - +chr22 42833532 42833852 - +chr22 43017522 43017842 - +chr22 43057996 43058316 - +chr22 43176092 43176412 - +chr22 43263760 43264080 - +chr22 43429903 43430223 - +chr22 43456273 43456593 - +chr22 43532296 43532616 - +chr22 43627326 43627453 - +chr22 43669917 43670237 - +chr22 43742455 43742775 - +chr22 43781828 43781908 - +chr22 43784468 43784560 - +chr22 44280903 44281223 - +chr22 44341782 44342102 - +chr22 44389172 44389309 - +chr22 44477090 44477410 - +chr22 45022235 45022358 - +chr22 
45080732 45080906 - +chr22 45093429 45093514 - +chr22 45216501 45216821 - +chr22 45371527 45371847 - +chr22 45492267 45492390 - +chr22 45605097 45605417 - +chr22 45659701 45660021 - +chr22 45664868 45665188 - +chr22 45831684 45831798 - +chr22 45837086 45837406 - +chr22 45944949 45945269 - +chr22 45949691 45950011 - +chr22 45973889 45974209 - +chr22 46259893 46260047 - +chr22 46374945 46375265 - +chr22 46390727 46391047 - +chr22 46402227 46402547 - +chr22 46512423 46512743 - +chr22 46518319 46518639 - +chr22 46710195 46710515 - +chr22 46751303 46751623 - +chr22 46763533 46763853 - +chr22 46938295 46938450 - +chr22 46959240 46959560 - +chr22 47194584 47194749 - +chr22 47199311 47199631 - +chr22 50160973 50161293 - +chr22 50431833 50432153 - +chr22 50468730 50469050 - +chr22 50585318 50585445 - +chr22 50648290 50648610 - +chr22 50679536 50679856 - +chr22 50780762 50781082 - +chr22 51001165 51001485 - +chr22 51058859 51059179 - +chr22 51135114 51135434 - +chr22 51170566 51170886 - +chr3 396070 396390 - +chr3 3221397 3221477 - +chr3 4331998 4332318 - +chr3 4675102 4675422 - +chr3 4697686 4698006 - +chr3 4753085 4753405 - +chr3 5022076 5022220 - +chr3 5163704 5164024 - +chr3 8281559 8281659 - +chr3 8483727 8484047 - +chr3 8649261 8649581 - +chr3 8664611 8664931 - +chr3 8811164 8811302 - +chr3 9012788 9012881 - +chr3 9109152 9109278 - +chr3 9344273 9344593 - +chr3 9394876 9395196 - +chr3 9542235 9542555 - +chr3 9545000 9545125 - +chr3 9757592 9757718 - +chr3 9974505 9974825 - +chr3 10234561 10234881 - +chr3 10266631 10266951 - +chr3 10385930 10386061 - +chr3 10780588 10780908 - +chr3 10825081 10825401 - +chr3 10983774 10984094 - +chr3 10987389 10987709 - +chr3 11025039 11025359 - +chr3 11101242 11101562 - +chr3 11102391 11102711 - +chr3 11197119 11197439 - +chr3 11278328 11278648 - +chr3 11985584 11985904 - +chr3 12176654 12176789 - +chr3 12217375 12217695 - +chr3 12748017 12748337 - +chr3 12761578 12761898 - +chr3 12851546 12851866 - +chr3 12883225 12883545 - +chr3 
12909560 12909697 - +chr3 12933518 12933838 - +chr3 13036483 13036803 - +chr3 13133760 13133891 - +chr3 13537882 13538033 - +chr3 13689847 13690021 - +chr3 13692063 13692210 - +chr3 13699354 13699674 - +chr3 14244973 14245293 - +chr3 14302752 14302881 - +chr3 14503272 14503592 - +chr3 14509681 14509821 - +chr3 14550406 14550726 - +chr3 14962749 14962923 - +chr3 15149623 15149943 - +chr3 15244285 15244465 - +chr3 15309584 15309703 - +chr3 15419783 15420103 - +chr3 16219955 16220073 - +chr3 16288834 16289154 - +chr3 16592306 16592626 - +chr3 17882838 17882998 - +chr3 24040456 24040776 - +chr3 24279540 24279860 - +chr3 25635554 25635874 - +chr3 25689567 25689887 - +chr3 27175696 27176016 - +chr3 27353585 27353905 - +chr3 27399196 27399516 - +chr3 27700710 27701030 - +chr3 27753578 27753898 - +chr3 28709160 28709480 - +chr3 30735026 30735346 - +chr3 30810259 30810579 - +chr3 32219419 32219739 - +chr3 32338217 32338537 - +chr3 32509347 32509667 - +chr3 32822980 32823300 - +chr3 32989203 32989523 - +chr3 33116074 33116212 - +chr3 33414370 33414690 - +chr3 33828946 33829266 - +chr3 36742490 36742810 - +chr3 36910382 36910702 - +chr3 37000231 37000551 - +chr3 37821087 37821407 - +chr3 37896891 37897211 - +chr3 38008928 38009035 - +chr3 38016166 38016486 - +chr3 38038461 38038591 - +chr3 38044906 38045226 - +chr3 38159504 38159824 - +chr3 38388037 38388357 - +chr3 38521362 38521682 - +chr3 38537564 38537884 - +chr3 38562895 38563215 - +chr3 38669484 38669804 - +chr3 38692799 38692987 - +chr3 38780170 38780490 - +chr3 39072327 39072647 - +chr3 39231317 39231637 - +chr3 39329862 39330182 - +chr3 39333451 39333592 - +chr3 39489386 39489706 - +chr3 39540090 39540410 - +chr3 40233483 40233596 - +chr3 40350873 40351193 - +chr3 40369137 40369457 - +chr3 40622030 40622350 - +chr3 40626172 40626280 - +chr3 42141310 42141630 - +chr3 42232673 42232993 - +chr3 42485456 42485545 - +chr3 42589050 42589175 - +chr3 42881158 42881478 - +chr3 42922514 42922834 - +chr3 43197238 43197558 - 
+chr3 43810951 43811271 - +chr3 43897186 43897506 - +chr3 44240165 44240485 - +chr3 44364877 44365197 - +chr3 44955419 44955739 - +chr3 45079216 45079536 - +chr3 45082255 45082575 - +chr3 45252099 45252419 - +chr3 45549585 45549905 - +chr3 45553006 45553326 - +chr3 45634758 45634862 - +chr3 45698294 45698614 - +chr3 45701633 45701770 - +chr3 45802634 45802954 - +chr3 45948352 45948672 - +chr3 45970672 45970798 - +chr3 45980280 45980600 - +chr3 46120338 46120658 - +chr3 46419904 46420224 - +chr3 46491294 46491614 - +chr3 46530248 46530460 - +chr3 46599820 46600140 - +chr3 46671529 46671715 - +chr3 46746609 46746740 - +chr3 46908256 46908576 - +chr3 46936726 46937046 - +chr3 46951044 46951364 - +chr3 46964993 46965313 - +chr3 46977066 46977386 - +chr3 47365787 47366107 - +chr3 47462636 47462766 - +chr3 47517425 47517745 - +chr3 47563603 47563923 - +chr3 47879208 47879528 - +chr3 48031492 48031812 - +chr3 48193027 48193347 - +chr3 48199612 48199932 - +chr3 48235933 48236253 - +chr3 48315094 48315414 - +chr3 48477487 48477697 - +chr3 48506808 48507128 - +chr3 48586288 48586425 - +chr3 48701525 48701845 - +chr3 48774384 48774704 - +chr3 48777394 48777714 - +chr3 49051545 49051865 - +chr3 49059320 49059640 - +chr3 49208462 49208782 - +chr3 49486585 49486905 - +chr3 49577089 49577409 - +chr3 49709152 49709472 - +chr3 49738858 49739178 - +chr3 49740990 49741090 - +chr3 49840006 49840326 - +chr3 50158983 50159303 - +chr3 50226859 50226972 - +chr3 50230157 50230259 - +chr3 50231953 50232273 - +chr3 50263521 50263841 - +chr3 50283756 50283872 - +chr3 50297400 50297720 - +chr3 50310357 50310677 - +chr3 50311747 50312067 - +chr3 50362013 50362333 - +chr3 50557064 50557156 - +chr3 50579900 50580220 - +chr3 50587556 50587876 - +chr3 50609553 50609873 - +chr3 50651624 50651762 - +chr3 50653447 50653643 - +chr3 51008921 51009241 - +chr3 51137705 51138025 - +chr3 51426806 51427126 - +chr3 51428336 51428656 - +chr3 51536079 51536399 - +chr3 51671915 51672235 - +chr3 51703399 51703538 
- +chr3 51716145 51716465 - +chr3 51976082 51976402 - +chr3 52007977 52008297 - +chr3 52035515 52035835 - +chr3 52038243 52038563 - +chr3 52157276 52157406 - +chr3 52178427 52178526 - +chr3 52254563 52254660 - +chr3 52265523 52265843 - +chr3 52481114 52481353 - +chr3 52854811 52855131 - +chr3 52968695 52969015 - +chr3 53106715 53107035 - +chr3 53144231 53144551 - +chr3 53147956 53148276 - +chr3 53168730 53169050 - +chr3 53229962 53230282 - +chr3 53254031 53254136 - +chr3 53744227 53744547 - +chr3 54462339 54462659 - +chr3 54864720 54865040 - +chr3 54985427 54985747 - +chr3 55756174 55756494 - +chr3 56528872 56528969 - +chr3 56591597 56591917 - +chr3 57029471 57029791 - +chr3 57134256 57134364 - +chr3 57204195 57204515 - +chr3 57993826 57994146 - +chr3 58153177 58153329 - +chr3 58163448 58163768 - +chr3 58173444 58173764 - +chr3 58202947 58203267 - +chr3 58291730 58292050 - +chr3 58449411 58449731 - +chr3 58518507 58518827 - +chr3 58554328 58554648 - +chr3 58571092 58571412 - +chr3 58615872 58616192 - +chr3 58626365 58626685 - +chr3 58809969 58810289 - +chr3 58980479 58980799 - +chr3 59561133 59561453 - +chr3 61550249 61550569 - +chr3 63335315 63335635 - +chr3 63336866 63337186 - +chr3 63979416 63979526 - +chr3 64009153 64009312 - +chr3 64016367 64016687 - +chr3 64470117 64470437 - +chr3 64516149 64516469 - +chr3 64519313 64519633 - +chr3 65327784 65328104 - +chr3 65425308 65425628 - +chr3 65658594 65658914 - +chr3 65736822 65737142 - +chr3 66458188 66458508 - +chr3 67022278 67022598 - +chr3 67164262 67164582 - +chr3 67840501 67840821 - +chr3 68925320 68925640 - +chr3 69021573 69021893 - +chr3 69170172 69170270 - +chr3 69210767 69211087 - +chr3 69355539 69355859 - +chr3 69459543 69459863 - +chr3 69740245 69740565 - +chr3 70157253 70157573 - +chr3 70169557 70169877 - +chr3 70599914 70600234 - +chr3 71730347 71730667 - +chr3 71779027 71779347 - +chr3 71924580 71924900 - +chr3 73150367 73150459 - +chr3 75241237 75241557 - +chr3 76157138 76157458 - +chr3 81539678 
81539998 - +chr3 82182181 82182501 - +chr3 87399788 87400108 - +chr3 88252722 88253042 - +chr3 94243920 94244240 - +chr3 97438223 97438303 - +chr3 97742108 97742428 - +chr3 97758183 97758503 - +chr3 98833510 98833648 - +chr3 98891914 98892234 - +chr3 99218076 99218396 - +chr3 99251142 99251462 - +chr3 99527062 99527382 - +chr3 99809217 99809537 - +chr3 99973061 99973381 - +chr3 100309427 100309747 - +chr3 100612469 100612789 - +chr3 101292427 101292747 - +chr3 101355139 101355459 - +chr3 101522057 101522377 - +chr3 101947916 101948236 - +chr3 102392358 102392678 - +chr3 106328601 106328921 - +chr3 106783349 106783669 - +chr3 107148761 107149081 - +chr3 107565121 107565441 - +chr3 107646380 107646700 - +chr3 108633909 108634229 - +chr3 109450753 109451073 - +chr3 111056946 111057266 - +chr3 111258593 111258913 - +chr3 111434830 111435150 - +chr3 111436406 111436726 - +chr3 111686574 111686894 - +chr3 111732412 111732732 - +chr3 111762515 111762835 - +chr3 112361771 112362091 - +chr3 112754277 112754597 - +chr3 112770079 112770399 - +chr3 113007016 113007200 - +chr3 113345758 113345840 - +chr3 113871290 113871610 - +chr3 114009711 114010031 - +chr3 114102629 114102949 - +chr3 114811346 114811666 - +chr3 115411802 115412122 - +chr3 115503383 115503703 - +chr3 115533660 115533980 - +chr3 115538657 115538977 - +chr3 116477081 116477401 - +chr3 116489709 116490029 - +chr3 117716346 117716666 - +chr3 118603541 118603861 - +chr3 118960355 118960675 - +chr3 119539864 119540184 - +chr3 120277781 120278101 - +chr3 120280083 120280403 - +chr3 120513919 120514239 - +chr3 122473720 122474040 - +chr3 122661737 122662057 - +chr3 123005224 123005401 - +chr3 123047828 123047933 - +chr3 123208271 123208591 - +chr3 123361236 123361556 - +chr3 123442434 123442754 - +chr3 124187141 124187317 - +chr3 124269306 124269626 - +chr3 124278738 124279058 - +chr3 124283820 124284140 - +chr3 124407716 124408036 - +chr3 124480109 124480429 - +chr3 124620292 124620402 - +chr3 124884061 124884381 - 
+chr3 125156778 125156908 - +chr3 125304499 125304819 - +chr3 125917114 125917434 - +chr3 125978315 125978635 - +chr3 126108204 126108345 - +chr3 126241135 126241455 - +chr3 126398087 126398407 - +chr3 126471089 126471409 - +chr3 126701348 126701668 - +chr3 127174522 127174626 - +chr3 127263109 127263429 - +chr3 127293908 127294084 - +chr3 127371134 127371233 - +chr3 127409717 127410037 - +chr3 127794448 127794768 - +chr3 127814003 127814323 - +chr3 127852387 127852707 - +chr3 127889885 127889965 - +chr3 128145103 128145423 - +chr3 128327216 128327536 - +chr3 128336657 128336769 - +chr3 128399533 128399853 - +chr3 128513926 128514246 - +chr3 128689122 128689442 - +chr3 128880420 128880740 - +chr3 128964557 128964877 - +chr3 129102152 129102472 - +chr3 129118415 129118735 - +chr3 129209058 129209378 - +chr3 129346777 129347097 - +chr3 129653376 129653696 - +chr3 129661089 129661409 - +chr3 129995042 129995362 - +chr3 130478609 130478929 - +chr3 131513631 131513951 - +chr3 131748252 131748388 - +chr3 131753780 131754100 - +chr3 132071430 132071750 - +chr3 132118191 132118511 - +chr3 132760105 132760425 - +chr3 133142445 133142765 - +chr3 133537684 133538004 - +chr3 133931872 133932014 - +chr3 134027316 134027453 - +chr3 134168145 134168465 - +chr3 134319665 134319985 - +chr3 134474015 134474335 - +chr3 134513835 134513954 - +chr3 135222725 135223045 - +chr3 135614005 135614325 - +chr3 135794091 135794411 - +chr3 135969031 135969351 - +chr3 136471094 136471414 - +chr3 136751346 136751523 - +chr3 136829267 136829587 - +chr3 137690059 137690379 - +chr3 137771062 137771382 - +chr3 137787533 137787685 - +chr3 137837749 137838069 - +chr3 138136469 138136606 - +chr3 138198324 138198416 - +chr3 138313494 138313814 - +chr3 138634822 138635142 - +chr3 138642339 138642659 - +chr3 138655044 138655364 - +chr3 138830705 138831025 - +chr3 139048276 139048460 - +chr3 139072829 139073149 - +chr3 140718892 140719212 - +chr3 140730380 140730700 - +chr3 140911123 140911443 - +chr3 
141160255 141160575 - +chr3 141170902 141171222 - +chr3 141178969 141179289 - +chr3 141280433 141280539 - +chr3 141363708 141364028 - +chr3 141456949 141457269 - +chr3 141594353 141594673 - +chr3 141681408 141681728 - +chr3 142547568 142547888 - +chr3 143393247 143393567 - +chr3 145877971 145878291 - +chr3 145907378 145907698 - +chr3 146909595 146909915 - +chr3 147663833 147664153 - +chr3 148990947 148991267 - +chr3 149070268 149070588 - +chr3 149111159 149111479 - +chr3 149198225 149198545 - +chr3 149206895 149207215 - +chr3 149211987 149212307 - +chr3 149489484 149489804 - +chr3 149526040 149526360 - +chr3 149654526 149654846 - +chr3 149939185 149939298 - +chr3 149945585 149945905 - +chr3 150505578 150505898 - +chr3 150863853 150864173 - +chr3 150945517 150945837 - +chr3 151186101 151186421 - +chr3 153486690 153487010 - +chr3 153970860 153971180 - +chr3 155100249 155100569 - +chr3 155523934 155524254 - +chr3 155868285 155868605 - +chr3 155869627 155869947 - +chr3 156100282 156100602 - +chr3 156799552 156799872 - +chr3 156956931 156957251 - +chr3 157164858 157165178 - +chr3 157290382 157290702 - +chr3 157324529 157324622 - +chr3 158362044 158362364 - +chr3 158390393 158390713 - +chr3 158509130 158509450 - +chr3 158635720 158636040 - +chr3 159040167 159040487 - +chr3 159522962 159523282 - +chr3 159706333 159706653 - +chr3 159724363 159724683 - +chr3 159750128 159750228 - +chr3 160638854 160639174 - +chr3 161569791 161570111 - +chr3 161632540 161632860 - +chr3 167645842 167646162 - +chr3 168867079 168867399 - +chr3 169581224 169581544 - +chr3 169755532 169755852 - +chr3 170027943 170028263 - +chr3 170132018 170132184 - +chr3 170408201 170408521 - +chr3 170529785 170530105 - +chr3 171332372 171332692 - +chr3 171547736 171548056 - +chr3 172050314 172050634 - +chr3 172179371 172179691 - +chr3 172195110 172195430 - +chr3 172384652 172384972 - +chr3 172428871 172429191 - +chr3 173301954 173302274 - +chr3 176590540 176590860 - +chr3 179276597 179276917 - +chr3 179646858 
179647178 - +chr3 181418259 181418579 - +chr3 181419889 181420209 - +chr3 182401489 182401586 - +chr3 183602587 183602907 - +chr3 183613036 183613356 - +chr3 183945188 183945508 - +chr3 183968152 183968297 - +chr3 184072113 184072433 - +chr3 184089406 184089555 - +chr3 184135759 184135869 - +chr3 184231556 184231876 - +chr3 184261919 184261999 - +chr3 184279133 184279273 - +chr3 184445282 184445602 - +chr3 184521181 184521332 - +chr3 185025944 185026264 - +chr3 185071907 185072227 - +chr3 185378313 185378468 - +chr3 185395026 185395346 - +chr3 185661411 185661574 - +chr3 185695220 185695540 - +chr3 185701324 185701644 - +chr3 185977422 185977546 - +chr3 186332762 186332904 - +chr3 186334862 186335182 - +chr3 186857464 186857784 - +chr3 186937987 186938307 - +chr3 187036630 187036950 - +chr3 187037582 187037763 - +chr3 187397436 187397756 - +chr3 187432221 187432541 - +chr3 187456621 187456941 - +chr3 187457842 187457960 - +chr3 187458673 187458993 - +chr3 187459824 187460144 - +chr3 187465874 187466028 - +chr3 188672448 188672768 - +chr3 189679361 189679681 - +chr3 190040149 190040469 - +chr3 190602032 190602352 - +chr3 191134821 191135141 - +chr3 192162330 192162650 - +chr3 192185077 192185397 - +chr3 193290200 193290520 - +chr3 193323722 193324042 - +chr3 193498580 193498900 - +chr3 193505938 193506258 - +chr3 194033588 194033691 - +chr3 194054502 194054822 - +chr3 194064668 194064988 - +chr3 194193138 194193272 - +chr3 194403323 194403643 - +chr3 194827260 194827428 - +chr3 194992234 194992554 - +chr3 195577808 195578049 - +chr3 195586280 195586445 - +chr3 195590088 195590195 - +chr3 195621590 195621747 - +chr3 195641919 195642239 - +chr3 195800128 195800448 - +chr3 195923537 195923857 - +chr3 196336131 196336451 - +chr3 196411790 196412110 - +chr3 196564747 196565067 - +chr3 197237026 197237346 - +chr3 197632870 197633190 - +chr4 668021 668341 - +chr4 736531 736656 - +chr4 1066695 1067015 - +chr4 1124443 1124636 - +chr4 1160937 1161257 - +chr4 1345673 1345993 - 
+chr4 1350007 1350327 - +chr4 1550594 1550914 - +chr4 1630962 1631133 - +chr4 1695251 1695382 - +chr4 1728083 1728229 - +chr4 1747949 1748269 - +chr4 1811767 1811902 - +chr4 1901000 1901320 - +chr4 2221772 2222092 - +chr4 2443417 2443737 - +chr4 2765833 2766153 - +chr4 2980139 2980459 - +chr4 3039059 3039379 - +chr4 3049141 3049461 - +chr4 3108343 3108663 - +chr4 3445305 3445465 - +chr4 3482329 3482519 - +chr4 4336993 4337313 - +chr4 4432352 4432672 - +chr4 4490157 4490477 - +chr4 4670919 4671239 - +chr4 4996850 4997170 - +chr4 5052176 5052496 - +chr4 5401575 5401895 - +chr4 5792020 5792340 - +chr4 5823347 5823667 - +chr4 6000083 6000403 - +chr4 6021423 6021743 - +chr4 6329027 6329347 - +chr4 6333787 6334107 - +chr4 6548408 6548728 - +chr4 6721980 6722300 - +chr4 6875738 6876058 - +chr4 6976232 6976391 - +chr4 7072631 7072755 - +chr4 7103724 7104044 - +chr4 7158618 7158938 - +chr4 7337521 7337841 - +chr4 7721330 7721650 - +chr4 7803888 7804208 - +chr4 7805760 7806080 - +chr4 7989143 7989463 - +chr4 8178658 8178797 - +chr4 8243917 8244237 - +chr4 8268908 8269228 - +chr4 8404928 8405248 - +chr4 8470586 8470725 - +chr4 8609352 8609672 - +chr4 8919756 8920076 - +chr4 10159410 10159730 - +chr4 10462920 10463240 - +chr4 10539747 10539873 - +chr4 10551317 10551637 - +chr4 10583716 10584036 - +chr4 11721955 11722275 - +chr4 11767523 11767843 - +chr4 12634709 12635029 - +chr4 13550577 13550897 - +chr4 13655705 13656025 - +chr4 13658130 13658450 - +chr4 15884252 15884572 - +chr4 15957910 15958230 - +chr4 16003039 16003359 - +chr4 16796490 16796810 - +chr4 16918392 16918712 - +chr4 17449370 17449690 - +chr4 17638453 17638773 - +chr4 24421090 24421410 - +chr4 24474658 24474978 - +chr4 24490312 24490632 - +chr4 24981750 24982070 - +chr4 25314234 25314326 - +chr4 25441586 25441906 - +chr4 25769634 25769761 - +chr4 25989859 25989992 - +chr4 26062904 26063224 - +chr4 27011997 27012317 - +chr4 37604719 37604877 - +chr4 38181295 38181615 - +chr4 38321533 38321716 - +chr4 38677177 
38677497 - +chr4 38732869 38732968 - +chr4 38862519 38862839 - +chr4 39136960 39137280 - +chr4 39470993 39471313 - +chr4 39978081 39978401 - +chr4 39979479 39979799 - +chr4 40311275 40311595 - +chr4 40327465 40327785 - +chr4 40843025 40843345 - +chr4 40899644 40899964 - +chr4 40910704 40911024 - +chr4 40926672 40926806 - +chr4 41663611 41663730 - +chr4 41694895 41695215 - +chr4 41884922 41885032 - +chr4 46933999 46934319 - +chr4 47034701 47035021 - +chr4 47124845 47124949 - +chr4 48580414 48580734 - +chr4 48965213 48965326 - +chr4 52761989 52762309 - +chr4 53411756 53411878 - +chr4 53777887 53778207 - +chr4 54357208 54357528 - +chr4 55083998 55084318 - +chr4 55094265 55094585 - +chr4 55099473 55099793 - +chr4 55100451 55100771 - +chr4 55185185 55185349 - +chr4 55189016 55189336 - +chr4 56178703 56179023 - +chr4 56212610 56212930 - +chr4 56814762 56815082 - +chr4 57167683 57168003 - +chr4 57410828 57410922 - +chr4 57666338 57666658 - +chr4 57904189 57904509 - +chr4 58030298 58030388 - +chr4 61066822 61067142 - +chr4 68192999 68193319 - +chr4 69241721 69242041 - +chr4 69364165 69364485 - +chr4 71370971 71371121 - +chr4 71467155 71467475 - +chr4 71553800 71554120 - +chr4 74122905 74123225 - +chr4 74439447 74439767 - +chr4 74595283 74595603 - +chr4 74999999 75000319 - +chr4 75094439 75094759 - +chr4 75236590 75236910 - +chr4 75799998 75800318 - +chr4 76498701 76499021 - +chr4 76649155 76649475 - +chr4 76837168 76837488 - +chr4 76988864 76989184 - +chr4 77100997 77101317 - +chr4 77679501 77679821 - +chr4 78016505 78016825 - +chr4 78124798 78125118 - +chr4 78161640 78161960 - +chr4 78522067 78522387 - +chr4 78539826 78540146 - +chr4 79565910 79566230 - +chr4 81283780 81284100 - +chr4 81973648 81973968 - +chr4 81990935 81991255 - +chr4 82272855 82273175 - +chr4 82388587 82388907 - +chr4 83189766 83190086 - +chr4 83206391 83206711 - +chr4 83660504 83660824 - +chr4 84031556 84031876 - +chr4 84173990 84174310 - +chr4 84412069 84412389 - +chr4 84430250 84430570 - +chr4 
84692302 84692622 - +chr4 87144728 87145048 - +chr4 87417269 87417589 - +chr4 87843637 87843795 - +chr4 88071921 88072241 - +chr4 88600642 88600962 - +chr4 88894773 88895093 - +chr4 88976093 88976413 - +chr4 89594099 89594419 - +chr4 90823012 90823332 - +chr4 91236271 91236591 - +chr4 94712759 94713079 - +chr4 95327849 95328169 - +chr4 99388688 99389008 - +chr4 99977208 99977528 - +chr4 100326702 100327022 - +chr4 100870137 100870457 - +chr4 101014573 101014893 - +chr4 101017222 101017542 - +chr4 106395117 106395437 - +chr4 107160053 107160373 - +chr4 107423266 107423586 - +chr4 108104340 108104660 - +chr4 108658204 108658524 - +chr4 110343573 110343893 - +chr4 110971114 110971434 - +chr4 111132459 111132779 - +chr4 111437543 111437863 - +chr4 111440072 111440392 - +chr4 112996358 112996678 - +chr4 113242983 113243303 - +chr4 113558431 113558751 - +chr4 114199503 114199823 - +chr4 114468707 114469027 - +chr4 119274047 119274367 - +chr4 121640133 121640453 - +chr4 121898780 121899100 - +chr4 122405718 122406038 - +chr4 122821583 122821903 - +chr4 122871328 122871648 - +chr4 123638014 123638334 - +chr4 124485925 124486245 - +chr4 125183427 125183747 - +chr4 127525468 127525788 - +chr4 128321929 128322249 - +chr4 128459981 128460301 - +chr4 128508170 128508490 - +chr4 129112475 129112795 - +chr4 129375435 129375755 - +chr4 129784684 129785004 - +chr4 134067933 134068082 - +chr4 139840309 139840629 - +chr4 140216871 140217191 - +chr4 140572776 140573096 - +chr4 140578166 140578486 - +chr4 140655937 140656257 - +chr4 140933782 140934102 - +chr4 141174387 141174707 - +chr4 141444916 141445236 - +chr4 146155266 146155586 - +chr4 146478954 146479056 - +chr4 146877473 146877793 - +chr4 147163132 147163452 - +chr4 148082065 148082385 - +chr4 148440114 148440230 - +chr4 148787990 148788310 - +chr4 148905196 148905516 - +chr4 150185355 150185675 - +chr4 150835689 150836009 - +chr4 150902034 150902354 - +chr4 151164060 151164380 - +chr4 152020461 152020781 - +chr4 152197940 
152198260 - +chr4 152283933 152284104 - +chr4 152722762 152723082 - +chr4 154006442 154006600 - +chr4 154064120 154064440 - +chr4 154265554 154265874 - +chr4 156275126 156275446 - +chr4 156363460 156363780 - +chr4 157966714 157967034 - +chr4 158899932 158900252 - +chr4 159588821 159589141 - +chr4 159651159 159651479 - +chr4 160124164 160124484 - +chr4 160367679 160367999 - +chr4 160376789 160377109 - +chr4 160408397 160408717 - +chr4 164253917 164254237 - +chr4 164465577 164465897 - +chr4 166052518 166052625 - +chr4 168998204 168998524 - +chr4 169506674 169506994 - +chr4 170414094 170414414 - +chr4 170731486 170731806 - +chr4 171011255 171011575 - +chr4 174178944 174179264 - +chr4 174427009 174427329 - +chr4 177968513 177968833 - +chr4 178171334 178171498 - +chr4 182222936 182223256 - +chr4 182449760 182450080 - +chr4 183727532 183727852 - +chr4 183859268 183859588 - +chr4 184183621 184183941 - +chr4 184328297 184328617 - +chr4 184761408 184761728 - +chr4 185021468 185021788 - +chr4 185089722 185089806 - +chr4 185926481 185926611 - +chr4 185960964 185961284 - +chr4 186518855 186518994 - +chr4 186942741 186943061 - +chr4 187508606 187508926 - +chr4 187543036 187543356 - +chr4 187870158 187870478 - +chr4 188527685 188528005 - +chr4 190011383 190011703 - +chr4 190045354 190045674 - +chr4 190123454 190123774 - +chr4 190714212 190714532 - +chr4 190825634 190825954 - +chr4 190861739 190862059 - +chr5 188725 189045 - +chr5 317245 317565 - +chr5 431094 431414 - +chr5 671797 671949 - +chr5 923633 923748 - +chr5 1168275 1168595 - +chr5 1479730 1480050 - +chr5 1641726 1642046 - +chr5 1645025 1645345 - +chr5 1763840 1763974 - +chr5 1772840 1773160 - +chr5 5497310 5497630 - +chr5 6765822 6766142 - +chr5 7746270 7746590 - +chr5 7816409 7816729 - +chr5 7927910 7928230 - +chr5 8184624 8184944 - +chr5 8908201 8908521 - +chr5 9361875 9362195 - +chr5 9544597 9544917 - +chr5 10284133 10284453 - +chr5 10695180 10695500 - +chr5 10857280 10857404 - +chr5 10863920 10864240 - +chr5 
11267400 11267720 - +chr5 13987970 13988290 - +chr5 14352200 14352520 - +chr5 14648693 14649013 - +chr5 14727483 14727803 - +chr5 15003592 15003912 - +chr5 15105899 15106219 - +chr5 15824443 15824763 - +chr5 16550477 16550797 - +chr5 16552967 16553086 - +chr5 16690364 16690684 - +chr5 16934176 16934496 - +chr5 17062292 17062612 - +chr5 17216220 17216540 - +chr5 18698658 18698978 - +chr5 21459328 21459648 - +chr5 31186817 31187137 - +chr5 32232575 32232895 - +chr5 32329492 32329812 - +chr5 32613212 32613532 - +chr5 32709857 32710177 - +chr5 33934847 33935167 - +chr5 33970003 33970323 - +chr5 34474632 34474952 - +chr5 34898415 34898735 - +chr5 35003505 35003825 - +chr5 35089817 35090137 - +chr5 35183335 35183655 - +chr5 35617733 35618053 - +chr5 35925035 35925355 - +chr5 35926580 35926900 - +chr5 36079305 36079625 - +chr5 36507457 36507777 - +chr5 36690442 36690762 - +chr5 36739774 36740094 - +chr5 37413094 37413414 - +chr5 37837496 37837816 - +chr5 38193257 38193577 - +chr5 38265671 38265991 - +chr5 38382949 38383269 - +chr5 38420745 38421065 - +chr5 38441951 38442033 - +chr5 38809961 38810281 - +chr5 38922243 38922563 - +chr5 39080731 39081051 - +chr5 39217908 39218228 - +chr5 39714199 39714519 - +chr5 40904900 40905108 - +chr5 42889448 42889768 - +chr5 42909032 42909352 - +chr5 42995023 42995343 - +chr5 43040443 43040763 - +chr5 43065009 43065329 - +chr5 43105868 43106092 - +chr5 43389877 43390197 - +chr5 43627149 43627469 - +chr5 50258869 50259009 - +chr5 53525036 53525356 - +chr5 53813253 53813573 - +chr5 54442533 54442659 - +chr5 54640248 54640568 - +chr5 55231965 55232285 - +chr5 55496978 55497298 - +chr5 55566441 55566609 - +chr5 56795203 56795523 - +chr5 57794194 57794514 - +chr5 58182486 58182806 - +chr5 58470102 58470422 - +chr5 59105811 59106131 - +chr5 60954375 60954695 - +chr5 65602351 65602452 - +chr5 66283809 66284129 - +chr5 67548530 67548850 - +chr5 68258735 68259055 - +chr5 68303762 68303850 - +chr5 68389479 68389799 - +chr5 68630622 68630942 - 
+chr5 71078757 71079077 - +chr5 71314120 71314258 - +chr5 71604289 71604609 - +chr5 71907065 71907210 - +chr5 72497781 72498101 - +chr5 72511889 72512209 - +chr5 72747136 72747456 - +chr5 72750288 72750608 - +chr5 72815530 72815850 - +chr5 72832671 72832991 - +chr5 73088598 73088918 - +chr5 74345664 74345984 - +chr5 75470897 75471217 - +chr5 76070514 76070650 - +chr5 76440385 76440705 - +chr5 76788251 76788571 - +chr5 77304219 77304539 - +chr5 78195681 78196001 - +chr5 78316861 78317181 - +chr5 78320944 78321031 - +chr5 78429442 78429762 - +chr5 78551796 78552116 - +chr5 78852026 78852346 - +chr5 79154060 79154193 - +chr5 79378658 79378978 - +chr5 79777139 79777459 - +chr5 79968021 79968341 - +chr5 80490898 80491218 - +chr5 80802814 80803134 - +chr5 81521432 81521752 - +chr5 81608181 81608501 - +chr5 82089681 82090001 - +chr5 82726349 82726446 - +chr5 82769382 82769702 - +chr5 83014055 83014375 - +chr5 83222984 83223304 - +chr5 86176654 86176974 - +chr5 87076967 87077287 - +chr5 87976267 87976587 - +chr5 87986872 87987022 - +chr5 89561123 89561443 - +chr5 89729960 89730280 - +chr5 90079570 90079890 - +chr5 91854037 91854357 - +chr5 92917078 92917398 - +chr5 94073610 94073930 - +chr5 94980035 94980355 - +chr5 95116761 95117081 - +chr5 95182711 95183031 - +chr5 95194579 95194660 - +chr5 95672116 95672436 - +chr5 96204589 96204909 - +chr5 96497715 96498035 - +chr5 97685442 97685762 - +chr5 98103941 98104063 - +chr5 98132065 98132385 - +chr5 98368218 98368538 - +chr5 98396080 98396400 - +chr5 101897865 101898185 - +chr5 102017138 102017458 - +chr5 102116212 102116532 - +chr5 102475028 102475348 - +chr5 102568155 102568475 - +chr5 106907771 106908091 - +chr5 107009471 107009574 - +chr5 107211632 107211952 - +chr5 107596385 107596509 - +chr5 107978023 107978343 - +chr5 108017023 108017343 - +chr5 109026735 109027055 - +chr5 110493887 110494095 - +chr5 111496387 111496545 - +chr5 112183964 112184284 - +chr5 112331734 112332054 - +chr5 113785633 113785953 - +chr5 114433806 
114434126 - +chr5 114970485 114970658 - +chr5 115166876 115167196 - +chr5 115911755 115912075 - +chr5 116790413 116790733 - +chr5 121412380 121412700 - +chr5 121751242 121751562 - +chr5 122680550 122680870 - +chr5 122946836 122947156 - +chr5 125171309 125171629 - +chr5 125570309 125570629 - +chr5 125936366 125936686 - +chr5 125992123 125992443 - +chr5 126625624 126625944 - +chr5 126649580 126649900 - +chr5 127537061 127537381 - +chr5 128010595 128010915 - +chr5 129103610 129103930 - +chr5 130925264 130925584 - +chr5 131343607 131343927 - +chr5 131399269 131399376 - +chr5 131400495 131400660 - +chr5 131515864 131516184 - +chr5 131525923 131526243 - +chr5 131573962 131574282 - +chr5 131608559 131608655 - +chr5 131619155 131619475 - +chr5 131721095 131721415 - +chr5 131722248 131722568 - +chr5 131755347 131755667 - +chr5 131836922 131837242 - +chr5 131862466 131862786 - +chr5 132155846 132156166 - +chr5 132165624 132165944 - +chr5 132446925 132447245 - +chr5 132759517 132759837 - +chr5 133204916 133205236 - +chr5 133375315 133375500 - +chr5 133437807 133437927 - +chr5 133476204 133476524 - +chr5 133481550 133481870 - +chr5 133519882 133520202 - +chr5 133863374 133863459 - +chr5 133913507 133913827 - +chr5 133922596 133922689 - +chr5 134240342 134240460 - +chr5 134527071 134527391 - +chr5 134793184 134793504 - +chr5 134801818 134802138 - +chr5 134896515 134896835 - +chr5 134901986 134902118 - +chr5 135165820 135166140 - +chr5 135223072 135223392 - +chr5 135418166 135418486 - +chr5 135547927 135548247 - +chr5 135585366 135585686 - +chr5 136105815 136106135 - +chr5 136656142 136656462 - +chr5 136680526 136680685 - +chr5 136965261 136965581 - +chr5 137087593 137087913 - +chr5 137371640 137371960 - +chr5 137404489 137404809 - +chr5 137558501 137558821 - +chr5 137688079 137688345 - +chr5 137785668 137785988 - +chr5 137799776 137800096 - +chr5 137839472 137839792 - +chr5 137914970 137915290 - +chr5 137977518 137977838 - +chr5 138598440 138598579 - +chr5 138758004 138758324 - 
+chr5 138897524 138897844 - +chr5 139012985 139013104 - +chr5 139145117 139145303 - +chr5 139154688 139155008 - +chr5 139167143 139167269 - +chr5 139224018 139224338 - +chr5 139243823 139243929 - +chr5 139348497 139348583 - +chr5 139523825 139524145 - +chr5 139536738 139537058 - +chr5 139619422 139619742 - +chr5 139624064 139624384 - +chr5 139737349 139737440 - +chr5 139742866 139742988 - +chr5 139752483 139752803 - +chr5 140571740 140572060 - +chr5 140699732 140700052 - +chr5 140705582 140705902 - +chr5 140709824 140710144 - +chr5 140729561 140729657 - +chr5 140777275 140777595 - +chr5 140792349 140792669 - +chr5 140797143 140797230 - +chr5 140891933 140892253 - +chr5 140904308 140904628 - +chr5 140940567 140940650 - +chr5 140971593 140971913 - +chr5 140998668 140998988 - +chr5 141044576 141044896 - +chr5 141071658 141071978 - +chr5 141081717 141082037 - +chr5 141258833 141258976 - +chr5 141340301 141340621 - +chr5 141524910 141525230 - +chr5 141928379 141928699 - +chr5 142035249 142035569 - +chr5 142078792 142078927 - +chr5 142092728 142093048 - +chr5 142097129 142097449 - +chr5 142260100 142260420 - +chr5 143204729 143205049 - +chr5 143583334 143583654 - +chr5 144726610 144726930 - +chr5 145267578 145267898 - +chr5 145429024 145429344 - +chr5 145912844 145913164 - +chr5 145921839 145922159 - +chr5 146833429 146833749 - +chr5 146956546 146956866 - +chr5 147007667 147007987 - +chr5 147237130 147237450 - +chr5 147754342 147754662 - +chr5 148337204 148337524 - +chr5 148414878 148415198 - +chr5 148494279 148494599 - +chr5 148507626 148507946 - +chr5 148520847 148521167 - +chr5 148608624 148608944 - +chr5 148747157 148747278 - +chr5 148750954 148751274 - +chr5 148761583 148761687 - +chr5 148814036 148814356 - +chr5 148822081 148822401 - +chr5 148931308 148931628 - +chr5 148998296 148998616 - +chr5 149016392 149016712 - +chr5 149047127 149047447 - +chr5 149208011 149208331 - +chr5 149213835 149214155 - +chr5 149217329 149217460 - +chr5 149232714 149232822 - +chr5 
149370357 149370677 - +chr5 149509535 149509855 - +chr5 149541275 149541434 - +chr5 149597743 149598063 - +chr5 149652121 149652441 - +chr5 149829487 149829807 - +chr5 149838406 149838726 - +chr5 149924346 149924666 - +chr5 149940521 149940841 - +chr5 149969347 149969667 - +chr5 150427497 150427622 - +chr5 150431221 150431541 - +chr5 150495003 150495136 - +chr5 150544372 150544692 - +chr5 150618823 150619143 - +chr5 150695701 150695837 - +chr5 150826901 150827221 - +chr5 150880426 150880537 - +chr5 150902977 150903297 - +chr5 151031733 151031838 - +chr5 151038524 151038844 - +chr5 151120262 151120582 - +chr5 151122408 151122728 - +chr5 151204953 151205273 - +chr5 151522445 151522765 - +chr5 152876367 152876687 - +chr5 153344651 153344971 - +chr5 153451894 153452214 - +chr5 153559681 153560001 - +chr5 153784807 153785127 - +chr5 153847255 153847575 - +chr5 153900502 153900822 - +chr5 153989931 153990251 - +chr5 154022067 154022387 - +chr5 154191468 154191788 - +chr5 154519195 154519515 - +chr5 154996383 154996703 - +chr5 155370838 155371158 - +chr5 156575866 156576186 - +chr5 156680498 156680818 - +chr5 156755444 156755764 - +chr5 156793766 156794086 - +chr5 156903982 156904302 - +chr5 157170584 157170773 - +chr5 157188995 157189315 - +chr5 157375815 157376135 - +chr5 157502285 157502605 - +chr5 157601965 157602285 - +chr5 158122545 158122865 - +chr5 158533865 158534185 - +chr5 158758356 158758676 - +chr5 159392595 159392915 - +chr5 159414391 159414711 - +chr5 159772376 159772696 - +chr5 159865403 159865723 - +chr5 159866149 159866469 - +chr5 162835480 162835800 - +chr5 167150293 167150613 - +chr5 167541435 167541755 - +chr5 167646809 167647129 - +chr5 167696364 167696684 - +chr5 167913332 167913451 - +chr5 168106481 168106801 - +chr5 168133592 168133912 - +chr5 168729547 168729867 - +chr5 169008665 169008985 - +chr5 169407623 169407943 - +chr5 169611093 169611413 - +chr5 169694876 169695196 - +chr5 169862012 169862332 - +chr5 170060054 170060374 - +chr5 170065725 
170066045 - +chr5 170171492 170171812 - +chr5 170763988 170764308 - +chr5 170845807 170845887 - +chr5 171467196 171467516 - +chr5 171533878 171534198 - +chr5 171615691 171616011 - +chr5 171720511 171720831 - +chr5 171797688 171798008 - +chr5 171878235 171878555 - +chr5 171889217 171889537 - +chr5 171906125 171906233 - +chr5 172124549 172124869 - +chr5 172140046 172140366 - +chr5 172141813 172141961 - +chr5 172143814 172144134 - +chr5 172199008 172199328 - +chr5 172359426 172359746 - +chr5 172386289 172386389 - +chr5 172571384 172571508 - +chr5 172710872 172711192 - +chr5 172720955 172721275 - +chr5 172759144 172759243 - +chr5 172822266 172822586 - +chr5 173000116 173000436 - +chr5 173400191 173400511 - +chr5 173744400 173744548 - +chr5 173754326 173754646 - +chr5 173846566 173846886 - +chr5 174220849 174220929 - +chr5 174337233 174337553 - +chr5 174362902 174363222 - +chr5 174378194 174378514 - +chr5 174871717 174872037 - +chr5 174891661 174891981 - +chr5 174961152 174961339 - +chr5 175156815 175157135 - +chr5 175875066 175875386 - +chr5 176001043 176001363 - +chr5 176085041 176085361 - +chr5 176087903 176087983 - +chr5 176498848 176499168 - +chr5 176538974 176539116 - +chr5 176540972 176541153 - +chr5 176736696 176737016 - +chr5 176744897 176745217 - +chr5 176817613 176817725 - +chr5 176830743 176831063 - +chr5 176881417 176881737 - +chr5 176883393 176883713 - +chr5 176918466 176918786 - +chr5 176927998 176928318 - +chr5 176988280 176988600 - +chr5 177034469 177034789 - +chr5 177503150 177503470 - +chr5 177573046 177573366 - +chr5 177591294 177591467 - +chr5 177706956 177707276 - +chr5 178157677 178157997 - +chr5 178266385 178266705 - +chr5 178770981 178771301 - +chr5 178831348 178831668 - +chr5 179095450 179095770 - +chr5 179190164 179190484 - +chr5 179554067 179554387 - +chr5 179720489 179720809 - +chr5 179742804 179742972 - +chr5 179898108 179898189 - +chr5 180032965 180033285 - +chr5 180211537 180211857 - +chr5 180296012 180296332 - +chr5 180586422 180586507 - 
+chr5 180608915 180609235 - +chr6 188835 189155 - +chr6 722019 722339 - +chr6 913103 913423 - +chr6 1397248 1397568 - +chr6 1424579 1424899 - +chr6 1555293 1555613 - +chr6 1596617 1596718 - +chr6 1604576 1604896 - +chr6 2209506 2209826 - +chr6 2449026 2449346 - +chr6 2634716 2635036 - +chr6 2791436 2791756 - +chr6 2932507 2932700 - +chr6 2952857 2953026 - +chr6 2989251 2989571 - +chr6 3126599 3126919 - +chr6 3251805 3251929 - +chr6 3279806 3280126 - +chr6 3411179 3411499 - +chr6 3732798 3733118 - +chr6 3742636 3742956 - +chr6 3748531 3748851 - +chr6 3849393 3849713 - +chr6 3991087 3991407 - +chr6 4210883 4211203 - +chr6 5004207 5004527 - +chr6 5057483 5057566 - +chr6 5136038 5136358 - +chr6 5143878 5144198 - +chr6 5215027 5215347 - +chr6 5379920 5380240 - +chr6 5471230 5471550 - +chr6 6547311 6547631 - +chr6 6677175 6677255 - +chr6 6724786 6725106 - +chr6 7243741 7244061 - +chr6 7276261 7276426 - +chr6 8983406 8983726 - +chr6 10801639 10801725 - +chr6 10954794 10955114 - +chr6 11144589 11144909 - +chr6 11241927 11242247 - +chr6 11496464 11496784 - +chr6 11575887 11576004 - +chr6 12289708 12289805 - +chr6 12606047 12606367 - +chr6 12638277 12638597 - +chr6 12890438 12890758 - +chr6 13336723 13337043 - +chr6 13575465 13575785 - +chr6 13615357 13615677 - +chr6 13795349 13795669 - +chr6 14046734 14047054 - +chr6 14144876 14145196 - +chr6 14711318 14711638 - +chr6 15441516 15441836 - +chr6 15491634 15491954 - +chr6 15691580 15691774 - +chr6 16206830 16207150 - +chr6 16286337 16286657 - +chr6 16316918 16317238 - +chr6 16331570 16331890 - +chr6 16332355 16332675 - +chr6 16339290 16339610 - +chr6 16409966 16410286 - +chr6 16511334 16511654 - +chr6 16763303 16763623 - +chr6 16771254 16771574 - +chr6 16965258 16965371 - +chr6 17804442 17804762 - +chr6 17815593 17815913 - +chr6 18122822 18123142 - +chr6 18504246 18504566 - +chr6 18596495 18596815 - +chr6 19852107 19852427 - +chr6 20127095 20127175 - +chr6 20291924 20292244 - +chr6 20404828 20405148 - +chr6 21589503 21589823 - 
+chr6 24359931 24360251 - +chr6 24581339 24581659 - +chr6 24583689 24584009 - +chr6 25071020 25071340 - +chr6 25223717 25224037 - +chr6 25265939 25266259 - +chr6 25556691 25557011 - +chr6 25992768 25993088 - +chr6 26003352 26003672 - +chr6 26026509 26026829 - +chr6 26066158 26066478 - +chr6 26341853 26342173 - +chr6 26572839 26573159 - +chr6 26596049 26596369 - +chr6 26607611 26607931 - +chr6 27655224 27655544 - +chr6 27740339 27740473 - +chr6 27863314 27863634 - +chr6 28072423 28072743 - +chr6 28984931 28985251 - +chr6 29557344 29557496 - +chr6 29617875 29618195 - +chr6 29670292 29670612 - +chr6 29720945 29721104 - +chr6 29802646 29802966 - +chr6 29815828 29816148 - +chr6 29981156 29981476 - +chr6 30070948 30071090 - +chr6 30098909 30099069 - +chr6 30137827 30138147 - +chr6 30187065 30187162 - +chr6 30325377 30325697 - +chr6 30364677 30364997 - +chr6 30379327 30379437 - +chr6 30421538 30421635 - +chr6 30449856 30450176 - +chr6 30509351 30509477 - +chr6 30565144 30565464 - +chr6 30640769 30641089 - +chr6 30646750 30647070 - +chr6 30690436 30690756 - +chr6 30796281 30796601 - +chr6 30845517 30845837 - +chr6 30910364 30910684 - +chr6 31036776 31036885 - +chr6 31038472 31038792 - +chr6 31047584 31047904 - +chr6 31096049 31096160 - +chr6 31105215 31105535 - +chr6 31164836 31164955 - +chr6 31334703 31335023 - +chr6 31364717 31365037 - +chr6 31367437 31367577 - +chr6 31459707 31460027 - +chr6 31461639 31461774 - +chr6 31514503 31514647 - +chr6 31548589 31548909 - +chr6 31566388 31566708 - +chr6 31697116 31697436 - +chr6 31737574 31737894 - +chr6 31763632 31763952 - +chr6 31856557 31856877 - +chr6 31920133 31920453 - +chr6 32015925 32016245 - +chr6 32086702 32086805 - +chr6 32212720 32213040 - +chr6 32381238 32381558 - +chr6 32383276 32383499 - +chr6 32764828 32765148 - +chr6 32806460 32806780 - +chr6 32821744 32822064 - +chr6 32836004 32836324 - +chr6 32921031 32921351 - +chr6 32952589 32952909 - +chr6 33115872 33116192 - +chr6 33129318 33129457 - +chr6 33140442 33140762 
- +chr6 33216126 33216446 - +chr6 33359013 33359333 - +chr6 33553565 33553646 - +chr6 33557432 33557752 - +chr6 33600366 33600686 - +chr6 33712177 33712497 - +chr6 33729991 33730311 - +chr6 33736468 33736595 - +chr6 33739303 33739623 - +chr6 33995401 33995721 - +chr6 34023011 34023331 - +chr6 34122661 34122771 - +chr6 34131622 34131942 - +chr6 34164153 34164248 - +chr6 34360517 34360837 - +chr6 35114185 35114505 - +chr6 35172952 35173062 - +chr6 35208889 35209209 - +chr6 35295241 35295561 - +chr6 35310090 35310209 - +chr6 35383021 35383341 - +chr6 35436024 35436113 - +chr6 35490621 35490941 - +chr6 35520094 35520414 - +chr6 35704088 35704408 - +chr6 35995130 35995450 - +chr6 36063009 36063329 - +chr6 36270291 36270611 - +chr6 36308879 36308995 - +chr6 36328296 36328616 - +chr6 36582958 36583278 - +chr6 36591234 36591554 - +chr6 36648348 36648475 - +chr6 36727046 36727203 - +chr6 36734053 36734176 - +chr6 36853381 36853701 - +chr6 36986073 36986212 - +chr6 36997054 36997374 - +chr6 37019241 37019561 - +chr6 37139979 37140299 - +chr6 37175816 37176136 - +chr6 37178249 37178569 - +chr6 37293619 37293772 - +chr6 37451989 37452309 - +chr6 37534249 37534353 - +chr6 37659893 37660047 - +chr6 37673195 37673515 - +chr6 37759230 37759550 - +chr6 37980408 37980728 - +chr6 38839888 38839989 - +chr6 38997237 38997557 - +chr6 39277746 39278066 - +chr6 39315733 39315836 - +chr6 39833111 39833431 - +chr6 39860331 39860651 - +chr6 40555243 40555563 - +chr6 40862667 40862987 - +chr6 40995731 40996051 - +chr6 41286665 41286868 - +chr6 41336680 41336926 - +chr6 41392228 41392548 - +chr6 41559856 41559971 - +chr6 41570813 41571133 - +chr6 41649925 41650245 - +chr6 41651436 41651536 - +chr6 41673641 41673961 - +chr6 41703232 41703552 - +chr6 41743654 41743974 - +chr6 41747632 41747952 - +chr6 41755165 41755485 - +chr6 41814543 41814863 - +chr6 41888986 41889306 - +chr6 41904159 41904266 - +chr6 42012075 42012188 - +chr6 42043290 42043610 - +chr6 42134308 42134469 - +chr6 42515423 
42515531 - +chr6 42629311 42629631 - +chr6 42694356 42694676 - +chr6 43097583 43097903 - +chr6 43111253 43111573 - +chr6 43197066 43197386 - +chr6 43231228 43231548 - +chr6 43422337 43422474 - +chr6 43474590 43474696 - +chr6 43576060 43576380 - +chr6 43634983 43635303 - +chr6 43726291 43726611 - +chr6 43737338 43737458 - +chr6 43741097 43741417 - +chr6 43742481 43742801 - +chr6 44042303 44042623 - +chr6 44046916 44047236 - +chr6 44064475 44064795 - +chr6 44067811 44068131 - +chr6 44119737 44119846 - +chr6 44140551 44140643 - +chr6 44189291 44189611 - +chr6 44201929 44202249 - +chr6 44205681 44206001 - +chr6 44235839 44235992 - +chr6 44272535 44272855 - +chr6 44281097 44281188 - +chr6 44424024 44424344 - +chr6 44437564 44437884 - +chr6 44443449 44443769 - +chr6 44623239 44623559 - +chr6 45682510 45682830 - +chr6 45898262 45898423 - +chr6 46816694 46817014 - +chr6 47277721 47277867 - +chr6 47381741 47382061 - +chr6 47755796 47756116 - +chr6 49377685 49378005 - +chr6 50783835 50784155 - +chr6 52128690 52128776 - +chr6 52151655 52151975 - +chr6 52264609 52264929 - +chr6 52441376 52441696 - +chr6 52463069 52463389 - +chr6 52516871 52517191 - +chr6 52528491 52528811 - +chr6 52704404 52704724 - +chr6 52859935 52860255 - +chr6 52861484 52861804 - +chr6 52926540 52926860 - +chr6 53036699 53036974 - +chr6 53349573 53349670 - +chr6 53590667 53590987 - +chr6 53593407 53593727 - +chr6 53820912 53821232 - +chr6 53850993 53851313 - +chr6 54582664 54582984 - +chr6 56179514 56179834 - +chr6 56819973 56820293 - +chr6 57181311 57181631 - +chr6 64308491 64308811 - +chr6 64722252 64722370 - +chr6 65040834 65041154 - +chr6 70506783 70507103 - +chr6 71079167 71079280 - +chr6 71594097 71594417 - +chr6 71665666 71665986 - +chr6 73121954 73122191 - +chr6 73290095 73290415 - +chr6 73839745 73839920 - +chr6 74098458 74098778 - +chr6 74385230 74385550 - +chr6 74442838 74443158 - +chr6 74513534 74513854 - +chr6 75918687 75919007 - +chr6 75940249 75940569 - +chr6 78175703 78176023 - +chr6 
80487813 80487946 - +chr6 80523646 80523966 - +chr6 80579335 80579655 - +chr6 82715877 82716197 - +chr6 82750324 82750644 - +chr6 83767487 83767807 - +chr6 83775434 83775754 - +chr6 83777130 83777450 - +chr6 83924899 83925219 - +chr6 84221561 84221881 - +chr6 84245785 84246105 - +chr6 84302540 84302860 - +chr6 86033194 86033514 - +chr6 86064772 86065092 - +chr6 86192899 86193096 - +chr6 87788830 87788995 - +chr6 88032145 88032339 - +chr6 88639534 88639854 - +chr6 89490552 89490872 - +chr6 89991561 89991881 - +chr6 90077858 90078178 - +chr6 90081871 90082033 - +chr6 90319804 90320124 - +chr6 91103050 91103370 - +chr6 91189080 91189400 - +chr6 92039920 92040240 - +chr6 97345661 97345981 - +chr6 97634314 97634634 - +chr6 97842053 97842373 - +chr6 97884888 97885208 - +chr6 99293680 99294000 - +chr6 99777973 99778293 - +chr6 99968693 99969013 - +chr6 100315975 100316295 - +chr6 100353426 100353746 - +chr6 100449821 100450141 - +chr6 104973632 104973952 - +chr6 105130954 105131274 - +chr6 105851333 105851653 - +chr6 105932593 105932913 - +chr6 106787243 106787563 - +chr6 106808248 106808568 - +chr6 107014441 107014761 - +chr6 107318840 107319160 - +chr6 107349302 107349384 - +chr6 107996558 107996878 - +chr6 108169463 108169783 - +chr6 108301530 108301850 - +chr6 108371093 108371413 - +chr6 108847941 108848261 - +chr6 108863228 108863548 - +chr6 109103502 109103822 - +chr6 109156499 109156819 - +chr6 109509031 109509351 - +chr6 110026509 110026596 - +chr6 110360759 110360873 - +chr6 110663182 110663502 - +chr6 111378513 111378833 - +chr6 111580325 111580434 - +chr6 112042888 112043208 - +chr6 112318392 112318712 - +chr6 112350446 112350766 - +chr6 112364233 112364553 - +chr6 112364920 112365240 - +chr6 112484661 112484770 - +chr6 112658386 112658706 - +chr6 114180131 114180451 - +chr6 114200682 114201002 - +chr6 114316025 114316345 - +chr6 114429582 114429902 - +chr6 116331980 116332300 - +chr6 117926399 117926719 - +chr6 118939953 118940273 - +chr6 119121714 119122034 - 
+chr6 121721955 121722275 - +chr6 122170702 122171022 - +chr6 122702694 122703014 - +chr6 122813209 122813325 - +chr6 122851647 122851967 - +chr6 124123728 124124048 - +chr6 125474689 125475009 - +chr6 125513482 125513802 - +chr6 125635260 125635580 - +chr6 126179894 126180214 - +chr6 126227591 126227911 - +chr6 126277784 126278104 - +chr6 126317230 126317383 - +chr6 127442584 127442904 - +chr6 127587553 127587873 - +chr6 127786047 127786367 - +chr6 127837864 127838184 - +chr6 127963189 127963509 - +chr6 130096410 130096509 - +chr6 130643856 130644176 - +chr6 130686572 130686652 - +chr6 131019122 131019442 - +chr6 131081211 131081531 - +chr6 131193426 131193746 - +chr6 131949456 131949607 - +chr6 132016149 132016230 - +chr6 132231135 132231455 - +chr6 132236319 132236639 - +chr6 132600872 132601192 - +chr6 133284159 133284479 - +chr6 133890098 133890253 - +chr6 134237724 134238044 - +chr6 134242420 134242740 - +chr6 134385110 134385194 - +chr6 134432230 134432550 - +chr6 134499597 134499917 - +chr6 134962104 134962424 - +chr6 135144102 135144422 - +chr6 135203628 135203948 - +chr6 135483836 135484156 - +chr6 136547452 136547648 - +chr6 136638198 136638518 - +chr6 136929337 136929657 - +chr6 136965291 136965611 - +chr6 137114133 137114453 - +chr6 137397696 137398016 - +chr6 137592745 137593065 - +chr6 137635451 137635771 - +chr6 137736781 137737101 - +chr6 138026482 138026802 - +chr6 138322914 138323234 - +chr6 138430570 138430890 - +chr6 138909976 138910296 - +chr6 138912058 138912378 - +chr6 139013356 139013676 - +chr6 139540357 139540677 - +chr6 139571718 139572038 - +chr6 142468038 142468358 - +chr6 142528581 142528901 - +chr6 142888088 142888171 - +chr6 143014036 143014356 - +chr6 143748468 143748621 - +chr6 143896518 143896838 - +chr6 144334583 144334903 - +chr6 144537352 144537672 - +chr6 145252618 145252938 - +chr6 145955180 145955500 - +chr6 146864451 146864771 - +chr6 147178590 147178910 - +chr6 148466095 148466415 - +chr6 148785913 148786233 - +chr6 
148880921 148881241 - +chr6 148899320 148899640 - +chr6 149269042 149269362 - +chr6 149753161 149753481 - +chr6 149867327 149867647 - +chr6 150045008 150045328 - +chr6 150155682 150156002 - +chr6 150247467 150247577 - +chr6 150392251 150392571 - +chr6 150401803 150402123 - +chr6 151138958 151139278 - +chr6 151213057 151213377 - +chr6 151335620 151335701 - +chr6 151645496 151645588 - +chr6 151996899 151997219 - +chr6 152085460 152085780 - +chr6 152659799 152659907 - +chr6 152873765 152874085 - +chr6 153218968 153219288 - +chr6 153552196 153552516 - +chr6 154477190 154477510 - +chr6 154996740 154997060 - +chr6 155569578 155569898 - +chr6 156885536 156885856 - +chr6 156983121 156983441 - +chr6 157505544 157505640 - +chr6 158088221 158088541 - +chr6 158177695 158178015 - +chr6 158294799 158295119 - +chr6 158493217 158493537 - +chr6 158635219 158635539 - +chr6 158641912 158642232 - +chr6 159026344 159026664 - +chr6 159125257 159125577 - +chr6 159463642 159463962 - +chr6 159525171 159525491 - +chr6 159531051 159531371 - +chr6 159555875 159556012 - +chr6 159572364 159572684 - +chr6 159595393 159595555 - +chr6 160509460 160509780 - +chr6 160559261 160559581 - +chr6 160877005 160877325 - +chr6 160940886 160941206 - +chr6 160955277 160955597 - +chr6 161791854 161792174 - +chr6 161795005 161795178 - +chr6 164377691 164378011 - +chr6 165341062 165341382 - +chr6 166264063 166264143 - +chr6 166689265 166689416 - +chr6 166726749 166727069 - +chr6 167188703 167189023 - +chr6 169284771 169285091 - +chr6 169295126 169295446 - +chr6 169357116 169357204 - +chr6 170190306 170190474 - +chr6 170231755 170232075 - +chr6 170826145 170826465 - +chr6 170838939 170839259 - +chr7 362030 362350 - +chr7 814186 814506 - +chr7 826059 826379 - +chr7 912991 913311 - +chr7 948468 948788 - +chr7 1032866 1033033 - +chr7 1105412 1105732 - +chr7 1215888 1216008 - +chr7 1222666 1222986 - +chr7 1265145 1265465 - +chr7 1472221 1472347 - +chr7 1513701 1514021 - +chr7 1619153 1619473 - +chr7 2143935 2144255 - 
+chr7 2249134 2249454 - +chr7 2286789 2287109 - +chr7 2491033 2491353 - +chr7 2508579 2508899 - +chr7 2517248 2517568 - +chr7 2710660 2710770 - +chr7 2749447 2749587 - +chr7 2904686 2905006 - +chr7 2926637 2926957 - +chr7 4671279 4671599 - +chr7 4793554 4793874 - +chr7 4869745 4870065 - +chr7 5013459 5013779 - +chr7 5125167 5125327 - +chr7 5279861 5280181 - +chr7 5405673 5405993 - +chr7 5526630 5526786 - +chr7 5534382 5534492 - +chr7 5569216 5569536 - +chr7 5603216 5603536 - +chr7 5705013 5705333 - +chr7 5717661 5717981 - +chr7 5744119 5744439 - +chr7 5821185 5821505 - +chr7 6199829 6200149 - +chr7 6362633 6362953 - +chr7 6388605 6388925 - +chr7 6475292 6475612 - +chr7 6556087 6556210 - +chr7 6642001 6642103 - +chr7 7464058 7464378 - +chr7 11251753 11252073 - +chr7 12107111 12107431 - +chr7 12663528 12663848 - +chr7 14731274 14731594 - +chr7 16138234 16138554 - +chr7 17022274 17022594 - +chr7 17809495 17809815 - +chr7 18385304 18385624 - +chr7 18705873 18705980 - +chr7 19091428 19091748 - +chr7 19161639 19161959 - +chr7 19748630 19748950 - +chr7 21547276 21547596 - +chr7 22122546 22122866 - +chr7 22143345 22143665 - +chr7 22528518 22528838 - +chr7 22589859 22590179 - +chr7 22599490 22599810 - +chr7 22822286 22822606 - +chr7 23095238 23095558 - +chr7 23125651 23125971 - +chr7 23301590 23301712 - +chr7 23312684 23313004 - +chr7 23586593 23586913 - +chr7 23610230 23610550 - +chr7 23902839 23903159 - +chr7 24851970 24852290 - +chr7 24859850 24860170 - +chr7 25132975 25133295 - +chr7 25133882 25134202 - +chr7 25138062 25138382 - +chr7 25299285 25299605 - +chr7 25605591 25605795 - +chr7 26101348 26101668 - +chr7 26707227 26707547 - +chr7 27176055 27176375 - +chr7 27183386 27183706 - +chr7 27200607 27200927 - +chr7 27219352 27219672 - +chr7 27249972 27250292 - +chr7 28059982 28060302 - +chr7 28118029 28118155 - +chr7 28312014 28312334 - +chr7 28961433 28961753 - +chr7 29961075 29961395 - +chr7 30315730 30316050 - +chr7 30588413 30588733 - +chr7 30780185 30780321 - +chr7 
30978383 30978703 - +chr7 31023230 31023550 - +chr7 31776037 31776357 - +chr7 32244054 32244374 - +chr7 32519216 32519536 - +chr7 32886246 32886566 - +chr7 33925733 33926053 - +chr7 33934944 33935264 - +chr7 33972511 33972831 - +chr7 34127764 34128084 - +chr7 34173835 34174155 - +chr7 34360555 34360875 - +chr7 34863307 34863627 - +chr7 35077479 35077799 - +chr7 35559869 35560189 - +chr7 35577063 35577383 - +chr7 35664220 35664540 - +chr7 35749114 35749434 - +chr7 35946097 35946417 - +chr7 36024787 36025107 - +chr7 36331115 36331435 - +chr7 36358680 36359000 - +chr7 36406721 36407041 - +chr7 36710463 36710783 - +chr7 37006935 37007255 - +chr7 37026745 37027065 - +chr7 37221637 37221957 - +chr7 37238491 37238811 - +chr7 37241227 37241547 - +chr7 38147043 38147363 - +chr7 38255340 38255660 - +chr7 38375264 38375584 - +chr7 38438341 38438661 - +chr7 38972185 38972505 - +chr7 39493375 39493695 - +chr7 39785794 39786114 - +chr7 40174370 40174690 - +chr7 40644475 40644795 - +chr7 41736066 41736386 - +chr7 41744731 41745051 - +chr7 41980381 41980701 - +chr7 41982739 41982841 - +chr7 42054250 42054570 - +chr7 42916075 42916395 - +chr7 42980862 42981182 - +chr7 43148125 43148265 - +chr7 43300904 43301224 - +chr7 43479415 43479735 - +chr7 43622361 43622498 - +chr7 43878566 43878743 - +chr7 44111891 44112211 - +chr7 44235998 44236318 - +chr7 44280578 44280898 - +chr7 44517044 44517218 - +chr7 44605201 44605381 - +chr7 44887258 44887578 - +chr7 44896549 44896869 - +chr7 44961737 44962057 - +chr7 45013380 45013700 - +chr7 45175770 45176090 - +chr7 45194218 45194538 - +chr7 45299723 45299869 - +chr7 45703000 45703111 - +chr7 45957204 45957524 - +chr7 45961857 45962009 - +chr7 45962828 45963148 - +chr7 47219824 47220144 - +chr7 47305850 47306170 - +chr7 47475435 47475755 - +chr7 47859046 47859171 - +chr7 48008559 48008707 - +chr7 48031342 48031662 - +chr7 48149791 48150111 - +chr7 48194412 48194732 - +chr7 48339201 48339521 - +chr7 50485843 50485984 - +chr7 50535745 50536065 - 
+chr7 50700722 50700859 - +chr7 51228761 51229081 - +chr7 51230442 51230762 - +chr7 54841869 54842189 - +chr7 54907344 54907664 - +chr7 55322559 55322879 - +chr7 55323251 55323571 - +chr7 55661477 55661797 - +chr7 56072540 56072860 - +chr7 56142023 56142343 - +chr7 56160629 56160949 - +chr7 62476060 62476380 - +chr7 64411656 64411976 - +chr7 64772695 64773015 - +chr7 65187261 65187581 - +chr7 66022639 66022817 - +chr7 66836810 66837130 - +chr7 66843197 66843517 - +chr7 66949241 66949561 - +chr7 70163243 70163563 - +chr7 70294137 70294457 - +chr7 70321946 70322266 - +chr7 71200975 71201295 - +chr7 72756579 72756899 - +chr7 72866253 72866333 - +chr7 73021339 73021659 - +chr7 73140156 73140342 - +chr7 73149044 73149364 - +chr7 73226054 73226374 - +chr7 73269962 73270132 - +chr7 73280487 73280807 - +chr7 73441252 73441400 - +chr7 73482166 73482486 - +chr7 73620917 73621237 - +chr7 73815701 73815865 - +chr7 73831853 73831959 - +chr7 73864265 73864585 - +chr7 73981178 73981369 - +chr7 75495883 75496022 - +chr7 75649477 75649797 - +chr7 75807526 75807681 - +chr7 75892300 75892620 - +chr7 75920953 75921273 - +chr7 75943688 75943869 - +chr7 76876562 76876882 - +chr7 77855409 77855729 - +chr7 78139933 78140253 - +chr7 78927507 78927827 - +chr7 79648525 79648845 - +chr7 82201574 82201894 - +chr7 82369596 82369916 - +chr7 83565412 83565732 - +chr7 84816039 84816359 - +chr7 86941689 86941828 - +chr7 91510117 91510203 - +chr7 91763003 91763323 - +chr7 91808308 91808425 - +chr7 92053925 92054245 - +chr7 92076511 92076831 - +chr7 92090898 92091218 - +chr7 93003718 93004038 - +chr7 93072062 93072382 - +chr7 93918396 93918583 - +chr7 95141038 95141358 - +chr7 95545921 95546241 - +chr7 95704752 95705072 - +chr7 96656998 96657318 - +chr7 96733972 96734292 - +chr7 97650847 97651167 - +chr7 97735877 97736197 - +chr7 97797933 97798253 - +chr7 97870714 97871034 - +chr7 97924160 97924480 - +chr7 98050309 98050629 - +chr7 98375054 98375374 - +chr7 98429301 98429621 - +chr7 98439404 98439724 
- +chr7 98910665 98910752 - +chr7 99510760 99511080 - +chr7 99588529 99588849 - +chr7 99591921 99592002 - +chr7 99679408 99679728 - +chr7 99730260 99730580 - +chr7 100066881 100067201 - +chr7 100080846 100081166 - +chr7 100157480 100157663 - +chr7 100161584 100161770 - +chr7 100289302 100289622 - +chr7 100302860 100303180 - +chr7 100397095 100397415 - +chr7 100449705 100450025 - +chr7 100492400 100492720 - +chr7 100609416 100609736 - +chr7 100702720 100702888 - +chr7 100720969 100721121 - +chr7 100738092 100738412 - +chr7 100781533 100781853 - +chr7 100813078 100813398 - +chr7 100835889 100836011 - +chr7 100887508 100887828 - +chr7 100926378 100926541 - +chr7 101007217 101007537 - +chr7 101054567 101054887 - +chr7 101182127 101182447 - +chr7 101186109 101186429 - +chr7 101245706 101246026 - +chr7 101331603 101331923 - +chr7 101879993 101880313 - +chr7 101915628 101915747 - +chr7 101922761 101923081 - +chr7 102031206 102031526 - +chr7 102045059 102045379 - +chr7 102937660 102937980 - +chr7 103087629 103087949 - +chr7 103871890 103872210 - +chr7 104488883 104489203 - +chr7 104524231 104524373 - +chr7 105172391 105172711 - +chr7 105292833 105293153 - +chr7 105649957 105650277 - +chr7 105681596 105681916 - +chr7 105683283 105683603 - +chr7 105697392 105697712 - +chr7 105794337 105794657 - +chr7 105817139 105817459 - +chr7 106300462 106300782 - +chr7 107103603 107103923 - +chr7 107786703 107787023 - +chr7 111392279 111392599 - +chr7 112190657 112190759 - +chr7 112509247 112509567 - +chr7 112799834 112800154 - +chr7 113061827 113062147 - +chr7 113350461 113350586 - +chr7 114505636 114505956 - +chr7 116311817 116312137 - +chr7 116570511 116570831 - +chr7 116759537 116759857 - +chr7 116853046 116853366 - +chr7 118910040 118910360 - +chr7 120432163 120432316 - +chr7 120497596 120497916 - +chr7 120627676 120627996 - +chr7 121036298 121036618 - +chr7 121733018 121733338 - +chr7 123220913 123221233 - +chr7 123341591 123341911 - +chr7 124405870 124406190 - +chr7 124425270 
124425590 - +chr7 126894083 126894198 - +chr7 127087425 127087745 - +chr7 127848518 127848838 - +chr7 127849941 127850096 - +chr7 127987636 127987956 - +chr7 128027770 128028090 - +chr7 128038727 128038854 - +chr7 128065950 128066030 - +chr7 128099405 128099725 - +chr7 128520255 128520341 - +chr7 128523673 128523993 - +chr7 128731681 128731808 - +chr7 128850372 128850692 - +chr7 128858408 128858728 - +chr7 129007434 129007754 - +chr7 129247606 129247926 - +chr7 129275544 129275630 - +chr7 129650713 129651033 - +chr7 129702834 129703154 - +chr7 129711998 129712318 - +chr7 129882411 129882731 - +chr7 129917867 129918187 - +chr7 130406788 130407108 - +chr7 130560070 130560390 - +chr7 130581728 130582048 - +chr7 130725227 130725547 - +chr7 130754022 130754342 - +chr7 131192624 131192944 - +chr7 131198438 131198530 - +chr7 131231429 131231749 - +chr7 131339479 131339799 - +chr7 131374005 131374119 - +chr7 131376940 131377020 - +chr7 131544047 131544182 - +chr7 131630159 131630479 - +chr7 131734492 131734812 - +chr7 132003537 132003857 - +chr7 133826177 133826262 - +chr7 134043027 134043347 - +chr7 134199339 134199659 - +chr7 134232129 134232449 - +chr7 134291116 134291436 - +chr7 134434878 134435198 - +chr7 134959232 134959552 - +chr7 134966256 134966576 - +chr7 135357596 135357916 - +chr7 135800029 135800349 - +chr7 136090987 136091307 - +chr7 136853942 136854262 - +chr7 137277054 137277374 - +chr7 137311249 137311569 - +chr7 137341035 137341355 - +chr7 137583884 137584204 - +chr7 137687089 137687409 - +chr7 138423507 138423827 - +chr7 139330693 139331013 - +chr7 139702120 139702440 - +chr7 139874963 139875283 - +chr7 140026083 140026403 - +chr7 140217715 140218035 - +chr7 140346869 140347189 - +chr7 140353243 140353563 - +chr7 141233151 141233471 - +chr7 141349735 141350055 - +chr7 141354293 141354613 - +chr7 141437842 141438162 - +chr7 142135090 142135410 - +chr7 142148393 142148477 - +chr7 142172136 142172456 - +chr7 142193365 142193685 - +chr7 142219046 142219366 - 
+chr7 142246145 142246465 - +chr7 142273752 142274072 - +chr7 142340015 142340335 - +chr7 142491274 142491363 - +chr7 142705024 142705344 - +chr7 142912288 142912444 - +chr7 142937697 142938017 - +chr7 143077255 143077575 - +chr7 147959854 147960174 - +chr7 148272720 148273040 - +chr7 148659816 148660136 - +chr7 148663970 148664098 - +chr7 148702883 148703203 - +chr7 148762987 148763158 - +chr7 148787785 148788105 - +chr7 148823307 148823627 - +chr7 148892441 148892761 - +chr7 148906695 148907015 - +chr7 148936498 148936818 - +chr7 149321802 149322122 - +chr7 149322625 149322945 - +chr7 149439368 149439510 - +chr7 149450785 149451105 - +chr7 149580540 149580860 - +chr7 150145784 150146104 - +chr7 150595572 150595892 - +chr7 150660167 150660271 - +chr7 150683152 150683472 - +chr7 150685628 150685948 - +chr7 150715385 150715474 - +chr7 150808166 150808486 - +chr7 150821127 150821286 - +chr7 150869691 150869921 - +chr7 150941544 150941627 - +chr7 151012149 151012469 - +chr7 151047389 151047709 - +chr7 151065224 151065544 - +chr7 151451186 151451506 - +chr7 151456650 151456970 - +chr7 151479987 151480307 - +chr7 152404048 152404368 - +chr7 154705438 154705758 - +chr7 154744749 154745069 - +chr7 154982426 154982558 - +chr7 155172834 155173154 - +chr7 155391445 155391765 - +chr7 156556796 156557116 - +chr7 156686137 156686457 - +chr7 157066373 157066464 - +chr7 157132839 157133159 - +chr7 157179931 157180251 - +chr7 157225664 157225984 - +chr7 158769770 158770090 - +chr8 215734 216054 - +chr8 427667 427777 - +chr8 1683633 1683953 - +chr8 1972355 1972446 - +chr8 1978838 1978990 - +chr8 2672048 2672368 - +chr8 6407583 6407903 - +chr8 6539984 6540304 - +chr8 6637143 6637463 - +chr8 6658274 6658594 - +chr8 6698163 6698483 - +chr8 7212785 7213105 - +chr8 8105350 8105670 - +chr8 8594474 8594794 - +chr8 8608229 8608549 - +chr8 8887332 8887652 - +chr8 8923251 8923571 - +chr8 9008287 9008370 - +chr8 9226628 9226720 - +chr8 9688604 9688698 - +chr8 10131416 10131736 - +chr8 
10190264 10190584 - +chr8 10275365 10275467 - +chr8 10282789 10282926 - +chr8 10405588 10405908 - +chr8 10444806 10445126 - +chr8 10447700 10448020 - +chr8 10696407 10696727 - +chr8 10834675 10834995 - +chr8 11269009 11269329 - +chr8 11325111 11325431 - +chr8 11411383 11411489 - +chr8 11422033 11422353 - +chr8 11424614 11424934 - +chr8 11447003 11447323 - +chr8 11623075 11623207 - +chr8 11759786 11760106 - +chr8 11801183 11801503 - +chr8 11816522 11816631 - +chr8 12698216 12698536 - +chr8 12899834 12900154 - +chr8 13519729 13520049 - +chr8 14717696 14718016 - +chr8 15095756 15096076 - +chr8 15235827 15236147 - +chr8 16870518 16870687 - +chr8 17009922 17010242 - +chr8 17434370 17434690 - +chr8 17557110 17557430 - +chr8 18537792 18538112 - +chr8 18725949 18726269 - +chr8 18938413 18938733 - +chr8 19125170 19125490 - +chr8 19229604 19229924 - +chr8 19265938 19266258 - +chr8 19612179 19612499 - +chr8 19615267 19615587 - +chr8 19637051 19637371 - +chr8 19991212 19991532 - +chr8 20089303 20089623 - +chr8 20233807 20233977 - +chr8 20238299 20238619 - +chr8 20292805 20293125 - +chr8 21641884 21642204 - +chr8 21763620 21763710 - +chr8 21894440 21894523 - +chr8 21994900 21995220 - +chr8 22022048 22022368 - +chr8 22094802 22095122 - +chr8 22225327 22225647 - +chr8 22419403 22419723 - +chr8 22431955 22432196 - +chr8 22446660 22446799 - +chr8 22452849 22452953 - +chr8 22480396 22480716 - +chr8 22530530 22530850 - +chr8 22553199 22553519 - +chr8 22613594 22613914 - +chr8 22766631 22766743 - +chr8 22775956 22776276 - +chr8 22876767 22877087 - +chr8 22941677 22941997 - +chr8 23020903 23021223 - +chr8 23104256 23104576 - +chr8 23145408 23145728 - +chr8 23268037 23268118 - +chr8 23329863 23330183 - +chr8 23331791 23332111 - +chr8 23550250 23550407 - +chr8 23605347 23605667 - +chr8 23606501 23606821 - +chr8 24052029 24052349 - +chr8 24857487 24857807 - +chr8 24923101 24923421 - +chr8 25041125 25041445 - +chr8 25434697 25435017 - +chr8 25473299 25473619 - +chr8 25946218 25946538 - 
+chr8 25991344 25991450 - +chr8 26184951 26185271 - +chr8 26428775 26429095 - +chr8 26528063 26528383 - +chr8 26548234 26548554 - +chr8 27062647 27062770 - +chr8 27172800 27173120 - +chr8 27283707 27284027 - +chr8 27449760 27450080 - +chr8 27454748 27454926 - +chr8 27584792 27585112 - +chr8 27721457 27721541 - +chr8 27850107 27850427 - +chr8 27918594 27918914 - +chr8 27930280 27930600 - +chr8 28181940 28182090 - +chr8 28213079 28213399 - +chr8 28220934 28221254 - +chr8 28277058 28277378 - +chr8 28451513 28451833 - +chr8 28479720 28480040 - +chr8 28627171 28627491 - +chr8 28861859 28862179 - +chr8 29034524 29034844 - +chr8 29177889 29178209 - +chr8 29197819 29197961 - +chr8 30027771 30028091 - +chr8 30085135 30085455 - +chr8 30239773 30240093 - +chr8 30413395 30413715 - +chr8 30670289 30670609 - +chr8 31109591 31109911 - +chr8 32081064 32081384 - +chr8 32406694 32406776 - +chr8 32412216 32412536 - +chr8 33282240 33282377 - +chr8 33448475 33448795 - +chr8 33497722 33498042 - +chr8 33539018 33539338 - +chr8 36948393 36948713 - +chr8 37010786 37011106 - +chr8 37248138 37248220 - +chr8 37557854 37558174 - +chr8 37593837 37594157 - +chr8 37637369 37637689 - +chr8 37708974 37709294 - +chr8 37924664 37924984 - +chr8 38268167 38268487 - +chr8 38571852 38572172 - +chr8 38578931 38579251 - +chr8 38592167 38592487 - +chr8 38625828 38626148 - +chr8 39853211 39853531 - +chr8 39891534 39891854 - +chr8 40059296 40059616 - +chr8 40809105 40809425 - +chr8 40851359 40851549 - +chr8 41003993 41004100 - +chr8 41261703 41262023 - +chr8 41386343 41386663 - +chr8 41400237 41400557 - +chr8 41559329 41559434 - +chr8 41692356 41692468 - +chr8 41998189 41998509 - +chr8 42010908 42011228 - +chr8 42029138 42029458 - +chr8 42177188 42177508 - +chr8 42242985 42243305 - +chr8 42547643 42547963 - +chr8 43135935 43136255 - +chr8 47868596 47868916 - +chr8 48078992 48079312 - +chr8 48114918 48115006 - +chr8 48909835 48910155 - +chr8 49084792 49084927 - +chr8 49891534 49891854 - +chr8 52921148 52921288 
- +chr8 53024952 53025272 - +chr8 53320218 53320538 - +chr8 53408751 53409071 - +chr8 53854179 53854499 - +chr8 54500639 54500959 - +chr8 56923242 56923400 - +chr8 57006241 57006561 - +chr8 57154572 57154788 - +chr8 57841831 57841918 - +chr8 59521149 59521469 - +chr8 59614342 59614662 - +chr8 59720498 59720818 - +chr8 60834135 60834455 - +chr8 61048697 61049017 - +chr8 61325094 61325414 - +chr8 61936101 61936421 - +chr8 62051695 62051787 - +chr8 62380174 62380309 - +chr8 62381351 62381671 - +chr8 62722558 62722878 - +chr8 62841583 62841903 - +chr8 64303885 64304205 - +chr8 66557327 66557440 - +chr8 66674102 66674422 - +chr8 66746817 66747137 - +chr8 66977174 66977494 - +chr8 67089088 67089408 - +chr8 67351159 67351479 - +chr8 67601140 67601460 - +chr8 67976339 67976659 - +chr8 68402539 68402859 - +chr8 70468517 70468837 - +chr8 71000831 71001151 - +chr8 71446719 71447039 - +chr8 71520545 71520865 - +chr8 72887474 72887794 - +chr8 72930801 72931121 - +chr8 73856273 73856593 - +chr8 74086644 74086821 - +chr8 75000376 75000696 - +chr8 77280138 77280278 - +chr8 80677486 80677806 - +chr8 80997280 80997600 - +chr8 81143073 81143393 - +chr8 81595282 81595445 - +chr8 81904193 81904513 - +chr8 82692827 82693000 - +chr8 85538421 85538588 - +chr8 86132950 86133104 - +chr8 87631260 87631580 - +chr8 89302022 89302342 - +chr8 90996813 90997133 - +chr8 94577679 94577999 - +chr8 94834570 94834890 - +chr8 94987477 94987584 - +chr8 95092022 95092342 - +chr8 95369331 95369651 - +chr8 96069146 96069466 - +chr8 96317623 96317943 - +chr8 96902800 96903120 - +chr8 97145009 97145329 - +chr8 97176571 97176891 - +chr8 97192352 97192672 - +chr8 97340258 97340578 - +chr8 97345409 97345729 - +chr8 98387884 98388204 - +chr8 98464924 98465244 - +chr8 98608070 98608390 - +chr8 98881779 98881865 - +chr8 99097814 99098134 - +chr8 99176838 99177158 - +chr8 99182333 99182492 - +chr8 99370337 99370657 - +chr8 99953185 99953505 - +chr8 100180394 100180714 - +chr8 100370818 100371138 - +chr8 100872040 
100872360 - +chr8 101571941 101572261 - +chr8 101576229 101576549 - +chr8 101839248 101839568 - +chr8 101847745 101848065 - +chr8 101859683 101860003 - +chr8 102149475 102149795 - +chr8 102313649 102313969 - +chr8 102339206 102339526 - +chr8 102644775 102645095 - +chr8 103597416 103597736 - +chr8 103819738 103820058 - +chr8 103823115 103823435 - +chr8 103826972 103827115 - +chr8 104137133 104137453 - +chr8 104385460 104385780 - +chr8 105337671 105337991 - +chr8 105338273 105338593 - +chr8 105402159 105402479 - +chr8 105430379 105430502 - +chr8 105772007 105772200 - +chr8 107093266 107093586 - +chr8 107717296 107717616 - +chr8 107868760 107869080 - +chr8 109092689 109093009 - +chr8 117460146 117460466 - +chr8 117778442 117778762 - +chr8 117800618 117800938 - +chr8 117899864 117899959 - +chr8 118056983 118057303 - +chr8 118316530 118316610 - +chr8 118342470 118342619 - +chr8 118874319 118874639 - +chr8 119427335 119427415 - +chr8 119755962 119756089 - +chr8 120429079 120429399 - +chr8 120781251 120781571 - +chr8 121101488 121101808 - +chr8 121135711 121136031 - +chr8 121820880 121821200 - +chr8 121963451 121963771 - +chr8 122261053 122261373 - +chr8 122273103 122273423 - +chr8 123584575 123584895 - +chr8 123651145 123651465 - +chr8 123685729 123686049 - +chr8 123691119 123691439 - +chr8 123745822 123746142 - +chr8 123807297 123807617 - +chr8 124167013 124167333 - +chr8 124172777 124172911 - +chr8 124232879 124233199 - +chr8 124778476 124778796 - +chr8 124934673 124934993 - +chr8 125437549 125437869 - +chr8 125438291 125438611 - +chr8 125452652 125452972 - +chr8 125605186 125605506 - +chr8 126656815 126657135 - +chr8 127888983 127889085 - +chr8 128412931 128413251 - +chr8 128746162 128746482 - +chr8 130315613 130315933 - +chr8 130548056 130548376 - +chr8 130832352 130832672 - +chr8 130838464 130838784 - +chr8 131761885 131762205 - +chr8 132816481 132816801 - +chr8 133204275 133204595 - +chr8 133887735 133888055 - +chr8 134215169 134215489 - +chr8 134221511 134221831 - 
+chr8 134461739 134462059 - +chr8 134465606 134465926 - +chr8 134468008 134468328 - +chr8 134581858 134582178 - +chr8 135697841 135698161 - +chr8 135703536 135703856 - +chr8 135898698 135899018 - +chr8 135917734 135918054 - +chr8 135918487 135918593 - +chr8 136464732 136464848 - +chr8 136478117 136478437 - +chr8 139739730 139739917 - +chr8 141109247 141109567 - +chr8 141169729 141170049 - +chr8 141403942 141404262 - +chr8 141477401 141477508 - +chr8 141646168 141646488 - +chr8 141647578 141647723 - +chr8 141678382 141678702 - +chr8 142094422 142094742 - +chr8 142185011 142185331 - +chr8 142357147 142357227 - +chr8 142394828 142394937 - +chr8 142396875 142397195 - +chr8 142441593 142441913 - +chr8 142737112 142737432 - +chr8 143528876 143529196 - +chr8 143553289 143553609 - +chr8 143705417 143705737 - +chr8 143716136 143716268 - +chr8 143799529 143799849 - +chr8 143820111 143820431 - +chr8 143820872 143821192 - +chr8 143863926 143864246 - +chr8 143897775 143898095 - +chr8 144110796 144110969 - +chr8 144363785 144364105 - +chr8 144465872 144465996 - +chr8 144483425 144483587 - +chr8 144512964 144513284 - +chr8 144594884 144595019 - +chr8 144614360 144614506 - +chr8 144976700 144977020 - +chr8 145049005 145049325 - +chr8 145288506 145288826 - +chr8 145539647 145539967 - +chr8 145597522 145597615 - +chr8 145634651 145634971 - +chr8 145692290 145692610 - +chr8 145704359 145704679 - +chr8 145735205 145735301 - +chr8 145955462 145955782 - +chr8 146095093 146095413 - +chr8 146125583 146125903 - +chr8 146252127 146252447 - +chr9 385835 386155 - +chr9 613856 614176 - +chr9 1009119 1009439 - +chr9 2728120 2728440 - +chr9 2859810 2860130 - +chr9 3240697 3241017 - +chr9 3467696 3467855 - +chr9 4077497 4077817 - +chr9 4339428 4339621 - +chr9 4759446 4759589 - +chr9 4797399 4797719 - +chr9 5590059 5590379 - +chr9 5600531 5600851 - +chr9 5799257 5799577 - +chr9 5832955 5833275 - +chr9 5886237 5886337 - +chr9 6054543 6054863 - +chr9 7052615 7052935 - +chr9 7186252 7186572 - +chr9 
9942833 9943153 - +chr9 13034202 13034522 - +chr9 14353497 14353817 - +chr9 14428831 14429151 - +chr9 15104833 15105153 - +chr9 15135197 15135517 - +chr9 15142956 15143276 - +chr9 15211492 15211812 - +chr9 16192181 16192501 - +chr9 16266677 16266997 - +chr9 17063581 17063901 - +chr9 17811025 17811345 - +chr9 18255093 18255413 - +chr9 18442667 18442987 - +chr9 18791129 18791449 - +chr9 19049007 19049327 - +chr9 19457554 19457680 - +chr9 19471819 19472139 - +chr9 19493107 19493427 - +chr9 19926122 19926222 - +chr9 20356342 20356662 - +chr9 20941111 20941431 - +chr9 21396133 21396453 - +chr9 21455350 21455670 - +chr9 21813004 21813324 - +chr9 22008613 22008933 - +chr9 25716294 25716614 - +chr9 27333164 27333484 - +chr9 27529788 27529925 - +chr9 32579749 32580069 - +chr9 33287025 33287345 - +chr9 33374922 33375242 - +chr9 33415447 33415767 - +chr9 33452762 33453082 - +chr9 33466737 33466912 - +chr9 33623988 33624308 - +chr9 33722192 33722333 - +chr9 33778783 33779103 - +chr9 33922002 33922322 - +chr9 33925186 33925506 - +chr9 34074228 34074548 - +chr9 34253071 34253169 - +chr9 34379454 34379595 - +chr9 34403927 34404019 - +chr9 34500770 34500919 - +chr9 34548035 34548355 - +chr9 34591732 34592052 - +chr9 34603775 34603910 - +chr9 34612798 34613118 - +chr9 34633930 34634250 - +chr9 34664388 34664546 - +chr9 34675171 34675298 - +chr9 34701296 34701616 - +chr9 34702213 34702367 - +chr9 34760487 34760807 - +chr9 34899562 34899882 - +chr9 34985331 34985651 - +chr9 34994430 34994750 - +chr9 35096092 35096412 - +chr9 35116838 35117158 - +chr9 35477269 35477589 - +chr9 35481394 35481714 - +chr9 35553976 35554296 - +chr9 35603871 35604191 - +chr9 35611256 35611576 - +chr9 35646778 35647098 - +chr9 35696577 35696897 - +chr9 35729739 35730059 - +chr9 35736203 35736315 - +chr9 35790300 35790620 - +chr9 35825992 35826312 - +chr9 35882349 35882485 - +chr9 35945765 35946085 - +chr9 35954949 35955269 - +chr9 36008855 36009095 - +chr9 36023450 36023770 - +chr9 36166528 36166848 - +chr9 
36407052 36407372 - +chr9 36450187 36450507 - +chr9 36555948 36556075 - +chr9 36597446 36597766 - +chr9 36728950 36729270 - +chr9 36862149 36862469 - +chr9 36873287 36873607 - +chr9 36995847 36996167 - +chr9 37747187 37747507 - +chr9 37904244 37904564 - +chr9 37913612 37913932 - +chr9 38088071 38088209 - +chr9 38160655 38160975 - +chr9 38672377 38672697 - +chr9 71199454 71199617 - +chr9 71614578 71614898 - +chr9 71642578 71642898 - +chr9 71650559 71650710 - +chr9 71669135 71669455 - +chr9 71838492 71838812 - +chr9 72027584 72027904 - +chr9 72287793 72288113 - +chr9 72586840 72587160 - +chr9 72725934 72726254 - +chr9 72728373 72728693 - +chr9 73099514 73099834 - +chr9 73900087 73900407 - +chr9 74219401 74219721 - +chr9 74421767 74422087 - +chr9 74640029 74640126 - +chr9 74653996 74654091 - +chr9 74886837 74887157 - +chr9 74895961 74896103 - +chr9 75637899 75638219 - +chr9 75722453 75722773 - +chr9 77388808 77389128 - +chr9 77801729 77802049 - +chr9 77880424 77880744 - +chr9 79425296 79425616 - +chr9 79626844 79627164 - +chr9 79765714 79766034 - +chr9 80048683 80048868 - +chr9 80071778 80072098 - +chr9 80072943 80073263 - +chr9 80984571 80984891 - +chr9 81054402 81054722 - +chr9 81216891 81217211 - +chr9 81744846 81745166 - +chr9 85576460 85576780 - +chr9 86198097 86198417 - +chr9 86264038 86264358 - +chr9 86876100 86876420 - +chr9 86930218 86930538 - +chr9 87012002 87012322 - +chr9 88817221 88817541 - +chr9 89563389 89563709 - +chr9 90293600 90293920 - +chr9 90299498 90299818 - +chr9 90427873 90428077 - +chr9 90788321 90788641 - +chr9 91193426 91193746 - +chr9 92163977 92164297 - +chr9 93633342 93633662 - +chr9 93677195 93677515 - +chr9 93796091 93796411 - +chr9 94444028 94444348 - +chr9 94496690 94497010 - +chr9 94520813 94520897 - +chr9 94537725 94538045 - +chr9 94717061 94717381 - +chr9 94896219 94896539 - +chr9 94900386 94900706 - +chr9 94902579 94902899 - +chr9 95397014 95397334 - +chr9 95478885 95479024 - +chr9 95527629 95527949 - +chr9 95871478 95871798 - 
+chr9 95916618 95916938 - +chr9 95922233 95922553 - +chr9 96067210 96067530 - +chr9 96328635 96328955 - +chr9 96444437 96444757 - +chr9 96589722 96590042 - +chr9 96725265 96725585 - +chr9 96827748 96828068 - +chr9 97365573 97365893 - +chr9 97431231 97431335 - +chr9 97890026 97890182 - +chr9 97893742 97893872 - +chr9 97930356 97930462 - +chr9 98980584 98980673 - +chr9 98994815 98994935 - +chr9 99079669 99079989 - +chr9 99090058 99090378 - +chr9 99619214 99619352 - +chr9 100149325 100149509 - +chr9 100161523 100161695 - +chr9 100174073 100174259 - +chr9 100230902 100231222 - +chr9 100360995 100361315 - +chr9 100652252 100652572 - +chr9 100797034 100797354 - +chr9 100836686 100837006 - +chr9 100968077 100968236 - +chr9 101026193 101026275 - +chr9 101077936 101078048 - +chr9 101553472 101553792 - +chr9 101610855 101611175 - +chr9 101737279 101737440 - +chr9 101761186 101761506 - +chr9 101762766 101763086 - +chr9 101903109 101903254 - +chr9 101947013 101947123 - +chr9 102028853 102029173 - +chr9 102569589 102569909 - +chr9 102790506 102790652 - +chr9 103200180 103200500 - +chr9 103267892 103268212 - +chr9 103363461 103363781 - +chr9 103476914 103477234 - +chr9 104221330 104221650 - +chr9 107625522 107625842 - +chr9 107688702 107689022 - +chr9 108177695 108178015 - +chr9 108604346 108604666 - +chr9 108644016 108644336 - +chr9 109934479 109934799 - +chr9 110103483 110103803 - +chr9 110187649 110187806 - +chr9 110227850 110228170 - +chr9 111328078 111328398 - +chr9 111524105 111524425 - +chr9 112006840 112007160 - +chr9 112083267 112083587 - +chr9 112174206 112174526 - +chr9 112232053 112232203 - +chr9 112282796 112283116 - +chr9 112650160 112650480 - +chr9 112665858 112666178 - +chr9 113798760 113799080 - +chr9 114384290 114384433 - +chr9 114837200 114837280 - +chr9 114860060 114860380 - +chr9 115607084 115607206 - +chr9 115665459 115665779 - +chr9 116163638 116163958 - +chr9 116169212 116169532 - +chr9 116352235 116352555 - +chr9 116353948 116354268 - +chr9 116417718 
116418038 - +chr9 116420388 116420493 - +chr9 116444598 116444736 - +chr9 116569681 116570001 - +chr9 116725339 116725659 - +chr9 116844776 116845096 - +chr9 116870292 116870474 - +chr9 116917546 116917866 - +chr9 117053309 117053629 - +chr9 117068152 117068472 - +chr9 117101401 117101721 - +chr9 117167876 117168196 - +chr9 117249877 117250197 - +chr9 117415927 117416030 - +chr9 117424493 117424625 - +chr9 117443809 117444129 - +chr9 117501121 117501441 - +chr9 118353803 118354123 - +chr9 119334471 119334791 - +chr9 119499444 119499602 - +chr9 119622844 119623164 - +chr9 120406952 120407272 - +chr9 122318356 122318676 - +chr9 122724066 122724386 - +chr9 122731517 122731635 - +chr9 123239807 123239968 - +chr9 123518128 123518448 - +chr9 123698835 123699155 - +chr9 123975590 123975736 - +chr9 124041012 124041332 - +chr9 124082365 124082685 - +chr9 124308358 124308472 - +chr9 124312795 124313115 - +chr9 124449263 124449405 - +chr9 124457760 124458080 - +chr9 124497300 124497620 - +chr9 124885349 124885533 - +chr9 124887457 124887777 - +chr9 125112880 125113200 - +chr9 125215476 125215796 - +chr9 125227308 125227628 - +chr9 125261095 125261415 - +chr9 126112237 126112389 - +chr9 126116349 126116669 - +chr9 126164236 126164556 - +chr9 126306315 126306635 - +chr9 126803327 126803647 - +chr9 126889116 126889436 - +chr9 127082739 127082836 - +chr9 127177724 127178044 - +chr9 127181853 127182173 - +chr9 127299029 127299349 - +chr9 127314045 127314365 - +chr9 127379794 127380114 - +chr9 127474248 127474568 - +chr9 127534283 127534603 - +chr9 127601938 127602258 - +chr9 127615394 127615550 - +chr9 127905963 127906119 - +chr9 128137012 128137332 - +chr9 128170270 128170590 - +chr9 128320747 128321067 - +chr9 128991290 128991385 - +chr9 129064286 129064606 - +chr9 129160864 129160988 - +chr9 129253696 129253793 - +chr9 129261566 129261735 - +chr9 129263173 129263493 - +chr9 129320437 129320603 - +chr9 129373862 129374182 - +chr9 129467554 129467874 - +chr9 129535310 129535630 - 
+chr9 129544109 129544429 - +chr9 129622587 129622754 - +chr9 129728244 129728324 - +chr9 129961679 129961759 - +chr9 130285719 130286039 - +chr9 130348770 130349090 - +chr9 130352160 130352480 - +chr9 130496878 130497198 - +chr9 130632327 130632409 - +chr9 130683866 130684186 - +chr9 130705177 130705259 - +chr9 130706068 130706198 - +chr9 130797480 130797800 - +chr9 130879975 130880204 - +chr9 131017515 131017835 - +chr9 131057623 131057943 - +chr9 131124414 131124734 - +chr9 131182464 131182784 - +chr9 131200917 131201237 - +chr9 131397471 131397791 - +chr9 131464640 131464960 - +chr9 131580579 131580708 - +chr9 131625999 131626112 - +chr9 131690243 131690563 - +chr9 131769182 131769266 - +chr9 131790751 131791071 - +chr9 131894067 131894227 - +chr9 131901407 131901727 - +chr9 131905341 131905476 - +chr9 131936407 131936548 - +chr9 132373357 132373677 - +chr9 132386465 132386785 - +chr9 132404522 132404842 - +chr9 132488895 132489215 - +chr9 132499874 132500194 - +chr9 132546198 132546288 - +chr9 132552617 132552937 - +chr9 132648079 132648183 - +chr9 132998559 132998694 - +chr9 133005490 133005810 - +chr9 133336576 133336896 - +chr9 133557938 133558258 - +chr9 133559525 133559705 - +chr9 133576456 133576776 - +chr9 133741793 133742113 - +chr9 133816518 133816838 - +chr9 133873737 133874057 - +chr9 133922170 133922490 - +chr9 133928567 133928887 - +chr9 133933339 133933493 - +chr9 134053766 134053926 - +chr9 134211763 134211891 - +chr9 134225424 134225744 - +chr9 134466564 134466884 - +chr9 134631013 134631333 - +chr9 134637863 134638183 - +chr9 135045212 135045532 - +chr9 135111912 135112016 - +chr9 135134660 135134835 - +chr9 135144161 135144481 - +chr9 135450453 135450773 - +chr9 135675572 135675892 - +chr9 135777234 135777554 - +chr9 135848081 135848211 - +chr9 135853028 135853348 - +chr9 135897807 135898028 - +chr9 135978578 135978898 - +chr9 136019918 136020064 - +chr9 136023005 136023165 - +chr9 136036459 136036578 - +chr9 136055328 136055648 - +chr9 
136061416 136061736 - +chr9 136272099 136272181 - +chr9 136351558 136351641 - +chr9 136608165 136608359 - +chr9 136647301 136647621 - +chr9 136658132 136658452 - +chr9 136859631 136859762 - +chr9 137035926 137036140 - +chr9 137291297 137291617 - +chr9 137335285 137335415 - +chr9 137341539 137341624 - +chr9 137552283 137552603 - +chr9 137581511 137581635 - +chr9 137590385 137590705 - +chr9 137631009 137631329 - +chr9 137977914 137978234 - +chr9 138235821 138236141 - +chr9 138304525 138304686 - +chr9 138392295 138392615 - +chr9 138494569 138494889 - +chr9 138495765 138496085 - +chr9 138593777 138594097 - +chr9 138689515 138689835 - +chr9 138756321 138756641 - +chr9 138859487 138859631 - +chr9 139014854 139014986 - +chr9 139062116 139062436 - +chr9 139117981 139118301 - +chr9 139233675 139233995 - +chr9 139294611 139294931 - +chr9 139379487 139379807 - +chr9 139383104 139383424 - +chr9 139525879 139526199 - +chr9 139538968 139539288 - +chr9 139566188 139566286 - +chr9 139652844 139653164 - +chr9 139655315 139655635 - +chr9 139686340 139686514 - +chr9 139736837 139736921 - +chr9 139839182 139839502 - +chr9 139923922 139924049 - +chr9 139929830 139929918 - +chr9 139940973 139941293 - +chr9 139958633 139958782 - +chr9 139962229 139962549 - +chr9 140121975 140122295 - +chr9 140131027 140131144 - +chr9 140339745 140340065 - +chr9 140473314 140473634 - +chr9 140567297 140567617 - +chrX 1573188 1573337 - +chrX 1700089 1700409 - +chrX 1767496 1767816 - +chrX 1771154 1771474 - +chrX 2511376 2511696 - +chrX 2730509 2730829 - +chrX 4465270 4465590 - +chrX 8783516 8783836 - +chrX 8847808 8848128 - +chrX 9677059 9677379 - +chrX 9853214 9853534 - +chrX 9963817 9964137 - +chrX 10087680 10087870 - +chrX 10157679 10157999 - +chrX 12789649 12789969 - +chrX 12964981 12965147 - +chrX 13766006 13766326 - +chrX 15489146 15489466 - +chrX 15779747 15780067 - +chrX 16141346 16141666 - +chrX 16463327 16463647 - +chrX 16713225 16713545 - +chrX 16911033 16911179 - +chrX 17064661 17064981 - +chrX 
17609305 17609625 - +chrX 17806561 17806881 - +chrX 20271069 20271389 - +chrX 20548470 20548790 - +chrX 22003446 22003766 - +chrX 22099624 22099944 - +chrX 22210814 22211134 - +chrX 22766568 22766888 - +chrX 33452658 33452978 - +chrX 37604648 37604968 - +chrX 37684512 37684832 - +chrX 40403132 40403452 - +chrX 40433042 40433362 - +chrX 41039479 41039799 - +chrX 45046193 45046513 - +chrX 45684020 45684340 - +chrX 46268310 46268630 - +chrX 46618705 46619025 - +chrX 46656070 46656390 - +chrX 47243944 47244264 - +chrX 48774519 48774839 - +chrX 48803067 48803387 - +chrX 49518888 49519208 - +chrX 49648894 49649214 - +chrX 49680262 49680582 - +chrX 49684316 49684636 - +chrX 52004654 52004974 - +chrX 52950280 52950600 - +chrX 52963942 52964262 - +chrX 53118559 53118679 - +chrX 53313827 53314147 - +chrX 53342183 53342503 - +chrX 53346785 53346927 - +chrX 53463098 53463198 - +chrX 53472952 53473272 - +chrX 53509205 53509285 - +chrX 62470150 62470470 - +chrX 64748692 64748894 - +chrX 64815376 64815537 - +chrX 66775470 66775790 - +chrX 67238444 67238764 - +chrX 67731126 67731446 - +chrX 67819428 67819748 - +chrX 68042524 68042844 - +chrX 68047634 68047954 - +chrX 68353843 68354163 - +chrX 68505848 68506168 - +chrX 69250348 69250668 - +chrX 69317299 69317619 - +chrX 69482859 69483179 - +chrX 69671893 69672213 - +chrX 69674263 69674583 - +chrX 70349048 70349368 - +chrX 71346542 71346634 - +chrX 71465353 71465673 - +chrX 71497032 71497352 - +chrX 73075377 73075697 - +chrX 73602557 73602877 - +chrX 73770170 73770490 - +chrX 83984761 83985081 - +chrX 84339847 84340167 - +chrX 99702971 99703062 - +chrX 99926163 99926483 - +chrX 99940742 99941062 - +chrX 100023783 100024027 - +chrX 101600876 101601196 - +chrX 102118698 102119018 - +chrX 102154905 102155225 - +chrX 102155568 102155888 - +chrX 102189584 102189904 - +chrX 102431337 102431657 - +chrX 102719713 102720033 - +chrX 102788382 102788702 - +chrX 103381143 103381463 - +chrX 103515626 103515946 - +chrX 104166897 104167078 - +chrX 
106137602 106137922 - +chrX 106898389 106898709 - +chrX 108729112 108729204 - +chrX 109134269 109134372 - +chrX 109656392 109656712 - +chrX 111112816 111113136 - +chrX 111390480 111390800 - +chrX 113845509 113845829 - +chrX 114565144 114565464 - +chrX 114885251 114885571 - +chrX 114924975 114925295 - +chrX 117737883 117738203 - +chrX 118138994 118139157 - +chrX 118142106 118142426 - +chrX 118252782 118253102 - +chrX 118400402 118400722 - +chrX 118646477 118646797 - +chrX 118650257 118650577 - +chrX 119021600 119021920 - +chrX 119347570 119347890 - +chrX 128674018 128674338 - +chrX 128735100 128735420 - +chrX 128740934 128741254 - +chrX 128771605 128771741 - +chrX 129527472 129527792 - +chrX 129673707 129674027 - +chrX 129709843 129710163 - +chrX 130050857 130051177 - +chrX 130767552 130767872 - +chrX 130811729 130812049 - +chrX 130937116 130937436 - +chrX 130975423 130975743 - +chrX 132926559 132926761 - +chrX 134246185 134246505 - +chrX 135014086 135014406 - +chrX 135245016 135245336 - +chrX 135659740 135660060 - +chrX 136007341 136007661 - +chrX 136134578 136134898 - +chrX 139139133 139139453 - +chrX 139815945 139816265 - +chrX 139844282 139844602 - +chrX 139848221 139848541 - +chrX 148552446 148552766 - +chrX 149940630 149940950 - +chrX 150250959 150251049 - +chrX 151800975 151801295 - +chrX 151989745 151990065 - +chrX 152073114 152073434 - +chrX 152127447 152127767 - +chrX 152773019 152773339 - +chrX 152804952 152805272 - +chrX 152820261 152820581 - +chrX 152875309 152875629 - +chrX 152934764 152935084 - +chrX 153029402 153029722 - +chrX 153212756 153213076 - +chrX 153218901 153219070 - +chrX 153279077 153279397 - +chrX 153316698 153317018 - +chrX 153536723 153537043 - +chrX 153763163 153763325 - +chrX 153943242 153943562 - +chrX 153977025 153977345 - diff --git a/tests/bedshift/bedshift_analysis.yaml b/tests/bedshift/bedshift_analysis.yaml new file mode 100644 index 00000000..bf9c9a28 --- /dev/null +++ b/tests/bedshift/bedshift_analysis.yaml @@ -0,0 +1,27 @@ 
+bedshift_operations: + - add: + rate: 0.1 + mean: 100 + stdev: 20 + - drop_from_file: + file: tests/bedshift/test.bed + rate: 0.1 + delimiter: \t + - shift_from_file: + file: tests/bedshift/test2.bed + rate: 0.5 + mean: 100 + stdev: 200 + - add_from_file: + file: tests/bedshift/small_test.bed + rate: 0.2 + - cut: + rate: 0.2 + - shift: + rate: 0.3 + mean: 100 + stdev: 200 + - drop: + rate: 0.30 + - merge: + rate: 0.15 diff --git a/tests/bedshift/chrom_sizes_1 b/tests/bedshift/chrom_sizes_1 new file mode 100644 index 00000000..407f3917 --- /dev/null +++ b/tests/bedshift/chrom_sizes_1 @@ -0,0 +1 @@ +1 10000 diff --git a/tests/bedshift/chrom_sizes_2 b/tests/bedshift/chrom_sizes_2 new file mode 100644 index 00000000..c63a4587 --- /dev/null +++ b/tests/bedshift/chrom_sizes_2 @@ -0,0 +1 @@ +1 100 diff --git a/tests/bedshift/conftest.py b/tests/bedshift/conftest.py new file mode 100644 index 00000000..4bddccef --- /dev/null +++ b/tests/bedshift/conftest.py @@ -0,0 +1,15 @@ +import os + +import pytest + +from geniml.bedshift import bedshift + +SCRIPT_PATH = os.path.dirname(os.path.abspath(__file__)) + + +@pytest.fixture +def bs(): + return bedshift.Bedshift( + os.path.join(SCRIPT_PATH, "test.bed"), + chrom_sizes=os.path.join(SCRIPT_PATH, "hg38.chrom.sizes"), + ) diff --git a/tests/bedshift/from_file.bed b/tests/bedshift/from_file.bed new file mode 100644 index 00000000..151d15d6 --- /dev/null +++ b/tests/bedshift/from_file.bed @@ -0,0 +1,4 @@ +chr1 2100 2550 +chr15 12345600 12400000 +chrY 500 1100 +chr16 57682000 57683001 \ No newline at end of file diff --git a/tests/bedshift/header_test.bed b/tests/bedshift/header_test.bed new file mode 100644 index 00000000..604ff6ea --- /dev/null +++ b/tests/bedshift/header_test.bed @@ -0,0 +1,4 @@ +chrom start end +chr16 57683001 57683273 +chr1 10000 50000 +chr2 123456 234567 \ No newline at end of file diff --git a/tests/bedshift/hg19.chrom.sizes b/tests/bedshift/hg19.chrom.sizes new file mode 100644 index 00000000..ae5e9d32 --- 
/dev/null +++ b/tests/bedshift/hg19.chrom.sizes @@ -0,0 +1,93 @@ +chr1 249250621 +chr2 243199373 +chr3 198022430 +chr4 191154276 +chr5 180915260 +chr6 171115067 +chr7 159138663 +chr8 146364022 +chr9 141213431 +chr10 135534747 +chr11 135006516 +chr12 133851895 +chr13 115169878 +chr14 107349540 +chr15 102531392 +chr16 90354753 +chr17 81195210 +chr18 78077248 +chr19 59128983 +chr20 63025520 +chr21 48129895 +chr22 51304566 +chrX 155270560 +chrY 59373566 +chrM 16571 +chr1_gl000191_random 106433 +chr1_gl000192_random 547496 +chr4_ctg9_hap1 590426 +chr4_gl000193_random 189789 +chr4_gl000194_random 191469 +chr6_apd_hap1 4622290 +chr6_cox_hap2 4795371 +chr6_dbb_hap3 4610396 +chr6_mann_hap4 4683263 +chr6_mcf_hap5 4833398 +chr6_qbl_hap6 4611984 +chr6_ssto_hap7 4928567 +chr7_gl000195_random 182896 +chr8_gl000196_random 38914 +chr8_gl000197_random 37175 +chr9_gl000198_random 90085 +chr9_gl000199_random 169874 +chr9_gl000200_random 187035 +chr9_gl000201_random 36148 +chr11_gl000202_random 40103 +chr17_ctg5_hap1 1680828 +chr17_gl000203_random 37498 +chr17_gl000204_random 81310 +chr17_gl000205_random 174588 +chr17_gl000206_random 41001 +chr18_gl000207_random 4262 +chr19_gl000208_random 92689 +chr19_gl000209_random 159169 +chr21_gl000210_random 27682 +chrUn_gl000211 166566 +chrUn_gl000212 186858 +chrUn_gl000213 164239 +chrUn_gl000214 137718 +chrUn_gl000215 172545 +chrUn_gl000216 172294 +chrUn_gl000217 172149 +chrUn_gl000218 161147 +chrUn_gl000219 179198 +chrUn_gl000220 161802 +chrUn_gl000221 155397 +chrUn_gl000222 186861 +chrUn_gl000223 180455 +chrUn_gl000224 179693 +chrUn_gl000225 211173 +chrUn_gl000226 15008 +chrUn_gl000227 128374 +chrUn_gl000228 129120 +chrUn_gl000229 19913 +chrUn_gl000230 43691 +chrUn_gl000231 27386 +chrUn_gl000232 40652 +chrUn_gl000233 45941 +chrUn_gl000234 40531 +chrUn_gl000235 34474 +chrUn_gl000236 41934 +chrUn_gl000237 45867 +chrUn_gl000238 39939 +chrUn_gl000239 33824 +chrUn_gl000240 41933 +chrUn_gl000241 42152 +chrUn_gl000242 43523 +chrUn_gl000243 43341 
+chrUn_gl000244 39929 +chrUn_gl000245 36651 +chrUn_gl000246 38154 +chrUn_gl000247 36422 +chrUn_gl000248 39786 +chrUn_gl000249 38502 diff --git a/tests/bedshift/hg38.chrom.sizes b/tests/bedshift/hg38.chrom.sizes new file mode 100644 index 00000000..63962a61 --- /dev/null +++ b/tests/bedshift/hg38.chrom.sizes @@ -0,0 +1,24 @@ +chr1 248956422 +chr2 242193529 +chr3 198295559 +chr4 190214555 +chr5 181538259 +chr6 170805979 +chr7 159345973 +chr8 145138636 +chr9 138394717 +chr10 133797422 +chr11 135086622 +chr12 133275309 +chr13 114364328 +chr14 107043718 +chr15 101991189 +chr16 90338345 +chr17 83257441 +chr18 80373285 +chr19 58617616 +chr20 64444167 +chr21 46709983 +chr22 50818468 +chrX 156040895 +chrY 57227415 diff --git a/tests/bedshift/shell_test.sh b/tests/bedshift/shell_test.sh new file mode 100755 index 00000000..1a3f75e3 --- /dev/null +++ b/tests/bedshift/shell_test.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +bedshift --bedfile $(dirname "$0")/test.bed -l $(dirname "$0")/hg38.chrom.sizes --droprate 0.1 --addrate 0.2 --addmean 320.0 --addstdev 30.0 --shiftrate 0.3 --shiftmean 0.0 --shiftstdev 150.0 --cutrate 0.1 --mergerate 0.2 --outputfile $(dirname "$0")/sh_output.bed + +bedshift --bedfile $(dirname "$0")/test.bed -l $(dirname "$0")/hg38.chrom.sizes --shiftrate 0.4 --droprate 0.1 --addrate 0.2 --addfile $(dirname "$0")/test.bed --outputfile $(dirname "$0")/sh_output2.bed + +bedshift --bedfile $(dirname "$0")/test.bed -l $(dirname "$0")/hg38.chrom.sizes --addrate 0.1 --cutrate 0.5 --addfile $(dirname "$0")/test.bed --outputfile $(dirname "$0")/sh_output3.bed -r 3 diff --git a/tests/bedshift/small_test.bed b/tests/bedshift/small_test.bed new file mode 100644 index 00000000..d080f0ed --- /dev/null +++ b/tests/bedshift/small_test.bed @@ -0,0 +1 @@ +chr16 57683001 57683273 \ No newline at end of file diff --git a/tests/bedshift/small_test2.bed b/tests/bedshift/small_test2.bed new file mode 100644 index 00000000..5ad16f3e --- /dev/null +++ b/tests/bedshift/small_test2.bed @@ 
-0,0 +1,7 @@ +chr16 57683001 57683273 +chr1 2000 2500 +chr1 2600 2777 +chr2 100000 200000 +chr2 200000 210000 +chr15 12345678 12345789 +chrY 1000 1200 diff --git a/tests/bedshift/test.bed b/tests/bedshift/test.bed new file mode 100644 index 00000000..87b887ac --- /dev/null +++ b/tests/bedshift/test.bed @@ -0,0 +1,1000 @@ +chr16 57683001 57683273 +chr6 53036699 53036974 +chr10 76995732 76995979 +chr12 54773504 54773736 +chr3 52481114 52481353 +chr1 17036345 17036564 +chr12 58299186 58299434 +chr17 17259432 17259658 +chr17 36823867 36824096 +chr16 4304077 4304332 +chr14 50328828 50329070 +chr1 114889180 114889434 +chr9 36008855 36009095 +chr2 91815094 91815295 +chr9 135897807 135898028 +chr19 19285892 19286130 +chr3 195577808 195578049 +chr15 40566969 40567171 +chr12 57632931 57633148 +chr16 19897657 19897872 +chr2 178029373 178029587 +chr6 32383276 32383499 +chr6 73121954 73122191 +chr17 39818986 39819236 +chr14 21560709 21560965 +chr5 43105868 43106092 +chr17 79075575 79075817 +chr2 75867740 75867954 +chr5 110493887 110494095 +chrX 100023783 100024027 +chr10 46983727 46983962 +chr7 150869691 150869921 +chr20 30200745 30200970 +chr16 50058820 50059049 +chr9 130879975 130880204 +chr22 20918660 20918889 +chr19 41140470 41140697 +chr17 37778254 37778472 +chr16 57610355 57610564 +chr7 93918396 93918583 +chr20 32900544 32900788 +chr1 150952019 150952241 +chr6 41336680 41336926 +chr8 22431955 22432196 +chrX 132926559 132926761 +chr11 129234101 129234306 +chr15 45410376 45410584 +chr10 88699223 88699443 +chr16 86597452 86597736 +chr20 31377913 31378129 +chr10 35852305 35852523 +chr3 50653447 50653643 +chr5 40904900 40905108 +chr17 42852225 42852442 +chr5 137688079 137688345 +chr9 90427873 90428077 +chr15 56538296 56538511 +chr10 118976186 118976410 +chr10 45353289 45353492 +chr1 32429773 32429986 +chr17 35730554 35730767 +chr12 46465981 46466196 +chr17 32527044 32527245 +chr1 9687084 9687319 +chr9 80048683 80048868 +chr14 104544390 104544564 +chr1 68808510 68808711 +chr12 
111325896 111326107 +chr22 18312816 18313026 +chr1 168115113 168115318 +chr3 46530248 46530460 +chr17 43212695 43212915 +chr17 55320549 55320757 +chr15 75242802 75243001 +chr11 62791635 62791849 +chr19 34992503 34992731 +chr17 34476425 34476629 +chr17 78549191 78549406 +chr20 56784973 56785186 +chr16 75498677 75498904 +chr9 137035926 137036140 +chr19 42612745 42612951 +chr6 41286665 41286868 +chr1 11898793 11898994 +chr14 94952671 94952871 +chr21 27173934 27174148 +chr2 38342484 38342660 +chr17 36579992 36580189 +chr11 18753621 18753798 +chr6 88032145 88032339 +chr1 225662708 225662915 +chr1 45251830 45252026 +chr8 57154572 57154788 +chr20 542435 542615 +chr14 94359384 94359583 +chr13 30687177 30687378 +chr11 107670083 107670262 +chr12 54973589 54973775 +chr12 14783125 14783315 +chr6 136547452 136547648 +chr2 223555396 223555584 +chr14 23590073 23590285 +chr11 119979005 119979203 +chr10 46951331 46951536 +chrX 64748692 64748894 +chr1 19239633 19239837 +chr6 86192899 86193096 +chr1 16359416 16359607 +chr3 48477487 48477697 +chr16 89927311 89927518 +chr17 40995478 40995664 +chr8 27454748 27454926 +chr7 73981178 73981369 +chr13 29937246 29937416 +chr19 10691729 10691932 +chr20 37063986 37064190 +chr10 30287338 30287539 +chr18 74712493 74712686 +chr14 76853608 76853803 +chr1 155034070 155034258 +chr7 25605591 25605795 +chr22 37252468 37252660 +chr16 67850531 67850727 +chr11 72145255 72145459 +chr11 76838250 76838481 +chr2 42890367 42890548 +chr16 2473197 2473391 +chr2 99001646 99001820 +chr17 45307198 45307376 +chr10 82265418 82265597 +chr12 49208454 49208658 +chr10 106051253 106051435 +chr13 53542320 53542506 +chr15 45722524 45722699 +chr10 71267703 71267890 +chr22 36462233 36462433 +chr12 4713885 4714081 +chr2 95688125 95688316 +chr19 48201460 48201644 +chr9 4339428 4339621 +chr17 76247037 76247255 +chr1 37953081 37953272 +chr7 73140156 73140342 +chr3 187037582 187037763 +chr4 3482329 3482519 +chr17 10519908 10520104 +chr11 60897347 60897518 +chr3 127293908 127294084 
+chr16 90148414 90148616 +chr6 15691580 15691774 +chr9 129261566 129261735 +chr20 4089361 4089553 +chr7 100157480 100157663 +chr3 113007016 113007200 +chr14 21439148 21439344 +chr17 41923705 41923898 +chr1 21697127 21697301 +chr5 55566441 55566609 +chr18 43201030 43201207 +chr2 114361714 114361888 +chr4 1124443 1124636 +chr9 3467696 3467855 +chr20 21485933 21486130 +chr2 99266000 99266156 +chr16 66349237 66349405 +chr17 55969918 55970106 +chr9 136608165 136608359 +chr11 117714785 117714967 +chr12 52607378 52607572 +chr17 12927859 12928052 +chr19 56165060 56165274 +chr12 48136064 48136258 +chr11 65990236 65990436 +chr2 239892279 239892437 +chr1 204131710 204131879 +chr2 135208523 135208714 +chr11 62673514 62673720 +chr1 47641095 47641281 +chr12 122232168 122232369 +chr7 66022639 66022817 +chr16 72866522 72866715 +chr12 54140483 54140670 +chr1 7727680 7727859 +chr2 129494239 129494411 +chr11 36768729 36768902 +chr11 19098613 19098791 +chr3 38692799 38692987 +chr1 21695062 21695245 +chr3 46671529 46671715 +chr19 46146461 46146637 +chr7 100161584 100161770 +chr13 77477795 77477958 +chr3 194827260 194827428 +chr12 125667780 125667968 +chr6 2932507 2932700 +chr8 74086644 74086821 +chr10 50801198 50801375 +chr14 21572730 21572913 +chr22 31545086 31545263 +chr4 38321533 38321716 +chr14 59974154 59974331 +chr17 17715625 17715791 +chr17 30580147 30580317 +chr2 70875117 70875287 +chr19 46580950 46581126 +chr9 123239807 123239968 +chr19 30207546 30207721 +chr2 118572016 118572195 +chr1 16382676 16382872 +chr11 66176010 66176188 +chr14 23538168 23538350 +chr19 8645717 8645900 +chr12 9483825 9484008 +chr18 864879 865060 +chr20 937186 937363 +chr1 94279657 94279816 +chr16 20861286 20861447 +chr10 115281050 115281197 +chr19 50184056 50184239 +chr19 42455886 42456066 +chr19 35986382 35986541 +chr22 47194584 47194749 +chr5 176540972 176541153 +chr9 116870292 116870474 +chr1 41314299 41314471 +chr20 796755 796942 +chr6 87788830 87788995 +chr1 204096880 204097037 +chr20 8771227 
8771383 +chr14 78328628 78328826 +chr3 139048276 139048460 +chr19 44289546 44289729 +chr12 78319402 78319584 +chr2 1136192 1136372 +chr7 44605201 44605381 +chr8 139739730 139739917 +chr19 13088740 13088912 +chr9 129622587 129622754 +chr10 129809704 129809866 +chr19 54296278 54296470 +chr3 13692063 13692210 +chr9 100161523 100161695 +chrX 64815376 64815537 +chr9 136023005 136023165 +chr12 13431185 13431365 +chr9 71199454 71199617 +chr19 59092617 59092788 +chr11 63920641 63920831 +chr8 40851359 40851549 +chr15 89600842 89601008 +chr12 5112290 5112476 +chr11 74959769 74959970 +chr19 40023533 40023694 +chr16 77270023 77270207 +chr5 71314120 71314258 +chr20 43238728 43238902 +chr19 35605599 35605796 +chr11 20631761 20631926 +chr1 181159086 181159268 +chr3 136751346 136751523 +chr12 111332265 111332448 +chr4 6976232 6976391 +chr18 71510127 71510300 +chr16 27166439 27166606 +chr14 99732998 99733161 +chr10 34926524 34926681 +chr11 44642795 44642964 +chr17 66168649 66168806 +chr21 47343208 47343365 +chr16 23607645 23607817 +chr9 100149325 100149509 +chr20 16802235 16802396 +chr17 30986422 30986588 +chr17 27077180 27077384 +chr15 80285832 80286012 +chr3 185378313 185378468 +chr12 120868337 120868524 +chr14 91885247 91885428 +chr12 51735294 51735468 +chr22 25858589 25858752 +chr7 75807526 75807681 +chr19 5804112 5804298 +chr15 41149937 41150113 +chr2 31556572 31556733 +chr4 55185185 55185349 +chr8 23550250 23550407 +chr15 76440340 76440522 +chr7 128731681 128731808 +chr6 73839745 73839920 +chr19 32715613 32715782 +chr3 124187141 124187317 +chr9 133559525 133559705 +chr16 21566235 21566399 +chr11 117688547 117688725 +chr7 44517044 44517218 +chr3 13689847 13690021 +chr2 220163497 220163675 +chr1 33764365 33764546 +chr2 128349147 128349304 +chr2 217531443 217531619 +chr2 220324208 220324385 +chr6 90081871 90082033 +chr7 73269962 73270132 +chr8 16870518 16870687 +chr2 30934552 30934712 +chr9 100174073 100174259 +chr21 45359020 45359206 +chr1 226735482 226735663 +chrX 153763163 
153763325 +chr12 50449848 50450014 +chr17 19669893 19670056 +chr17 30646552 30646728 +chrX 10087680 10087870 +chr1 120413638 120413788 +chr2 1821417 1821573 +chr1 157961836 157962013 +chr1 40804480 40804645 +chr14 25148241 25148410 +chr6 161795005 161795178 +chr9 33466737 33466912 +chr7 148762987 148763158 +chr2 30462239 30462402 +chr1 201223789 201223939 +chr3 5022076 5022220 +chr2 44706632 44706800 +chr11 48194499 48194662 +chr17 41561033 41561224 +chr12 69550933 69551099 +chr17 76418219 76418353 +chr3 195586280 195586445 +chr17 20896297 20896464 +chr12 49012357 49012522 +chr6 159595393 159595555 +chr18 9771601 9771733 +chr6 2952857 2953026 +chr5 149541275 149541434 +chr8 1978838 1978990 +chr11 58903468 58903624 +chr19 42800302 42800479 +chr19 31157320 31157493 +chr20 48227719 48227887 +chr17 80739765 80739925 +chr5 157170584 157170773 +chr20 35515700 35515856 +chr8 82692827 82693000 +chr10 115323324 115323495 +chr8 20233807 20233977 +chr10 73656292 73656446 +chr4 178171334 178171498 +chr21 35734248 35734400 +chr1 205561047 205561202 +chr15 69768201 69768350 +chr12 53553239 53553431 +chr3 137787533 137787685 +chr1 35318309 35318629 +chr8 9688604 9688698 +chr3 123005224 123005401 +chr7 75943688 75943869 +chr13 30046597 30046769 +chr13 49343378 49343519 +chr2 128821092 128821229 +chr11 192082 192268 +chr7 43878566 43878743 +chr11 65652213 65652385 +chr20 30150735 30150903 +chr3 45701633 45701770 +chr20 59838274 59838426 +chr5 179742804 179742972 +chrX 153218901 153219070 +chr20 34790860 34791012 +chr13 29210967 29211121 +chr3 170132018 170132184 +chr13 100089228 100089411 +chr6 29720945 29721104 +chr17 47957966 47958134 +chr2 218654038 218654160 +chr22 29641626 29641797 +chr4 87843637 87843795 +chr1 16951519 16951697 +chr22 45080732 45080906 +chr3 15244285 15244465 +chr8 99182333 99182492 +chr15 32943548 32943707 +chr16 57118245 57118402 +chr5 177591294 177591467 +chr15 90863356 90863499 +chr1 27371106 27371287 +chr22 27075080 27075235 +chr15 91429332 91429496 
+chr2 220507287 220507483 +chr1 212185425 212185560 +chr10 112431916 112432097 +chr21 45230721 45230874 +chr18 77006630 77006775 +chr1 57286828 57286982 +chr5 111496387 111496545 +chr17 43462998 43463167 +chr14 69378413 69378584 +chr15 89448083 89448219 +chr7 5526630 5526786 +chr12 113612308 113612466 +chr8 56923242 56923400 +chr8 81595282 81595445 +chr1 15757740 15757903 +chr9 138304525 138304686 +chr1 201377434 201377605 +chr1 5574724 5574874 +chr20 30619242 30619389 +chr22 33698104 33698250 +chr12 69684872 69685047 +chr17 18120792 18120978 +chr22 32029917 32030076 +chr17 1026380 1026544 +chr15 51425197 51425348 +chr14 100095281 100095443 +chr10 28952904 28953021 +chr19 47040864 47041027 +chr1 44471407 44471576 +chr17 56665101 56665254 +chr11 1108747 1108898 +chr15 80145877 80146197 +chr22 30401331 30401493 +chr8 141647578 141647723 +chr10 47081063 47081222 +chr11 73046606 73046773 +chr11 87063901 87064035 +chr7 100702720 100702888 +chr19 3429605 3429780 +chr11 47174755 47174911 +chr2 18569047 18569192 +chr14 105116757 105116922 +chr10 6130696 6130937 +chr9 127905963 127906119 +chr3 14962749 14962923 +chr1 41134667 41134832 +chr20 40363166 40363327 +chr12 110243175 110243331 +chr3 185661411 185661574 +chr7 142912288 142912444 +chr11 64321074 64321192 +chr21 46712474 46712630 +chr22 39925327 39925487 +chr1 28184660 28184804 +chr6 7276261 7276426 +chr2 40065109 40065249 +chr1 66708482 66708623 +chr10 118547686 118547849 +chr2 30338668 30338817 +chr6 31514503 31514647 +chr9 134053766 134053926 +chr20 19643154 19643294 +chr15 85471534 85471691 +chr12 54793170 54793335 +chr2 238832384 238832551 +chr11 64877856 64878022 +chr17 21183414 21183572 +chr19 50073513 50073704 +chr6 126317230 126317383 +chr8 33282240 33282377 +chr11 17620599 17620761 +chr4 152283933 152284104 +chr11 65479320 65479475 +chr17 36204445 36204581 +chr1 155220562 155220757 +chr20 47950555 47950714 +chr9 124885349 124885533 +chr14 75075703 75075844 +chr11 122073772 122073924 +chr9 133933339 133933493 
+chr3 17882838 17882998 +chr16 67499818 67499971 +chr6 166689265 166689416 +chr4 1630962 1631133 +chr2 103232406 103232581 +chr19 11404533 11404688 +chr17 26875280 26875442 +chr1 202205556 202205724 +chr22 37473950 37474103 +chr10 13628781 13628949 +chr1 201192133 201192286 +chr1 16947628 16947785 +chr19 8067373 8067525 +chr16 147016 147175 +chr1 159906289 159906458 +chr19 8764083 8764245 +chr10 74100298 74100439 +chr9 97890026 97890182 +chr2 44105155 44105276 +chr2 149267112 149267280 +chr9 71650559 71650710 +chr10 99519273 99519445 +chr12 13069272 13069401 +chr1 153541128 153541287 +chr9 131580579 131580708 +chr15 33993010 33993151 +chr2 220142494 220142662 +chr11 60955133 60955276 +chr10 103769701 103769859 +chr6 131949456 131949607 +chr14 95155933 95156081 +chr16 48657237 48657360 +chr17 7227628 7227756 +chr1 47902216 47902372 +chr17 43275268 43275463 +chr10 75608910 75609046 +chr19 47930656 47930758 +chr2 20842249 20842414 +chr16 85454394 85454548 +chr19 49601164 49601306 +chr19 4558472 4558664 +chr2 113017113 113017275 +chr16 46707396 46707552 +chr22 23728181 23728349 +chr8 23145408 23145728 +chr20 34583028 34583175 +chr15 70780471 70780639 +chr7 150821127 150821286 +chr14 94603160 94603318 +chr10 134036702 134036857 +chr10 73808405 73808556 +chr9 139686340 139686514 +chr9 101737279 101737440 +chr6 130096410 130096509 +chr17 72740846 72740999 +chrX 118138994 118139157 +chr19 45942802 45942953 +chrX 104166897 104167078 +chr8 144110796 144110969 +chr1 20959795 20959956 +chr12 111834823 111834973 +chr17 77979700 77979850 +chr11 125218779 125218889 +chr4 8470586 8470725 +chr18 12391766 12391932 +chr2 26800798 26800963 +chr1 154990098 154990252 +chr19 510730 510902 +chr15 75092617 75092746 +chr1 223263514 223263669 +chr4 154006442 154006600 +chr10 73456574 73456738 +chr17 61996670 61996830 +chr12 54595200 54595361 +chr5 174961152 174961339 +chr7 73815701 73815865 +chr11 61500988 61501147 +chr11 66462299 66462483 +chr2 72364723 72364864 +chr2 28675475 28675627 
+chr17 55749384 55749516 +chr10 43601048 43601184 +chr19 33634911 33635036 +chr18 43460026 43460196 +chr11 117297038 117297189 +chr11 19281382 19281544 +chr15 101629101 101629218 +chr14 70480121 70480261 +chr10 124027699 124027847 +chr20 48595623 48595770 +chr15 51178184 51178330 +chr20 62013304 62013461 +chr1 201482197 201482356 +chr2 12003852 12003989 +chr12 114350639 114350794 +chr20 59804312 59804469 +chr15 74189854 74189982 +chr3 58153177 58153329 +chr2 73312849 73312985 +chr11 78356355 78356507 +chr15 73928855 73929015 +chr20 42709496 42709638 +chr8 118342470 118342619 +chr2 175206322 175206481 +chr12 95730378 95730512 +chr17 6376571 6376718 +chr2 25565357 25565510 +chr3 64009153 64009312 +chr7 128038727 128038854 +chr1 51762574 51762737 +chr16 441898 442043 +chr1 204716088 204716239 +chr20 719794 719921 +chr12 114418432 114418587 +chr1 109806290 109806448 +chr9 134211763 134211891 +chr8 124172777 124172911 +chr10 79471288 79471448 +chr11 130731965 130732105 +chr3 42485456 42485545 +chr20 11993006 11993165 +chr7 1032866 1033033 +chr2 121070850 121071003 +chr17 42287772 42287949 +chr9 4759446 4759589 +chr14 90865998 90866132 +chr22 46938295 46938450 +chr1 201762271 201762419 +chr11 68868973 68869114 +chr3 9109152 9109278 +chr9 34500770 34500919 +chr12 53259216 53259369 +chr11 73000616 73000765 +chr1 91630938 91631059 +chr22 37663177 37663327 +chr17 72954132 72954273 +chr17 40474872 40475025 +chr17 32688613 32688762 +chr3 184089406 184089555 +chr6 170190306 170190474 +chr9 135134660 135134835 +chr20 47237638 47237784 +chr7 45299723 45299869 +chr7 45961857 45962009 +chr7 120432163 120432316 +chr19 46320503 46320636 +chr19 39421046 39421220 +chr1 11702594 11702741 +chr6 42134308 42134469 +chr11 120173890 120174036 +chr9 38088071 38088209 +chr15 64237570 64237722 +chr4 53411756 53411878 +chr11 111749735 111749902 +chr5 114970485 114970658 +chr22 46259893 46260047 +chr20 35185031 35185171 +chr17 48624343 48624503 +chr16 4380396 4380555 +chrX 12964981 12965147 
+chr19 1546290 1546456 +chr9 131894067 131894227 +chr1 116012433 116012525 +chr1 204958574 204958694 +chr12 7959574 7959693 +chr15 51169269 51169363 +chr3 133931872 133932014 +chr1 37920550 37920697 +chr9 74895961 74896103 +chr22 20144600 20144769 +chr9 34664388 34664546 +chr17 45920501 45920656 +chr1 230778012 230778186 +chr17 16492763 16492918 +chr10 29785429 29785575 +chr1 222638767 222638924 +chr1 115655523 115655674 +chr15 41196276 41196465 +chr7 86941689 86941828 +chr15 41758107 41758237 +chr17 27438959 27439112 +chr18 74110976 74111112 +chr20 55125750 55125927 +chr7 126894083 126894198 +chr4 186518855 186518994 +chr22 22697354 22697439 +chr3 13537882 13538033 +chr7 5125167 5125327 +chr1 12680026 12680121 +chr1 181394128 181394320 +chr6 30509351 30509477 +chr2 218578313 218578465 +chr10 72968943 72969087 +chr12 6172435 6172564 +chr10 54539991 54540134 +chr6 31461639 31461774 +chr11 3113144 3113271 +chr22 38329963 38330283 +chr20 35820780 35820909 +chr16 11012714 11012858 +chr19 35503979 35504118 +chr19 48366760 48366921 +chr17 1998394 1998526 +chr16 11490181 11490335 +chr22 44389172 44389309 +chr1 144918099 144918257 +chr2 71089084 71089225 +chr5 71907065 71907210 +chr1 9065284 9065439 +chr11 67150115 67150275 +chr11 61103787 61103897 +chr4 3445305 3445465 +chr6 152659799 152659907 +chr16 47175689 47175840 +chr6 53590667 53590987 +chr8 77280138 77280278 +chr19 5578360 5578506 +chr19 18977428 18977567 +chr1 68001189 68001334 +chr19 17685021 17685177 +chr20 46012840 46013006 +chr1 205626794 205626965 +chr4 25314234 25314326 +chr4 134067933 134068082 +chr11 68151508 68151649 +chr14 24901076 24901227 +chr9 132998559 132998694 +chr5 131399269 131399376 +chr14 106004308 106004450 +chr9 34379454 34379595 +chr11 60666924 60667079 +chr1 11779943 11780100 +chr9 129320437 129320603 +chr16 68554604 68554777 +chr6 27740339 27740473 +chr6 31367437 31367577 +chr8 144483425 144483587 +chr6 29557344 29557496 +chr11 46391093 46391256 +chr1 209801115 209801261 +chr19 41955374 
41955502 +chr15 62126409 62126552 +chr3 8811164 8811302 +chr16 70805122 70805249 +chr12 54137544 54137632 +chr9 131936407 131936548 +chr17 43505522 43505666 +chr8 105772007 105772200 +chr2 47995161 47995278 +chr10 102902109 102902229 +chr3 134027316 134027453 +chr1 206907975 206908134 +chr17 43717844 43717983 +chrX 16911033 16911179 +chr12 13387126 13387217 +chr1 23188685 23188822 +chr11 69616437 69616586 +chr5 172141813 172141961 +chr1 19210752 19210865 +chr9 97431231 97431335 +chr17 1624226 1624353 +chr14 71284480 71284631 +chr20 34035127 34035268 +chr18 8478765 8478921 +chr14 55518195 55518357 +chr5 87986872 87987022 +chr7 75495883 75496022 +chr9 110187649 110187806 +chr9 132404522 132404842 +chr5 671797 671949 +chr1 158057233 158057370 +chr13 27504510 27504647 +chr11 74413523 74413672 +chr11 728170 728252 +chr9 34702213 34702367 +chr9 34403927 34404019 +chr10 88720293 88720435 +chr2 47214257 47214411 +chr12 13408808 13408897 +chr11 58443141 58443284 +chr8 85538421 85538588 +chr1 231892642 231892786 +chr3 187465874 187466028 +chr17 77042056 77042199 +chr12 52477294 52477454 +chr19 47539152 47539289 +chr17 47113492 47113659 +chr15 65346704 65346860 +chr6 110026509 110026596 +chr15 72448490 72448630 +chr1 19567851 19568005 +chr1 120154333 120154479 +chr10 45428253 45428395 +chr1 37116182 37116327 +chr2 3427769 3427924 +chr17 41387219 41387379 +chr5 136680526 136680685 +chr17 59494319 59494454 +chr7 43148125 43148265 +chr19 8680711 8680861 +chr4 1728083 1728229 +chr2 110303791 110303938 +chr2 65267918 65268074 +chr10 102774147 102774308 +chr2 121278913 121279069 +chr1 205304826 205304988 +chr7 1472221 1472347 +chr6 36308879 36308995 +chr1 159861471 159861616 +chr9 114384290 114384433 +chr11 125274881 125275034 +chr5 176538974 176539116 +chr4 37604719 37604877 +chr1 25428050 25428165 +chr17 15074870 15075022 +chr14 24559845 24559980 +chr2 85906423 85906529 +chr7 30780185 30780321 +chr14 37411542 37411686 +chr5 131400495 131400660 +chr2 23535997 23536113 +chr11 
123451389 123451517 +chr8 22446660 22446799 +chr2 120460533 120460663 +chr17 3586360 3586537 +chr6 143748468 143748621 +chr11 61666672 61666838 +chr17 27053962 27054102 +chr10 33297753 33297908 +chr11 1853809 1853955 +chr7 149439368 149439510 +chr17 9632471 9632604 +chr11 20408760 20408863 +chr20 1484373 1484547 +chr10 102295481 102295590 +chr17 17763643 17763797 +chr12 50260991 50261159 +chr21 39084573 39084724 +chr9 119499444 119499602 +chr8 52921148 52921288 +chr3 184521181 184521332 +chr2 42067890 42068047 +chr18 59249174 59249308 +chr19 1837712 1837855 +chr9 102790506 102790652 +chr17 73400364 73400511 +chr19 2579471 2579791 +chr11 74437737 74437877 +chr1 19147363 19147522 +chr5 141258833 141258976 +chr7 113350461 113350586 +chr3 14302752 14302881 +chr9 95922233 95922553 +chr13 48669281 48669439 +chr16 74847109 74847220 +chr2 241453375 241453504 +chr18 77915735 77915869 +chr14 70070407 70070490 +chr11 1224328 1224470 +chr11 45676950 45677083 +chr17 74519297 74519422 +chr1 111326053 111326160 +chr1 11002157 11002269 +chr16 28082180 28082296 +chr1 155959274 155959418 +chr6 45898262 45898423 +chr6 37659893 37660047 +chr14 91164046 91164184 +chr14 72887566 72887697 +chr6 30098909 30099069 +chr15 39110041 39110140 +chr14 55737822 55738142 +chr14 74684483 74684563 +chr11 62379985 62380142 +chr22 42062756 42062882 +chr8 103826972 103827115 +chr6 36727046 36727203 +chr17 36505111 36505259 +chr5 150427497 150427622 +chr4 71370971 71371121 +chr7 41982739 41982841 +chr7 100926378 100926541 +chr19 10111016 10111134 +chr15 90304084 90304237 +chr2 73439650 73439970 +chr6 3251805 3251929 +chr11 61284540 61284620 +chr14 105175074 105175216 +chr20 45386858 45386993 +chr6 44235839 44235992 +chr5 50258869 50259009 +chr18 839368 839688 +chr6 133890098 133890253 +chr17 48860625 48860778 +chr1 33560719 33560865 +chr3 51703399 51703538 +chr1 147364935 147365097 +chr6 151335620 151335701 +chr12 113863271 113863391 +chr2 28789721 28789883 +chr11 45792874 45793026 +chr13 22051258 
22051418 +chr1 32986122 32986278 +chr12 54151050 54151192 +chr6 44281097 44281188 +chr6 80579335 80579655 +chr19 46532249 46532394 +chr2 88301471 88301614 +chr17 6569424 6569565 +chr11 16810020 16810155 +chr9 115607084 115607206 +chr20 49204828 49204933 +chr17 38716634 38716779 +chr19 56764300 56764457 +chr1 168051608 168051737 +chr12 57607163 57607302 +chr15 82400932 82401062 +chr20 39801895 39801995 +chr12 12857690 12857815 +chr10 72663169 72663316 +chr6 122813209 122813325 +chr11 10955380 10955501 +chr2 43251738 43251859 +chr2 109252906 109253041 +chr8 143716136 143716268 +chr9 126112237 126112389 +chr9 112232053 112232203 +chr20 49252550 49252675 +chr13 113098399 113098542 +chr7 30978383 30978703 +chr19 46184851 46185007 +chr8 28181940 28182090 +chr5 142078792 142078927 +chr9 19457554 19457680 +chr11 18001434 18001597 +chr8 135918487 135918593 +chr17 2148177 2148314 +chr16 19918901 19919027 +chr5 150695701 150695837 +chr7 6556087 6556210 +chr1 206663521 206663664 +chr19 6125566 6125886 +chr1 145059083 145059230 +chr10 105245546 105245700 +chr14 24399136 24399234 +chr21 46312809 46312950 +chr15 79476862 79476991 +chr9 131905341 131905476 +chr2 144270764 144270889 +chr13 67990559 67990679 +chr2 44271438 44271536 +chr9 129160864 129160988 +chr2 30064398 30064551 +chr17 3817234 3817378 +chr19 47207843 47207991 +chr15 71407636 71407756 +chr19 41710291 41710443 +chr6 80487813 80487946 +chr15 65020514 65020661 +chr14 24031988 24032308 +chr20 23587326 23587473 +chrX 1573188 1573337 +chr11 62310334 62310497 +chr12 12717068 12717228 +chr8 86132950 86133104 +chr22 37853391 37853711 +chr1 85219341 85219484 +chr2 179756269 179756397 +chr2 65058982 65059116 +chr10 8095352 8095483 +chr11 34195990 34196140 +chr1 90228641 90228765 +chr4 7072631 7072755 +chr1 9241446 9241531 +chr16 57451062 57451190 +chr11 125011748 125011895 +chr19 584498 584649 +chr14 105512167 105512342 +chr12 54069859 54070179 +chr6 34122661 34122771 +chr3 33116074 33116212 +chr11 62554078 62554237 +chr9 
131625999 131626112 +chr6 10801639 10801725 +chr5 10857280 10857404 +chr17 27918775 27918878 +chr4 25989859 25989992 +chr13 96130884 96131040 +chr1 95271491 95271626 +chr11 77757353 77757498 +chr9 132546198 132546288 +chr19 3163375 3163526 +chr11 72504858 72504965 +chr3 194193138 194193272 +chr2 100879341 100879661 +chr1 44683495 44683636 +chr6 6677175 6677255 +chr12 6574760 6574897 +chr3 125156778 125156908 +chr6 43422337 43422474 +chr19 16173638 16173780 +chr10 101152304 101152459 +chr12 47377796 47378116 +chr11 65547370 65547690 +chr14 75762926 75763060 +chr2 113507353 113507456 +chr3 149939185 149939298 +chr17 74010287 74010607 +chr1 15426891 15427211 +chr18 56431644 56431777 +chr1 211871970 211872053 +chr19 47987102 47987244 +chr15 77835806 77835917 +chr21 39728060 39728186 +chr2 238480671 238480991 +chr8 49084792 49084927 +chr16 48664438 48664567 +chr11 74991319 74991456 +chr12 6097865 6097984 +chr7 142148393 142148477 +chr18 59568715 59568875 +chr19 45303837 45303994 +chr3 38038461 38038591 +chr1 226384792 226384927 +chr2 27977153 27977242 +chr15 51484993 51485141 +chr16 81069239 81069399 +chr13 114992299 114992453 +chr1 211496820 211497140 +chr8 144594884 144595019 +chr18 45971984 45972090 diff --git a/tests/bedshift/test2.bed b/tests/bedshift/test2.bed new file mode 100644 index 00000000..731f64b4 --- /dev/null +++ b/tests/bedshift/test2.bed @@ -0,0 +1,500 @@ +chr17 75491048 75491368 - +chrX 91560343 91560648 A +chr14 54006224 54006544 - +chr5 108831115 108831429 A +chr16 85427311 85427631 - +chr16 77709641 77709961 - +chr6 13615357 13615677 - +chr1 200772148 200772280 - +chr12 132093592 132093912 - +chr13 110874586 110874906 - +chr3 146343046 146343398 A +chr2 51377256 51377625 A +chr19 5720637 5720755 - +chr12 6851090 6851399 A +chrX 10157679 10157999 - +chr8 7283392 7283693 A +chr14 70804087 70804454 A +chr6 108863228 108863548 - +chr12 114404146 114404466 - +chr2 27926636 27926956 - +chr13 99097720 99097849 - +chr20 36793725 36794045 - +chr11 131557897 
131558217 - +chr5 12538050 12538379 A +chr12 53937242 53937562 - +chr2 238062724 238063044 - +chr11 132057843 132058162 A +chr4 165955334 165955604 A +chr2 133175538 133175859 A +chr1 226856481 226856798 A +chr12 53447429 53447749 - +chr1 183680014 183680342 A +chr14 75380707 75381027 - +chr18 2542927 2543235 A +chr13 31377046 31377366 - +chr2 43086117 43086437 - +chr17 42245958 42246040 - +chrX 17609305 17609625 - +chr11 16635222 16635542 - +chr9 117621404 117621696 A +chr10 29824469 29824789 - +chr1 156808798 156808882 - +chr11 15230200 15230520 - +chr11 67030166 67030486 - +chr9 108220298 108220680 A +chr17 34959372 34959692 - +chr16 48664438 48664567 - +chr13 87859697 87860037 A +chr3 119539864 119540184 - +chr3 11102391 11102711 - +chr16 43916332 43916686 A +chr8 113438874 113439189 A +chr19 34836794 34837114 - +chr15 78260607 78260898 A +chr4 150185355 150185675 - +chr4 3482329 3482519 - +chr8 10190264 10190584 - +chr5 167512368 167512696 A +chr1 135903243 135903575 A +chr1 45118654 45118816 - +chr20 62701102 62701422 - +chr16 10603450 10603770 - +chr7 50842903 50843230 A +chr1 30176533 30176613 - +chr1 487505 487831 A +chr2 92220525 92220903 A +chr17 70536518 70536838 - +chr11 73062256 73062576 - +chr8 142357147 142357227 - +chr2 87167156 87167459 A +chr10 70166453 70166743 A +chr7 35077479 35077799 - +chr15 69351475 69351795 - +chr7 142135090 142135410 - +chr6 154057966 154058279 A +chr2 202968997 202969317 - +chr18 12934511 12934831 - +chr10 29681360 29681680 - +chr17 47485346 47485666 - +chr6 80579335 80579655 - +chr3 52265523 52265843 - +chr18 11553459 11553565 - +chr15 62546869 62547226 A +chr9 18590193 18590552 A +chr15 76654690 76654996 A +chr6 154886969 154887335 A +chr11 57224927 57225059 - +chr20 62406522 62406842 - +chr16 68191047 68191366 A +chr21 46712474 46712630 - +chr5 141524910 141525230 - +chr10 113536135 113536389 A +chr16 20786085 20786405 - +chr2 218654038 218654160 - +chr11 5923378 5923672 A +chr11 18192942 18193262 - +chr20 19900204 
19900485 A +chr2 165223417 165223717 A +chr4 72298932 72299274 A +chr11 45894293 45894613 - +chr3 12883225 12883545 - +chr2 205047912 205048201 A +chr11 72145255 72145459 - +chr5 169611093 169611413 - +chr10 108384616 108384936 - +chrX 105602892 105603208 A +chr3 46120338 46120658 - +chr20 61814680 61815000 - +chr11 44560065 44560385 - +chr3 111686574 111686894 - +chr4 4490157 4490477 - +chr18 51611008 51611292 A +chr9 140473314 140473634 - +chrX 112995852 112996143 A +chr7 115010422 115010780 A +chr1 214801505 214801825 - +chr11 103577244 103577579 A +chr20 12990747 12991041 A +chr1 203830408 203830728 - +chr19 47032930 47033054 - +chr18 22068900 22069220 - +chr17 76636671 76637020 A +chr12 120868337 120868524 - +chrX 56047535 56047824 A +chr1 27935136 27935456 - +chr3 50283756 50283872 - +chr18 74513993 74514313 - +chr10 121352552 121352795 A +chr17 41771762 41772082 - +chr18 4068578 4068898 - +chr4 146478954 146479056 - +chr12 99437131 99437451 - +chr14 15296342 15296608 A +chr3 12748017 12748337 - +chr3 106328601 106328921 - +chr1 206757030 206757350 - +chr17 8286506 8286826 - +chr15 73533250 73533570 - +chr5 173846566 173846886 - +chr1 131455711 131456067 A +chr5 66283809 66284129 - +chr10 79479414 79479560 - +chr19 1727699 1728055 A +chr5 3621170 3621453 A +chr4 1350007 1350327 - +chr2 131130061 131130381 - +chr3 127174522 127174626 - +chr7 70294137 70294457 - +chr1 41047816 41048122 A +chr10 45007432 45007717 A +chr12 91775817 91776137 - +chr7 127987636 127987956 - +chr6 30565144 30565464 - +chr8 142396875 142397195 - +chr13 99685406 99685699 A +chr8 42010908 42011228 - +chr13 32820554 32820874 - +chr10 52770709 52771029 - +chr16 39198142 39198496 A +chr4 1747949 1748269 - +chr8 38662478 38662824 A +chr5 81521432 81521752 - +chr16 9233011 9233147 - +chr7 48031342 48031662 - +chr11 131737592 131737912 - +chr18 66290622 66290942 - +chrX 104166897 104167078 - +chr6 142528581 142528901 - +chr8 105430379 105430502 - +chr3 113345758 113345840 - +chr10 95463959 
95464295 A +chr1 156355395 156355715 - +chrX 58064776 58065126 A +chr1 12289935 12290043 - +chr11 6464673 6464993 - +chr19 12747152 12747479 A +chr6 58844492 58844830 A +chr4 158899932 158900252 - +chr8 8105350 8105670 - +chr11 18081061 18081403 A +chrX 37604648 37604968 - +chr5 131722248 131722568 - +chr19 43937137 43937262 - +chr14 78921197 78921459 A +chr11 130722419 130722726 A +chr6 2952857 2953026 - +chr6 99320750 99321095 A +chr5 150431221 150431541 - +chr8 119041072 119041361 A +chr3 10825081 10825401 - +chr2 72098737 72099057 - +chr11 34932755 34933090 A +chr2 161920253 161920573 - +chr8 54500639 54500959 - +chr19 19477764 19477844 - +chr9 140339745 140340065 - +chr16 4250253 4250573 - +chr4 11721955 11722275 - +chr10 48377814 48378134 - +chr15 41149937 41150113 - +chr14 94502924 94503244 - +chr10 29583266 29583641 A +chr19 46302335 46302655 - +chr4 52761989 52762309 - +chr9 126896952 126897297 A +chr3 124884061 124884381 - +chrX 40027405 40027657 A +chr11 32830795 32831115 - +chr19 46580950 46581126 - +chr4 7103724 7104044 - +chr6 112658386 112658706 - +chr3 11985584 11985904 - +chr22 7469665 7469981 A +chr2 177545239 177545523 A +chr16 68263315 68263635 - +chr17 43176531 43176851 - +chr16 67580893 67581213 - +chr19 4851657 4851977 - +chr11 72853345 72853665 - +chr20 18144015 18144339 A +chr3 90007411 90007700 A +chr6 20407746 20408085 A +chr1 20901736 20902056 - +chr10 99674526 99674846 - +chr3 24279540 24279860 - +chr10 130957596 130957949 A +chr21 46213239 46213559 - +chr7 128038727 128038854 - +chr3 101522057 101522377 - +chr5 180296012 180296332 - +chr9 130683866 130684186 - +chr8 136464732 136464848 - +chr11 74734069 74734187 - +chrX 14747260 14747629 A +chr22 38283794 38284114 - +chr21 23433649 23433954 A +chr4 21685346 21685653 A +chr8 143705417 143705737 - +chr10 33482270 33482590 - +chr2 135002989 135003309 - +chr7 73947645 73947983 A +chr19 3706099 3706419 - +chr12 92205486 92205806 - +chr13 31253930 31254250 - +chr19 55929560 55929880 - +chr6 
13795349 13795669 - +chr15 101212213 101212533 - +chr22 21271345 21271665 - +chr7 5569216 5569536 - +chr17 6325358 6325678 - +chr10 54539991 54540134 - +chr12 118312668 118312988 - +chr2 11213407 11213727 - +chr3 141160255 141160575 - +chr10 102774147 102774308 - +chr2 74619206 74619301 - +chr10 75403935 75404255 - +chr21 21002602 21002937 A +chr8 73437782 73438099 A +chr12 120375895 120376215 - +chr4 49781147 49781421 A +chr11 65627676 65627996 - +chr3 196040522 196040804 A +chr17 1162404 1162724 - +chr12 54585381 54585701 - +chr18 39191853 39192189 A +chr19 44263638 44263958 - +chr20 45310025 45310345 - +chr22 35420058 35420378 - +chr1 78486582 78486881 A +chr13 108330475 108330795 - +chr7 146363837 146364178 A +chr1 155017391 155017711 - +chr13 27936236 27936360 - +chr11 59262054 59262438 A +chr11 60542835 60543155 - +chr15 29396103 29396194 - +chrX 153316698 153317018 - +chr17 12933744 12934064 - +chrX 68047634 68047954 - +chr21 39498799 39499119 - +chrX 118646477 118646797 - +chr11 16807074 16807394 - +chr4 4336993 4337313 - +chr17 59376680 59376982 A +chr6 30449856 30450176 - +chr14 95933843 95934174 A +chr2 121837783 121838103 - +chr5 139154688 139155008 - +chr8 141635838 141636144 A +chr21 35802019 35802339 - +chr9 132552617 132552937 - +chr4 190825634 190825954 - +chr20 42843714 42844034 - +chr9 132404522 132404842 - +chr5 113785633 113785953 - +chr2 226525198 226525518 - +chr13 30687177 30687378 - +chr3 141178969 141179289 - +chr19 40023533 40023694 - +chr1 178501471 178501791 - +chr14 105405170 105405503 A +chr16 3989197 3989277 - +chr15 68851953 68852273 - +chrX 109134269 109134372 - +chr2 144041686 144042006 - +chr9 133336576 133336896 - +chr20 17361060 17361403 A +chrX 130050857 130051177 - +chr13 27272856 27273176 - +chr3 168088377 168088643 A +chr17 17472050 17472370 - +chr11 61284540 61284620 - +chr10 129948298 129948618 - +chr20 3034600 3034721 - +chr1 23000652 23000972 - +chr19 11236793 11237113 - +chr11 3251435 3251738 A +chr9 131936407 
131936548 - +chr8 11422033 11422353 - +chr2 69551718 69552090 A +chr9 112650160 112650480 - +chr1 223863661 223863993 A +chr11 124154886 124155191 A +chr7 142912288 142912444 - +chr9 12381250 12381585 A +chr12 103595908 103596208 A +chr20 33166902 33167222 - +chr10 16656107 16656410 A +chr19 7399040 7399407 A +chr11 44712786 44713085 A +chr12 57576775 57577095 - +chr22 4290975 4291265 A +chr20 11893026 11893347 A +chr6 83767487 83767807 - +chr11 20321199 20321488 A +chr9 116349664 116349926 A +chr18 77915735 77915869 - +chr20 36032318 36032664 A +chr8 28181940 28182090 - +chr2 179756269 179756397 - +chr10 71875954 71876274 - +chr15 65596598 65596918 - +chr6 155569578 155569898 - +chr7 37574602 37574923 A +chr6 30640769 30641089 - +chr11 59448063 59448383 - +chr15 85823882 85824184 A +chr3 143123488 143123797 A +chr17 36600494 36600583 - +chr9 74653996 74654091 - +chr17 4316521 4316841 - +chr6 160509460 160509780 - +chr3 118603541 118603861 - +chr15 66544706 66545026 - +chr8 121101488 121101808 - +chr10 64733321 64733649 A +chr11 2326686 2326792 - +chr1 37618694 37619060 A +chr17 30169600 30169920 - +chr22 20144600 20144769 - +chr6 3849393 3849713 - +chr15 57853206 57853526 - +chr14 87635310 87635590 A +chr2 223478496 223478816 - +chr7 137277054 137277374 - +chr7 107103603 107103923 - +chr6 160940886 160941206 - +chr18 45299008 45299328 - +chr12 52317206 52317526 - +chr15 13189463 13189755 A +chr1 156767106 156767426 - +chr9 103874965 103875248 A +chr3 132463672 132464005 A +chr8 27454748 27454926 - +chr20 5203642 5203962 - +chr22 12553683 12553996 A +chr1 213141147 213141467 - +chr13 53726073 53726154 - +chr2 23574408 23574728 - +chr17 18528961 18529281 - +chr4 84954539 84954833 A +chr3 128399533 128399853 - +chr19 31601286 31601606 - +chr1 210547845 210547929 - +chr10 118204068 118204388 - +chr3 42232673 42232993 - +chrY 24260682 24261074 A +chrX 60907113 60907412 A +chr9 6763587 6763935 A +chr20 30434461 30434597 - +chr18 39608229 39608549 - +chr5 149690879 
149691180 A +chr4 35378615 35378967 A +chr20 37709835 37710155 - +chr12 16512856 16513176 - +chr10 86024478 86024563 - +chr3 179276597 179276917 - +chr5 128010595 128010915 - +chr17 48164854 48165174 - +chr9 70701760 70702109 A +chr19 45303837 45303994 - +chr4 36550712 36550995 A +chr9 131901407 131901727 - +chr17 42193108 42193198 - +chr12 6574760 6574897 - +chr8 121963451 121963771 - +chr9 77075186 77075510 A +chr3 46671529 46671715 - +chr7 122651549 122651844 A +chr9 133741793 133742113 - +chr14 75774507 75774827 - +chr16 36516476 36516828 A +chr9 134631013 134631333 - +chr9 71669135 71669455 - +chr2 237981009 237981329 - +chr1 30170858 30171178 - +chr1 152025059 152025379 - +chr1 33761095 33761415 - +chr2 38830569 38830889 - +chr5 148608624 148608944 - +chr7 134966256 134966576 - +chr18 45275651 45275766 - +chr11 125218779 125218889 - +chr4 114199503 114199823 - +chr6 22768086 22768387 A +chr20 45901593 45901913 - +chr1 239113000 239113306 A +chr16 85693365 85693685 - +chr2 30934552 30934712 - +chr10 99436448 99436768 - +chr3 187397436 187397756 - +chr14 98980252 98980554 A +chr1 15650045 15650365 - +chr9 127670785 127671079 A +chr2 1595693 1596013 - +chr15 72961550 72961844 A +chr22 43176092 43176412 - +chr1 30035028 30035371 A +chr11 116661975 116662295 - +chr4 100471291 100471650 A +chr5 104040677 104040936 A +chr12 40563820 40564177 A +chr5 107179168 107179514 A +chr17 74500566 74500908 A +chr10 121249390 121249522 - +chr3 47563603 47563923 - +chr6 44623239 44623559 - +chr9 116870292 116870474 - +chr11 20132445 20132765 - +chr6 41888986 41889306 - +chr2 200353554 200353874 - +chr13 21286753 21287073 - +chr1 94979690 94980010 - +chr3 52035515 52035835 - +chr9 132048592 132048861 A +chrX 109656392 109656712 - +chr6 145955180 145955500 - +chr13 66163548 66163894 A +chr2 71221976 71222296 - +chr3 58809969 58810289 - +chr4 100326702 100327022 - +chrX 102719713 102720033 - +chr6 160011083 160011383 A +chr14 80331024 80331371 A +chr11 59074478 59074783 A +chr14 
76853608 76853803 - +chr16 40748955 40749291 A +chr9 100149325 100149509 - +chr6 2209506 2209826 - +chr19 48794521 48794841 - +chr13 78784910 78785203 A +chr15 99753443 99753763 - +chr21 29889313 29889633 - +chr17 25981746 25982066 - +chr9 123518128 123518448 - +chr2 84530076 84530396 - +chr3 140730380 140730700 - +chr3 133537684 133538004 - +chr6 88032145 88032339 - +chr22 22292531 22292641 - +chr9 27529788 27529925 - +chr16 22199933 22200253 - +chr11 109816958 109817278 - +chr7 13137998 13138346 A +chr7 39493375 39493695 - +chr8 23331791 23332111 - +chr12 121668104 121668424 - +chr2 236688074 236688394 - +chr11 73000616 73000765 - +chr2 216501571 216501888 A +chr3 178051653 178051949 A +chr20 4573239 4573559 - +chr3 115692428 115692665 A +chr14 104808526 104808841 A +chr11 130894961 130895281 - +chr3 56528872 56528969 - +chr17 44079893 44080213 - +chr17 28348790 28349110 - +chr7 50460442 50460748 A diff --git a/tests/bedshift/test_bedshift.py b/tests/bedshift/test_bedshift.py new file mode 100755 index 00000000..ec036a2f --- /dev/null +++ b/tests/bedshift/test_bedshift.py @@ -0,0 +1,186 @@ +import os + +import pytest + +from geniml.bedshift import BedshiftYAMLHandler, bedshift + +SCRIPT_PATH = os.path.dirname(os.path.abspath(__file__)) + + +class TestBedshift: + def test_read_bed(self): + reader = bedshift.Bedshift(os.path.join(SCRIPT_PATH, "header_test.bed")) + assert list(reader.bed.columns) == [0, 1, 2, 3] + assert list(reader.bed.index) == [0, 1, 2] + + def test_read_chrom_sizes(self, bs): + bs._read_chromsizes(os.path.join(SCRIPT_PATH, "hg19.chrom.sizes")) + assert len(bs.chrom_lens) == 93 + + def test_add(self, bs): + added = bs.add(0.1, 100, 20) + assert added == 100 + bs.reset_bed() + + def test_check_rate(self, bs): + with pytest.raises(ValueError): + bs.shift(-0.1, 250, 250) + with pytest.raises(ValueError): + bs.cut(1.5) + + def test_add_high_rate(self, bs): + added = bs.add(1.23, 500, 123) + assert added == 1230 + bs.reset_bed() + + def 
test_add_valid_regions(self, bs): + added = bs.add( + 0.5, + 2000, + 1000, + valid_bed=os.path.join(SCRIPT_PATH, "small_test.bed"), + delimiter="\t", + ) + assert added == 500 + # bs.to_bed(os.path.join(SCRIPT_PATH, "add_valid_test.bed")) + bs.reset_bed() + + def test_add_from_file(self, bs): + added = bs.add_from_file(os.path.join(SCRIPT_PATH, "test.bed"), 0.25) + assert added == 250 + bs.reset_bed() + + def test_drop(self, bs): + dropped = bs.drop(0.315) + assert dropped == 315 + bs.reset_bed() + + def test_shift(self, bs): + shifted = bs.shift(0.129, 200, 30) + assert shifted == pytest.approx(129, 2) + bs.reset_bed() + + def test_cut(self, bs): + cut = bs.cut(0.909) + assert cut == 909 + bs.reset_bed() + + def test_merge(self, bs): + merged = bs.merge(0.2) + assert merged == pytest.approx(400, 3) + bs.reset_bed() + + def test_combo(self, bs): + _ = bs.drop(0.4) + _ = bs.add(0.2, 200, 10) + assert len(bs.bed) == 720 + bs.reset_bed() + + @pytest.mark.skip("Not implemented yet") + def test_drop_from_file(self, bs): + dropped = bs.drop_from_file(os.path.join(SCRIPT_PATH, "test.bed"), 0.25) + self.assertEqual(dropped, 250) + bs.reset_bed() + + @pytest.mark.skip("Not implemented yet") + def test_drop_from_file_high_rate(self, bs): + dropped = bs.drop_from_file(os.path.join(SCRIPT_PATH, "test.bed"), 1) + assert dropped == 100 + bs.reset_bed() + + @pytest.mark.skip("Not implemented yet") + def test_drop_from_file_zero_rate(self, bs): + dropped = bs.drop_from_file(os.path.join(SCRIPT_PATH, "test.bed"), 0) + assert dropped == 0 + bs.reset_bed() + + @pytest.mark.skip("Not implemented yet") + def test_all_perturbations1(self, bs): + perturbed = bs.all_perturbations( + addrate=0.5, + addmean=320.0, + addstdev=20.0, + shiftrate=0.23, + shiftmean=-10.0, + shiftstdev=120.0, + cutrate=0.12, + droprate=0.42, + ) + assert perturbed == pytest.approx(16156, 2) + assert len(bs.bed) == pytest.approx(9744, 2) + bs.reset_bed() + + @pytest.mark.skip("Not implemented yet") + def 
test_all_perturbations2(self, bs): + perturbed = bs.all_perturbations( + addrate=0.3, + addmean=320.0, + addstdev=20.0, + shiftrate=0.3, + shiftmean=-10.0, + shiftstdev=120.0, + cutrate=0.1, + mergerate=0.11, + droprate=0.03, + ) + # merge sometimes merges more or less than expected because it depends + # if the randomly chosen regions are adjacent + assert perturbed == pytest.approx(9400, 3) + + def test_to_bed(self, tmp_path, bs): + bs.to_bed(os.path.join(tmp_path, "py_output.bed")) + assert os.path.exists(os.path.join(tmp_path, "py_output.bed")) + + def test_small_file(self): + bs_small = bedshift.Bedshift( + os.path.join(SCRIPT_PATH, "small_test.bed"), + chrom_sizes=os.path.join(SCRIPT_PATH, "hg38.chrom.sizes"), + ) + shifted = bs_small.shift(0.3, 50, 50) + assert shifted == 0 + shifted = bs_small.shift(1.0, 50, 50) + assert shifted == 1 + added = bs_small.add(0.2, 100, 50) + assert added == 0 + added = bs_small.add(1.0, 100, 50) + assert added == 1 + added = bs_small.add(2.0, 100, 50) + assert added == 4 + + +class TestBedshiftYAMLHandler: + @pytest.mark.skip("Not implemented yet") + def test_handle_yaml(self): + bedshifter = bedshift.Bedshift( + os.path.join(SCRIPT_PATH, "test.bed"), + chrom_sizes=os.path.join(SCRIPT_PATH, "hg38.chrom.sizes"), + ) + yamled = BedshiftYAMLHandler.BedshiftYAMLHandler( + bedshifter=bedshifter, + yaml_fp=os.path.join(SCRIPT_PATH, "bedshift_analysis.yaml"), + ).handle_yaml() + bedshifter.reset_bed() + + added = bedshifter.add(addrate=0.1, addmean=100, addstdev=20) + f_drop_10 = bedshifter.drop_from_file( + fp=os.path.join(SCRIPT_PATH, "test.bed"), droprate=0.1 + ) + f_shift_30 = bedshifter.shift_from_file( + fp=os.path.join(SCRIPT_PATH, "test2.bed"), + shiftrate=0.50, + shiftmean=100, + shiftstdev=200, + ) + f_added_20 = bedshifter.add_from_file( + fp=os.path.join(SCRIPT_PATH, "small_test.bed"), addrate=0.2 + ) + cut = bedshifter.cut(cutrate=0.2) + shifted = bedshifter.shift(shiftrate=0.3, shiftmean=100, shiftstdev=200) + dropped = 
bedshifter.drop(droprate=0.3) + merged = bedshifter.merge(mergerate=0.15) + + total = added + f_drop_10 + f_shift_30 + f_added_20 + cut + dropped + shifted + merged + + # yamled and total both should be around 16750, but can vary by over 100 + assert yamled == pytest.approx(total, 3) + bedshifter.reset_bed() diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 00000000..ee9c9a47 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,33 @@ +import pytest + + +def pytest_addoption(parser): + """ + Adding options in commandline for pytest. The options decide which tests to skip + To actually run some test, use this command in terminal: + pytest