diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml new file mode 100644 index 0000000..f58e4c6 --- /dev/null +++ b/.github/workflows/black.yml @@ -0,0 +1,11 @@ +name: Lint + +on: [push, pull_request] + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-python@v2 + - uses: psf/black@stable diff --git a/.github/workflows/test-pipeline.yml b/.github/workflows/test-pipeline.yml new file mode 100644 index 0000000..f65c756 --- /dev/null +++ b/.github/workflows/test-pipeline.yml @@ -0,0 +1,70 @@ +name: Test bedstat pipeline + +on: + push: + branches: [master, dev] + pull_request: + branches: [master, dev] + +jobs: + pytest: + strategy: + matrix: + python-version: [3.6, 3.7, 3.8] + os: [ubuntu-latest] # can't use macOS when using service containers or container jobs + r: [release] + runs-on: ${{ matrix.os }} + services: + postgres: + image: postgres + env: + POSTGRES_USER: postgres + POSTGRES_PASSWORD: bedbasepassword + POSTGRES_DB: postgres + ports: + - 5432:5432 + options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5 + steps: + - uses: actions/checkout@v2 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependancies + run: if [ -f requirements/requirements-all.txt ]; then pip install -r requirements/requirements-all.txt; fi + + - name: Install dev dependancies + run: if [ -f requirements/requirements-dev.txt ]; then pip install -r requirements/requirements-dev.txt; fi + + - name: Install test dependancies + run: if [ -f requirements/requirements-test.txt ]; then pip install -r requirements/requirements-test.txt; fi + + - name: Install libcurl + run: | + sudo apt-get update + sudo apt-get install libcurl4-openssl-dev + + - uses: r-lib/actions/setup-r@master + with: + r-version: ${{ matrix.r }} + + - name: Install R dependancies + run: Rscript scripts/installRdeps.R + + - name: Run pipeline + run: looper run -p local tests/data/bedstat_config.yaml + + - name: Test plots exist + run: | + if ls $GITHUB_WORKSPACE/outputs/bedstat_output/a6a08126cb6f4b1953ba0ec8675df85a/test_hg38*.png 1> /dev/null 2>&1; then + echo "SUCCESS"; + else + echo "ERROR: files do not exist: $GITHUB_WORKSPACE/outputs/bedstat_output/a6a08126cb6f4b1953ba0ec8675df85a/test_hg38*.png"; + exit 1 + fi + + - name: Test record in PostgreSQL + run: | + echo "from bbconf import BedBaseConf; from bbconf.const import *; bbc = BedBaseConf('$GITHUB_WORKSPACE/tests/data/config_min.yaml'); assert bbc.bed.record_count == 1, 'Number of records in the bedfiles table not equal 1'" | python3 - diff --git a/README.md b/README.md index 3f2bace..d821232 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,4 @@ +![Test bedstat pipeline](https://github.com/databio/bedstat/workflows/Test%20bedstat%20pipeline/badge.svg) # bedstat pipeline for obtaining statistics about bed files @@ -35,60 +36,37 @@ The input PEP can be validated against the [JSON schema in this repository](pep_ eido validate -s https://schema.databio.org/pipelines/bedstat.yaml ``` -### 2. Create a persistent volume to house elasticsearch data +### 2. Run PostgreSQL -``` -docker volume create es-data -``` - -### 3. Run the docker container for elasticsearch +For example, to run an instance in a container and make the data persist, execute: ``` -docker run -p 9200:9200 -p 9300:9300 -v es-data:/usr/share/elasticsearch/data -e "xpack.ml.enabled=false" \ - -e "discovery.type=single-node" elasticsearch:7.5.1 +docker volume create postgres-data +docker run -d --name bedbase-postgres -p 5432:5432 -e POSTGRES_PASSWORD=bedbasepassword -e POSTGRES_USER=postgres -e POSTGRES_DB=postgres -v postgres-data:/var/lib/postgresql/data postgres ``` +Provided environment variables need to match the settings in bedbase configuration file -### 4. Run the bedstat pipeline on the PEP +### 3. Run the bedstat pipeline on the PEP -Then simply run the looper command to run the pipeline for each bed file. It will create a set of plots and statistics per bed file and insert the metadata into Elastic: +Then simply run the `looper run` command to run the pipeline for each bed file. It will create a set of plots and statistics per bed file and insert the metadata into PostgreSQL: ``` looper run project/bedstat_config.yaml ``` -The data loaded into elasticsearch should persist between elasticsearch invocations, on the es-data docker volume created above in step 2. - -### 5. (optional) Run Kibana - -Kibana can be used in order to see ElasticSearch data in a "GUI" kind of a way. - -Pull a matching Kibana docker image. Make sure the Elasticsearch and Kibana container tags match: -``` -docker pull docker.elastic.co/kibana/kibana:7.5.1 -``` - -Get the ID of the docker container (started above) running ElasticSearch via -``` -docker ps | grep elasticsearch -``` - -Run Kibana to link to that container: -``` -docker run --link :elasticsearch -p 5601:5601 docker.elastic.co/kibana/kibana:7.5.1 -``` - -Point your local web browser to http://localhost:5601 - ---- +The data loaded into PostgreSQL should persist between PostgreSQL invocations, on the `postgres-data` docker volume created above in step 2. ## Additional dependencies [regionstat.R](tools/regionstat.R) script is used to calculate the bed file statistics, so the pipeline also depends on several R packages: +* `R.utils` * `BiocManager` * `optparse` * `devtools` * `GenomicRanges` +* `GenomicFeatures` +* `ensembldb` * `GenomicDistributions` * `BSgenome..UCSC.` *depending on the genome used* * `LOLA` diff --git a/pep_schema.yaml b/pep_schema.yaml index 02ce4b7..da49b90 100644 --- a/pep_schema.yaml +++ b/pep_schema.yaml @@ -18,7 +18,6 @@ properties: genome: type: string description: "organism genome code" - enum: ["hg18", "hg19", "hg38", "mm9", "mm10"] narrowpeak: type: integer minimum: 0 diff --git a/pipeline/bedstat.py b/pipeline/bedstat.py index de81999..0c8cbe9 100755 --- a/pipeline/bedstat.py +++ b/pipeline/bedstat.py @@ -3,105 +3,245 @@ bedfile statistics generating pipeline """ -__author__ = ["Michal Stolarczyk", "Ognen Duzlevski", "Jose Verdezoto"] +__author__ = [ + "Michal Stolarczyk", + "Ognen Duzlevski", + "Jose Verdezoto", + "Bingjie Xue", +] __email__ = "michal@virginia.edu" -__version__ = "0.0.1" +__version__ = "0.0.4-dev" from argparse import ArgumentParser from hashlib import md5 import json import yaml import os +import sys import warnings +import tempfile +import requests +import gzip -from bbconf.const import * import pypiper import bbconf +import time parser = ArgumentParser( description="A pipeline to read a file in BED format and produce metadata " - "in JSON format.") + "in JSON format." +) parser.add_argument( - '--bedfile', help='a full path to bed file to process', required=True) + "--bedfile", help="a full path to bed file to process", required=True +) parser.add_argument( - '--open-signal-matrix', type=str, required=False, default=None, - help='a full path to the openSignalMatrix required for the tissue ' - 'specificity plots') + "--open-signal-matrix", + type=str, + required=False, + default=None, + help="a full path to the openSignalMatrix required for the tissue " + "specificity plots", +) parser.add_argument( - "--bedbase-config", dest="bedbase_config", type=str, default=None, - help="a path to the bedbase configuratiion file") + "--bigbed", + type=str, + required=False, + default=None, + help="a full path to the bigbed files", +) parser.add_argument( - "-y", "--sample-yaml", dest="sample_yaml", type=str, required=False, + "--bedbase-config", + dest="bedbase_config", + type=str, + default=None, + help="a path to the bedbase configuratiion file", +) +parser.add_argument( + "-y", + "--sample-yaml", + dest="sample_yaml", + type=str, + required=False, help="a yaml config file with sample attributes to pass on more metadata " - "into the database") + "into the database", +) exclusive_group = parser.add_mutually_exclusive_group() exclusive_group.add_argument( - '--no-db-commit', action='store_true', - help='whether the JSON commit to the database should be skipped') + "--no-db-commit", + action="store_true", + help="whether the JSON commit to the database should be skipped", +) exclusive_group.add_argument( - '--just-db-commit', action='store_true', - help='whether just to commit the JSON to the database') -parser = pypiper.add_pypiper_args(parser, - groups=["pypiper", "common", "looper", "ngs"]) + "--just-db-commit", + action="store_true", + help="whether just to commit the JSON to the database", +) +parser = pypiper.add_pypiper_args(parser, groups=["pypiper", "common", "looper", "ngs"]) args = parser.parse_args() -bbc = bbconf.BedBaseConf(filepath=bbconf.get_bedbase_cfg(args.bedbase_config)) -bed_digest = md5(open(args.bedfile, 'rb').read()).hexdigest() +def hash_bedfile(filepath): + """generate digest for bedfile""" + with gzip.open(filepath, "rb") as f: + # concate column values + chrs = ",".join([row.split()[0].decode("utf-8") for row in f]) + starts = ",".join([row.split()[1].decode("utf-8") for row in f]) + ends = ",".join([row.split()[2].decode("utf-8") for row in f]) + # hash column values + chr_digest = md5(chrs.encode("utf-8")).hexdigest() + start_digest = md5(starts.encode("utf-8")).hexdigest() + end_digest = md5(ends.encode("utf-8")).hexdigest() + # hash column digests + bed_digest = md5( + ",".join([chr_digest, start_digest, end_digest]).encode("utf-8") + ).hexdigest() + + return bed_digest + + +def convert_unit(size_in_bytes): + """Convert the size from bytes to other units like KB, MB or GB""" + if size_in_bytes < 1024: + return str(size_in_bytes) + "bytes" + elif size_in_bytes in range(1024, 1024 * 1024): + return str(round(size_in_bytes / 1024, 2)) + "KB" + elif size_in_bytes in range(1024 * 1024, 1024 * 1024 * 1024): + return str(round(size_in_bytes / (1024 * 1024))) + "MB" + elif size_in_bytes >= 1024 * 1024 * 1024: + return str(round(size_in_bytes / (1024 * 1024 * 1024))) + "GB" + + +def get_file_size(file_name): + """Get file in size in given unit like KB, MB or GB""" + size = os.path.getsize(file_name) + return convert_unit(size) + + +bbc = bbconf.BedBaseConf(config_path=args.bedbase_config, database_only=True) +bedstat_output_path = bbc.get_bedstat_output_path() + +bed_digest = md5(open(args.bedfile, "rb").read()).hexdigest() +# bed_digest = hash_bedfile(args.bedfile) bedfile_name = os.path.split(args.bedfile)[1] # need to split twice since there are 2 exts fileid = os.path.splitext(os.path.splitext(bedfile_name)[0])[0] -outfolder = os.path.abspath(os.path.join( - bbc[CFG_PATH_KEY][CFG_BEDSTAT_OUTPUT_KEY], bed_digest)) +outfolder = os.path.abspath(os.path.join(bedstat_output_path, bed_digest)) json_file_path = os.path.abspath(os.path.join(outfolder, fileid + ".json")) +json_plots_file_path = os.path.abspath(os.path.join(outfolder, fileid + "_plots.json")) +bed_relpath = os.path.relpath( + args.bedfile, + os.path.abspath(os.path.join(bedstat_output_path, os.pardir, os.pardir)), +) +bigbed_relpath = os.path.relpath( + os.path.join(args.bigbed, fileid + ".bigBed"), + os.path.abspath(os.path.join(bedstat_output_path, os.pardir, os.pardir)), +) + + +def main(): + if not args.just_db_commit: + pm = pypiper.PipelineManager( + name="bedstat-pipeline", outfolder=outfolder, args=args + ) + + # run Rscript + rscript_path = os.path.join( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))), + "tools", + "regionstat.R", + ) + assert os.path.exists(rscript_path), FileNotFoundError( + f"'{rscript_path}' script not found" + ) + command = ( + f"Rscript {rscript_path} --bedfilePath={args.bedfile} " + f"--fileId={fileid} --openSignalMatrix={args.open_signal_matrix} " + f"--outputFolder={outfolder} --genome={args.genome_assembly} " + f"--digest={bed_digest}" + ) + print(command) + pm.run(cmd=command, target=json_file_path) + pm.stop_pipeline() + + # now get the resulting json file and load it into Elasticsearch + # if the file exists, of course + if not args.no_db_commit: + data = {} + if os.path.exists(json_file_path): + with open(json_file_path, "r", encoding="utf-8") as f: + data = json.loads(f.read()) + if os.path.exists(json_plots_file_path): + with open(json_plots_file_path, "r", encoding="utf-8") as f_plots: + plots = json.loads(f_plots.read()) + if args.sample_yaml: + # get the sample-specific metadata from the sample yaml representation + other = {} + if os.path.exists(args.sample_yaml): + y = yaml.safe_load(open(args.sample_yaml, "r")) + data.update({"other": y}) + # unlist the data, since the output of regionstat.R is a dict of lists of + # length 1 and force keys to lower to correspond with the + # postgres column identifiers + data = {k.lower(): v[0] if isinstance(v, list) else v for k, v in data.items()} + data.update( + { + "bedfile": { + "path": bed_relpath, + "size": get_file_size(args.bedfile), + "title": "Path to the BED file", + } + } + ) + + if os.path.exists( + os.path.join(args.bigbed, fileid + ".bigBed") + ) and not os.path.islink(os.path.join(args.bigbed, fileid + ".bigBed")): + digest = requests.get( + f"http://refgenomes.databio.org/genomes/genome_digest/{args.genome_assembly}" + ).text.strip('""') + + data.update( + { + "genome": { + "alias": args.genome_assembly, + "digest": digest, + } + } + ) + data.update( + { + "bigbedfile": { + "path": bigbed_relpath, + "size": get_file_size( + os.path.join(args.bigbed, fileid + ".bigBed") + ), + "title": "Path to the big BED file", + } + } + ) -if not args.just_db_commit: - pm = pypiper.PipelineManager(name="bedstat-pipeline", outfolder=outfolder, - args=args) - rscript_path = os.path.join(os.path.dirname( - os.path.dirname(os.path.abspath(__file__))), "tools", "regionstat.R") - assert os.path.exists(rscript_path), \ - FileNotFoundError("'{}' script not found".format(rscript_path)) - cmd_vars = dict(rscript=rscript_path, bed=args.bedfile, id=fileid, - matrix=args.open_signal_matrix, out=outfolder, - genome=args.genome_assembly, digest=bed_digest) - command = "Rscript {rscript} --bedfile={bed} --fileId={id} " \ - "--openSignalMatrix={matrix} --outputfolder={out} " \ - "--genome={genome} --digest={digest}".format(**cmd_vars) - pm.run(cmd=command, target=json_file_path) - pm.stop_pipeline() - -# now get the resulting json file and load it into Elasticsearch -# it the file exists, of course -if not args.no_db_commit: - # open connection to elastic - bbc.establish_elasticsearch_connection() - data = {} - with open(json_file_path, 'r', encoding='utf-8') as f: - data = json.loads(f.read()) - if args.sample_yaml: - # get the sample line from the yaml config file - if os.path.exists(args.sample_yaml): - y = yaml.safe_load(open(args.sample_yaml, "r")) - for key in JSON_METADATA_VALUES: - try: - data[key] = [y[key]] - except KeyError: - print("'{}' metadata not available".format(key)) else: - warnings.warn("Specified sample_yaml path does not exist: {}". - format(args.sample_yaml)) - # enrich the data from R with the data from the sample line itself - # the bedfile_path below needs to be overwritten in Elastic in case the - # pipeline run was split into two computing environments. Currently used - # for the development. This concept leverages the potability introduced by - # environment variable in the bedfile path in the PEP. Locally the - # environment variable points to a different path than on the compute - # cluster where the heavy calculations happen. - data[BEDFILE_PATH_KEY] = [args.bedfile] - print("Data: {}".format(data)) - bbc.insert_bedfiles_data(data=data, doc_id=bed_digest) + data.update( + { + "genome": { + "alias": args.genome_assembly, + "digest": "", + } + } + ) + + for plot in plots: + plot_id = plot["name"] + del plot["name"] + data.update({plot_id: plot}) + bbc.bed.report(record_identifier=bed_digest, values=data) +if __name__ == "__main__": + try: + sys.exit(main()) + except KeyboardInterrupt: + print("Pipeline aborted.") + sys.exit(1) diff --git a/pipeline_interface.yaml b/pipeline_interface.yaml index a94dfa0..fdbcbc9 100644 --- a/pipeline_interface.yaml +++ b/pipeline_interface.yaml @@ -1,15 +1,16 @@ -protocol_mapping: - bedstat: bedstat - -pipelines: - bedstat: - name: BEDSTAT - path: pipeline/bedstat.py - looper_args: True - arguments: - "--bedfile": output_file_path - "--genome": genome - "--sample-yaml": yaml_file - optional_arguments: - "--open-signal-matrix": open_signal_matrix - +pipeline_name: BEDSTAT +pipeline_type: sample +var_templates: + path: "{looper.piface_dir}/pipeline/bedstat.py" +input_schema: http://schema.databio.org/pipelines/bedstat.yaml +pre_submit: + python_functions: + - looper.write_sample_yaml +command_template: > + {pipeline.var_templates.path} + --bedfile {sample.output_file_path} + --genome {sample.genome} + --sample-yaml {sample.yaml_file} + {% if sample.bedbase_config is defined %} --bedbase-config {sample.bedbase_config} {% endif %} + {% if sample.open_signal_matrix is defined %} --open-signal-matrix {sample.open_signal_matrix} {% endif %} + {% if sample.bigbed is defined %} --bigbed {sample.bigbed} {% endif %} diff --git a/pipeline_interface_new.yaml b/pipeline_interface_just_db_commit.yaml similarity index 63% rename from pipeline_interface_new.yaml rename to pipeline_interface_just_db_commit.yaml index 9da5f31..919f3a4 100644 --- a/pipeline_interface_new.yaml +++ b/pipeline_interface_just_db_commit.yaml @@ -1,11 +1,17 @@ pipeline_name: BEDSTAT pipeline_type: sample -path: pipeline/bedstat.py +var_templates: + path: "{looper.piface_dir}/pipeline/bedstat.py" input_schema: http://schema.databio.org/pipelines/bedstat.yaml +pre_submit: + python_functions: + - looper.write_sample_yaml command_template: > - {pipeline.path} + {pipeline.var_templates.path} --bedfile {sample.output_file_path} --genome {sample.genome} --sample-yaml {sample.yaml_file} + --just-db-commit {% if sample.bedbase_config is defined %} --bedbase-config {sample.bedbase_config} {% endif %} {% if sample.open_signal_matrix is defined %} --open-signal-matrix {sample.open_signal_matrix} {% endif %} + {% if sample.bigbed is defined %} --bigbed {sample.bigbed} {% endif %} diff --git a/project/bedstat_config.yaml b/project/bedstat_config.yaml deleted file mode 100644 index 2cc802a..0000000 --- a/project/bedstat_config.yaml +++ /dev/null @@ -1,4 +0,0 @@ -metadata: - sample_annotation: ../lolacore.csv - output_dir: output - pipeline_interfaces: ../pipeline_interface.yaml \ No newline at end of file diff --git a/project/bedstat_config_new.yaml b/project/bedstat_config_new.yaml deleted file mode 100644 index d8d366d..0000000 --- a/project/bedstat_config_new.yaml +++ /dev/null @@ -1,7 +0,0 @@ -pep_version: 2.0.0 -sample_table: path -looper: - output_dir: output -sample_modifiers: - append: - pipeline_interfaces: ../pipeline_interface_new.yaml diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index f803877..0000000 --- a/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -piper -loopercli -bbconf>=0.0.1 \ No newline at end of file diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt new file mode 100644 index 0000000..ffc1db4 --- /dev/null +++ b/requirements/requirements-all.txt @@ -0,0 +1,3 @@ +piper>=0.12.1 +looper>=1.2.1 +# bbconf>=0.1.0 \ No newline at end of file diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt new file mode 100644 index 0000000..6c7c573 --- /dev/null +++ b/requirements/requirements-dev.txt @@ -0,0 +1,4 @@ +-e git+https://github.com/databio/yacman@dev#egg=yacman +-e git+https://github.com/pepkit/pipestat@dev#egg=pipestat + + diff --git a/scripts/process_LOLA.py b/scripts/process_LOLA.py index d5fc326..7d780b8 100644 --- a/scripts/process_LOLA.py +++ b/scripts/process_LOLA.py @@ -10,22 +10,27 @@ # for reading TSV files import csv + class LOLAIndexFileError(Exception): pass + class LOLAIndexColumnError(Exception): pass + global idx_file_keywords -idx_file_keywords = ['filename', - 'cellType', - 'cellTypeSubtype', - 'antibody', - 'mappingGenome', - 'description', - 'tissue', - 'species'] +idx_file_keywords = [ + "filename", + "cellType", + "cellTypeSubtype", + "antibody", + "mappingGenome", + "description", + "tissue", + "species", +] # process index file from LOLA core DB # expects a base directory for the collection @@ -38,71 +43,88 @@ def process_index_file(base_dir, genome): idx_file = os.path.join(base_dir, "index.txt") with open(idx_file, "r") as tsvfile, open(idx_file, "r") as test: try: - reader = csv.DictReader(test, dialect='excel-tab') + reader = csv.DictReader(test, dialect="excel-tab") n = reader.__next__() l = list(n) if len(n) == 1: # csv case - if (l[0].find(',') != -1): + if l[0].find(",") != -1: test.seek(0) reader = csv.DictReader(test) else: - raise Exception("Not tsv and not csv format for %s. Aborting." % idx_file) + raise Exception( + "Not tsv and not csv format for %s. Aborting." % idx_file + ) else: # tsv case - reader = csv.DictReader(tsvfile, dialect='excel-tab') + reader = csv.DictReader(tsvfile, dialect="excel-tab") for row in reader: try: - s = '' + s = "" for kw in idx_file_keywords: if kw not in row: - if kw == 'filename': + if kw == "filename": raise LOLAIndexColumnError - s = s + ',unspecified' + s = s + ",unspecified" else: - if kw == 'filename': + if kw == "filename": # see if the file name makes sense - sometimes some filenames have , or . in them # and are split inappropriately - s = os.path.abspath(os.path.join(bed_path_base, row['filename'])) + s = os.path.abspath( + os.path.join(bed_path_base, row["filename"]) + ) f = Path(s) if not (f.exists() and f.is_file()): # skip file raise LOLAIndexFileError else: - if row[kw] == '': - t = 'unspecified' + if row[kw] == "": + t = "unspecified" else: t = row[kw] - s = s + ',' + t + s = s + "," + t except LOLAIndexFileError as lie: - fn_err = os.path.abspath(os.path.join(bed_path_base, row['filename'])) - msg = "Skipping line #{} in index file {}, file {} not found.\n".format(reader.line_num, idx_file, fn_err) + fn_err = os.path.abspath( + os.path.join(bed_path_base, row["filename"]) + ) + msg = "Skipping line #{} in index file {}, file {} not found.\n".format( + reader.line_num, idx_file, fn_err + ) sys.stderr.write(msg) except LOLAIndexColumnError as lce: - msg = "Skipping line {} in index file {}, no filename column.\n".format(row, idx_file) + msg = "Skipping line {} in index file {}, no filename column.\n".format( + row, idx_file + ) sys.stderr.write(msg) # add the protocol and genome at the end of each row - s = s + ',bedstat' + ',' + genome + s = s + ",bedstat" + "," + genome print(s) except Exception as e: print("Exception reading %s file" % idx_file) raise + def process_collection_file(base_dir, cfile): - #print("Processing collection file %s" % cfile) - #print("Base dir: %s" % base_dir) + # print("Processing collection file %s" % cfile) + # print("Base dir: %s" % base_dir) pass -if __name__ == '__main__': - parser = ArgumentParser(description="Script to process LOLA database into PEP format.") - parser.add_argument('--lola_loc', help='full path to base LOLA db (folder whose subfolders are hg38, hg19 etc.') - parser.add_argument('--genome', help='genome to process, e.g. hg38', default='hg38') +if __name__ == "__main__": + parser = ArgumentParser( + description="Script to process LOLA database into PEP format." + ) + + parser.add_argument( + "--lola_loc", + help="full path to base LOLA db (folder whose subfolders are hg38, hg19 etc.", + ) + parser.add_argument("--genome", help="genome to process, e.g. hg38", default="hg38") args = parser.parse_args() # get all the arguments and prepare to call R script - if (args.lola_loc is None): + if args.lola_loc is None: parser.print_help() sys.exit() @@ -114,24 +136,25 @@ def process_collection_file(base_dir, cfile): full_path = os.path.abspath(os.path.join(lola_base, genome)) # what we are interested in for now from index.txt files - header = 'sample_name,' + ','.join(idx_file_keywords[1:]) + ',protocol,genome' + header = "sample_name," + ",".join(idx_file_keywords[1:]) + ",protocol,genome" print(header) # search for subfolders and index.txt - for filename in Path(full_path).glob('**/index.txt'): + for filename in Path(full_path).glob("**/index.txt"): # break down the path from each index.txt file # to get its base directory relative to lola_loc # and see if collection.txt can be obtained as well try: path_parts = os.path.split(filename) - if (len(path_parts) != 2): break + if len(path_parts) != 2: + break # process index file process_index_file(path_parts[0], genome) - + # process collections file - #collection_file = os.path.abspath(os.path.join(path_parts[0], 'collection.txt')) - #cpath = Path(collection_file) - #if (cpath.exists() and cpath.is_file()): - # we also have a collection file + # collection_file = os.path.abspath(os.path.join(path_parts[0], 'collection.txt')) + # cpath = Path(collection_file) + # if (cpath.exists() and cpath.is_file()): + # we also have a collection file # process_collection_file(path_parts[0], cpath) except Exception as e: print("Error accessing file: {0}".format(e)) diff --git a/tests/conftest.py b/tests/conftest.py index 729cec9..eb228e6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -24,4 +24,4 @@ def tsv_data_path(data_path): @pytest.fixture def tsv_base_dir(tsv_data_path): - return os.path.join(tsv_data_path, "hg38", "test_data_tsv") \ No newline at end of file + return os.path.join(tsv_data_path, "hg38", "test_data_tsv") diff --git a/tests/data/bedstat_annotation_sheet.csv b/tests/data/bedstat_annotation_sheet.csv new file mode 100644 index 0000000..8ea1c8d --- /dev/null +++ b/tests/data/bedstat_annotation_sheet.csv @@ -0,0 +1,2 @@ +sample_name,file_name,genome,exp_protocol,cell_type,tissue,antibody,treatment,data_source,GSE,GSM,description,format +bedbase_demo1,test_hg38.bed.gz,hg38,ChiPseq,GM12878,NA,IKZF1,None,GEO,GSE105587,,IKZF1 ChIP-seq on human GM12878,bed \ No newline at end of file diff --git a/tests/data/bedstat_config.yaml b/tests/data/bedstat_config.yaml new file mode 100644 index 0000000..78a2a84 --- /dev/null +++ b/tests/data/bedstat_config.yaml @@ -0,0 +1,17 @@ +pep_version: 2.0.0 +sample_table: bedstat_annotation_sheet.csv + +looper: + output_dir: $GITHUB_WORKSPACE/outputs/bedstat_output + +sample_modifiers: + append: + bedbase_config: $GITHUB_WORKSPACE/tests/data/config_min.yaml + pipeline_interfaces: $GITHUB_WORKSPACE/pipeline_interface.yaml + output_file_path: OUTPUT + yaml_file: SAMPLE_YAML + derive: + attributes: [output_file_path, yaml_file] + sources: + OUTPUT: "$GITHUB_WORKSPACE/tests/data/{file_name}" + SAMPLE_YAML: "$GITHUB_WORKSPACE/outputs/bedstat_output/submission/{sample_name}.yaml" \ No newline at end of file diff --git a/tests/data/config_min.yaml b/tests/data/config_min.yaml new file mode 100644 index 0000000..e55baf4 --- /dev/null +++ b/tests/data/config_min.yaml @@ -0,0 +1,14 @@ +database: + name: pipestat-test + user: postgres + password: pipestat-password + host: localhost + port: 5432 +path: + pipeline_output_path: $BEDBASE_DATA_PATH/outputs + bedstat_dir: bedstat_output + bedbuncher_dir: bedbuncher_output + remote_url_base: null +server: + host: 0.0.0.0 + port: 8000 \ No newline at end of file diff --git a/tests/data/test_hg38.bed.gz b/tests/data/test_hg38.bed.gz new file mode 100644 index 0000000..344da49 Binary files /dev/null and b/tests/data/test_hg38.bed.gz differ diff --git a/tools/regionstat.R b/tools/regionstat.R index 7197637..d8c1320 100644 --- a/tools/regionstat.R +++ b/tools/regionstat.R @@ -2,156 +2,235 @@ library(GenomicDistributions) library(GenomicDistributionsData) library(optparse) library(tools) -data(TSS_hg38) +library(R.utils) + +trim <- IRanges::trim option_list = list( - make_option(c("--bedfile"), type="character", default=NULL, - help="path to a BED file to process", metavar="character"), - make_option(c("--fileId"), type="character", default=NULL, + make_option(c("--bedfilePath"), type="character", default=NULL, + help="full path to a BED file to process", metavar="character"), + make_option(c("--fileId"), type="character", default=NULL, help="BED file ID to use for output files prefix", metavar="character"), - make_option(c("--openSignalMatrix"), type="character", - help="path to the open signal matrix required for the tissue specificity plot", metavar="character"), - make_option(c("--digest"), type="character", default=NULL, + make_option(c("--openSignalMatrix"), type="character", + help="path to the open signal matrix required for the tissue specificity plot", metavar="character"), + make_option(c("--digest"), type="character", default=NULL, help="digest of the BED file", metavar="character"), - make_option(c("--outputfolder"), type="character", default="output", + make_option(c("--outputFolder"), type="character", default="output", help="base output folder for results", metavar="character"), - make_option(c("--genome"), type="character", default="hg38", + make_option(c("--genome"), type="character", default="hg38", help="genome reference to calculate against", metavar="character")) - + opt_parser = OptionParser(option_list=option_list); opt = parse_args(opt_parser); -if (is.null(opt$bedfile)) { - print_help(opt_parser) - stop("Bed file input missing.") +if (is.null(opt$bedfilePath)) { + print_help(opt_parser) + stop("Bed file input missing.") } if (is.null(opt$fileId)) { - print_help(opt_parser) - stop("fileId input missing.") + print_help(opt_parser) + stop("fileId input missing.") } if (is.null(opt$digest)) { - print_help(opt_parser) - stop("digest input missing.") + print_help(opt_parser) + stop("digest input missing.") } -plotBoth <- function(plotPth, g){ - print(paste0("Plotting: ", plotPth)) - ggplot2::ggsave(paste0(plotPth, ".png"), g, device="png", width=8, height=8, units="in") - ggplot2::ggsave(paste0(plotPth, ".pdf"), g, device="pdf", width=8, height=8, units="in") +plotBoth <- function(plotId, g){ + pth = paste0(opt$outputFolder, "/", fileId, "_", plotId) + print(paste0("Plotting: ", pth)) + ggplot2::ggsave(paste0(pth, ".png"), g, device="png", width=8, height=8, units="in") + ggplot2::ggsave(paste0(pth, ".pdf"), g, device="pdf", width=8, height=8, units="in") } -doItAall <- function(query, fname, fileId, genome, cellMatrix) { - plots = data.frame(stringsAsFactors=F) - bsGenomeAvail = ifelse((requireNamespace(BSg, quietly=TRUE) | requireNamespace(BSgm, quietly=TRUE)), TRUE, FALSE) - ## continue on with calculations - TSSdist = calcFeatureDistRefTSS(query, genome) - plotId = "tssdist" - plotBoth(paste0(outfolder, "/", fileId, "_", plotId), - plotFeatureDist(TSSdist, featureName="TSS")) - newPlot = data.frame("name"=plotId, "caption"="Region-TSS distance distribution") - plots = rbind(plots, newPlot) - - - # Chromosomes region distribution plot - x = calcChromBinsRef(query, genome) - plotId = "chrombins" - plotBoth(paste0(outfolder, "/", fileId, "_", plotId), - plotChromBins(x)) - newPlot = data.frame("name"=plotId, "caption"="Regions distribution over chromosomes") - plots = rbind(plots, newPlot) - - # OPTIONAL: Plot GC content only if proper BSgenome package is installed. - if (bsGenomeAvail) { - gcvec = calcGCContentRef(query, genome) - plotId = "gccontent" - plotBoth(paste0(outfolder, "/", fileId, "_", plotId), - plotGCContent(gcvec)) - newPlot = data.frame("name"=plotId, "caption"="GC content") - plots = rbind(plots, newPlot) - } - # Partition Plots, default to percentages - gp = calcPartitionsRef(query, genome) - plotId = "partitions" - plotBoth(paste0(outfolder, "/", fileId, "_", plotId), - plotPartitions(gp)) - newPlot = data.frame("name"=plotId, "caption"="Regions distribution over genomic partitions") - plots = rbind(plots, newPlot) - - ep = calcExpectedPartitionsRef(query, genome) - plotId = "expected_partitions" - plotBoth(paste0(outfolder, "/", fileId, "_", plotId), - plotExpectedPartitions(ep)) - newPlot = data.frame("name"=plotId, "caption"="Expected distribution over genomic partitions") - plots = rbind(plots, newPlot) - - cp = calcCumulativePartitionsRef(query, genome) - plotId = "cumulative_partitions" - plotBoth(paste0(outfolder, "/", fileId, "_", plotId), - plotCumulativePartitions(cp)) - newPlot = data.frame("name"=plotId, "caption"="Cumulative distribution over genomic partitions") - plots = rbind(plots, newPlot) - - - # flatten the result returned by the function above - partiotionNames = as.vector(gp[,"partition"]) - partitionsList = list() - for(i in seq_along(partiotionNames)){ - partitionsList[[paste0(partiotionNames[i], "_frequency")]] = - as.vector(gp[,"Freq"])[i] - partitionsList[[paste0(partiotionNames[i], "_percentage")]] = - as.vector(gp[,"Freq"])[i]/length(query) - } - - # QThist plot - widths = calcWidth(query) - plotId = "widths_histogram" - plotBoth(paste0(outfolder, "/", fileId, "_", plotId), - plotQTHist(widths)) - newPlot = data.frame("name"=plotId, "caption"="Quantile-Trimmed Histogram of Widths") - plots = rbind(plots, newPlot) - - # Neighbor regions distance plots - dist = calcNeighborDist(query) - plotId = "neighbor_distances" - plotBoth(paste0(outfolder, "/", fileId, "_", plotId), - plotNeighborDist(dist)) - newPlot = data.frame("name"=plotId, "caption"="Distance between neighbor regions") - plots = rbind(plots, newPlot) +getPlotReportDF <- function(plotId, title){ + pth = paste0(opt$outputFolder, "/", fileId, "_", plotId) + rel_pth = getRelativePath(pth, paste0(opt$outputFolder, "/../../../")) + print(paste0("Writing plot json: ", rel_pth)) + newPlot = data.frame( + "name"=plotId, + "title"=title, + "thumbnail_path"=paste0(rel_pth, ".png"), + "path"=paste0(rel_pth, ".pdf") + ) + return(newPlot) +} - # OPTIONAL: Add tissue specificity plot if open signal matrix is provided - if (cellMatrix == "None") { - message("open signal matrix not provided. Skipping tissue specificity plot ... ") - } else { - matrix = data.table::fread(cellMatrix) - op = calcOpenSignal(query, matrix) - plotId = "open_chromatin" - plotBoth(paste0(outfolder, "/", fileId, "_", plotId), - plotOpenSignal(op)) - newPlot = data.frame("name"=plotId, "caption"="Cell specific enrichment for open chromatin") - plots = rbind(plots, newPlot) - } - # Note: names of the list elements MUST match what's defined in: https://github.com/databio/bbconf/blob/master/bbconf/const.py - bedmeta = list( - id=fileId, - gc_content=ifelse(bsGenomeAvail, mean(gcvec), NA), - regions_no=length(query), - mean_absolute_TSS_dist=mean(abs(TSSdist), na.rm=TRUE), - mean_region_width=mean(widths), - md5sum=opt$digest, - plots=plots, - bedfile_path=fname - ) - write(jsonlite::toJSON(c(bedmeta, partitionsList), pretty=TRUE), paste0(outfolder, "/", fileId, ".json")) +doItAall <- function(query, fileId, genome, cellMatrix) { + plots = data.frame(stringsAsFactors=F) + bsGenomeAvail = ifelse((requireNamespace(BSg, quietly=TRUE) | requireNamespace(BSgm, quietly=TRUE)), TRUE, FALSE) + # TSS distance plot + tryCatch( + expr = { + TSSdist = calcFeatureDistRefTSS(query, genome) + plotBoth("tssdist", plotFeatureDist(TSSdist, featureName="TSS")) + plots = rbind(plots, getPlotReportDF("tssdist", "Region-TSS distance distribution")) + message("Successfully calculated and plot TSS distance.") + }, + error = function(e){ + message('Caught an error!') + print(e) + } + ) + + + # Chromosomes region distribution plot + tryCatch( + expr = { + plotBoth("chrombins", plotChromBins(calcChromBinsRef(query, genome))) + plots = rbind(plots, getPlotReportDF("chrombins", "Regions distribution over chromosomes")) + message("Successfully calculated and plot chromosomes region distribution.") + }, + error = function(e){ + message('Caught an error!') + print(e) + } + ) + + + # OPTIONAL: Plot GC content only if proper BSgenome package is installed. + if (bsGenomeAvail) { + tryCatch( + expr = { + gcvec = calcGCContentRef(query, genome) + plotBoth("gccontent", plotGCContent(gcvec)) + plots = rbind(plots, getPlotReportDF("gccontent", "GC content")) + message("Successfully calculated and plot GC content.") + }, + error = function(e){ + message('Caught an error!') + print(e, gcvec) + } + ) + } + + # Partition plots, default to percentages + tryCatch( + expr = { + gp = calcPartitionsRef(query, genome) + plotBoth("paritions", plotPartitions(gp)) + plots = rbind(plots, getPlotReportDF("paritions", "Regions distribution over genomic partitions")) + # flatten the result returned by the function above + partiotionNames = as.vector(gp[,"partition"]) + partitionsList = list() + for(i in seq_along(partiotionNames)){ + partitionsList[[paste0(partiotionNames[i], "_frequency")]] = + as.vector(gp[,"Freq"])[i] + partitionsList[[paste0(partiotionNames[i], "_percentage")]] = + as.vector(gp[,"Freq"])[i]/length(query) + } + message("Successfully calculated and plot regions distribution over genomic partitions.") + }, + error = function(e){ + message('Caught an error!') + print(e) + } + ) + + # Expected partition plots + tryCatch( + expr = { + plotBoth("expected_partitions", plotExpectedPartitions(calcExpectedPartitionsRef(query, genome))) + plots = rbind(plots, getPlotReportDF("expected_partitions", "Expected distribution over genomic partitions")) + message("Successfully calculated and plot expected distribution over genomic partitions.") + }, + error = function(e){ + message('Caught an error!') + print(e) + } + ) + + # Cumulative partition plots + tryCatch( + expr = { + plotBoth("cumulative_partitions", plotCumulativePartitions(calcCumulativePartitionsRef(query, genome))) + plots = rbind(plots, getPlotReportDF("cumulative_partitions", "Cumulative distribution over genomic partitions")) + message("Successfully calculated and plot cumulative distribution over genomic partitions.") + }, + error = function(e){ + message('Caught an error!') + print(e) + } + ) + + # QThist plot + tryCatch( + expr = { + widths = calcWidth(query) + plotBoth("widths_histogram", plotQTHist(widths)) + plots = rbind(plots, getPlotReportDF("widths_histogram", "Quantile-trimmed histogram of widths")) + message("Successfully calculated and plot quantile-trimmed histogram of widths.") + }, + error = function(e){ + message('Caught an error!') + print(e, widths) + } + ) + + # Neighbor regions distance plots + tryCatch( + expr = { + plotBoth("neighbor_distances", plotNeighborDist(calcNeighborDist(query))) + plots = rbind(plots, getPlotReportDF("neighbor_distances", "Distance between neighbor regions")) + message("Successfully calculated and plot distance between neighbor regions.") + }, + error = function(e){ + message('Caught an error!') + print(e) + } + ) + + # Tissue specificity plot if open signal matrix is provided + if (cellMatrix == "None") { + message("open signal matrix not provided. Skipping tissue specificity plot ... ") + } else { + tryCatch( + expr = { + plotBoth("open_chromatin", plotSummarySignal(calcSummarySignal(query, data.table::fread(cellMatrix)))) + plots = rbind(plots, getPlotReportDF("open_chromatin", "Cell specific enrichment for open chromatin")) + message("Successfully calculated and plot cell specific enrichment for open chromatin.") + }, + error = function(e){ + message('Caught an error!') + print(e) + } + ) + } + + # Note: names of the list elements MUST match what's defined in: https://github.com/databio/bbconf/blob/master/bbconf/schemas/bedfiles_schema.yaml + bedmeta = list( + name=fileId, + regions_no=length(query), + mean_region_width=ifelse(exists('widths'), signif(mean(widths), digits = 4), NA), + md5sum=opt$digest + ) + if (exists('gcvec') && !isEmpty(gcvec)){ + gc_content <- list(gc_content = signif(mean(gcvec), digits = 4)) + bedmeta = append(bedmeta, gc_content) + } + if (exists('TSSdist') && !all(is.na(TSSdist))){ + tss <- list(median_TSS_dist = signif(median(abs(TSSdist), na.rm=TRUE), digits = 4)) + bedmeta = append(bedmeta, tss) + } + if (exists('partitionsList')){ + write(jsonlite::toJSON(c(bedmeta, partitionsList), pretty=TRUE), paste0(outfolder, "/", fileId, ".json")) + } else { + write(jsonlite::toJSON(c(bedmeta), pretty=TRUE), paste0(outfolder, "/", fileId, ".json")) + } + + if (exists('plots')){ + write(jsonlite::toJSON(plots, pretty=TRUE), paste0(outfolder, "/", fileId, "_plots.json")) + } } # define values and output folder for doitall() fileId = opt$fileId -fn = opt$bedfile -outfolder = opt$outputfolder +bedPath = opt$bedfilePath +outfolder = opt$outputFolder genome = opt$genome cellMatrix = opt$openSignalMatrix orgName = "Mmusculus" @@ -163,7 +242,5 @@ BSg = paste0("BSgenome.", orgName , ".UCSC.", genome) BSgm = paste0(BSg, ".masked") # read bed file and run doitall() -query = LOLA::readBed(fn) -doItAall(query, fn, fileId, genome, cellMatrix) - - +query = LOLA::readBed(bedPath) +doItAall(query, fileId, genome, cellMatrix)