diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml new file mode 100644 index 0000000..4e1ef42 --- /dev/null +++ b/.github/workflows/python-publish.yml @@ -0,0 +1,31 @@ +# This workflows will upload a Python Package using Twine when a release is created +# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries + +name: Upload Python Package + +on: + release: + types: [created] + +jobs: + deploy: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.x' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install setuptools wheel twine + - name: Build and publish + env: + TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} + TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + run: | + python setup.py sdist bdist_wheel + twine upload dist/* diff --git a/.github/workflows/test-pipeline.yml b/.github/workflows/test-pipeline.yml index f65c756..cbab6eb 100644 --- a/.github/workflows/test-pipeline.yml +++ b/.github/workflows/test-pipeline.yml @@ -2,7 +2,7 @@ name: Test bedstat pipeline on: push: - branches: [master, dev] + branches: [master, dev, dev_alex] pull_request: branches: [master, dev] @@ -10,7 +10,7 @@ jobs: pytest: strategy: matrix: - python-version: [3.6, 3.7, 3.8] + python-version: [3.6, 3.8] os: [ubuntu-latest] # can't use macOS when using service containers or container jobs r: [release] runs-on: ${{ matrix.os }} @@ -46,25 +46,27 @@ jobs: sudo apt-get update sudo apt-get install libcurl4-openssl-dev - - uses: r-lib/actions/setup-r@master - with: - r-version: ${{ matrix.r }} + - name: Install package + run: python -m pip install . - - name: Install R dependancies - run: Rscript scripts/installRdeps.R + - name: Run pytest tests + run: pytest tests -x -vv - - name: Run pipeline - run: looper run -p local tests/data/bedstat_config.yaml + # - uses: r-lib/actions/setup-r@master +# with: +# r-version: ${{ matrix.r }} +# +# - name: Install R dependancies +# run: Rscript scripts/installRdeps.R - - name: Test plots exist - run: | - if ls $GITHUB_WORKSPACE/outputs/bedstat_output/a6a08126cb6f4b1953ba0ec8675df85a/test_hg38*.png 1> /dev/null 2>&1; then - echo "SUCCESS"; - else - echo "ERROR: files do not exist: $GITHUB_WORKSPACE/outputs/bedstat_output/a6a08126cb6f4b1953ba0ec8675df85a/test_hg38*.png"; - exit 1 - fi - - - name: Test record in PostgreSQL - run: | - echo "from bbconf import BedBaseConf; from bbconf.const import *; bbc = BedBaseConf('$GITHUB_WORKSPACE/tests/data/config_min.yaml'); assert bbc.bed.record_count == 1, 'Number of records in the bedfiles table not equal 1'" | python3 - +# - name: Run pipeline +# run: looper run -p local tests/data/bedstat_config.yaml +# +# - name: Test plots exist +# run: | +# if ls $GITHUB_WORKSPACE/outputs/bedstat_output/a6a08126cb6f4b1953ba0ec8675df85a/test_hg38*.png 1> /dev/null 2>&1; then +# echo "SUCCESS"; +# else +# echo "ERROR: files do not exist: $GITHUB_WORKSPACE/outputs/bedstat_output/a6a08126cb6f4b1953ba0ec8675df85a/test_hg38*.png"; +# exit 1 +# fi \ No newline at end of file diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..1b78bad --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,9 @@ +Copyright 2017 Nathan Sheffield + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..5ba56a6 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,6 @@ +include README.md +include requirements/* +include schemas/* +include bedstat/scripts/* +include bedstat/tools/regionstat.R +include bedstat/pep_schema.yaml diff --git a/README.md b/README.md index d821232..32e0807 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ The above command will build the csv file looper needs to run the pipeline on al ### 1. Validate your PEP with [`eido`](https://github.com/pepkit/eido) -The input PEP can be validated against the [JSON schema in this repository](pep_schema.yaml). This ensures the PEP consists of all required attributes to run `bedstat` pipeline. +The input PEP can be validated against the [JSON schema in this repository](schemas/pep_schema.yaml). This ensures the PEP consists of all required attributes to run `bedstat` pipeline. ``` eido validate -s https://schema.databio.org/pipelines/bedstat.yaml @@ -58,7 +58,7 @@ The data loaded into PostgreSQL should persist between PostgreSQL invocations, o ## Additional dependencies -[regionstat.R](tools/regionstat.R) script is used to calculate the bed file statistics, so the pipeline also depends on several R packages: +[regionstat.R](bedstat/tools/regionstat.R) script is used to calculate the bed file statistics, so the pipeline also depends on several R packages: * `R.utils` * `BiocManager` @@ -71,7 +71,7 @@ The data loaded into PostgreSQL should persist between PostgreSQL invocations, o * `BSgenome..UCSC.` *depending on the genome used* * `LOLA` -you can use [installRdeps.R](scripts/installRdeps.R) helper script to easily install the required packages: +you can use [installRdeps.R](bedstat/scripts/installRdeps.R) helper script to easily install the required packages: ``` Rscript scripts/installRdeps.R diff --git a/bedstat/__init__.py b/bedstat/__init__.py new file mode 100644 index 0000000..c776603 --- /dev/null +++ b/bedstat/__init__.py @@ -0,0 +1,3 @@ +""" Package-level data """ +from .bedstat import * +from ._version import __version__ diff --git a/bedstat/_version.py b/bedstat/_version.py new file mode 100644 index 0000000..3dc1f76 --- /dev/null +++ b/bedstat/_version.py @@ -0,0 +1 @@ +__version__ = "0.1.0" diff --git a/bedstat/bedstat.py b/bedstat/bedstat.py new file mode 100755 index 0000000..f382b44 --- /dev/null +++ b/bedstat/bedstat.py @@ -0,0 +1,318 @@ +#!/usr/bin/env python3 +""" +bedfile statistics generating pipeline +""" + +from argparse import ArgumentParser +from hashlib import md5 +import json +import yaml +import os +import sys +import warnings +import tempfile +import requests +import gzip + +import pypiper +import bbconf +import time + + +__author__ = [ + "Michal Stolarczyk", + "Ognen Duzlevski", + "Jose Verdezoto", + "Bingjie Xue", + "Oleksandr Khoroshevskyi", +] +__email__ = "khorosh@virginia.edu" + +SCHEMA_PATH = os.path.join( + os.path.dirname(os.path.realpath(__file__)), "pep_schema.yaml" +) + + +def hash_bedfile(filepath): + """generate digest for bedfile""" + with gzip.open(filepath, "rb") as f: + # concate column values + chrs = ",".join([row.split()[0].decode("utf-8") for row in f]) + starts = ",".join([row.split()[1].decode("utf-8") for row in f]) + ends = ",".join([row.split()[2].decode("utf-8") for row in f]) + # hash column values + chr_digest = md5(chrs.encode("utf-8")).hexdigest() + start_digest = md5(starts.encode("utf-8")).hexdigest() + end_digest = md5(ends.encode("utf-8")).hexdigest() + # hash column digests + bed_digest = md5( + ",".join([chr_digest, start_digest, end_digest]).encode("utf-8") + ).hexdigest() + + return bed_digest + + +def convert_unit(size_in_bytes): + """Convert the size from bytes to other units like KB, MB or GB""" + if size_in_bytes < 1024: + return str(size_in_bytes) + "bytes" + elif size_in_bytes in range(1024, 1024 * 1024): + return str(round(size_in_bytes / 1024, 2)) + "KB" + elif size_in_bytes in range(1024 * 1024, 1024 * 1024 * 1024): + return str(round(size_in_bytes / (1024 * 1024))) + "MB" + elif size_in_bytes >= 1024 * 1024 * 1024: + return str(round(size_in_bytes / (1024 * 1024 * 1024))) + "GB" + + +def get_file_size(file_name): + """Get file in size in given unit like KB, MB or GB""" + size = os.path.getsize(file_name) + return convert_unit(size) + + +def run_bedstat( + bedfile: str, + bedbase_config: str, + genome_assembly: str, + ensdb: str = None, + open_signal_matrix: str = None, + bigbed: str = None, + sample_yaml: str = None, + just_db_commit: bool = False, + no_db_commit: bool = False, +): + """ + Main function to run bedstats. Can be used without runing from command line + :param bedfile: a full path to bed file to process + :param bigbed: a path to the bedbase configuration file + :param bedbase_config: a path to the bedbase configuration file + :param open_signal_matrix: a full path to the openSignalMatrix required for the tissue + specificity plots + :param genome_assembly: genome assembly of the sample + :param ensdb: a full path to the ensdb gtf file required for genomes not in GDdata + :param sample_yaml: a yaml config file with sample attributes to pass on more metadata + into the database + :param schema: sample table schema + :param just_db_commit: whether just to commit the JSON to the database + :param no_db_commit: whether the JSON commit to the database should be skipped + """ + bbc = bbconf.BedBaseConf(config_path=bedbase_config, database_only=True) + bedstat_output_path = bbc.get_bedstat_output_path() + + bed_digest = md5(open(bedfile, "rb").read()).hexdigest() + # bed_digest = hash_bedfile(args.bedfile) + bedfile_name = os.path.split(bedfile)[1] + # need to split twice since there are 2 exts + fileid = os.path.splitext(os.path.splitext(bedfile_name)[0])[0] + outfolder = os.path.abspath(os.path.join(bedstat_output_path, bed_digest)) + json_file_path = os.path.abspath(os.path.join(outfolder, fileid + ".json")) + json_plots_file_path = os.path.abspath( + os.path.join(outfolder, fileid + "_plots.json") + ) + bed_relpath = os.path.relpath( + bedfile, + os.path.abspath(os.path.join(bedstat_output_path, os.pardir, os.pardir)), + ) + bigbed_relpath = os.path.relpath( + os.path.join(bigbed, fileid + ".bigBed"), + os.path.abspath(os.path.join(bedstat_output_path, os.pardir, os.pardir)), + ) + if not just_db_commit: + pm = pypiper.PipelineManager( + name="bedstat-pipeline", + outfolder=outfolder, + ) + + # run Rscript + rscript_path = os.path.join( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))), + "bedstat", + "tools", + "regionstat.R", + ) + assert os.path.exists(rscript_path), FileNotFoundError( + f"'{rscript_path}' script not found" + ) + command = ( + f"Rscript {rscript_path} --bedfilePath={bedfile} " + f"--fileId={fileid} --openSignalMatrix={open_signal_matrix} " + f"--outputFolder={outfolder} --genome={genome_assembly} " + f"--ensdb={ensdb} --digest={bed_digest}" + ) + print(command) + pm.run(cmd=command, target=json_file_path) + pm.stop_pipeline() + + # now get the resulting json file and load it into Elasticsearch + # if the file exists, of course + if not no_db_commit: + data = {} + if os.path.exists(json_file_path): + with open(json_file_path, "r", encoding="utf-8") as f: + data = json.loads(f.read()) + if os.path.exists(json_plots_file_path): + with open(json_plots_file_path, "r", encoding="utf-8") as f_plots: + plots = json.loads(f_plots.read()) + else: + plots = [] + if sample_yaml and os.path.exists(sample_yaml): + # get the sample-specific metadata from the sample yaml representation + y = yaml.safe_load(open(sample_yaml, "r")) + # if schema and os.path.exists(schema): + schema = yaml.safe_load(open(SCHEMA_PATH, "r")) + schema = schema["properties"]["samples"]["items"]["properties"] + + for key in list(y): + if key in schema: + if not schema[key]["db_commit"]: + y.pop(key, None) + elif key in ["bedbase_config", "pipeline_interfaces", "yaml_file"]: + y.pop(key, None) + data.update({"other": y}) + # unlist the data, since the output of regionstat.R is a dict of lists of + # length 1 and force keys to lower to correspond with the + # postgres column identifiers + data = {k.lower(): v[0] if isinstance(v, list) else v for k, v in data.items()} + data.update( + { + "bedfile": { + "path": bed_relpath, + "size": get_file_size(bedfile), + "title": "Path to the BED file", + } + } + ) + + if os.path.exists( + os.path.join(bigbed, fileid + ".bigBed") + ) and not os.path.islink(os.path.join(bigbed, fileid + ".bigBed")): + digest = requests.get( + f"http://refgenomes.databio.org/genomes/genome_digest/{genome_assembly}" + ).text.strip('""') + + data.update( + { + "genome": { + "alias": genome_assembly, + "digest": digest, + } + } + ) + data.update( + { + "bigbedfile": { + "path": bigbed_relpath, + "size": get_file_size(os.path.join(bigbed, fileid + ".bigBed")), + "title": "Path to the big BED file", + } + } + ) + + else: + data.update( + { + "genome": { + "alias": genome_assembly, + "digest": "", + } + } + ) + + for plot in plots: + plot_id = plot["name"] + del plot["name"] + data.update({plot_id: plot}) + bbc.bed.report(record_identifier=bed_digest, values=data) + + +def _parse_cmdl(): + parser = ArgumentParser( + description="A pipeline to read a file in BED format and produce metadata " + "in JSON format." + ) + parser.add_argument( + "--bedfile", help="a full path to bed file to process", required=True + ) + parser.add_argument( + "--open-signal-matrix", + type=str, + required=False, + default=None, + help="a full path to the openSignalMatrix required for the tissue " + "specificity plots", + ) + + parser.add_argument( + "--ensdb", + type=str, + required=False, + default=None, + help="a full path to the ensdb gtf file required for genomes not in GDdata ", + ) + + parser.add_argument( + "--bigbed", + type=str, + required=False, + default=None, + help="a full path to the bigbed files", + ) + + parser.add_argument( + "--bedbase-config", + dest="bedbase_config", + type=str, + default=None, + help="a path to the bedbase configuration file", + ) + parser.add_argument( + "-y", + "--sample-yaml", + dest="sample_yaml", + type=str, + required=False, + help="a yaml config file with sample attributes to pass on more metadata " + "into the database", + ) + # parser.add_argument( + # "--schema", + # dest="schema", + # type=str, + # required=False, + # help="schema for the sample table", + # ) + parser.add_argument( + "--genome", + dest="genome_assembly", + type=str, + required=True, + help="genome assembly of the sample", + ) + exclusive_group = parser.add_mutually_exclusive_group() + exclusive_group.add_argument( + "--no-db-commit", + action="store_true", + help="whether the JSON commit to the database should be skipped", + ) + exclusive_group.add_argument( + "--just-db-commit", + action="store_true", + help="whether just to commit the JSON to the database", + ) + args = parser.parse_args(sys.argv[1:]) + + return args + + +def main(): + args = _parse_cmdl() + args_dict = vars(args) + run_bedstat(**args_dict) + + +if __name__ == "__main__": + try: + sys.exit(main()) + except KeyboardInterrupt: + print("Pipeline aborted.") + sys.exit(1) diff --git a/pep_schema.yaml b/bedstat/pep_schema.yaml similarity index 63% rename from pep_schema.yaml rename to bedstat/pep_schema.yaml index da49b90..65bc588 100644 --- a/pep_schema.yaml +++ b/bedstat/pep_schema.yaml @@ -8,46 +8,72 @@ properties: properties: sample_name: type: string + db_commit: TRUE description: "name of the sample, which is the name of the output BED file" input_file_path: type: string + db_commit: FALSE description: "absolute path the file to convert" output_file_path: type: string + db_commit: FALSE description: "absolute path the file to the output BED file (derived attribute)" + bigbed: + type: string + db_commit: FALSE + description: "dir path where the bigbed file stored (derived attribute)" genome: type: string + db_commit: TRUE description: "organism genome code" narrowpeak: - type: integer - minimum: 0 - maximum: 1 + type: boolean + db_commit: TRUE description: "binary number indicating whether the regions are narrow (transcription factor implies narrow, histone mark implies broad peaks)" format: type: string + db_commit: TRUE description: "file format" enum: ["bigWig", "bigBed", "bed", "wig", "bedGraph"] cell_type: type: string + db_commit: TRUE description: "cell type code" antibody: type: string + db_commit: TRUE description: "antibody used if ChIP-seq experiment" description: type: string + db_commit: TRUE description: "freeform description of the sample" exp_protocol: type: string + db_commit: TRUE description: "type of the experiment the file was generated in" data_source: type: string + db_commit: TRUE description: "source of the sample, preferably a GSE* code" treatment: type: string + db_commit: TRUE description: "freeform description of the sample treatment" + ensdb: + type: string + db_commit: FALSE + description: "path of gtf annotation for genomes not in GDdata" + fasta: + type: string + db_commit: FALSE + description: "path of for genomes not in GDdata" + open_signal_matrix: + type: string + db_commit: FALSE + description: "path of for the open signal matrixm file for the given genome" required: - output_file_path - genome - sample_name required: - - samples + - samples \ No newline at end of file diff --git a/scripts/__init__.py b/bedstat/scripts/__init__.py similarity index 100% rename from scripts/__init__.py rename to bedstat/scripts/__init__.py diff --git a/scripts/installRdeps.R b/bedstat/scripts/installRdeps.R similarity index 88% rename from scripts/installRdeps.R rename to bedstat/scripts/installRdeps.R index 3cad82f..e0416f6 100644 --- a/scripts/installRdeps.R +++ b/bedstat/scripts/installRdeps.R @@ -17,9 +17,10 @@ .install_pkg("ensembldb", bioc=TRUE) .install_pkg("LOLA", bioc=TRUE) .install_pkg("BSgenome", bioc=TRUE) +.install_pkg("GenomicDistributions", bioc=TRUE) if(!require(package = "GenomicDistributions", character.only=TRUE)) { devtools::install_github("databio/GenomicDistributions") } if(!require(package = "GenomicDistributionsData", character.only=TRUE)) { - install.packages("http://big.databio.org/GenomicDistributionsData/GenomicDistributionsData_0.0.1.tar.gz", repos=NULL) + install.packages("http://big.databio.org/GenomicDistributionsData/GenomicDistributionsData_0.0.2.tar.gz", repos=NULL) } diff --git a/scripts/process_LOLA.py b/bedstat/scripts/process_LOLA.py similarity index 100% rename from scripts/process_LOLA.py rename to bedstat/scripts/process_LOLA.py diff --git a/tools/regionstat.R b/bedstat/tools/regionstat.R similarity index 54% rename from tools/regionstat.R rename to bedstat/tools/regionstat.R index d8c1320..540b243 100644 --- a/tools/regionstat.R +++ b/bedstat/tools/regionstat.R @@ -1,5 +1,7 @@ library(GenomicDistributions) library(GenomicDistributionsData) +library(GenomeInfoDb) +library(ensembldb) library(optparse) library(tools) library(R.utils) @@ -18,7 +20,11 @@ option_list = list( make_option(c("--outputFolder"), type="character", default="output", help="base output folder for results", metavar="character"), make_option(c("--genome"), type="character", default="hg38", - help="genome reference to calculate against", metavar="character")) + help="genome reference to calculate against", metavar="character"), + make_option(c("--ensdb"), type="character", + help="path to the Ensembl annotation gtf file", metavar="character") +) + opt_parser = OptionParser(option_list=option_list); opt = parse_args(opt_parser); @@ -38,6 +44,29 @@ if (is.null(opt$digest)) { stop("digest input missing.") } +myPartitionList <- function(gtffile){ + features = c("gene", "exon", "three_prime_utr", "five_prime_utr") + geneModels = getGeneModelsFromGTF(gtffile, features, TRUE) + partitionList = genomePartitionList(geneModels$gene, + geneModels$exon, + geneModels$three_prime_utr, + geneModels$five_prime_utr) + + return (partitionList) +} + + +myChromSizes <- function(genome){ + if (requireNamespace(BSgm, quietly=TRUE)){ + library (BSgm, character.only = TRUE) + BSG = eval(as.name(BSgm)) + } else { + library (BSg, character.only = TRUE) + BSG = eval(as.name(BSg)) + } + chromSizesGenome = seqlengths(BSG) + return(chromSizesGenome) +} plotBoth <- function(plotId, g){ pth = paste0(opt$outputFolder, "/", fileId, "_", plotId) @@ -66,8 +95,18 @@ doItAall <- function(query, fileId, genome, cellMatrix) { # TSS distance plot tryCatch( expr = { - TSSdist = calcFeatureDistRefTSS(query, genome) - plotBoth("tssdist", plotFeatureDist(TSSdist, featureName="TSS")) + if (!(genome %in% c("hg19", "hg38", "mm10", "mm9")) && gtffile == "None"){ + message("Ensembl annotation gtf file not provided. Skipping TSS distance plot ... ") + } else{ + if (genome %in% c("hg19", "hg38", "mm10", "mm9")) { + TSSdist = calcFeatureDistRefTSS(query, genome) + plotBoth("tssdist", plotFeatureDist( TSSdist, featureName="TSS")) + } else { + tss = getTssFromGTF(gtffile, TRUE) + TSSdist = calcFeatureDist(query, tss) + plotBoth("tssdist", plotFeatureDist( TSSdist, featureName="TSS")) + } + } plots = rbind(plots, getPlotReportDF("tssdist", "Region-TSS distance distribution")) message("Successfully calculated and plot TSS distance.") }, @@ -81,7 +120,14 @@ doItAall <- function(query, fileId, genome, cellMatrix) { # Chromosomes region distribution plot tryCatch( expr = { - plotBoth("chrombins", plotChromBins(calcChromBinsRef(query, genome))) + if (genome %in% c("mm39", "dm3", "dm6", "ce10", "ce11", "danRer10", "danRer10", "T2T")){ + chromSizes = myChromSizes(genome) + genomeBins = getGenomeBins(chromSizes) + plotBoth("chrombins", plotChromBins(calcChromBins(query, genomeBins))) + } else{ + plotBoth("chrombins", plotChromBins(calcChromBinsRef(query, genome))) + } + plots = rbind(plots, getPlotReportDF("chrombins", "Regions distribution over chromosomes")) message("Successfully calculated and plot chromosomes region distribution.") }, @@ -96,7 +142,15 @@ doItAall <- function(query, fileId, genome, cellMatrix) { if (bsGenomeAvail) { tryCatch( expr = { - gcvec = calcGCContentRef(query, genome) + if (requireNamespace(BSgm, quietly=TRUE)){ + library (BSgm, character.only = TRUE) + bsg = eval(as.name(BSgm)) + gcvec = calcGCContent(query, bsg) + } else { + library (BSg, character.only = TRUE) + bsg = eval(as.name(BSg)) + gcvec = calcGCContent(query, bsg) + } plotBoth("gccontent", plotGCContent(gcvec)) plots = rbind(plots, getPlotReportDF("gccontent", "GC content")) message("Successfully calculated and plot GC content.") @@ -111,19 +165,29 @@ doItAall <- function(query, fileId, genome, cellMatrix) { # Partition plots, default to percentages tryCatch( expr = { - gp = calcPartitionsRef(query, genome) - plotBoth("paritions", plotPartitions(gp)) - plots = rbind(plots, getPlotReportDF("paritions", "Regions distribution over genomic partitions")) - # flatten the result returned by the function above - partiotionNames = as.vector(gp[,"partition"]) - partitionsList = list() - for(i in seq_along(partiotionNames)){ - partitionsList[[paste0(partiotionNames[i], "_frequency")]] = - as.vector(gp[,"Freq"])[i] - partitionsList[[paste0(partiotionNames[i], "_percentage")]] = - as.vector(gp[,"Freq"])[i]/length(query) + if (!(genome %in% c("hg19", "hg38", "mm10")) && gtffile == "None"){ + message("Ensembl annotation gtf file not provided. Skipping partition plot ... ") + } else { + if (genome %in% c("hg19", "hg38", "mm10")) { + gp = calcPartitionsRef(query, genome) + plotBoth("paritions", plotPartitions(gp)) + } else { + partitionList = myPartitionList(gtffile) + gp = calcPartitions(query, partitionList) + plotBoth("paritions", plotPartitions(gp)) + } + plots = rbind(plots, getPlotReportDF("paritions", "Regions distribution over genomic partitions")) + # flatten the result returned by the function above + partiotionNames = as.vector(gp[,"partition"]) + partitionsList = list() + for(i in seq_along(partiotionNames)){ + partitionsList[[paste0(partiotionNames[i], "_frequency")]] = + as.vector(gp[,"Freq"])[i] + partitionsList[[paste0(partiotionNames[i], "_percentage")]] = + as.vector(gp[,"Freq"])[i]/length(query) + } + message("Successfully calculated and plot regions distribution over genomic partitions.") } - message("Successfully calculated and plot regions distribution over genomic partitions.") }, error = function(e){ message('Caught an error!') @@ -134,9 +198,20 @@ doItAall <- function(query, fileId, genome, cellMatrix) { # Expected partition plots tryCatch( expr = { - plotBoth("expected_partitions", plotExpectedPartitions(calcExpectedPartitionsRef(query, genome))) - plots = rbind(plots, getPlotReportDF("expected_partitions", "Expected distribution over genomic partitions")) - message("Successfully calculated and plot expected distribution over genomic partitions.") + if (!(genome %in% c("hg19", "hg38", "mm10")) && gtffile == "None"){ + message("Ensembl annotation gtf file not provided. Skipping expected partition plot ... ") + } else{ + if (genome %in% c("hg19", "hg38", "mm10")) { + plotBoth("expected_partitions", plotExpectedPartitions(calcExpectedPartitionsRef(query, genome))) + } else { + partitionList = myPartitionList(gtffile) + chromSizes = myChromSizes(genome) + genomeSize = sum(chromSizes) + plotBoth("expected_partitions", plotExpectedPartitions(calcExpectedPartitions(query, partitionList, genomeSize))) + } + plots = rbind(plots, getPlotReportDF("expected_partitions", "Expected distribution over genomic partitions")) + message("Successfully calculated and plot expected distribution over genomic partitions.") + } }, error = function(e){ message('Caught an error!') @@ -147,9 +222,18 @@ doItAall <- function(query, fileId, genome, cellMatrix) { # Cumulative partition plots tryCatch( expr = { - plotBoth("cumulative_partitions", plotCumulativePartitions(calcCumulativePartitionsRef(query, genome))) - plots = rbind(plots, getPlotReportDF("cumulative_partitions", "Cumulative distribution over genomic partitions")) - message("Successfully calculated and plot cumulative distribution over genomic partitions.") + if (!(genome %in% c("hg19", "hg38", "mm10")) && gtffile == "None"){ + message("Ensembl annotation gtf file not provided. Skipping cumulative partition plot ... ") + } else{ + if (genome %in% c("hg19", "hg38", "mm10")) { + plotBoth("cumulative_partitions", plotCumulativePartitions(calcCumulativePartitionsRef(query, genome))) + } else{ + partitionList = myPartitionList(gtffile) + plotBoth("cumulative_partitions", plotCumulativePartitions(calcCumulativePartitions(query, partitionList))) + } + plots = rbind(plots, getPlotReportDF("cumulative_partitions", "Cumulative distribution over genomic partitions")) + message("Successfully calculated and plot cumulative distribution over genomic partitions.") + } }, error = function(e){ message('Caught an error!') @@ -233,12 +317,29 @@ bedPath = opt$bedfilePath outfolder = opt$outputFolder genome = opt$genome cellMatrix = opt$openSignalMatrix -orgName = "Mmusculus" +gtffile = opt$ensdb + # build BSgenome package ID to check whether it's installed -if (startsWith(genome, "hg") | startsWith(genome, "grch")) orgName = "Hsapiens" +if (genome == "T2T"){ + BSg = "BSgenome.Hsapiens.NCBI.T2T.CHM13v2.0" +} else { + if (startsWith(genome, "hg") | startsWith(genome, "grch")) { + orgName = "Hsapiens" + } else if (startsWith(genome, "mm") | startsWith(genome, "grcm")){ + orgName = "Mmusculus" + } else if (startsWith(genome, "dm")){ + orgName = "Dmelanogaster" + } else if (startsWith(genome, "ce")){ + orgName = "Celegans" + } else if (startsWith(genome, "danRer")){ + orgName = "Drerio" + } else if (startsWith(genome, "TAIR")){ + orgName = "Athaliana" + } + BSg = paste0("BSgenome.", orgName , ".UCSC.", genome) +} -BSg = paste0("BSgenome.", orgName , ".UCSC.", genome) BSgm = paste0(BSg, ".masked") # read bed file and run doitall() diff --git a/pipeline/bedstat.py b/pipeline/bedstat.py deleted file mode 100755 index 0c8cbe9..0000000 --- a/pipeline/bedstat.py +++ /dev/null @@ -1,247 +0,0 @@ -#!/usr/bin/env python3 -""" -bedfile statistics generating pipeline -""" - -__author__ = [ - "Michal Stolarczyk", - "Ognen Duzlevski", - "Jose Verdezoto", - "Bingjie Xue", -] -__email__ = "michal@virginia.edu" -__version__ = "0.0.4-dev" - -from argparse import ArgumentParser -from hashlib import md5 -import json -import yaml -import os -import sys -import warnings -import tempfile -import requests -import gzip - -import pypiper -import bbconf -import time - -parser = ArgumentParser( - description="A pipeline to read a file in BED format and produce metadata " - "in JSON format." -) - -parser.add_argument( - "--bedfile", help="a full path to bed file to process", required=True -) -parser.add_argument( - "--open-signal-matrix", - type=str, - required=False, - default=None, - help="a full path to the openSignalMatrix required for the tissue " - "specificity plots", -) -parser.add_argument( - "--bigbed", - type=str, - required=False, - default=None, - help="a full path to the bigbed files", -) -parser.add_argument( - "--bedbase-config", - dest="bedbase_config", - type=str, - default=None, - help="a path to the bedbase configuratiion file", -) -parser.add_argument( - "-y", - "--sample-yaml", - dest="sample_yaml", - type=str, - required=False, - help="a yaml config file with sample attributes to pass on more metadata " - "into the database", -) -exclusive_group = parser.add_mutually_exclusive_group() -exclusive_group.add_argument( - "--no-db-commit", - action="store_true", - help="whether the JSON commit to the database should be skipped", -) -exclusive_group.add_argument( - "--just-db-commit", - action="store_true", - help="whether just to commit the JSON to the database", -) -parser = pypiper.add_pypiper_args(parser, groups=["pypiper", "common", "looper", "ngs"]) - -args = parser.parse_args() - - -def hash_bedfile(filepath): - """generate digest for bedfile""" - with gzip.open(filepath, "rb") as f: - # concate column values - chrs = ",".join([row.split()[0].decode("utf-8") for row in f]) - starts = ",".join([row.split()[1].decode("utf-8") for row in f]) - ends = ",".join([row.split()[2].decode("utf-8") for row in f]) - # hash column values - chr_digest = md5(chrs.encode("utf-8")).hexdigest() - start_digest = md5(starts.encode("utf-8")).hexdigest() - end_digest = md5(ends.encode("utf-8")).hexdigest() - # hash column digests - bed_digest = md5( - ",".join([chr_digest, start_digest, end_digest]).encode("utf-8") - ).hexdigest() - - return bed_digest - - -def convert_unit(size_in_bytes): - """Convert the size from bytes to other units like KB, MB or GB""" - if size_in_bytes < 1024: - return str(size_in_bytes) + "bytes" - elif size_in_bytes in range(1024, 1024 * 1024): - return str(round(size_in_bytes / 1024, 2)) + "KB" - elif size_in_bytes in range(1024 * 1024, 1024 * 1024 * 1024): - return str(round(size_in_bytes / (1024 * 1024))) + "MB" - elif size_in_bytes >= 1024 * 1024 * 1024: - return str(round(size_in_bytes / (1024 * 1024 * 1024))) + "GB" - - -def get_file_size(file_name): - """Get file in size in given unit like KB, MB or GB""" - size = os.path.getsize(file_name) - return convert_unit(size) - - -bbc = bbconf.BedBaseConf(config_path=args.bedbase_config, database_only=True) -bedstat_output_path = bbc.get_bedstat_output_path() - -bed_digest = md5(open(args.bedfile, "rb").read()).hexdigest() -# bed_digest = hash_bedfile(args.bedfile) -bedfile_name = os.path.split(args.bedfile)[1] -# need to split twice since there are 2 exts -fileid = os.path.splitext(os.path.splitext(bedfile_name)[0])[0] -outfolder = os.path.abspath(os.path.join(bedstat_output_path, bed_digest)) -json_file_path = os.path.abspath(os.path.join(outfolder, fileid + ".json")) -json_plots_file_path = os.path.abspath(os.path.join(outfolder, fileid + "_plots.json")) -bed_relpath = os.path.relpath( - args.bedfile, - os.path.abspath(os.path.join(bedstat_output_path, os.pardir, os.pardir)), -) -bigbed_relpath = os.path.relpath( - os.path.join(args.bigbed, fileid + ".bigBed"), - os.path.abspath(os.path.join(bedstat_output_path, os.pardir, os.pardir)), -) - - -def main(): - if not args.just_db_commit: - pm = pypiper.PipelineManager( - name="bedstat-pipeline", outfolder=outfolder, args=args - ) - - # run Rscript - rscript_path = os.path.join( - os.path.dirname(os.path.dirname(os.path.abspath(__file__))), - "tools", - "regionstat.R", - ) - assert os.path.exists(rscript_path), FileNotFoundError( - f"'{rscript_path}' script not found" - ) - command = ( - f"Rscript {rscript_path} --bedfilePath={args.bedfile} " - f"--fileId={fileid} --openSignalMatrix={args.open_signal_matrix} " - f"--outputFolder={outfolder} --genome={args.genome_assembly} " - f"--digest={bed_digest}" - ) - print(command) - pm.run(cmd=command, target=json_file_path) - pm.stop_pipeline() - - # now get the resulting json file and load it into Elasticsearch - # if the file exists, of course - if not args.no_db_commit: - data = {} - if os.path.exists(json_file_path): - with open(json_file_path, "r", encoding="utf-8") as f: - data = json.loads(f.read()) - if os.path.exists(json_plots_file_path): - with open(json_plots_file_path, "r", encoding="utf-8") as f_plots: - plots = json.loads(f_plots.read()) - if args.sample_yaml: - # get the sample-specific metadata from the sample yaml representation - other = {} - if os.path.exists(args.sample_yaml): - y = yaml.safe_load(open(args.sample_yaml, "r")) - data.update({"other": y}) - # unlist the data, since the output of regionstat.R is a dict of lists of - # length 1 and force keys to lower to correspond with the - # postgres column identifiers - data = {k.lower(): v[0] if isinstance(v, list) else v for k, v in data.items()} - data.update( - { - "bedfile": { - "path": bed_relpath, - "size": get_file_size(args.bedfile), - "title": "Path to the BED file", - } - } - ) - - if os.path.exists( - os.path.join(args.bigbed, fileid + ".bigBed") - ) and not os.path.islink(os.path.join(args.bigbed, fileid + ".bigBed")): - digest = requests.get( - f"http://refgenomes.databio.org/genomes/genome_digest/{args.genome_assembly}" - ).text.strip('""') - - data.update( - { - "genome": { - "alias": args.genome_assembly, - "digest": digest, - } - } - ) - data.update( - { - "bigbedfile": { - "path": bigbed_relpath, - "size": get_file_size( - os.path.join(args.bigbed, fileid + ".bigBed") - ), - "title": "Path to the big BED file", - } - } - ) - - else: - data.update( - { - "genome": { - "alias": args.genome_assembly, - "digest": "", - } - } - ) - - for plot in plots: - plot_id = plot["name"] - del plot["name"] - data.update({plot_id: plot}) - bbc.bed.report(record_identifier=bed_digest, values=data) - - -if __name__ == "__main__": - try: - sys.exit(main()) - except KeyboardInterrupt: - print("Pipeline aborted.") - sys.exit(1) diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index ffc1db4..af4b488 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -1,3 +1,6 @@ piper>=0.12.1 looper>=1.2.1 -# bbconf>=0.1.0 \ No newline at end of file +bbconf>=0.1.0 +requests>=2.27.0 +yacman>=0.8.4 +pipestat>=0.3.1 diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt index 6c7c573..b28b04f 100644 --- a/requirements/requirements-dev.txt +++ b/requirements/requirements-dev.txt @@ -1,4 +1,3 @@ --e git+https://github.com/databio/yacman@dev#egg=yacman --e git+https://github.com/pepkit/pipestat@dev#egg=pipestat + diff --git a/requirements/requirements-test.txt b/requirements/requirements-test.txt new file mode 100644 index 0000000..f77c910 --- /dev/null +++ b/requirements/requirements-test.txt @@ -0,0 +1 @@ +pytest>=7.0.0 diff --git a/schemas/pep_schema.yaml b/schemas/pep_schema.yaml new file mode 100644 index 0000000..65bc588 --- /dev/null +++ b/schemas/pep_schema.yaml @@ -0,0 +1,79 @@ +description: bedstat PEP schema + +properties: + samples: + type: array + items: + type: object + properties: + sample_name: + type: string + db_commit: TRUE + description: "name of the sample, which is the name of the output BED file" + input_file_path: + type: string + db_commit: FALSE + description: "absolute path the file to convert" + output_file_path: + type: string + db_commit: FALSE + description: "absolute path the file to the output BED file (derived attribute)" + bigbed: + type: string + db_commit: FALSE + description: "dir path where the bigbed file stored (derived attribute)" + genome: + type: string + db_commit: TRUE + description: "organism genome code" + narrowpeak: + type: boolean + db_commit: TRUE + description: "binary number indicating whether the regions are narrow (transcription factor implies narrow, histone mark implies broad peaks)" + format: + type: string + db_commit: TRUE + description: "file format" + enum: ["bigWig", "bigBed", "bed", "wig", "bedGraph"] + cell_type: + type: string + db_commit: TRUE + description: "cell type code" + antibody: + type: string + db_commit: TRUE + description: "antibody used if ChIP-seq experiment" + description: + type: string + db_commit: TRUE + description: "freeform description of the sample" + exp_protocol: + type: string + db_commit: TRUE + description: "type of the experiment the file was generated in" + data_source: + type: string + db_commit: TRUE + description: "source of the sample, preferably a GSE* code" + treatment: + type: string + db_commit: TRUE + description: "freeform description of the sample treatment" + ensdb: + type: string + db_commit: FALSE + description: "path of gtf annotation for genomes not in GDdata" + fasta: + type: string + db_commit: FALSE + description: "path of for genomes not in GDdata" + open_signal_matrix: + type: string + db_commit: FALSE + description: "path of for the open signal matrixm file for the given genome" + required: + - output_file_path + - genome + - sample_name +required: + - samples \ No newline at end of file diff --git a/pipeline_interface.yaml b/schemas/pipeline_interface.yaml similarity index 67% rename from pipeline_interface.yaml rename to schemas/pipeline_interface.yaml index fdbcbc9..0dd5915 100644 --- a/pipeline_interface.yaml +++ b/schemas/pipeline_interface.yaml @@ -2,7 +2,8 @@ pipeline_name: BEDSTAT pipeline_type: sample var_templates: path: "{looper.piface_dir}/pipeline/bedstat.py" -input_schema: http://schema.databio.org/pipelines/bedstat.yaml +#input_schema: http://schema.databio.org/pipelines/bedstat.yaml +input_schema: pep_schema.yaml pre_submit: python_functions: - looper.write_sample_yaml @@ -11,6 +12,9 @@ command_template: > --bedfile {sample.output_file_path} --genome {sample.genome} --sample-yaml {sample.yaml_file} + --schema "{looper.piface_dir}/{pipeline.input_schema}" {% if sample.bedbase_config is defined %} --bedbase-config {sample.bedbase_config} {% endif %} {% if sample.open_signal_matrix is defined %} --open-signal-matrix {sample.open_signal_matrix} {% endif %} + {% if sample.ensdb is defined %} --ensdb {sample.ensdb} {% endif %} + {% if sample.fasta is defined %} --fasta {sample.fasta} {% endif %} {% if sample.bigbed is defined %} --bigbed {sample.bigbed} {% endif %} diff --git a/pipeline_interface_just_db_commit.yaml b/schemas/pipeline_interface_just_db_commit.yaml similarity index 68% rename from pipeline_interface_just_db_commit.yaml rename to schemas/pipeline_interface_just_db_commit.yaml index 919f3a4..510cde5 100644 --- a/pipeline_interface_just_db_commit.yaml +++ b/schemas/pipeline_interface_just_db_commit.yaml @@ -2,7 +2,8 @@ pipeline_name: BEDSTAT pipeline_type: sample var_templates: path: "{looper.piface_dir}/pipeline/bedstat.py" -input_schema: http://schema.databio.org/pipelines/bedstat.yaml +#input_schema: http://schema.databio.org/pipelines/bedstat.yaml +input_schema: pep_schema.yaml pre_submit: python_functions: - looper.write_sample_yaml @@ -11,7 +12,10 @@ command_template: > --bedfile {sample.output_file_path} --genome {sample.genome} --sample-yaml {sample.yaml_file} + --schema "{looper.piface_dir}/{pipeline.input_schema}" --just-db-commit {% if sample.bedbase_config is defined %} --bedbase-config {sample.bedbase_config} {% endif %} {% if sample.open_signal_matrix is defined %} --open-signal-matrix {sample.open_signal_matrix} {% endif %} + {% if sample.ensdb is defined %} --ensdb {sample.ensdb} {% endif %} + {% if sample.fasta is defined %} --fasta {sample.fasta} {% endif %} {% if sample.bigbed is defined %} --bigbed {sample.bigbed} {% endif %} diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..ebdc80f --- /dev/null +++ b/setup.py @@ -0,0 +1,71 @@ +#! /usr/bin/env python + +import os +from setuptools import setup +import sys + +PACKAGE = "bedstat" +REQDIR = "requirements" + +# Additional keyword arguments for setup(). +extra = {} + +# Ordinary dependencies + + +def read_reqs(reqs_name): + deps = [] + with open(os.path.join(REQDIR, "requirements-{}.txt".format(reqs_name)), "r") as f: + for l in f: + if not l.strip(): + continue + deps.append(l) + return deps + + +DEPENDENCIES = read_reqs("all") +extra["install_requires"] = DEPENDENCIES + +scripts = None + +with open("{}/_version.py".format(PACKAGE), "r") as versionfile: + version = versionfile.readline().split()[-1].strip("\"'\n") + +with open("README.md") as f: + long_description = f.read() + +setup( + name=PACKAGE, + packages=[PACKAGE], + version=version, + description="Python Package to convert various genome region files to bed file.", + long_description=long_description, + long_description_content_type="text/markdown", + classifiers=[ + "Development Status :: 3 - Alpha", + "License :: OSI Approved :: BSD License", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Topic :: Scientific/Engineering :: Bio-Informatics", + ], + keywords="pipeline, bioinformatics, workflow", + url="https://github.com/databio/{}/".format(PACKAGE), + author="Oleksandr Khoroshevskyi, Bingjie Xue, Michal Stolarczyk, Ognen Duzlevski," + "Jose Verdezoto, ", + license="BSD2", + entry_points={ + "console_scripts": [ + "bedstat = bedstat.bedstat:main", + ], + }, + package_data={PACKAGE: ["templates/*"]}, + scripts=scripts, + include_package_data=True, + test_suite="tests", + tests_require=read_reqs("dev"), + setup_requires=( + ["pytest-runner"] if {"test", "pytest", "ptr"} & set(sys.argv) else [] + ), + **extra +) diff --git a/tests/config_db_github.yaml b/tests/config_db_github.yaml new file mode 100644 index 0000000..2231d73 --- /dev/null +++ b/tests/config_db_github.yaml @@ -0,0 +1,16 @@ +path: + pipeline_output_path: $BEDBASE_DATA_PATH_HOST/ + bedstat_dir: outputs/bedstat_output + bedbuncher_dir: outputs/bedbuncher_output + remote_url_base: null +database: + name: postgres + user: postgres + password: bedbasepassword + host: localhost + port: 5432 + dialect: postgresql + driver: psycopg2 +server: + host: 0.0.0.0 + port: 8000 \ No newline at end of file diff --git a/tests/config_db_local.yaml b/tests/config_db_local.yaml new file mode 100644 index 0000000..35e3334 --- /dev/null +++ b/tests/config_db_local.yaml @@ -0,0 +1,16 @@ +path: + pipeline_output_path: ./ + bedstat_dir: outputs/bedstat_output + bedbuncher_dir: outputs/bedbuncher_output + remote_url_base: null +database: + host: localhost + port: 5432 + password: bedbasepassword + user: postgres + name: postgres + dialect: postgresql + driver: psycopg2 +server: + host: 0.0.0.0 + port: 80pip \ No newline at end of file diff --git a/tests/data/bedstat_annotation_sheet.csv b/tests/data/bedstat_annotation_sheet.csv deleted file mode 100644 index 8ea1c8d..0000000 --- a/tests/data/bedstat_annotation_sheet.csv +++ /dev/null @@ -1,2 +0,0 @@ -sample_name,file_name,genome,exp_protocol,cell_type,tissue,antibody,treatment,data_source,GSE,GSM,description,format -bedbase_demo1,test_hg38.bed.gz,hg38,ChiPseq,GM12878,NA,IKZF1,None,GEO,GSE105587,,IKZF1 ChIP-seq on human GM12878,bed \ No newline at end of file diff --git a/tests/data/bedstat_config.yaml b/tests/data/bedstat_config.yaml deleted file mode 100644 index 78a2a84..0000000 --- a/tests/data/bedstat_config.yaml +++ /dev/null @@ -1,17 +0,0 @@ -pep_version: 2.0.0 -sample_table: bedstat_annotation_sheet.csv - -looper: - output_dir: $GITHUB_WORKSPACE/outputs/bedstat_output - -sample_modifiers: - append: - bedbase_config: $GITHUB_WORKSPACE/tests/data/config_min.yaml - pipeline_interfaces: $GITHUB_WORKSPACE/pipeline_interface.yaml - output_file_path: OUTPUT - yaml_file: SAMPLE_YAML - derive: - attributes: [output_file_path, yaml_file] - sources: - OUTPUT: "$GITHUB_WORKSPACE/tests/data/{file_name}" - SAMPLE_YAML: "$GITHUB_WORKSPACE/outputs/bedstat_output/submission/{sample_name}.yaml" \ No newline at end of file diff --git a/tests/data/config_min.yaml b/tests/data/config_min.yaml deleted file mode 100644 index e55baf4..0000000 --- a/tests/data/config_min.yaml +++ /dev/null @@ -1,14 +0,0 @@ -database: - name: pipestat-test - user: postgres - password: pipestat-password - host: localhost - port: 5432 -path: - pipeline_output_path: $BEDBASE_DATA_PATH/outputs - bedstat_dir: bedstat_output - bedbuncher_dir: bedbuncher_output - remote_url_base: null -server: - host: 0.0.0.0 - port: 8000 \ No newline at end of file diff --git a/tests/data/csv/hg38/test_data_csv/index.txt b/tests/data/csv/hg38/test_data_csv/index.txt deleted file mode 100644 index 0d38fc3..0000000 --- a/tests/data/csv/hg38/test_data_csv/index.txt +++ /dev/null @@ -1,4 +0,0 @@ -filename,cellType,treatment,antibody,description -a.bed,a,None,None,first cell line -b.bed,b,None,None,second cell line -c.bed,c,EWS-FLI1,None,test diff --git a/tests/data/csv/hg38/test_data_csv/regions/a.bed b/tests/data/csv/hg38/test_data_csv/regions/a.bed deleted file mode 100644 index 2e65efe..0000000 --- a/tests/data/csv/hg38/test_data_csv/regions/a.bed +++ /dev/null @@ -1 +0,0 @@ -a \ No newline at end of file diff --git a/tests/data/csv/hg38/test_data_csv/regions/b.bed b/tests/data/csv/hg38/test_data_csv/regions/b.bed deleted file mode 100644 index 63d8dbd..0000000 --- a/tests/data/csv/hg38/test_data_csv/regions/b.bed +++ /dev/null @@ -1 +0,0 @@ -b \ No newline at end of file diff --git a/tests/data/csv/hg38/test_data_csv/regions/c.bed b/tests/data/csv/hg38/test_data_csv/regions/c.bed deleted file mode 100644 index 3410062..0000000 --- a/tests/data/csv/hg38/test_data_csv/regions/c.bed +++ /dev/null @@ -1 +0,0 @@ -c \ No newline at end of file diff --git a/tests/data/f1/AML_db358.bed.gz b/tests/data/f1/AML_db358.bed.gz new file mode 100644 index 0000000..acf6882 Binary files /dev/null and b/tests/data/f1/AML_db358.bed.gz differ diff --git a/tests/data/f1/AML_db358.bigBed b/tests/data/f1/AML_db358.bigBed new file mode 100644 index 0000000..900f3a3 Binary files /dev/null and b/tests/data/f1/AML_db358.bigBed differ diff --git a/tests/data/f1/AML_db358_sample.yaml b/tests/data/f1/AML_db358_sample.yaml new file mode 100644 index 0000000..3e7e2a5 --- /dev/null +++ b/tests/data/f1/AML_db358_sample.yaml @@ -0,0 +1,19 @@ +sample_name: AML_db358 +file_name: AML_db358.bed.gz +genome: hg19 +exp_protocol: ChiPseq +cell_type: ' K562' +tissue: Lung +antibody: ZEB2 +treatment: None +data_source: GEO +GSE: 'GSE91663 ' +description: ZEB2 ChIP-seq on human K562 (ENCODE) +format: bed +old_name: GSE91663_ENCFF129WGK_optimal_idr_thresholded_peaks_hg19.bigBed +bedbase_config: $CODE/bedbase/tutorial_files/bedbase_configuration_compose.yaml +pipeline_interfaces: $CODE/bedstat/pipeline_interface.yaml +output_file_path: /home/bnt4me/Virginia/bed_base_all/bedbase/bedbase_tutorial/bed_files/AML_db358.bed.gz +yaml_file: /home/bnt4me/Virginia/bed_base_all/bedbase/bedbase_tutorial/outputs/bedstat_output/bedstat_pipeline_logs/submission/AML_db358_sample.yaml +open_signal_matrix: /home/bnt4me/Virginia/bed_base_all/bedbase/bedbase_tutorial/openSignalMatrix_hg19_percentile99_01_quantNormalized_round4d.txt.gz +bigbed: /home/bnt4me/Virginia/bed_base_all/bedbase/bedbase_tutorial/bigbed_files diff --git a/tests/data/test_hg38.bed.gz b/tests/data/test_hg38.bed.gz deleted file mode 100644 index 344da49..0000000 Binary files a/tests/data/test_hg38.bed.gz and /dev/null differ diff --git a/tests/data/tsv/hg38/test_data_tsv/index.txt b/tests/data/tsv/hg38/test_data_tsv/index.txt deleted file mode 100644 index c92a6ab..0000000 --- a/tests/data/tsv/hg38/test_data_tsv/index.txt +++ /dev/null @@ -1,4 +0,0 @@ -filename cellType treatment description -a.bed a None first cell line -b.bed b None second cell line -c.bed c EWS-FLI1 test \ No newline at end of file diff --git a/tests/data/tsv/hg38/test_data_tsv/regions/a.bed b/tests/data/tsv/hg38/test_data_tsv/regions/a.bed deleted file mode 100644 index 2e65efe..0000000 --- a/tests/data/tsv/hg38/test_data_tsv/regions/a.bed +++ /dev/null @@ -1 +0,0 @@ -a \ No newline at end of file diff --git a/tests/data/tsv/hg38/test_data_tsv/regions/b.bed b/tests/data/tsv/hg38/test_data_tsv/regions/b.bed deleted file mode 100644 index 63d8dbd..0000000 --- a/tests/data/tsv/hg38/test_data_tsv/regions/b.bed +++ /dev/null @@ -1 +0,0 @@ -b \ No newline at end of file diff --git a/tests/data/tsv/hg38/test_data_tsv/regions/c.bed b/tests/data/tsv/hg38/test_data_tsv/regions/c.bed deleted file mode 100644 index 3410062..0000000 --- a/tests/data/tsv/hg38/test_data_tsv/regions/c.bed +++ /dev/null @@ -1 +0,0 @@ -c \ No newline at end of file diff --git a/tests/test_bedstat.py b/tests/test_bedstat.py new file mode 100644 index 0000000..681558f --- /dev/null +++ b/tests/test_bedstat.py @@ -0,0 +1,31 @@ +import bedstat +import os +import pytest +import bbconf + + +file_path = "./tests" +# file_path = "./" +CONFIG_FILE = f"{file_path}/config_db_github.yaml" +# CONFIG_FILE = f"{file_path}/config_db_local.yaml" + + +class TestBedstat: + def test_uploading(self): + bedstat.run_bedstat( + bedfile=f"{file_path}/data/f1/AML_db358.bed.gz", + sample_yaml=f"{file_path}/data/f1/AML_db358_sample.yaml", + bedbase_config=CONFIG_FILE, + bigbed=f"{file_path}/data/f1/", + just_db_commit=True, + genome_assembly="hg19", + ) + + def test_db_content(self): + bbc = bbconf.BedBaseConf(config_path=CONFIG_FILE, database_only=True) + assert bbc.bed.record_count == 1 + + def test_db_record(self): + bbc = bbconf.BedBaseConf(config_path=CONFIG_FILE, database_only=True) + geonome_check = bbc.bed.select(columns=["other"])[0][0]["genome"] + assert geonome_check == "hg19" diff --git a/tests/test_full.py b/tests/test_full.py new file mode 100644 index 0000000..8a64fa3 --- /dev/null +++ b/tests/test_full.py @@ -0,0 +1,30 @@ +import bedstat +import os +import pytest +import bbconf + + +file_path = "./tests" +# file_path = "./" +# CONFIG_FILE = f"{file_path}/config_db_github.yaml" +CONFIG_FILE = f"{file_path}/config_db_local.yaml" +SAMPLE_YAML = f"{file_path}/data/f1/AML_db358_sample_new.yaml" + +# has to be change manually +OPEN_MATRIX19 = "/home/bnt4me/Virginia/bed_base_all/bedbase/bedbase_tutorial/openSignalMatrix_hg19_percentile99_01_quantNormalized_round4d.txt.gz" + + +@pytest.mark.skipif(True, reason="This test can be run only locally") +class TestBedstatProcessing: + def test_processing(self): + bedstat.run_bedstat( + bedfile=f"{file_path}/data/f1/AML_db358.bed.gz", + sample_yaml=SAMPLE_YAML, + bedbase_config=CONFIG_FILE, + bigbed=f"{file_path}/data/f1/", + just_db_commit=False, + genome_assembly="hg19", + open_signal_matrix=OPEN_MATRIX19, + ) + + # We should add some additionall tests here diff --git a/tests/test_process_LOLA.py b/tests/test_process_LOLA.py deleted file mode 100644 index 5497db0..0000000 --- a/tests/test_process_LOLA.py +++ /dev/null @@ -1,11 +0,0 @@ -from scripts.process_LOLA import process_index_file - - -def test_process_index_file(capsys, tsv_base_dir, csv_base_dir): - """ Check if process_index_file produces the same results for TSV and CSV files """ - process_index_file(csv_base_dir, "hg38") - out1, _ = capsys.readouterr() - process_index_file(tsv_base_dir, "hg38") - out2, _ = capsys.readouterr() - # number of characters in each captured output should be exactly the same - assert len(out1) == len(out2)