diff --git a/.gitignore b/.gitignore
index 3cbf9c0d..11d3337a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,4 @@
*~
\#*#
+extensions/__pycache__/code.cpython-38.pyc
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e36222aa..d204441f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -52,13 +52,24 @@ list( APPEND PepSIRF_LINK_LIBS
)
if(OpenMP_FOUND)
- message( "OpenMP enabled" )
- list( APPEND PepSIRF_LINK_LIBS OpenMP::OpenMP_CXX )
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xpreprocessor -fopenmp")
- add_definitions( -DENABLE_OPENMP )
+ message("OpenMP enabled")
+
+ if(APPLE)
+ # Get libomp filepath
+ execute_process(COMMAND brew --prefix libomp OUTPUT_VARIABLE BREW_PREFIX OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xpreprocessor -fopenmp -I${BREW_PREFIX}/include")
+ list(APPEND PepSIRF_LINK_LIBS "${BREW_PREFIX}/lib/libomp.dylib")
+ else()
+ list(APPEND PepSIRF_LINK_LIBS OpenMP::OpenMP_CXX)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xpreprocessor -fopenmp")
+ endif()
+
+ # Define OpenMP macro
+ add_definitions(-DENABLE_OPENMP)
else()
- message( "WARNING: OpenMP not found, parallelism disabled." )
+ message("WARNING: OpenMP not found, parallelism disabled.")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unknown-pragmas -Wno-unused-value")
endif()
diff --git a/docs/5-changelog.md b/docs/5-changelog.md
index 5cfbd939..b7d8fe05 100644
--- a/docs/5-changelog.md
+++ b/docs/5-changelog.md
@@ -10,10 +10,14 @@ permalink: /changelog/
## 1.7.0 | 2024-10-3
-Docker: added new feature (Issue #254). Added the ability to run PepSIRF as a Docker image and added a page for instructions.
+### Bug Fixes:
CMakelists: bug fix (Issue #197). Resolved CMake not locating OpenMP on MacOS. Tutorial for fix added to installation page.
+### New Features:
+
+Docker: added new feature (Issue #254). Added the ability to run PepSIRF as a Docker image and added a page for instructions.
+
Subjoin: added new feature (Issue #236). Added a functionality to the "-i" option in Subjoin to accept a regex pattern instead of a filename which contains sample/peptide names. The sample/peptide names used from the score matrix file will be filtered by whether they contain the regex pattern.
Demux: added new feature (Issue #234). Added "--unmapped-reads-output" option to Demux, which writes all reads that have not been mapped to a sample/peptide to the specified filename.
diff --git a/docs/Gemfile.lock b/docs/Gemfile.lock
index 1eba005c..767c0058 100644
--- a/docs/Gemfile.lock
+++ b/docs/Gemfile.lock
@@ -1,23 +1,31 @@
GEM
remote: https://rubygems.org/
specs:
- activesupport (6.0.5)
+ activesupport (7.1.3.4)
+ base64
+ bigdecimal
concurrent-ruby (~> 1.0, >= 1.0.2)
- i18n (>= 0.7, < 2)
- minitest (~> 5.1)
- tzinfo (~> 1.1)
- zeitwerk (~> 2.2, >= 2.2.2)
+ connection_pool (>= 2.2.5)
+ drb
+ i18n (>= 1.6, < 2)
+ minitest (>= 5.1)
+ mutex_m
+ tzinfo (~> 2.0)
addressable (2.8.0)
public_suffix (>= 2.0.2, < 5.0)
+ base64 (0.2.0)
+ bigdecimal (3.1.8)
coffee-script (2.4.1)
coffee-script-source
execjs
coffee-script-source (1.11.1)
colorator (1.1.0)
- commonmarker (0.23.9)
- concurrent-ruby (1.1.10)
+ commonmarker (0.23.10)
+ concurrent-ruby (1.3.3)
+ connection_pool (2.4.1)
dnsruby (1.61.9)
simpleidn (~> 0.1)
+ drb (2.2.1)
em-websocket (0.5.3)
eventmachine (>= 0.12.9)
http_parser.rb (~> 0)
@@ -51,12 +59,12 @@ GEM
ffi (1.15.5)
forwardable-extended (2.6.0)
gemoji (3.0.1)
- github-pages (226)
+ github-pages (228)
github-pages-health-check (= 1.17.9)
- jekyll (= 3.9.2)
+ jekyll (= 3.9.3)
jekyll-avatar (= 0.7.0)
jekyll-coffeescript (= 1.1.1)
- jekyll-commonmark-ghpages (= 0.2.0)
+ jekyll-commonmark-ghpages (= 0.4.0)
jekyll-default-layout (= 0.1.4)
jekyll-feed (= 0.15.1)
jekyll-gist (= 1.5.0)
@@ -90,10 +98,10 @@ GEM
jemoji (= 0.12.0)
kramdown (= 2.3.2)
kramdown-parser-gfm (= 1.1.0)
- liquid (= 4.0.3)
+ liquid (= 4.0.4)
mercenary (~> 0.3)
minima (= 2.5.1)
- nokogiri (>= 1.13.4, < 2.0)
+ nokogiri (>= 1.13.6, < 2.0)
rouge (= 3.26.0)
terminal-table (~> 1.4)
github-pages-health-check (1.17.9)
@@ -106,13 +114,13 @@ GEM
activesupport (>= 2)
nokogiri (>= 1.4)
http_parser.rb (0.8.0)
- i18n (0.9.5)
+ i18n (1.14.5)
concurrent-ruby (~> 1.0)
- jekyll (3.9.2)
+ jekyll (3.9.3)
addressable (~> 2.4)
colorator (~> 1.0)
em-websocket (~> 0.5)
- i18n (~> 0.7)
+ i18n (>= 0.7, < 2)
jekyll-sass-converter (~> 1.0)
jekyll-watch (~> 2.0)
kramdown (>= 1.17, < 3)
@@ -128,11 +136,11 @@ GEM
coffee-script-source (~> 1.11.1)
jekyll-commonmark (1.4.0)
commonmarker (~> 0.22)
- jekyll-commonmark-ghpages (0.2.0)
- commonmarker (~> 0.23.4)
+ jekyll-commonmark-ghpages (0.4.0)
+ commonmarker (~> 0.23.7)
jekyll (~> 3.9.0)
jekyll-commonmark (~> 1.4.0)
- rouge (>= 2.0, < 4.0)
+ rouge (>= 2.0, < 5.0)
jekyll-default-layout (0.1.4)
jekyll (~> 3.0)
jekyll-feed (0.15.1)
@@ -220,7 +228,7 @@ GEM
rexml
kramdown-parser-gfm (1.1.0)
kramdown (~> 2.0)
- liquid (4.0.3)
+ liquid (4.0.4)
listen (3.7.1)
rb-fsevent (~> 0.10, >= 0.10.3)
rb-inotify (~> 0.9, >= 0.9.10)
@@ -230,8 +238,9 @@ GEM
jekyll (>= 3.5, < 5.0)
jekyll-feed (~> 0.9)
jekyll-seo-tag (~> 2.1)
- minitest (5.15.0)
+ minitest (5.23.1)
multipart-post (2.1.1)
+ mutex_m (0.2.0)
nokogiri (1.14.3)
mini_portile2 (~> 2.8.0)
racc (~> 1.4)
@@ -245,7 +254,8 @@ GEM
rb-fsevent (0.11.1)
rb-inotify (0.10.1)
ffi (~> 1.0)
- rexml (3.2.5)
+ rexml (3.3.0)
+ strscan
rouge (3.26.0)
ruby2_keywords (0.0.5)
rubyzip (2.3.2)
@@ -260,18 +270,17 @@ GEM
faraday (> 0.8, < 2.0)
simpleidn (0.2.1)
unf (~> 0.1.4)
+ strscan (3.1.0)
terminal-table (1.8.0)
unicode-display_width (~> 1.1, >= 1.1.1)
- thread_safe (0.3.6)
typhoeus (1.4.0)
ethon (>= 0.9.0)
- tzinfo (1.2.9)
- thread_safe (~> 0.1)
+ tzinfo (2.0.6)
+ concurrent-ruby (~> 1.0)
unf (0.1.4)
unf_ext
unf_ext (0.0.8.1)
unicode-display_width (1.8.0)
- zeitwerk (2.5.4)
PLATFORMS
ruby
diff --git a/extensions/e_k_bias.py b/extensions/e_k_bias.py
new file mode 100644
index 00000000..e89fceac
--- /dev/null
+++ b/extensions/e_k_bias.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+
+import pandas as pd
+import argparse
+import fastatools as ft
+import numpy as np
+
+def main():
+
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+
+ parser.add_argument('-i', '--fasta-file', help='Directory with enriched petide files for input', required=True)
+ parser.add_argument('-o', '--output-file', default="e_k_bias_out.tsv", help='Name of .tsv to output file with AA bias data')
+
+ args = parser.parse_args()
+
+ # get proportion of e's and k's for each peptide
+ e_k_props = get_e_k_props(args.fasta_file)
+
+ # get percentiles
+ percentile_dict = get_percentiles(e_k_props)
+
+ # create output df
+ out_data = [(name, round(prop, 3), round(percentile_dict[name], 2)) for name, prop in e_k_props.items()]
+
+ pd.DataFrame(out_data, columns=["CodeName", "e_k_Prop", "e_k_Percentile"]).to_csv(args.output_file, index=False, sep='\t')
+
+
+# get proportion of e's and k's for each peptide
+def get_e_k_props(fasta_file)->dict:
+ e_k_props = dict()
+
+ # get props for peptide file
+ fasta_dict = ft.read_fasta_dict(fasta_file)
+
+ # iterate through each sequence
+ for name, seq in fasta_dict.items():
+ e_k_count = 0
+
+ # loop through each AA, get count of e and k
+ for aa in seq:
+ if aa.lower() == 'e' or aa.lower() =='k':
+ e_k_count += 1
+
+ # add proportion to dict
+ e_k_props[name] = (e_k_count) / len(seq)
+
+ return e_k_props
+
+# get percentile of each peptide using its e and k proportion
+def get_percentiles(e_k_props)->dict:
+ # Calculate percentile of each peptide
+ names = list(e_k_props.keys())
+ all_props = np.array(list(e_k_props.values()))
+
+ # Get unique values and array for mapping each original value to its corresponding index in the unique array
+ unique_props, inverse_indices = np.unique(all_props, return_inverse=True)
+
+ # Calculate percentiles based on the unique props
+ percentile_ranks = np.linspace(0, 100, len(unique_props))
+
+ # Map sorted values back to the original corresponding name
+ percentile_dict = {names[i]: percentile_ranks[inverse_indices[i]] for i in range(len(names))}
+
+ return percentile_dict
+
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/extensions/findEpitopes.py b/extensions/findEpitopes.py
new file mode 100644
index 00000000..7d0ed97c
--- /dev/null
+++ b/extensions/findEpitopes.py
@@ -0,0 +1,105 @@
+import os
+import matplotlib.pyplot as plt
+import numpy as np
+import argparse
+
+def main():
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+
+ parser.add_argument('-i', '--input-dir', help='Directory with alignment output files and files that contain the mapped location of peptides', required=True)
+ parser.add_argument('-o', '--output-dir', default="clust_align_visualizations", help='Name of directory to output line plots.')
+
+ args = parser.parse_args()
+
+ directory_path = args.input_dir
+ alignment_to_use_dict = read_check_align_file(directory_path)
+ #print(probes_dict)
+ alignCountsD = process_probes(alignment_to_use_dict, directory_path)
+ #print(alignCountsD)
+
+ if not os.path.exists(args.output_dir):
+ os.mkdir(args.output_dir)
+ create_line_chart(alignCountsD, args.output_dir)
+
+
+def create_line_chart(alignCountsD, out_dir):
+ for file, pos_dict in alignCountsD.items():
+ x = list(pos_dict.keys())
+ y = list(pos_dict.values())
+ fig, ax = plt.subplots(figsize=(max(x)/10, 10), facecolor='w')
+ ax.plot(x, y, linestyle='-')
+ ax.set_xticks(np.arange(min(x), max(x)+5, 5))
+ ax.set_xlim(left=min(x))
+ ax.set_ylim(bottom=min(y))
+ plt.grid()
+ plt.xlabel("Sequence Position")
+ plt.ylabel("Count")
+ plt.title(file)
+ plt.savefig(os.path.join(out_dir, f"{file.split('_')[-2]}_epitopes_lineplot.png"), dpi=300, bbox_inches='tight')
+
+
+def find_smallest_value_with_substring(data_dict, substring):
+ # Filter the dictionary to only include items with the specified substring in the key
+ filtered_dict = {k: v for k, v in data_dict.items() if substring in k}
+
+ # If there are no matches, return None
+ if not filtered_dict:
+ return None
+
+ # Find the key-value pair with the smallest value
+ smallest_pair = min(filtered_dict.items(), key=lambda item: item[1])
+
+ return smallest_pair
+
+
+def read_check_align_file(directory):
+ data_dict = {}
+ clusters = set()
+
+ # Construct the full file path
+ filepath = os.path.join(directory, 'checkAlignLength.out')
+ # Read the file content
+ with open(filepath, 'r') as file:
+ alignedCluster = None
+ for line in file:
+ if "mafft" in line:
+ alignedCluster = line.strip()
+ clusters.add(line.split('_')[-2])
+ elif "Alignment:" in line and alignedCluster:
+ alignLength = line.replace('Alignment:','').strip()
+ #print(alignedCluster,alignLength)
+ data_dict[alignedCluster] = alignLength
+ # Find alignment with shortest length for each cluster
+ results = {}
+ for cluster in clusters:
+ result = find_smallest_value_with_substring(data_dict, cluster)
+ results[result[0]] = result[1]
+
+ return results
+
+
+def process_probes(probes_dict, directory_path):
+ result = {}
+
+ for filename, data in probes_dict.items():
+ aligned_probes_file = filename.replace('.fasta', '_probesAligned.txt')
+ aligned_probes_path = os.path.join(directory_path, aligned_probes_file)
+
+ aligned_length = int(data)
+ #print(range(0, aligned_length))
+
+ alignD = {key: 0 for key in range(aligned_length + 1)}
+
+ with open(aligned_probes_path, 'r') as file:
+ for line_count, line in enumerate(file):
+ if line_count > 0:
+ seq_positions = line.split('\t')[-1].split('~')
+ for pos in seq_positions:
+ alignD[int(pos)] += 1
+
+ result[filename] = alignD
+
+ return result
+
+if __name__ == "__main__":
+ main()
diff --git a/extensions/link_patternedKmers.py b/extensions/link_patternedKmers.py
new file mode 100755
index 00000000..fd11d778
--- /dev/null
+++ b/extensions/link_patternedKmers.py
@@ -0,0 +1,97 @@
+#!/usr/bin/env python
+
+import argparse, os
+import kmertools as kt
+import fastatools as ft
+import inout as io
+from collections import defaultdict
+
+# This script works like the link module in PepSIRF, but implements patterned kmers
+
+def main():
+
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+
+ reqArgs = parser.add_argument_group('required arguments')
+ # Note that these arguments are added directly to the new argument group "reqArgs", not "parser"
+ reqArgs.add_argument('--protein_file', help='Name of fasta file containing target protein sequences.', required=True)
+ reqArgs.add_argument('--peptide_file', help='Name of fasta file containing aa peptides of interest. These will generally be peptides that are contained in a particular assay.', required=True)
+ reqArgs.add_argument('--meta', help='Taxonomic information for each protein contained in "--protein_file". Three comma-separated strings should be provided: 1) name of tab-delimited metadata file, 2) header for column containing protein sequence name and 3) header for column containing ID to be used in creating the linkage map.', required=True)
+
+ parser.add_argument('-k', '--kmer_size', default=8, type=int, help='Number of positions at which there must be a match.')
+ parser.add_argument('-s', '--span', default=10, type=int, help='Number of positions across which the match can span.')
+ parser.add_argument('-o', '--output', default="link_output.tsv", help='Name for output linkage map.')
+
+ args = parser.parse_args()
+
+ # Generate all patterns and report the number being tested
+ allPatts = kt.generatePatterns(args.kmer_size, args.span)
+ print(f"Considering {len(allPatts)} patterns for generating linkage map.\n")
+
+ # Read in metadata
+ metaF, seqN, metaV = args.meta.split(",")
+ metaD = io.fileDictHeaderLists(metaF, metaV, seqN)
+
+ # Read in target sequences
+ prot_fD = ft.read_fasta_dict(args.protein_file)
+ # Read in peptide sequences
+ pep_fD = ft.read_fasta_dict(args.peptide_file)
+
+ outScoresD = defaultdict(list)
+
+ # Determine total number of categories to process
+ total = len(metaD)
+ numProc=0
+
+ # Step through each metadata category
+ for each, nameL in metaD.items():
+ print(each, len(nameL))
+ # Step through each pattern of interest
+ pepScoreD = defaultdict(int)
+ for patt in allPatts:
+ print(patt)
+ targetKmers = {}
+            # Step through each sequence for this group and add its kmers to the dictionary
+ for name in nameL:
+ if name in prot_fD:
+ for every in kt.patternedKmers(prot_fD[name], patt, outType="set", filter=["X", "B", "J", "Z"]):
+ targetKmers[every] = ""
+
+ this_pepScoreD = {}
+ # Step through each peptide
+ for pepN, pepS in pep_fD.items():
+ pepKmers = kt.patternedKmers(pepS, patt, outType="set", filter=["X", "B", "J", "Z"])
+ thisScore = sum([1 for kmer in pepKmers if kmer in targetKmers])
+ this_pepScoreD[pepN] = thisScore
+ #Update scores, if they're higher than for previous patterns
+ for pepN, pepScore in this_pepScoreD.items():
+ if pepScore > pepScoreD[pepN]:
+ pepScoreD[pepN] = pepScore
+ #for peptides with scores>0, add info to outScoresD
+ for pepN, pepScore in pepScoreD.items():
+ if pepScore > 0:
+ outScoresD[pepN].append(f"{each}:{pepScore}")
+
+ # Increment counter
+ numProc+=1
+ if numProc%1==0:
+ print(numProc/total*100)
+
+ #Write scores to output file
+ with open(args.output, "w") as fout:
+ fout.write("Peptide Name\tLinked Species IDs with counts\n")
+ for pepN in pep_fD:
+ if pepN in outScoresD:
+ outStr = ",".join(outScoresD[pepN])
+ fout.write(f"{pepN}\t{outStr}\n")
+
+
+#----------------------End of main()
+
+
+
+###------------------------------------->>>>
+
+if __name__ == "__main__":
+ main()
+
diff --git a/extensions/pepCountPoisson.py b/extensions/pepCountPoisson.py
new file mode 100755
index 00000000..49c080ca
--- /dev/null
+++ b/extensions/pepCountPoisson.py
@@ -0,0 +1,121 @@
+#!/usr/bin/env python
+
+import argparse, os, glob
+import inout as io
+import pandas as pd
+from scipy.stats import poisson
+pd.options.mode.copy_on_write = True
+from collections import defaultdict as dd
+# import numpy as np
+# import inout as io
+
+
+def main():
+
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+
+ reqArgs = parser.add_argument_group('required arguments')
+ # Note that these arguments are added directly to the new argument group "reqArgs", not "parser"
+ reqArgs.add_argument('-e', '--enrDir', help='Directory containing enriched peptide files.', required=True)
+ reqArgs.add_argument('-r', '--protMeta', help='Protein metadata file.', required=True)
+ reqArgs.add_argument('-p', '--pepMeta', help='Peptide metadata file.', required=True)
+ reqArgs.add_argument('-o', '--outDir', help='Name/path for directory to place output files within. It will be created if it does not already exist', required=True)
+
+ parser.add_argument('--protNameHeader', default="Sequence", help='Header for protein name in protein metadata file.')
+ parser.add_argument('--taxHeader', default="MultiProteinCluster", help='Header for taxonomic category name in protein metadata file.')
+ parser.add_argument('--pepNameHeader', default="CodeName", help='Header for peptide name in peptide metadata file.')
+ parser.add_argument('--parentSeqHeader', default="ParentSeqName", help='Header for parent sequence name in peptide metadata file.')
+ parser.add_argument('--enrPepStartStr', default="", help='Expected string at the beginning of all enriched peptide files.')
+ parser.add_argument('--enrPepExtension', default="_enriched.txt", help='Expected string at the end of all enriched peptide files.')
+ parser.add_argument('--pepThresh', default=1, type=int, help='Minimum number of enriched peptides for a p-value to be calculated for a given taxonomic group.')
+ parser.add_argument('--basePval', default=0.01, type=float, help='Base p-value threshold to use for reporting.')
+ parser.add_argument('--noBonferroni', default=False, action="store_true", help='Use this flag if you do not want to the pvalue threshold to be adjusted using the Bonferroni method.')
+ parser.add_argument('--taxMeta', help='Optional metadata file with information about the taxonomic groups to include in the output files.')
+ parser.add_argument('--taxMetaKey', default="MultiProteinCluster", help='Header containing taxonomic group name within taxMeta file.')
+ parser.add_argument('--taxMetaHeaders', default="Species_List,Species_Prop,Genus_List,Genus_Prop", help='Comma-separated list of headers from the taxMeta file to include in the output.')
+
+ parser.add_argument('rest', nargs=argparse.REMAINDER)
+
+ opts = parser.parse_args()
+
+ # Read metadata linking protein targets to tax clusters
+ prot2taxD = io.fileDictHeader(opts.protMeta, opts.protNameHeader, opts.taxHeader)
+
+ # Read metadata linking peptides to protein targets
+ pep2protD = io.fileDictHeader(opts.pepMeta, opts.pepNameHeader, opts.parentSeqHeader)
+
+ # If provided, read in taxonomic metadata
+ if opts.taxMeta:
+ taxDF = pd.read_csv(opts.taxMeta, header=0, index_col=0, sep="\t")
+ opts.taxMetaHeaders = opts.taxMetaHeaders.split(",")
+
+    # Create output directory and subdirectories, if they don't already exist
+ io.makeDir(opts.outDir)
+ io.makeDir(f"{opts.outDir}/sigTaxa")
+
+
+ # For each taxonomic groups, calculate the proportion of the total peptides it represents
+ totalPeps = len(pep2protD)
+ tax2pepsD = dd(list)
+ for pepN, protN in pep2protD.items():
+ if protN in prot2taxD:
+ tax2pepsD[prot2taxD[protN]].append(pepN)
+
+ propLibD = {k:len(v)/totalPeps for k,v in tax2pepsD.items()}
+
+ # Grab all of the enriched peptide files and step through them one by one
+ for eachF in glob.glob(f"{opts.enrDir}/{opts.enrPepStartStr}*{opts.enrPepExtension}"):
+ print(eachF)
+ # Read in enriched peptides
+ enrPL = io.fileList(eachF, header=False)
+ if enrPL != [" "]:
+ totalEnrPeps = len(enrPL)
+
+ # Generate dictionary with counts of enriched peptides for each taxonomic group
+ countD = dd(int)
+
+ # Step through each enriched peptide and add a count to the countD
+ for pn in enrPL:
+ this_prot = pep2protD[pn]
+ if this_prot in prot2taxD:
+ countD[prot2taxD[this_prot]]+=1
+
+ # Step through each taxonomic groups and calculate probability of observing this number of peptides, given a poisson distribution
+ pvD = {}
+ for tn, pc in countD.items():
+ if pc >= opts.pepThresh:
+ pv = poisson.sf(pc, totalEnrPeps*propLibD[tn])
+ pvD[tn]=pv
+
+ # Correct p-value threshold, unless otherwise specified
+ if not opts.noBonferroni and len(pvD)>1:
+ pval = opts.basePval/len(pvD)
+ else:
+ pval = opts.basePval
+
+ sigNames = [(pv, tn) for tn, pv in pvD.items() if pv<=pval]
+ if len(sigNames) > 0:
+ with open(f"{opts.outDir}/sigTaxa/{os.path.basename(eachF)[:-4]}.tsv", "w") as fout:
+ if opts.taxMeta:
+ extraHeaders = "\t".join(opts.taxMetaHeaders)
+ fout.write(f"TaxGroup\tNumEnrPeptides\tPoissonProbPvalue\t{extraHeaders}\n")
+ else:
+ fout.write("TaxGroup\tNumEnrPeptides\tPoissonProbPvalue\n")
+ for pv,tn in sorted(sigNames):
+ if opts.taxMeta:
+ extraInfo = "\t".join([str(taxDF[hd][tn]) for hd in opts.taxMetaHeaders])
+ fout.write(f"{tn}\t{countD[tn]}\t{pv}\t{extraInfo}\n")
+ else:
+ fout.write(f"{tn}\t{countD[tn]}\t{pv}\n")
+
+
+
+#----------------------End of main()
+
+
+
+###------------------------------------->>>>
+
+if __name__ == "__main__":
+ main()
+
diff --git a/include/modules/core/pepsirf_version.h b/include/modules/core/pepsirf_version.h
index 45e5100d..0ab5e07e 100644
--- a/include/modules/core/pepsirf_version.h
+++ b/include/modules/core/pepsirf_version.h
@@ -1,6 +1,6 @@
#ifndef PEPSIRF_VERSION_HH_INCLUDED
#define PEPSIRF_VERSION_HH_INCLUDED
-#define PEPSIRF_VERSION "1.6.0"
+#define PEPSIRF_VERSION "1.7.0"
#endif
diff --git a/include/modules/core/sequence.h b/include/modules/core/sequence.h
index 0b8b9c35..32f1f190 100644
--- a/include/modules/core/sequence.h
+++ b/include/modules/core/sequence.h
@@ -72,6 +72,14 @@ class sequence
**/
bool operator==( const sequence& s ) const;
+ /**
+ * Less than operator override.
+ * For two sequences a and b we say a < b iff
+ * a.seq < b.seq.
+ * @param s Sequence to compare against.
+ **/
+ bool operator<( const sequence& s ) const;
+
/**
* Get the length of the 'seq' member of this class.
diff --git a/include/modules/demux/module_demux.h b/include/modules/demux/module_demux.h
index d78c1ee3..0d57132c 100644
--- a/include/modules/demux/module_demux.h
+++ b/include/modules/demux/module_demux.h
@@ -109,6 +109,22 @@ class module_demux : public module
std::vector &lib_seqs
);
+ /**
+ * Outputs truncated sequence info for unique sequences, non-unique
+ * sequences, and the new fasta-formatted file
+ * @param seq_length Sequence length specified with the "--seq" option
+ * @param lib_seqs Value of library sequences received from file
+ * specified by the "--library"
+ * @param library_fname Library file name, specified by "--library"
+ * @param trunc_info_outdir Specified output directory with the
+ * "--trunc_info_outdir" option
+ **/
+ void output_trunc_info(
+ std::size_t seq_length,
+ std::vector lib_seqs,
+ std::string library_fname,
+ std::string trunc_info_outdir
+ );
/**
* Method to zero a vector of size_t elements.
@@ -380,11 +396,32 @@ class module_demux : public module
* Creates a single output fastq file containing all of the reads that have not been mapped to a sample/peptide
* @param filename file to output to
* @param samp_map fastaq output map
- * @reads_dup vector of all reads
+ * @param reads_dup vector of all reads
**/
void create_unmapped_reads_file( std::string filename,
std::map> samp_map, std::vector reads_dup );
+ /**
+ * Outputs a vector to a file with a delimiter between entries
+ * @param vector vector to be outputted, make sure the type of its values can use the << operator for output
+ * @param filename file to output to
+ * @param delimiter delimiter to output between values
+ **/
+ template
+ void simple_vector_out(std::vector vector, std::string output_file, std::string delimiter)
+ {
+ std::ofstream outfile(output_file, std::ios::out);
+ for (unsigned int i=0; i
#include
#include
+#include
#include
#include
@@ -67,7 +68,7 @@ void module_deconv::run(options *opts)
);
}
- auto in_dir_iter = fs_tools::directory_iterator(input_base);
+ auto in_dir_iter = boost::filesystem::directory_iterator(input_base);
for (auto& input_f : boost::make_iterator_range(in_dir_iter, {}))
{
diff --git a/src/modules/demux/module_demux.cpp b/src/modules/demux/module_demux.cpp
index cb5b9867..ee0b7fdd 100644
--- a/src/modules/demux/module_demux.cpp
+++ b/src/modules/demux/module_demux.cpp
@@ -132,6 +132,11 @@ void module_demux::run( options *opts )
if (seq_length < lib_length)
{
trunc_lib_seqs(seq_length, library_seqs);
+
+ if(!d_opts->trunc_info_outdir.empty())
+ {
+ output_trunc_info(seq_length, library_seqs, d_opts->library_fname, d_opts->trunc_info_outdir);
+ }
}
else if (seq_length > lib_length)
{
@@ -208,7 +213,7 @@ void module_demux::run( options *opts )
#else
Log::error(
"A gzipped file was required, but boost zlib is not available."
- " Please either configure boost with zlib or unzip the file
+ " Please either configure boost with zlib or unzip the file"
" before attempting to demux."
);
#endif
@@ -1041,18 +1046,11 @@ void module_demux::create_unmapped_reads_file( std::string filename,
}
}
- // delete from reads_dup
- // info_str1 << "total reads: "<< reads_dup.size() << "\n";
- // info_str2 << "mapped_reads: "<< to_remove_set.size() << "\n";
- // Log::info(info_str1.str());
- // Log::info(info_str2.str());
reads_dup.erase(std::remove_if(reads_dup.begin(), reads_dup.end(),
[&to_remove_set](const fastq_sequence& value) {
return to_remove_set.find(value.seq) != to_remove_set.end();
}),
reads_dup.end());
- // info_str3 << "unmapped_reads: "<< reads_dup.size() << "\n";
- // Log::info(info_str3.str());
// write to file
std::ofstream output( filename, std::ios_base::app );
@@ -1066,3 +1064,72 @@ void module_demux::create_unmapped_reads_file( std::string filename,
output.close();
}
+void module_demux::output_trunc_info(
+ std::size_t seq_length,
+ std::vector lib_seqs,
+ std::string library_fname,
+ std::string trunc_info_outdir
+) {
+ boost::filesystem::path outdir_path = trunc_info_outdir;
+ if (!boost::filesystem::exists(outdir_path))
+ {
+ boost::filesystem::create_directory(outdir_path);
+ }
+
+ boost::filesystem::path library_path(library_fname);
+ std::string new_fasta_filename = "trunc" + std::to_string(seq_length) + "_" + library_path.filename().string();
+ boost::filesystem::path new_fasta_path = outdir_path / new_fasta_filename;
+
+ std::unordered_map seq_counts;
+
+ std::ofstream outfile(new_fasta_path.string(), std::ios::out);
+ for (sequence lib_seq : lib_seqs)
+ {
+ outfile << ">" + lib_seq.name << std::endl;
+ outfile << lib_seq.seq << std::endl;
+
+ seq_counts[lib_seq.seq]++;
+ }
+ outfile.close();
+
+ std::unordered_map> duplicates;
+ std::vector nonunique;
+ std::vector unique;
+
+ for (sequence lib_seq : lib_seqs)
+ {
+ std::string name = lib_seq.name;
+ if(seq_counts[lib_seq.seq] > 1)
+ {
+ duplicates[lib_seq.seq].insert(name);
+ nonunique.emplace_back(name);
+ }
+ else
+ {
+ unique.emplace_back(name);
+ }
+ }
+
+ outfile.open(new_fasta_path.replace_extension("").string() + "_notUniqGrouped.tsv", std::ios::out);
+ for( const auto& dup : duplicates)
+ {
+ outfile << dup.first << "\t";
+ std::unordered_set peptides = dup.second;
+ for( auto it = peptides.begin(); it != peptides.end(); ++it)
+ {
+ outfile << *it;
+ if( std::next(it) != peptides.end())
+ {
+ outfile << '\t';
+ }
+ else
+ {
+ outfile << std::endl;
+ }
+ }
+ }
+ outfile.close();
+
+ simple_vector_out(unique, new_fasta_path.replace_extension("").string() + "_uniq.txt", "\n");
+ simple_vector_out(nonunique, new_fasta_path.replace_extension("").string() + "_notUniq.txt", "\n");
+}
\ No newline at end of file
diff --git a/src/modules/demux/options_demux.cpp b/src/modules/demux/options_demux.cpp
index 03a83c5f..59ed6c47 100644
--- a/src/modules/demux/options_demux.cpp
+++ b/src/modules/demux/options_demux.cpp
@@ -35,7 +35,8 @@ std::string options_demux::get_arguments()
<< "--fastq_output " << fastq_out << "\n"
<< "--logfile " << logfile << "\n"
<< "--replicate_info " << replicate_info_fname << "\n"
- << "--unmapped_reads_output " << unmapped_reads_fname << "\n\n";
+ << "--unmapped_reads_output " << unmapped_reads_fname << "\n"
+ << "--trunc_info_output " << trunc_info_outdir << "\n\n";
return str_stream.str();
}
diff --git a/src/modules/demux/options_parser_demux.cpp b/src/modules/demux/options_parser_demux.cpp
index eb874284..a55fdeab 100644
--- a/src/modules/demux/options_parser_demux.cpp
+++ b/src/modules/demux/options_parser_demux.cpp
@@ -186,7 +186,7 @@ bool options_parser_demux::parse(int argc, char ***argv, options *opts)
)
("fastq_output,q", po::value(&opts_demux->fastq_out)
->default_value(""),
- "Include this to output sample-level fastq files"
+ "Include this to output sample-level fastq files\n"
)
("include_toggle", po::value(&opts_demux->pos_toggle)
->default_value(true),
@@ -217,6 +217,11 @@ bool options_parser_demux::parse(int argc, char ***argv, options *opts)
" all of the reads that have not been mapped to a sample/peptide (i.e., all of those that"
"would not be included in any of the files created by the -q option).\n"
)
+ ("trunc_info_output", po::value(&opts_demux->trunc_info_outdir)
+ ->default_value(""),
+ "Name of directory to output truncated sequence information. This will include outputs for"
+ " unqiue sequences, non-unqiue sequences, and the new fasta-formatted file.\n\n"
+ )
;
po::store(po::command_line_parser(argc, *argv).options(desc).allow_unregistered().run(), vm);
diff --git a/test/expected/test_expected_demux_trunc_info/trunc40_test_extended_demux_library_NS30.fna b/test/expected/test_expected_demux_trunc_info/trunc40_test_extended_demux_library_NS30.fna
new file mode 100644
index 00000000..fc762444
--- /dev/null
+++ b/test/expected/test_expected_demux_trunc_info/trunc40_test_extended_demux_library_NS30.fna
@@ -0,0 +1,30 @@
+>NS30_000000-1
+GTACCGATCACCCAGAACCCGGTTGAAAACTACATCGACG
+>NS30_000000-2
+GTTCCGATCACCCAGAACCCGGTTGAAAACTACATCGACG
+>NS30_000000-3
+GTCCCGATCACCCAGAACCCGGTCGAAAACTACATCGACG
+>NS30_000001-1
+GAAGACGAACTGGAAGAAGTTGTTATCGACAAAATGAAAC
+>NS30_000001-2
+GAAGATGAACTGGAAGAAGTCGTTATCGACAAAATGAAAC
+>NS30_000009-1
+CAGGACATCCACCTGATCGACGACCTGGGCCAGACCCGCA
+>NS30_000009-3
+CAGGACATCCACCTGATCGACGACCTGGGCCAGACCCGCA
+>NS30_000010-2
+CCGCCGGCTGAAAAAAAAGACCCGTACGCTGACCTGACCT
+>NS30_000010-3
+CCGCCGGCTGAAAAAAAAGACCCGTACGCTGACCTGACCT
+>NS30_000027-1
+CTGAAAAACCTGCTGAACTCCCGCAAACGTGACCCGCTGT
+>NS30_000027-2
+CTGAAAAACCTGCTGAACTCCCGCAAACGTGACCCGCTGT
+>NS30_000032-2
+AAAGGCCTGTCTGACGAAGAATACGAAGAATACAAACGCG
+>NS30_000032-3
+AAAGGCCTGTCTGACGAAGAATACGAAGAATACAAACGCG
+>NS30_000041-1
+ACCAAAGAAGTTATGATCAAAGCTGAAAAAATGGGCAAAT
+>NS30_000041-2
+ACCAAAGAAGTTATGATCAAAGCTGAAAAAATGGGCAAAT
diff --git a/test/expected/test_expected_demux_trunc_info/trunc40_test_extended_demux_library_NS30_notUniq.txt b/test/expected/test_expected_demux_trunc_info/trunc40_test_extended_demux_library_NS30_notUniq.txt
new file mode 100644
index 00000000..53812782
--- /dev/null
+++ b/test/expected/test_expected_demux_trunc_info/trunc40_test_extended_demux_library_NS30_notUniq.txt
@@ -0,0 +1,10 @@
+NS30_000009-1
+NS30_000009-3
+NS30_000010-2
+NS30_000010-3
+NS30_000027-1
+NS30_000027-2
+NS30_000032-2
+NS30_000032-3
+NS30_000041-1
+NS30_000041-2
\ No newline at end of file
diff --git a/test/expected/test_expected_demux_trunc_info/trunc40_test_extended_demux_library_NS30_notUniqGrouped.tsv b/test/expected/test_expected_demux_trunc_info/trunc40_test_extended_demux_library_NS30_notUniqGrouped.tsv
new file mode 100644
index 00000000..8f9f8a08
--- /dev/null
+++ b/test/expected/test_expected_demux_trunc_info/trunc40_test_extended_demux_library_NS30_notUniqGrouped.tsv
@@ -0,0 +1,5 @@
+AAAGGCCTGTCTGACGAAGAATACGAAGAATACAAACGCG NS30_000032-2 NS30_000032-3
+CTGAAAAACCTGCTGAACTCCCGCAAACGTGACCCGCTGT NS30_000027-1 NS30_000027-2
+ACCAAAGAAGTTATGATCAAAGCTGAAAAAATGGGCAAAT NS30_000041-1 NS30_000041-2
+CCGCCGGCTGAAAAAAAAGACCCGTACGCTGACCTGACCT NS30_000010-2 NS30_000010-3
+CAGGACATCCACCTGATCGACGACCTGGGCCAGACCCGCA NS30_000009-1 NS30_000009-3
diff --git a/test/expected/test_expected_demux_trunc_info/trunc40_test_extended_demux_library_NS30_uniq.txt b/test/expected/test_expected_demux_trunc_info/trunc40_test_extended_demux_library_NS30_uniq.txt
new file mode 100644
index 00000000..2d544711
--- /dev/null
+++ b/test/expected/test_expected_demux_trunc_info/trunc40_test_extended_demux_library_NS30_uniq.txt
@@ -0,0 +1,5 @@
+NS30_000000-1
+NS30_000000-2
+NS30_000000-3
+NS30_000001-1
+NS30_000001-2
\ No newline at end of file
diff --git a/test/pepsirf_test.cpp b/test/pepsirf_test.cpp
index 677e4250..0cf81ba0 100644
--- a/test/pepsirf_test.cpp
+++ b/test/pepsirf_test.cpp
@@ -520,6 +520,7 @@ TEST_CASE("Automatic truncation of library sequences", "[module_demux]")
d_opts.sample_indexes = { "Index1", "Index2" };
d_opts.set_info(&options_demux::index1_data, "12,10,1");
d_opts.set_info(&options_demux::index2_data, "0,8,1");
+ d_opts.num_threads = 1;
SECTION("Library sequences are resized to expected size from user")
{
@@ -576,19 +577,21 @@ TEST_CASE("Automatic truncation of library sequences", "[module_demux]")
d_opts.set_info(&options_demux::seq_data, "41,100,2");
REQUIRE_THROWS(d_mod.run(&d_opts));
}
- SECTION("Demux outputs unmapped reads file.")
+ SECTION("Output of truncated sequence information is correct")
{
d_opts.set_info(&options_demux::seq_data, "41,40,2");
- d_opts.unmapped_reads_fname = "../test/test_unmapped_reads.fastq";
+ d_opts.trunc_info_outdir = "../test/test_demux_trunc_info";
d_mod.run(&d_opts);
- std::string expected = "../test/expected/test_expected_demux_unmapped_reads.fastq";
- std::string actual = "../test/test_unmapped_reads.fastq";
+ // test if not unique sequences file is identical
+ std::string expected = "../test/expected/test_expected_demux_trunc_info/trunc40_test_extended_demux_library_NS30_notUniq.txt";
+ std::string actual = "../test/test_demux_trunc_info/trunc40_test_extended_demux_library_NS30_notUniq.txt";
std::ifstream ifexpected(expected, std::ios_base::in);
std::ifstream ifactual(actual, std::ios_base::in);
std::string expected_line;
std::string actual_line;
+ // construct set of lines for each actual demux output
std::unordered_set actual_line_set;
while (!ifactual.eof())
{
@@ -603,6 +606,131 @@ TEST_CASE("Automatic truncation of library sequences", "[module_demux]")
actual_line_set.find(expected_line) != actual_line_set.end()
);
}
+
+ // test if not unique grouped file is identical
+ expected = "../test/expected/test_expected_demux_trunc_info/trunc40_test_extended_demux_library_NS30_notUniqGrouped.tsv";
+ actual = "../test/test_demux_trunc_info/trunc40_test_extended_demux_library_NS30_notUniqGrouped.tsv";
+ ifexpected = std::ifstream(expected, std::ios_base::in);
+ ifactual = std::ifstream(actual, std::ios_base::in);
+
+ std::unordered_map> expected_duplicates;
+ std::unordered_map> actual_duplicates;
+
+ while (std::getline(ifexpected, expected_line)) {
+ boost::algorithm::trim(expected_line);
+
+ std::vector split_line;
+ boost::split(split_line, expected_line, boost::is_any_of("\t"));
+
+ std::string seq;
+ for (size_t i = 0; i < split_line.size(); ++i) {
+ if (i == 0) {
+ seq = split_line[i];
+ } else {
+ expected_duplicates[seq].insert(split_line[i]);
+ }
+ }
+ }
+
+ while (std::getline(ifactual, actual_line)) {
+ boost::algorithm::trim(actual_line);
+
+ std::vector split_line;
+ boost::split(split_line, actual_line, boost::is_any_of("\t"));
+
+ std::string seq;
+ for (size_t i = 0; i < split_line.size(); ++i) {
+ if (i == 0) {
+ seq = split_line[i];
+ } else {
+ actual_duplicates[seq].insert(split_line[i]);
+ }
+ }
+ }
+
+ REQUIRE(expected_duplicates == actual_duplicates);
+
+ // test if unique sequences file is identical
+ expected = "../test/expected/test_expected_demux_trunc_info/trunc40_test_extended_demux_library_NS30_uniq.txt";
+ actual = "../test/test_demux_trunc_info/trunc40_test_extended_demux_library_NS30_uniq.txt";
+ ifexpected = std::ifstream(expected, std::ios_base::in);
+ ifactual = std::ifstream(actual, std::ios_base::in);
+
+ // construct set of lines for each actual demux output
+ actual_line_set = std::unordered_set();
+ while (!ifactual.eof())
+ {
+ std::getline(ifactual, actual_line);
+ actual_line_set.insert(actual_line);
+ }
+
+ while (!ifexpected.eof())
+ {
+ std::getline(ifexpected, expected_line);
+ REQUIRE(
+ actual_line_set.find(expected_line) != actual_line_set.end()
+ );
+ }
+
+ // test if truncated fasta output is identical
+ fasta_parser fp;
+ std::vector expected_sequences;
+ std::vector actual_sequences;
+ expected_sequences = fp.parse( "../test/expected/test_expected_demux_trunc_info/trunc40_test_extended_demux_library_NS30.fna" );
+ actual_sequences = fp.parse( "../test/test_demux_trunc_info/trunc40_test_extended_demux_library_NS30.fna" );
+ REQUIRE(expected_sequences==actual_sequences);
+ }
+}
+
+TEST_CASE("Demux outputs unmapped reads file", "[module_demux]")
+{
+ // run demux using a real-world dataset (31250 records) with diagnostic output
+ module_demux d_mod;
+ options_demux d_opts;
+
+ d_opts.input_r1_fname = std::string("../test/input_data/test_input_r1_NS30.fastq");
+ d_opts.input_r2_fname = std::string("../test/input_data/test_input_r2_NS30.fastq");
+ d_opts.index_fname = std::string("../test/input_data/test_barcodes.fa");
+ d_opts.library_fname = std::string("../test/input_data/test_extended_demux_library_NS30.fna");
+ d_opts.samplelist_fname = std::string("../test/input_data/test_samplelist_NS30.tsv");
+ d_opts.flexible_idx_fname = std::string("");
+ d_opts.output_fname = std::string("../test/test_actual_trunc_demux_output.tsv");
+ d_opts.diagnostic_fname = std::string("../test/test_actual_trunc_diagnostic_output.tsv");
+ d_opts.read_per_loop = 80000;
+ d_opts.aggregate_fname = std::string();
+ d_opts.concatemer = std::string();
+ d_opts.min_phred_score = 0;
+ d_opts.phred_base = 33;
+ d_opts.samplename = "SampleName";
+ d_opts.indexes = "Index1,Index2";
+ d_opts.sample_indexes = { "Index1", "Index2" };
+ d_opts.set_info(&options_demux::index1_data, "12,10,1");
+ d_opts.set_info(&options_demux::index2_data, "0,8,1");
+ d_opts.set_info(&options_demux::seq_data, "41,40,2");
+ d_opts.unmapped_reads_fname = "../test/test_unmapped_reads.fastq";
+ d_opts.num_threads = 1;
+ d_mod.run(&d_opts);
+
+ std::string expected = "../test/expected/test_expected_demux_unmapped_reads.fastq";
+ std::string actual = "../test/test_unmapped_reads.fastq";
+ std::ifstream ifexpected(expected, std::ios_base::in);
+ std::ifstream ifactual(actual, std::ios_base::in);
+ std::string expected_line;
+ std::string actual_line;
+
+ std::unordered_set actual_line_set;
+ while (!ifactual.eof())
+ {
+ std::getline(ifactual, actual_line);
+ actual_line_set.insert(actual_line);
+ }
+
+ while (!ifexpected.eof())
+ {
+ std::getline(ifexpected, expected_line);
+ REQUIRE(
+ actual_line_set.find(expected_line) != actual_line_set.end()
+ );
}
}