diff --git a/.gitignore b/.gitignore index 3cbf9c0d..11d3337a 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ *~ \#*# +extensions/__pycache__/code.cpython-38.pyc diff --git a/CMakeLists.txt b/CMakeLists.txt index e36222aa..d204441f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -52,13 +52,24 @@ list( APPEND PepSIRF_LINK_LIBS ) if(OpenMP_FOUND) - message( "OpenMP enabled" ) - list( APPEND PepSIRF_LINK_LIBS OpenMP::OpenMP_CXX ) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xpreprocessor -fopenmp") - add_definitions( -DENABLE_OPENMP ) + message("OpenMP enabled") + + if(APPLE) + # Get libomp filepath + execute_process(COMMAND brew --prefix libomp OUTPUT_VARIABLE BREW_PREFIX OUTPUT_STRIP_TRAILING_WHITESPACE) + + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xpreprocessor -fopenmp -I${BREW_PREFIX}/include") + list(APPEND PepSIRF_LINK_LIBS "${BREW_PREFIX}/lib/libomp.dylib") + else() + list(APPEND PepSIRF_LINK_LIBS OpenMP::OpenMP_CXX) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xpreprocessor -fopenmp") + endif() + + # Define OpenMP macro + add_definitions(-DENABLE_OPENMP) else() - message( "WARNING: OpenMP not found, parallelism disabled." ) + message("WARNING: OpenMP not found, parallelism disabled.") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unknown-pragmas -Wno-unused-value") endif() diff --git a/docs/5-changelog.md b/docs/5-changelog.md index 5cfbd939..b7d8fe05 100644 --- a/docs/5-changelog.md +++ b/docs/5-changelog.md @@ -10,10 +10,14 @@ permalink: /changelog/ ## 1.7.0 | 2024-10-3 -Docker: added new feature (Issue #254). Added the ability to run PepSIRF as a Docker image and added a page for instructions. +### Bug Fixes: CMakelists: bug fix (Issue #197). Resolved CMake not locating OpenMP on MacOS. Tutorial for fix added to installation page. +## New Features: + +Docker: added new feature (Issue #254). Added the ability to run PepSIRF as a Docker image and added a page for instructions. + Subjoin: added new feature (Issue #236). 
Added a functionality to the "-i" option in Subjoin to accept a regex pattern instead of a filename which contains sample/peptide names. The sample/peptide names used from the score matrix file will be filtered by whether they contain the regex pattern. Demux: added new feature (Issue #234). Added "--unmapped-reads-output" option to Demux, which writes all reads that have not been mapped to a sample/peptide to the specified filename. diff --git a/docs/Gemfile.lock b/docs/Gemfile.lock index 1eba005c..767c0058 100644 --- a/docs/Gemfile.lock +++ b/docs/Gemfile.lock @@ -1,23 +1,31 @@ GEM remote: https://rubygems.org/ specs: - activesupport (6.0.5) + activesupport (7.1.3.4) + base64 + bigdecimal concurrent-ruby (~> 1.0, >= 1.0.2) - i18n (>= 0.7, < 2) - minitest (~> 5.1) - tzinfo (~> 1.1) - zeitwerk (~> 2.2, >= 2.2.2) + connection_pool (>= 2.2.5) + drb + i18n (>= 1.6, < 2) + minitest (>= 5.1) + mutex_m + tzinfo (~> 2.0) addressable (2.8.0) public_suffix (>= 2.0.2, < 5.0) + base64 (0.2.0) + bigdecimal (3.1.8) coffee-script (2.4.1) coffee-script-source execjs coffee-script-source (1.11.1) colorator (1.1.0) - commonmarker (0.23.9) - concurrent-ruby (1.1.10) + commonmarker (0.23.10) + concurrent-ruby (1.3.3) + connection_pool (2.4.1) dnsruby (1.61.9) simpleidn (~> 0.1) + drb (2.2.1) em-websocket (0.5.3) eventmachine (>= 0.12.9) http_parser.rb (~> 0) @@ -51,12 +59,12 @@ GEM ffi (1.15.5) forwardable-extended (2.6.0) gemoji (3.0.1) - github-pages (226) + github-pages (228) github-pages-health-check (= 1.17.9) - jekyll (= 3.9.2) + jekyll (= 3.9.3) jekyll-avatar (= 0.7.0) jekyll-coffeescript (= 1.1.1) - jekyll-commonmark-ghpages (= 0.2.0) + jekyll-commonmark-ghpages (= 0.4.0) jekyll-default-layout (= 0.1.4) jekyll-feed (= 0.15.1) jekyll-gist (= 1.5.0) @@ -90,10 +98,10 @@ GEM jemoji (= 0.12.0) kramdown (= 2.3.2) kramdown-parser-gfm (= 1.1.0) - liquid (= 4.0.3) + liquid (= 4.0.4) mercenary (~> 0.3) minima (= 2.5.1) - nokogiri (>= 1.13.4, < 2.0) + nokogiri (>= 1.13.6, < 2.0) rouge 
(= 3.26.0) terminal-table (~> 1.4) github-pages-health-check (1.17.9) @@ -106,13 +114,13 @@ GEM activesupport (>= 2) nokogiri (>= 1.4) http_parser.rb (0.8.0) - i18n (0.9.5) + i18n (1.14.5) concurrent-ruby (~> 1.0) - jekyll (3.9.2) + jekyll (3.9.3) addressable (~> 2.4) colorator (~> 1.0) em-websocket (~> 0.5) - i18n (~> 0.7) + i18n (>= 0.7, < 2) jekyll-sass-converter (~> 1.0) jekyll-watch (~> 2.0) kramdown (>= 1.17, < 3) @@ -128,11 +136,11 @@ GEM coffee-script-source (~> 1.11.1) jekyll-commonmark (1.4.0) commonmarker (~> 0.22) - jekyll-commonmark-ghpages (0.2.0) - commonmarker (~> 0.23.4) + jekyll-commonmark-ghpages (0.4.0) + commonmarker (~> 0.23.7) jekyll (~> 3.9.0) jekyll-commonmark (~> 1.4.0) - rouge (>= 2.0, < 4.0) + rouge (>= 2.0, < 5.0) jekyll-default-layout (0.1.4) jekyll (~> 3.0) jekyll-feed (0.15.1) @@ -220,7 +228,7 @@ GEM rexml kramdown-parser-gfm (1.1.0) kramdown (~> 2.0) - liquid (4.0.3) + liquid (4.0.4) listen (3.7.1) rb-fsevent (~> 0.10, >= 0.10.3) rb-inotify (~> 0.9, >= 0.9.10) @@ -230,8 +238,9 @@ GEM jekyll (>= 3.5, < 5.0) jekyll-feed (~> 0.9) jekyll-seo-tag (~> 2.1) - minitest (5.15.0) + minitest (5.23.1) multipart-post (2.1.1) + mutex_m (0.2.0) nokogiri (1.14.3) mini_portile2 (~> 2.8.0) racc (~> 1.4) @@ -245,7 +254,8 @@ GEM rb-fsevent (0.11.1) rb-inotify (0.10.1) ffi (~> 1.0) - rexml (3.2.5) + rexml (3.3.0) + strscan rouge (3.26.0) ruby2_keywords (0.0.5) rubyzip (2.3.2) @@ -260,18 +270,17 @@ GEM faraday (> 0.8, < 2.0) simpleidn (0.2.1) unf (~> 0.1.4) + strscan (3.1.0) terminal-table (1.8.0) unicode-display_width (~> 1.1, >= 1.1.1) - thread_safe (0.3.6) typhoeus (1.4.0) ethon (>= 0.9.0) - tzinfo (1.2.9) - thread_safe (~> 0.1) + tzinfo (2.0.6) + concurrent-ruby (~> 1.0) unf (0.1.4) unf_ext unf_ext (0.0.8.1) unicode-display_width (1.8.0) - zeitwerk (2.5.4) PLATFORMS ruby diff --git a/extensions/e_k_bias.py b/extensions/e_k_bias.py new file mode 100644 index 00000000..e89fceac --- /dev/null +++ b/extensions/e_k_bias.py @@ -0,0 +1,69 @@ +#!/usr/bin/env 
python3 + +import pandas as pd +import argparse +import fastatools as ft +import numpy as np + +def main(): + + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + parser.add_argument('-i', '--fasta-file', help='Directory with enriched petide files for input', required=True) + parser.add_argument('-o', '--output-file', default="e_k_bias_out.tsv", help='Name of .tsv to output file with AA bias data') + + args = parser.parse_args() + + # get proportion of e's and k's for each peptide + e_k_props = get_e_k_props(args.fasta_file) + + # get percentiles + percentile_dict = get_percentiles(e_k_props) + + # create output df + out_data = [(name, round(prop, 3), round(percentile_dict[name], 2)) for name, prop in e_k_props.items()] + + pd.DataFrame(out_data, columns=["CodeName", "e_k_Prop", "e_k_Percentile"]).to_csv(args.output_file, index=False, sep='\t') + + +# get proportion of e's and k's for each peptide +def get_e_k_props(fasta_file)->dict: + e_k_props = dict() + + # get props for peptide file + fasta_dict = ft.read_fasta_dict(fasta_file) + + # iterate through each sequence + for name, seq in fasta_dict.items(): + e_k_count = 0 + + # loop through each AA, get count of e and k + for aa in seq: + if aa.lower() == 'e' or aa.lower() =='k': + e_k_count += 1 + + # add proportion to dict + e_k_props[name] = (e_k_count) / len(seq) + + return e_k_props + +# get percentile of each peptide using its e and k proportion +def get_percentiles(e_k_props)->dict: + # Calculate percentile of each peptide + names = list(e_k_props.keys()) + all_props = np.array(list(e_k_props.values())) + + # Get unique values and array for mapping each original value to its corresponding index in the unique array + unique_props, inverse_indices = np.unique(all_props, return_inverse=True) + + # Calculate percentiles based on the unique props + percentile_ranks = np.linspace(0, 100, len(unique_props)) + + # Map sorted values back to the original corresponding name + 
percentile_dict = {names[i]: percentile_ranks[inverse_indices[i]] for i in range(len(names))} + + return percentile_dict + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/extensions/findEpitopes.py b/extensions/findEpitopes.py new file mode 100644 index 00000000..7d0ed97c --- /dev/null +++ b/extensions/findEpitopes.py @@ -0,0 +1,105 @@ +import os +import matplotlib.pyplot as plt +import numpy as np +import argparse + +def main(): + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + parser.add_argument('-i', '--input-dir', help='Directory with alignment output files and files that contain the mapped location of peptides', required=True) + parser.add_argument('-o', '--output-dir', default="clust_align_visualizations", help='Name of directory to output line plots.') + + args = parser.parse_args() + + directory_path = args.input_dir + alignment_to_use_dict = read_check_align_file(directory_path) + #print(probes_dict) + alignCountsD = process_probes(alignment_to_use_dict, directory_path) + #print(alignCountsD) + + if not os.path.exists(args.output_dir): + os.mkdir(args.output_dir) + create_line_chart(alignCountsD, args.output_dir) + + +def create_line_chart(alignCountsD, out_dir): + for file, pos_dict in alignCountsD.items(): + x = list(pos_dict.keys()) + y = list(pos_dict.values()) + fig, ax = plt.subplots(figsize=(max(x)/10, 10), facecolor='w') + ax.plot(x, y, linestyle='-') + ax.set_xticks(np.arange(min(x), max(x)+5, 5)) + ax.set_xlim(left=min(x)) + ax.set_ylim(bottom=min(y)) + plt.grid() + plt.xlabel("Sequence Position") + plt.ylabel("Count") + plt.title(file) + plt.savefig(os.path.join(out_dir, f"{file.split('_')[-2]}_epitopes_lineplot.png"), dpi=300, bbox_inches='tight') + + +def find_smallest_value_with_substring(data_dict, substring): + # Filter the dictionary to only include items with the specified substring in the key + filtered_dict = {k: v for k, v in data_dict.items() if substring in 
k} + + # If there are no matches, return None + if not filtered_dict: + return None + + # Find the key-value pair with the smallest value + smallest_pair = min(filtered_dict.items(), key=lambda item: item[1]) + + return smallest_pair + + +def read_check_align_file(directory): + data_dict = {} + clusters = set() + + # Construct the full file path + filepath = os.path.join(directory, 'checkAlignLength.out') + # Read the file content + with open(filepath, 'r') as file: + alignedCluster = None + for line in file: + if "mafft" in line: + alignedCluster = line.strip() + clusters.add(line.split('_')[-2]) + elif "Alignment:" in line and alignedCluster: + alignLength = line.replace('Alignment:','').strip() + #print(alignedCluster,alignLength) + data_dict[alignedCluster] = alignLength + # Find alignment with shortest length for each cluster + results = {} + for cluster in clusters: + result = find_smallest_value_with_substring(data_dict, cluster) + results[result[0]] = result[1] + + return results + + +def process_probes(probes_dict, directory_path): + result = {} + + for filename, data in probes_dict.items(): + aligned_probes_file = filename.replace('.fasta', '_probesAligned.txt') + aligned_probes_path = os.path.join(directory_path, aligned_probes_file) + + aligned_length = int(data) + #print(range(0, aligned_length)) + + alignD = {key: 0 for key in range(aligned_length + 1)} + + with open(aligned_probes_path, 'r') as file: + for line_count, line in enumerate(file): + if line_count > 0: + seq_positions = line.split('\t')[-1].split('~') + for pos in seq_positions: + alignD[int(pos)] += 1 + + result[filename] = alignD + + return result + +if __name__ == "__main__": + main() diff --git a/extensions/link_patternedKmers.py b/extensions/link_patternedKmers.py new file mode 100755 index 00000000..fd11d778 --- /dev/null +++ b/extensions/link_patternedKmers.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python + +import argparse, os +import kmertools as kt +import fastatools as ft +import 
inout as io +from collections import defaultdict + +# This script works like the link module in PepSIRF, but implements patterned kmers + +def main(): + + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + reqArgs = parser.add_argument_group('required arguments') + # Note that these arguments are added directly to the new argument group "reqArgs", not "parser" + reqArgs.add_argument('--protein_file', help='Name of fasta file containing target protein sequences.', required=True) + reqArgs.add_argument('--peptide_file', help='Name of fasta file containing aa peptides of interest. These will generally be peptides that are contained in a particular assay.', required=True) + reqArgs.add_argument('--meta', help='Taxonomic information for each protein contained in "--protein_file". Three comma-separated strings should be provided: 1) name of tab-delimited metadata file, 2) header for column containing protein sequence name and 3) header for column containing ID to be used in creating the linkage map.', required=True) + + parser.add_argument('-k', '--kmer_size', default=8, type=int, help='Number of positions at which there must be a match.') + parser.add_argument('-s', '--span', default=10, type=int, help='Number of positions across which the match can span.') + parser.add_argument('-o', '--output', default="link_output.tsv", help='Name for output linkage map.') + + args = parser.parse_args() + + # Generate all patterns and report the number being tested + allPatts = kt.generatePatterns(args.kmer_size, args.span) + print(f"Considering {len(allPatts)} patterns for generating linkage map.\n") + + # Read in metadata + metaF, seqN, metaV = args.meta.split(",") + metaD = io.fileDictHeaderLists(metaF, metaV, seqN) + + # Read in target sequences + prot_fD = ft.read_fasta_dict(args.protein_file) + # Read in peptide sequences + pep_fD = ft.read_fasta_dict(args.peptide_file) + + outScoresD = defaultdict(list) + + # Determine total number of 
categories to process + total = len(metaD) + numProc=0 + + # Step through each metadata category + for each, nameL in metaD.items(): + print(each, len(nameL)) + # Step through each pattern of interest + pepScoreD = defaultdict(int) + for patt in allPatts: + print(patt) + targetKmers = {} + # Step through each sequence for this group and add it's kmers to the dictionary + for name in nameL: + if name in prot_fD: + for every in kt.patternedKmers(prot_fD[name], patt, outType="set", filter=["X", "B", "J", "Z"]): + targetKmers[every] = "" + + this_pepScoreD = {} + # Step through each peptide + for pepN, pepS in pep_fD.items(): + pepKmers = kt.patternedKmers(pepS, patt, outType="set", filter=["X", "B", "J", "Z"]) + thisScore = sum([1 for kmer in pepKmers if kmer in targetKmers]) + this_pepScoreD[pepN] = thisScore + #Update scores, if they're higher than for previous patterns + for pepN, pepScore in this_pepScoreD.items(): + if pepScore > pepScoreD[pepN]: + pepScoreD[pepN] = pepScore + #for peptides with scores>0, add info to outScoresD + for pepN, pepScore in pepScoreD.items(): + if pepScore > 0: + outScoresD[pepN].append(f"{each}:{pepScore}") + + # Increment counter + numProc+=1 + if numProc%1==0: + print(numProc/total*100) + + #Write scores to output file + with open(args.output, "w") as fout: + fout.write("Peptide Name\tLinked Species IDs with counts\n") + for pepN in pep_fD: + if pepN in outScoresD: + outStr = ",".join(outScoresD[pepN]) + fout.write(f"{pepN}\t{outStr}\n") + + +#----------------------End of main() + + + +###------------------------------------->>>> + +if __name__ == "__main__": + main() + diff --git a/extensions/pepCountPoisson.py b/extensions/pepCountPoisson.py new file mode 100755 index 00000000..49c080ca --- /dev/null +++ b/extensions/pepCountPoisson.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python + +import argparse, os, glob +import inout as io +import pandas as pd +from scipy.stats import poisson +pd.options.mode.copy_on_write = True +from 
collections import defaultdict as dd +# import numpy as np +# import inout as io + + +def main(): + + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + reqArgs = parser.add_argument_group('required arguments') + # Note that these arguments are added directly to the new argument group "reqArgs", not "parser" + reqArgs.add_argument('-e', '--enrDir', help='Directory containing enriched peptide files.', required=True) + reqArgs.add_argument('-r', '--protMeta', help='Protein metadata file.', required=True) + reqArgs.add_argument('-p', '--pepMeta', help='Peptide metadata file.', required=True) + reqArgs.add_argument('-o', '--outDir', help='Name/path for directory to place output files within. It will be created if it does not already exist', required=True) + + parser.add_argument('--protNameHeader', default="Sequence", help='Header for protein name in protein metadata file.') + parser.add_argument('--taxHeader', default="MultiProteinCluster", help='Header for taxonomic category name in protein metadata file.') + parser.add_argument('--pepNameHeader', default="CodeName", help='Header for peptide name in peptide metadata file.') + parser.add_argument('--parentSeqHeader', default="ParentSeqName", help='Header for parent sequence name in peptide metadata file.') + parser.add_argument('--enrPepStartStr', default="", help='Expected string at the beginning of all enriched peptide files.') + parser.add_argument('--enrPepExtension', default="_enriched.txt", help='Expected string at the end of all enriched peptide files.') + parser.add_argument('--pepThresh', default=1, type=int, help='Minimum number of enriched peptides for a p-value to be calculated for a given taxonomic group.') + parser.add_argument('--basePval', default=0.01, type=float, help='Base p-value threshold to use for reporting.') + parser.add_argument('--noBonferroni', default=False, action="store_true", help='Use this flag if you do not want to the pvalue threshold to be 
adjusted using the Bonferroni method.') + parser.add_argument('--taxMeta', help='Optional metadata file with information about the taxonomic groups to include in the output files.') + parser.add_argument('--taxMetaKey', default="MultiProteinCluster", help='Header containing taxonomic group name within taxMeta file.') + parser.add_argument('--taxMetaHeaders', default="Species_List,Species_Prop,Genus_List,Genus_Prop", help='Comma-separated list of headers from the taxMeta file to include in the output.') + + parser.add_argument('rest', nargs=argparse.REMAINDER) + + opts = parser.parse_args() + + # Read metadata linking protein targets to tax clusters + prot2taxD = io.fileDictHeader(opts.protMeta, opts.protNameHeader, opts.taxHeader) + + # Read metadata linking peptides to protein targets + pep2protD = io.fileDictHeader(opts.pepMeta, opts.pepNameHeader, opts.parentSeqHeader) + + # If provided, read in taxonomic metadata + if opts.taxMeta: + taxDF = pd.read_csv(opts.taxMeta, header=0, index_col=0, sep="\t") + opts.taxMetaHeaders = opts.taxMetaHeaders.split(",") + + # Create output directory and subdirectories, if they doesn't already exist + io.makeDir(opts.outDir) + io.makeDir(f"{opts.outDir}/sigTaxa") + + + # For each taxonomic groups, calculate the proportion of the total peptides it represents + totalPeps = len(pep2protD) + tax2pepsD = dd(list) + for pepN, protN in pep2protD.items(): + if protN in prot2taxD: + tax2pepsD[prot2taxD[protN]].append(pepN) + + propLibD = {k:len(v)/totalPeps for k,v in tax2pepsD.items()} + + # Grab all of the enriched peptide files and step through them one by one + for eachF in glob.glob(f"{opts.enrDir}/{opts.enrPepStartStr}*{opts.enrPepExtension}"): + print(eachF) + # Read in enriched peptides + enrPL = io.fileList(eachF, header=False) + if enrPL != [" "]: + totalEnrPeps = len(enrPL) + + # Generate dictionary with counts of enriched peptides for each taxonomic group + countD = dd(int) + + # Step through each enriched peptide and add a 
count to the countD + for pn in enrPL: + this_prot = pep2protD[pn] + if this_prot in prot2taxD: + countD[prot2taxD[this_prot]]+=1 + + # Step through each taxonomic groups and calculate probability of observing this number of peptides, given a poisson distribution + pvD = {} + for tn, pc in countD.items(): + if pc >= opts.pepThresh: + pv = poisson.sf(pc, totalEnrPeps*propLibD[tn]) + pvD[tn]=pv + + # Correct p-value threshold, unless otherwise specified + if not opts.noBonferroni and len(pvD)>1: + pval = opts.basePval/len(pvD) + else: + pval = opts.basePval + + sigNames = [(pv, tn) for tn, pv in pvD.items() if pv<=pval] + if len(sigNames) > 0: + with open(f"{opts.outDir}/sigTaxa/{os.path.basename(eachF)[:-4]}.tsv", "w") as fout: + if opts.taxMeta: + extraHeaders = "\t".join(opts.taxMetaHeaders) + fout.write(f"TaxGroup\tNumEnrPeptides\tPoissonProbPvalue\t{extraHeaders}\n") + else: + fout.write("TaxGroup\tNumEnrPeptides\tPoissonProbPvalue\n") + for pv,tn in sorted(sigNames): + if opts.taxMeta: + extraInfo = "\t".join([str(taxDF[hd][tn]) for hd in opts.taxMetaHeaders]) + fout.write(f"{tn}\t{countD[tn]}\t{pv}\t{extraInfo}\n") + else: + fout.write(f"{tn}\t{countD[tn]}\t{pv}\n") + + + +#----------------------End of main() + + + +###------------------------------------->>>> + +if __name__ == "__main__": + main() + diff --git a/include/modules/core/pepsirf_version.h b/include/modules/core/pepsirf_version.h index 45e5100d..0ab5e07e 100644 --- a/include/modules/core/pepsirf_version.h +++ b/include/modules/core/pepsirf_version.h @@ -1,6 +1,6 @@ #ifndef PEPSIRF_VERSION_HH_INCLUDED #define PEPSIRF_VERSION_HH_INCLUDED -#define PEPSIRF_VERSION "1.6.0" +#define PEPSIRF_VERSION "1.7.0" #endif diff --git a/include/modules/core/sequence.h b/include/modules/core/sequence.h index 0b8b9c35..32f1f190 100644 --- a/include/modules/core/sequence.h +++ b/include/modules/core/sequence.h @@ -72,6 +72,14 @@ class sequence **/ bool operator==( const sequence& s ) const; + /** + * Less than 
operator override. + * For two sequences a and b we say a < b iff + * a.seq < b.seq. + * @param s Sequence to compare against. + **/ + bool operator<( const sequence& s ) const; + /** * Get the length of the 'seq' member of this class. diff --git a/include/modules/demux/module_demux.h b/include/modules/demux/module_demux.h index d78c1ee3..0d57132c 100644 --- a/include/modules/demux/module_demux.h +++ b/include/modules/demux/module_demux.h @@ -109,6 +109,22 @@ class module_demux : public module std::vector &lib_seqs ); + /** + * Outputs truncated sequence info for unqiue sequences, non-unqiue + * sequences, and the new fasta-formatted file + * @param seq_length Sequence length specified with the "--seq" option + * @param lib_seqs Value of library sequences received from file + * specified by the "--library" + * @param library_fname Library file name, specified by "--library" + * @param trunc_info_outdir Specified output directory with the + * "--trunc_info_outdir" option + **/ + void output_trunc_info( + std::size_t seq_length, + std::vector lib_seqs, + std::string library_fname, + std::string trunc_info_outdir + ); /** * Method to zero a vector of size_t elements. 
@@ -380,11 +396,32 @@ class module_demux : public module * Creates a single output fastq file containing all of the reads that have not been mapped to a sample/peptide * @param filename file to output to * @param samp_map fastaq output map - * @reads_dup vector of all reads + * @param reads_dup vector of all reads **/ void create_unmapped_reads_file( std::string filename, std::map> samp_map, std::vector reads_dup ); + /** + * Outputs a vector to a file with a delemiter between entries + * @param vector vector to be outputted, make sure the type of its values can use the << operator for output + * @param filename file to output to + * @param delimiter delimiter to output between values + **/ + template + void simple_vector_out(std::vector vector, std::string output_file, std::string delimiter) + { + std::ofstream outfile(output_file, std::ios::out); + for (unsigned int i=0; i #include #include +#include #include #include @@ -67,7 +68,7 @@ void module_deconv::run(options *opts) ); } - auto in_dir_iter = fs_tools::directory_iterator(input_base); + auto in_dir_iter = boost::filesystem::directory_iterator(input_base); for (auto& input_f : boost::make_iterator_range(in_dir_iter, {})) { diff --git a/src/modules/demux/module_demux.cpp b/src/modules/demux/module_demux.cpp index cb5b9867..ee0b7fdd 100644 --- a/src/modules/demux/module_demux.cpp +++ b/src/modules/demux/module_demux.cpp @@ -132,6 +132,11 @@ void module_demux::run( options *opts ) if (seq_length < lib_length) { trunc_lib_seqs(seq_length, library_seqs); + + if(!d_opts->trunc_info_outdir.empty()) + { + output_trunc_info(seq_length, library_seqs, d_opts->library_fname, d_opts->trunc_info_outdir); + } } else if (seq_length > lib_length) { @@ -208,7 +213,7 @@ void module_demux::run( options *opts ) #else Log::error( "A gzipped file was required, but boost zlib is not available." 
- " Please either configure boost with zlib or unzip the file + " Please either configure boost with zlib or unzip the file" " before attempting to demux." ); #endif @@ -1041,18 +1046,11 @@ void module_demux::create_unmapped_reads_file( std::string filename, } } - // delete from reads_dup - // info_str1 << "total reads: "<< reads_dup.size() << "\n"; - // info_str2 << "mapped_reads: "<< to_remove_set.size() << "\n"; - // Log::info(info_str1.str()); - // Log::info(info_str2.str()); reads_dup.erase(std::remove_if(reads_dup.begin(), reads_dup.end(), [&to_remove_set](const fastq_sequence& value) { return to_remove_set.find(value.seq) != to_remove_set.end(); }), reads_dup.end()); - // info_str3 << "unmapped_reads: "<< reads_dup.size() << "\n"; - // Log::info(info_str3.str()); // write to file std::ofstream output( filename, std::ios_base::app ); @@ -1066,3 +1064,72 @@ void module_demux::create_unmapped_reads_file( std::string filename, output.close(); } +void module_demux::output_trunc_info( + std::size_t seq_length, + std::vector lib_seqs, + std::string library_fname, + std::string trunc_info_outdir +) { + boost::filesystem::path outdir_path = trunc_info_outdir; + if (!boost::filesystem::exists(outdir_path)) + { + boost::filesystem::create_directory(outdir_path); + } + + boost::filesystem::path library_path(library_fname); + std::string new_fasta_filename = "trunc" + std::to_string(seq_length) + "_" + library_path.filename().string(); + boost::filesystem::path new_fasta_path = outdir_path / new_fasta_filename; + + std::unordered_map seq_counts; + + std::ofstream outfile(new_fasta_path.string(), std::ios::out); + for (sequence lib_seq : lib_seqs) + { + outfile << ">" + lib_seq.name << std::endl; + outfile << lib_seq.seq << std::endl; + + seq_counts[lib_seq.seq]++; + } + outfile.close(); + + std::unordered_map> duplicates; + std::vector nonunique; + std::vector unique; + + for (sequence lib_seq : lib_seqs) + { + std::string name = lib_seq.name; + 
if(seq_counts[lib_seq.seq] > 1) + { + duplicates[lib_seq.seq].insert(name); + nonunique.emplace_back(name); + } + else + { + unique.emplace_back(name); + } + } + + outfile.open(new_fasta_path.replace_extension("").string() + "_notUniqGrouped.tsv", std::ios::out); + for( const auto& dup : duplicates) + { + outfile << dup.first << "\t"; + std::unordered_set peptides = dup.second; + for( auto it = peptides.begin(); it != peptides.end(); ++it) + { + outfile << *it; + if( std::next(it) != peptides.end()) + { + outfile << '\t'; + } + else + { + outfile << std::endl; + } + } + } + outfile.close(); + + simple_vector_out(unique, new_fasta_path.replace_extension("").string() + "_uniq.txt", "\n"); + simple_vector_out(nonunique, new_fasta_path.replace_extension("").string() + "_notUniq.txt", "\n"); +} \ No newline at end of file diff --git a/src/modules/demux/options_demux.cpp b/src/modules/demux/options_demux.cpp index 03a83c5f..59ed6c47 100644 --- a/src/modules/demux/options_demux.cpp +++ b/src/modules/demux/options_demux.cpp @@ -35,7 +35,8 @@ std::string options_demux::get_arguments() << "--fastq_output " << fastq_out << "\n" << "--logfile " << logfile << "\n" << "--replicate_info " << replicate_info_fname << "\n" - << "--unmapped_reads_output " << unmapped_reads_fname << "\n\n"; + << "--unmapped_reads_output " << unmapped_reads_fname << "\n" + << "--trunc_info_output " << trunc_info_outdir << "\n\n"; return str_stream.str(); } diff --git a/src/modules/demux/options_parser_demux.cpp b/src/modules/demux/options_parser_demux.cpp index eb874284..a55fdeab 100644 --- a/src/modules/demux/options_parser_demux.cpp +++ b/src/modules/demux/options_parser_demux.cpp @@ -186,7 +186,7 @@ bool options_parser_demux::parse(int argc, char ***argv, options *opts) ) ("fastq_output,q", po::value(&opts_demux->fastq_out) ->default_value(""), - "Include this to output sample-level fastq files" + "Include this to output sample-level fastq files\n" ) ("include_toggle", 
po::value(&opts_demux->pos_toggle) ->default_value(true), @@ -217,6 +217,11 @@ bool options_parser_demux::parse(int argc, char ***argv, options *opts) " all of the reads that have not been mapped to a sample/peptide (i.e., all of those that" "would not be included in any of the files created by the -q option).\n" ) + ("trunc_info_output", po::value(&opts_demux->trunc_info_outdir) + ->default_value(""), + "Name of directory to output truncated sequence information. This will include outputs for" + " unqiue sequences, non-unqiue sequences, and the new fasta-formatted file.\n\n" + ) ; po::store(po::command_line_parser(argc, *argv).options(desc).allow_unregistered().run(), vm); diff --git a/test/expected/test_expected_demux_trunc_info/trunc40_test_extended_demux_library_NS30.fna b/test/expected/test_expected_demux_trunc_info/trunc40_test_extended_demux_library_NS30.fna new file mode 100644 index 00000000..fc762444 --- /dev/null +++ b/test/expected/test_expected_demux_trunc_info/trunc40_test_extended_demux_library_NS30.fna @@ -0,0 +1,30 @@ +>NS30_000000-1 +GTACCGATCACCCAGAACCCGGTTGAAAACTACATCGACG +>NS30_000000-2 +GTTCCGATCACCCAGAACCCGGTTGAAAACTACATCGACG +>NS30_000000-3 +GTCCCGATCACCCAGAACCCGGTCGAAAACTACATCGACG +>NS30_000001-1 +GAAGACGAACTGGAAGAAGTTGTTATCGACAAAATGAAAC +>NS30_000001-2 +GAAGATGAACTGGAAGAAGTCGTTATCGACAAAATGAAAC +>NS30_000009-1 +CAGGACATCCACCTGATCGACGACCTGGGCCAGACCCGCA +>NS30_000009-3 +CAGGACATCCACCTGATCGACGACCTGGGCCAGACCCGCA +>NS30_000010-2 +CCGCCGGCTGAAAAAAAAGACCCGTACGCTGACCTGACCT +>NS30_000010-3 +CCGCCGGCTGAAAAAAAAGACCCGTACGCTGACCTGACCT +>NS30_000027-1 +CTGAAAAACCTGCTGAACTCCCGCAAACGTGACCCGCTGT +>NS30_000027-2 +CTGAAAAACCTGCTGAACTCCCGCAAACGTGACCCGCTGT +>NS30_000032-2 +AAAGGCCTGTCTGACGAAGAATACGAAGAATACAAACGCG +>NS30_000032-3 +AAAGGCCTGTCTGACGAAGAATACGAAGAATACAAACGCG +>NS30_000041-1 +ACCAAAGAAGTTATGATCAAAGCTGAAAAAATGGGCAAAT +>NS30_000041-2 +ACCAAAGAAGTTATGATCAAAGCTGAAAAAATGGGCAAAT diff --git 
a/test/expected/test_expected_demux_trunc_info/trunc40_test_extended_demux_library_NS30_notUniq.txt b/test/expected/test_expected_demux_trunc_info/trunc40_test_extended_demux_library_NS30_notUniq.txt new file mode 100644 index 00000000..53812782 --- /dev/null +++ b/test/expected/test_expected_demux_trunc_info/trunc40_test_extended_demux_library_NS30_notUniq.txt @@ -0,0 +1,10 @@ +NS30_000009-1 +NS30_000009-3 +NS30_000010-2 +NS30_000010-3 +NS30_000027-1 +NS30_000027-2 +NS30_000032-2 +NS30_000032-3 +NS30_000041-1 +NS30_000041-2 \ No newline at end of file diff --git a/test/expected/test_expected_demux_trunc_info/trunc40_test_extended_demux_library_NS30_notUniqGrouped.tsv b/test/expected/test_expected_demux_trunc_info/trunc40_test_extended_demux_library_NS30_notUniqGrouped.tsv new file mode 100644 index 00000000..8f9f8a08 --- /dev/null +++ b/test/expected/test_expected_demux_trunc_info/trunc40_test_extended_demux_library_NS30_notUniqGrouped.tsv @@ -0,0 +1,5 @@ +AAAGGCCTGTCTGACGAAGAATACGAAGAATACAAACGCG NS30_000032-2 NS30_000032-3 +CTGAAAAACCTGCTGAACTCCCGCAAACGTGACCCGCTGT NS30_000027-1 NS30_000027-2 +ACCAAAGAAGTTATGATCAAAGCTGAAAAAATGGGCAAAT NS30_000041-1 NS30_000041-2 +CCGCCGGCTGAAAAAAAAGACCCGTACGCTGACCTGACCT NS30_000010-2 NS30_000010-3 +CAGGACATCCACCTGATCGACGACCTGGGCCAGACCCGCA NS30_000009-1 NS30_000009-3 diff --git a/test/expected/test_expected_demux_trunc_info/trunc40_test_extended_demux_library_NS30_uniq.txt b/test/expected/test_expected_demux_trunc_info/trunc40_test_extended_demux_library_NS30_uniq.txt new file mode 100644 index 00000000..2d544711 --- /dev/null +++ b/test/expected/test_expected_demux_trunc_info/trunc40_test_extended_demux_library_NS30_uniq.txt @@ -0,0 +1,5 @@ +NS30_000000-1 +NS30_000000-2 +NS30_000000-3 +NS30_000001-1 +NS30_000001-2 \ No newline at end of file diff --git a/test/pepsirf_test.cpp b/test/pepsirf_test.cpp index 677e4250..0cf81ba0 100644 --- a/test/pepsirf_test.cpp +++ b/test/pepsirf_test.cpp @@ -520,6 +520,7 @@ TEST_CASE("Automatic 
truncation of library sequences", "[module_demux]") d_opts.sample_indexes = { "Index1", "Index2" }; d_opts.set_info(&options_demux::index1_data, "12,10,1"); d_opts.set_info(&options_demux::index2_data, "0,8,1"); + d_opts.num_threads = 1; SECTION("Library sequences are resized to expected size from user") { @@ -576,19 +577,21 @@ TEST_CASE("Automatic truncation of library sequences", "[module_demux]") d_opts.set_info(&options_demux::seq_data, "41,100,2"); REQUIRE_THROWS(d_mod.run(&d_opts)); } - SECTION("Demux outputs unmapped reads file.") + SECTION("Output of truncated sequence information is correct") { d_opts.set_info(&options_demux::seq_data, "41,40,2"); - d_opts.unmapped_reads_fname = "../test/test_unmapped_reads.fastq"; + d_opts.trunc_info_outdir = "../test/test_demux_trunc_info"; d_mod.run(&d_opts); - std::string expected = "../test/expected/test_expected_demux_unmapped_reads.fastq"; - std::string actual = "../test/test_unmapped_reads.fastq"; + // test if not unique sequences file is identical + std::string expected = "../test/expected/test_expected_demux_trunc_info/trunc40_test_extended_demux_library_NS30_notUniq.txt"; + std::string actual = "../test/test_demux_trunc_info/trunc40_test_extended_demux_library_NS30_notUniq.txt"; std::ifstream ifexpected(expected, std::ios_base::in); std::ifstream ifactual(actual, std::ios_base::in); std::string expected_line; std::string actual_line; + // construct set of lines for each actual demux output std::unordered_set<std::string> actual_line_set; while (!ifactual.eof()) { @@ -603,6 +606,131 @@ TEST_CASE("Automatic truncation of library sequences", "[module_demux]") actual_line_set.find(expected_line) != actual_line_set.end() ); } + + // test if not unique grouped file is identical + expected = "../test/expected/test_expected_demux_trunc_info/trunc40_test_extended_demux_library_NS30_notUniqGrouped.tsv"; + actual = "../test/test_demux_trunc_info/trunc40_test_extended_demux_library_NS30_notUniqGrouped.tsv"; + ifexpected = 
std::ifstream(expected, std::ios_base::in); + ifactual = std::ifstream(actual, std::ios_base::in); + + std::unordered_map<std::string, std::unordered_set<std::string>> expected_duplicates; + std::unordered_map<std::string, std::unordered_set<std::string>> actual_duplicates; + + while (std::getline(ifexpected, expected_line)) { + boost::algorithm::trim(expected_line); + + std::vector<std::string> split_line; + boost::split(split_line, expected_line, boost::is_any_of("\t")); + + std::string seq; + for (size_t i = 0; i < split_line.size(); ++i) { + if (i == 0) { + seq = split_line[i]; + } else { + expected_duplicates[seq].insert(split_line[i]); + } + } + } + + while (std::getline(ifactual, actual_line)) { + boost::algorithm::trim(actual_line); + + std::vector<std::string> split_line; + boost::split(split_line, actual_line, boost::is_any_of("\t")); + + std::string seq; + for (size_t i = 0; i < split_line.size(); ++i) { + if (i == 0) { + seq = split_line[i]; + } else { + actual_duplicates[seq].insert(split_line[i]); + } + } + } + + REQUIRE(expected_duplicates == actual_duplicates); + + // test if unique sequences file is identical + expected = "../test/expected/test_expected_demux_trunc_info/trunc40_test_extended_demux_library_NS30_uniq.txt"; + actual = "../test/test_demux_trunc_info/trunc40_test_extended_demux_library_NS30_uniq.txt"; + ifexpected = std::ifstream(expected, std::ios_base::in); + ifactual = std::ifstream(actual, std::ios_base::in); + + // construct set of lines for each actual demux output + actual_line_set = std::unordered_set<std::string>(); + while (!ifactual.eof()) + { + std::getline(ifactual, actual_line); + actual_line_set.insert(actual_line); + } + + while (!ifexpected.eof()) + { + std::getline(ifexpected, expected_line); + REQUIRE( + actual_line_set.find(expected_line) != actual_line_set.end() + ); + } + + // test if truncated fasta output is identical + fasta_parser fp; + std::vector<sequence> expected_sequences; + std::vector<sequence> actual_sequences; + expected_sequences = fp.parse( "../test/expected/test_expected_demux_trunc_info/trunc40_test_extended_demux_library_NS30.fna" ); + 
actual_sequences = fp.parse( "../test/test_demux_trunc_info/trunc40_test_extended_demux_library_NS30.fna" ); + REQUIRE(expected_sequences==actual_sequences); + } +} + +TEST_CASE("Demux outputs unmapped reads file", "[module_demux]") +{ + // run demux using a real-world dataset (31250 records) with diagnostic output + module_demux d_mod; + options_demux d_opts; + + d_opts.input_r1_fname = std::string("../test/input_data/test_input_r1_NS30.fastq"); + d_opts.input_r2_fname = std::string("../test/input_data/test_input_r2_NS30.fastq"); + d_opts.index_fname = std::string("../test/input_data/test_barcodes.fa"); + d_opts.library_fname = std::string("../test/input_data/test_extended_demux_library_NS30.fna"); + d_opts.samplelist_fname = std::string("../test/input_data/test_samplelist_NS30.tsv"); + d_opts.flexible_idx_fname = std::string(""); + d_opts.output_fname = std::string("../test/test_actual_trunc_demux_output.tsv"); + d_opts.diagnostic_fname = std::string("../test/test_actual_trunc_diagnostic_output.tsv"); + d_opts.read_per_loop = 80000; + d_opts.aggregate_fname = std::string(); + d_opts.concatemer = std::string(); + d_opts.min_phred_score = 0; + d_opts.phred_base = 33; + d_opts.samplename = "SampleName"; + d_opts.indexes = "Index1,Index2"; + d_opts.sample_indexes = { "Index1", "Index2" }; + d_opts.set_info(&options_demux::index1_data, "12,10,1"); + d_opts.set_info(&options_demux::index2_data, "0,8,1"); + d_opts.set_info(&options_demux::seq_data, "41,40,2"); + d_opts.unmapped_reads_fname = "../test/test_unmapped_reads.fastq"; + d_opts.num_threads = 1; + d_mod.run(&d_opts); + + std::string expected = "../test/expected/test_expected_demux_unmapped_reads.fastq"; + std::string actual = "../test/test_unmapped_reads.fastq"; + std::ifstream ifexpected(expected, std::ios_base::in); + std::ifstream ifactual(actual, std::ios_base::in); + std::string expected_line; + std::string actual_line; + + std::unordered_set<std::string> actual_line_set; + while (!ifactual.eof()) + { + 
std::getline(ifactual, actual_line); + actual_line_set.insert(actual_line); + } + + while (!ifexpected.eof()) + { + std::getline(ifexpected, expected_line); + REQUIRE( + actual_line_set.find(expected_line) != actual_line_set.end() + ); } }