diff --git a/conf/modules.config b/conf/modules.config index 0ec7ef2..9301b85 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -225,7 +225,7 @@ if(params.run_trim_galore_fastqc && !params.skip_fastqc) { if(params.run_trim_galore_fastqc && !params.skip_trimming) { process { withName: 'CLIPSEQ:FASTQC_TRIMGALORE:TRIMGALORE' { - ext.args = "--fastqc --length ${params.trim_length} -q 20" + ext.args = "${params.trimgalore_params}" publishDir = [ [ path: { "${params.outdir}/01_prealign/post_trim_fastqc" }, @@ -279,6 +279,31 @@ if(params.run_alignment) { ] } + withName: 'CLIPSEQ:RNA_ALIGN:BOWTIE_ALIGN_K1' { + ext.args = { "-v 2 -m 100 --norc --best --strata -k 1" } + ext.prefix = { "${meta.id}_withK1" } + publishDir = [ + [ + path: { "${params.outdir}/02_alignment/smrna" }, + mode: "${params.publish_dir_mode}", + pattern: '*.out', + enabled: true + ], + [ + path: { "${params.outdir}/02_alignment/smrna" }, + mode: "${params.publish_dir_mode}", + pattern: '*.bam', + enabled: false + ], + [ + path: { "${params.outdir}/02_alignment/smrna/unmapped" }, + mode: "${params.publish_dir_mode}", + pattern: '*.fastq.gz', + enabled: false + ] + ] + } + withName: 'CLIPSEQ:RNA_ALIGN:SAMTOOLS_SORT_SMRNA' { ext.prefix = { "${meta.id}_sorted" } publishDir = [ @@ -297,9 +322,22 @@ if(params.run_alignment) { ] } + withName: 'CLIPSEQ:RNA_ALIGN:SAMTOOLS_SORT_SMRNA_K1' { + publishDir = [ + enabled: false + ] + } + + withName: 'CLIPSEQ:RNA_ALIGN:SAMTOOLS_INDEX_SMRNA_K1' { + publishDir = [ + enabled: false + ] + } + withName: 'CLIPSEQ:RNA_ALIGN:STAR_ALIGN' { - ext.args = { "--readFilesCommand zcat --outSAMtype BAM SortedByCoordinate --quantMode TranscriptomeSAM ${params.star_params}" } + ext.args = { "--readFilesCommand zcat --outSAMtype BAM Unsorted --quantMode TranscriptomeSAM ${params.star_params}" } + ext.prefix = { "${meta.id}_multi" } publishDir = [ [ path: { "${params.outdir}/02_alignment/genome/log" }, @@ -321,7 +359,7 @@ if(params.run_alignment) { ] } - withName: 'CLIPSEQ:RNA_ALIGN:SAMTOOLS_INDEX_GENOME' { + withName: 'CLIPSEQ:RNA_ALIGN:SAMTOOLS_INDEX_TRANSCRIPT' { publishDir = [ path: { "${params.outdir}/02_alignment/genome" }, mode: "${params.publish_dir_mode}", @@ -329,8 +367,16 @@ if(params.run_alignment) { ] } - withName: 'CLIPSEQ:RNA_ALIGN:SAMTOOLS_INDEX_TRANSCRIPT' { - ext.prefix = { "${meta.id}_Aligned.toTranscriptome_sorted.out" } + withName: 'CLIPSEQ:RNA_ALIGN:SAMTOOLS_SORT_GENOME' { + ext.prefix = { "${meta.id}_multi.Aligned.toGenome_sorted.out" } + publishDir = [ + path: { "${params.outdir}/02_alignment/genome" }, + mode: "${params.publish_dir_mode}", + enabled: params.save_align_intermed + ] + } + + withName: 'CLIPSEQ:RNA_ALIGN:SAMTOOLS_INDEX_GENOME' { publishDir = [ path: { "${params.outdir}/02_alignment/genome" }, mode: "${params.publish_dir_mode}", @@ -339,7 +385,7 @@ if(params.run_alignment) { } withName: 'CLIPSEQ:RNA_ALIGN:SAMTOOLS_SORT_TRANSCRIPT' { - ext.prefix = { "${meta.id}_Aligned.toTranscriptome_sorted.out" } + ext.prefix = { "${meta.id}_multi.Aligned.toTranscriptome_sorted.out" } publishDir = [ path: { "${params.outdir}/02_alignment/genome" }, mode: "${params.publish_dir_mode}", @@ -347,7 +393,27 @@ if(params.run_alignment) { ] } + withName: 'CLIPSEQ:RNA_ALIGN:SAMTOOLS_VIEW_GENOME' { + ext.prefix = { "${meta.id}_unique_genome" } + ext.args = "-q 5 --output-fmt bam --write-index" + ext.index_type = "bai" + publishDir = [ + path: { "${params.outdir}/02_alignment/genome" }, + mode: "${params.publish_dir_mode}", + enabled: params.save_align_intermed + ] + } + withName: 'CLIPSEQ:RNA_ALIGN:SAMTOOLS_VIEW_TRANSCRIPT' { + ext.prefix = { "${meta.id}_unique_transcriptome" } + ext.args = "-q 5 --output-fmt bam --write-index" + ext.index_type = "bai" + publishDir = [ + path: { "${params.outdir}/02_alignment/genome" }, + mode: "${params.publish_dir_mode}", + enabled: params.save_align_intermed + ] + } } } @@ -393,9 +459,9 @@ if(params.run_read_filter) { if(params.run_umi_dedup) { process { - withName: 'CLIPSEQ:GENOME_DEDUP:UMICOLLAPSE' { + withName: 'CLIPSEQ:GENOME_UNIQUE_DEDUP:UMICOLLAPSE' { ext.args = { "--umi-sep ${params.umi_separator}" } - ext.prefix = { "${meta.id}.genome.dedup" } + ext.prefix = { "${meta.id}.unique_genome.dedup" } publishDir = [ path: { "${params.outdir}/03_filt_dedup" }, mode: "${params.publish_dir_mode}", @@ -403,7 +469,9 @@ if(params.run_umi_dedup) { ] } - withName: 'CLIPSEQ:GENOME_DEDUP:SAMTOOLS_INDEX' { + withName: 'CLIPSEQ:GENOME_MULTI_DEDUP:UMICOLLAPSE' { + ext.args = { "--umi-sep ${params.umi_separator}" } + ext.prefix = { "${meta.id}.multi_genome.dedup" } publishDir = [ path: { "${params.outdir}/03_filt_dedup" }, mode: "${params.publish_dir_mode}", @@ -411,12 +479,55 @@ if(params.run_umi_dedup) { ] } - withName: 'CLIPSEQ:GENOME_DEDUP:BAM_STATS_SAMTOOLS:.*' { - ext.prefix = { "${meta.id}.genome.dedup" } + withName: 'CLIPSEQ:SMRNA_DEDUP:UMICOLLAPSE' { + ext.args = { "--umi-sep ${params.umi_separator}" } + ext.prefix = { "${meta.id}.smrna.dedup" } publishDir = [ path: { "${params.outdir}/03_filt_dedup" }, mode: "${params.publish_dir_mode}", - pattern: "*.{stats,flagstat,idxstats}" + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'CLIPSEQ:SMRNA_K1_DEDUP:UMICOLLAPSE' { + ext.args = { "--umi-sep ${params.umi_separator}" } + ext.prefix = { "${meta.id}.smrna_withk1.dedup" } + publishDir = [ + path: { "${params.outdir}/03_filt_dedup" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'CLIPSEQ:GENOME_UNIQUE_DEDUP:SAMTOOLS_INDEX' { + publishDir = [ + path: { "${params.outdir}/03_filt_dedup" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'CLIPSEQ:GENOME_MULTI_DEDUP:SAMTOOLS_INDEX' { + publishDir = [ + path: { "${params.outdir}/03_filt_dedup" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'CLIPSEQ:SMRNA_DEDUP:SAMTOOLS_INDEX' { + publishDir = [ + path: { "${params.outdir}/03_filt_dedup" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'CLIPSEQ:SMRNA_K1_DEDUP:SAMTOOLS_INDEX' { + publishDir = [ + path: { "${params.outdir}/03_filt_dedup" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } @@ -569,6 +680,37 @@ if(params.run_calc_crosslinks) { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } + withName: 'CLIPSEQ:CALC_SMRNA_K1_CROSSLINKS:MERGE_AND_SORT' { + ext.cmd1 = 'sort -k1,1 -k2,2n' + ext.suffix = '.smrna_withk1' + ext.ext = 'bed' + publishDir = [ + path: { "${params.outdir}/04_crosslinks" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'CLIPSEQ:CALC_SMRNA_K1_CROSSLINKS:CROSSLINK_COVERAGE' { + ext.cmd1 = 'awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n' + ext.suffix = '.smrna_withk1' + ext.ext = 'bedgraph' + publishDir = [ + path: { "${params.outdir}/04_crosslinks" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'CLIPSEQ:CALC_SMRNA_K1_CROSSLINKS:CROSSLINK_NORMCOVERAGE' { + ext.cmd1 = 'awk -v total=\$CMD2 \'{printf "%s\\t%i\\t%i\\t%s\\t%f\\t%s\\n", \$1, \$2, \$3, \$4, 1000000*\$5/total, \$6}\' | awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n' + ext.cmd2 = 'awk \'BEGIN {total=0} {total=total+\$5} END {print total}\'' + ext.suffix = '.norm.smrna_withk1' + ext.ext = 'bedgraph' + publishDir = [ + enabled: false + ] + } } } @@ -670,8 +812,13 @@ if(params.run_peak_calling) { ] } - withName: 'CLIPSEQ:ICOUNT_ANALYSE:ICOUNT_SUMMARY' { + publishDir = [ + enabled: false + ] + } + + withName: 'CLIPSEQ:ICOUNT_ANALYSE:MERGE_SUMMARY' { publishDir = [ path: { "${params.outdir}/05_peak_calling/icount" }, mode: "${params.publish_dir_mode}", @@ -689,7 +836,10 @@ if(params.run_peak_calling) { withName: 'CLIPSEQ:ICOUNT_ANALYSE:ICOUNT_SIGXLS' { publishDir = [ - enabled: false + path: { "${params.outdir}/05_peak_calling/icount" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + pattern: "*.scores.tsv" ] } diff --git a/main.nf b/main.nf index df58fce..7d32840 100644 --- a/main.nf +++ b/main.nf @@ -125,8 +125,12 @@ include { PREPARE_CLIPSEQ } from './subwor include { PARSE_FASTQ_INPUT } from './subworkflows/goodwright/parse_fastq_input/main' include { FASTQC_TRIMGALORE } from './subworkflows/goodwright/fastqc_trimgalore/main' include { RNA_ALIGN } from './subworkflows/goodwright/rna_align/main' -include { BAM_DEDUP_SAMTOOLS_UMITOOLS as GENOME_DEDUP } from './subworkflows/goodwright/bam_dedup_samtools_umitools/main' +include { BAM_DEDUP_SAMTOOLS_UMITOOLS as GENOME_UNIQUE_DEDUP } from './subworkflows/goodwright/bam_dedup_samtools_umitools/main' +include { BAM_DEDUP_SAMTOOLS_UMITOOLS as GENOME_MULTI_DEDUP } from './subworkflows/goodwright/bam_dedup_samtools_umitools/main' +include { BAM_DEDUP_SAMTOOLS_UMITOOLS as SMRNA_DEDUP } from './subworkflows/goodwright/bam_dedup_samtools_umitools/main' +include { BAM_DEDUP_SAMTOOLS_UMITOOLS as SMRNA_K1_DEDUP } from './subworkflows/goodwright/bam_dedup_samtools_umitools/main' include { BAM_DEDUP_SAMTOOLS_UMITOOLS as TRANSCRIPT_DEDUP } from './subworkflows/goodwright/bam_dedup_samtools_umitools/main' +include { CLIP_CALC_CROSSLINKS as CALC_SMRNA_K1_CROSSLINKS } from './subworkflows/goodwright/clip_calc_crosslinks/main' include { CLIP_CALC_CROSSLINKS as CALC_GENOME_CROSSLINKS } from './subworkflows/goodwright/clip_calc_crosslinks/main' include { CLIP_CALC_CROSSLINKS as CALC_TRANSCRIPT_CROSSLINKS } from './subworkflows/goodwright/clip_calc_crosslinks/main' include { PARACLU_ANALYSE as PARACLU_ANALYSE_GENOME } from './subworkflows/goodwright/paraclu_analyse/main' @@ -315,13 +319,19 @@ workflow CLIPSEQ { ch_filtered_gtf, ch_fasta ) - ch_versions = ch_versions.mix(RNA_ALIGN.out.versions) - ch_genome_bam = RNA_ALIGN.out.genome_bam - ch_genome_bai = RNA_ALIGN.out.genome_bai - ch_transcript_bam = RNA_ALIGN.out.transcript_bam - ch_transcript_bai = RNA_ALIGN.out.transcript_bai - ch_bt_log = RNA_ALIGN.out.bt_log - ch_star_log = RNA_ALIGN.out.star_log_final + ch_versions = ch_versions.mix(RNA_ALIGN.out.versions) + ch_genome_unique_bam = RNA_ALIGN.out.genome_unique_bam + ch_genome_unique_bai = RNA_ALIGN.out.genome_unique_bai + ch_genome_multi_bam = RNA_ALIGN.out.genome_multi_bam + ch_genome_multi_bai = RNA_ALIGN.out.genome_multi_bai + ch_smrna_bam = RNA_ALIGN.out.smrna_bam + ch_smrna_bai = RNA_ALIGN.out.smrna_bai + ch_smrna_k1_bam = RNA_ALIGN.out.smrna_k1_bam + ch_smrna_k1_bai = RNA_ALIGN.out.smrna_k1_bai + ch_transcript_bam = RNA_ALIGN.out.transcript_bam + ch_transcript_bai = RNA_ALIGN.out.transcript_bai + ch_bt_log = RNA_ALIGN.out.bt_log + ch_star_log = RNA_ALIGN.out.star_log_final } if(params.run_read_filter) { @@ -360,9 +370,24 @@ workflow CLIPSEQ { /* * CHANNEL: Combine bam and bai files on id */ - ch_genome_bam_bai = ch_genome_bam + ch_genome_unique_bam_bai = ch_genome_unique_bam .map { row -> [row[0].id, row ].flatten()} - .join ( ch_genome_bai.map { row -> [row[0].id, row ].flatten()} ) + .join ( ch_genome_unique_bai.map { row -> [row[0].id, row ].flatten()} ) + .map { row -> [row[1], row[2], row[4]] } + + ch_genome_multi_bam_bai = ch_genome_multi_bam + .map { row -> [row[0].id, row ].flatten()} + .join ( ch_genome_multi_bai.map { row -> [row[0].id, row ].flatten()} ) + .map { row -> [row[1], row[2], row[4]] } + + ch_smrna_bam_bai = ch_smrna_bam + .map { row -> [row[0].id, row ].flatten()} + .join ( ch_smrna_bai.map { row -> [row[0].id, row ].flatten()} ) + .map { row -> [row[1], row[2], row[4]] } + + ch_smrna_k1_bam_bai = ch_smrna_k1_bam + .map { row -> [row[0].id, row ].flatten()} + .join ( ch_smrna_k1_bai.map { row -> [row[0].id, row ].flatten()} ) .map { row -> [row[1], row[2], row[4]] } ch_transcript_bam_bai = ch_transcript_bam @@ -373,13 +398,31 @@ workflow CLIPSEQ { /* * SUBWORKFLOW: Run umi deduplication on genome-level alignments */ - GENOME_DEDUP ( - ch_genome_bam_bai + GENOME_UNIQUE_DEDUP ( + ch_genome_unique_bam_bai ) - ch_versions = ch_versions.mix(GENOME_DEDUP.out.versions) - ch_genome_bam = GENOME_DEDUP.out.bam - ch_genome_bai = GENOME_DEDUP.out.bai - ch_umi_log = GENOME_DEDUP.out.umi_log + ch_versions = ch_versions.mix(GENOME_UNIQUE_DEDUP.out.versions) + ch_genome_bam = GENOME_UNIQUE_DEDUP.out.bam + ch_genome_bai = GENOME_UNIQUE_DEDUP.out.bai + ch_umi_log = GENOME_UNIQUE_DEDUP.out.umi_log + + GENOME_MULTI_DEDUP ( + ch_genome_multi_bam_bai + ) + ch_versions = ch_versions.mix(GENOME_MULTI_DEDUP.out.versions) + + SMRNA_DEDUP ( + ch_smrna_bam_bai + ) + ch_versions = ch_versions.mix(SMRNA_DEDUP.out.versions) + + SMRNA_K1_DEDUP ( + ch_smrna_k1_bam_bai + ) + ch_versions = ch_versions.mix(SMRNA_K1_DEDUP.out.versions) + ch_smrna_k1_bam = SMRNA_K1_DEDUP.out.bam + ch_smrna_k1_bai = SMRNA_K1_DEDUP.out.bai + ch_umi_log = SMRNA_K1_DEDUP.out.umi_log /* * SUBWORKFLOW: Run umi deduplication on transcript-level alignments @@ -399,6 +442,18 @@ workflow CLIPSEQ { ch_trans_crosslink_coverage = Channel.empty() ch_trans_crosslink_coverage_norm = Channel.empty() if(params.run_calc_crosslinks) { + /* + * SUBWORKFLOW: Run crosslink calculation for smRNA with -k 1 + */ + CALC_SMRNA_K1_CROSSLINKS ( + ch_smrna_k1_bam, + ch_smrna_fasta_fai.collect{ it[1] } + ) + ch_versions = ch_versions.mix(CALC_SMRNA_K1_CROSSLINKS.out.versions) + ch_smrna_crosslink_bed = CALC_SMRNA_K1_CROSSLINKS.out.bed + ch_smrna_crosslink_coverage = CALC_SMRNA_K1_CROSSLINKS.out.coverage + ch_smrna_crosslink_coverage_norm = CALC_SMRNA_K1_CROSSLINKS.out.coverage_norm + /* * SUBWORKFLOW: Run crosslink calculation for genome */ @@ -463,6 +518,7 @@ workflow CLIPSEQ { * SUBWORKFLOW: Run iCount on genome-level crosslinks */ ICOUNT_ANALYSE ( + ch_smrna_crosslink_bed, ch_genome_crosslink_bed, ch_regions_resolved_gtf.collect{ it[1] }, ch_seg_resolved_gtf.collect{ it[1] }, diff --git a/modules.json b/modules.json index 54d14a8..3519b29 100644 --- a/modules.json +++ b/modules.json @@ -176,6 +176,11 @@ "git_sha": "735e1e04e7e01751d2d6e97055bbdb6f70683cc1", "installed_by": ["bam_stats_samtools"] }, + "samtools/view": { + "branch": "master", + "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", + "installed_by": ["modules"] + }, "star/align": { "branch": "master", "git_sha": "cc08a888069f67cab8120259bddab8032d4c0fe3", @@ -191,6 +196,11 @@ "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, + "umicollapse": { + "branch": "master", + "git_sha": "b97197968ac12dde2463fa54541f6350c46f2035", + "installed_by": ["modules"] + }, "umitools/extract": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", diff --git a/modules/goodwright/peka/main.nf b/modules/goodwright/peka/main.nf index 82c833f..127274d 100644 --- a/modules/goodwright/peka/main.nf +++ b/modules/goodwright/peka/main.nf @@ -16,11 +16,14 @@ process PEKA { path gtf output: - tuple val(meta), path("*mer_cluster_distribution*"), emit: cluster, optional: true - tuple val(meta), path("*mer_distribution*") , emit: distribution, optional: true - tuple val(meta), path("*rtxn*") , emit: rtxn, optional: true - tuple val(meta), path("*.pdf") , emit: pdf, optional: true - path "versions.yml" , emit: versions + tuple val(meta), path("*mer_cluster_distribution*") , emit: cluster, optional: true + tuple val(meta), path("*mer_distribution*") , emit: distribution, optional: true + tuple val(meta), path("*rtxn*") , emit: rtxn, optional: true + tuple val(meta), path("*.pdf") , emit: pdf, optional: true + tuple val(meta), path("*thresholded_sites*.bed.gz") , emit: tsites, optional: true + tuple val(meta), path("*oxn*.bed.gz") , emit: oxn, optional: true + tuple val(meta), path("*_clusters.csv") , emit: clust, optional: true + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when diff --git a/modules/goodwright/umicollapse/main.nf b/modules/goodwright/umicollapse/main.nf index 355dcb8..1090815 100644 --- a/modules/goodwright/umicollapse/main.nf +++ b/modules/goodwright/umicollapse/main.nf @@ -1,6 +1,6 @@ process UMICOLLAPSE { tag "$meta.id" - label "process_high" + label "process_medium" container 'docker.io/elly1502/umicollapse:latest' diff --git a/modules/local/clipseq/merge_summary/main.nf b/modules/local/clipseq/merge_summary/main.nf new file mode 100644 index 0000000..966e244 --- /dev/null +++ b/modules/local/clipseq/merge_summary/main.nf @@ -0,0 +1,29 @@ +process MERGE_SUMMARY { + tag "$gtf" + label "process_single" + + conda "conda-forge::pandas=1.4.3" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/pandas:1.4.3': + 'biocontainers/pandas:1.4.3' }" + + input: + tuple val(meta), path(summary_type) + tuple val(meta), path(summary_subtype) + tuple val(meta), path(summary_gene) + tuple val(meta), path(smrna_premapped_k1_cDNA) + + output: + tuple val(meta), path("*summary_type_premapadjusted.tsv") , emit: summary_type_adjusted + tuple val(meta), path("*summary_subtype_premapadjusted.tsv"), emit: summary_subtype_adjusted + tuple val(meta), path("*summary_gene_premapadjusted.tsv") , emit: summary_gene_adjusted + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + shell: + process_name = task.process + template 'merge_summary.py' +} + diff --git a/modules/local/clipseq/merge_summary/meta.yml b/modules/local/clipseq/merge_summary/meta.yml new file mode 100644 index 0000000..cdbb89e --- /dev/null +++ b/modules/local/clipseq/merge_summary/meta.yml @@ -0,0 +1,52 @@ +name: merge_summary +description: Merge results of pre-mapping with results of iCount summary +tools: + - pandas: + description: | + Flexible and powerful data analysis / manipulation library for Python, + providing labeled data structures similar to R data.frame objects, + statistical functions, and much more. + homepage: https://pandas.pydata.org/ + documentation: https://pandas.pydata.org/docs/ + licence: ["BSD-3"] +input: + - summary_type: + type: file + description: Output from iCount Summary + pattern: "*.tsv" + - summary_subtype: + type: file + description: Output from iCount Summary + pattern: "*.tsv" + - summary_gene: + type: file + description: Output from iCount Summary + pattern: "*.tsv" + - smrna_premapped_k1_cDNA: + type: file + description: smRNA premapped k1 cDNA (deduplicated) bed file + pattern: "*.bed" + - smrna_premapped_k1_reads_log: + type: file + description: smRNA premapped k1 reads bowtie log to get read number before deduplication + pattern: "*.out" + +output: + - summary_type_adjusted: + type: file + description: Output from iCount Summary adjusted with pre-mapping + pattern: "*.tsv" + - summary_subtype_adjusted: + type: file + description: Output from iCount Summary adjusted with pre-mapping + pattern: "*.tsv" + - summary_gene_adjusted: + type: file + description: Output from iCount Summary adjusted with pre-mapping + pattern: "*.tsv" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@charlotteanne" diff --git a/modules/local/clipseq/merge_summary/templates/merge_summary.py b/modules/local/clipseq/merge_summary/templates/merge_summary.py new file mode 100644 index 0000000..2cc032a --- /dev/null +++ b/modules/local/clipseq/merge_summary/templates/merge_summary.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 + +"""Merge pre-mapped results with genome-computed iCount Summary""" + +import platform +import argparse +from sys import exit +import pandas as pd +import os + + +def dump_versions(process_name): + with open("versions.yml", "w") as out_f: + out_f.write(process_name + ":\n") + out_f.write(" python: " + platform.python_version() + "\n") + out_f.write(" pandas: " + pd.__version__ + "\n") + +def extract_cdna_from_bed(file_path): + # Read the file into a DataFrame + df = pd.read_csv(file_path, sep='\t', header=None) + # Sum the values in the 5th column (index 4) + total_sum = df[4].sum() + return total_sum + +def adjust_summary_file(file_path, number_cdnas_premapped): + # Read the file into a DataFrame + df = pd.read_csv(file_path, sep='\t', header=0) + # Add the new values + new_row = ["premapped rRNA_tRNA", "NA", number_cdnas_premapped, 0] + # Append the new row using loc indexer + df.loc[len(df)] = new_row + # Correct the percentages + # Calculate the total cDNA # + total_cDNA = df['cDNA #'].sum() + # Update the cDNA % column + df['cDNA %'] = (df['cDNA #'] / total_cDNA) * 100 + # Create the output file name + base_name, extension = os.path.splitext(os.path.basename(file_path)) + output_file = base_name + "_premapadjusted" + extension + print(f"Saving to: {output_file}") # Debugging + # Write the updated DataFrame to the new file + df.to_csv(output_file, sep='\t', index=False) + +def main(processname, subtype, type, gene, cdna): + # Dump version file + dump_versions(processname) + + # Get number of cDNAs + number_cdnas_premapped = extract_cdna_from_bed(cdna) + print("Number of cDNAs premapped: " + str(number_cdnas_premapped)) + + adjust_summary_file(type, number_cdnas_premapped) + adjust_summary_file(subtype, number_cdnas_premapped) + adjust_summary_file(gene, number_cdnas_premapped) + +if __name__ == "__main__": + # Allows switching between nextflow templating and standalone python running using arguments + parser = argparse.ArgumentParser() + parser.add_argument("--processname", default="!{process_name}") + parser.add_argument("--subtype", default="!{summary_subtype}") + parser.add_argument("--type", default="!{summary_type}") + parser.add_argument("--gene", default="!{summary_gene}") + parser.add_argument("--cdna", default="!{smrna_premapped_k1_cDNA}") + args = parser.parse_args() + + main(args.processname, args.subtype, args.type, args.gene, args.cdna) diff --git a/modules/nf-core/samtools/view/environment.yml b/modules/nf-core/samtools/view/environment.yml new file mode 100644 index 0000000..04c82f1 --- /dev/null +++ b/modules/nf-core/samtools/view/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.17 diff --git a/modules/nf-core/samtools/view/main.nf b/modules/nf-core/samtools/view/main.nf new file mode 100644 index 0000000..a41b876 --- /dev/null +++ b/modules/nf-core/samtools/view/main.nf @@ -0,0 +1,69 @@ +process SAMTOOLS_VIEW { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(input), path(index) + tuple val(meta2), path(fasta) + path qname + + output: + tuple val(meta), path("*.bam"), emit: bam, optional: true + tuple val(meta), path("*.cram"), emit: cram, optional: true + tuple val(meta), path("*.sam"), emit: sam, optional: true + tuple val(meta), path("*.bai"), emit: bai, optional: true + tuple val(meta), path("*.csi"), emit: csi, optional: true + tuple val(meta), path("*.crai"), emit: crai, optional: true + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def reference = fasta ? "--reference ${fasta}" : "" + def readnames = qname ? "--qname-file ${qname}": "" + def file_type = args.contains("--output-fmt sam") ? "sam" : + args.contains("--output-fmt bam") ? "bam" : + args.contains("--output-fmt cram") ? "cram" : + input.getExtension() + def index_type = task.ext.index_type ?: '' + def output_name = args.contains("--write-index") ? "${prefix}.${file_type}##idx##${prefix}.${file_type}.${index_type}" : + "${prefix}.${file_type}" + if ("$input" == "${prefix}.${file_type}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + samtools \\ + view \\ + --threads ${task.cpus-1} \\ + ${reference} \\ + ${readnames} \\ + $args \\ + -o ${output_name}\\ + $input \\ + $args2 + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.bam + touch ${prefix}.cram + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/view/meta.yml b/modules/nf-core/samtools/view/meta.yml new file mode 100644 index 0000000..3dadafa --- /dev/null +++ b/modules/nf-core/samtools/view/meta.yml @@ -0,0 +1,89 @@ +name: samtools_view +description: filter/convert SAM/BAM/CRAM file +keywords: + - view + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - index: + type: file + description: BAM.BAI/BAM.CSI/CRAM.CRAI file (optional) + pattern: "*.{.bai,.csi,.crai}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - fasta: + type: file + description: Reference file the CRAM was created with (optional) + pattern: "*.{fasta,fa}" + - qname: + type: file + description: Optional file with read names to output only select alignments + pattern: "*.{txt,list}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: optional filtered/converted BAM file + pattern: "*.{bam}" + - cram: + type: file + description: optional filtered/converted CRAM file + pattern: "*.{cram}" + - sam: + type: file + description: optional filtered/converted SAM file + pattern: "*.{sam}" + # bai, csi, and crai are created with `--write-index` + - bai: + type: file + description: optional BAM file index + pattern: "*.{bai}" + - csi: + type: file + description: optional tabix BAM file index + pattern: "*.{csi}" + - crai: + type: file + description: optional CRAM file index + pattern: "*.{crai}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@joseespinosa" + - "@FriederikeHanssen" + - "@priyanka-surana" +maintainers: + - "@drpatelh" + - "@joseespinosa" + - "@FriederikeHanssen" + - "@priyanka-surana" diff --git a/modules/nf-core/umicollapse/environment.yml b/modules/nf-core/umicollapse/environment.yml new file mode 100644 index 0000000..8dbc65d --- /dev/null +++ b/modules/nf-core/umicollapse/environment.yml @@ -0,0 +1,7 @@ +name: umicollapse +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::umicollapse=1.0.0 diff --git a/modules/nf-core/umicollapse/main.nf b/modules/nf-core/umicollapse/main.nf new file mode 100644 index 0000000..dae290e --- /dev/null +++ b/modules/nf-core/umicollapse/main.nf @@ -0,0 +1,73 @@ +process UMICOLLAPSE { + tag "$meta.id" + label "process_high" + label "process_high_memory" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/umicollapse:1.0.0--hdfd78af_1' : + 'biocontainers/umicollapse:1.0.0--hdfd78af_1' }" + + input: + tuple val(meta), path(input), path(bai) + val(mode) + + output: + tuple val(meta), path("*.bam"), emit: bam, optional: true + tuple val(meta), path("*dedup*fastq.gz"), emit: fastq, optional: true + tuple val(meta), path("*_UMICollapse.log"), emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = '1.0.0-1' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + // Memory allocation: We need to make sure that both heap and stack size is sufficiently large for + // umicollapse. We set the stack size to 5% of the available memory, the heap size to 90% + // which leaves 5% for stuff happening outside of java without the scheduler killing the process. + def max_heap_size_mega = (task.memory.toMega() * 0.9).intValue() + def max_stack_size_mega = 999 //most java jdks will not allow Xss > 1GB, so fixing this to the allowed max + if ( mode !in [ 'fastq', 'bam' ] ) { + error "Mode must be one of 'fastq' or 'bam'." + } + extension = mode.contains("fastq") ? "fastq.gz" : "bam" + """ + # Getting the umicollapse jar file like this because `umicollapse` is a Python wrapper script generated + # by conda that allows to set the heap size (Xmx), but not the stack size (Xss). + # `which` allows us to get the directory that contains `umicollapse`, independent of whether we + # are in a container or conda environment. + UMICOLLAPSE_JAR=\$(dirname \$(which umicollapse))/../share/umicollapse-${VERSION}/umicollapse.jar + java \\ + -Xmx${max_heap_size_mega}M \\ + -Xss${max_stack_size_mega}M \\ + -jar \$UMICOLLAPSE_JAR \\ + $mode \\ + -i ${input} \\ + -o ${prefix}.${extension} \\ + $args | tee ${prefix}_UMICollapse.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + umicollapse: $VERSION + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = '1.0.0-1' + if ( mode !in [ 'fastq', 'bam' ] ) { + error "Mode must be one of 'fastq' or 'bam'." + } + extension = mode.contains("fastq") ? "fastq.gz" : "bam" + """ + touch ${prefix}.dedup.${extension} + touch ${prefix}_UMICollapse.log + cat <<-END_VERSIONS > versions.yml + "${task.process}": + umicollapse: $VERSION + END_VERSIONS + """ +} diff --git a/modules/nf-core/umicollapse/meta.yml b/modules/nf-core/umicollapse/meta.yml new file mode 100644 index 0000000..c1361f9 --- /dev/null +++ b/modules/nf-core/umicollapse/meta.yml @@ -0,0 +1,63 @@ +--- +name: "umicollapse" +description: Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read. +keywords: + - umicollapse + - deduplication + - genomics +tools: + - "umicollapse": + description: "UMICollapse contains tools for dealing with Unique Molecular Identifiers (UMIs)/Random Molecular Tags (RMTs)." + homepage: "https://github.com/Daniel-Liu-c0deb0t/UMICollapse" + documentation: "https://github.com/Daniel-Liu-c0deb0t/UMICollapse" + tool_dev_url: "https://github.com/Daniel-Liu-c0deb0t/UMICollapse" + doi: "10.7717/peerj.8275" + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: | + BAM file containing reads to be deduplicated via UMIs. + pattern: "*.{bam}" + - bai: + type: file + description: | + BAM index files corresponding to the input BAM file. Optionally can be skipped using [] when using FastQ input. + pattern: "*.{bai}" + - mode: + type: string + description: | + Selects the mode of Umicollapse - either fastq or bam need to be provided. + pattern: "{fastq,bam}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM file with deduplicated UMIs. + pattern: "*.{bam}" + - log: + type: file + description: A log file with the deduplication statistics. + pattern: "*_{UMICollapse.log}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@CharlotteAnne" + - "@chris-cheshire" +maintainers: + - "@CharlotteAnne" + - "@chris-cheshire" + - "@apeltzer" + - "@MatthiasZepper" diff --git a/modules/nf-core/umicollapse/tests/main.nf.test b/modules/nf-core/umicollapse/tests/main.nf.test new file mode 100644 index 0000000..2dec45b --- /dev/null +++ b/modules/nf-core/umicollapse/tests/main.nf.test @@ -0,0 +1,249 @@ +nextflow_process { + + name "Test Process UMICOLLAPSE" + script "../main.nf" + process "UMICOLLAPSE" + + tag "modules" + tag "modules_nfcore" + tag "umicollapse" + tag "umitools/extract" + tag "samtools/index" + tag "bwa/index" + tag "bwa/mem" + + test("umicollapse single end test") { + setup{ + run("UMITOOLS_EXTRACT"){ + script "../../umitools/extract/main.nf" + config "./nextflow_SE.config" + process{ + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) + ] + ] + """ + } + } + + run("BWA_INDEX"){ + script "../../bwa/index/main.nf" + process{ + """ + input[0] = [[ id:'sarscov2'],file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)] + """ + } + } + run("BWA_MEM"){ + script "../../bwa/mem/main.nf" + process{ + """ + input[0] = UMITOOLS_EXTRACT.out.reads + input[1] = BWA_INDEX.out.index + input[2] = [[ id:'sarscov2'],file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)] + input[3] = true + """ + } + } + run("SAMTOOLS_INDEX"){ + script "../../samtools/index/main.nf" + process{ + """ + input[0] = BWA_MEM.out.bam + """ + } + } + } + + when { + config "./nextflow_SE.config" + process { + """ + input[0] = BWA_MEM.out.bam.join(SAMTOOLS_INDEX.out.bai, by: [0]) + input[1] = 'bam' + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.bam, + process.out.versions).match() } + ) + } + + } + + test("umicollapse paired tests") { + setup{ + run("UMITOOLS_EXTRACT"){ + script "../../umitools/extract/main.nf" + config "./nextflow_PE.config" + process{ + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) + ] + ] + """ + } + } + + run("BWA_INDEX"){ + script "../../bwa/index/main.nf" + process{ + """ + input[0] = [ + [ id:'sarscov2'], + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + run("BWA_MEM"){ + script "../../bwa/mem/main.nf" + process{ + """ + input[0] = UMITOOLS_EXTRACT.out.reads + input[1] = BWA_INDEX.out.index + input[2] = [[ id:'sarscov2'],file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)] + input[3] = true + """ + } + } + run("SAMTOOLS_INDEX"){ + script "../../samtools/index/main.nf" + process{ + """ + input[0] = BWA_MEM.out.bam + """ + } + } + } + + when { + config "./nextflow_PE.config" + process { + """ + input[0] = BWA_MEM.out.bam.join(SAMTOOLS_INDEX.out.bai, by: [0]) + input[1] = 'bam' + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.bam, + process.out.versions).match() } + ) + } + + } + + test("umicollapse fastq tests") { + + when { + config "./nextflow_SE.config" + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + [] + ] + input[1] = 'fastq' + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.fastq, + process.out.versions).match() } + ) + } + } + + test("umicollapse stub tests") { + options "-stub-run" + setup{ + run("UMITOOLS_EXTRACT"){ + script "../../umitools/extract/main.nf" + config "./nextflow_PE.config" + process{ + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) + ] + ] + """ + } + } + + run("BWA_INDEX"){ + script "../../bwa/index/main.nf" + process{ + """ + input[0] = [ + [ id:'sarscov2'], + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + run("BWA_MEM"){ + script "../../bwa/mem/main.nf" + process{ + """ + input[0] = UMITOOLS_EXTRACT.out.reads + input[1] = BWA_INDEX.out.index + input[2] = [[ id:'sarscov2'],file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)] + input[3] = true + """ + } + } + run("SAMTOOLS_INDEX"){ + script "../../samtools/index/main.nf" + process{ + """ + input[0] = BWA_MEM.out.bam + """ + } + } + } + when { + config "./nextflow_PE.config" + process { + """ + input[0] = BWA_MEM.out.bam.join(SAMTOOLS_INDEX.out.bai, by: [0]) + input[1] = 'bam' + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} \ No newline at end of file diff --git a/modules/nf-core/umicollapse/tests/main.nf.test.snap b/modules/nf-core/umicollapse/tests/main.nf.test.snap new file mode 100644 index 0000000..861e9ca --- /dev/null +++ b/modules/nf-core/umicollapse/tests/main.nf.test.snap @@ -0,0 +1,124 @@ +{ + "umicollapse single end test": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.dedup.bam:md5,05c5331185263cbee6f508c0669be864" + ] + ], + [ + "versions.yml:md5,c1e0275d81b1c97a9344d216f9154996" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-14T13:41:23.869211282" + }, + "umicollapse fastq tests": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.dedup.fastq.gz:md5,c9bac08c7fd8df3e0203e3eeafc73155" + ] + ], + [ + "versions.yml:md5,c1e0275d81b1c97a9344d216f9154996" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-01-30T10:45:56.053352008" + }, + "umicollapse stub tests": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.dedup.dedup.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + + ], + "2": [ + [ + { + "id": "test", + "single_end": false + }, + "test.dedup_UMICollapse.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + "versions.yml:md5,c1e0275d81b1c97a9344d216f9154996" + ], + "bam": [ + [ + { + "id": "test", + "single_end": false + }, + "test.dedup.dedup.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "fastq": [ + + ], + "log": [ + [ + { + "id": "test", + "single_end": false + }, + "test.dedup_UMICollapse.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,c1e0275d81b1c97a9344d216f9154996" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-01-30T10:46:12.482697713" + }, + "umicollapse paired tests": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.dedup.bam:md5,f4f05467cb456309fe22851d8b4d4387" + ] + ], + [ + "versions.yml:md5,c1e0275d81b1c97a9344d216f9154996" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-14T13:41:54.486079388" + } +} \ No newline at end of file diff --git a/modules/nf-core/umicollapse/tests/nextflow.config b/modules/nf-core/umicollapse/tests/nextflow.config new file mode 100644 index 0000000..844edbd --- /dev/null +++ b/modules/nf-core/umicollapse/tests/nextflow.config @@ -0,0 +1,8 @@ +process { + withName: UMITOOLS_EXTRACT { + ext.args = '--bc-pattern="NNNN"' + } + withName: UMICOLLAPSE { + ext.prefix = { "${meta.id}.dedup" } + } +} \ No newline at end of file diff --git a/modules/nf-core/umicollapse/tests/nextflow_PE.config b/modules/nf-core/umicollapse/tests/nextflow_PE.config new file mode 100644 index 0000000..ae4c963 --- /dev/null +++ b/modules/nf-core/umicollapse/tests/nextflow_PE.config @@ -0,0 +1,10 @@ +process { + + withName: UMITOOLS_EXTRACT { + ext.args = '--bc-pattern="NNNN" --bc-pattern2="NNNN"' + } + + withName: UMICOLLAPSE { + ext.prefix = { "${meta.id}.dedup" } + } +} diff --git a/modules/nf-core/umicollapse/tests/nextflow_SE.config b/modules/nf-core/umicollapse/tests/nextflow_SE.config new file mode 100644 index 0000000..d4b9443 --- /dev/null +++ b/modules/nf-core/umicollapse/tests/nextflow_SE.config @@ -0,0 +1,10 @@ +process { + + withName: UMITOOLS_EXTRACT { + ext.args = '--bc-pattern="NNNN"' + } + + withName: UMICOLLAPSE { + ext.prefix = { "${meta.id}.dedup" } + } +} diff --git a/modules/nf-core/umicollapse/tests/tags.yml b/modules/nf-core/umicollapse/tests/tags.yml new file mode 100644 index 0000000..912879c --- /dev/null +++ b/modules/nf-core/umicollapse/tests/tags.yml @@ -0,0 +1,2 @@ +umicollapse: + - "modules/nf-core/umicollapse/**" diff --git a/nextflow.config b/nextflow.config index 3dc2f22..6bbe635 100644 --- a/nextflow.config +++ b/nextflow.config @@ -72,11 +72,11 @@ params { move_umi_to_header = false umi_header_format = null save_unaligned = true // DO NOT CHANGE - trim_length = 10 umi_separator = "rbc:" paraclu_min_value = 10 + trimgalore_params = "--fastqc --length 10 -q 20" bowtie_params = "-v 2 -m 100 --norc --best --strata" - star_params = "--outFilterMultimapNmax 1 --outFilterMultimapScoreRange 1 --outSAMattributes All --alignSJoverhangMin 8 --alignSJDBoverhangMin 1 --outFilterType BySJout --alignIntronMin 20 --alignIntronMax 1000000 --outFilterScoreMin 10 --alignEndsType Extend5pOfRead1 --twopassMode Basic" + star_params = "--outFilterMultimapNmax 100 --outFilterMultimapScoreRange 1 --outSAMattributes All --alignSJoverhangMin 8 --alignSJDBoverhangMin 1 --outFilterType BySJout --alignIntronMin 20 --alignIntronMax 1000000 --outFilterScoreMin 10 --alignEndsType Extend5pOfRead1 --twopassMode Basic" clippy_params = "" icount_peaks_params = "" peka_params = "" diff --git a/schema/clipseq.json b/schema/clipseq.json index 0f67086..0fe828e 100644 --- a/schema/clipseq.json +++ b/schema/clipseq.json @@ -287,11 +287,6 @@ "description": "Additional pipeline configuration options.", "advanced": true, "params": { - "trim_length": { - "name": "Minimum trim length.", - "description": "Minimum length of read to keep after Trim Galore! trimming.", - "type": "number" - }, "move_umi_to_header": { "name": "Extract UMI to header", "description": "Runs UMI to header extraction based on the head format provided in UMI header format.", diff --git a/subworkflows/goodwright/bam_dedup_samtools_umitools/main.nf b/subworkflows/goodwright/bam_dedup_samtools_umitools/main.nf index 0cb71f3..58ad0b6 100644 --- a/subworkflows/goodwright/bam_dedup_samtools_umitools/main.nf +++ b/subworkflows/goodwright/bam_dedup_samtools_umitools/main.nf @@ -2,7 +2,7 @@ * UMIcollapse, index BAM file and run samtools stats, flagstat and idxstats */ -include { UMICOLLAPSE } from '../../../modules/goodwright/umicollapse/main' +include { UMICOLLAPSE } from '../../../modules/nf-core/umicollapse/main' include { SAMTOOLS_INDEX } from '../../../modules/nf-core/samtools/index/main' workflow BAM_DEDUP_SAMTOOLS_UMITOOLS { @@ -16,7 +16,8 @@ workflow BAM_DEDUP_SAMTOOLS_UMITOOLS { * MODULE: UMI-tools collapse */ UMICOLLAPSE ( - bam_bai + bam_bai, + 'bam' ) ch_versions = ch_versions.mix(UMICOLLAPSE.out.versions) diff --git a/subworkflows/goodwright/icount_analyse/main.nf b/subworkflows/goodwright/icount_analyse/main.nf index b6e5ab5..d2fabd6 100644 --- a/subworkflows/goodwright/icount_analyse/main.nf +++ b/subworkflows/goodwright/icount_analyse/main.nf @@ -5,15 +5,18 @@ /* * MODULES */ + include { ICOUNT_SUMMARY } from '../../../modules/goodwright/icount/summary/main.nf' include { ICOUNT_RNAMAPS } from '../../../modules/goodwright/icount/rnamaps/main.nf' include { ICOUNT_SIGXLS } from '../../../modules/goodwright/icount/sigxls/main.nf' include { ICOUNT_PEAKS } from '../../../modules/goodwright/icount/peaks/main.nf' include { GUNZIP as GUNZIP_SIGXLS } from '../../../modules/nf-core/gunzip/main.nf' include { GUNZIP as GUNZIP_PEAKS } from '../../../modules/nf-core/gunzip/main.nf' +include { MERGE_SUMMARY } from '../../../modules/local/clipseq/merge_summary/main.nf' workflow ICOUNT_ANALYSE { take: + smrna_bed // channel: [ val(meta), [ bed ] ] bed // channel: [ val(meta), [ bed ] ] gtf_regions // channel: [ [ gtf ] ] gtf_resolved // channel: [ [ gtf.gz ] ] @@ -23,14 +26,24 @@ workflow ICOUNT_ANALYSE { ch_versions = Channel.empty() /* - * MODULE: Run iCount summary + * MODULE: Run iCount summary */ + + ICOUNT_SUMMARY ( bed, gtf_regions ) ch_versions = ch_versions.mix(ICOUNT_SUMMARY.out.versions) + MERGE_SUMMARY ( + ICOUNT_SUMMARY.out.summary_type, + ICOUNT_SUMMARY.out.summary_subtype, + ICOUNT_SUMMARY.out.summary_gene, + smrna_bed + ) + ch_versions = ch_versions.mix(MERGE_SUMMARY.out.versions) + /* * MODULE: Run iCount rnamaps */ diff --git a/subworkflows/goodwright/rna_align/main.nf b/subworkflows/goodwright/rna_align/main.nf index 0e85370..f2c67a9 100644 --- a/subworkflows/goodwright/rna_align/main.nf +++ b/subworkflows/goodwright/rna_align/main.nf @@ -5,16 +5,20 @@ /* * MODULES -*/ -include { BOWTIE_ALIGN } from '../../../modules/nf-core/bowtie/align/main.nf' -include { STAR_ALIGN } from '../../../modules/nf-core/star/align/main.nf' -include { SAMTOOLS_INDEX } from '../../../modules/nf-core/samtools/index/main' - -/* -* SUBWORKFLOWS -*/ -include { BAM_STATS_SAMTOOLS as BAM_STATS_SAMTOOLS_GENOME } from '../../nf-core/bam_stats_samtools/main.nf' -include { BAM_SORT_STATS_SAMTOOLS as BAM_SORT_STATS_SAMTOOLS_TRANSCRIPT } from '../../nf-core/bam_sort_stats_samtools/main.nf' +*/ +include { BOWTIE_ALIGN } from '../../../modules/nf-core/bowtie/align/main.nf' +include { BOWTIE_ALIGN as BOWTIE_ALIGN_K1 } from '../../../modules/nf-core/bowtie/align/main.nf' +include { STAR_ALIGN } from '../../../modules/nf-core/star/align/main.nf' +include { SAMTOOLS_SORT as SAMTOOLS_SORT_GENOME } from '../../../modules/nf-core/samtools/sort/main' +include { SAMTOOLS_SORT as SAMTOOLS_SORT_TRANSCRIPT } from '../../../modules/nf-core/samtools/sort/main' +include { SAMTOOLS_SORT as SAMTOOLS_SORT_SMRNA } from '../../../modules/nf-core/samtools/sort/main' +include { SAMTOOLS_SORT as SAMTOOLS_SORT_SMRNA_K1 } from '../../../modules/nf-core/samtools/sort/main' +include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_SMRNA } from '../../../modules/nf-core/samtools/index/main' +include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_SMRNA_K1 } from '../../../modules/nf-core/samtools/index/main' +include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_TRANSCRIPT } from '../../../modules/nf-core/samtools/index/main' +include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_GENOME } from '../../../modules/nf-core/samtools/index/main' +include { SAMTOOLS_VIEW as SAMTOOLS_VIEW_GENOME } from '../../../modules/nf-core/samtools/view/main' +include { SAMTOOLS_VIEW as SAMTOOLS_VIEW_TRANSCRIPT } from '../../../modules/nf-core/samtools/view/main' workflow RNA_ALIGN { take: @@ -36,6 +40,29 @@ workflow RNA_ALIGN { ) ch_versions = ch_versions.mix(BOWTIE_ALIGN.out.versions) + SAMTOOLS_SORT_SMRNA ( BOWTIE_ALIGN.out.bam ) + ch_versions = ch_versions.mix(SAMTOOLS_SORT_SMRNA.out.versions) + + SAMTOOLS_INDEX_SMRNA ( SAMTOOLS_SORT_SMRNA.out.bam ) + ch_versions = ch_versions.mix(SAMTOOLS_INDEX_SMRNA.out.versions) + + /* + * MODULE: Align reads to smrna genome, here allowing 100 multimappers but only reporting one alignment per multimapped read + * so that we can accurately count it in the crosslink summary later + */ + + BOWTIE_ALIGN_K1 ( + fastq, + bt_index.collect{it[1]} + ) + ch_versions = ch_versions.mix(BOWTIE_ALIGN_K1.out.versions) + + SAMTOOLS_SORT_SMRNA_K1 ( BOWTIE_ALIGN_K1.out.bam ) + ch_versions = ch_versions.mix(SAMTOOLS_SORT_SMRNA_K1.out.versions) + + SAMTOOLS_INDEX_SMRNA_K1 ( SAMTOOLS_SORT_SMRNA_K1.out.bam ) + ch_versions = ch_versions.mix(SAMTOOLS_INDEX_SMRNA_K1.out.versions) + /* * MODULE: Align reads that did not align to the smrna genome to the primary genome */ @@ -49,18 +76,30 @@ workflow RNA_ALIGN { ) ch_versions = ch_versions.mix(STAR_ALIGN.out.versions) + SAMTOOLS_SORT_GENOME ( STAR_ALIGN.out.bam ) + + /* + * MODULE: Index genome-level BAM file + */ + SAMTOOLS_INDEX_GENOME ( SAMTOOLS_SORT_GENOME.out.bam ) + ch_versions = ch_versions.mix(SAMTOOLS_INDEX_GENOME.out.versions.first()) + /* - * MODULE: Index genome-level BAM file + * MODULE: Index transcript-level BAM file */ - SAMTOOLS_INDEX ( STAR_ALIGN.out.bam_sorted ) - ch_versions = ch_versions.mix(SAMTOOLS_INDEX.out.versions.first()) + SAMTOOLS_SORT_TRANSCRIPT ( STAR_ALIGN.out.bam_transcript ) + ch_versions = ch_versions.mix(SAMTOOLS_SORT_TRANSCRIPT.out.versions.first()) + + SAMTOOLS_INDEX_TRANSCRIPT ( SAMTOOLS_SORT_TRANSCRIPT.out.bam ) + ch_versions = ch_versions.mix(SAMTOOLS_INDEX_TRANSCRIPT.out.versions.first()) + /* * CHANNEL: Join bam and bai files */ - ch_bam_bai = STAR_ALIGN.out.bam_sorted - .join(SAMTOOLS_INDEX.out.bai, by: [0], remainder: true) - .join(SAMTOOLS_INDEX.out.csi, by: [0], remainder: true) + ch_bam_bai = SAMTOOLS_SORT_GENOME.out.bam + .join(SAMTOOLS_INDEX_GENOME.out.bai, by: [0], remainder: true) + .join(SAMTOOLS_INDEX_GENOME.out.csi, by: [0], remainder: true) .map { meta, bam, bai, csi -> if (bai) { @@ -71,38 +110,49 @@ workflow RNA_ALIGN { } /* - * SUBWORKFLOW: Stats on genome-level bam + * CHANNEL: Join bam and bai files */ - BAM_STATS_SAMTOOLS_GENOME ( - ch_bam_bai, - fasta - ) - ch_versions = ch_versions.mix(BAM_STATS_SAMTOOLS_GENOME.out.versions) + ch_transcript_bam_bai = SAMTOOLS_SORT_TRANSCRIPT.out.bam + .join(SAMTOOLS_INDEX_TRANSCRIPT.out.bai, by: [0], remainder: true) + .join(SAMTOOLS_INDEX_TRANSCRIPT.out.csi, by: [0], remainder: true) + .map { + meta, bam, bai, csi -> + if (bai) { + [ meta, bam, bai ] + } else { + [ meta, bam, csi ] + } + } + /* - * SUBWORKFLOW: Sort, index and stats on transcript-level bam + * CHANNEL: Filter for uniquely mapping reads for downstream analysis; samtools view -b -q 5 -o output.bam alignments.bam */ - BAM_SORT_STATS_SAMTOOLS_TRANSCRIPT ( - STAR_ALIGN.out.bam_transcript, - fasta + SAMTOOLS_VIEW_GENOME ( + ch_bam_bai, + [[],[]], + [] + ) + + SAMTOOLS_VIEW_TRANSCRIPT ( + ch_transcript_bam_bai, + [[],[]], + [] ) emit: - bt_bam = BOWTIE_ALIGN.out.bam // channel: [ val(meta), [ bam ] ] - bt_log = BOWTIE_ALIGN.out.log // channel: [ val(meta), [ txt ] ] - star_bam = STAR_ALIGN.out.bam_sorted // channel: [ val(meta), [ bam ] ] - star_bam_transcript = STAR_ALIGN.out.bam_transcript // channel: [ val(meta), [ bam ] ] + bt_log = BOWTIE_ALIGN.out.log // channel: [ val(meta), [ txt ] ] star_log = STAR_ALIGN.out.log // channel: [ val(meta), [ txt ] ] star_log_final = STAR_ALIGN.out.log_final // channel: [ val(meta), [ txt ] ] - genome_bam = STAR_ALIGN.out.bam_sorted // channel: [ val(meta), [ bam ] ] - genome_bai = SAMTOOLS_INDEX.out.bai // channel: [ val(meta), [ bai ] ] - genome_stats = BAM_STATS_SAMTOOLS_GENOME.out.stats // channel: [ val(meta), [ stats ] ] - genome_flagstat = BAM_STATS_SAMTOOLS_GENOME.out.flagstat // channel: [ val(meta), [ flagstat ] ] - genome_idxstats = BAM_STATS_SAMTOOLS_GENOME.out.idxstats // channel: [ val(meta), [ idxstats ] ] - transcript_bam = BAM_SORT_STATS_SAMTOOLS_TRANSCRIPT.out.bam // channel: [ val(meta), [ bam ] ] - transcript_bai = BAM_SORT_STATS_SAMTOOLS_TRANSCRIPT.out.bai // channel: [ val(meta), [ bai ] ] - transcript_stats = BAM_SORT_STATS_SAMTOOLS_TRANSCRIPT.out.stats // channel: [ val(meta), [ stats ] ] - transcript_flagstat = BAM_SORT_STATS_SAMTOOLS_TRANSCRIPT.out.flagstat // channel: [ val(meta), [ flagstat ] ] - transcript_idxstats = BAM_SORT_STATS_SAMTOOLS_TRANSCRIPT.out.idxstats // channel: [ val(meta), [ idxstats ] ] + genome_unique_bam = SAMTOOLS_VIEW_GENOME.out.bam // channel: [ val(meta), [ bam ] ] + genome_unique_bai = SAMTOOLS_VIEW_GENOME.out.bai // channel: [ val(meta), [ bai ] ] + genome_multi_bam = SAMTOOLS_SORT_GENOME.out.bam // channel: [ val(meta), [ bam ] ] + genome_multi_bai = SAMTOOLS_INDEX_GENOME.out.bai // channel: [ val(meta), [ bai ] ] + transcript_bam = SAMTOOLS_VIEW_TRANSCRIPT.out.bam // channel: [ val(meta), [ bam ] ] + transcript_bai = SAMTOOLS_VIEW_TRANSCRIPT.out.bai // channel: [ val(meta), [ bai ] ] + smrna_bam = SAMTOOLS_SORT_SMRNA.out.bam // channel: [ val(meta), [ bam ] ] + smrna_bai = SAMTOOLS_INDEX_SMRNA.out.bai // channel: [ val(meta), [ bai ] ] + smrna_k1_bam = SAMTOOLS_SORT_SMRNA_K1.out.bam // channel: [ val(meta), [ bam ] ] + smrna_k1_bai = SAMTOOLS_INDEX_SMRNA_K1.out.bai // channel: [ val(meta), [ bai ] ] versions = ch_versions // channel: [ versions.yml ] }