workflow-nuvs/workflow.py at main · virtool/workflow-nuvs · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
import asyncio
import collections
import os
import shlex
import shutil
from pathlib import Path

import aiofiles
from pyfixtures import fixture
from structlog import get_logger
from virtool.bio import (
    find_orfs,
    read_fasta,
)
from virtool.models.enums import LibraryType
from virtool.utils import compress_file, decompress_file
from virtool.workflow import RunSubprocess, hooks, step
from virtool.workflow.analysis import ReadPaths
from virtool.workflow.data.analyses import WFAnalysis
from virtool.workflow.data.hmms import WFHMMs
from virtool.workflow.data.indexes import WFIndex
from virtool.workflow.data.samples import WFSample
from virtool.workflow.data.subtractions import WFSubtraction

from utils import (
    SkewerConfiguration,
    SkewerMode,
    SkewerRunner,
    calculate_trimming_min_length,
    filter_reads_by_headers,
    read_fastq_headers,
)

logger = get_logger("workflow")


@hooks.on_failure
async def delete_analysis(analysis: WFAnalysis):
    await analysis.delete()


@fixture
async def trimmed_path(work_path: Path) -> Path:
    """The path to a directory for trimmed reads."""
    trimmed_path = work_path / "trimmed"
    trimmed_path.mkdir(exist_ok=True)

    return trimmed_path


@fixture
async def trimmed_read_paths(sample: WFSample, trimmed_path: Path) -> ReadPaths:
    if sample.paired:
        return (
            trimmed_path / "reads_1.fq.gz",
            trimmed_path / "reads_2.fq.gz",
        )

    return (trimmed_path / "reads_1.fq.gz",)


@step()
async def trim_reads(
    proc: int,
    sample: WFSample,
    skewer: SkewerRunner,
    work_path: Path,
):
    """Trim reads using Skewer."""
    trimmed_path = work_path / "trimmed"
    await asyncio.to_thread(trimmed_path.mkdir, parents=True)

    await skewer(
        SkewerConfiguration(
            min_length=calculate_trimming_min_length(sample),
            mode=SkewerMode.PAIRED_END if sample.paired else SkewerMode.SINGLE_END,
            number_of_processes=proc,
        ),
        sample.read_paths,
        output_path=trimmed_path,
    )


@step(name="Eliminate OTUs")
async def eliminate_otus(
    index: WFIndex,
    proc: int,
    run_subprocess: RunSubprocess,
    trimmed_read_paths: ReadPaths,
    work_path: Path,
):
    """Map sample reads to reference OTUs and discard.

    Bowtie2 is set to use the search parameter ``--very-fast-local`` and retain
    unaligned reads to the FASTQ file ``unmapped_subtraction.fq``.

    """
    command = [
        "bowtie2",
        "-p",
        proc,
        "-k",
        1,
        "--very-fast-local",
        "-x",
        index.bowtie_path,
        "--un",
        work_path / "unmapped_otus.fq",
        "-U",
        *trimmed_read_paths,
    ]

    await run_subprocess(command)


@step
async def eliminate_subtraction(
    proc: int,
    run_subprocess: RunSubprocess,
    subtractions: list[WFSubtraction],
    work_path: Path,
):
    """Map remaining reads to the subtraction and discard.

    Reads that were not mapped to the reference OTUs in the previous step
    (`unmapped_otus.fq`) are mapped against the subtraction. Reads with no
    alignment against the subtraction (`unmapped_subtractions.fq`) are carried
    forward into the next step.

    Bowtie2 is set to use the search parameter ``--very-fast-local`` and retain
    unaligned reads to the FASTQ file ``unmapped_subtraction.fq``. Providing the `--un`
    option to Bowtie2 writes any unmapped reads to the path provided with the
    option.

    """
    if subtractions:
        await asyncio.to_thread(
            shutil.copyfile,
            work_path / "unmapped_otus.fq",
            work_path / "working_otus.fq",
        )

        for subtraction in subtractions:
            await run_subprocess(
                [
                    "bowtie2",
                    "--very-fast-local",
                    "-k",
                    1,
                    "-p",
                    proc,
                    "-x",
                    shlex.quote(str(subtraction.bowtie2_index_path)),
                    "--un",
                    work_path / "unmapped_subtractions.fq",
                    "-U",
                    work_path / "working_otus.fq",
                ],
            )

            await asyncio.to_thread(
                shutil.copyfile,
                work_path / "unmapped_subtractions.fq",
                work_path / "working_otus.fq",
            )

        await asyncio.to_thread(
            os.rename,
            work_path / "working_otus.fq",
            work_path / "unmapped_subtractions.fq",
        )

    else:
        await asyncio.to_thread(
            shutil.copyfile,
            work_path / "unmapped_otus.fq",
            work_path / "unmapped_subtractions.fq",
        )


@step
async def reunite_pairs(
    proc: int,
    sample: WFSample,
    trimmed_read_paths: ReadPaths,
    work_path: Path,
):
    """Reunite paired reads after elimination."""
    if sample.paired:
        headers = await asyncio.to_thread(
            read_fastq_headers,
            work_path / "unmapped_subtractions.fq",
        )

        for path in trimmed_read_paths:
            await asyncio.to_thread(
                decompress_file,
                path,
                path.with_suffix(".fq"),
                proc,
            )

        path_1, path_2 = trimmed_read_paths

        await asyncio.to_thread(
            filter_reads_by_headers,
            headers,
            (
                work_path / "unmapped_1.fq",
                work_path / "unmapped_2.fq",
            ),
            (path_1.with_suffix(".fq"), path_2.with_suffix(".fq")),
        )


@step
async def assemble(
    analysis: WFAnalysis,
    mem: int,
    proc: int,
    run_subprocess: RunSubprocess,
    sample: WFSample,
    work_path: Path,
):
    """Assemble reads using SPAdes."""
    spades_path = work_path / "spades"

    k = "21,33,55,75"

    if sample.library_type == LibraryType.srna:
        k = "17,21,23"

    command = [
        "spades.py",
        "-t",
        proc,
        "-m",
        mem,
        "-k",
        k,
        "-o",
        spades_path,
    ]

    logger = get_logger("spades")

    if sample.paired:
        command += [
            "-1",
            work_path / "unmapped_1.fq",
            "-2",
            work_path / "unmapped_2.fq",
        ]
    else:
        command += [
            "-s",
            work_path / "unmapped_subtractions.fq",
        ]

    async def handler(line: bytes) -> None:
        logger.info("stdout", line=line.decode().strip())

    await run_subprocess([str(c) for c in command], stdout_handler=handler)

    compressed_assembly_path = work_path / "assembly.fa.gz"

    await asyncio.to_thread(
        compress_file,
        spades_path / "scaffolds.fasta",
        compressed_assembly_path,
        processes=proc,
    )

    await analysis.upload_file(compressed_assembly_path, "fasta")


@step
async def process_assembly(
    analysis: WFAnalysis,
    proc: int,
    results: dict,
    work_path: Path,
):
    """Find ORFs in the assembled contigs.

    Only ORFs that are 100+ amino acids long are recorded. Contigs with no acceptable
    ORFs are discarded.

    """
    assembly_path = work_path / "spades/scaffolds.fa"

    await asyncio.to_thread(
        os.rename,
        work_path / "spades/scaffolds.fasta",
        assembly_path,
    )

    assembly = await asyncio.to_thread(read_fasta, assembly_path)

    sequences = []

    for _, sequence in assembly:
        sequence_length = len(sequence)

        # Don't consider the sequence if it is shorter than 300 bp.
        if sequence_length < 300:
            continue

        orfs = find_orfs(sequence)

        # Don't consider the sequence if it has no ORFs.
        if len(orfs) == 0:
            continue

        # Add an index field to each orf dict.
        orfs = [dict(o, index=i) for i, o in enumerate(orfs)]

        for orf in orfs:
            orf.pop("nuc")
            orf["hits"] = []

        # Make an entry for the nucleotide sequence containing a unique integer index,
        # the sequence itself, and all ORFs in the sequence.
        sequences.append({"index": len(sequences), "sequence": sequence, "orfs": orfs})

    # Write the ORFs to a FASTA file so that they can be analyzed using HMMER and vFAM.
    orfs_path = work_path / "orfs.fa"

    async with aiofiles.open(orfs_path, "w") as f:
        for entry in sequences:
            for orf in entry["orfs"]:
                await f.write(
                    f">sequence_{entry['index']}.{orf['index']}\n{orf['pro']}\n",
                )

    compressed_orfs_path = Path(f"{orfs_path}.gz")

    await asyncio.to_thread(
        compress_file,
        orfs_path,
        compressed_orfs_path,
        processes=proc,
    )

    await analysis.upload_file(compressed_orfs_path, "fasta")

    results["hits"] = sequences


@step(name="VFam")
async def vfam(
    analysis: WFAnalysis,
    hmms: WFHMMs,
    proc: int,
    results: dict,
    run_subprocess: RunSubprocess,
    work_path: Path,
):
    """Search for viral motifs in ORF translations.

    ORF translations are generated by :meth:`.process_fasta`. Viral motifs are found
    using ``hmmscan`` to search through ``candidates.fa`` using the profile HMMs in
    ``data_path/hmm/vFam.hmm``.

    Saves two files:

    - ``hmm.tsv`` contains the raw output of `hmmer`
    - ``hits.tsv`` contains the `hmmer` results formatted and annotated with the
      annotations from the Virtool HMM database collection

    """
    logger.info("running hmmpress on database")
    await run_subprocess(["hmmpress", str(hmms.profiles_path)])

    tsv_path = work_path / "hmm.tsv"

    logger.info("running hmmscan")
    await run_subprocess(
        [
            str(c)
            for c in [
                "hmmscan",
                "--tblout",
                tsv_path,
                "--noali",
                "--cpu",
                proc - 1,
                hmms.path / "profiles.hmm",
                work_path / "orfs.fa",
            ]
        ],
    )

    hmmer_hits = collections.defaultdict(lambda: collections.defaultdict(list))

    # Go through the raw HMMER results and annotate the HMM hits with data from the
    # database.
    logger.info("annotating hits")
    async with aiofiles.open(tsv_path) as f:
        async for line in f:
            if line.startswith("vFam"):
                line = line.split()

                cluster_id = int(line[0].split("_")[1])

                annotation_id = hmms.cluster_annotation_map[cluster_id]

                # Expecting sequence_0.0
                sequence_index, orf_index = (
                    int(x) for x in line[2].split("_")[1].split(".")
                )

                hmmer_hits[sequence_index][orf_index].append(
                    {
                        "hit": annotation_id,
                        "full_e": float(line[4]),
                        "full_score": float(line[5]),
                        "full_bias": float(line[6]),
                        "best_e": float(line[7]),
                        "best_bias": float(line[8]),
                        "best_score": float(line[9]),
                    },
                )

    hits = results["hits"]

    for sequence_index in hmmer_hits:
        for orf_index in hmmer_hits[sequence_index]:
            hits[sequence_index]["orfs"][orf_index]["hits"] = hmmer_hits[
                sequence_index
            ][orf_index]

        sequence = results["hits"][sequence_index]

        if all(len(orf["hits"]) == 0 for orf in sequence["orfs"]):
            hits.remove(sequence)

    logger.info("uploading result files")
    await analysis.upload_file(tsv_path, "tsv")
    await analysis.upload_result(results)