-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathassembly.py
More file actions
168 lines (137 loc) · 7.07 KB
/
assembly.py
File metadata and controls
168 lines (137 loc) · 7.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
import os
import shutil
import subprocess
import re
# BioPython imports
from Bio import SeqIO
from Bio.Alphabet import generic_dna
# (Optional) Function to run commands and pipe the output to the terminal
# Only necessary if the subprocess prints to the terminal
def run_command(command):
p = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
# Grab stdout line by line as it becomes available. This will loop until
# p terminates.
while p.poll() is None:
l = p.stdout.readline() # This blocks until it receives a newline.
print(l)
# When the subprocess terminates there might be unconsumed output
# that still needs to be processed.
print(p.stdout.read())
# Helper function that the deletes the original SRA database file in
# /home/user/ncbi/public/sra' after splitting to preserve drive space
def delete_ascension(ascension):
ascension_folder = os.path.join('/home/{}/ncbi/public/sra'.format(getpass.getuser()))
os.remove(os.path.join(ascension_folder,'{}.sra'.format(ascension)))
print('.sra file of {} has been deleted!'.format(ascension))
# Download & split paired-end read libraries from the SRA database
# SRA Toolkit manual: https://www.ncbi.nlm.nih.gov/books/NBK242621/
def get_ascensions(download_directory):
os.chdir(download_directory)
# Read the ascensions from the text file
with open('SRR_Acc_List.txt', 'r') as f:
for line in f:
# strip the "\n from the end of each line"
line = line.rstrip('\n')
# Specify the output directory
output_path = os.path.join(download_directory, line)
# Check if the ascension was already downloaded
if os.path.exists(output_path) == True:
print('Ascension {} already downloaded'.format(line))
else:
# Download and split the reads
os.mkdir(output_path)
print('Downloading & splitting: {}'.format(line))
subprocess.call('fastq-dump -I --split-files {0} --outdir {1}'.format(line, output_path), shell=True)
print('{} complete!'.format(line))
# Delete the .sra file to save disk space
delete_ascension(line)
print('No more ascensions to download.')
# Helper function to rename sequence ID's if they're improperly named during subsetting
# If the read ID's are incorrect, Abyss will throw an error and say
# that all reads are mateless
def correct_read_ids(directory, file):
subprocess.call('sed -i "/^@{0}/ s/.2 /.1 /g" {1}'.format(directory, file), shell=True)
# Make subsets of each paired-end library of 250k reads each
# Seqtk manual: https://github.com/lh3/seqtk
def subset(download_directory):
# Index the contents of the download directory
paths, directories, files = next(os.walk(download_directory))
for directory in directories:
# Go to the sample directory and check its contents
os.chdir(os.path.join(download_directory ,directory))
libraries = os.listdir(os.path.join(download_directory, directory))
# Check whether the libraries have already been subsetted
# by checking whether the appended string "250k_" is in the filenames
if any('250k_' in l for l in libraries):
pass
else:
# Subset the read files
print('Subsetting {}'.format(directory))
subprocess.call('seqtk sample -s100 {0} 250000 > 250k_{0}'.format(libraries[0]), shell=True)
subprocess.call('seqtk sample -s100 {0} 250000 > 250k_{0}'.format(libraries[1]), shell=True)
# Remove the old files, only the subsets are needed
print('Removing original files')
os.remove(libraries[0])
os.remove(libraries[1])
print('250k subsets of {0} generated'.format(directory))
libraries = os.listdir(os.path.join(download_directory, directory))
# Correct the read id's if improperly named by seqtk
print('Correcting read IDs')
correct_read_ids(directory, libraries[1])
print('Corrected reads of {}'.format(directory))
print('All subsets have been generated')
# Assemble all subsetted libraries de novo using Abyss
# Abyss manual: https://github.com/bcgsc/abyss
def assemble(download_directory):
# Index the contents of the download directory
paths, directories, files = next(os.walk(download_directory))
os.chdir(download_directory)
for directory in directories:
# Specify the path to each ascension directory & index it
ascension_directory = os.path.join(download_directory, directory)
libraries = os.listdir(ascension_directory)
# Check to see if the ascension is not yet assembled by checking
# if only the paired-end read files are present
if len(libraries) == 2:
os.chdir(ascension_directory)
subprocess.call('abyss-pe k=64 name={0} in="{1} {2}"'.format(directory, libraries[0], libraries[1]), shell=True)
print('Ascension {} has been assembled!'.format(directory))
print('All ascensions have been assembled!')
# (Optional) Helper function to extract the path of the result file containing
# the contigs from the performed assembly
# Only to be used if the condition in "collect_assemblies" doesn't work
def get_abyss_results(directory):
# Define a list with terms used to filter the files in the directory
filter_list = ['scaffolds', 'contigs', 'bubbles', 'unitigs', 'indel']
result_files = [os.path.splitext(file)[0].split('-') for file in os.listdir(directory) if file.endswith('.fa')]
result_files[:] = [result for result in result_files if not result[1] in filter_list]
result_files.sort()
result_file = '{0}-{1}.fa'.format(directory, result_files[-1][1])
return os.path.join(directory, result_file)
# Helper function to filter contigs in a file based on size (6000 bp < contig < 14000 bp)
# This function uses BioPython's SeqIO module to parse the file
def filter_contigs(path_to_results, filename, contigs_directory):
# Define the path to the output file
path_output_file = os.path.join(contigs_directory, '{}.fa'.format(filename.split('-')[0]))
# Check if it already exists to prevent unnecessary code execution
if os.path.exists(path_output_file) == True:
pass
else:
print('Getting results for {}'.format(filename.split('-'[0])))
# Open the necessary files and start filtering
with open(path_to_results, "rU") as raw_contigs, open(path_output_file, "a") as output:
filtered_contigs = [contig for contig in raw_contigs if 6000 <= len(contig.seq) <= 14000]
# Write to output file if contig meets criteria
SeqIO.write(filtered_contigs, output, "fasta")
print('Filtered {0} contigs for ascension {1}'.format(len(filtered_contigs), filename.split('-')[0]))
# Collect & filter the assembly results in a single folder "filtered_contigs"
def collect_assemblies(download_directory, contigs_directory):
# Index the contents of the download directory
paths, directories, files = next(os.walk(download_directory))
for directory in directories:
# Define the necessary variables and use the "filter_contigs"
# function to write filtered output files
filename = '{}-8.fa'.format(directory)
path_to_results = os.path.join(download_directory, directory, filename)
filter_contigs(path_to_results, filename, contigs_directory)
print('All assembly results have been filtered!')