-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathFastaChomper_coding.py
More file actions
96 lines (79 loc) · 2.99 KB
/
FastaChomper_coding.py
File metadata and controls
96 lines (79 loc) · 2.99 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#######################################################################################
#
# Sub set only certain transcripts from a fasta file (of coding transcripts from Annocript)
#
# Usage: FastaChomper_coding.py <options> -f <fasta> -l <idlist> -o <output>
#
# Where:
# fasta = the fasta file containing all the sequences
# idlist = List of sequences IDs to keep or exclude from the fasta file. One per line.
# output = Name of the output file
#
# Options:
# -i = Keep only the sequences of the Ids included in the list
# -e = Exclude the sequence of the Ids in the list
#
#######################################################################################
#!/usr/bin/python
import sys, getopt
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
genes_seq = dict()
Ids = list()
fasta = ""
input1 = ""
keep = 0
exclude = 0
outputfile = ""
# Check for the arguments and print useful help messages
try:
opts, args = getopt.getopt(sys.argv[1:],"hief:l:o:",["fasta=","idlist=","output="])
except getopt.GetoptError:
print '\n', '#### Invalid use ####', '\n'
print 'Usage: FastaChomper_coding.py <options> -f <fasta> -l <idlist> -o <output>'
print 'For help use ClustersSeq_Corset.py -h'
sys.exit(99)
for opt, arg in opts:
if opt == '-h':
print '\n', 'Sub set only certain transcripts from a fasta file (of coding transcripts from Annocript).', '\n'
print 'Usage: FastaChomper.py <options> -f <fasta> -l <idlist> -o <output>', '\n'
print 'Where: fasta = the fasta file containing all the sequences'
print 'idlist = List of sequences IDs to keep or exclude from the fasta file. One per line.'
print 'output = Name of the output file' , '\n'
print 'Options:'
print '-i = Keep only the sequences of the Ids included in the list'
print '-e = Exclude the sequence of the Ids in the list'
sys.exit()
elif opt == "-i":
keep = 1
elif opt == "-e":
exclude = 1
elif opt in ("-f", "--fasta"):
fasta = open(arg)
elif opt in ("-l", "--idlist"):
input1 = open(arg)
elif opt in ("-o", "--output"):
outname = arg + ".fasta"
outputfile = open(outname,"w")
else:
assert False, "unhandled option"
## Make a list of the transcript IDs to search
for line in input1:
list = line.split("\n")
id = str(list[0])
Ids.append(id)
## Open the fasta file and save the information the user wants
for seq_record in SeqIO.parse(fasta, "fasta"):
gene_id = str(seq_record.id)
gene_id2 = gene_id.split("|")
gene_id2 = str(gene_id2[0]+ "|" + gene_id2[1])
bases = (seq_record.seq)
genes_seq[gene_id]= bases
if exclude == 1 and gene_id2 not in Ids:
fasta_format_string = SeqRecord(bases, id=gene_id2)
outputfile.write(fasta_format_string.format("fasta"))
if keep == 1 :
for id in Ids:
fasta_format_string = SeqRecord(genes_seq[id], id=id)
outputfile.write(fasta_format_string.format("fasta"))
outputfile.close()