Skip to content
This repository was archived by the owner on Dec 8, 2022. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,219 changes: 1,219 additions & 0 deletions stdlib/bio/seqfeature.seq

Large diffs are not rendered by default.

98 changes: 98 additions & 0 deletions stdlib/biopy/SeqUtils/checksum.seq
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@

# TODO/ CAVEATS:
# @jordan - crc32() needs to be implemented
# def crc32(seq):
# """
# crc32(seq)

# Return the crc32 checksum for a sequence (string or Seq object).
# """
# return _crc32(seq)

def _init_table_h() -> list[int]:
_table_h = list[int]()
for i in range(256):
part_l = i
part_h = 0
for j in range(8):
rflag = part_l & 1
part_l >>= 1
if part_h & 1:
part_l |= 1 << 31
part_h >>= 1
if rflag:
part_h ^= 0xD8000000
_table_h.append(part_h)
return _table_h

#Initialisation
_table_h = _init_table_h()

def crc64(s) -> str:
"""
crc64(s)

Return the crc64 checksum for a sequence (string or Seq object).

TODO/CAVEATS:
@jordan - return formatting crch and crcl as hex
"""
crcl = 0
crch = 0
for c in s:
shr = (crch & 0xFF) << 24
temp1h = crch >> 8
temp1l = (crcl >> 8) | shr
idx = (crcl ^ ord(c)) & 0xFF
crch = temp1h ^ _table_h[idx]
crcl = temp1l

# need it in 08X return f"CRC-{crch}{crcl}"
return f"CRC-{crch}{crcl}"
#"CRC-%08X%08X" % (crch, crcl)

def gcg(s) -> int:
"""
gcg(s)

Given a nucleotide or amino-acid secuence (or any string),
returns the GCG checksum (int). Checksum used by GCG program.
seq type = str.
"""

index = checksum = 0

for char in s:
index += 1
checksum += index * ord(char.upper())
if index == 57:
index = 0
return checksum % 10000

# TODO/CAVEATS:
# @jordan - This still needs to be implemented
#hashlib and base64??
# def seguid(s) -> str:
# """
# seguid(seq: seq)

# Return the SEGUID (string) for a sequence (string or Seq object).

# Given a nucleotide or amino-acid secuence (or any string),
# returns the SEGUID string (A SEquence Globally Unique IDentifier).
# seq type = str.
# """
# import hashlib
# import base64

# m = hashlib.sha1()
# m.update(_as_bytes(s.upper()))
# try:
# # For Python 3+
# tmp = base64.encodebytes(m.digest())
# return tmp.decode().replace("\n", "").rstrip("=")
# except AttributeError:
# pass
# # For all other Pythons
# return base64.b64encode(m.digest()).rstrip("=")

194 changes: 194 additions & 0 deletions stdlib/biopy/SeqUtils/codonusage.seq
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@

"""Methods for codon usage calculations."""

import math
from biopy.sequtils.codonusageindices import SharpEcoliIndex
import bio
#DONT HAVE YET from Bio import SeqIO

# Turn black code style off
# fmt: off

CodonsDict = {
"TTT": 0, "TTC": 0, "TTA": 0, "TTG": 0,
"CTT": 0, "CTC": 0, "CTA": 0, "CTG": 0,
"ATT": 0, "ATC": 0, "ATA": 0, "ATG": 0,
"GTT": 0, "GTC": 0, "GTA": 0, "GTG": 0,
"TAT": 0, "TAC": 0, "TAA": 0, "TAG": 0,
"CAT": 0, "CAC": 0, "CAA": 0, "CAG": 0,
"AAT": 0, "AAC": 0, "AAA": 0, "AAG": 0,
"GAT": 0, "GAC": 0, "GAA": 0, "GAG": 0,
"TCT": 0, "TCC": 0, "TCA": 0, "TCG": 0,
"CCT": 0, "CCC": 0, "CCA": 0, "CCG": 0,
"ACT": 0, "ACC": 0, "ACA": 0, "ACG": 0,
"GCT": 0, "GCC": 0, "GCA": 0, "GCG": 0,
"TGT": 0, "TGC": 0, "TGA": 0, "TGG": 0,
"CGT": 0, "CGC": 0, "CGA": 0, "CGG": 0,
"AGT": 0, "AGC": 0, "AGA": 0, "AGG": 0,
"GGT": 0, "GGC": 0, "GGA": 0, "GGG": 0}

# Turn black code style on
# fmt: on

# this dictionary shows which codons encode the same AA
SynonymousCodons = {
"CYS": ["TGT", "TGC"],
"ASP": ["GAT", "GAC"],
"SER": ["TCT", "TCG", "TCA", "TCC", "AGC", "AGT"],
"GLN": ["CAA", "CAG"],
"MET": ["ATG"],
"ASN": ["AAC", "AAT"],
"PRO": ["CCT", "CCG", "CCA", "CCC"],
"LYS": ["AAG", "AAA"],
"STOP": ["TAG", "TGA", "TAA"],
"THR": ["ACC", "ACA", "ACG", "ACT"],
"PHE": ["TTT", "TTC"],
"ALA": ["GCA", "GCC", "GCG", "GCT"],
"GLY": ["GGT", "GGG", "GGA", "GGC"],
"ILE": ["ATC", "ATA", "ATT"],
"LEU": ["TTA", "TTG", "CTC", "CTT", "CTG", "CTA"],
"HIS": ["CAT", "CAC"],
"ARG": ["CGA", "CGC", "CGG", "CGT", "AGG", "AGA"],
"TRP": ["TGG"],
"VAL": ["GTA", "GTC", "GTG", "GTT"],
"GLU": ["GAG", "GAA"],
"TYR": ["TAT", "TAC"],
}

class CodonAdaptationIndex:
"""
A codon adaptation index (CAI) implementation.

Implements the codon adaptation index (CAI) described by Sharp and
Li (Nucleic Acids Res. 1987 Feb 11;15(3):1281-95).

NOTE - This implementation does not currently cope with alternative genetic
codes: only the synonymous codons in the standard table are considered.
"""
index: dict[str, float]
codon_count: dict[str, int]

def __init__(self: CodonAdaptationIndex):
self.index = dict[str, float]()
self.codon_count = dict[str, int]()

# use this method with predefined CAI index
def set_cai_index(self, index):
"""
set_cai_index(self, index)

Set up an index to be used when calculating CAI for a gene.

Just pass a dictionary similar to the SharpEcoliIndex in the
CodonUsageIndices module.
"""
self.index = index

def generate_index(self, fasta_file):
"""
generate_index(self, fasta_file)

Generate a codon usage index from a FASTA file of CDS sequences.

Takes a location of a Fasta file containing CDS sequences
(which must all have a whole number of codons) and generates a codon
usage index.

RCSU values
"""
# first make sure we're not overwriting an existing index:
if self.index != dict[str, float]() or self.codon_count != dict[str, int]():
raise ValueError(
"an index has already been set or a codon count "
"has been done. Cannot overwrite either."
)

# count codon occurrences in the file.
self._count_codons(fasta_file)

# now to calculate the index we first need to sum the number of times
# synonymous codons were used all together.
for aa in SynonymousCodons:
total = 0.0
# RCSU values are CodonCount/((1/num of synonymous codons) * sum of
# all synonymous codons)
rcsu = list[float]()
codons = SynonymousCodons[aa]

for codon in codons:
total += self.codon_count[codon]

# calculate the RSCU value for each of the codons
for codon in codons:
denominator = float(total) / len(codons)
rcsu.append(self.codon_count[codon] / denominator)

# now generate the index W=RCSUi/RCSUmax:
rcsu_max = max(rcsu)
for codon_index, codon in enumerate(codons):
self.index[codon] = rcsu[codon_index] / rcsu_max

def cai_for_gene(self, dna_sequence: str) -> float:
"""
cai_for_gene(self, dna_sequence)

Calculate the CAI (float) for the provided DNA sequence (string).

This method uses the Index (either the one you set or the one you
generated) and returns the CAI for the DNA sequence.
"""
cai_value, cai_length = 0.0, 0

# if no index is set or generated, the default SharpEcoliIndex will
# be used.
if self.index == dict[str, float]():
self.set_cai_index(SharpEcoliIndex)

if dna_sequence.islower():
dna_sequence = dna_sequence.upper()

for i in range(0, len(dna_sequence), 3):
codon = dna_sequence[i : i + 3]
if codon in self.index:
# these two codons are always one, exclude them:
if codon not in ["ATG", "TGG"]:
cai_value += math.log(self.index[codon])
cai_length += 1
# some indices may not include stop codons:
elif codon not in ["TGA", "TAA", "TAG"]:
raise TypeError(
f"illegal codon in sequence: {codon}.\n{self.index}"
)

return math.exp(cai_value / (cai_length - 1.0))

def _count_codons(self, fasta_file):

# make the codon dictionary local
self.codon_count = CodonsDict.copy()

dna_sequence = ''

# iterate over sequence and count all the codons in the FastaFile.
for cur_record in bio.FASTA(fasta_file):
# make sure the sequence is lower case
if str(cur_record.seq).islower():
dna_sequence = str(cur_record.seq).upper()
else:
dna_sequence = str(cur_record.seq)
for i in range(0, len(dna_sequence), 3):
codon = dna_sequence[i : i + 3]
if codon in self.codon_count:
self.codon_count[codon] += 1
else:
raise TypeError(
f"illegal codon {codon} in gene: {cur_record}"
)

def print_index(self):
"""
Print out the index used.
This just gives the index when the objects is printed.
"""
for i in sorted(self.index):
print(f"{i}\t{self.index[i]}")
25 changes: 25 additions & 0 deletions stdlib/biopy/SeqUtils/codonusageindices.seq
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
"""
Codon adaption indxes, including Sharp and Li (1987) E. coli index.

Currently this module only defines a single codon adaption index from
Sharp & Li, Nucleic Acids Res. 1987.
"""
# Turn black code style off
# fmt: off

SharpEcoliIndex = {
"GCA": 0.586, "GCC": 0.122, "GCG": 0.424, "GCT": 1.0, "AGA": 0.004,
"AGG": 0.002, "CGA": 0.004, "CGC": 0.356, "CGG": 0.004, "CGT": 1.0, "AAC": 1.0,
"AAT": 0.051, "GAC": 1.0, "GAT": 0.434, "TGC": 1.0, "TGT": 0.5, "CAA": 0.124,
"CAG": 1.0, "GAA": 1.0, "GAG": 0.259, "GGA": 0.01, "GGC": 0.724, "GGG": 0.019,
"GGT": 1.0, "CAC": 1.0, "CAT": 0.291, "ATA": 0.003, "ATC": 1.0, "ATT": 0.185,
"CTA": 0.007, "CTC": 0.037, "CTG": 1.0, "CTT": 0.042, "TTA": 0.02,
"TTG": 0.02, "AAA": 1.0, "AAG": 0.253, "ATG": 1.0, "TTC": 1.0, "TTT": 0.296,
"CCA": 0.135, "CCC": 0.012, "CCG": 1.0, "CCT": 0.07, "AGC": 0.41,
"AGT": 0.085, "TCA": 0.077, "TCC": 0.744, "TCG": 0.017, "TCT": 1.0,
"ACA": 0.076, "ACC": 1.0, "ACG": 0.099, "ACT": 0.965, "TGG": 1.0, "TAC": 1.0,
"TAT": 0.239, "GTA": 0.495, "GTC": 0.066, "GTG": 0.221, "GTT": 1.0
}

# Turn black code style on
# fmt: on
Loading