seq-lang · inumanag · Dec 9, 2019 · Dec 9, 2019 · Dec 10, 2019 · Dec 10, 2019
@@ -0,0 +1,98 @@
+
+# TODO/ CAVEATS:
+# @jordan - crc32() needs to be implemented
+# def crc32(seq):
+#     """
+#     crc32(seq)
+
+#     Return the crc32 checksum for a sequence (string or Seq object).
+#     """
+#     return _crc32(seq)
+
+def _init_table_h() -> list[int]:
+    _table_h = list[int]()
+    for i in range(256):
+        part_l = i
+        part_h = 0
+        for j in range(8):
+            rflag = part_l & 1
+            part_l >>= 1
+            if part_h & 1:
+                part_l |= 1 << 31
+            part_h >>= 1
+            if rflag:
+                part_h ^= 0xD8000000
+        _table_h.append(part_h)
+    return _table_h
+
+#Initialisation
+_table_h = _init_table_h()
+
+def crc64(s) -> str:
+    """
+    crc64(s)
+
+    Return the crc64 checksum for a sequence (string or Seq object).
+
+    TODO/CAVEATS:
+    @jordan - return formatting crch and crcl as hex
+    """
+    crcl = 0
+    crch = 0
+    for c in s:
+        shr = (crch & 0xFF) << 24
+        temp1h = crch >> 8
+        temp1l = (crcl >> 8) | shr
+        idx = (crcl ^ ord(c)) & 0xFF
+        crch = temp1h ^ _table_h[idx]
+        crcl = temp1l
+
+    # need it in 08X return f"CRC-{crch}{crcl}" 
+    return f"CRC-{crch}{crcl}"
+    #"CRC-%08X%08X" % (crch, crcl)
+
+def gcg(s) -> int:
+    """
+    gcg(s)
+
+    Given a nucleotide or amino-acid secuence (or any string),
+    returns the GCG checksum (int). Checksum used by GCG program.
+    seq type = str.
+    """
+
+    index = checksum = 0
+
+    for char in s:
+        index += 1
+        checksum += index * ord(char.upper())
+        if index == 57:
+            index = 0
+    return checksum % 10000
+
+# TODO/CAVEATS:
+# @jordan - This still needs to be implemented
+#hashlib and base64??
+# def seguid(s) -> str:
+#     """
+#     seguid(seq: seq)
+
+#     Return the SEGUID (string) for a sequence (string or Seq object).
+
+#     Given a nucleotide or amino-acid secuence (or any string),
+#     returns the SEGUID string (A SEquence Globally Unique IDentifier).
+#     seq type = str.
+#     """
+#     import hashlib
+#     import base64
+
+#     m = hashlib.sha1()
+#     m.update(_as_bytes(s.upper()))
+#     try:
+#         # For Python 3+
+#         tmp = base64.encodebytes(m.digest())
+#         return tmp.decode().replace("\n", "").rstrip("=")
+#     except AttributeError:
+#         pass
+#     # For all other Pythons
+#     return base64.b64encode(m.digest()).rstrip("=")
+
@@ -0,0 +1,194 @@
+
+"""Methods for codon usage calculations."""
+
+import math
+from biopy.sequtils.codonusageindices import SharpEcoliIndex
+import bio
+#DONT HAVE YET from Bio import SeqIO
+
+# Turn black code style off
+# fmt: off
+
+CodonsDict = {
+    "TTT": 0, "TTC": 0, "TTA": 0, "TTG": 0,
+    "CTT": 0, "CTC": 0, "CTA": 0, "CTG": 0,
+    "ATT": 0, "ATC": 0, "ATA": 0, "ATG": 0,
+    "GTT": 0, "GTC": 0, "GTA": 0, "GTG": 0,
+    "TAT": 0, "TAC": 0, "TAA": 0, "TAG": 0,
+    "CAT": 0, "CAC": 0, "CAA": 0, "CAG": 0,
+    "AAT": 0, "AAC": 0, "AAA": 0, "AAG": 0,
+    "GAT": 0, "GAC": 0, "GAA": 0, "GAG": 0,
+    "TCT": 0, "TCC": 0, "TCA": 0, "TCG": 0,
+    "CCT": 0, "CCC": 0, "CCA": 0, "CCG": 0,
+    "ACT": 0, "ACC": 0, "ACA": 0, "ACG": 0,
+    "GCT": 0, "GCC": 0, "GCA": 0, "GCG": 0,
+    "TGT": 0, "TGC": 0, "TGA": 0, "TGG": 0,
+    "CGT": 0, "CGC": 0, "CGA": 0, "CGG": 0,
+    "AGT": 0, "AGC": 0, "AGA": 0, "AGG": 0,
+    "GGT": 0, "GGC": 0, "GGA": 0, "GGG": 0}
+
+# Turn black code style on
+# fmt: on
+
+# this dictionary shows which codons encode the same AA
+SynonymousCodons = {
+    "CYS": ["TGT", "TGC"],
+    "ASP": ["GAT", "GAC"],
+    "SER": ["TCT", "TCG", "TCA", "TCC", "AGC", "AGT"],
+    "GLN": ["CAA", "CAG"],
+    "MET": ["ATG"],
+    "ASN": ["AAC", "AAT"],
+    "PRO": ["CCT", "CCG", "CCA", "CCC"],
+    "LYS": ["AAG", "AAA"],
+    "STOP": ["TAG", "TGA", "TAA"],
+    "THR": ["ACC", "ACA", "ACG", "ACT"],
+    "PHE": ["TTT", "TTC"],
+    "ALA": ["GCA", "GCC", "GCG", "GCT"],
+    "GLY": ["GGT", "GGG", "GGA", "GGC"],
+    "ILE": ["ATC", "ATA", "ATT"],
+    "LEU": ["TTA", "TTG", "CTC", "CTT", "CTG", "CTA"],
+    "HIS": ["CAT", "CAC"],
+    "ARG": ["CGA", "CGC", "CGG", "CGT", "AGG", "AGA"],
+    "TRP": ["TGG"],
+    "VAL": ["GTA", "GTC", "GTG", "GTT"],
+    "GLU": ["GAG", "GAA"],
+    "TYR": ["TAT", "TAC"],
+}
+
+class CodonAdaptationIndex:
+    """
+    A codon adaptation index (CAI) implementation.
+
+    Implements the codon adaptation index (CAI) described by Sharp and
+    Li (Nucleic Acids Res. 1987 Feb 11;15(3):1281-95).
+
+    NOTE - This implementation does not currently cope with alternative genetic
+    codes: only the synonymous codons in the standard table are considered.
+    """
+    index: dict[str, float]
+    codon_count: dict[str, int]
+
+    def __init__(self: CodonAdaptationIndex):
+        self.index = dict[str, float]()
+        self.codon_count = dict[str, int]()
+
+    # use this method with predefined CAI index
+    def set_cai_index(self, index):
+        """
+        set_cai_index(self, index)
+
+        Set up an index to be used when calculating CAI for a gene.
+
+        Just pass a dictionary similar to the SharpEcoliIndex in the
+        CodonUsageIndices module.
+        """
+        self.index = index
+
+    def generate_index(self, fasta_file):
+        """
+        generate_index(self, fasta_file)
+
+        Generate a codon usage index from a FASTA file of CDS sequences.
+
+        Takes a location of a Fasta file containing CDS sequences
+        (which must all have a whole number of codons) and generates a codon
+        usage index.
+
+        RCSU values
+        """
+        # first make sure we're not overwriting an existing index:
+        if self.index != dict[str, float]() or self.codon_count != dict[str, int]():
+            raise ValueError(
+                "an index has already been set or a codon count "
+                "has been done. Cannot overwrite either."
+            )
+
+        # count codon occurrences in the file.
+        self._count_codons(fasta_file)
+
+        # now to calculate the index we first need to sum the number of times
+        # synonymous codons were used all together.
+        for aa in SynonymousCodons:
+            total = 0.0
+            # RCSU values are CodonCount/((1/num of synonymous codons) * sum of
+            # all synonymous codons)
+            rcsu = list[float]()
+            codons = SynonymousCodons[aa]
+
+            for codon in codons:
+                total += self.codon_count[codon]
+
+            # calculate the RSCU value for each of the codons
+            for codon in codons:
+                denominator = float(total) / len(codons)
+                rcsu.append(self.codon_count[codon] / denominator)
+
+            # now generate the index W=RCSUi/RCSUmax:
+            rcsu_max = max(rcsu)
+            for codon_index, codon in enumerate(codons):
+                self.index[codon] = rcsu[codon_index] / rcsu_max
+
+    def cai_for_gene(self, dna_sequence: str) -> float:
+        """
+        cai_for_gene(self, dna_sequence)
+
+        Calculate the CAI (float) for the provided DNA sequence (string).
+
+        This method uses the Index (either the one you set or the one you
+        generated) and returns the CAI for the DNA sequence.
+        """
+        cai_value, cai_length = 0.0, 0
+
+        # if no index is set or generated, the default SharpEcoliIndex will
+        # be used.
+        if self.index == dict[str, float]():
+            self.set_cai_index(SharpEcoliIndex)
+
+        if dna_sequence.islower():
+            dna_sequence = dna_sequence.upper()
+
+        for i in range(0, len(dna_sequence), 3):
+            codon = dna_sequence[i : i + 3]
+            if codon in self.index:
+                # these two codons are always one, exclude them:
+                if codon not in ["ATG", "TGG"]:
+                    cai_value += math.log(self.index[codon])
+                    cai_length += 1
+            # some indices may not include stop codons:
+            elif codon not in ["TGA", "TAA", "TAG"]:
+                raise TypeError(
+                    f"illegal codon in sequence: {codon}.\n{self.index}"
+                )
+
+        return math.exp(cai_value / (cai_length - 1.0))
+
+    def _count_codons(self, fasta_file):
+
+        # make the codon dictionary local
+        self.codon_count = CodonsDict.copy()
+
+        dna_sequence = ''
+
+        # iterate over sequence and count all the codons in the FastaFile.
+        for cur_record in bio.FASTA(fasta_file):
+            # make sure the sequence is lower case
+            if str(cur_record.seq).islower():
+                dna_sequence = str(cur_record.seq).upper()
+            else:
+                dna_sequence = str(cur_record.seq)
+            for i in range(0, len(dna_sequence), 3):
+                codon = dna_sequence[i : i + 3]
+                if codon in self.codon_count:
+                    self.codon_count[codon] += 1
+                else:
+                    raise TypeError(
+                        f"illegal codon {codon} in gene: {cur_record}" 
+                    )
+
+    def print_index(self):
+        """
+        Print out the index used.
+        This just gives the index when the objects is printed.
+        """
+        for i in sorted(self.index):
+            print(f"{i}\t{self.index[i]}")
@@ -0,0 +1,25 @@
+"""
+Codon adaption indxes, including Sharp and Li (1987) E. coli index.
+
+Currently this module only defines a single codon adaption index from
+Sharp & Li, Nucleic Acids Res. 1987.
+"""
+# Turn black code style off
+# fmt: off
+
+SharpEcoliIndex = {
+    "GCA": 0.586, "GCC": 0.122, "GCG": 0.424, "GCT": 1.0, "AGA": 0.004,
+    "AGG": 0.002, "CGA": 0.004, "CGC": 0.356, "CGG": 0.004, "CGT": 1.0, "AAC": 1.0,
+    "AAT": 0.051, "GAC": 1.0, "GAT": 0.434, "TGC": 1.0, "TGT": 0.5, "CAA": 0.124,
+    "CAG": 1.0, "GAA": 1.0, "GAG": 0.259, "GGA": 0.01, "GGC": 0.724, "GGG": 0.019,
+    "GGT": 1.0, "CAC": 1.0, "CAT": 0.291, "ATA": 0.003, "ATC": 1.0, "ATT": 0.185,
+    "CTA": 0.007, "CTC": 0.037, "CTG": 1.0, "CTT": 0.042, "TTA": 0.02,
+    "TTG": 0.02, "AAA": 1.0, "AAG": 0.253, "ATG": 1.0, "TTC": 1.0, "TTT": 0.296,
+    "CCA": 0.135, "CCC": 0.012, "CCG": 1.0, "CCT": 0.07, "AGC": 0.41,
+    "AGT": 0.085, "TCA": 0.077, "TCC": 0.744, "TCG": 0.017, "TCT": 1.0,
+    "ACA": 0.076, "ACC": 1.0, "ACG": 0.099, "ACT": 0.965, "TGG": 1.0, "TAC": 1.0,
+    "TAT": 0.239, "GTA": 0.495, "GTC": 0.066, "GTG": 0.221, "GTT": 1.0
+}
+
+# Turn black code style on
+# fmt: on