diff --git a/stdlib/bio/seqfeature.seq b/stdlib/bio/seqfeature.seq new file mode 100644 index 000000000..bb3fe186d --- /dev/null +++ b/stdlib/bio/seqfeature.seq @@ -0,0 +1,1219 @@ + +""" +Represent a Sequence Feature holding info about a part of a sequence. +TODO/CAVEATS: +@jordan- For now MutableSeq is left out +""" +EXACT = 0 +BEFORE = 1 +AFTER = 2 + +class Position: + """ + Specify the specific position of a boundary. + """ + pos: int + mode: int + extension: int + + def __init__(cls: Position, position: int, mode: int, extension: int=0): + if extension != 0: + raise AttributeError(f"Non-zero extension {extension} for exact position.") + + if not 0 <= mode <= 2: + raise ValueError(f"mode must be a value of EXACT={EXACT}, BEFORE={BEFORE}, AFTER={AFTER}.") + + cls.pos = position + cls.mode = mode + + @property + def position(self: Position): + """ + Legacy attribute to get position as integer (OBSOLETE). + """ + return self.pos + + @property + def extension(self: Position): # noqa: D402 + """ + Legacy attribute to get extension (zero) as integer (OBSOLETE). + """ + return 0 + + def __repr__(self: Position): + """ + Represent the location as a string for debugging. + """ + return f"{self.__class__.__name__}({self.pos})" + + def __str__(self: Position): + """ + Return a representation of the Position object. + """ + if self.mode == EXACT: + return f"{self.pos}" + elif self.mode == BEFORE: + return f"<{self.pos}" + else: + return f">{self.pos}" + + def _shift(self: Position, offset: int): + """ + Return a copy of the position object with its location shifted (PRIVATE). + """ + return self.__class__(self.pos + offset) + + def _flip(self: Position, length: int): + """ + Return a copy of the location after the parent is reversed (PRIVATE). + """ + if self.mode == EXACT: + # By default perserve any subclass + return self.__class__(length = self.pos) + elif self.mode == BEFORE: + return Position(length - self.pos, 2, 0) + else: + return Position(length - self.pos, 1, 0) + +# class WithinPosition(int, AbstractPosition): +class WithinPosition: + """ + Specify the position of a boundary within some coordinates. + """ + pos: int + left: int + right: int + + def __init__(cls: WithinPosition, position: int, left: int, right: int): + """ + Create a WithinPosition object. + """ + if not (position == left or position == right): + raise RuntimeError(f"WithinPosition: {position} should match left {left} or right {right}") + cls.pos = position + cls.left = left + cls.right = right + + def __getnewargs__(self: WithinPosition): + """ + Return the arguments accepted by __init__. + Necessary to allow pickling and unpickling of class instances. + """ + return (int(self), self._left, self._right) + + def __repr__(self: WithinPosition): + """ + Represent the WithinPosition object as a string for debugging. + """ + return f"{self.__class__.__name__}({int(self)}, left={self._left}, right={self._right})" \ + + def __str__(self: WithinPosition): + """ + Return a representation of the WithinPosition object. + """ + return f"({self.left}.{self.right})" + + @property + def position(self: WithinPosition): + """ + Legacy attribute to get (left) position as integer (OBSOLETE). + """ + return self.pos + + @property + def extension(self: WithinPosition): # noqa: D402 + """ + Legacy attribute to get extension (from left to right) as an integer (OBSOLETE). + """ + return self.right - self.left + + def _shift(self: WithinPosition, offset: int): + """ + Return a copy of the position object with its location shifted (PRIVATE). + """ + return self.__class__(int(self) + offset, + self.left + offset, + self.right + offset) + + def _flip(self: WithinPosition, length: int): + """ + Return a copy of the location after the parent is reversed (PRIVATE). + """ + return self.__class__(length - int(self), + length - self.right, + length - self.left) + +extend int: + def __init__(self: int, w: WithinPosition): + return w.pos + +# --- Handling feature locations + +#class FeatureLocation(object): +class FeatureLocation: + """ + Specify the location of a feature along a sequence. + """ + _start: int + _end: int + _strand: int + _ref: str + _ref_db: str + + def __init__(self: FeatureLocation, start: int, end: int, strand: optional[int]=None, ref: optional[str]=None, ref_db: optional[str]=None): + """ + Initialize the class. + """ + + self._start = start + self._end = end + + if self._start > self._end: + raise ValueError(f"End location {self._end} must be greater than or equal to start location {self._start}.") + + self._strand = strand + self._ref = ref + self._ref_db = ref_db + + def __init__(self: FeatureLocation, start: Position, end: Position, strand: int, ref: str, ref_db: str): + """ + Initialize the class. + """ + + self._start = start.pos + self._end = end.pos + + if self._start > self._end: + raise ValueError(f"""End location ({self._end}) must be greater than or equal + to start location ({self._start})""") + self._strand = strand + self._ref = ref + self._ref_db = ref_db + + # def __init__(self: FeatureLocation, start: Position, end: WithinPosition, strand: int, ref: str, ref_db: str): + # """ + # Initialize the class. + # """ + + # self._start = start.pos + # self._end = end.pos + + # if self._start > self._end: + # raise ValueError(f"""End location ({self._end}) must be greater than or equal + # to start location ({self._start})""") + # self._strand = strand + # self._ref = ref + # self._ref_db = ref_db + + def _get_strand(self: FeatureLocation) -> int: + """ + Get function for the strand property (PRIVATE). + """ + return self._strand + + def _set_strand(self: FeatureLocation, value: int): + """ + Set function for the strand property (PRIVATE). + """ + if value not in [+1, -1, 0, None]: + raise ValueError(f"Strand should be +1, -1, 0 or None, not {value}") + self._strand = value + + @property + def strand(self: FeatureLocation, fget: optional[int]=None, fset: optional[int]=None, doc: str="Strand of the location (+1, -1, 0 or None)."): + if fget is None: + fget = self._get_strand + if fset is None: + fset = self._set_strand + return self._strand + + def __str__(self: FeatureLocation): + """ + Return a representation of the FeatureLocation object. + """ + answer = f"[{self._start}:{self._end}]" + if self._ref and self._ref_db: + answer = f"{self._ref_db}:{self._ref}{answer}" + elif self._ref: + answer = self._ref + answer + if self._strand: + return answer + elif self._strand == +1: + return answer + "(+)" + elif self._strand == -1: + return answer + "(-)" + else: + # strand = 0, stranded but strand unknown, ? in GFF3 + return answer + "(?)" + + def __repr__(self: FeatureLocation): + """ + Represent the FeatureLocation object as a string for debugging. + """ + optional = "" + if self._strand is not None: + optional += F", strand={self._strand}" + if self.ref is not None: + optional += f", ref={self._ref}" + if self.ref_db is not None: + optional += f", ref_db={self._ref_db}" + return f"{self.__class__.__name__}({self._start}, {self._end}{optional})" \ + + # Not sure about `other` type + def __radd__(self: FeatureLocation, other: int): + """ + Add a feature locationanother FeatureLocation object to the left. + """ + return self._shift(other) + + + def __nonzero__(self: FeatureLocation): + """ + Return True regardless of the length of the feature. + """ + return True + + def __len__(self: FeatureLocation): + """ + Return the length of the region described by the FeatureLocation object. + """ + return int(self._end) - int(self._start) + + def __contains__(self: FeatureLocation, value: int): + """ + Check if an integer position is within the FeatureLocation object. + """ + if value < self._start or value >= self._end: + return False + else: + return True + + def __iter__(self: FeatureLocation): + """ + Iterate over the parent positions within the FeatureLocation object. + """ + if self.strand == -1: + for i in range(self._end - 1, self._start - 1, -1): + yield i + else: + for i in range(self._start, self._end): + yield i + + def __eq__(self: FeatureLocation, other: FeatureLocation): + """ + Implement equality by comparing all the location attributes. + """ + return self._start == other._start and \ + self._end == other._end and \ + self._strand == other._strand and \ + self._ref == other._ref and \ + self._ref_db == other._ref_db + + def __ne__(self: FeatureLocation, other: FeatureLocation): + """ + Implement the not-equal operand. + """ + return not self == other + + def _shift(self: FeatureLocation, offset: int): + """ + Return a copy of the FeatureLocation shifted by an offset (PRIVATE). + """ + if self._ref or self._ref_db: + raise ValueError("Feature references another sequence.") + return FeatureLocation(start=self._start._shift(offset), + end=self._end._shift(offset), + strand=self._strand) + + def _flip(self: FeatureLocation, length): + """ + Return a copy of the location after the parent is reversed (PRIVATE). + """ + flip_strand = 0 + if self.ref or self._ref_db: + raise ValueError("Feature references another sequence.") + # Note this will flip the start and end too! + if self._strand == +1: + flip_strand = -1 + elif self._strand == -1: + flip_strand = +1 + else: + # 0 or None + flip_strand = self._strand + return FeatureLocation(start=self._end._flip(length), + end=self._start._flip(length), + strand=flip_strand) + + @property + def parts(self: FeatureLocation): + """ + Read only list of sections (always one, the FeatureLocation object). + """ + return [self] + + @property + def start(self: FeatureLocation): + """ + Start location - left most (minimum) value, regardless of strand. + """ + return self._start + + @property + def end(self: FeatureLocation): + """ + End location - right most (maximum) value, regardless of strand. + """ + return self._end + + @property + def nofuzzy_start(self: FeatureLocation): + """ + Start position (integer, approximated if fuzzy, read only) (OBSOLETE). + """ + return self._start + + @property + def nofuzzy_end(self: FeatureLocation): + """ + End position (integer, approximated if fuzzy, read only) (OBSOLETE). + """ + return self._end + + # TODO/CAVEATS: + # @jordan- reverse_complement needs to be implemented still + # def extract(self: FeatureLocation, parent_sequence: seq): + # """ + # Extract the sequence from supplied parent sequence using the FeatureLocation object. + # """ + # if self.ref or self.ref_db: + # raise ValueError("Feature references another sequence.") + + # f_seq = parent_sequence[self.nofuzzy_start:self.nofuzzy_end] + # if self.strand == -1: + # f_seq = reverse_complement(f_seq) + + # return f_seq + + # def extract(self: FeatureLocation, parent_sequence: str): + # """ + # Extract the sequence from supplied parent sequence using the FeatureLocation object. + # """ + # if self.ref or self.ref_db: + # raise ValueError("Feature references another sequence.") + + # f_seq = parent_sequence[self.nofuzzy_start:self.nofuzzy_end] + # if self.strand == -1: + # f_seq = reverse_complement(f_seq) + + # return f_seq + +# class SeqFeature(object): +class SeqFeature: + """ + Represent a Sequence Feature on an object. + """ + location: FeatureLocation + typ: str + location_operator: str + strand: int + id: str + qualifiers: dict[str, list[str]] + ref: str + ref_db: str + + + def __init__(self: SeqFeature, location: optional[FeatureLocation]=None, typ: str="", location_operator: str="", + strand: optional[int]=None, id: str="", + qualifiers: optional[dict[str, list[str]]]=None, + ref: optional[str]=None, ref_db: optional[str]=None): + """ + Initialize a SeqFeature on a Sequence. + location can either be a FeatureLocation (with strand argument also + given if required), or None. + + TODO/CAVEATS: + @jordan - took out sub_features from parameter for now + - Because OrderedDict is not implemented yet, qualifiers is manditory + """ + + self.location = location + self.typ = typ + + if location_operator: + self.location_operator = location_operator + + if strand is not None: + self.strand = strand + + self.id = id + self.qualifiers = qualifiers + + if ref: + self.ref = ref + + if ref_db: + self.ref_db = ref_db + + def _get_strand(self: SeqFeature): + """ + Get function for the strand property (PRIVATE). + """ + return self.location.strand + + def _set_strand(self: SeqFeature, value: int): + """ + Set function for the strand property (PRIVATE). + """ + self.location.strand = value + + @property + def strand(self: SeqFeature, fget: optional[int]=None, fset: optional[int]= None, doc="""Feature's strand + This is a shortcut for feature.location.strand"""): + if fget is None: + fget = self._get_strand + if fset is None: + fset = self._set_strand + return self._strand + + + def _get_ref(self: SeqFeature): + """ + Get function for the reference property (PRIVATE). + """ + return self.location.ref + + + def _set_ref(self: SeqFeature, value): + """ + Set function for the reference property (PRIVATE). + """ + self.location.ref = value + + @property + def ref(self: SeqFeature, fget: optional[int]=None, fset: optional[int]= None, + doc="""Feature location reference (e.g. accession). + This is a shortcut for feature.location.ref + """): + if fget is None: + fget = self._get_ref + if fset is None: + fset = self._set_ref + + return self._ref + + def _get_ref_db(self: SeqFeature): + """ + Get function for the database reference property (PRIVATE). + """ + return self.location.ref_db + + + def _set_ref_db(self: SeqFeature, value): + """ + Set function for the database reference property (PRIVATE). + """ + self.location.ref_db = value + + @property + def ref_db(self: SeqFeature, fget: optional[int]= None, fset: optional[int]=None, + doc="""Feature location reference's database. + This is a shortcut for feature.location.ref_db + """): + if fget is None: + fget = self._get_ref_db + if fset is None: + fset = self._set_ref_db + return self._ref_db + + def _get_location_operator(self: SeqFeature): + """ + Get function for the location operator property (PRIVATE). + """ + return self.location.operator + + def _set_location_operator(self, value): + """ + Set function for the location operator property (PRIVATE). + """ + self.location.operator = value + + @property + def location_operator(self: SeqFeature, fget: optional[str]=None, fset: optional[str]=None, + doc="Location operator for compound locations (e.g. join)."): + if fget is None: + fget = self._get_location_operator + if fset is None: + fset = self._set_location_operator + return self._location_operator + + def __repr__(self: SeqFeature): + """ + Represent the feature as a string for debugging. + """ + answer = f"{self.__class__.__name__}({self.repr(self.location)}" + if self.typ: + answer += f", type={self.repr(self.typ)}" + if self.location_operator: + answer += f", location_operator={self.repr(self.location_operator)}" + if self.id and self.id != "": + answer += f", id={self.repr(self.id)}" + if self.ref: + answer += f", ref={self.repr(self.ref)}" + if self.ref_db: + answer += f", ref_db={self.repr(self.ref_db)}" + answer += ")" + return answer + + def __str__(self: SeqFeature): + """ + Return the full feature as a python string. + """ + out = f"type: {self.typ}\n" + out += f"location: {self.location}\n" + if self.id and self.id != "": + out += f"id: {self.id}\n" + out += "qualifiers:\n" + for qual_key in sorted(self.qualifiers): + out += f" Key: {qual_key}, Value: {self.qualifiers[qual_key]}\n" + return out + + def _shift(self: SeqFeature, offset): + """ + Return a copy of the feature with its location shifted (PRIVATE). + The annotation qaulifiers are copied. + """ + return SeqFeature(location=self.location._shift(offset), + typ=self.typ, + location_operator=self.location_operator, + id=self.id, + qualifiers=self.OrderedDict(self.qualifiers.items())) + + def _flip(self: SeqFeature, length: int): + """ + Return a copy of the feature with its location flipped (PRIVATE). + """ + return SeqFeature(location=self.location._flip(length), + typ=self.typ, + location_operator=self.location_operator, + id=self.id, + qualifiers=self.OrderedDict(self.qualifiers.items())) + + def extract(self: SeqFeature, parent_sequence: seq): + """ + Extract the feature's sequence from supplied parent sequence. + """ + if self.location is None: + raise ValueError("The feature's .location is None. Check the " + "sequence file for a valid location.") + return self.location.extract(parent_sequence) + + def extract(self: SeqFeature, parent_sequence: str): + """ + Extract the feature's sequence from supplied parent sequence. + """ + if self.location is None: + raise ValueError("The feature's .location is None. Check the " + "sequence file for a valid location.") + return self.location.extract(parent_sequence) + + def translate(self: SeqFeature, parent_sequence, table="Standard", start_offset: optional[int]=None, + stop_symbol="*", to_stop=False, cds=None, gap=None): + """ + Get a translation of the feature's sequence. + """ + # see if this feature should be translated in a different + # frame using the "codon_start" qualifier + if start_offset is None: + try: + start_offset = int(self.qualifiers["codon_start"][0]) - 1 + + except KeyError: + start_offset = 0 + + if start_offset not in [0, 1, 2]: + raise ValueError(f"""The start_offset must be 0, 1, or 2. + The supplied value is {start_offset}. Check the value + of either the codon_start qualifier or + the start_offset argument""") + + feat_seq = self.extract(parent_sequence)[start_offset:] + codon_table = self.qualifiers.get("transl_table", [table])[0] + + if cds is None: + cds = (self.typ == "CDS") + + return feat_seq.translate(table=codon_table, stop_symbol=stop_symbol, + to_stop=to_stop, cds=cds, gap=gap) + + def __bool__(self): + """ + Boolean value of an instance of this class (True). + """ + return True + + def __len__(self): + """ + Return the length of the region where the feature is located. + """ + return len(self.location) + + def __iter__(self): + """ + Iterate over the parent positions within the feature. + """ + return iter(self.location) + + def __contains__(self, value): + """ + Check if an integer position is within the feature. + """ + return value in self.location + + +# --- References + +#class Reference(object): +class Reference: + """ + Represent a Generic Reference object. + """ + location: list[FeatureLocation] + authors: str + title: str + journal: str + medline_id: str + pubmed_id: str + comment: str + + def __init__(self: Reference): + """ + Initialize the class. + """ + self.location = list[FeatureLocation]() + self.authors = "" + self.consrtm = "" + self.title = "" + self.journal = "" + self.medline_id = "" + self.pubmed_id = "" + self.comment = "" + + def __str__(self: Reference): + """ + Return the full Reference object as a python string. + """ + out = "" + for single_location in self.location: + out += f"location: {single_location}\n" + out += f"authors: {self.authors}\n" + if self.consrtm: + out += f"consrtm: {self.consrtm}\n" + out += f"title: {self.title}\n" + out += f"journal: {self.journal}\n" + out += f"medline id: {self.medline_id}\n" + out += f"pubmed id: {self.pubmed_id}\n" + out += f"comment: {self.comment}\n" + return out + + def __repr__(self: Reference): + """ + Represent the Reference object as a string for debugging. + """ + return f"{self.__class__.__name__}(title={self.repr(self.title)}, ...)" + + def __eq__(self: Reference, other: Reference): + """ + Check if two Reference objects should be considered equal. + """ + return self.authors == other.authors and \ + self.consrtm == other.consrtm and \ + self.title == other.title and \ + self.journal == other.journal and \ + self.medline_id == other.medline_id and \ + self.pubmed_id == other.pubmed_id and \ + self.comment == other.comment and \ + self.location == other.location + + def __ne__(self: Reference, other: Reference): + """ + Implement the not-equal operand. + """ + return not self == other + +#class CompoundLocation(object): +class CompoundLocation: + """ + For handling joins etc where a feature location has several parts. + """ + parts: list[FeatureLocation] + operator: str + + def __init__(self: CompoundLocation, parts: list[FeatureLocation], operator: str="join"): + """ + Initialize the class. + """ + self.operator = operator + self.parts = parts + + if len(parts) < 2: + raise ValueError(f"CompoundLocation should have at least 2 parts, not {parts}") + + def __str__(self: CompoundLocation): + """ + Return a representation of the CompoundLocation object. + """ + return f"""{self.operator}{{(", ".join(list(str(loc) for loc in self.parts)))}} """ + + def __repr__(self: CompoundLocation): + """ + Represent the CompoundLocation object as string for debugging. + """ + return f"{self.__class__.__name__}({self.parts}, {self.operator})" + + + def _get_strand(self: CompoundLocation): + """ + Get function for the strand property (PRIVATE). + """ + if len({loc.strand for loc in self.parts}) == 1: + return self.parts[0].strand + else: + return None # i.e. mixed strands + + def _set_strand(self: CompoundLocation, value: int): + """ + Set function for the strand property (PRIVATE). + """ + for loc in self.parts: + loc.strand = value + + @property + def strand(self: CompoundLocation, fget: optional[int]=None, fset: optional[int]=None, + doc="""Overall strand of the compound location. + If all the parts have the same strand, that is returned. Otherwise + for mixed strands, this returns None. + >>> from Bio.SeqFeature import FeatureLocation, CompoundLocation + >>> f1 = FeatureLocation(15, 17, strand=1) + >>> f2 = FeatureLocation(20, 30, strand=-1) + >>> f = f1 + f2 + >>> f1.strand + 1 + >>> f2.strand + -1 + >>> f.strand + >>> f.strand is None + True + If you set the strand of a CompoundLocation, this is applied to + all the parts - use with caution: + >>> f.strand = 1 + >>> f1.strand + 1 + >>> f2.strand + 1 + >>> f.strand + 1 + """): + if fget is None: + fget = self._get_strand + if fset is None: + fset = self._set_strand + return self._strand + + def __add__(self: CompoundLocation, other: FeatureLocation): + """ + Combine locations. + """ + return CompoundLocation(self.parts + [other], self.operator) + + def __add__(self: CompoundLocation, other: CompoundLocation): + """ + Combine locations. + """ + if self.operator != other.operator: + # Handle join+order -> order as a special case? + raise ValueError(F"Mixed operators {self.operator} and {other.operator}") + return CompoundLocation(self.parts + other.parts, self.operator) + + def __add__(self: CompoundLocation, other: int): + """ + Shift the location by an integer offset. + """ + return self._shift(other) + + + def __radd__(self: CompoundLocation, other: FeatureLocation): + """ + Add a feature to the left. + """ + return CompoundLocation([other] + self.parts, self.operator) + + + def __radd__(self: CompoundLocation, other: int): + """ + Add a feature to the left. + """ + return self._shift(other) + + + def __contains__(self: CompoundLocation, value: int): + """ + Check if an integer position is within the CompoundLocation object. + """ + for loc in self.parts: + if value in loc: + return True + return False + + def __nonzero__(self: CompoundLocation): + """ + Return True regardless of the length of the feature. + """ + return True + + def __len__(self: CompoundLocation): + """ + Return the length of the CompoundLocation object. + """ + return sum(len(loc) for loc in self.parts) + + def __iter__(self: CompoundLocation): + """ + Iterate over the parent positions within the CompoundLocation object. + """ + for loc in self.parts: + for pos in loc: + yield pos + + def __eq__(self: CompoundLocation, other: CompoundLocation): + """ + Check if all parts of CompoundLocation are equal to all parts of other CompoundLocation. + """ + if len(self.parts) != len(other.parts): + return False + if self.operator != other.operator: + return False + for self_part, other_part in zip(self.parts, other.parts): + if self_part != other_part: + return False + return True + + def __ne__(self: CompoundLocation, other: CompoundLocation): + """ + Implement the not-equal operand. + """ + return not self == other + + def _shift(self: CompoundLocation, offset: int): + """ + Return a copy of the CompoundLocation shifted by an offset (PRIVATE). + """ + return CompoundLocation([loc._shift(offset) for loc in self.parts], + self.operator) + + def _flip(self: CompoundLocation, length: int): + """ + Return a copy of the locations after the parent is reversed (PRIVATE). + """ + return CompoundLocation([loc._flip(length) for loc in self.parts], + self.operator) + + @property + def start(self: CompoundLocation): + """ + Start location - left most (minimum) value, regardless of strand. + """ + return min(loc.start for loc in self.parts) + + @property + def end(self: CompoundLocation): + """ + End location - right most (maximum) value, regardless of strand. + """ + return max(loc.end for loc in self.parts) + + @property + def nofuzzy_start(self: CompoundLocation) -> int: + """ + Start position (integer, approximated if fuzzy, read only) (OBSOLETE). + """ + return int(self.start) + + @property + def nofuzzy_end(self: CompoundLocation) -> int: + """ + End position (integer, approximated if fuzzy, read only) (OBSOLETE). + """ + return int(self.end) + + + @property + def ref(self: CompoundLocation): + """ + Not present in CompoundLocation, dummy method for API compatibility. + """ + return None + + @property + def ref_db(self: CompoundLocation): + """ + Not present in CompoundLocation, dummy method for API compatibility. + """ + return None + + def extract(self: CompoundLocation, parent_sequence: seq): + """ + Extract the sequence from supplied parent sequence using the CompoundLocation object. + """ + # This copes with mixed strand features & all on reverse: + parts = [loc.extract(parent_sequence) for loc in self.parts] + # We use addition rather than a join to avoid alphabet issues: + f_seq = parts[0] + for part in parts[1:]: + f_seq += part + return f_seq + + def extract(self: CompoundLocation, parent_sequence: str): + """ + Extract the sequence from supplied parent sequence using the CompoundLocation object. + """ + # This copes with mixed strand features & all on reverse: + parts = [loc.extract(parent_sequence) for loc in self.parts] + # We use addition rather than a join to avoid alphabet issues: + f_seq = parts[0] + for part in parts[1:]: + f_seq += part + return f_seq + +extend FeatureLocation: + def __add__(self: FeatureLocation, other: FeatureLocation): + """ + Combine location with another FeatureLocation object, or shift it. + """ + return CompoundLocation([self, other], "join") + +# class AbstractPosition(object): +class AbstractPosition: + """ + Abstract base class representing a position. + """ + + def __repr__(self: AbstractPosition): + """ + Represent the AbstractPosition object as a string for debugging. + """ + return f"{self.__class__.__name__}(...)" + +# class UnknownPosition(AbstractPosition): +class UnknownPosition: + """ + Specify a specific position which is unknown (has no position). + """ + + def __repr__(self: UnknownPosition): + """ + Represent the UnknownPosition object as a string for debugging. + """ + return f"{self.__class__.__name__}()" + + # TODO/CAVEATS: + # @jordan - hash values + # def __hash__(self: UnknownPosition): + # """ + # Return the hash value of the UnknownPosition object. + # """ + # return hash(None) + + @property + def position(self: UnknownPosition): + """ + Legacy attribute to get location (None) (OBSOLETE). + """ + return None + + @property + def extension(self: UnknownPosition): # noqa: D402 + """ + Legacy attribute to get extension (zero) as integer (OBSOLETE). + """ + return 0 + + def _shift(self: UnknownPosition, offset: int): + """ + Return a copy of the position object with its location shifted (PRIVATE). + """ + return self + + def _flip(self: UnknownPosition, length: int): + """ + Return a copy of the location after the parent is reversed (PRIVATE). + """ + return self + +# class BetweenPosition(int, AbstractPosition): +class BetweenPosition: + """ + Specify the position of a boundary between two coordinates (OBSOLETE?). + """ + pos: int + left: int + right: int + + def __init__(cls: BetweenPosition, position: int, left: int, right: int): + """ + Create a new instance in BetweenPosition object. + """ + assert position == left or position == right + cls.pos = position + cls.left = left + cls.right = right + + + def __getnewargs__(self: BetweenPosition): + """ + Return the arguments accepted by __init__. + Necessary to allow pickling and unpickling of class instances. + """ + return (int(self), self._left, self._right) + + def __repr__(self: BetweenPosition): + """ + Represent the BetweenPosition object as a string for debugging. + """ + return f"{self.__class__.__name__}({int(self)}, left={self._left}, right={self._right})" \ + + def __str__(self: BetweenPosition): + """ + Return a representation of the BetweenPosition object (with python counting). + """ + return f"({self.left}^{self.right})" + + @property + def position(self: BetweenPosition): + """ + Legacy attribute to get (left) position as integer (OBSOLETE). + """ + return self.left + + @property + def extension(self: BetweenPosition): # noqa: D402 + """ + Legacy attribute to get extension (from left to right) as an integer (OBSOLETE). + """ + return self._right - self._left + + def _shift(self: BetweenPosition, offset: int): + """ + Return a copy of the position object with its location shifted (PRIVATE). + """ + return self.__class__(int(self) + offset, + self._left + offset, + self._right + offset) + + def _flip(self: BetweenPosition, length: int): + """ + Return a copy of the location after the parent is reversed (PRIVATE). + """ + return self.__class__(length - int(self), + length - self._right, + length - self._left) +extend int: + def __init__(self: int, b: BetweenPosition): + return b.pos + +# TODO/CAVEATS +# @jordan- OneofPosition has not been tested yet. +# #class OneOfPosition(int, AbstractPosition) +# class OneOfPosition: +# """ +# Specify a position where the location can be multiple positions. +# """ +# position_choices: int +# choices: list[Position] + +# def __init__(cls: OneOfPosition, position: int, choices: list[Position]): +# """ +# Initialize with a set of possible positions. +# position_list is a list of AbstractPosition derived objects, +# specifying possible locations. +# position is an integer specifying the default behaviour. +# """ +# cls.position_choices = position +# cls.choices = choices + +# def __getnewargs__(self: OneOfPosition) -> tuple[OneOfPosition, list[Position]]: +# """ +# Return the arguments accepted by __init__. +# Necessary to allow pickling and unpickling of class instances. +# """ +# return (self, self.position_choices) + +# @property +# def position(self: OneOfPosition): +# """ +# Legacy attribute to get (left) position as integer (OBSOLETE). +# """ +# return min(int(pos) for pos in self.position_choices) + +# @property +# def extension(self: OneOfPosition): +# """ +# Legacy attribute to get extension as integer (OBSOLETE). +# """ +# positions = [int(pos) for pos in self.position_choices] +# return max(positions) - min(positions) + +# def __repr__(self: OneOfPosition): +# """ +# Represent the OneOfPosition object as a string for debugging. +# """ +# return f"{self.__class__.__name__}({int(self)}, choices={self.position_choices})" + + +# def __str__(self: OneOfPosition): +# """ +# Return a representation of the OneOfPosition object. +# """ +# out = "one-of(" +# for position in self.position_choices: +# out += f"{position}," +# # replace the last comma with the closing parenthesis +# return out[:-1] + ")" + +# def _shift(self: OneOfPosition, offset: int): +# """ +# Return a copy of the position object with its location shifted (PRIVATE). +# """ +# return self.__class__(int(self) + offset, +# [p._shift(offset) for p in self.position_choices]) + +# def _flip(self: OneOfPosition, length: int): +# """ +# Return a copy of the location after the parent is reversed (PRIVATE). +# """ +# return self.__class__(length - int(self), +# [p._flip(length) for p in self.position_choices[::-1]]) + +#class PositionGap(object): +class PositionGap: + """ + Simple class to hold information about a gap between positions. + """ + gap_size: int + + def __init__(self: PositionGap, gap_size: int): + """ + Intialize with a position object containing the gap information. + """ + self.gap_size = gap_size + + def __repr__(self: PositionGap): + """ + Represent the position gap as a string for debugging. + """ + return f"{self.__class__.__name__}({self.repr(self.gap_size)})" + + def __str__(self: PositionGap): + """ + Return a representation of the PositionGap object. + """ + return f"gap({self.gap_size})" diff --git a/stdlib/biopy/SeqUtils/checksum.seq b/stdlib/biopy/SeqUtils/checksum.seq new file mode 100644 index 000000000..4493628a6 --- /dev/null +++ b/stdlib/biopy/SeqUtils/checksum.seq @@ -0,0 +1,98 @@ + +# TODO/ CAVEATS: +# @jordan - crc32() needs to be implemented +# def crc32(seq): +# """ +# crc32(seq) + +# Return the crc32 checksum for a sequence (string or Seq object). +# """ +# return _crc32(seq) + +def _init_table_h() -> list[int]: + _table_h = list[int]() + for i in range(256): + part_l = i + part_h = 0 + for j in range(8): + rflag = part_l & 1 + part_l >>= 1 + if part_h & 1: + part_l |= 1 << 31 + part_h >>= 1 + if rflag: + part_h ^= 0xD8000000 + _table_h.append(part_h) + return _table_h + +#Initialisation +_table_h = _init_table_h() + +def crc64(s) -> str: + """ + crc64(s) + + Return the crc64 checksum for a sequence (string or Seq object). + + TODO/CAVEATS: + @jordan - return formatting crch and crcl as hex + """ + crcl = 0 + crch = 0 + for c in s: + shr = (crch & 0xFF) << 24 + temp1h = crch >> 8 + temp1l = (crcl >> 8) | shr + idx = (crcl ^ ord(c)) & 0xFF + crch = temp1h ^ _table_h[idx] + crcl = temp1l + + # need it in 08X return f"CRC-{crch}{crcl}" + return f"CRC-{crch}{crcl}" + #"CRC-%08X%08X" % (crch, crcl) + +def gcg(s) -> int: + """ + gcg(s) + + Given a nucleotide or amino-acid secuence (or any string), + returns the GCG checksum (int). Checksum used by GCG program. + seq type = str. + """ + + index = checksum = 0 + + for char in s: + index += 1 + checksum += index * ord(char.upper()) + if index == 57: + index = 0 + return checksum % 10000 + +# TODO/CAVEATS: +# @jordan - This still needs to be implemented +#hashlib and base64?? +# def seguid(s) -> str: +# """ +# seguid(seq: seq) + +# Return the SEGUID (string) for a sequence (string or Seq object). + +# Given a nucleotide or amino-acid secuence (or any string), +# returns the SEGUID string (A SEquence Globally Unique IDentifier). +# seq type = str. +# """ +# import hashlib +# import base64 + +# m = hashlib.sha1() +# m.update(_as_bytes(s.upper())) +# try: +# # For Python 3+ +# tmp = base64.encodebytes(m.digest()) +# return tmp.decode().replace("\n", "").rstrip("=") +# except AttributeError: +# pass +# # For all other Pythons +# return base64.b64encode(m.digest()).rstrip("=") + diff --git a/stdlib/biopy/SeqUtils/codonusage.seq b/stdlib/biopy/SeqUtils/codonusage.seq new file mode 100644 index 000000000..cb76843aa --- /dev/null +++ b/stdlib/biopy/SeqUtils/codonusage.seq @@ -0,0 +1,194 @@ + +"""Methods for codon usage calculations.""" + +import math +from biopy.sequtils.codonusageindices import SharpEcoliIndex +import bio +#DONT HAVE YET from Bio import SeqIO + +# Turn black code style off +# fmt: off + +CodonsDict = { + "TTT": 0, "TTC": 0, "TTA": 0, "TTG": 0, + "CTT": 0, "CTC": 0, "CTA": 0, "CTG": 0, + "ATT": 0, "ATC": 0, "ATA": 0, "ATG": 0, + "GTT": 0, "GTC": 0, "GTA": 0, "GTG": 0, + "TAT": 0, "TAC": 0, "TAA": 0, "TAG": 0, + "CAT": 0, "CAC": 0, "CAA": 0, "CAG": 0, + "AAT": 0, "AAC": 0, "AAA": 0, "AAG": 0, + "GAT": 0, "GAC": 0, "GAA": 0, "GAG": 0, + "TCT": 0, "TCC": 0, "TCA": 0, "TCG": 0, + "CCT": 0, "CCC": 0, "CCA": 0, "CCG": 0, + "ACT": 0, "ACC": 0, "ACA": 0, "ACG": 0, + "GCT": 0, "GCC": 0, "GCA": 0, "GCG": 0, + "TGT": 0, "TGC": 0, "TGA": 0, "TGG": 0, + "CGT": 0, "CGC": 0, "CGA": 0, "CGG": 0, + "AGT": 0, "AGC": 0, "AGA": 0, "AGG": 0, + "GGT": 0, "GGC": 0, "GGA": 0, "GGG": 0} + +# Turn black code style on +# fmt: on + +# this dictionary shows which codons encode the same AA +SynonymousCodons = { + "CYS": ["TGT", "TGC"], + "ASP": ["GAT", "GAC"], + "SER": ["TCT", "TCG", "TCA", "TCC", "AGC", "AGT"], + "GLN": ["CAA", "CAG"], + "MET": ["ATG"], + "ASN": ["AAC", "AAT"], + "PRO": ["CCT", "CCG", "CCA", "CCC"], + "LYS": ["AAG", "AAA"], + "STOP": ["TAG", "TGA", "TAA"], + "THR": ["ACC", "ACA", "ACG", "ACT"], + "PHE": ["TTT", "TTC"], + "ALA": ["GCA", "GCC", "GCG", "GCT"], + "GLY": ["GGT", "GGG", "GGA", "GGC"], + "ILE": ["ATC", "ATA", "ATT"], + "LEU": ["TTA", "TTG", "CTC", "CTT", "CTG", "CTA"], + "HIS": ["CAT", "CAC"], + "ARG": ["CGA", "CGC", "CGG", "CGT", "AGG", "AGA"], + "TRP": ["TGG"], + "VAL": ["GTA", "GTC", "GTG", "GTT"], + "GLU": ["GAG", "GAA"], + "TYR": ["TAT", "TAC"], +} + +class CodonAdaptationIndex: + """ + A codon adaptation index (CAI) implementation. + + Implements the codon adaptation index (CAI) described by Sharp and + Li (Nucleic Acids Res. 1987 Feb 11;15(3):1281-95). + + NOTE - This implementation does not currently cope with alternative genetic + codes: only the synonymous codons in the standard table are considered. + """ + index: dict[str, float] + codon_count: dict[str, int] + + def __init__(self: CodonAdaptationIndex): + self.index = dict[str, float]() + self.codon_count = dict[str, int]() + + # use this method with predefined CAI index + def set_cai_index(self, index): + """ + set_cai_index(self, index) + + Set up an index to be used when calculating CAI for a gene. + + Just pass a dictionary similar to the SharpEcoliIndex in the + CodonUsageIndices module. + """ + self.index = index + + def generate_index(self, fasta_file): + """ + generate_index(self, fasta_file) + + Generate a codon usage index from a FASTA file of CDS sequences. + + Takes a location of a Fasta file containing CDS sequences + (which must all have a whole number of codons) and generates a codon + usage index. + + RCSU values + """ + # first make sure we're not overwriting an existing index: + if self.index != dict[str, float]() or self.codon_count != dict[str, int](): + raise ValueError( + "an index has already been set or a codon count " + "has been done. Cannot overwrite either." + ) + + # count codon occurrences in the file. + self._count_codons(fasta_file) + + # now to calculate the index we first need to sum the number of times + # synonymous codons were used all together. + for aa in SynonymousCodons: + total = 0.0 + # RCSU values are CodonCount/((1/num of synonymous codons) * sum of + # all synonymous codons) + rcsu = list[float]() + codons = SynonymousCodons[aa] + + for codon in codons: + total += self.codon_count[codon] + + # calculate the RSCU value for each of the codons + for codon in codons: + denominator = float(total) / len(codons) + rcsu.append(self.codon_count[codon] / denominator) + + # now generate the index W=RCSUi/RCSUmax: + rcsu_max = max(rcsu) + for codon_index, codon in enumerate(codons): + self.index[codon] = rcsu[codon_index] / rcsu_max + + def cai_for_gene(self, dna_sequence: str) -> float: + """ + cai_for_gene(self, dna_sequence) + + Calculate the CAI (float) for the provided DNA sequence (string). + + This method uses the Index (either the one you set or the one you + generated) and returns the CAI for the DNA sequence. + """ + cai_value, cai_length = 0.0, 0 + + # if no index is set or generated, the default SharpEcoliIndex will + # be used. + if self.index == dict[str, float](): + self.set_cai_index(SharpEcoliIndex) + + if dna_sequence.islower(): + dna_sequence = dna_sequence.upper() + + for i in range(0, len(dna_sequence), 3): + codon = dna_sequence[i : i + 3] + if codon in self.index: + # these two codons are always one, exclude them: + if codon not in ["ATG", "TGG"]: + cai_value += math.log(self.index[codon]) + cai_length += 1 + # some indices may not include stop codons: + elif codon not in ["TGA", "TAA", "TAG"]: + raise TypeError( + f"illegal codon in sequence: {codon}.\n{self.index}" + ) + + return math.exp(cai_value / (cai_length - 1.0)) + + def _count_codons(self, fasta_file): + + # make the codon dictionary local + self.codon_count = CodonsDict.copy() + + dna_sequence = '' + + # iterate over sequence and count all the codons in the FastaFile. + for cur_record in bio.FASTA(fasta_file): + # make sure the sequence is lower case + if str(cur_record.seq).islower(): + dna_sequence = str(cur_record.seq).upper() + else: + dna_sequence = str(cur_record.seq) + for i in range(0, len(dna_sequence), 3): + codon = dna_sequence[i : i + 3] + if codon in self.codon_count: + self.codon_count[codon] += 1 + else: + raise TypeError( + f"illegal codon {codon} in gene: {cur_record}" + ) + + def print_index(self): + """ + Print out the index used. + This just gives the index when the objects is printed. + """ + for i in sorted(self.index): + print(f"{i}\t{self.index[i]}") diff --git a/stdlib/biopy/SeqUtils/codonusageindices.seq b/stdlib/biopy/SeqUtils/codonusageindices.seq new file mode 100644 index 000000000..5f066dd34 --- /dev/null +++ b/stdlib/biopy/SeqUtils/codonusageindices.seq @@ -0,0 +1,25 @@ +""" +Codon adaption indxes, including Sharp and Li (1987) E. coli index. + +Currently this module only defines a single codon adaption index from +Sharp & Li, Nucleic Acids Res. 1987. +""" +# Turn black code style off +# fmt: off + +SharpEcoliIndex = { + "GCA": 0.586, "GCC": 0.122, "GCG": 0.424, "GCT": 1.0, "AGA": 0.004, + "AGG": 0.002, "CGA": 0.004, "CGC": 0.356, "CGG": 0.004, "CGT": 1.0, "AAC": 1.0, + "AAT": 0.051, "GAC": 1.0, "GAT": 0.434, "TGC": 1.0, "TGT": 0.5, "CAA": 0.124, + "CAG": 1.0, "GAA": 1.0, "GAG": 0.259, "GGA": 0.01, "GGC": 0.724, "GGG": 0.019, + "GGT": 1.0, "CAC": 1.0, "CAT": 0.291, "ATA": 0.003, "ATC": 1.0, "ATT": 0.185, + "CTA": 0.007, "CTC": 0.037, "CTG": 1.0, "CTT": 0.042, "TTA": 0.02, + "TTG": 0.02, "AAA": 1.0, "AAG": 0.253, "ATG": 1.0, "TTC": 1.0, "TTT": 0.296, + "CCA": 0.135, "CCC": 0.012, "CCG": 1.0, "CCT": 0.07, "AGC": 0.41, + "AGT": 0.085, "TCA": 0.077, "TCC": 0.744, "TCG": 0.017, "TCT": 1.0, + "ACA": 0.076, "ACC": 1.0, "ACG": 0.099, "ACT": 0.965, "TGG": 1.0, "TAC": 1.0, + "TAT": 0.239, "GTA": 0.495, "GTC": 0.066, "GTG": 0.221, "GTT": 1.0 +} + +# Turn black code style on +# fmt: on \ No newline at end of file diff --git a/stdlib/biopy/SeqUtils/lcc.seq b/stdlib/biopy/SeqUtils/lcc.seq new file mode 100644 index 000000000..e044ba9ab --- /dev/null +++ b/stdlib/biopy/SeqUtils/lcc.seq @@ -0,0 +1,145 @@ + +"""Local Composition Complexity.""" + +import math + +def lcc_mult(s, wsize: int): + """ + lcc_mult(s, wsize) + + Calculate Local Composition Complexity (LCC) values over sliding window. + + Returns a list of floats, the LCC values for a sliding window over + the sequence. + """ + l2 = math.log(2.0) + tamseq = len(s) + + upper = s.upper() + + compone = [0.0] + lccsal = [0.0] + for i in range(wsize): + compone.append( + ((i + 1) / float(wsize)) * ((math.log((i + 1) / float(wsize))) / l2) + ) + window = s[0:wsize] + cant_a = window.count("A") + cant_c = window.count("C") + cant_t = window.count("T") + cant_g = window.count("G") + term_a = compone[cant_a] + term_c = compone[cant_c] + term_t = compone[cant_t] + term_g = compone[cant_g] + lccsal.append(-(term_a + term_c + term_t + term_g)) + tail = s[0] + for x in range(tamseq - wsize): + window = upper[x + 1 : wsize + x + 1] + if tail == window[-1]: + lccsal.append(lccsal[-1]) + elif tail == "A": + cant_a -= 1 + if window.endswith("C"): + cant_c += 1 + term_a = compone[cant_a] + term_c = compone[cant_c] + lccsal.append(-(term_a + term_c + term_t + term_g)) + elif window.endswith("T"): + cant_t += 1 + term_a = compone[cant_a] + term_t = compone[cant_t] + lccsal.append(-(term_a + term_c + term_t + term_g)) + elif window.endswith("G"): + cant_g += 1 + term_a = compone[cant_a] + term_g = compone[cant_g] + lccsal.append(-(term_a + term_c + term_t + term_g)) + elif tail == "C": + cant_c -= 1 + if window.endswith("A"): + cant_a += 1 + term_a = compone[cant_a] + term_c = compone[cant_c] + lccsal.append(-(term_a + term_c + term_t + term_g)) + elif window.endswith("T"): + cant_t += 1 + term_c = compone[cant_c] + term_t = compone[cant_t] + lccsal.append(-(term_a + term_c + term_t + term_g)) + elif window.endswith("G"): + cant_g += 1 + term_c = compone[cant_c] + term_g = compone[cant_g] + lccsal.append(-(term_a + term_c + term_t + term_g)) + elif tail == "T": + cant_t -= 1 + if window.endswith("A"): + cant_a += 1 + term_a = compone[cant_a] + term_t = compone[cant_t] + lccsal.append(-(term_a + term_c + term_t + term_g)) + elif window.endswith("C"): + cant_c += 1 + term_c = compone[cant_c] + term_t = compone[cant_t] + lccsal.append(-(term_a + term_c + term_t + term_g)) + elif window.endswith("G"): + cant_g += 1 + term_t = compone[cant_t] + term_g = compone[cant_g] + lccsal.append(-(term_a + term_c + term_t + term_g)) + elif tail == "G": + cant_g -= 1 + if window.endswith("A"): + cant_a += 1 + term_a = compone[cant_a] + term_g = compone[cant_g] + lccsal.append(-(term_a + term_c + term_t + term_g)) + elif window.endswith("C"): + cant_c += 1 + term_c = compone[cant_c] + term_g = compone[cant_g] + lccsal.append(-(term_a + term_c + term_t + term_g)) + elif window.endswith("T"): + cant_t += 1 + term_t = compone[cant_t] + term_g = compone[cant_g] + lccsal.append(-(term_a + term_c + term_t + term_g)) + tail = window[0] + return lccsal + + +def lcc_simp(s): + """ + lcc_simp(s) + + Calculate Local Composition Complexity (LCC) for a sequence. + """ + wsize = len(s) + term_a = 0.0 + term_c = 0.0 + term_t = 0.0 + term_g = 0.0 + + + upper = s.upper() + + l2 = math.log(2.0) + if "A" in s: + term_a = ((upper.count("A")) / float(wsize)) * ( + (math.log((upper.count("A")) / float(wsize))) / l2 + ) + if "C" in s: + term_c = ((upper.count("C")) / float(wsize)) * ( + (math.log((upper.count("C")) / float(wsize))) / l2 + ) + if "T" in s: + term_t = ((upper.count("T")) / float(wsize)) * ( + (math.log((upper.count("T")) / float(wsize))) / l2 + ) + if "G" in s: + term_g = ((upper.count("G")) / float(wsize)) * ( + (math.log((upper.count("G")) / float(wsize))) / l2 + ) + return -(term_a + term_c + term_t + term_g) \ No newline at end of file diff --git a/stdlib/biopy/SeqUtils/proparamdata.seq b/stdlib/biopy/SeqUtils/proparamdata.seq new file mode 100644 index 000000000..401f1d8c4 --- /dev/null +++ b/stdlib/biopy/SeqUtils/proparamdata.seq @@ -0,0 +1,154 @@ + +"""Indices to be used with ProtParam.""" + +# Turn black code style off +# fmt: off + +# Kyte & Doolittle index of hydrophobicity +# J. Mol. Biol. 157:105-132(1982). +kd = {"A": 1.8, "R": -4.5, "N": -3.5, "D": -3.5, "C": 2.5, + "Q": -3.5, "E": -3.5, "G": -0.4, "H": -3.2, "I": 4.5, + "L": 3.8, "K": -3.9, "M": 1.9, "F": 2.8, "P": -1.6, + "S": -0.8, "T": -0.7, "W": -0.9, "Y": -1.3, "V": 4.2} + +# Flexibility +# Normalized flexibility parameters (B-values), average +# Vihinen M., Torkkila E., Riikonen P. Proteins. 19(2):141-9(1994). +Flex = {"A": 0.984, "C": 0.906, "E": 1.094, "D": 1.068, + "G": 1.031, "F": 0.915, "I": 0.927, "H": 0.950, + "K": 1.102, "M": 0.952, "L": 0.935, "N": 1.048, + "Q": 1.037, "P": 1.049, "S": 1.046, "R": 1.008, + "T": 0.997, "W": 0.904, "V": 0.931, "Y": 0.929} + +# Hydrophilicity +# 1 Hopp & Wood +# Proc. Natl. Acad. Sci. U.S.A. 78:3824-3828(1981). +hw = {"A": -0.5, "R": 3.0, "N": 0.2, "D": 3.0, "C": -1.0, + "Q": 0.2, "E": 3.0, "G": 0.0, "H": -0.5, "I": -1.8, + "L": -1.8, "K": 3.0, "M": -1.3, "F": -2.5, "P": 0.0, + "S": 0.3, "T": -0.4, "W": -3.4, "Y": -2.3, "V": -1.5} + +# Surface accessibility +# Vergoten G & Theophanides T, Biomolecular Structure and Dynamics, +# pg.138 (1997). +# 1 Emini Surface fractional probability +em = {"A": 0.815, "R": 1.475, "N": 1.296, "D": 1.283, "C": 0.394, + "Q": 1.348, "E": 1.445, "G": 0.714, "H": 1.180, "I": 0.603, + "L": 0.603, "K": 1.545, "M": 0.714, "F": 0.695, "P": 1.236, + "S": 1.115, "T": 1.184, "W": 0.808, "Y": 1.089, "V": 0.606} + +# 2 Janin Interior to surface transfer energy scale +ja = {"A": 0.28, "R": -1.14, "N": -0.55, "D": -0.52, "C": 0.97, + "Q": -0.69, "E": -1.01, "G": 0.43, "H": -0.31, "I": 0.60, + "L": 0.60, "K": -1.62, "M": 0.43, "F": 0.46, "P": -0.42, + "S": -0.19, "T": -0.32, "W": 0.29, "Y": -0.15, "V": 0.60} + + +# A two dimensional dictionary for calculating the instability index. +# Guruprasad K., Reddy B.V.B., Pandit M.W. Protein Engineering 4:155-161(1990). +# It is based on dipeptide values; therefore, the value for the dipeptide DG +# is DIWV['D']['G']. +DIWV = {"A": {"A": 1.0, "C": 44.94, "E": 1.0, "D": -7.49, + "G": 1.0, "F": 1.0, "I": 1.0, "H": -7.49, + "K": 1.0, "M": 1.0, "L": 1.0, "N": 1.0, + "Q": 1.0, "P": 20.26, "S": 1.0, "R": 1.0, + "T": 1.0, "W": 1.0, "V": 1.0, "Y": 1.0}, + "C": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 20.26, + "G": 1.0, "F": 1.0, "I": 1.0, "H": 33.60, + "K": 1.0, "M": 33.60, "L": 20.26, "N": 1.0, + "Q": -6.54, "P": 20.26, "S": 1.0, "R": 1.0, + "T": 33.60, "W": 24.68, "V": -6.54, "Y": 1.0}, + "E": {"A": 1.0, "C": 44.94, "E": 33.60, "D": 20.26, + "G": 1.0, "F": 1.0, "I": 20.26, "H": -6.54, + "K": 1.0, "M": 1.0, "L": 1.0, "N": 1.0, + "Q": 20.26, "P": 20.26, "S": 20.26, "R": 1.0, + "T": 1.0, "W": -14.03, "V": 1.0, "Y": 1.0}, + "D": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 1.0, + "G": 1.0, "F": -6.54, "I": 1.0, "H": 1.0, + "K": -7.49, "M": 1.0, "L": 1.0, "N": 1.0, + "Q": 1.0, "P": 1.0, "S": 20.26, "R": -6.54, + "T": -14.03, "W": 1.0, "V": 1.0, "Y": 1.0}, + "G": {"A": -7.49, "C": 1.0, "E": -6.54, "D": 1.0, + "G": 13.34, "F": 1.0, "I": -7.49, "H": 1.0, + "K": -7.49, "M": 1.0, "L": 1.0, "N": -7.49, + "Q": 1.0, "P": 1.0, "S": 1.0, "R": 1.0, + "T": -7.49, "W": 13.34, "V": 1.0, "Y": -7.49}, + "F": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 13.34, + "G": 1.0, "F": 1.0, "I": 1.0, "H": 1.0, + "K": -14.03, "M": 1.0, "L": 1.0, "N": 1.0, + "Q": 1.0, "P": 20.26, "S": 1.0, "R": 1.0, + "T": 1.0, "W": 1.0, "V": 1.0, "Y": 33.601}, + "I": {"A": 1.0, "C": 1.0, "E": 44.94, "D": 1.0, + "G": 1.0, "F": 1.0, "I": 1.0, "H": 13.34, + "K": -7.49, "M": 1.0, "L": 20.26, "N": 1.0, + "Q": 1.0, "P": -1.88, "S": 1.0, "R": 1.0, + "T": 1.0, "W": 1.0, "V": -7.49, "Y": 1.0}, + "H": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 1.0, + "G": -9.37, "F": -9.37, "I": 44.94, "H": 1.0, + "K": 24.68, "M": 1.0, "L": 1.0, "N": 24.68, + "Q": 1.0, "P": -1.88, "S": 1.0, "R": 1.0, + "T": -6.54, "W": -1.88, "V": 1.0, "Y": 44.94}, + "K": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 1.0, + "G": -7.49, "F": 1.0, "I": -7.49, "H": 1.0, + "K": 1.0, "M": 33.60, "L": -7.49, "N": 1.0, + "Q": 24.64, "P": -6.54, "S": 1.0, "R": 33.60, + "T": 1.0, "W": 1.0, "V": -7.49, "Y": 1.0}, + "M": {"A": 13.34, "C": 1.0, "E": 1.0, "D": 1.0, + "G": 1.0, "F": 1.0, "I": 1.0, "H": 58.28, + "K": 1.0, "M": -1.88, "L": 1.0, "N": 1.0, + "Q": -6.54, "P": 44.94, "S": 44.94, "R": -6.54, + "T": -1.88, "W": 1.0, "V": 1.0, "Y": 24.68}, + "L": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 1.0, + "G": 1.0, "F": 1.0, "I": 1.0, "H": 1.0, + "K": -7.49, "M": 1.0, "L": 1.0, "N": 1.0, + "Q": 33.60, "P": 20.26, "S": 1.0, "R": 20.26, + "T": 1.0, "W": 24.68, "V": 1.0, "Y": 1.0}, + "N": {"A": 1.0, "C": -1.88, "E": 1.0, "D": 1.0, + "G": -14.03, "F": -14.03, "I": 44.94, "H": 1.0, + "K": 24.68, "M": 1.0, "L": 1.0, "N": 1.0, + "Q": -6.54, "P": -1.88, "S": 1.0, "R": 1.0, + "T": -7.49, "W": -9.37, "V": 1.0, "Y": 1.0}, + "Q": {"A": 1.0, "C": -6.54, "E": 20.26, "D": 20.26, + "G": 1.0, "F": -6.54, "I": 1.0, "H": 1.0, + "K": 1.0, "M": 1.0, "L": 1.0, "N": 1.0, + "Q": 20.26, "P": 20.26, "S": 44.94, "R": 1.0, + "T": 1.0, "W": 1.0, "V": -6.54, "Y": -6.54}, + "P": {"A": 20.26, "C": -6.54, "E": 18.38, "D": -6.54, + "G": 1.0, "F": 20.26, "I": 1.0, "H": 1.0, + "K": 1.0, "M": -6.54, "L": 1.0, "N": 1.0, + "Q": 20.26, "P": 20.26, "S": 20.26, "R": -6.54, + "T": 1.0, "W": -1.88, "V": 20.26, "Y": 1.0}, + "S": {"A": 1.0, "C": 33.60, "E": 20.26, "D": 1.0, + "G": 1.0, "F": 1.0, "I": 1.0, "H": 1.0, + "K": 1.0, "M": 1.0, "L": 1.0, "N": 1.0, + "Q": 20.26, "P": 44.94, "S": 20.26, "R": 20.26, + "T": 1.0, "W": 1.0, "V": 1.0, "Y": 1.0}, + "R": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 1.0, + "G": -7.49, "F": 1.0, "I": 1.0, "H": 20.26, + "K": 1.0, "M": 1.0, "L": 1.0, "N": 13.34, + "Q": 20.26, "P": 20.26, "S": 44.94, "R": 58.28, + "T": 1.0, "W": 58.28, "V": 1.0, "Y": -6.54}, + "T": {"A": 1.0, "C": 1.0, "E": 20.26, "D": 1.0, + "G": -7.49, "F": 13.34, "I": 1.0, "H": 1.0, + "K": 1.0, "M": 1.0, "L": 1.0, "N": -14.03, + "Q": -6.54, "P": 1.0, "S": 1.0, "R": 1.0, + "T": 1.0, "W": -14.03, "V": 1.0, "Y": 1.0}, + "W": {"A": -14.03, "C": 1.0, "E": 1.0, "D": 1.0, + "G": -9.37, "F": 1.0, "I": 1.0, "H": 24.68, + "K": 1.0, "M": 24.68, "L": 13.34, "N": 13.34, + "Q": 1.0, "P": 1.0, "S": 1.0, "R": 1.0, + "T": -14.03, "W": 1.0, "V": -7.49, "Y": 1.0}, + "V": {"A": 1.0, "C": 1.0, "E": 1.0, "D": -14.03, + "G": -7.49, "F": 1.0, "I": 1.0, "H": 1.0, + "K": -1.88, "M": 1.0, "L": 1.0, "N": 1.0, + "Q": 1.0, "P": 20.26, "S": 1.0, "R": 1.0, + "T": -7.49, "W": 1.0, "V": 1.0, "Y": -6.54}, + "Y": {"A": 24.68, "C": 1.0, "E": -6.54, "D": 24.68, + "G": -7.49, "F": 1.0, "I": 1.0, "H": 13.34, + "K": 1.0, "M": 44.94, "L": 1.0, "N": 1.0, + "Q": 1.0, "P": 13.34, "S": 1.0, "R": -15.91, + "T": -7.49, "W": -9.37, "V": 1.0, "Y": 13.34}, + } + +# Turn black code style on +# fmt: on \ No newline at end of file diff --git a/stdlib/biopy/seqrecord.seq b/stdlib/biopy/seqrecord.seq new file mode 100644 index 000000000..5fc23a78f --- /dev/null +++ b/stdlib/biopy/seqrecord.seq @@ -0,0 +1,536 @@ + +"""Represent a Sequence Record, a sequence with annotation.""" + +from bio.seqfeature import SeqFeature + +_NO_SEQRECORD_COMPARISON = "SeqRecord comparison is deliberately not implemented. Explicitly compare the attributes of interest." + + +class _RestrictedDict: + """ + Dict which only allows sequences of given length as values (PRIVATE). + """ + d: dict[str, str] + _length: int + + def __init__(self: _RestrictedDict, length: int): + """Create an EMPTY restricted dictionary.""" + self.d.__init__(self) + self._length = length + + def __setitem__(self, key, value): + self.d.__setitem__(self, key, value) + + def update(self, new_dict): + # Force this to go via our strict __setitem__ method + for key, value in new_dict.items(): + self[key] = value + + +class SeqRecord: + """ + A SeqRecord object holds a sequence and information about it. + """ + _seq: seq + id: str + name: str + description: str + dbxrefs: list[str] + annotations: dict[str, str] + letter_annotations: dict[int, str] + features: list[SeqFeature] + + def __init__(self: SeqRecord, s: seq, id: str="", name: str="", + description: str="", dbxrefs: optional[list[str]]=None, + features: list[SeqFeature]=None, annotations: optional[dict[str, str]]=None, + letter_annotations: dict[int, str]=None): + """ + Create a SeqRecord. + """ + self._seq = s + self.id = id + self.name = name + self.description = description + + # database cross references (for the whole sequence) + if dbxrefs is None: + dbxrefs = list[str]() + self.dbxrefs = dbxrefs + + # annotations about the whole sequence + if annotations is None: + annotations = dict[str, str]() + self.annotations = annotations + + if letter_annotations is None: + # annotations about each letter in the sequence + if s is None: + self._per_letter_annotations = _RestrictedDict(length=0) + else: + self._per_letter_annotations = \ + _RestrictedDict(length=len(s)) + + else: + # This will be handled via the property set function, which will + # turn this into a _RestrictedDict and thus ensure all the values + # in the dict are the right length + self.letter_annotations = letter_annotations + + # annotations about parts of the sequence + if features is None: + features = list[SeqFeature] + self.features = features + + def _set_per_letter_annotations(self, value): + # Turn this into a restricted-dictionary (and check the entries) + try: + self._per_letter_annotations = _RestrictedDict(length=len(self.s)) + except AttributeError: + # e.g. s is None + self._per_letter_annotations = _RestrictedDict(length=0) + self._per_letter_annotations.update(value) + + # letter_annotations = property( + # fget=lambda self: self._per_letter_annotations, + # fset=_set_per_letter_annotations, + # doc="""Dictionary of per-letter-annotation for the sequence. + # For example, this can hold quality scores used in FASTQ or QUAL files. + # Consider this example using Bio.SeqIO to read in an example Solexa + # variant FASTQ file as a SeqRecord: + # >>> from Bio import SeqIO + # >>> record = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa") + # >>> print("%s %s" % (record.id, record.seq)) + # slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN + # >>> print(list(record.letter_annotations)) + # ['solexa_quality'] + # >>> print(record.letter_annotations["solexa_quality"]) + # [40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5] + # The letter_annotations get sliced automatically if you slice the + # parent SeqRecord, for example taking the last ten bases: + # >>> sub_record = record[-10:] + # >>> print("%s %s" % (sub_record.id, sub_record.seq)) + # slxa_0001_1_0001_01 ACGTNNNNNN + # >>> print(sub_record.letter_annotations["solexa_quality"]) + # [4, 3, 2, 1, 0, -1, -2, -3, -4, -5] + # Any python sequence (i.e. list, tuple or string) can be recorded in + # the SeqRecord's letter_annotations dictionary as long as the length + # matches that of the SeqRecord's sequence. e.g. + # >>> len(sub_record.letter_annotations) + # 1 + # >>> sub_record.letter_annotations["dummy"] = "abcdefghij" + # >>> len(sub_record.letter_annotations) + # 2 + # You can delete entries from the letter_annotations dictionary as usual: + # >>> del sub_record.letter_annotations["solexa_quality"] + # >>> sub_record.letter_annotations + # {'dummy': 'abcdefghij'} + # You can completely clear the dictionary easily as follows: + # >>> sub_record.letter_annotations = {} + # >>> sub_record.letter_annotations + # {} + # Note that if replacing the record's sequence with a sequence of a + # different length you must first clear the letter_annotations dict. + # """) + + def _set_seq(self, value): + if self._per_letter_annotations: + if len(self) != len(value): + raise ValueError("You must empty the letter annotations first!") + else: + # Leave the existing per letter annotations unchanged: + self._seq = value + else: + self._seq = value + # Reset the (empty) letter annotations dict with new length: + try: + self._per_letter_annotations = _RestrictedDict(length=len(self.seq)) + except AttributeError: + # e.g. seq is None + self._per_letter_annotations = _RestrictedDict(length=0) + + # seq = property(fget=lambda self: self._seq, + # fset=_set_seq, + # doc="The sequence itself, as a Seq or MutableSeq object.") + + def __getitem__(self: SeqRecord, index: int): + """ + Return a an individual letter. + """ + return self.seq[index] + + + def __getitem__(self: SeqRecord, index: slice): + """ + Return a sub-sequence. + Slicing, e.g. my_record[5:10], returns a new SeqRecord for + that sub-sequence with some annotation preserved. + """ + if self.seq is None: + raise ValueError("If the sequence is None, we cannot slice it.") + parent_length = len(self) + + answer = SeqRecord(self.seq[index], + id=self.id, + name=self.name, + description=self.description) + + start, stop, step = index.indices(parent_length) + if step == 1: + # Select relevant features, add them with shifted locations + # assert str(self.seq)[index] == str(self.seq)[start:stop] + for f in self.features: + if start <= f.location.nofuzzy_start \ + and f.location.nofuzzy_end <= stop: + answer.features.append(f._shift(-start)) + + # Slice all the values to match the sliced sequence + # (this should also work with strides, even negative strides): + for key, value in self.letter_annotations.items(): + answer._per_letter_annotations[key] = value[index] + + return answer + + def __iter__(self: SeqRecord): + """ + Iterate over the letters in the sequence. + """ + lines = list[str] + if self.id: + lines.append(f"ID: {self.id}") + if self.name: + lines.append(f"Name: {self.name}") + if self.description: + lines.append(f"Description: {self.description}") + if self.dbxrefs: + lines.append("Database cross-references: " + ", ".join(self.dbxrefs)) + lines.append(f"Number of features: {self.features}") + for a in self.annotations: + lines.append(f"/{a}={self.annotations[a]}") + if self.letter_annotations: + lines.append("Per letter annotation for: " + ", ".join(self.letter_annotations)) + # Don't want to include the entire sequence, + # and showing the alphabet is useful: + lines.append(self.repr(self.seq)) + return "\n".join(lines) + + def __repr__(self: SeqRecord): + """ + Return a concise summary of the record for debugging (string). + """ + # TODO!!! how to do this? + return "{self.__class__.__name__}(seq={self.seq}, id={self.id}, name={self.name}, description={self.description}, dbxrefs={self.dbxrefs})" + + # TODO/CAVEATS: + # @jordan - `format` and `__format__ still need to be implemented` + # def format(self: SeqRecord, format): + # """ + # Return the record as a string in the specified file format. + # """ + # # See also the __format__ added for Python 2.6 / 3.0, PEP 3101 + # # See also the Bio.Align.Generic.Alignment class and its format() + # return self.__format__(format) + + # def __format__(self: SeqRecord, format_spec: str): + # """ + # Return the record as a string in the specified file format. + # This method supports the python format() function added in + # Python 2.6/3.0. The format_spec should be a lower case string + # supported by Bio.SeqIO as an output file format. See also the + # SeqRecord's format() method. + # Under Python 3 binary formats raise a ValueError, while on + # Python 2 this will work with a deprecation warning. + # """ + # if not islower(format_spec): + # # Follow python convention and default to using __str__ + # return str(self) + # # NOT IMPLEMENTED YET + # from Bio import SeqIO + + # # Easy case, can call string-building function directly + # if format_spec in SeqIO._FormatToString: + # return SeqIO._FormatToString[format_spec](self) + + # if format_spec in SeqIO._BinaryFormats: + # import sys + # if sys.version_info[0] < 3: + # import warnings + # from Bio import BiopythonDeprecationWarning + # warnings.warn( + # "Binary format %s cannot be used with SeqRecord format method on Python 3" + # % format_spec, + # BiopythonDeprecationWarning + # ) + # # Continue - Python 2 StringIO will work... + # else: + # raise ValueError( + # "Binary format %s cannot be used with SeqRecord format method" + # % format_spec + # ) + + # # Harder case, make a temp handle instead + # from Bio._py3k import StringIO + # handle = StringIO() + # SeqIO.write(self, handle, format_spec) + # return handle.getvalue() + + def __len__(self): + """ + Return the length of the sequence. + """ + return len(self.seq) + + def __lt__(self, other): + """ + Define the less-than operand (not implemented). + """ + raise NotImplementedError(_NO_SEQRECORD_COMPARISON) + + def __le___(self, other): + """ + Define the less-than-or-equal-to operand (not implemented). + """ + raise NotImplementedError(_NO_SEQRECORD_COMPARISON) + + def __eq__(self, other): + """ + Define the equal-to operand (not implemented). + """ + raise NotImplementedError(_NO_SEQRECORD_COMPARISON) + + def __ne__(self, other): + """ + Define the not-equal-to operand (not implemented). + """ + raise NotImplementedError(_NO_SEQRECORD_COMPARISON) + + def __gt__(self, other): + """ + Define the greater-than operand (not implemented). + """ + raise NotImplementedError(_NO_SEQRECORD_COMPARISON) + + def __ge__(self, other): + """ + Define the greater-than-or-equal-to operand (not implemented). + """ + raise NotImplementedError(_NO_SEQRECORD_COMPARISON) + + # Note Python 3 does not use __cmp__ and there is no need to + # define __cmp__ on Python 2 as have all of _lt__ etc defined. + + # Python 3: + def __bool__(self): + """ + Boolean value of an instance of this class (True). + """ + return True + + def __add__(self: SeqRecord, other: str): + """ + Add another sequence or string to this sequence. + """ + # Note can't transfer any per-letter-annotations + return SeqRecord(self.seq + other, + id=self.id, name=self.name, + description=self.description, + features=self.features[:], + annotations=self.annotations.copy(), + dbxrefs=self.dbxrefs[:]) + + def __add__(self: SeqRecord, other: seq): + """ + Add another sequence or string to this sequence. + """ + # Note can't transfer any per-letter-annotations + return SeqRecord(self.seq + other, + id=self.id, name=self.name, + description=self.description, + features=self.features[:], + annotations=self.annotations.copy(), + dbxrefs=self.dbxrefs[:]) + + def __add__(self: SeqRecord, other: SeqRecord): + """ + Add another sequence or string to this sequence. + """ + + # Adding two SeqRecord objects... must merge annotation. + answer = SeqRecord(self.seq + other.seq, + features=self.features[:], + dbxrefs=self.dbxrefs[:]) + # Will take all the features and all the db cross refs, + length = len(self) + for f in other.features: + answer.features.append(f._shift(length)) + del length + for ref in other.dbxrefs: + if ref not in answer.dbxrefs: + answer.dbxrefs.append(ref) + # Take common id/name/description/annotation + if self.id == other.id: + answer.id = self.id + if self.name == other.name: + answer.name = self.name + if self.description == other.description: + answer.description = self.description + for k, v in self.annotations.items(): + if k in other.annotations and other.annotations[k] == v: + answer.annotations[k] = v + # Can append matching per-letter-annotation + for k, v in self.letter_annotations.items(): + if k in other.letter_annotations: + answer.letter_annotations[k] = v + other.letter_annotations[k] + return answer + + def __radd__(self: SeqRecord, other: str): + """ + Add another string to this sequence (from the left). + """ + # Note can't transfer any per-letter-annotations + offset = len(other) + return SeqRecord(other + self.seq, + id=self.id, name=self.name, + description=self.description, + features=[f._shift(offset) for f in self.features], + annotations=self.annotations.copy(), + dbxrefs=self.dbxrefs[:]) + + def __radd__(self: SeqRecord, other: seq): + """ + Add another sequence to this sequence (from the left). + """ + # Note can't transfer any per-letter-annotations + offset = len(other) + return SeqRecord(other + self.seq, + id=self.id, name=self.name, + description=self.description, + features=[f._shift(offset) for f in self.features], + annotations=self.annotations.copy(), + dbxrefs=self.dbxrefs[:]) + + def upper(self): + """ + Return a copy of the record with an upper case sequence. + """ + return SeqRecord(self.seq.upper(), + id=self.id, name=self.name, + description=self.description, + dbxrefs=self.dbxrefs[:], + features=self.features[:], + annotations=self.annotations.copy(), + letter_annotations=self.letter_annotations.copy()) + + def lower(self): + """ + Return a copy of the record with a lower case sequence. + """ + return SeqRecord(self.seq.lower(), + id=self.id, name=self.name, + description=self.description, + dbxrefs=self.dbxrefs[:], + features=self.features[:], + annotations=self.annotations.copy(), + letter_annotations=self.letter_annotations.copy()) + + def reverse_complement(self: SeqRecord, id: str=False, name: str=False, description: str=False, + features: list[SeqFeature]=False, annotations: dict[str, str]=False, + letter_annotations: dict[int, str]=False, dbxrefs: list[str]=False): + """ + Return new SeqRecord with reverse complement sequence. + TODO/CAVEATS: + @jordan- seq does not support lamda (check line 471) + answer.features.sort(key=lambda x: x.location.start.position) + """ + answer = SeqRecord(self.seq.reverse_complement()) + if id: + answer.id = id + else: + answer.id = self.id + + if name: + answer.name = name + else: + answer.name = self.name + + if description: + answer.description = description + else: + answer.description = self.description + + if dbxrefs: + answer.dbxrefs = dbxrefs + else: + answer.dbxrefs = self.dbxrefs[:] + + if features: + answer.features = features + else: + # Copy the old features, adjusting location and string + length = len(answer) + answer.features = [f._flip(length) for f in self.features] + + # answer.features.sort(key=lambda x: x.location.start.position) + answer.features.sort(key= self.location.start.position) + if annotations: + answer.annotations = annotations + else: + answer.annotations = self.annotations.copy() + + if letter_annotations: + answer.letter_annotations = letter_annotations + else: + # Copy the old per letter annotations, reversing them + for key, value in self.letter_annotations.items(): + answer._per_letter_annotations[key] = value[::-1] + + return answer + + def translate(self: SeqRecord, + # Seq translation arguments: + table="Standard", stop_symbol="*", to_stop=False, + cds=False, gap=None, + # SeqRecord annotation arguments: + id: str=False, name: str=False, description: str=False, + features: list[SeqFeature]=False, annotations: dict[str, str]=False, + letter_annotations: dict[int, str]=False, dbxrefs: list[str]=False): + """ + Return new SeqRecord with translated sequence. + """ + answer = SeqRecord(self.seq.translate(table=table, + stop_symbol=stop_symbol, + to_stop=to_stop, + cds=cds, + gap=gap)) + if id: + answer.id = id + else: + answer.id = self.id + + if name: + answer.name = name + else: + answer.name = self.name + + if description: + answer.description = description + else: + answer.description = self.description + + if dbxrefs: + answer.dbxrefs = dbxrefs + else: + answer.dbxrefs = self.dbxrefs[:] + + if features: + answer.features = features + + if annotations: + answer.annotations = annotations + else: + # Copy the old annotations + answer.annotations = self.annotations.copy() + + if letter_annotations: + answer.letter_annotations = letter_annotations + + return answer \ No newline at end of file diff --git a/stdlib/core/err.seq b/stdlib/core/err.seq index d23d98273..a3f4bbe30 100644 --- a/stdlib/core/err.seq +++ b/stdlib/core/err.seq @@ -149,6 +149,32 @@ class StopIteration: def message(self: StopIteration): return self._hdr.msg +class AttributeError: + _hdr: ExcHeader + + def __init__(self: AttributeError): + self._hdr = ('AttributeError', '', '', '', 0, 0) + + def __init__(self: AttributeError, message: str): + self._hdr = ('AttributeError', message, '', '', 0, 0) + + @property + def message(self: AttributeError): + return self._hdr.msg + +class RuntimeError: + _hdr: ExcHeader + + def __init__(self: RuntimeError): + self._hdr = ('RuntimeError', '', '', '', 0, 0) + + def __init__(self: RuntimeError, message: str): + self._hdr = ('RuntimeError', message, '', '', 0, 0) + + @property + def message(self: RuntimeError): + return self._hdr.msg + def check_errno(prefix: str): msg = _C.seq_check_errno() if msg: diff --git a/test/stdlib/biopy_test/seqfeature_test.seq b/test/stdlib/biopy_test/seqfeature_test.seq new file mode 100644 index 000000000..440eeb8b2 --- /dev/null +++ b/test/stdlib/biopy_test/seqfeature_test.seq @@ -0,0 +1,122 @@ +from Bio.SeqFeature import FeatureLocation, Position +from Bio.SeqFeature import CompoundLocation, UnknownPosition, SeqFeature +from Bio.SeqFeature import WithinPosition, BetweenPosition + + +####### Test FeatureLocation ####### +@test +def test_eq_identical(): + """ + Test two identical locations are equal. + """ + loc1 = FeatureLocation(23, 42, 1, '', '') + loc2 = FeatureLocation(23, 42, 1, '', '') + assert loc1 == loc2 + + loc1 = FeatureLocation(23, 42, -1, '', '') + loc2 = FeatureLocation(23, 42, -1, '', '') + assert loc1 == loc2 + + loc1 = FeatureLocation(23, 42, -1, '', '') + loc2 = FeatureLocation(23, 42, -1, '', '') + assert loc1 == loc2 + + loc1 = FeatureLocation(Position(23, 1, 0), Position(42, 2, 0), 1, '', '') + loc2 = FeatureLocation(23, 42, 1, '', '') + assert loc1 == loc2 + + loc1 = FeatureLocation(23, 42, 1, "foo", "bar") + loc2 = FeatureLocation(23, 42, 1, "foo", "bar") + assert loc1 == loc2 +test_eq_identical() + +@test +def test_eq_not_identical(): + """ + Test two different locations are not equal. + """ + loc1 = FeatureLocation(22, 42, 1, '', '') + loc2 = FeatureLocation(23, 42, 1, '', '') + assert loc1 != loc2 + + loc1 = FeatureLocation(23, 42, 1, '', '') + loc2 = FeatureLocation(23, 43, 1, '', '') + assert loc1 != loc2 + + loc1 = FeatureLocation(23, 42, 1, '', '') + loc2 = FeatureLocation(23, 42, -1, '', '') + assert loc1 != loc2 + + loc1 = FeatureLocation(23, 42, 1, 'foo', '') + loc2 = FeatureLocation(23, 42, 1, 'bar', '') + assert loc1 != loc2 + + loc1 = FeatureLocation(23, 42, 1, 'foo', 'bar') + loc2 = FeatureLocation(23, 42, 1, 'foo', 'baz') + assert loc1 != loc2 +test_eq_not_identical() + +@test +# not finished need to have raise error with start > end +def test_start_before_end(): + expected = "must be greater than or equal to start location" + print FeatureLocation(42, 23, 1, '', '') + +test_start_before_end() + +####### Test CompoundLocation ####### + +@test +def test_eq_identical1(): + loc1 = FeatureLocation(12, 17, 1, '', '') + FeatureLocation(23, 42, 1, '', '') + loc2 = FeatureLocation(12, 17, 1, '', '') + FeatureLocation(23, 42, 1, '', '') + assert loc1 == loc2 + + loc1 = FeatureLocation(12, 17, 1, '', '') + FeatureLocation(23, 42, 1, '', '') + loc2 = CompoundLocation([FeatureLocation(12, 17, 1, '', ''), FeatureLocation(23, 42, 1, '', '')], 'join') + assert loc1 == loc2 +test_eq_identical1() + +@test +def test_eq_not_identical1(): + loc1 = FeatureLocation(12, 17, 1, '', '') + FeatureLocation(23, 42, 1, '', '') + loc2 = FeatureLocation(12, 17, 1, '', '') + FeatureLocation(23, 42, 1, '', '') + FeatureLocation(50, 60, 1, '', '') + assert loc1 != loc2 + + loc1 = FeatureLocation(12, 17, 1, '', '') + FeatureLocation(23, 42, 1, '', '') + loc2 = FeatureLocation(12, 17, -1, '', '') + FeatureLocation(23, 42, -1, '', '') + assert loc1 != loc2 + + loc1 = CompoundLocation([FeatureLocation(12, 17, 1, '', ''), FeatureLocation(23, 42, 1, '', '')], "join") + loc2 = CompoundLocation([FeatureLocation(12, 17, 1, '', ''), FeatureLocation(23, 42, 1, '', '')], 'order') + assert loc1 != loc2 +test_eq_not_identical1() + +####### Test Locations ####### + +@test +# Not finished +def test_fuzzy(): + exact_pos = Position(5, 0, 0) + within_pos_s = WithinPosition(10, left=10, right=13) + within_pos_e = WithinPosition(13, left=10, right=13) + between_pos_e = BetweenPosition(24, 20, 24) + before_pos = Position(15, 1, 0) + after_pos = Position(40, 2, 0) + + assert int(within_pos_s) == 10 + assert str(within_pos_s) == "(10.13)" + assert int(within_pos_e) == 13 + assert str(within_pos_e) == "(10.13)" + assert int(between_pos_e) == 24 + assert str(between_pos_e) == "(20^24)" + assert str(before_pos) == "<15" + assert str(after_pos) == ">40" + + # TODO/CAVEATS: + # FeatureLocation does not support these types for now. + # location1 = FeatureLocation(exact_pos, within_pos_e, 1, '', '') + # location2 = FeatureLocation(before_pos, between_pos_e) + # location3 = FeatureLocation(within_pos_s, after_pos) +test_fuzzy() + diff --git a/test/stdlib/biopy_test/sequtils_test/checksum_test.seq b/test/stdlib/biopy_test/sequtils_test/checksum_test.seq new file mode 100644 index 000000000..5283b5c9c --- /dev/null +++ b/test/stdlib/biopy_test/sequtils_test/checksum_test.seq @@ -0,0 +1,23 @@ +from biopy.sequtils.checksum import crc64, gcg + +str_light_chain_one = ("QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQ" + "HPGKAPKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGL" + "QAEDEADYYCSSYAGSSTLVFGGGTKLTVL") + +str_light_chain_two = ("QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQ" + "HPGKAPKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGL" + "QAEDEADYYCCSYAGSSTWVFGGGTKLTVL") + +@test +def test_checksum(): + num1 = crc64(str_light_chain_one) + num2 = crc64(str_light_chain_two) + assert num1 == num2 +test_checksum() + +@test +def test_gcg(): + num1 = gcg(str_light_chain_one) + num2 = gcg(str_light_chain_two) + assert num1 != num2 +test_gcg() diff --git a/test/stdlib/biopy_test/sequtils_test/codonusage_test.seq b/test/stdlib/biopy_test/sequtils_test/codonusage_test.seq new file mode 100644 index 000000000..bc1aa7934 --- /dev/null +++ b/test/stdlib/biopy_test/sequtils_test/codonusage_test.seq @@ -0,0 +1,21 @@ +from biopy.sequtils.codonusage import CodonAdaptationIndex +import math + +X = CodonAdaptationIndex() +path = "/Users/jordanwatson1/seq/test/stdlib/biopy_test/sequtils_test/scratch.fasta" + +X.generate_index(path) + +@test +def cai_for_gene(): + PRECISION = 1e-6 + + # Test Codon Adaptation Index (CAI) using default E. coli data. + CAI = CodonAdaptationIndex() + value = CAI.cai_for_gene("ATGCGTATCGATCGCGATACGATTAGGCGGATG") + assert math.fabs(value - 0.09978) < PRECISION +cai_for_gene() + +def print_index(): + X.print_index() +# print_index() \ No newline at end of file diff --git a/test/stdlib/biopy_test/sequtils_test/lcc_test.seq b/test/stdlib/biopy_test/sequtils_test/lcc_test.seq new file mode 100644 index 000000000..aa5e9c86e --- /dev/null +++ b/test/stdlib/biopy_test/sequtils_test/lcc_test.seq @@ -0,0 +1,25 @@ +from biopy.sequtils.lcc import lcc_simp, lcc_mult +import math + +str_light_chain_one = ("QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQ" + "HPGKAPKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGL" + "QAEDEADYYCSSYAGSSTLVFGGGTKLTVL") + +exp_simple_LCC = 1.03 +exp_window_LCC = (0.00, 1.00, 0.96, 0.96, 0.96, 0.65, 0.43, 0.35, 0.35, 0.35, 0.35, 0.53, 0.59, 0.26) + +@test +def lcc_simp_test(): + PRECISION = 1e2 + assert math.fabs(lcc_simp(str_light_chain_one) - exp_simple_LCC) < PRECISION +lcc_simp_test() + +@test +def lcc_mult_test(): + PRECISION = 1e2 + values = lcc_mult(str_light_chain_one, 20) + assert len(values) == len(exp_window_LCC) + + for val1, val2 in zip(values, exp_window_LCC): + assert math.fabs(val1 - val2) < PRECISION +lcc_mult_test() \ No newline at end of file