Source code for pyfurnace.design.core.sequence

import warnings
import random
from typing import Literal, Union, List, Optional, Set
from .symbols import (
    iupac_code,
    only_nucl,
    nucl_to_none,
    nucleotides,
    nucl_to_pair,
    dot_bracket_to_pair_map,
    AmbiguosStructure,
)
from .callback import Callback


[docs] class Sequence(Callback): """ Represents a nucleotide sequence with 5' to 3' or 3' to 5' directionality. This class provides functionality for manipulating RNA sequences with support for slicing, translation, complementarity, GC content, and integration with structure-aware tools. Parameters ---------- sequence : str, optional The nucleotide sequence. directionality : {'53', '35'}, optional Directionality of the sequence (default is '53'). **kwargs : dict Additional arguments passed to the Callback base class. Attributes ---------- directionality : str Direction of the sequence ('53' or '35'). """ def __init__( self, sequence: str = "", directionality: Literal["53", "35"] = "53", **kwargs ) -> None: """ Initializes a Sequence object. Parameters ---------- sequence : str, optional The nucleotide sequence. directionality : str, optional The directionality of the sequence ('53' or '35') (default is '53'). **kwargs : Any Arbitrary keyword arguments to pass to the Callback class. Raises ------ ValueError If the directionality is not '53' or '35'. """ super().__init__(**kwargs) if directionality not in ("53", "35"): raise ValueError( f"Sequence directionality not allowed. " f"It must be either '53' or '35', " f"got {directionality} instead." ) self._directionality = directionality sequence = self._check_line(sequence) self._sequence = str(sequence).translate(only_nucl) def __str__(self) -> str: """Return the string representation of the sequence.""" return self._sequence def __repr__(self) -> str: """Return a string representation of the sequence object.""" return f"{self._directionality[0]} {self._sequence} {self._directionality[1]}" def __getitem__(self, idx: Union[int, slice]) -> "Sequence": """Return a subsequence of the sequence.""" dir_slice = 1 if isinstance(idx, slice): if idx.step is not None and idx.step < 0: dir_slice = -1 return Sequence( self._sequence[idx], directionality=self.directionality[::dir_slice], callbacks=self._callbacks, ) def __setitem__(self, idx: Union[int, slice], val: Union[str, "Sequence"]) -> None: """Set a subsequence of the sequence.""" val = self._check_line(val) if isinstance(idx, slice): seq_line = list(self._sequence) seq_line[idx] = val.upper() self._sequence = "".join(seq_line) else: if idx < 0: idx = len(self) + idx self._sequence = str(self._sequence[:idx] + val + self._sequence[idx + 1 :]) self._trigger_callbacks(new_sequence=self._sequence) def __len__(self) -> int: """Return the length of the sequence.""" return len(self._sequence) def __add__(self, other: Union[str, "Sequence"]) -> "Sequence": """Concatenate two sequences or a sequence and a string""" self._check_addition(other) return Sequence(str(self) + str(other), self.directionality) def __iadd__(self, other: Union[str, "Sequence"]) -> "Sequence": """Concatenate a sequence or a string to the current sequence""" self._check_addition(other) self._sequence = self._sequence + str(other) self._trigger_callbacks(new_sequence=self._sequence) return self def __radd__(self, other: Union[str, "Sequence"]) -> "Sequence": """Concatenate two sequences or a sequence and a string""" if other == 0: return self return Sequence(str(other) + str(self), self.directionality) def __mul__(self, other: int) -> "Sequence": """Multiply the sequence by an integer.""" if isinstance(other, int): return Sequence(self._sequence * other, self.directionality) raise ValueError( f"Can only multiply sequence by an integer, " f"got {type(other)} instead" ) def __rmul__(self, other: int) -> "Sequence": """Multiply the sequence by an integer.""" return self.__mul__(other) def __bool__(self) -> bool: """Check if the sequence is not empty.""" return bool(self._sequence) def __contains__(self, other: Union[str, "Sequence"]) -> bool: """Check if a subsequence is included in the sequence""" if isinstance(other, (str, Sequence)): str_seq = str(other) if ( isinstance(other, Sequence) and other.directionality != self.directionality ): str_seq = str_seq[::-1] return str_seq in str(self) return False def __iter__(self): """Return an iterator over the sequence.""" return iter(self._sequence) def __eq__(self, other: Union[str, "Sequence"]) -> bool: """Check that the two sequence have same string and directionality""" if isinstance(other, str): return str(self) == other elif isinstance(other, Sequence): if self.directionality != other.directionality: return str(self) == str(other)[::-1] else: return str(self) == str(other) return False def __hash__(self): """Return a hash of the sequence.""" return hash(self.__repr__()) ### ### PROPERTIES ### @property def directionality(self): """directionality of the sequence (either '53' or '35')""" return self._directionality @directionality.setter def directionality(self, new_directionality: Literal["53", "35"]) -> None: """Set the directionality of the sequence.""" if new_directionality not in ("53", "35"): raise ValueError( f"Sequence directionality not allowed. " f"It must be either '53' or '35', got " f"{new_directionality} instead." ) self._directionality = new_directionality self._trigger_callbacks(new_sequence=self._sequence) ### ### PRETECTED METHODS ### def _check_addition(self, other: Union[str, "Sequence"]) -> bool: """Check that the sequence is valid and have same directionality""" if isinstance(other, str): other = self._check_line(other) elif not isinstance(other, Sequence): raise ValueError(f"{other} is not a valid type for addition") elif self.directionality != other.directionality: raise ValueError("Cannot add two sequences with different directionality") return True def _check_line(self, line: Union[str, "Sequence"]) -> bool: """Check that the sequence is valid and contains only allowed nucleotides.""" if not isinstance(line, (str, Sequence)): raise ValueError( f"The sequence must be a string or a sequence object. " f"Got {type(line)} instead." ) if isinstance(line, str): line = line.upper() if line.translate(nucl_to_none).replace("&", ""): warnings.warn( f"Warning: The string '{line}' contains nucleotides not" f" allowed in ROAD that will be removed. The allowed " f"nucleotides are: {nucleotides.union('&')}.", AmbiguosStructure, stacklevel=3, ) return line ### ### METHODS ###
[docs] def complement(self) -> "Sequence": """ Return the complement of the sequence. Returns ------- Sequence Complementary sequence. """ return Sequence(self._sequence.translate(nucl_to_pair), self.directionality)
[docs] def copy(self, **kwargs) -> "Sequence": """ Create a copy of the sequence. Returns ------- Sequence A new Sequence instance. """ return Sequence(str(self), self.directionality, **kwargs)
[docs] def distance(self, other: Union[str, "Sequence"]) -> int: """ Compute the number of mismatched bases between this sequence and another. Parameters ---------- other : str or Sequence Sequence to compare against. Returns ------- int Number of incompatible positions. Raises ------ ValueError If the input is invalid or lengths do not match. """ other = self._check_line(other) if len(self) != len(other): raise ValueError("Sequences must have the same length.") distance = 0 # Initialize the distance for ind, (nt1, nt2) in enumerate(zip(self, other)): # the symbols are not compatible if nt2 not in iupac_code[nt1] and nt1 not in iupac_code[nt2]: distance += 1 return distance
[docs] def find(self, sub: str) -> int: """ Find the first occurrence of a subsequence in the sequence. Parameters ---------- sub : str Subsequence to find. Returns ------- int Index of the first occurrence of the subsequence, or -1 if not found. """ return self._sequence.find(sub)
[docs] def find_repeated_subsequence(self, min_length: int = 8) -> Set[str]: """ Find all subsequences of minimum length that appear more than once. Parameters ---------- min_length : int, optional Minimum length of subsequence (default is 8). Returns ------- set of str Repeated subsequences. """ repeated_subsequences = set() length = len(self) for i in range(length): for j in range(i + min_length, length): subsequence = self[i:j] if subsequence in self[j:]: repeated_subsequences.add(str(subsequence)) return repeated_subsequences
[docs] def gc_content(self, extended_alphabet: bool = False) -> float: """ Calculate the GC content of the sequence. Parameters ---------- extended_alphabet : bool, optional Whether to include ambiguous base codes (e.g., S, M, R) in the calculation. Returns ------- float GC content percentage. """ total_count = len(self) if not total_count: return 0 seq = self._sequence gc_count = seq.count("G") + seq.count("C") if extended_alphabet: gc_count += ( seq.count("S") + sum(map(seq.count, ["M", "R", "Y", "K"])) / 2 + sum(map(seq.count, ["D", "H"])) / 3 + sum(map(seq.count, ["V", "B"])) * 2 / 3 + sum(map(seq.count, ["N", "X"])) / 4 ) percentage = (gc_count / total_count) * 100 return percentage
[docs] def get_random_sequence(self, structure: str = "") -> "Sequence": """ Generate a randomized sequence compatible with the IUPAC symbols and optionally a dot-bracket structure to be respected. Parameters ---------- structure : str, optional Dot-bracket structure. Returns ------- str A new randomized sequence. """ # make a first random sequence seq = [ random.choice(list(iupac_code[nucleotide])) for nucleotide in self._sequence ] # fully random sequence if not structure: return Sequence("".join(seq), self.directionality) # small sanity check of the structure elif structure and len(structure) != len(self): raise ValueError( f"The target dot-bracket must have the same length as " f"the sequence. Got {len(structure)}, " f"expected {len(self)}." ) # build the pair map pair_map = dot_bracket_to_pair_map(structure) # paired the nucleotied that are paired in the target structure for k, v in pair_map.items(): # unpaired nucleotide if v is None: continue # the paired nucleotide has the symbol for wobble pairings if self._sequence[v] == "K": if seq[k] == "G": seq[v] = "U" elif seq[k] == "U": seq[v] = "G" # normal pairing else: # the iupac code that pairs the nucleotides pair_sym = seq[k].translate(nucl_to_pair) # the iupac code of the nucleotide at position v sym_at_pos = iupac_code[self._sequence[v]] # take the nucleotides that are allowed by both possible_paired_nucleotides = sym_at_pos & iupac_code[pair_sym] if possible_paired_nucleotides: seq[v] = random.choice(list(possible_paired_nucleotides)) return Sequence("".join(seq), self.directionality)
[docs] def lower(self) -> str: """ Convert sequence to lowercase. Returns ------- str Lowercase sequence string. """ return self._sequence.lower()
[docs] def molecular_weight(self) -> float: """ Calculate the molecular weight of the sequence. Returns ------- float Total molecular weight in Daltons. """ molecular_weight_table = {"A": 347.2, "G": 363.2, "C": 323.2, "U": 324.2} total_weight = 0 for nucleotide in self._sequence: total_weight += molecular_weight_table.get(nucleotide, 0) return total_weight
[docs] def pop(self, idx: int) -> str: """ Remove and return a nucleotide at the given index. Parameters ---------- idx : int Index of nucleotide to remove. Returns ------- str The removed character. """ seq_line = list(self._sequence) popped_val = seq_line.pop(idx) self._sequence = "".join(seq_line) self._trigger_callbacks(new_sequence=self._sequence) return popped_val
[docs] def replace(self, old: str, new: str) -> "Sequence": """ Replace all occurrences of `old` with `new` in the sequence. Parameters ---------- old : str Character to replace. new : str Replacement character. Returns ------- Sequence Updated sequence. """ new = self._check_line(new) self._sequence = str(self._sequence.replace(old, new)) self._trigger_callbacks(new_sequence=self._sequence) return self
[docs] def reverse(self, inplace: bool = False) -> "Sequence": """ Reverse the directionality of the sequence. Parameters ---------- inplace : bool, optional If True, reverse in-place. Otherwise, return a new Sequence. Returns ------- Sequence Reversed sequence. """ if not inplace: return Sequence( self._sequence, directionality=self._directionality[::-1], callback=self.callbacks, ) self._directionality = self._directionality[::-1] self._trigger_callbacks(new_sequence=self._sequence) return self
[docs] def reverse_complement(self) -> "Sequence": """ Return the reverse complement of the sequence. Returns ------- Sequence Reverse-complemented sequence. """ return Sequence( self._sequence.translate(nucl_to_pair)[::-1], self.directionality )
[docs] def split(self, sep: Optional[str] = None) -> List["Sequence"]: """ Split the sequence by a separator. Parameters ---------- sep : str, optional Separator to use (default is None, meaning split on whitespace). Returns ------- list of Sequence List of subsequences. """ return [Sequence(s, self.directionality) for s in self._sequence.split(sep)]
[docs] def translate(self, dictionary: dict, inplace: bool = False) -> "Sequence": """ Translate the sequence using a mapping dictionary. Parameters ---------- dictionary : dict Dictionary to translate each character. inplace : bool, optional If True, modify the current sequence. Otherwise, return a new instance. Returns ------- Sequence Translated sequence. """ dictionary = str.maketrans(dictionary) if not inplace: return Sequence( self._sequence.translate(dictionary), directionality=self.directionality ) new_sequence = str(self._sequence.translate(dictionary)) new_sequence = self._check_line(new_sequence) self._sequence = new_sequence self._trigger_callbacks(new_sequence=self._sequence) return self
[docs] def upper(self) -> str: """ Convert sequence to uppercase. Returns ------- str Uppercase sequence string. """ return self._sequence.upper()