Source code for pyfurnace.design.core.symbols

"""
This module contains general utility functions and constants for the pyfurnace package.

It includes:
functions for manipulating RNA strands and sequences,
utilities for RNA structures (dot-bracket notation, pair maps, trees)
folding barriers calculation for canonical co-transcriptional RNA Origami
"""

import random
import warnings
from typing import Optional, Tuple, Dict, List, Union, Iterable
from .basepair import BasePair

###
### USEFUL SETS AND DICTIONARIES
###

#: Set of all accepted nucleotides, including standard bases and extended IUPAC codes.
nucleotides = {
    "A",
    "U",
    "C",
    "G",  # standard nucleotides
    "W",  # A or U
    "M",  # A or C
    "R",  # A or G
    "Y",  # C or U
    "K",  # G or U
    "S",  # G or C
    "D",  # A or G or U
    "H",  # A or C or U
    "V",  # A or C or G
    "B",  # C or G or U
    "N",  # any nucleotide
    "X",  # any nucleotide for external Kissing Loops in ROAD
    "&",  # separator
}

#: Dictionary mapping IUPAC codes to corresponding sets of nucleotides.
#: Source: https://www.bioinformatics.org/sms/iupac.html
iupac_code = {
    "A": {"A"},
    "U": {"U"},
    "C": {"C"},
    "G": {"G"},  # standard nucleotides
    "W": {"A", "U"},  # A or U
    "M": {"A", "C"},  # A or C
    "R": {"A", "G"},  # A or G
    "Y": {"C", "U"},  # C or U
    "K": {"G", "U"},  # G or U
    "S": {"G", "C"},  # G or C
    "D": {"A", "G", "U"},  # A or G or U
    "H": {"A", "C", "U"},  # A or C or U
    "V": {"A", "C", "G"},  # A or C or G
    "B": {"C", "G", "U"},  # C or G or U
    "N": {"A", "U", "C", "G"},  # any nucleotide
    "X": {"A", "U", "C", "G"},  # any nucleotide for external Kissing Loops in ROAD
    "&": {"&"},  # separator
}

#: Dictionary mapping IUPAC codes to the set of codes they can base pair with.
base_pairing = {
    "A": {"B", "D", "H", "K", "N", "U", "W", "Y"},
    "U": {"A", "B", "D", "G", "H", "K", "M", "N", "R", "S", "V", "W"},
    "C": {"B", "D", "G", "K", "N", "R", "S", "V"},
    "G": {"B", "C", "D", "H", "K", "M", "N", "S", "U", "V", "W", "Y"},
    "W": {"A", "B", "D", "G", "H", "K", "M", "N", "R", "S", "U", "V", "W", "Y"},
    "M": {"B", "D", "G", "H", "K", "N", "R", "S", "U", "V", "W", "Y"},
    "R": {"B", "C", "D", "H", "K", "M", "N", "S", "U", "V", "W", "Y"},
    "Y": {"A", "B", "D", "G", "H", "K", "M", "N", "R", "S", "V", "W"},
    "K": {"A", "B", "C", "D", "G", "H", "K", "M", "N", "R", "S", "U", "V", "W", "Y"},
    "S": {"B", "C", "D", "G", "H", "K", "M", "N", "R", "S", "U", "V", "W", "Y"},
    "D": {"A", "B", "C", "D", "G", "H", "K", "M", "N", "R", "S", "U", "V", "W", "Y"},
    "H": {"A", "B", "D", "G", "H", "K", "M", "N", "R", "S", "U", "V", "W", "Y"},
    "V": {"B", "C", "D", "G", "H", "K", "M", "N", "R", "S", "U", "V", "W", "Y"},
    "B": {"A", "B", "C", "D", "G", "H", "K", "M", "N", "R", "S", "U", "V", "W", "Y"},
    "N": {"A", "B", "C", "D", "G", "H", "K", "M", "N", "R", "S", "U", "V", "W", "Y"},
    # this symbol is used for external Kissing Loops in ROAD
    "X": {
        "A",
        "B",
        "C",
        "D",
        "G",
        "H",
        "K",
        "M",
        "N",
        "R",
        "S",
        "U",
        "V",
        "W",
        "Y",
        "X",
    },
    "&": {"&"},  # separator
}

#: Dictionary mapping dot-bracket symbols to their corresponding pairing symbols.
db_pairs = {
    "(": ")",
    "[": "]",
    "{": "}",
    "<": ">",
    "A": "a",
    "B": "b",
    "C": "c",
    "D": "d",
    "E": "e",
    "F": "f",
    "G": "g",
    "H": "h",
    "I": "i",
    "J": "j",
    "K": "k",
    "L": "l",
    "M": "m",
    "N": "n",
    "O": "o",
    "P": "p",
    "Q": "q",
    "R": "r",
    "S": "s",
    "T": "t",
    "U": "u",
    "V": "v",
    "W": "w",
    "X": "x",
    "Y": "y",
    "Z": "z",
}

#: Tuple containing all possible pseudoknot symbols used in dot-bracket notation.
all_pk_symbols = (
    "[",
    "]",
    "{",
    "}",
    "<",
    ">",
    "A",
    "B",
    "C",
    "D",
    "E",
    "F",
    "G",
    "H",
    "I",
    "J",
    "K",
    "L",
    "M",
    "N",
    "O",
    "P",
    "Q",
    "R",
    "S",
    "T",
    "U",
    "V",
    "W",
    "X",
    "Y",
    "Z",
    "a",
    "b",
    "c",
    "d",
    "e",
    "f",
    "g",
    "h",
    "i",
    "j",
    "k",
    "l",
    "m",
    "n",
    "o",
    "p",
    "q",
    "r",
    "s",
    "t",
    "u",
    "v",
    "w",
    "x",
    "y",
    "z",
)

#: Set of all accepted symbols including nucleotides, dots, brackets, and special ROAD
#: symbols.
accept_symbol = (
    nucleotides
    | {".", "(", ")"}
    | set(all_pk_symbols)
    | {
        "─",
        "│",
        "╭",
        "╮",
        "╰",
        "╯",
        "^",
        "*",
        "┼",
        "┊",
        "~",
        "◦",
        "↑",
        "↓",
        "⊗",
        "⊙",
        "•",
        "▂",
        "▄",
        "█",
        "-",
        "|",
        "+",
        ":",
        "=",
        "!",
        " ",
        "/",
        "\\",
        "3",
        "5",
        "&",
    }
)

#: Set of symbols representing base pairs.
bp_symbols = {"┊", "=", ":", "!", "*"}

#: Sequence representing the T7 promoter.
T7_PROMOTER = "TAATACGACTCACTATA"

###
### STRING TRANSLATORS
###

#: String translator mapping all pseudoknot symbols to dots.
pseudo_to_dot = str.maketrans("".join(all_pk_symbols), "." * len(all_pk_symbols))

#: String translator to map each dot-bracket opening symbol to its closing counterpart
#: and vice versa.
pair_db_sym = str.maketrans(
    "".join(db_pairs.keys()) + "".join(db_pairs.values()),
    "".join(db_pairs.values()) + "".join(db_pairs.keys()),
)

#: String translator that removes all nucleotides from a string.
nucl_to_none = str.maketrans("", "", "".join(nucleotides))

#: String translator that removes all accepted symbols from a string.
symb_to_none = str.maketrans("", "", "".join(accept_symbol))

#: String translator that maps nucleotides (including IUPAC codes) to their
#: complementary base.
nucl_to_pair = str.maketrans("AUCGWMRYKSDHVBNX", "UAGCWKYRKSHDBVNX")

#: String translator that removes all non-nucleotide symbols from a string.
only_nucl = str.maketrans("", "", "".join(accept_symbol - nucleotides))

#: String translator for horizontal flip of strand turning symbols.
horiz_flip = str.maketrans("╭╮╰╯/\\", "╮╭╯╰\\/")

#: String translator for vertical flip of strand turning symbols.
verti_flip = str.maketrans("╭╮╰╯/\\", "╰╯╭╮\\/")

#: String translator for 90 degree rotation of strand symbols.
rotate_90 = str.maketrans("╭╮╰╯│|─-", "╮╯╭╰──││")

#: String translator for converting normal symbols to ROAD strand symbols.
symb_to_road = str.maketrans("-|+=:*!", "─│┼┊┊┊┊")

###
### SEQUENCE FUNCTIONS
###


[docs] def complement(sequence: str) -> str: """ Return the complement of an RNA sequence using IUPAC base-pairing rules. Parameters ---------- sequence : str RNA sequence composed of standard and extended IUPAC nucleotide symbols. Returns ------- str Complementary sequence with each base replaced by its pair. """ return sequence.translate(nucl_to_pair)
[docs] def reverse_complement(sequence: str) -> str: """ Return the reverse complement of an RNA sequence. Parameters ---------- sequence : str RNA sequence to reverse and complement. Returns ------- str The reverse-complemented sequence. """ return sequence.translate(nucl_to_pair)[::-1]
[docs] def pair_nucleotide(nucleotide: str, iupac_symbol: str = "N") -> str: """ Return a nucleotide that can pair with the given nucleotide, optionally constrained by an IUPAC symbol. Parameters ---------- nucleotide : str The base to find a pair for. iupac_symbol : str, optional Constraint based on allowed base-pairing (IUPAC code). Returns ------- str A valid pairing nucleotide, or the complement of the input base. """ if iupac_symbol in "AUCG": return iupac_symbol if iupac_symbol == "K": if nucleotide == "G": return "U" elif nucleotide == "U": return "G" return nucleotide.translate(nucl_to_pair)
[docs] def mutate_nucleotide( sequence: str, sequence_constraints: str, nucl_ind: int, pair_map: Dict[int, Optional[int]], ) -> Tuple[Optional[str], Optional[str]]: """ Mutate a nucleotide in the sequence and update its paired counterpart accordingly. Parameters ---------- sequence : str Original RNA sequence. sequence_constraints : str A string of IUPAC codes representing base constraints for each position. nucl_ind : int Index of the nucleotide to mutate. pair_map : dict of int to int or None A mapping of indices representing base pairs. Returns ------- tuple of (str or None, str or None) A tuple with the new nucleotide and its paired base if applicable. """ new_set = iupac_code[sequence_constraints[nucl_ind]] - {sequence[nucl_ind]} if not new_set: return None, None new_nucl = random.choice(list(new_set)) # get the index of the paired nucleotide paired_nucl = pair_map[nucl_ind] # if the nucleotide is paired, get the paired nucleotide if paired_nucl is not None: # get the possible nucleotides that can pair with the new nucleotide paired_nucl = pair_nucleotide(new_nucl, sequence_constraints[paired_nucl]) return new_nucl, paired_nucl
[docs] def gc_content(seq: str, extended_alphabet: bool = True) -> float: """ Calculate GC content (optionally including ambiguous nucleotides). Parameters ---------- seq : str The RNA sequence. extended_alphabet : bool, optional Whether to include ambiguous IUPAC symbols (e.g., S, K) as partial GC. Returns ------- float Proportion of GC content in the sequence. """ total_count = len(seq) if not total_count: return 0 gc_count = seq.count("G") + seq.count("C") if extended_alphabet: gc_count += ( seq.count("S") + sum(map(seq.count, ["M", "R", "Y", "K", "N", "X"])) / 2 + sum(map(seq.count, ["D", "H"])) / 3 + sum(map(seq.count, ["V", "B"])) * 2 / 3 ) percentage = gc_count / total_count return percentage
### ### STRUCTURE FUNCTIONS ###
[docs] def rotate_dot_bracket(dot_bracket: str, shift_left: int) -> str: """ Rotate a dot-bracket structure leftward by a given number of positions. Parameters ---------- dot_bracket : str RNA secondary structure in dot-bracket notation. shift_left : int Number of positions to rotate to the left. Returns ------- str Rotated dot-bracket structure. """ n = len(dot_bracket) # Step 1: Get the pair map pair_map = dot_bracket_to_pair_map(dot_bracket) # Step 3: Adjust the pair map new_pair_map = BasePair() for i, j in pair_map.items(): new_i = (i - shift_left) % n new_j = (j - shift_left) % n if j is not None else None new_pair_map[new_i] = new_j return pair_map_to_dot_bracket(new_pair_map, n)
[docs] def dot_bracket_to_pair_map(dot_bracket: str) -> BasePair: """ Convert dot-bracket notation into a pair map. Parameters ---------- dot_bracket : str Secondary structure in dot-bracket notation. Returns ------- BasePair A bidirectional dictionary mapping nucleotide indices to their paired partner (or None). """ pair_map = BasePair() stacks = [[] for _ in range(len(db_pairs))] for i, sym in enumerate(dot_bracket): if sym in db_pairs.keys(): stack_num = list(db_pairs.keys()).index(sym) stacks[stack_num].append(i) elif sym in db_pairs.values(): stack_num = list(db_pairs.values()).index(sym) if stacks[stack_num]: # pop the last element if the stack is not empty j = stacks[stack_num].pop() pair_map[j] = i else: # unpaired nucleotide: No pair pair_map[i] = None # If there are still elements that are not in the dictionary, add them as unpaired for i in range(len(dot_bracket)): if i not in pair_map: pair_map[i] = None return pair_map
[docs] def pair_map_to_dot_bracket( pair_map: Dict[int, Optional[int]], structure_length: Optional[int] = None ) -> str: """ Convert a base pair map into dot-bracket notation. Parameters ---------- pair_map : dict Mapping from nucleotide index to paired index or None. structure_length : int, optional Total length of the structure. If None, computed from the map. Returns ------- str Dot-bracket notation representing the structure. """ if structure_length is None: keys_max = max(pair_map.keys(), default=-1) values_max = max((k for k in pair_map.values() if k is not None), default=-1) structure_length = max(keys_max, values_max) + 1 ### CREATE THE DOT BRACKET NOTATION ### done_pairs = set() # Prepare the variables dotbracket = ["."] * structure_length bracket_count = 0 recheck = True stack = [] while recheck: # recheck the structure, for pseudoknots recheck = False for i in range(structure_length): paired = pair_map[i] if paired is None or i in done_pairs or paired in done_pairs: continue # ignore unpaired positions or already paired positions if paired > i: # the current position will close the pair later # may be a pseudknot: Check for clash if stack and paired > stack[-1]: recheck = True # clash detected: recheck the structure later continue else: # no clash: add the pair to the stack stack.append(paired) dotbracket[i] = list(db_pairs.keys())[bracket_count] dotbracket[paired] = list(db_pairs.values())[bracket_count] else: # paired already analyzed if not stack: # no pair in the stack continue if i == stack[-1]: # the current position closes the last pair # Remove the pair from the map done_pairs.add(i) done_pairs.add(paired) stack.pop() stack = [] # reset the stack count bracket_count += 1 if bracket_count >= 30: warnings.warn( "Warning: Too many bracket types needed in to write the " "structure. Stopping at 30, 'Z' to 'z'.", stacklevel=3, ) break return "".join(dotbracket)
[docs] def dot_bracket_to_stacks( dot_bracket: str, only_opening: bool = False ) -> Tuple[str, List[Tuple[int, int]]]: """ Identify contiguous structural elements (stacks) in a dot-bracket string. Parameters ---------- dot_bracket : str RNA structure in dot-bracket format. only_opening : bool, optional Whether to return only stacks that start with an opening symbol. Returns ------- tuple of str and list of tuple - A reduced dot-bracket string marking distinct stacks. - A list of (start_index, end_index) tuples for each stack. """ pair_map = dot_bracket_to_pair_map(dot_bracket) # a list of tuples containing starting index and the length of the stack stacks = [] # a list of the reduced dot bracket symbols, that will be returned as a string reduced_dot_bracket = [] stack_start = 0 current_pair_sym = dot_bracket[0] last_stack_pair = pair_map[0] + 1 if pair_map[0] else None ### NOT NECESSARY # open_pair_sym = '.' + ''.join(db_pairs.keys()) for i, symbol in enumerate(dot_bracket + "_"): ### NOT NECESSARY # # ignore the symbols that are not '.' or not opening pairs, # # they should be already taken into accound # if symbol in open_pair_sym: # continue # get the paired index of the current symbol stack_pair = pair_map.get(i) # if symbol change, or the pairing is not consecutive, # add the last stack to the list if ( symbol != current_pair_sym or stack_pair is not None and stack_pair != last_stack_pair - 1 ): add_last_stack = True # add the last stack to the list by default # if only the opening stacks are needed, # check if the last stack is an opening stack or unpaired if only_opening: add_last_stack = ( current_pair_sym in db_pairs.keys() or current_pair_sym == "." ) if add_last_stack: reduced_dot_bracket.append(current_pair_sym) stacks.append((stack_start, i - 1)) stack_start = i last_stack_pair = stack_pair current_pair_sym = symbol return "".join(reduced_dot_bracket), stacks
class Node: """ A tree node representing a region of an RNA secondary structure. Attributes ---------- index : int or None Position of the nucleotide in the sequence. label : str Structure symbol (e.g., '(', ')', '.', '&'). paired_index : int or None Index of the nucleotide this node pairs with, if any. parent : Node or None Parent node in the tree. seq : str or None Sequence constraint at this node. children : list of Node Child nodes representing nested structures. """ def __init__( self, index: Optional[int] = None, label: str = "5'", parent: Optional["Node"] = None, seq: Optional[str] = None, **kwargs, ) -> None: """ Initialize a Node object. Parameters ---------- index : int or None Index of the nucleotide in the sequence. label : str Structure symbol (e.g., '(', ')', '.', '&'). parent : Node or None Parent node in the tree. seq : str or None Sequence constraint at this node. kwargs : dict Additional attributes to set on the node. """ # Initialize the node with the given parameters self.index = index self.label = label self.paired_index = None self.parent = parent self.seq = seq self.children: List[Node] = [] if parent is not None: parent.children.append(self) for key, value in kwargs.items(): setattr(self, key, value) def __repr__(self, level=0): """ Return a string representation of the node and its children. """ paired_str = "" if self.paired_index is not None: paired_str = f" -> {self.paired_index}" if level == 0: ret = f"{self.label}\n" else: ret = ( f"╰{'──' * level} {self.label} - {self.seq} " f"- {self.index} {paired_str} \n" ) for child in self.children: ret += child.__repr__(level + 1) return ret def __str__(self): """ Return a string representation of the node. """ return self.__repr__() def search( self, target_index: Optional[int], target_label: Optional[str] = None ) -> Optional["Node"]: """ Recursively search for a node with a matching index and label. Parameters ---------- target_index : int or None Index to match. target_label : str or None Optional label to also match. Returns ------- Node or None The first matching node found, or None if not found. """ # Check if the current node matches the target criteria if target_index == self.index and ( target_label is None or target_label == self.label ): return self elif ( self.index is None and self.parent is None and abs(target_index) == float("inf") ): return self # special case for the root node # Recursively search in child nodes for child in self.children: result = child.search(target_index=target_index, target_label=target_label) if result: return result # Return the node if found return None # Return None if not found
[docs] def dot_bracket_to_tree(dot_bracket: str, sequence: Optional[str] = None) -> Node: """ Convert a dot-bracket RNA structure into a hierarchical tree of nodes. Parameters ---------- dot_bracket : str RNA structure in dot-bracket format. Pseudoknots are ignored. sequence : str, optional Sequence constraints associated with each index. Returns ------- Node Root node of the tree representing the full structure. """ # Remove pseudoknots from the dot bracket dot_bracket = dot_bracket.translate(pseudo_to_dot) if not isinstance(sequence, str) and sequence is not None: sequence = str(sequence) root = Node() current_node = root for i, label in enumerate(dot_bracket): if sequence is not None: seq_const = sequence[i] else: seq_const = None if label == "(": # add a new node, move to the new node new_node = Node(i, label, current_node, seq=seq_const) current_node = new_node # add the paired index to the parent, move to the parent node elif label == ")": current_node.paired_index = i current_node = current_node.parent elif label == ".": # add a children to the current node, do not move new_node = Node(i, label, current_node, seq=seq_const) elif label == "&": new_node = Node(None, label, current_node) return root
[docs] def tree_to_dot_bracket( node: Node, dot_bracket: Optional[List[str]] = None, seq_constraints: Union[bool, List[str]] = False, ) -> Union[str, Tuple[str, str]]: """ Convert a tree of Nodes into a dot-bracket structure and optional sequence constraints. Parameters ---------- node : Node Root node of the RNA structure tree. dot_bracket : list of str, optional Accumulator for the dot-bracket string. seq_constraints : bool or list of str If True, collect sequence constraints using the `seq` attribute. Returns ------- str or tuple of (str, str) Dot-bracket string, and sequence constraints if requested. """ if isinstance(seq_constraints, bool) and seq_constraints: # initialize the sequence constraints only if it's a bool seq_constraints = ["N"] # assume the sequence is at least one nucleotide long if dot_bracket is None: dot_bracket = ["."] # assume the structure is at least one nucleotide long if node.parent is not None: # don't add the root node if node.index >= len(dot_bracket): add_length = node.index - len(dot_bracket) + 1 dot_bracket += ["."] * add_length if seq_constraints: seq_constraints += ["N"] * add_length dot_bracket[node.index] = node.label if seq_constraints and node.seq: seq_constraints[node.index] = node.seq if node.paired_index is not None: if node.paired_index >= len(dot_bracket): add_length = node.paired_index - len(dot_bracket) + 1 dot_bracket += "." * add_length if seq_constraints: seq_constraints += ["N"] * add_length dot_bracket[node.paired_index] = ")" if seq_constraints and node.seq: seq_constraints[node.paired_index] = node.seq.translate(nucl_to_pair) for child in node.children: # recursively add the children tree_to_dot_bracket(child, dot_bracket, seq_constraints) if node.parent is None: # we reached back the root node if seq_constraints: return "".join(dot_bracket), "".join(seq_constraints) return "".join(dot_bracket)
### ### DISTANCE FUNCTIONS ###
[docs] def hamming_distance(s1: str, s2: str, ignore_ind: Iterable[int] = (), **kwargs) -> int: """ Compute the Hamming distance between two strings, ignoring specified indices. Parameters ---------- s1 : str First sequence or structure. s2 : str Second sequence or structure. ignore_ind : iterable of int, optional Indices to exclude from comparison. Returns ------- int Number of differing characters at unignored positions. """ return sum( (1 for i, (x, y) in enumerate(zip(s1, s2)) if x != y and i not in ignore_ind) )
[docs] def base_pair_difference( s1: str, s2: str, pair_map1: Optional[Dict[int, Optional[int]]] = None, pair_map2: Optional[Dict[int, Optional[int]]] = None, ignore_ind: Iterable[int] = (), accept_unpaired_ind: Iterable[int] = (), **kwargs, ) -> List[int]: """ Find indices where base pairing differs between two structures. Parameters ---------- s1 : str First structure (dot-bracket notation). s2 : str Second structure. pair_map1 : dict, optional Precomputed pair map for s1. pair_map2 : dict, optional Precomputed pair map for s2. ignore_ind : iterable of int, optional Indices to ignore in the comparison. accept_unpaired_ind : iterable of int, optional Indices allowed to be unpaired in s2 without penalty. Returns ------- list of int Indices of base pairs that differ. """ if pair_map1 is None: pair_map1 = dot_bracket_to_pair_map(s1) if pair_map2 is None: pair_map2 = dot_bracket_to_pair_map(s2) # Determine if an index should be considered based on the unpaired indices def check_ind(main_pair_map, i, accept_unpaired=False): if i in ignore_ind or main_pair_map[i] is None: return False elif accept_unpaired and i in accept_unpaired_ind: # Only count if paired in both but different return pair_map2[i] is not None and pair_map1[i] != pair_map2[i] return pair_map1[i] != pair_map2[i] # pairs in s1 that are not in s2 diff_set = [i for i in pair_map1 if check_ind(pair_map1, i, True)] # pairs in s2 that are not in s1 diff_set.extend(i for i in pair_map2 if check_ind(pair_map2, i)) return diff_set
[docs] def base_pair_distance( s1: str, s2: str, pair_map1: Optional[Dict[int, Optional[int]]] = None, pair_map2: Optional[Dict[int, Optional[int]]] = None, ignore_ind: Iterable[int] = (), accept_unpaired_ind: Iterable[int] = (), **kwargs, ) -> int: """ Compute the number of base pair differences between two structures. Parameters ---------- s1 : str First dot-bracket structure. s2 : str Second dot-bracket structure. pair_map1 : dict, optional Pair map for s1. pair_map2 : dict, optional Pair map for s2. ignore_ind : iterable of int, optional Indices to ignore. accept_unpaired_ind : iterable of int, optional Positions where being unpaired in s2 is allowed. Returns ------- int Total number of mismatched base pairs. """ return len( base_pair_difference( s1, s2, pair_map1, pair_map2, ignore_ind, accept_unpaired_ind, **kwargs ) )
### ### FOLDING BARRIERS ###
[docs] def folding_barriers(structure: str, kl_delay: int = 150) -> Tuple[str, int]: """ Compute the folding barriers for a given RNA secondary structure. This function is based on ROAD: https://www.nature.com/articles/s41557-021-00679-1 This function analyzes the dot-bracket representation of the RNA secondary structure to determine folding barriers based on base pair topology and kinetic delay. Parameters ---------- structure : str The dot-bracket notation representing the secondary structure of the RNA. If folding barriers is called from a Motif or Origami, the structure is already provided by the object in the dot-bracket format. kl_delay : int, optional, default=150 The number of nucleotides (nts) of delay before kissing loops (KLs) snap closed. A typical realistic value is around 350 (~1 second), while a more conservative setting is 150 by default. Returns ------- Tuple[str, int] A tuple containing: - A string representing the barrier map where: '─' = no barrier (0 penalty), '▂' = opening pair weak barrier (1 penalty), '▄' = cloding pair weak barrier (1 penalty), '█' = strong barrier (2 penalty). - An integer score indicating the total penalty based on barrier strengths. Notes ----- The function assigns barrier strengths based on the topology of base pairs and kinetic constraints, ensuring proper folding predictions. """ structure = structure.replace("&", "") ss_len = len(structure) s_map = dot_bracket_to_pair_map(structure) barriers = [""] * len(structure) # don't count the nucleotides in the 3' terminal position terminal_3_bonus = 16 current_barr_count = 0 for ind in range(ss_len): # get the bracket at the position bra = structure[ind] # if bra == '&': # barriers[ind] = '&' # continue # set the default barrier to '─', no barrier barriers[ind] = "─" if ind > ss_len - terminal_3_bonus: if s_map[ind] is not None: barriers[s_map[ind]] = "─" continue ### unpaired nts are not barriers and reset the current barrier count if bra == ".": barriers[ind] = "─" current_barr_count = 0 ### opening pairs may be barriers or not, # indicate them with '▂', reset the barrier count elif bra == "(": barriers[ind] = "▂" current_barr_count = 0 ### closing pairs may be barriers or not, # depending on the topology count elif bra == ")": # if the opening pair is not blocked, # we close and reset the current barrier count if barriers[s_map[ind]] == "▂": barriers[ind] = "─" barriers[s_map[ind]] = "─" current_barr_count = 0 # if the closing pair is already blocked, then # we mark the barrier reached and start counting elif barriers[s_map[ind]] == "█": # if the current barrier count is greater than 5, wa mark it with '█' if current_barr_count > 5: barriers[ind] = "█" barriers[s_map[ind]] = "▄" # if the current barrier count is less than 5, we mark it with '▄' else: barriers[ind] = "▄" barriers[s_map[ind]] = "▄" current_barr_count += 1 # if the index is bigger than the kl_delay, # we check if the internal KLs are closed if ind > kl_delay: # we check if the KLs are closed, # if yes, we mark them with 'N' close_sym = structure[ind - kl_delay] if close_sym not in "(.)" and close_sym in db_pairs.values(): barriers[ind - kl_delay] == "─" barriers[s_map[ind - kl_delay]] == "─" # for each nt in the delay, we check if there are any barriers, # if yes, we mark them with '█' for k in range(s_map[ind - kl_delay], ind - kl_delay): if barriers[k] == "▂": barriers[k] = "█" penalty = {"█": 2, "▄": 1, "▂": 1, "─": 0} # , '&': 0} repl = [penalty[i] for i in barriers] score = sum(repl) return "".join(barriers), score
### ### CUSTOM EXCEPTIONS ###
[docs] class MotifStructureError(Exception): """ Exception raised when a motif's secondary structure is invalid or inconsistent. This error typically indicates issues such as: - Overlapping or conflicting strands - Joining strands with different directionalities - Logical inconsistencies in structural motifs """ pass
[docs] class AmbiguosStructure(Warning): """ Warning raised when a structure is ambiguous or potentially problematic. This can occur in cases where: - The strand routing is not clear Parameters ---------- message : str Description of the ambiguity or issue. """ def __init__(self, message: str) -> None: self.message: str = message def __str__(self) -> str: return repr(self.message)