Source code for pyfurnace.design.core.symbols

"""
This module contains general utility functions and constants for the pyfurnace package.

It includes:
functions for manipulating RNA strands and sequences,
utilities for RNA structures (dot-bracket notation, pair maps, trees)
folding barriers calculation for canonical co-transcriptional RNA Origami
"""

import random
import warnings
from typing import Optional, Tuple, Dict, List, Union, Iterable
from .basepair import BasePair

###
### USEFUL SETS AND DICTIONARIES
###

#: Set of all accepted nucleotides, including standard bases and extended IUPAC codes.
nucleotides = {
    "A",
    "U",
    "C",
    "G",  # standard nucleotides
    "W",  # A or U
    "M",  # A or C
    "R",  # A or G
    "Y",  # C or U
    "K",  # G or U
    "S",  # G or C
    "D",  # A or G or U
    "H",  # A or C or U
    "V",  # A or C or G
    "B",  # C or G or U
    "N",  # any nucleotide
    "X",  # any nucleotide for external Kissing Loops in ROAD
    "&",  # separator
}

#: Dictionary mapping IUPAC codes to corresponding sets of nucleotides.
#: Source: https://www.bioinformatics.org/sms/iupac.html
iupac_code = {
    "A": {"A"},
    "U": {"U"},
    "C": {"C"},
    "G": {"G"},  # standard nucleotides
    "W": {"A", "U"},  # A or U
    "M": {"A", "C"},  # A or C
    "R": {"A", "G"},  # A or G
    "Y": {"C", "U"},  # C or U
    "K": {"G", "U"},  # G or U
    "S": {"G", "C"},  # G or C
    "D": {"A", "G", "U"},  # A or G or U
    "H": {"A", "C", "U"},  # A or C or U
    "V": {"A", "C", "G"},  # A or C or G
    "B": {"C", "G", "U"},  # C or G or U
    "N": {"A", "U", "C", "G"},  # any nucleotide
    "X": {"A", "U", "C", "G"},  # any nucleotide for external Kissing Loops in ROAD
    "&": {"&"},  # separator
}

#: Dictionary mapping IUPAC codes to the set of codes they can base pair with.
base_pairing = {
    "A": {"B", "D", "H", "K", "N", "U", "W", "Y"},
    "U": {"A", "B", "D", "G", "H", "K", "M", "N", "R", "S", "V", "W"},
    "C": {"B", "D", "G", "K", "N", "R", "S", "V"},
    "G": {"B", "C", "D", "H", "K", "M", "N", "S", "U", "V", "W", "Y"},
    "W": {"A", "B", "D", "G", "H", "K", "M", "N", "R", "S", "U", "V", "W", "Y"},
    "M": {"B", "D", "G", "H", "K", "N", "R", "S", "U", "V", "W", "Y"},
    "R": {"B", "C", "D", "H", "K", "M", "N", "S", "U", "V", "W", "Y"},
    "Y": {"A", "B", "D", "G", "H", "K", "M", "N", "R", "S", "V", "W"},
    "K": {"A", "B", "C", "D", "G", "H", "K", "M", "N", "R", "S", "U", "V", "W", "Y"},
    "S": {"B", "C", "D", "G", "H", "K", "M", "N", "R", "S", "U", "V", "W", "Y"},
    "D": {"A", "B", "C", "D", "G", "H", "K", "M", "N", "R", "S", "U", "V", "W", "Y"},
    "H": {"A", "B", "D", "G", "H", "K", "M", "N", "R", "S", "U", "V", "W", "Y"},
    "V": {"B", "C", "D", "G", "H", "K", "M", "N", "R", "S", "U", "V", "W", "Y"},
    "B": {"A", "B", "C", "D", "G", "H", "K", "M", "N", "R", "S", "U", "V", "W", "Y"},
    "N": {"A", "B", "C", "D", "G", "H", "K", "M", "N", "R", "S", "U", "V", "W", "Y"},
    # this symbol is used for external Kissing Loops in ROAD
    "X": {
        "A",
        "B",
        "C",
        "D",
        "G",
        "H",
        "K",
        "M",
        "N",
        "R",
        "S",
        "U",
        "V",
        "W",
        "Y",
        "X",
    },
    "&": {"&"},  # separator
}

#: Dictionary mapping dot-bracket symbols to their corresponding pairing symbols.
db_pairs = {
    "(": ")",
    "[": "]",
    "{": "}",
    "<": ">",
    "A": "a",
    "B": "b",
    "C": "c",
    "D": "d",
    "E": "e",
    "F": "f",
    "G": "g",
    "H": "h",
    "I": "i",
    "J": "j",
    "K": "k",
    "L": "l",
    "M": "m",
    "N": "n",
    "O": "o",
    "P": "p",
    "Q": "q",
    "R": "r",
    "S": "s",
    "T": "t",
    "U": "u",
    "V": "v",
    "W": "w",
    "X": "x",
    "Y": "y",
    "Z": "z",
}

#: Tuple containing all possible pseudoknot symbols used in dot-bracket notation.
all_pk_symbols = (
    "[",
    "]",
    "{",
    "}",
    "<",
    ">",
    "A",
    "B",
    "C",
    "D",
    "E",
    "F",
    "G",
    "H",
    "I",
    "J",
    "K",
    "L",
    "M",
    "N",
    "O",
    "P",
    "Q",
    "R",
    "S",
    "T",
    "U",
    "V",
    "W",
    "X",
    "Y",
    "Z",
    "a",
    "b",
    "c",
    "d",
    "e",
    "f",
    "g",
    "h",
    "i",
    "j",
    "k",
    "l",
    "m",
    "n",
    "o",
    "p",
    "q",
    "r",
    "s",
    "t",
    "u",
    "v",
    "w",
    "x",
    "y",
    "z",
)

#: Set of all accepted symbols including nucleotides, dots, brackets, and special ROAD
#: symbols.
accept_symbol = (
    nucleotides
    | {".", "(", ")"}
    | set(all_pk_symbols)
    | {
        "─",
        "│",
        "╭",
        "╮",
        "╰",
        "╯",
        "^",
        "*",
        "┼",
        "┊",
        "~",
        "◦",
        "↑",
        "↓",
        "⊗",
        "⊙",
        "•",
        "▂",
        "▄",
        "█",
        "-",
        "|",
        "+",
        ":",
        "=",
        "!",
        " ",
        "/",
        "\\",
        "3",
        "5",
        "&",
    }
)

#: Set of symbols representing base pairs.
bp_symbols = {"┊", "=", ":", "!", "*"}

#: Sequence representing the T7 promoter.
T7_PROMOTER = "TAATACGACTCACTATA"

###
### STRING TRANSLATORS
###

#: String translator mapping all pseudoknot symbols to dots.
pseudo_to_dot = str.maketrans("".join(all_pk_symbols), "." * len(all_pk_symbols))

#: String translator to map each dot-bracket opening symbol to its closing counterpart
#: and vice versa.
pair_db_sym = str.maketrans(
    "".join(db_pairs.keys()) + "".join(db_pairs.values()),
    "".join(db_pairs.values()) + "".join(db_pairs.keys()),
)

#: String translator that removes all nucleotides from a string.
nucl_to_none = str.maketrans("", "", "".join(nucleotides))

#: String translator that removes all accepted symbols from a string.
symb_to_none = str.maketrans("", "", "".join(accept_symbol))

#: String translator that maps nucleotides (including IUPAC codes) to their
#: complementary base.
nucl_to_pair = str.maketrans("AUCGWMRYKSDHVBNX", "UAGCWKYRKSHDBVNX")

#: String translator that removes all non-nucleotide symbols from a string.
only_nucl = str.maketrans("", "", "".join(accept_symbol - nucleotides))

#: String translator for horizontal flip of strand turning symbols.
horiz_flip = str.maketrans("╭╮╰╯/\\", "╮╭╯╰\\/")

#: String translator for vertical flip of strand turning symbols.
verti_flip = str.maketrans("╭╮╰╯/\\", "╰╯╭╮\\/")

#: String translator for 90 degree rotation of strand symbols.
rotate_90 = str.maketrans("╭╮╰╯│|─-", "╮╯╭╰──││")

#: String translator for converting normal symbols to ROAD strand symbols.
symb_to_road = str.maketrans("-|+=:*!", "─│┼┊┊┊┊")

###
### SEQUENCE FUNCTIONS
###



[docs]
def complement(sequence: str) -> str:
    """
    Return the complement of an RNA sequence using IUPAC base-pairing rules.

    Parameters
    ----------
    sequence : str
        RNA sequence composed of standard and extended IUPAC nucleotide symbols.

    Returns
    -------
    str
        Complementary sequence with each base replaced by its pair.
    """
    return sequence.translate(nucl_to_pair)




[docs]
def reverse_complement(sequence: str) -> str:
    """
    Return the reverse complement of an RNA sequence.

    Parameters
    ----------
    sequence : str
        RNA sequence to reverse and complement.

    Returns
    -------
    str
        The reverse-complemented sequence.
    """
    return sequence.translate(nucl_to_pair)[::-1]




[docs]
def pair_nucleotide(nucleotide: str, iupac_symbol: str = "N") -> str:
    """
    Return a nucleotide that can pair with the given nucleotide, optionally
    constrained by an IUPAC symbol.

    Parameters
    ----------
    nucleotide : str
        The base to find a pair for.
    iupac_symbol : str, optional
        Constraint based on allowed base-pairing (IUPAC code).

    Returns
    -------
    str
        A valid pairing nucleotide, or the complement of the input base.
    """
    if iupac_symbol in "AUCG":
        return iupac_symbol
    if iupac_symbol == "K":
        if nucleotide == "G":
            return "U"
        elif nucleotide == "U":
            return "G"
    return nucleotide.translate(nucl_to_pair)




[docs]
def mutate_nucleotide(
    sequence: str,
    sequence_constraints: str,
    nucl_ind: int,
    pair_map: Dict[int, Optional[int]],
) -> Tuple[Optional[str], Optional[str]]:
    """
    Mutate a nucleotide in the sequence and update its paired counterpart accordingly.

    Parameters
    ----------
    sequence : str
        Original RNA sequence.
    sequence_constraints : str
        A string of IUPAC codes representing base constraints for each position.
    nucl_ind : int
        Index of the nucleotide to mutate.
    pair_map : dict of int to int or None
        A mapping of indices representing base pairs.

    Returns
    -------
    tuple of (str or None, str or None)
        A tuple with the new nucleotide and its paired base if applicable.
    """
    new_set = iupac_code[sequence_constraints[nucl_ind]] - {sequence[nucl_ind]}
    if not new_set:
        return None, None
    new_nucl = random.choice(list(new_set))

    # get the index of the paired nucleotide
    paired_nucl = pair_map[nucl_ind]

    # if the nucleotide is paired, get the paired nucleotide
    if paired_nucl is not None:
        # get the possible nucleotides that can pair with the new nucleotide
        paired_nucl = pair_nucleotide(new_nucl, sequence_constraints[paired_nucl])
    return new_nucl, paired_nucl




[docs]
def gc_content(seq: str, extended_alphabet: bool = True) -> float:
    """
    Calculate GC content (optionally including ambiguous nucleotides).

    Parameters
    ----------
    seq : str
        The RNA sequence.
    extended_alphabet : bool, optional
        Whether to include ambiguous IUPAC symbols (e.g., S, K) as partial GC.

    Returns
    -------
    float
        Proportion of GC content in the sequence.
    """
    total_count = len(seq)
    if not total_count:
        return 0
    gc_count = seq.count("G") + seq.count("C")
    if extended_alphabet:
        gc_count += (
            seq.count("S")
            + sum(map(seq.count, ["M", "R", "Y", "K", "N", "X"])) / 2
            + sum(map(seq.count, ["D", "H"])) / 3
            + sum(map(seq.count, ["V", "B"])) * 2 / 3
        )
    percentage = gc_count / total_count
    return percentage



###
### STRUCTURE FUNCTIONS
###



[docs]
def rotate_dot_bracket(dot_bracket: str, shift_left: int) -> str:
    """
    Rotate a dot-bracket structure leftward by a given number of positions.

    Parameters
    ----------
    dot_bracket : str
        RNA secondary structure in dot-bracket notation.
    shift_left : int
        Number of positions to rotate to the left.

    Returns
    -------
    str
        Rotated dot-bracket structure.
    """
    n = len(dot_bracket)

    # Step 1: Get the pair map
    pair_map = dot_bracket_to_pair_map(dot_bracket)

    # Step 3: Adjust the pair map
    new_pair_map = BasePair()
    for i, j in pair_map.items():
        new_i = (i - shift_left) % n
        new_j = (j - shift_left) % n if j is not None else None

        new_pair_map[new_i] = new_j

    return pair_map_to_dot_bracket(new_pair_map, n)




[docs]
def dot_bracket_to_pair_map(dot_bracket: str) -> BasePair:
    """
    Convert dot-bracket notation into a pair map.

    Parameters
    ----------
    dot_bracket : str
        Secondary structure in dot-bracket notation.

    Returns
    -------
    BasePair
        A bidirectional dictionary mapping nucleotide indices to their paired
        partner (or None).
    """
    pair_map = BasePair()
    stacks = [[] for _ in range(len(db_pairs))]
    for i, sym in enumerate(dot_bracket):
        if sym in db_pairs.keys():
            stack_num = list(db_pairs.keys()).index(sym)
            stacks[stack_num].append(i)
        elif sym in db_pairs.values():
            stack_num = list(db_pairs.values()).index(sym)
            if stacks[stack_num]:  # pop the last element if the stack is not empty
                j = stacks[stack_num].pop()
                pair_map[j] = i
        else:  # unpaired nucleotide: No pair
            pair_map[i] = None

    # If there are still elements that are not in the dictionary, add them as unpaired
    for i in range(len(dot_bracket)):
        if i not in pair_map:
            pair_map[i] = None

    return pair_map




[docs]
def pair_map_to_dot_bracket(
    pair_map: Dict[int, Optional[int]], structure_length: Optional[int] = None
) -> str:
    """
    Convert a base pair map into dot-bracket notation.

    Parameters
    ----------
    pair_map : dict
        Mapping from nucleotide index to paired index or None.
    structure_length : int, optional
        Total length of the structure. If None, computed from the map.

    Returns
    -------
    str
        Dot-bracket notation representing the structure.
    """
    if structure_length is None:
        keys_max = max(pair_map.keys(), default=-1)
        values_max = max((k for k in pair_map.values() if k is not None), default=-1)
        structure_length = max(keys_max, values_max) + 1

    ### CREATE THE DOT BRACKET NOTATION ###
    done_pairs = set()

    # Prepare the variables
    dotbracket = ["."] * structure_length
    bracket_count = 0
    recheck = True
    stack = []

    while recheck:  # recheck the structure, for pseudoknots
        recheck = False

        for i in range(structure_length):

            paired = pair_map[i]
            if paired is None or i in done_pairs or paired in done_pairs:
                continue  # ignore unpaired positions or already paired positions

            if paired > i:  # the current position will close the pair later

                # may be a pseudknot:  Check for clash
                if stack and paired > stack[-1]:
                    recheck = True  # clash detected: recheck the structure later
                    continue
                else:  # no clash: add the pair to the stack
                    stack.append(paired)
                    dotbracket[i] = list(db_pairs.keys())[bracket_count]
                    dotbracket[paired] = list(db_pairs.values())[bracket_count]

            else:  # paired already analyzed
                if not stack:  # no pair in the stack
                    continue
                if i == stack[-1]:  # the current position closes the last pair
                    # Remove the pair from the map
                    done_pairs.add(i)
                    done_pairs.add(paired)
                    stack.pop()

        stack = []  # reset the stack count
        bracket_count += 1

        if bracket_count >= 30:
            warnings.warn(
                "Warning: Too many bracket types needed in to write the "
                "structure. Stopping at 30, 'Z' to 'z'.",
                stacklevel=3,
            )
            break

    return "".join(dotbracket)




[docs]
def dot_bracket_to_stacks(
    dot_bracket: str, only_opening: bool = False
) -> Tuple[str, List[Tuple[int, int]]]:
    """
    Identify contiguous structural elements (stacks) in a dot-bracket string.

    Parameters
    ----------
    dot_bracket : str
        RNA structure in dot-bracket format.
    only_opening : bool, optional
        Whether to return only stacks that start with an opening symbol.

    Returns
    -------
    tuple of str and list of tuple
        - A reduced dot-bracket string marking distinct stacks.
        - A list of (start_index, end_index) tuples for each stack.
    """
    pair_map = dot_bracket_to_pair_map(dot_bracket)
    # a list of tuples containing starting index and the length of the stack
    stacks = []
    # a list of the reduced dot bracket symbols, that will be returned as a string
    reduced_dot_bracket = []

    stack_start = 0
    current_pair_sym = dot_bracket[0]
    last_stack_pair = pair_map[0] + 1 if pair_map[0] else None

    ### NOT NECESSARY
    # open_pair_sym = '.' + ''.join(db_pairs.keys())

    for i, symbol in enumerate(dot_bracket + "_"):

        ### NOT NECESSARY
        # # ignore the symbols that are not '.' or not opening pairs,
        # # they should be already taken into accound
        # if symbol in open_pair_sym:
        #     continue

        # get the paired index of the current symbol
        stack_pair = pair_map.get(i)

        # if symbol change, or the pairing is not consecutive,
        # add the last stack to the list

        if (
            symbol != current_pair_sym
            or stack_pair is not None
            and stack_pair != last_stack_pair - 1
        ):

            add_last_stack = True  # add the last stack to the list by default
            # if only the opening stacks are needed,
            # check if the last stack is an opening stack or unpaired

            if only_opening:
                add_last_stack = (
                    current_pair_sym in db_pairs.keys() or current_pair_sym == "."
                )

            if add_last_stack:
                reduced_dot_bracket.append(current_pair_sym)
                stacks.append((stack_start, i - 1))
            stack_start = i

        last_stack_pair = stack_pair
        current_pair_sym = symbol

    return "".join(reduced_dot_bracket), stacks



class Node:
    """
    A tree node representing a region of an RNA secondary structure.

    Attributes
    ----------
    index : int or None
        Position of the nucleotide in the sequence.
    label : str
        Structure symbol (e.g., '(', ')', '.', '&').
    paired_index : int or None
        Index of the nucleotide this node pairs with, if any.
    parent : Node or None
        Parent node in the tree.
    seq : str or None
        Sequence constraint at this node.
    children : list of Node
        Child nodes representing nested structures.
    """

    def __init__(
        self,
        index: Optional[int] = None,
        label: str = "5'",
        parent: Optional["Node"] = None,
        seq: Optional[str] = None,
        **kwargs,
    ) -> None:
        """
        Initialize a Node object.
        Parameters
        ----------
        index : int or None
            Index of the nucleotide in the sequence.
        label : str
            Structure symbol (e.g., '(', ')', '.', '&').
        parent : Node or None
            Parent node in the tree.
        seq : str or None
            Sequence constraint at this node.
        kwargs : dict
            Additional attributes to set on the node.
        """
        # Initialize the node with the given parameters
        self.index = index
        self.label = label
        self.paired_index = None
        self.parent = parent
        self.seq = seq
        self.children: List[Node] = []
        if parent is not None:
            parent.children.append(self)
        for key, value in kwargs.items():
            setattr(self, key, value)

    def __repr__(self, level=0):
        """
        Return a string representation of the node and its children.
        """
        paired_str = ""
        if self.paired_index is not None:
            paired_str = f" -> {self.paired_index}"

        if level == 0:
            ret = f"{self.label}\n"
        else:
            ret = (
                f"╰{'──' * level} {self.label} - {self.seq} "
                f"- {self.index} {paired_str} \n"
            )
        for child in self.children:
            ret += child.__repr__(level + 1)
        return ret

    def __str__(self):
        """
        Return a string representation of the node.
        """
        return self.__repr__()

    def search(
        self, target_index: Optional[int], target_label: Optional[str] = None
    ) -> Optional["Node"]:
        """
        Recursively search for a node with a matching index and label.

        Parameters
        ----------
        target_index : int or None
            Index to match.
        target_label : str or None
            Optional label to also match.

        Returns
        -------
        Node or None
            The first matching node found, or None if not found.
        """
        # Check if the current node matches the target criteria
        if target_index == self.index and (
            target_label is None or target_label == self.label
        ):
            return self
        elif (
            self.index is None
            and self.parent is None
            and abs(target_index) == float("inf")
        ):
            return self  # special case for the root node

        # Recursively search in child nodes
        for child in self.children:
            result = child.search(target_index=target_index, target_label=target_label)
            if result:
                return result  # Return the node if found

        return None  # Return None if not found



[docs]
def dot_bracket_to_tree(dot_bracket: str, sequence: Optional[str] = None) -> Node:
    """
    Convert a dot-bracket RNA structure into a hierarchical tree of nodes.

    Parameters
    ----------
    dot_bracket : str
        RNA structure in dot-bracket format. Pseudoknots are ignored.
    sequence : str, optional
        Sequence constraints associated with each index.

    Returns
    -------
    Node
        Root node of the tree representing the full structure.
    """
    # Remove pseudoknots from the dot bracket
    dot_bracket = dot_bracket.translate(pseudo_to_dot)
    if not isinstance(sequence, str) and sequence is not None:
        sequence = str(sequence)
    root = Node()
    current_node = root
    for i, label in enumerate(dot_bracket):
        if sequence is not None:
            seq_const = sequence[i]
        else:
            seq_const = None
        if label == "(":  # add a new node, move to the new node
            new_node = Node(i, label, current_node, seq=seq_const)
            current_node = new_node
        # add the paired index to the parent, move to the parent node
        elif label == ")":
            current_node.paired_index = i
            current_node = current_node.parent
        elif label == ".":  # add a children to the current node, do not move
            new_node = Node(i, label, current_node, seq=seq_const)
        elif label == "&":
            new_node = Node(None, label, current_node)
    return root




[docs]
def tree_to_dot_bracket(
    node: Node,
    dot_bracket: Optional[List[str]] = None,
    seq_constraints: Union[bool, List[str]] = False,
) -> Union[str, Tuple[str, str]]:
    """
    Convert a tree of Nodes into a dot-bracket structure and optional sequence
    constraints.

    Parameters
    ----------
    node : Node
        Root node of the RNA structure tree.
    dot_bracket : list of str, optional
        Accumulator for the dot-bracket string.
    seq_constraints : bool or list of str
        If True, collect sequence constraints using the `seq` attribute.

    Returns
    -------
    str or tuple of (str, str)
        Dot-bracket string, and sequence constraints if requested.
    """

    if isinstance(seq_constraints, bool) and seq_constraints:
        # initialize the sequence constraints only if it's a bool
        seq_constraints = ["N"]  # assume the sequence is at least one nucleotide long

    if dot_bracket is None:
        dot_bracket = ["."]  # assume the structure is at least one nucleotide long

    if node.parent is not None:  # don't add the root node
        if node.index >= len(dot_bracket):
            add_length = node.index - len(dot_bracket) + 1
            dot_bracket += ["."] * add_length
            if seq_constraints:
                seq_constraints += ["N"] * add_length

        dot_bracket[node.index] = node.label
        if seq_constraints and node.seq:
            seq_constraints[node.index] = node.seq

        if node.paired_index is not None:

            if node.paired_index >= len(dot_bracket):
                add_length = node.paired_index - len(dot_bracket) + 1
                dot_bracket += "." * add_length
                if seq_constraints:
                    seq_constraints += ["N"] * add_length

            dot_bracket[node.paired_index] = ")"
            if seq_constraints and node.seq:
                seq_constraints[node.paired_index] = node.seq.translate(nucl_to_pair)

    for child in node.children:  # recursively add the children
        tree_to_dot_bracket(child, dot_bracket, seq_constraints)

    if node.parent is None:  # we reached back the root node
        if seq_constraints:
            return "".join(dot_bracket), "".join(seq_constraints)

        return "".join(dot_bracket)



###
### DISTANCE FUNCTIONS
###



[docs]
def hamming_distance(s1: str, s2: str, ignore_ind: Iterable[int] = (), **kwargs) -> int:
    """
    Compute the Hamming distance between two strings, ignoring specified indices.

    Parameters
    ----------
    s1 : str
        First sequence or structure.
    s2 : str
        Second sequence or structure.
    ignore_ind : iterable of int, optional
        Indices to exclude from comparison.

    Returns
    -------
    int
        Number of differing characters at unignored positions.
    """
    return sum(
        (1 for i, (x, y) in enumerate(zip(s1, s2)) if x != y and i not in ignore_ind)
    )




[docs]
def base_pair_difference(
    s1: str,
    s2: str,
    pair_map1: Optional[Dict[int, Optional[int]]] = None,
    pair_map2: Optional[Dict[int, Optional[int]]] = None,
    ignore_ind: Iterable[int] = (),
    accept_unpaired_ind: Iterable[int] = (),
    **kwargs,
) -> List[int]:
    """
    Find indices where base pairing differs between two structures.

    Parameters
    ----------
    s1 : str
        First structure (dot-bracket notation).
    s2 : str
        Second structure.
    pair_map1 : dict, optional
        Precomputed pair map for s1.
    pair_map2 : dict, optional
        Precomputed pair map for s2.
    ignore_ind : iterable of int, optional
        Indices to ignore in the comparison.
    accept_unpaired_ind : iterable of int, optional
        Indices allowed to be unpaired in s2 without penalty.

    Returns
    -------
    list of int
        Indices of base pairs that differ.
    """
    if pair_map1 is None:
        pair_map1 = dot_bracket_to_pair_map(s1)
    if pair_map2 is None:
        pair_map2 = dot_bracket_to_pair_map(s2)

    # Determine if an index should be considered based on the unpaired indices
    def check_ind(main_pair_map, i, accept_unpaired=False):
        if i in ignore_ind or main_pair_map[i] is None:
            return False
        elif accept_unpaired and i in accept_unpaired_ind:
            # Only count if paired in both but different
            return pair_map2[i] is not None and pair_map1[i] != pair_map2[i]
        return pair_map1[i] != pair_map2[i]

    # pairs in s1 that are not in s2
    diff_set = [i for i in pair_map1 if check_ind(pair_map1, i, True)]
    # pairs in s2 that are not in s1
    diff_set.extend(i for i in pair_map2 if check_ind(pair_map2, i))

    return diff_set




[docs]
def base_pair_distance(
    s1: str,
    s2: str,
    pair_map1: Optional[Dict[int, Optional[int]]] = None,
    pair_map2: Optional[Dict[int, Optional[int]]] = None,
    ignore_ind: Iterable[int] = (),
    accept_unpaired_ind: Iterable[int] = (),
    **kwargs,
) -> int:
    """
    Compute the number of base pair differences between two structures.

    Parameters
    ----------
    s1 : str
        First dot-bracket structure.
    s2 : str
        Second dot-bracket structure.
    pair_map1 : dict, optional
        Pair map for s1.
    pair_map2 : dict, optional
        Pair map for s2.
    ignore_ind : iterable of int, optional
        Indices to ignore.
    accept_unpaired_ind : iterable of int, optional
        Positions where being unpaired in s2 is allowed.

    Returns
    -------
    int
        Total number of mismatched base pairs.
    """
    return len(
        base_pair_difference(
            s1, s2, pair_map1, pair_map2, ignore_ind, accept_unpaired_ind, **kwargs
        )
    )



###
### FOLDING BARRIERS
###



[docs]
def folding_barriers(structure: str, kl_delay: int = 150) -> Tuple[str, int]:
    """
    Compute the folding barriers for a given RNA secondary structure.
    This function is based on
    ROAD: https://www.nature.com/articles/s41557-021-00679-1
    This function analyzes the dot-bracket representation of the RNA secondary
    structure to determine folding barriers based on base pair topology and
    kinetic delay.

    Parameters
    ----------
    structure : str
        The dot-bracket notation representing the secondary structure of the RNA.
        If folding barriers is called from a Motif or Origami, the structure is
        already provided by the object in the dot-bracket format.
    kl_delay : int, optional, default=150
        The number of nucleotides (nts) of delay before kissing loops (KLs) snap
        closed. A typical realistic value is around 350 (~1 second), while a
        more conservative  setting is 150 by default.

    Returns
    -------
    Tuple[str, int]
        A tuple containing:
        - A string representing the barrier map where:
        '─' = no barrier (0 penalty),
        '▂' = opening pair weak barrier (1 penalty),
        '▄' = cloding pair weak barrier (1 penalty),
        '█' = strong barrier (2 penalty).
        - An integer score indicating the total penalty based on barrier strengths.

    Notes
    -----
    The function assigns barrier strengths based on the topology of base pairs
    and kinetic constraints, ensuring proper folding predictions.
    """
    structure = structure.replace("&", "")
    ss_len = len(structure)
    s_map = dot_bracket_to_pair_map(structure)
    barriers = [""] * len(structure)

    # don't count the nucleotides in the 3' terminal position
    terminal_3_bonus = 16

    current_barr_count = 0
    for ind in range(ss_len):
        # get the bracket at the position
        bra = structure[ind]
        # if bra == '&':
        #     barriers[ind] = '&'
        #     continue

        # set the default barrier to '─', no barrier
        barriers[ind] = "─"

        if ind > ss_len - terminal_3_bonus:
            if s_map[ind] is not None:
                barriers[s_map[ind]] = "─"
            continue

        ### unpaired nts are not barriers and reset the current barrier count
        if bra == ".":
            barriers[ind] = "─"
            current_barr_count = 0

        ### opening pairs may be barriers or not,
        # indicate them with '▂', reset the barrier count
        elif bra == "(":
            barriers[ind] = "▂"
            current_barr_count = 0

        ### closing pairs may be barriers or not,
        # depending on the topology count
        elif bra == ")":
            # if the opening pair is not blocked,
            # we close and reset the current barrier count
            if barriers[s_map[ind]] == "▂":
                barriers[ind] = "─"
                barriers[s_map[ind]] = "─"
                current_barr_count = 0

            # if the closing pair is already blocked, then
            # we mark the barrier reached and start counting
            elif barriers[s_map[ind]] == "█":
                # if the current barrier count is greater than 5, wa mark it with '█'
                if current_barr_count > 5:
                    barriers[ind] = "█"
                    barriers[s_map[ind]] = "▄"
                # if the current barrier count is less than 5, we mark it with '▄'
                else:
                    barriers[ind] = "▄"
                    barriers[s_map[ind]] = "▄"
                current_barr_count += 1

        # if the index is bigger than the kl_delay,
        # we check if the internal KLs are closed
        if ind > kl_delay:

            # we check if the KLs are closed,
            # if yes, we mark them with 'N'
            close_sym = structure[ind - kl_delay]
            if close_sym not in "(.)" and close_sym in db_pairs.values():
                barriers[ind - kl_delay] == "─"
                barriers[s_map[ind - kl_delay]] == "─"

                # for each nt in the delay, we check if there are any barriers,
                # if yes, we mark them with '█'
                for k in range(s_map[ind - kl_delay], ind - kl_delay):
                    if barriers[k] == "▂":
                        barriers[k] = "█"

    penalty = {"█": 2, "▄": 1, "▂": 1, "─": 0}  # , '&': 0}
    repl = [penalty[i] for i in barriers]
    score = sum(repl)
    return "".join(barriers), score



###
### CUSTOM EXCEPTIONS
###



[docs]
class MotifStructureError(Exception):
    """
    Exception raised when a motif's secondary structure is invalid or inconsistent.

    This error typically indicates issues such as:
    - Overlapping or conflicting strands
    - Joining strands with different directionalities
    - Logical inconsistencies in structural motifs
    """

    pass




[docs]
class AmbiguosStructure(Warning):
    """
    Warning raised when a structure is ambiguous or potentially problematic.

    This can occur in cases where:
    - The strand routing is not clear

    Parameters
    ----------
    message : str
        Description of the ambiguity or issue.
    """

    def __init__(self, message: str) -> None:
        self.message: str = message

    def __str__(self) -> str:
        return repr(self.message)