Source code for pyfurnace.prepare.utils

from typing import Union, Tuple, List, Callable
from Bio.Seq import Seq
from Bio.SeqUtils import MeltingTemp as mt
from Bio.SeqUtils import gc_fraction
from functools import partial

# https://www.bioinformatics.org/sms/iupac.html
# dictionary of melting temperature methods:
# name as key and function as value
tm_methods = {
    "Nearest Neighbor": mt.Tm_NN,
    "Empirical GC content": mt.Tm_GC,
    "Wallace, rule of thumb": mt.Tm_Wallace,
}

# dictionary of melting temperature models:
#  name as key and function as value
tm_models = {
    "Nearest Neighbor": [
        "DNA_NN4",
        "DNA_NN1",
        "DNA_NN2",
        "DNA_NN3",
        "RNA_NN1",
        "RNA_NN2",
        "RNA_NN3",
        "R_DNA_NN1",
    ],
    "Empirical GC content": [1, 2, 3, 4, 5, 6, 7, 8],
    "Wallace, rule of thumb": [],
}

# dictionary of Nearest Neighbour melting temperature models:
#  name as key and function as value
NN_models = {
    "DNA_NN4": mt.DNA_NN4,
    "DNA_NN1": mt.DNA_NN1,
    "DNA_NN2": mt.DNA_NN2,
    "DNA_NN3": mt.DNA_NN3,
    "RNA_NN1": mt.RNA_NN1,
    "RNA_NN2": mt.RNA_NN2,
    "RNA_NN3": mt.RNA_NN3,
    "R_DNA_NN1": mt.R_DNA_NN1,
}

# dictionary of default values for the primer energy parameters

default_values = {
    "Na": 0,
    "K": 50,
    "Tris": 20,
    "Mg": 1.5,
    "dNTPs": 0.2,
    "Method": 7,
    "DMSO (%)": 0,
    "mt_method": list(tm_methods)[0],
    "mt_model": tm_models[list(tm_methods)[0]][0],
    "Primer": 500,
}



[docs]
def make_tm_calculator(
    method_name: str = "Nearest Neighbor",
    model_name: str = "DNA_NN4",
    primer_conc: float = 500,
    tm_kwargs: dict = default_values.copy(),
) -> callable:
    """
    Creates a melting temperature (Tm) calculator function for DNA primers using
    specified calculation method and model.
    Parameters
    ----------
    method_name : str, optional
        The method used for Tm calculation.
        Options include "Nearest Neighbor",
        "Empirical GC content",
        and "Wallace, rule of thumb".
        Default is "Nearest Neighbor".
    model_name : str, optional
        The nearest neighbor model to use for Tm calculation.
        Only relevant if `method_name` is "Nearest Neighbor".
        Default is "DNA_NN4".
    primer_conc : float, optional
        The concentration of the primer (in nM) used in the calculation.
        Only relevant for "Nearest Neighbor" method.
        Default is 500.
    tm_kwargs : dict, optional
        Additional keyword arguments for the Tm calculation method.
        May include chemical corrections (e.g., DMSO, dNTPs).
        Default is a copy of `default_values`.
    Returns
    -------
    callable
        A function that takes a DNA sequence (str) and returns its melting
        temperature (float), possibly corrected for chemical additives.

    Notes
    -----
    - If DMSO is specified in `tm_kwargs`, the returned function
        applies a chemical correction to the calculated Tm.
    - For "Nearest Neighbor" method, dNTPs and DMSO are removed
        from the keyword arguments before calculation.
    - For "Empirical formulas based on GC content",
        DMSO is removed from the keyword arguments before calculation.
    - For "Wallace" method, no additional arguments are processed.
    Examples
    --------
    >>> tm_calc = make_tm_calculator(method_name="Nearest Neighbor", primer_conc=250)
    >>> tm_calc("ATCGATCG")
    62.5
    """
    method = tm_methods[method_name]

    first_word = method_name.split()[0].lower()

    if first_word == "nearest":
        model = NN_models[model_name]
        method_kwargs = tm_kwargs.copy()
        if "dNTPs" in method_kwargs:
            method_kwargs.pop("dNTPs")
        if "DMSO" in method_kwargs:
            method_kwargs.pop("DMSO")
        partial_func = partial(
            method, nn_table=model, dnac1=primer_conc, dnac2=0, **method_kwargs
        )

    elif first_word == "empirical":
        method_kwargs = tm_kwargs.copy()
        if "DMSO" in method_kwargs:
            method_kwargs.pop("DMSO")
        partial_func = partial(method, valueset=model_name, **method_kwargs)

    else:  # Wallace
        partial_func = method

    if tm_kwargs.get("DMSO", 0) > 0:
        return lambda seq: mt.chem_correction(partial_func(seq), DMSO=tm_kwargs["DMSO"])

    return partial_func




[docs]
def calculate_gc(seq: str) -> float:
    """
    Calculate the GC content percentage of a nucleotide sequence.

    Parameters
    ----------
    seq : str
        The nucleotide sequence for which to calculate the GC content.

    Returns
    -------
    float
        The GC content of the sequence as a percentage, rounded to one decimal place.

    Notes
    -----
    Ambiguous nucleotides are ignored in the calculation.
    """
    return round(gc_fraction(seq, ambiguous="ignore") * 100, 1)




[docs]
def annealing_temp(
    mts: List[float], seq: str, tm_kwargs: dict, method: str = "IDT"
) -> float:
    """
    Calculate the annealing temperature for PCR primers using either
    the IDT or Phusion method.

    Parameters
    ----------
    mts : List[float]
        List of melting temperatures (Tm) for the primers.
    seq : str
        DNA sequence for which the annealing temperature is to be calculated.
    tm_kwargs : dict
        Dictionary of keyword arguments for Tm calculation, such as
        salt concentration, DMSO, etc.
    method : str, optional
        Method to use for calculation. Either 'IDT' or 'Phusion'.
        Default is 'IDT'.

    Returns
    -------
    float
        Calculated annealing temperature, rounded to two decimal places.

    Notes
    -----
    - The 'IDT' method applies a chemical correction and a weighted average formula.
    - The 'Phusion' method uses the minimum Tm, with an adjustment for primer length.
    - Certain keys in `tm_kwargs` (e.g., 'dNTPs', 'DMSO') are handled specifically
        for the 'IDT' method.
    """
    if method == "IDT":
        method_kwargs = tm_kwargs.copy()
        if "dNTPs" in method_kwargs:
            method_kwargs.pop("dNTPs")
        if "DMSO" in method_kwargs:
            method_kwargs.pop("DMSO")
        t_anneal = (
            0.3 * min(mts)
            + 0.7
            * mt.chem_correction(
                mt.Tm_GC(seq, valueset=7, **method_kwargs), DMSO=tm_kwargs["DMSO"]
            )
            - 14.9
        )
    else:  # Phusion method
        t_anneal = min(mts)
        if all(len(p) > 20 for p in seq):
            t_anneal += 3
    return round(t_anneal, 2)




[docs]
def check_dimer(
    seq1: str,
    seq2: str,
    dict_format: bool = False,
    basepair: dict = {"A": "T", "T": "A", "C": "G", "G": "C"},
) -> Union[str, dict]:
    """
    Check the dimerization of two sequences and return the best dimer found.
    The dimerization is checked in an extremely simple way, by aligning the
    two sequences and checking for WC basepairing at the same index.
    If dict_format is True, return a dictionary with all the dimers found.

    Parameters:
    -----------
    seq1 (str):
        The first sequence to check.
    seq2 (str):
        The second sequence to check.
    dict_format (bool):
        If True, return a dictionary with all the dimers found.
    basepair (dict):
        A dictionary with the basepairing rules.
        Default is {'A': 'U', 'U': 'A', 'C': 'G', 'G': 'C'} for RNA.
        For DNA, use {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}.

    Returns:
    --------
    str or dict:
        If dict_format is True, return a dictionary with the dimers found.
        If dict_format is False, return the best dimer found as a string.
    """
    dimers_dict = {}
    basepair = str.maketrans(basepair)
    top_seq = " " * (len(seq2) - 1) + str(seq1) + " " * len(seq2)
    bottom_seq = str(seq2[::-1])
    max_match = [top_seq, " " * len(top_seq), bottom_seq]

    for start_ind in range(len(seq1) + len(seq2) - 2):
        bp = ""
        for ind, b in enumerate(bottom_seq):
            if (
                b.translate(basepair) == top_seq[ind]
                and b != " "
                and top_seq[ind] != " "
            ):
                bp += "┊"
            else:
                bp += " "
        num_bp = bp.count("┊")
        index_num = f"[{max(0, start_ind - len(seq2) + 1)}]"
        if num_bp > max_match[1].count("┊"):
            max_match = [
                index_num + top_seq,
                " " * len(index_num) + bp,
                " " * len(index_num) + bottom_seq,
            ]
        if dict_format:
            if num_bp not in dimers_dict:
                dimers_dict[num_bp] = []
            dimers_dict[num_bp].append("\n".join(max_match))
        top_seq = top_seq[1:]

    return dimers_dict if dict_format else "\n".join(max_match)




[docs]
def auto_design_primers(
    seq: Seq, target_temp: float = 65.0, tm_func: Callable = make_tm_calculator()
) -> Tuple[List[str], List[float]]:
    """
    Automatically designs forward and reverse primers for a given DNA sequence,
      optimizing for melting temperature (Tm) and primer quality.

    Parameters
    ----------
    seq : Seq
        The DNA sequence for which primers are to be designed.
    target_temp : float, optional
        The target melting temperature (Tm) for the primers, by default 65.0°C.
    tm_func : Callable, optional
        A function to calculate the melting temperature of a primer sequence,
        by default uses `make_tm_calculator()`.

    Returns
    -------
    final_primers : list of str
        A list containing the best forward and reverse primer sequences.
    final_mts : list of float
        A list containing the melting temperatures (Tm) of the selected primers.

    Notes
    -----
    - The function evaluates primer candidates based on Tm, GC content, length,
        and dimerization potential.
    - Returns empty string and 0 Tm if no suitable primer is found for a direction.
    """
    final_primers = []
    final_mts = []

    for direction in (1, -1):
        prim_length = 10
        primers_info = []

        to_prime = seq if direction == 1 else str(Seq(seq).reverse_complement())

        while prim_length < len(seq):
            primer = to_prime[:prim_length]
            tm = round(tm_func(primer), 2)

            if tm > (target_temp + 2.5):
                break
            if tm < (target_temp - 2.5):
                prim_length += 1
                continue

            score = 0
            if primer[-1] in "GC":
                score += 1
            if primer[-2] in "GC":
                score += 1
            if 18 < prim_length < 30:
                score += 1
            elif prim_length < 18:
                score -= 17 - prim_length
            if 40 < gc_fraction(primer, ambiguous="ignore") * 100 < 60:
                score += 1
            score -= abs(tm - target_temp) / 2

            dimer_score = max(check_dimer(primer, primer, dict_format=True).keys())
            score -= dimer_score / (len(primer) // 2)

            primers_info.append((score, tm, str(primer)))
            prim_length += 1

        primers_info.sort(reverse=True)
        if primers_info:
            final_mts.append(primers_info[0][1])
            final_primers.append(primers_info[0][2])
        else:
            final_primers.append("")
            final_mts.append(0)

    return final_primers, final_mts