Source code for pyfurnace.prepare.utils

from typing import Union, Tuple, List, Callable
from Bio.Seq import Seq
from Bio.SeqUtils import MeltingTemp as mt
from Bio.SeqUtils import gc_fraction
from functools import partial

# https://www.bioinformatics.org/sms/iupac.html
# dictionary of melting temperature methods:
# name as key and function as value
tm_methods = {
    "Nearest Neighbor": mt.Tm_NN,
    "Empirical GC content": mt.Tm_GC,
    "Wallace, rule of thumb": mt.Tm_Wallace,
}

# dictionary of melting temperature models:
#  name as key and function as value
tm_models = {
    "Nearest Neighbor": [
        "DNA_NN4",
        "DNA_NN1",
        "DNA_NN2",
        "DNA_NN3",
        "RNA_NN1",
        "RNA_NN2",
        "RNA_NN3",
        "R_DNA_NN1",
    ],
    "Empirical GC content": [1, 2, 3, 4, 5, 6, 7, 8],
    "Wallace, rule of thumb": [],
}

# dictionary of Nearest Neighbour melting temperature models:
#  name as key and function as value
NN_models = {
    "DNA_NN4": mt.DNA_NN4,
    "DNA_NN1": mt.DNA_NN1,
    "DNA_NN2": mt.DNA_NN2,
    "DNA_NN3": mt.DNA_NN3,
    "RNA_NN1": mt.RNA_NN1,
    "RNA_NN2": mt.RNA_NN2,
    "RNA_NN3": mt.RNA_NN3,
    "R_DNA_NN1": mt.R_DNA_NN1,
}

# dictionary of default values for the primer energy parameters

default_values = {
    "Na": 0,
    "K": 50,
    "Tris": 20,
    "Mg": 1.5,
    "dNTPs": 0.2,
    "Method": 7,
    "DMSO (%)": 0,
    "mt_method": list(tm_methods)[0],
    "mt_model": tm_models[list(tm_methods)[0]][0],
    "Primer": 500,
}


[docs] def make_tm_calculator( method_name: str = "Nearest Neighbor", model_name: str = "DNA_NN4", primer_conc: float = 500, tm_kwargs: dict = default_values.copy(), ) -> callable: """ Creates a melting temperature (Tm) calculator function for DNA primers using specified calculation method and model. Parameters ---------- method_name : str, optional The method used for Tm calculation. Options include "Nearest Neighbor", "Empirical GC content", and "Wallace, rule of thumb". Default is "Nearest Neighbor". model_name : str, optional The nearest neighbor model to use for Tm calculation. Only relevant if `method_name` is "Nearest Neighbor". Default is "DNA_NN4". primer_conc : float, optional The concentration of the primer (in nM) used in the calculation. Only relevant for "Nearest Neighbor" method. Default is 500. tm_kwargs : dict, optional Additional keyword arguments for the Tm calculation method. May include chemical corrections (e.g., DMSO, dNTPs). Default is a copy of `default_values`. Returns ------- callable A function that takes a DNA sequence (str) and returns its melting temperature (float), possibly corrected for chemical additives. Notes ----- - If DMSO is specified in `tm_kwargs`, the returned function applies a chemical correction to the calculated Tm. - For "Nearest Neighbor" method, dNTPs and DMSO are removed from the keyword arguments before calculation. - For "Empirical formulas based on GC content", DMSO is removed from the keyword arguments before calculation. - For "Wallace" method, no additional arguments are processed. Examples -------- >>> tm_calc = make_tm_calculator(method_name="Nearest Neighbor", primer_conc=250) >>> tm_calc("ATCGATCG") 62.5 """ method = tm_methods[method_name] first_word = method_name.split()[0].lower() if first_word == "nearest": model = NN_models[model_name] method_kwargs = tm_kwargs.copy() if "dNTPs" in method_kwargs: method_kwargs.pop("dNTPs") if "DMSO" in method_kwargs: method_kwargs.pop("DMSO") partial_func = partial( method, nn_table=model, dnac1=primer_conc, dnac2=0, **method_kwargs ) elif first_word == "empirical": method_kwargs = tm_kwargs.copy() if "DMSO" in method_kwargs: method_kwargs.pop("DMSO") partial_func = partial(method, valueset=model_name, **method_kwargs) else: # Wallace partial_func = method if tm_kwargs.get("DMSO", 0) > 0: return lambda seq: mt.chem_correction(partial_func(seq), DMSO=tm_kwargs["DMSO"]) return partial_func
[docs] def calculate_gc(seq: str) -> float: """ Calculate the GC content percentage of a nucleotide sequence. Parameters ---------- seq : str The nucleotide sequence for which to calculate the GC content. Returns ------- float The GC content of the sequence as a percentage, rounded to one decimal place. Notes ----- Ambiguous nucleotides are ignored in the calculation. """ return round(gc_fraction(seq, ambiguous="ignore") * 100, 1)
[docs] def annealing_temp( mts: List[float], seq: str, tm_kwargs: dict, method: str = "IDT" ) -> float: """ Calculate the annealing temperature for PCR primers using either the IDT or Phusion method. Parameters ---------- mts : List[float] List of melting temperatures (Tm) for the primers. seq : str DNA sequence for which the annealing temperature is to be calculated. tm_kwargs : dict Dictionary of keyword arguments for Tm calculation, such as salt concentration, DMSO, etc. method : str, optional Method to use for calculation. Either 'IDT' or 'Phusion'. Default is 'IDT'. Returns ------- float Calculated annealing temperature, rounded to two decimal places. Notes ----- - The 'IDT' method applies a chemical correction and a weighted average formula. - The 'Phusion' method uses the minimum Tm, with an adjustment for primer length. - Certain keys in `tm_kwargs` (e.g., 'dNTPs', 'DMSO') are handled specifically for the 'IDT' method. """ if method == "IDT": method_kwargs = tm_kwargs.copy() if "dNTPs" in method_kwargs: method_kwargs.pop("dNTPs") if "DMSO" in method_kwargs: method_kwargs.pop("DMSO") t_anneal = ( 0.3 * min(mts) + 0.7 * mt.chem_correction( mt.Tm_GC(seq, valueset=7, **method_kwargs), DMSO=tm_kwargs["DMSO"] ) - 14.9 ) else: # Phusion method t_anneal = min(mts) if all(len(p) > 20 for p in seq): t_anneal += 3 return round(t_anneal, 2)
[docs] def check_dimer( seq1: str, seq2: str, dict_format: bool = False, basepair: dict = {"A": "T", "T": "A", "C": "G", "G": "C"}, ) -> Union[str, dict]: """ Check the dimerization of two sequences and return the best dimer found. The dimerization is checked in an extremely simple way, by aligning the two sequences and checking for WC basepairing at the same index. If dict_format is True, return a dictionary with all the dimers found. Parameters: ----------- seq1 (str): The first sequence to check. seq2 (str): The second sequence to check. dict_format (bool): If True, return a dictionary with all the dimers found. basepair (dict): A dictionary with the basepairing rules. Default is {'A': 'U', 'U': 'A', 'C': 'G', 'G': 'C'} for RNA. For DNA, use {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}. Returns: -------- str or dict: If dict_format is True, return a dictionary with the dimers found. If dict_format is False, return the best dimer found as a string. """ dimers_dict = {} basepair = str.maketrans(basepair) top_seq = " " * (len(seq2) - 1) + str(seq1) + " " * len(seq2) bottom_seq = str(seq2[::-1]) max_match = [top_seq, " " * len(top_seq), bottom_seq] for start_ind in range(len(seq1) + len(seq2) - 2): bp = "" for ind, b in enumerate(bottom_seq): if ( b.translate(basepair) == top_seq[ind] and b != " " and top_seq[ind] != " " ): bp += "┊" else: bp += " " num_bp = bp.count("┊") index_num = f"[{max(0, start_ind - len(seq2) + 1)}]" if num_bp > max_match[1].count("┊"): max_match = [ index_num + top_seq, " " * len(index_num) + bp, " " * len(index_num) + bottom_seq, ] if dict_format: if num_bp not in dimers_dict: dimers_dict[num_bp] = [] dimers_dict[num_bp].append("\n".join(max_match)) top_seq = top_seq[1:] return dimers_dict if dict_format else "\n".join(max_match)
[docs] def auto_design_primers( seq: Seq, target_temp: float = 65.0, tm_func: Callable = make_tm_calculator() ) -> Tuple[List[str], List[float]]: """ Automatically designs forward and reverse primers for a given DNA sequence, optimizing for melting temperature (Tm) and primer quality. Parameters ---------- seq : Seq The DNA sequence for which primers are to be designed. target_temp : float, optional The target melting temperature (Tm) for the primers, by default 65.0°C. tm_func : Callable, optional A function to calculate the melting temperature of a primer sequence, by default uses `make_tm_calculator()`. Returns ------- final_primers : list of str A list containing the best forward and reverse primer sequences. final_mts : list of float A list containing the melting temperatures (Tm) of the selected primers. Notes ----- - The function evaluates primer candidates based on Tm, GC content, length, and dimerization potential. - Returns empty string and 0 Tm if no suitable primer is found for a direction. """ final_primers = [] final_mts = [] for direction in (1, -1): prim_length = 10 primers_info = [] to_prime = seq if direction == 1 else str(Seq(seq).reverse_complement()) while prim_length < len(seq): primer = to_prime[:prim_length] tm = round(tm_func(primer), 2) if tm > (target_temp + 2.5): break if tm < (target_temp - 2.5): prim_length += 1 continue score = 0 if primer[-1] in "GC": score += 1 if primer[-2] in "GC": score += 1 if 18 < prim_length < 30: score += 1 elif prim_length < 18: score -= 17 - prim_length if 40 < gc_fraction(primer, ambiguous="ignore") * 100 < 60: score += 1 score -= abs(tm - target_temp) / 2 dimer_score = max(check_dimer(primer, primer, dict_format=True).keys()) score -= dimer_score / (len(primer) // 2) primers_info.append((score, tm, str(primer))) prim_length += 1 primers_info.sort(reverse=True) if primers_info: final_mts.append(primers_info[0][1]) final_primers.append(primers_info[0][2]) else: final_primers.append("") final_mts.append(0) return final_primers, final_mts