# Source code for abpytools.features.composition

from collections import Counter, defaultdict
from itertools import  product
import re


# one-letter codes of the twenty amino acids, in alphabetical order; this is the
# column order used whenever a per-residue dictionary is flattened into a list
aa_order = list('ACDEFGHIKLMNPQRSTVWY')

# Amino acid classes ('0' to '6') following the Shen paper; the grouping takes into
# account the side chain dipole and volume. Each string below lists the members of one
# class, and the position of the string is the class label.
# for more information check http://www.pnas.org/content/104/11/4337/suppl/DC1
aa_group = {
    residue: str(label)
    for label, members in enumerate(('AGV', 'ILFP', 'YMTS', 'HNQW', 'RK', 'DE', 'C'))
    for residue in members
}


def chou_pseudo_aa_composition(*sequences):
    """
    Featurise sequences with the general form of Chou's pseudo amino acid composition.

    M.K. Gupta , R. Niyogi & M. Misra (2013) An alignment-free method to find similarity among
    protein sequences via the general form of Chou's pseudo amino acid composition,
    SAR and QSAR in Environmental Research, 24:7, 597-609, DOI: 10.1080/1062936X.2013.773378

    Args:
        *sequences: amino acid sequences

    Returns:
        list with one feature vector per sequence. Each vector is the concatenation of
        three lists ordered by aa_order: the amino acid counts, the cumulative distances
        to the first residue and the amino acid distribution.
    """
    features = []
    for sequence in sequences:
        # per-residue dictionaries for this sequence
        counts = aa_composition(sequence)
        distances = distance_to_first(sequence)
        distribution = aa_distribution(sequence, counts, distances)

        # flatten each dictionary in aa_order and join the three parts
        features.append(order_seq(counts) + order_seq(distances) + order_seq(distribution))

    return features
def aa_composition(seq):
    """
    Count how many times each residue occurs in a sequence.

    Args:
        seq (str): A string representing a sequence

    Returns:
        Counter mapping each residue found in seq to its number of occurrences
    """
    tally = Counter()
    tally.update(seq)
    return tally
def aa_frequency(seq):
    """
    Relative frequency of each residue in a sequence.

    Args:
        seq (str): A string representing a sequence

    Returns:
        Dictionary mapping each residue found in seq to its count divided by the
        total number of residues (empty dictionary for an empty sequence)
    """
    counts = aa_composition(seq)
    n_residues = sum(counts.values())

    frequencies = {}
    for residue, occurrences in counts.items():
        frequencies[residue] = occurrences / n_residues
    return frequencies
def distance_to_first(seq):
    """
    Cumulative distance of each of the twenty amino acids to the first residue.

    For every amino acid in aa_order the zero-based positions of all its occurrences
    in seq are added up; an amino acid that does not occur gets 0.

    Args:
        seq (str): A string representing a sequence

    Returns:
        Dictionary mapping each amino acid in aa_order to its cumulative distance
    """
    cumulative = {}
    for residue in aa_order:
        cumulative[residue] = sum(hit.start() for hit in re.finditer(residue, seq))
    return cumulative
def aa_distribution(seq, aa_count, aa_distance_to_first):
    """
    Amino acid distribution described in "An alignment-free method to find similarity among
    protein sequences via the general form of Chou's pseudo amino acid composition".

    For every occurrence of a residue at zero-based position i, the term
    (i - aa_distance_to_first[aa] / aa_count[aa]) ** 2 / aa_count[aa]
    is added to that residue's total.

    Args:
        seq (str): amino acid sequence
        aa_count (dict): amino acid count of the sequence
        aa_distance_to_first (dict): cumulative distance to the first position for each amino acid

    Returns:
        defaultdict(int) mapping each residue found in seq to its distribution value
    """
    distribution = defaultdict(int)
    for position, residue in enumerate(seq):
        occurrences = aa_count[residue]
        mean_position = aa_distance_to_first[residue] / occurrences
        distribution[residue] += (position - mean_position) ** 2 / occurrences
    return distribution
def order_seq(seq_dict):
    """
    Flatten a per-residue dictionary into a list that follows aa_order.

    Args:
        seq_dict (dict): dictionary with amino acid keys

    Returns:
        A list with one value per amino acid in aa_order; amino acids missing
        from seq_dict contribute 0 and keys outside aa_order are ignored
    """
    return [seq_dict.get(residue, 0) for residue in aa_order]
def triad_method(*sequences):
    """
    Triad featurisation method described in Shen J. et al. (2006). Predicting protein-protein
    interactions based only on sequences information. PNAS, 104(11), pp: 4337-4341.

    Every residue is replaced by its class from aa_group, the occurrences of each run of
    three consecutive classes (triad) are counted, and the counts f are normalised as
    (f - min(f)) / max(f).

    Args:
        *sequences (list): sequence of amino acids

    Returns:
        list of lists with results of triad method, shape (len(sequences), 343).
        A sequence with fewer than three residues contains no triad and yields a
        vector of zeros.

    Raises:
        KeyError: if a sequence contains a residue that is not a key of aa_group
    """
    # all 343 triads/keys (7 classes ** 3); the same for every sequence, so built only once
    f_keys = [''.join(x) for x in product('0123456', repeat=3)]

    d_matrix = []

    for sequence in sequences:

        # classify aa in sequence
        v = [aa_group[aa] for aa in sequence]

        # count triads; a Counter reports 0 for triads that never occur
        f_results = Counter(''.join(v[x:x + 3]) for x in range(len(v) - 2))
        frequencies = [f_results[key] for key in f_keys]

        # normalise values
        f_max = max(frequencies)
        f_min = min(frequencies)

        if f_max == 0:
            # no triad at all (sequence shorter than 3): avoid dividing by zero
            d = [0.0] * len(f_keys)
        else:
            d = [(f - f_min) / f_max for f in frequencies]

        # append 343 dimensional vector to d_matrix
        d_matrix.append(d)

    # d_matrix has shape (len(sequences), 343)
    return d_matrix
def side_chain_volume(sequences):
    """
    Placeholder without an implementation.

    Args:
        sequences: currently unused

    Returns:
        None
    """
    return None
def auto_covariance(sequences):
    """
    Placeholder without an implementation.

    Args:
        sequences: currently unused

    Returns:
        None
    """
    return None