from collections import Counter, defaultdict
from itertools import product
import re
# Canonical (alphabetical, one-letter code) ordering of the twenty standard
# amino acids; feature vectors built per residue follow this order.
aa_order = list('ACDEFGHIKLMNPQRSTVWY')

# Residue -> class label ('0'..'6'). The seven classes follow the Shen paper
# and group amino acids by the dipole and volume of their side chains.
# For more information check http://www.pnas.org/content/104/11/4337/suppl/DC1
aa_group = {
    residue: label
    for label, residues in (
        ('0', 'AGV'),
        ('1', 'ILFP'),
        ('2', 'YMTS'),
        ('3', 'HNQW'),
        ('4', 'RK'),
        ('5', 'DE'),
        ('6', 'C'),
    )
    for residue in residues
}
def chou_pseudo_aa_composition(*sequences):
    """
    Featurise sequences with the general form of Chou's pseudo amino acid
    composition.

    M.K. Gupta, R. Niyogi & M. Misra (2013) An alignment-free method to find
    similarity among protein sequences via the general form of Chou's pseudo
    amino acid composition, SAR and QSAR in Environmental Research, 24:7,
    597-609, DOI: 10.1080/1062936X.2013.773378

    For each sequence three per-residue quantities are computed: the amino
    acid count, the cumulative distance to the first residue and the amino
    acid distribution. Each one is laid out in ``aa_order`` and the three
    lists are concatenated in that order.

    Args:
        *sequences: amino acid sequences

    Returns:
        list with one feature list per sequence (counts, then distances to
        first, then distributions; ``3 * len(aa_order)`` values in total)
    """
    # Every quantity is computed for all sequences before moving on to the
    # next quantity.
    counts = list(map(aa_composition, sequences))
    first_distances = list(map(distance_to_first, sequences))
    spreads = list(map(aa_distribution, sequences, counts, first_distances))

    # Turn the per-residue mappings into lists that share the aa_order layout.
    ordered_counts = list(map(order_seq, counts))
    ordered_distances = list(map(order_seq, first_distances))
    ordered_spreads = list(map(order_seq, spreads))

    features = []
    for count_part, distance_part, spread_part in zip(ordered_counts, ordered_distances, ordered_spreads):
        features.append(count_part + distance_part + spread_part)
    return features
def aa_composition(seq):
    """
    Count how many times each residue occurs in a sequence.

    Args:
        seq (str): A string representing a sequence

    Returns:
        Counter mapping each residue found in ``seq`` to its number of
        occurrences
    """
    counts = Counter()
    counts.update(seq)
    return counts
def aa_frequency(seq):
    """
    Relative frequency of each residue in a sequence (count divided by the
    total number of residues).

    Args:
        seq (str): A string representing a sequence

    Returns:
        Dictionary mapping each residue found in ``seq`` to its frequency;
        empty for an empty sequence
    """
    counts = aa_composition(seq)
    length = sum(counts.values())
    frequencies = {}
    for residue, occurrences in counts.items():
        frequencies[residue] = occurrences / length
    return frequencies
def distance_to_first(seq):
    """
    Cumulative distance of each of the twenty amino acids to the first residue.

    The distance of a residue is its zero-based position in ``seq``. For
    every amino acid in ``aa_order`` the positions of all its occurrences are
    summed.

    Args:
        seq (str): A string representing a sequence

    Returns:
        Dictionary keyed by the amino acids of ``aa_order`` (in that order)
        with the summed positions; amino acids absent from ``seq`` map to 0
        and residues outside ``aa_order`` are ignored
    """
    # One pass over the sequence instead of one regex scan per amino acid.
    totals = dict.fromkeys(aa_order, 0)
    for position, residue in enumerate(seq):
        if residue in totals:
            totals[residue] += position
    return totals
def aa_distribution(seq, aa_count, aa_distance_to_first):
    """
    Amino acid distribution described in "An alignment-free method to find
    similarity among protein sequences via the general form of Chou's pseudo
    amino acid composition".

    For each residue type this accumulates, over all of its positions ``i``,
    ``(i - distance_to_first / count) ** 2 / count``, i.e. the squared
    deviation of each position from the residue's mean position, divided by
    the residue count.

    Args:
        seq (str): amino acid sequence
        aa_count (dict): amino acid count of sequence
        aa_distance_to_first (dict): cumulative distance to the first
            position for each amino acid

    Returns:
        defaultdict(int) mapping each residue found in ``seq`` to its
        distribution value

    Raises:
        KeyError: if ``seq`` contains a residue that is missing from a plain
            dict passed as ``aa_count`` or ``aa_distance_to_first``
    """
    spread = defaultdict(int)
    for position, residue in enumerate(seq):
        occurrences = aa_count[residue]
        # Mean position of this residue type within the sequence.
        mean_position = aa_distance_to_first[residue] / occurrences
        deviation = position - mean_position
        spread[residue] += deviation ** 2 / occurrences
    return spread
def order_seq(seq_dict):
    """
    Flatten a per-residue dictionary into a list that follows ``aa_order``.

    Args:
        seq_dict (dict): dictionary with amino acid keys

    Returns:
        A list with one value per amino acid of ``aa_order``, in that order;
        amino acids missing from ``seq_dict`` contribute 0 and keys outside
        ``aa_order`` are dropped
    """
    ordered = []
    for residue in aa_order:
        ordered.append(seq_dict.get(residue, 0))
    return ordered
def triad_method(*sequences):
    """
    Triad featurisation method described in Shen J. et al. (2007). Predicting
    protein-protein interactions based only on sequences information.
    PNAS, 104(11), pp: 4337-4341.

    Every residue is replaced by its class label from ``aa_group``; the
    occurrences ``f`` of each of the 343 possible triads of consecutive
    class labels are counted and normalised as ``(f - min(f)) / max(f)``.

    Args:
        *sequences (list): sequence of amino acids

    Returns:
        list of lists with results of triad method, one 343 dimensional
        vector per sequence. A sequence with fewer than three residues
        contains no triad and yields an all-zero vector.

    Raises:
        KeyError: if a sequence contains a residue that is not a key of
            ``aa_group``
    """
    # All 343 triads/keys (7 classes ** 3). The list is the same for every
    # sequence, so it is built once.
    triad_keys = [''.join(labels) for labels in product('0123456', repeat=3)]

    d_matrix = []
    for sequence in sequences:
        # classify aa in sequence
        classes = [aa_group[aa] for aa in sequence]

        # count every window of three consecutive class labels
        frequencies = dict.fromkeys(triad_keys, 0)
        for start in range(len(classes) - 2):
            frequencies[''.join(classes[start:start + 3])] += 1

        f_max = max(frequencies.values())
        if f_max == 0:
            # No triad was found (fewer than three residues); dividing by
            # f_max would raise ZeroDivisionError, so emit a zero vector.
            d_matrix.append([0.0] * len(triad_keys))
            continue

        # normalise values
        f_min = min(frequencies.values())
        d_matrix.append([(frequencies[key] - f_min) / f_max for key in triad_keys])

    # d_matrix has shape (len(sequences), 343)
    return d_matrix
def side_chain_volume(sequences):
    """
    Placeholder that is not implemented yet (presumably intended for a side
    chain volume featurisation, judging by the name).

    Args:
        sequences: currently ignored

    Returns:
        None
    """
    return None
def auto_covariance(sequences):
    """
    Placeholder that is not implemented yet (presumably intended for an auto
    covariance featurisation, judging by the name).

    Args:
        sequences: currently ignored

    Returns:
        None
    """
    return None