Source code for abpytools.analysis.amino_acid_freq

from abpytools.features.regions import ChainDomains
from collections import Counter
import numpy as np
from matplotlib import pyplot as plt
from abpytools.utils.data_loader import DataLoader
import os
from abpytools.utils import PythonConfig

amino_acid_index = {"R": 0,
                    "N": 1,
                    "D": 2,
                    "E": 3,
                    "Q": 4,
                    "K": 5,
                    "S": 6,
                    "T": 7,
                    "C": 8,
                    "H": 9,
                    "M": 10,
                    "A": 11,
                    "V": 12,
                    "G": 13,
                    "I": 14,
                    "L": 15,
                    "F": 16,
                    "P": 17,
                    "W": 18,
                    "Y": 19}


[docs]class AminoAcidFreq(ChainDomains): def __init__(self, antibody_objects=None, path=None, region='CDR3', load=False): super(AminoAcidFreq, self).__init__(antibody_objects=antibody_objects, path=path, load=load) regions = ['all', 'CDRs', 'FRs', 'FR1', 'FR2', 'FR3', 'FR4', 'CDR1', 'CDR2', 'CDR3'] if region in regions: # get the sequence for the specified region self.region = region if self.region.startswith('CDR'): self._sequences = [self.cdr_sequences()[name][self.region] for name in self.names] data_loader = DataLoader(data_type='CDR_positions', data=['chothia', self.chain]) self._numbering = data_loader.get_data()[self.region] elif self.region.startswith('FR'): self._sequences = [self.framework_sequences()[name][self.region] for name in self.names] # TODO: implement 'all' elif self.region == 'all': raise NotImplementedError("This is not the code you're looking for.") else: raise ValueError('Parameter region must be either: {}. Not {}'.format(' ,'.join(regions), region)) self._sequence_count = len(max(self._sequences, key=len)) self._aa_freq = np.zeros((20, self._sequence_count)) self._aa_hyd_freq = np.zeros((3, self._sequence_count)) self._aa_chg_freq = np.zeros((3, self._sequence_count)) self._aa_count = np.zeros((20, self._sequence_count)) self._aa_hyd_count = np.zeros((3, self._sequence_count)) self._aa_chg_count = np.zeros((3, self._sequence_count)) def _amino_acid_freq(self, normalize): # if the sum of self._aa_count is zero then the count has not been performed at this point if self._aa_count.sum() == 0: for position in range(len(max(self._sequences, key=len))): position_sequence = [x[position] for x in self._sequences if len(x) > position] count_i = Counter(position_sequence) total_i = len(position_sequence) for amino_acid_i in count_i.keys(): self._aa_count[amino_acid_index[amino_acid_i], position] = count_i[amino_acid_i] # _aa_hyd_freq: row1 -> hydrophilic # row2 -> moderate # row3 -> hydrophobic if amino_acid_i in ['R', 'N', 'D', 'E', 'Q', 'K', 'S', 'T']: self._aa_hyd_count[0, position] += count_i[amino_acid_i] elif amino_acid_i in ['C', 'H', 'M']: self._aa_hyd_count[1, position] += count_i[amino_acid_i] else: self._aa_hyd_count[2, position] += count_i[amino_acid_i] # _aa_chg_freq: row1 -> negative # row2 -> positive # row3 -> neutral if amino_acid_i in ['D', 'E']: self._aa_chg_count[0, position] += count_i[amino_acid_i] elif amino_acid_i in ['R', 'K', 'H']: self._aa_chg_count[1, position] += count_i[amino_acid_i] else: self._aa_chg_count[2, position] += count_i[amino_acid_i] # normalize values # doing it even when it is not required comes at a small computational cost # it would take longer if the user had to recalculate everything to have a count plot and then a # frequency plot self._aa_freq[:, position] = self._aa_count[:, position] / total_i self._aa_chg_freq[:, position] = self._aa_chg_count[:, position] / total_i self._aa_hyd_freq[:, position] = self._aa_hyd_count[:, position] / total_i if normalize: return self._aa_freq, self._aa_chg_freq, self._aa_hyd_freq else: return self._aa_count, self._aa_chg_count, self._aa_hyd_count
[docs] def plot(self, sort_by='name', normalize=True, display_count=True, plot_path='./', plot_name='AminoAcidFrequency.png', notebook_plot=True): ipython_config = PythonConfig() if ipython_config.matplotlib_interactive is False and ipython_config.ipython_info == 'notebook': plt.ion() if sort_by not in ['name', 'hydropathy', 'charge']: raise ValueError("Argument for sort_by not valid. Valid arguments are name, hydrophobicity and charge") # get count/ freq matrices # to avoid writing more code than necessary the count and freq are stored in the same variable # since they will always be plotted independently aa, chg, hyd = self._amino_acid_freq(normalize=normalize) fig = plt.figure(1, figsize=(8, 8)) ax = fig.add_subplot(111) for position in range(self._aa_freq.shape[1]): previous = 0 # 20 distinct colors colors = ["#023fa5", "#7d87b9", "#bec1d4", "#d6bcc0", "#bb7784", "#8e063b", "#4a6fe3", "#8595e1", "#b5bbe3", "#e6afb9", "#e07b91", "#d33f6a", "#11c638", "#8dd593", "#c6dec7", "#ead3c6", "#f0b98d", "#ef9708", "#0fcfc0", "#9cded6"] if sort_by == 'name': # previous, lgd = self.plot_helper(ax=ax, colors=colors, # title='amino acids', # keys=sorted(amino_acid_index.keys()), # position=position, data=aa, # previous=previous) ax.set_title(self.region + ' amino acids', size=20) for i, amino_acid in enumerate(sorted(amino_acid_index.keys())): c = colors[i] ax.bar(position, aa[amino_acid_index[amino_acid], position], bottom=previous, label=amino_acid, color=c, align='center') previous += aa[amino_acid_index[amino_acid], position] lgd = ax.legend(sorted(amino_acid_index.keys()), loc='center left', bbox_to_anchor=(1, 0.5), prop={"size": 16}) elif sort_by == 'hydropathy': previous, lgd = self.plot_helper(ax=ax, colors=['b', 'r', 'k'], title=sort_by, keys=['Hydrophilic', 'Moderate', 'Hydrophobic'], position=position, data=hyd, previous=previous) else: previous, lgd = self.plot_helper(ax=ax, colors=['b', 'r', 'k'], title='amino acid charge', keys=['Negative', 'Positive', 'Neutral'], position=position, data=chg, previous=previous) if display_count: for position in range(aa.shape[1]): ax.text(x=position, y=aa[:, position].sum(), s=str(int(self._aa_count[:, position].sum())), rotation=45, ha='center', va='bottom') if normalize: ax.set_ylabel('Frequency', size=16) else: ax.set_ylabel('Count', size=16) ax.set_xticks(np.arange(len(self._numbering))) ax.set_xticklabels(self._numbering, rotation=60) ax.set_xlabel('Position', size=16) ax.set_ylim([0, aa.sum(0).max()*1.1]) ax.margins(0.02) ax.grid(axis='y') if ipython_config.ipython_info == 'notebook' and notebook_plot: ax.plot() else: fig.savefig(os.path.join(plot_path, plot_name), bbox_extra_artists=(lgd,), bbox_inches='tight') plt.close(fig)
[docs] def plot_helper(self, ax, colors, title, keys, position, data, previous): ax.set_title('{} {}'.format(self.region, title), size=20) for i, prop_i in enumerate(keys): c = colors[i] ax.bar(position, data[i, position], bottom=previous, label=prop_i, color=c, align='center') previous += data[i, position] lgd = ax.legend(keys, loc='center left', bbox_to_anchor=(1, 0.5), prop={"size": 16}) return previous, lgd