Source code for abpytools.core.chain

import re
import numpy as np
from ..utils import DataLoader, Download, NumberingException
import pandas as pd
from .helper_functions import numbering_table_sequences, numbering_table_region, numbering_table_multiindex
from . import Cache
from .flags import *


[docs]class Chain:
    """The Chain object represent a single chain variable fragment (scFv) antibody.

    A scFv can be part of either the heavy or light chain of an antibody.
    The nature of the chain is determined by querying the sequence
    to the Abnum server, and is implemented with the Chain.ab_numbering() method.

    Attributes:
        numbering (list): the name of each position occupied by amino acids in sequence
        mw (float): the cached molecular weight
        pI (float): the cached isoelectric point of the sequence
        cdr (tuple): tuple with two dictionaries for CDR and FR with the index of the amino acids
        in each region
        germline_identity (dict):
    """

    def __init__(self, sequence, name='Chain1', numbering_scheme=NUMBERING_FLAGS.CHOTHIA):
        """
        The default Chain object constructor, which required a string
        representing a scFv sequence.

        Args:
            sequence (str): Amino acid sequence
            name (name): Name of sequence
            numbering_scheme (str): numbering scheme name to perform alignment

        Examples:
            Instantiate a Chain object with the default constructor
            >>> from abpytools.core import Chain
            >>> from abpytools.core.flags import *
            >>> chain = Chain(sequence='MYSEQUENCE', name='my_seq', numbering_scheme=NUMBERING_FLAGS.CHOTHIA)
        """
        self._raw_sequence = sequence.upper()
        self._sequence = self._raw_sequence.replace('-', '')
        self._aligned_sequence = None
        self._name = name
        self._chain = None
        self.numbering = None
        self.hydrophobicity_matrix = None
        self.mw = None
        self.pI = None
        self.cdr = None
        self._numbering_scheme = numbering_scheme
        self._loading_status = 'Not Loaded'
        self.germline_identity = dict()
        self.germline = tuple()
        self._cache = Cache(max_cache_size=10)

[docs]    @classmethod
    def load_from_string(cls, sequence, name='Chain1', numbering_scheme=NUMBERING_FLAGS.CHOTHIA):
        """
        Returns an instantiated Chain object from a sequence
        Args:
            sequence:
            name:
            numbering_scheme:

        Returns:

        """
        new_chain = cls(sequence=sequence, name=name, numbering_scheme=numbering_scheme)
        new_chain.load()
        return new_chain

[docs]    def load(self):
        """
        Generates all the data:
        - Chain Numbering
        - Hydrophobicity matrix
        - Molecular weight
        - pI

        All the data is then stored in its respective attributes

        :return:

        """

        if self._loading_status in [NUMBERING_FLAGS.FAILED, NUMBERING_FLAGS.NOT_LOADED]:
            try:
                self.numbering = self.ab_numbering()
                self._loading_status = NUMBERING_FLAGS.LOADED
                self.load()

            except ValueError:
                self._loading_status = NUMBERING_FLAGS.FAILED

            except NumberingException:
                self._loading_status = NUMBERING_FLAGS.UNNUMBERED

        elif self._loading_status == NUMBERING_FLAGS.LOADED:
            self.hydrophobicity_matrix = self.ab_hydrophobicity_matrix()
            self.mw = self.ab_molecular_weight()
            self.pI = self.ab_pi()
            self.cdr = self.ab_regions()

        else:
            # this should never happen...
            raise ValueError("Unknown loading status")  # pragma: no cover

[docs]    @staticmethod
    def determine_chain_type(numbering):
        if numbering[0][0] == 'H':
            chain = CHAIN_FLAGS.HEAVY_CHAIN
        elif numbering[0][0] == 'L':
            chain = CHAIN_FLAGS.LIGHT_CHAIN
        else:
            # couldn't determine chain type
            chain = CHAIN_FLAGS.UNKNOWN_CHAIN  # pragma: no cover

        return chain

[docs]    def ab_numbering(self, server=OPTION_FLAGS.ABYSIS, **kwargs):
        """
        Return list

        Returns:
            list:
        """
        # store the amino positions/numbering in a list -> len(numbering) == len(self._sequence)
        numbering = get_ab_numbering(self._sequence, server, self._numbering_scheme, **kwargs)
        self._chain = self.determine_chain_type(numbering)

        return numbering

[docs]    def ab_numbering_table(self, as_array=False, replacement='-', region='all'):

        """

        :param region:
        :param as_array: if True returns numpy.array object, if False returns a pandas.DataFrame
        :param replacement: value to replace empty positions
        :return:
        """

        region = numbering_table_region(region=region)

        # if the object has not been loaded successfully yet need to try and get the numbering scheme using
        # ab_numbering method
        if self._loading_status in [NUMBERING_FLAGS.NOT_LOADED,
                                    NUMBERING_FLAGS.FAILED]:
            self.numbering = self.ab_numbering()

        whole_sequence_dict, whole_sequence = numbering_table_sequences(region=region,
                                                                        numbering_scheme=self._numbering_scheme,
                                                                        chain=self._chain)

        # now that all the prep has been done we can extract the position from each amino acid
        # according to the numbering scheme
        data = np.empty((len(whole_sequence)), dtype=str)
        for i, position in enumerate(whole_sequence):
            if position in self.numbering:
                data[i] = self._sequence[self.numbering.index(position)]
            else:
                # if there is no amino acid in the sequence that corresponds to position i we just replace it by
                # the replacement value, which is by default '-'
                data[i] = replacement

        self._aligned_sequence = data

        if as_array:
            # return the data as numpy.array -> much faster than creating a pandas.DataFrame
            return data.reshape(1, -1)
        else:

            # return the data as a pandas.DataFrame -> it's slower but looks nicer and makes it easier to get
            # the data of interest

            multi_index = numbering_table_multiindex(region=region,
                                                     whole_sequence_dict=whole_sequence_dict)

            # create the DataFrame and assign the columns and index names
            data = pd.DataFrame(data=data).T
            data.columns = multi_index
            data.index = [self._name]
            return data

[docs]    def ab_hydrophobicity_matrix(self, hydrophobicity_scores=HYDROPHOBICITY_FLAGS.EW):

        # check if all the required parameters are in order
        if isinstance(hydrophobicity_scores, str):
            if hydrophobicity_scores not in OPTION_FLAGS.AVAILABLE_HYDROPHOBITY_SCORES:
                raise ValueError("Chosen hydrophobicity scores ({}) not available. \
                Available hydrophobicity scores: {}".format(
                    hydrophobicity_scores, ' ,'.join(OPTION_FLAGS.AVAILABLE_HYDROPHOBITY_SCORES)
                ))

        if self._loading_status == NUMBERING_FLAGS.NOT_LOADED:
            self.numbering = self.ab_numbering()
        if self._chain == 'NA':
            raise ValueError("Could not determine chain type")

        data_loader = DataLoader(data_type='NumberingSchemes',
                                 data=[self._numbering_scheme, self._chain])
        whole_sequence_dict = data_loader.get_data()

        # whole_sequence is a list with all the amino acid positions in the selected numbering scheme
        whole_sequence = whole_sequence_dict

        # get the dictionary with the hydrophobicity scores
        data_loader = DataLoader(data_type='AminoAcidProperties',
                                 data=['hydrophobicity', hydrophobicity_scores + 'Hydrophobicity'])
        aa_hydrophobicity_scores = data_loader.get_data()

        return calculate_hydrophobicity_matrix(whole_sequence=whole_sequence,
                                               numbering=self.numbering,
                                               aa_hydrophobicity_scores=aa_hydrophobicity_scores,
                                               sequence=self._sequence)

[docs]    def ab_regions(self):

        """
        method to determine Chain regions (CDR and Framework) of each amino acid in sequence

        :return:
        """

        if 'cdrs' not in self._cache:

            if self._loading_status == NUMBERING_FLAGS.NOT_LOADED:
                self.numbering, self._chain = self.ab_numbering()

            if self.numbering == 'NA':
                raise ValueError("Cannot return CDR positions without the antibody numbering information")

            data_loader = DataLoader(data_type='CDR_positions', data=[self._numbering_scheme, self._chain])
            cdr_positions = data_loader.get_data()

            data_loader = DataLoader(data_type='Framework_positions', data=[self._numbering_scheme, self._chain])
            framework_position = data_loader.get_data()

            cdrs = calculate_cdr(numbering=self.numbering, cdr_positions=cdr_positions,
                                 framework_positions=framework_position)

            self._cache.update('cdrs', cdrs)

        return self._cache['cdrs']

[docs]    def ab_molecular_weight(self, monoisotopic=False):

        if monoisotopic:
            data_loader = DataLoader(data_type='AminoAcidProperties',
                                     data=['MolecularWeight', 'average'])
        else:
            data_loader = DataLoader(data_type='AminoAcidProperties',
                                     data=['MolecularWeight', 'monoisotopic'])
        mw_dict = data_loader.get_data()

        return calculate_mw(self._sequence, mw_dict)

[docs]    def ab_pi(self, pi_database='Wikipedia'):

        assert pi_database in OPTION_FLAGS.AVAILABLE_PI_VALUES, \
            "Selected pI database {} not available. " \
            "Available databases: {}".format(pi_database, ' ,'.join(
                OPTION_FLAGS.AVAILABLE_PI_VALUES))

        data_loader = DataLoader(data_type='AminoAcidProperties',
                                 data=['pI', pi_database])
        pi_data = data_loader.get_data()

        return calculate_pi(sequence=self._sequence, pi_data=pi_data)

[docs]    def ab_ec(self, extinction_coefficient_database='Standard', reduced=False, normalise=False, **kwargs):

        if reduced:
            extinction_coefficient_database += '_reduced'

        data_loader = DataLoader(data_type='AminoAcidProperties', data=['ExtinctionCoefficient',
                                                                        extinction_coefficient_database])

        ec_data = data_loader.get_data()

        if normalise:
            return calculate_ec(sequence=self._sequence, ec_data=ec_data) / self.ab_molecular_weight(**kwargs)
        else:
            return calculate_ec(sequence=self._sequence, ec_data=ec_data)

[docs]    def ab_format(self):
        return {"name": self._name, "sequence": self._sequence, "numbering": self.numbering, "chain": self._chain,
                "MW": self.mw, "CDR": self.cdr, "numbering_scheme": self._numbering_scheme, "pI": self.pI}

[docs]    def ab_charge(self, align=True, ph=7.4, pka_database='Wikipedia'):

        """
        Method to calculate the charges for each amino acid of antibody
        :param pka_database:
        :param ph:
        :param align: if set to True an alignment will be performed,
                      if it hasn't been done already using the ab_numbering method

        :return: array with amino acid charges
        """

        assert pka_database in OPTION_FLAGS.AVAILABLE_PI_VALUES, \
            "Selected pI database {} not available. " \
            "Available databases: {}".format(pka_database,
                                             ' ,'.join(OPTION_FLAGS.AVAILABLE_PI_VALUES))

        data_loader = DataLoader(data_type='AminoAcidProperties',
                                 data=['pI', pka_database])
        pka_data = data_loader.get_data()

        if align:
            # get the first (and only) row
            sequence = self.ab_numbering_table(as_array=True)[0]
        else:
            sequence = list(self.sequence)

        return np.array([amino_acid_charge(x, ph, pka_data) for x in sequence])

[docs]    def ab_total_charge(self, ph=7.4, pka_database=PI_FLAGS.WIKIPEDIA):

        assert pka_database in OPTION_FLAGS.AVAILABLE_PI_VALUES, \
            "Selected pI database {} not available. " \
            "Available databases: {}".format(pka_database,
                                             ' ,'.join(OPTION_FLAGS.AVAILABLE_PI_VALUES))

        data_loader = DataLoader(data_type='AminoAcidProperties',
                                 data=['pI', pka_database])
        pka_data = data_loader.get_data()

        return calculate_charge(sequence=self._sequence, ph=ph, pka_values=pka_data)

    @property
    def chain(self):
        return self._chain

    @property
    def name(self):
        return self._name

[docs]    def set_name(self, name):
        self._name = name

    @property
    def sequence(self):
        return self._sequence

    @property
    def aligned_sequence(self):
        if self._aligned_sequence is None:
            _ = self.ab_numbering_table(as_array=True, replacement='-')
        return self._aligned_sequence.tolist()

    @property
    def status(self):
        return self._loading_status

    @property
    def numbering_scheme(self):
        return self._numbering_scheme

    def _string_summary_basic(self):
        return "abpytools.Chain Name: {}, Chain type: {}, Sequence length: {}, Status: {}".format(
            self._name, self._chain, len(self._sequence),
            self._loading_status)  # pragma: no cover

    def __repr__(self):
        return "<%s at 0x%02x>" % (self._string_summary_basic(), id(self))  # pragma: no cover

    def __len__(self):
        return len(self.sequence)


[docs]def get_ab_numbering(sequence, server, numbering_scheme, timeout=30):
    """

    :rtype: list
    """
    # check which server to use to get numbering
    if server == OPTION_FLAGS.ABYSIS:
        # find out which numbering scheme to use
        if numbering_scheme == NUMBERING_FLAGS.CHOTHIA:
            scheme = '-c'
        elif numbering_scheme in (NUMBERING_FLAGS.CHOTHIA_EXT, NUMBERING_FLAGS.MARTIN):
            scheme = '-a'
        elif numbering_scheme == NUMBERING_FLAGS.KABAT:
            scheme = '-k'
        else:
            raise ValueError("{} numbering scheme is unknown.".format(numbering_scheme.capitalize()))

        # prepare the url string to query server
        url = f"http://www.bioinf.org.uk/cgi-bin/abnum/abnum.pl?plain=1&aaseq={sequence}&scheme={scheme}"
        # use the Download class from utils to get output
        numbering_table = Download(url, verbose=False, timeout=timeout)
        try:
            numbering_table.download()
        except ValueError:
            raise ValueError("Check the internet connection.")

        # check whether the server returned an error
        if numbering_table.html.replace("\n", '') == 'Warning: Unable to number sequence' or len(
                numbering_table.html.replace("\n", '')) == 0:
            raise NumberingException("Unable to number sequence")

        # parse the results
        parsed_numbering_table = re.findall("[\S| ]+", numbering_table.html)

        # get the numbering from the parsed table
        numbering = [x[:-2] for x in parsed_numbering_table if x[-1] != '-']

        # TODO: add more server options
    else:
        numbering = ['']

    return numbering


[docs]def calculate_hydrophobicity_matrix(whole_sequence, numbering, aa_hydrophobicity_scores, sequence):
    # instantiate numpy array (whole sequence includes all the amino acid positions of the VH/VL, even the ones
    # that aren't occupied -> these will be filled with zeros
    # hydrophobicity_matrix = np.zeros(len(whole_sequence))
    #
    # # iterate through each position
    # for i, position in enumerate(whole_sequence):
    #
    #     if position in numbering:
    #         position_in_data = numbering.index(position)
    #         hydrophobicity_matrix[i] = aa_hydrophobicity_scores[sequence[position_in_data]]
    #
    # return hydrophobicity_matrix
    # same thing as above but in a comprehension list
    return np.array([aa_hydrophobicity_scores[sequence[numbering.index(x)]] if x in numbering
                     else 0 for x in whole_sequence])


[docs]def calculate_mw(sequence, mw_data):
    return sum(mw_data[x] for x in sequence) - (len(sequence) - 1) * mw_data['water']


[docs]def calculate_ec(sequence, ec_data):
    # ϵ280 = nW x 5,500 + nY x 1,490 + nC x 125
    n_W = sequence.count('W')
    n_Y = sequence.count('Y')
    n_C = sequence.count('C')
    return n_W * ec_data['W'] + n_Y * ec_data['Y'] + n_C * ec_data['C']


[docs]def calculate_pi(sequence, pi_data):
    # algorithm implemented from http://isoelectric.ovh.org/files/practise-isoelectric-point.html

    # count number of D, E, C, Y, H, K, R
    d_count = sequence.count('D')
    e_count = sequence.count('E')
    c_count = sequence.count('C')
    y_count = sequence.count('Y')
    h_count = sequence.count('H')
    k_count = sequence.count('K')
    r_count = sequence.count('R')

    # initiate value of pH and nq (any number above 0)
    nq = 10
    ph = 0
    # define precision
    delta = 0.01

    while nq > 0:

        if ph >= 14:
            raise Exception("Could not calculate pI (pH reached above 14)")

        # qn1, qn2, qn3, qn4, qn5, qp1, qp2, qp3, qp4
        qn1 = -1 / (1 + 10 ** (pi_data['COOH'] - ph))  # C-terminus charge
        qn2 = - d_count / (1 + 10 ** (pi_data['D'] - ph))  # D charge
        qn3 = - e_count / (1 + 10 ** (pi_data['E'] - ph))  # E charge
        qn4 = - c_count / (1 + 10 ** (pi_data['C'] - ph))  # C charge
        qn5 = - y_count / (1 + 10 ** (pi_data['Y'] - ph))  # Y charge
        qp1 = h_count / (1 + 10 ** (ph - pi_data['H']))  # H charge
        qp2 = 1 / (1 + 10 ** (ph - pi_data['NH2']))  # N-terminus charge
        qp3 = k_count / (1 + 10 ** (ph - pi_data['K']))  # K charge
        qp4 = r_count / (1 + 10 ** (ph - pi_data['R']))  # R charge

        nq = qn1 + qn2 + qn3 + qn4 + qn5 + qp1 + qp2 + qp3 + qp4

        # update pH
        ph += delta

    return ph


[docs]def calculate_cdr(numbering, cdr_positions, framework_positions):
    """

    :param numbering:
    :param cdr_positions:
    :param framework_positions:
    :return:
    """

    cdrs = {'CDR1': list(),
            'CDR2': list(),
            'CDR3': list()}

    frameworks = {'FR1': list(),
                  'FR2': list(),
                  'FR3': list(),
                  'FR4': list()}

    for cdr in cdrs.keys():

        cdr_positions_i = cdr_positions[cdr]

        for i, position in enumerate(numbering):
            if position in cdr_positions_i:
                cdrs[cdr].append(i)

    for framework in frameworks.keys():

        framework_position_i = framework_positions[framework]

        for i, position in enumerate(numbering):
            if position in framework_position_i:
                frameworks[framework].append(i)

    return cdrs, frameworks


[docs]def amino_acid_charge(amino_acid, ph, pka_values):
    if amino_acid in ['D', 'E', 'C', 'Y']:
        return -1 / (1 + 10 ** (pka_values[amino_acid] - ph))
    elif amino_acid in ['K', 'R', 'H']:
        return 1 / (1 + 10 ** (ph - pka_values[amino_acid]))
    else:
        return 0


[docs]def calculate_charge(sequence, ph, pka_values):
    # This calculation would make more sense but is slower (~1.5-2x)
    # cooh = -1 / (1 + 10 ** (pka_values['COOH'] - ph))
    # nh2 = 1 / (1 + 10 ** (ph - pka_values['NH2']))
    #
    # return sum([amino_acid_charge(x, ph, pka_values) for x in list(sequence)]) + cooh + nh2

    # Faster implementation
    # count number of D, E, C, Y, H, K, R
    d_count = sequence.count('D')
    e_count = sequence.count('E')
    c_count = sequence.count('C')
    y_count = sequence.count('Y')
    h_count = sequence.count('H')
    k_count = sequence.count('K')
    r_count = sequence.count('R')

    # qn1, qn2, qn3, qn4, qn5, qp1, qp2, qp3, qp4
    qn1 = -1 / (1 + 10 ** (pka_values['COOH'] - ph))  # C-terminus charge
    qn2 = - d_count / (1 + 10 ** (pka_values['D'] - ph))  # D charge
    qn3 = - e_count / (1 + 10 ** (pka_values['E'] - ph))  # E charge
    qn4 = - c_count / (1 + 10 ** (pka_values['C'] - ph))  # C charge
    qn5 = - y_count / (1 + 10 ** (pka_values['Y'] - ph))  # Y charge
    qp1 = h_count / (1 + 10 ** (ph - pka_values['H']))  # H charge
    qp2 = 1 / (1 + 10 ** (ph - pka_values['NH2']))  # N-terminus charge
    qp3 = k_count / (1 + 10 ** (ph - pka_values['K']))  # K charge
    qp4 = r_count / (1 + 10 ** (ph - pka_values['R']))  # R charge

    nq = qn1 + qn2 + qn3 + qn4 + qn5 + qp1 + qp2 + qp3 + qp4

    return nq
Source code for abpytools.core.chain

AbPyTools

Navigation

Related Topics