import re
import numpy as np
from ..utils import DataLoader, Download, NumberingException
import pandas as pd
from .helper_functions import numbering_table_sequences, numbering_table_region, numbering_table_multiindex
from . import Cache
from .flags import *
[docs]class Chain:
"""The Chain object represent a single chain variable fragment (scFv) antibody.
A scFv can be part of either the heavy or light chain of an antibody.
The nature of the chain is determined by querying the sequence
to the Abnum server, and is implemented with the Chain.ab_numbering() method.
Attributes:
numbering (list): the name of each position occupied by amino acids in sequence
mw (float): the cached molecular weight
pI (float): the cached isoelectric point of the sequence
cdr (tuple): tuple with two dictionaries for CDR and FR with the index of the amino acids
in each region
germline_identity (dict):
"""
def __init__(self, sequence, name='Chain1', numbering_scheme=NUMBERING_FLAGS.CHOTHIA):
"""
The default Chain object constructor, which required a string
representing a scFv sequence.
Args:
sequence (str): Amino acid sequence
name (name): Name of sequence
numbering_scheme (str): numbering scheme name to perform alignment
Examples:
Instantiate a Chain object with the default constructor
>>> from abpytools.core import Chain
>>> from abpytools.core.flags import *
>>> chain = Chain(sequence='MYSEQUENCE', name='my_seq', numbering_scheme=NUMBERING_FLAGS.CHOTHIA)
"""
self._raw_sequence = sequence.upper()
self._sequence = self._raw_sequence.replace('-', '')
self._aligned_sequence = None
self._name = name
self._chain = None
self.numbering = None
self.hydrophobicity_matrix = None
self.mw = None
self.pI = None
self.cdr = None
self._numbering_scheme = numbering_scheme
self._loading_status = 'Not Loaded'
self.germline_identity = dict()
self.germline = tuple()
self._cache = Cache(max_cache_size=10)
[docs] @classmethod
def load_from_string(cls, sequence, name='Chain1', numbering_scheme=NUMBERING_FLAGS.CHOTHIA):
"""
Returns an instantiated Chain object from a sequence
Args:
sequence:
name:
numbering_scheme:
Returns:
"""
new_chain = cls(sequence=sequence, name=name, numbering_scheme=numbering_scheme)
new_chain.load()
return new_chain
[docs] def load(self):
"""
Generates all the data:
- Chain Numbering
- Hydrophobicity matrix
- Molecular weight
- pI
All the data is then stored in its respective attributes
:return:
"""
if self._loading_status in [NUMBERING_FLAGS.FAILED, NUMBERING_FLAGS.NOT_LOADED]:
try:
self.numbering = self.ab_numbering()
self._loading_status = NUMBERING_FLAGS.LOADED
self.load()
except ValueError:
self._loading_status = NUMBERING_FLAGS.FAILED
except NumberingException:
self._loading_status = NUMBERING_FLAGS.UNNUMBERED
elif self._loading_status == NUMBERING_FLAGS.LOADED:
self.hydrophobicity_matrix = self.ab_hydrophobicity_matrix()
self.mw = self.ab_molecular_weight()
self.pI = self.ab_pi()
self.cdr = self.ab_regions()
else:
# this should never happen...
raise ValueError("Unknown loading status") # pragma: no cover
[docs] @staticmethod
def determine_chain_type(numbering):
if numbering[0][0] == 'H':
chain = CHAIN_FLAGS.HEAVY_CHAIN
elif numbering[0][0] == 'L':
chain = CHAIN_FLAGS.LIGHT_CHAIN
else:
# couldn't determine chain type
chain = CHAIN_FLAGS.UNKNOWN_CHAIN # pragma: no cover
return chain
[docs] def ab_numbering(self, server=OPTION_FLAGS.ABYSIS, **kwargs):
"""
Return list
Returns:
list:
"""
# store the amino positions/numbering in a list -> len(numbering) == len(self._sequence)
numbering = get_ab_numbering(self._sequence, server, self._numbering_scheme, **kwargs)
self._chain = self.determine_chain_type(numbering)
return numbering
[docs] def ab_numbering_table(self, as_array=False, replacement='-', region='all'):
"""
:param region:
:param as_array: if True returns numpy.array object, if False returns a pandas.DataFrame
:param replacement: value to replace empty positions
:return:
"""
region = numbering_table_region(region=region)
# if the object has not been loaded successfully yet need to try and get the numbering scheme using
# ab_numbering method
if self._loading_status in [NUMBERING_FLAGS.NOT_LOADED,
NUMBERING_FLAGS.FAILED]:
self.numbering = self.ab_numbering()
whole_sequence_dict, whole_sequence = numbering_table_sequences(region=region,
numbering_scheme=self._numbering_scheme,
chain=self._chain)
# now that all the prep has been done we can extract the position from each amino acid
# according to the numbering scheme
data = np.empty((len(whole_sequence)), dtype=str)
for i, position in enumerate(whole_sequence):
if position in self.numbering:
data[i] = self._sequence[self.numbering.index(position)]
else:
# if there is no amino acid in the sequence that corresponds to position i we just replace it by
# the replacement value, which is by default '-'
data[i] = replacement
self._aligned_sequence = data
if as_array:
# return the data as numpy.array -> much faster than creating a pandas.DataFrame
return data.reshape(1, -1)
else:
# return the data as a pandas.DataFrame -> it's slower but looks nicer and makes it easier to get
# the data of interest
multi_index = numbering_table_multiindex(region=region,
whole_sequence_dict=whole_sequence_dict)
# create the DataFrame and assign the columns and index names
data = pd.DataFrame(data=data).T
data.columns = multi_index
data.index = [self._name]
return data
[docs] def ab_hydrophobicity_matrix(self, hydrophobicity_scores=HYDROPHOBICITY_FLAGS.EW):
# check if all the required parameters are in order
if isinstance(hydrophobicity_scores, str):
if hydrophobicity_scores not in OPTION_FLAGS.AVAILABLE_HYDROPHOBITY_SCORES:
raise ValueError("Chosen hydrophobicity scores ({}) not available. \
Available hydrophobicity scores: {}".format(
hydrophobicity_scores, ' ,'.join(OPTION_FLAGS.AVAILABLE_HYDROPHOBITY_SCORES)
))
if self._loading_status == NUMBERING_FLAGS.NOT_LOADED:
self.numbering = self.ab_numbering()
if self._chain == 'NA':
raise ValueError("Could not determine chain type")
data_loader = DataLoader(data_type='NumberingSchemes',
data=[self._numbering_scheme, self._chain])
whole_sequence_dict = data_loader.get_data()
# whole_sequence is a list with all the amino acid positions in the selected numbering scheme
whole_sequence = whole_sequence_dict
# get the dictionary with the hydrophobicity scores
data_loader = DataLoader(data_type='AminoAcidProperties',
data=['hydrophobicity', hydrophobicity_scores + 'Hydrophobicity'])
aa_hydrophobicity_scores = data_loader.get_data()
return calculate_hydrophobicity_matrix(whole_sequence=whole_sequence,
numbering=self.numbering,
aa_hydrophobicity_scores=aa_hydrophobicity_scores,
sequence=self._sequence)
[docs] def ab_regions(self):
"""
method to determine Chain regions (CDR and Framework) of each amino acid in sequence
:return:
"""
if 'cdrs' not in self._cache:
if self._loading_status == NUMBERING_FLAGS.NOT_LOADED:
self.numbering, self._chain = self.ab_numbering()
if self.numbering == 'NA':
raise ValueError("Cannot return CDR positions without the antibody numbering information")
data_loader = DataLoader(data_type='CDR_positions', data=[self._numbering_scheme, self._chain])
cdr_positions = data_loader.get_data()
data_loader = DataLoader(data_type='Framework_positions', data=[self._numbering_scheme, self._chain])
framework_position = data_loader.get_data()
cdrs = calculate_cdr(numbering=self.numbering, cdr_positions=cdr_positions,
framework_positions=framework_position)
self._cache.update('cdrs', cdrs)
return self._cache['cdrs']
[docs] def ab_molecular_weight(self, monoisotopic=False):
if monoisotopic:
data_loader = DataLoader(data_type='AminoAcidProperties',
data=['MolecularWeight', 'average'])
else:
data_loader = DataLoader(data_type='AminoAcidProperties',
data=['MolecularWeight', 'monoisotopic'])
mw_dict = data_loader.get_data()
return calculate_mw(self._sequence, mw_dict)
[docs] def ab_pi(self, pi_database='Wikipedia'):
assert pi_database in OPTION_FLAGS.AVAILABLE_PI_VALUES, \
"Selected pI database {} not available. " \
"Available databases: {}".format(pi_database, ' ,'.join(
OPTION_FLAGS.AVAILABLE_PI_VALUES))
data_loader = DataLoader(data_type='AminoAcidProperties',
data=['pI', pi_database])
pi_data = data_loader.get_data()
return calculate_pi(sequence=self._sequence, pi_data=pi_data)
[docs] def ab_ec(self, extinction_coefficient_database='Standard', reduced=False, normalise=False, **kwargs):
if reduced:
extinction_coefficient_database += '_reduced'
data_loader = DataLoader(data_type='AminoAcidProperties', data=['ExtinctionCoefficient',
extinction_coefficient_database])
ec_data = data_loader.get_data()
if normalise:
return calculate_ec(sequence=self._sequence, ec_data=ec_data) / self.ab_molecular_weight(**kwargs)
else:
return calculate_ec(sequence=self._sequence, ec_data=ec_data)
[docs] def ab_charge(self, align=True, ph=7.4, pka_database='Wikipedia'):
"""
Method to calculate the charges for each amino acid of antibody
:param pka_database:
:param ph:
:param align: if set to True an alignment will be performed,
if it hasn't been done already using the ab_numbering method
:return: array with amino acid charges
"""
assert pka_database in OPTION_FLAGS.AVAILABLE_PI_VALUES, \
"Selected pI database {} not available. " \
"Available databases: {}".format(pka_database,
' ,'.join(OPTION_FLAGS.AVAILABLE_PI_VALUES))
data_loader = DataLoader(data_type='AminoAcidProperties',
data=['pI', pka_database])
pka_data = data_loader.get_data()
if align:
# get the first (and only) row
sequence = self.ab_numbering_table(as_array=True)[0]
else:
sequence = list(self.sequence)
return np.array([amino_acid_charge(x, ph, pka_data) for x in sequence])
[docs] def ab_total_charge(self, ph=7.4, pka_database=PI_FLAGS.WIKIPEDIA):
assert pka_database in OPTION_FLAGS.AVAILABLE_PI_VALUES, \
"Selected pI database {} not available. " \
"Available databases: {}".format(pka_database,
' ,'.join(OPTION_FLAGS.AVAILABLE_PI_VALUES))
data_loader = DataLoader(data_type='AminoAcidProperties',
data=['pI', pka_database])
pka_data = data_loader.get_data()
return calculate_charge(sequence=self._sequence, ph=ph, pka_values=pka_data)
@property
def chain(self):
return self._chain
@property
def name(self):
return self._name
[docs] def set_name(self, name):
self._name = name
@property
def sequence(self):
return self._sequence
@property
def aligned_sequence(self):
if self._aligned_sequence is None:
_ = self.ab_numbering_table(as_array=True, replacement='-')
return self._aligned_sequence.tolist()
@property
def status(self):
return self._loading_status
@property
def numbering_scheme(self):
return self._numbering_scheme
def _string_summary_basic(self):
return "abpytools.Chain Name: {}, Chain type: {}, Sequence length: {}, Status: {}".format(
self._name, self._chain, len(self._sequence),
self._loading_status) # pragma: no cover
def __repr__(self):
return "<%s at 0x%02x>" % (self._string_summary_basic(), id(self)) # pragma: no cover
def __len__(self):
return len(self.sequence)
[docs]def get_ab_numbering(sequence, server, numbering_scheme, timeout=30):
"""
:rtype: list
"""
# check which server to use to get numbering
if server == OPTION_FLAGS.ABYSIS:
# find out which numbering scheme to use
if numbering_scheme == NUMBERING_FLAGS.CHOTHIA:
scheme = '-c'
elif numbering_scheme in (NUMBERING_FLAGS.CHOTHIA_EXT, NUMBERING_FLAGS.MARTIN):
scheme = '-a'
elif numbering_scheme == NUMBERING_FLAGS.KABAT:
scheme = '-k'
else:
raise ValueError("{} numbering scheme is unknown.".format(numbering_scheme.capitalize()))
# prepare the url string to query server
url = f"http://www.bioinf.org.uk/cgi-bin/abnum/abnum.pl?plain=1&aaseq={sequence}&scheme={scheme}"
# use the Download class from utils to get output
numbering_table = Download(url, verbose=False, timeout=timeout)
try:
numbering_table.download()
except ValueError:
raise ValueError("Check the internet connection.")
# check whether the server returned an error
if numbering_table.html.replace("\n", '') == 'Warning: Unable to number sequence' or len(
numbering_table.html.replace("\n", '')) == 0:
raise NumberingException("Unable to number sequence")
# parse the results
parsed_numbering_table = re.findall("[\S| ]+", numbering_table.html)
# get the numbering from the parsed table
numbering = [x[:-2] for x in parsed_numbering_table if x[-1] != '-']
# TODO: add more server options
else:
numbering = ['']
return numbering
[docs]def calculate_hydrophobicity_matrix(whole_sequence, numbering, aa_hydrophobicity_scores, sequence):
# instantiate numpy array (whole sequence includes all the amino acid positions of the VH/VL, even the ones
# that aren't occupied -> these will be filled with zeros
# hydrophobicity_matrix = np.zeros(len(whole_sequence))
#
# # iterate through each position
# for i, position in enumerate(whole_sequence):
#
# if position in numbering:
# position_in_data = numbering.index(position)
# hydrophobicity_matrix[i] = aa_hydrophobicity_scores[sequence[position_in_data]]
#
# return hydrophobicity_matrix
# same thing as above but in a comprehension list
return np.array([aa_hydrophobicity_scores[sequence[numbering.index(x)]] if x in numbering
else 0 for x in whole_sequence])
[docs]def calculate_mw(sequence, mw_data):
return sum(mw_data[x] for x in sequence) - (len(sequence) - 1) * mw_data['water']
[docs]def calculate_ec(sequence, ec_data):
# ϵ280 = nW x 5,500 + nY x 1,490 + nC x 125
n_W = sequence.count('W')
n_Y = sequence.count('Y')
n_C = sequence.count('C')
return n_W * ec_data['W'] + n_Y * ec_data['Y'] + n_C * ec_data['C']
[docs]def calculate_pi(sequence, pi_data):
# algorithm implemented from http://isoelectric.ovh.org/files/practise-isoelectric-point.html
# count number of D, E, C, Y, H, K, R
d_count = sequence.count('D')
e_count = sequence.count('E')
c_count = sequence.count('C')
y_count = sequence.count('Y')
h_count = sequence.count('H')
k_count = sequence.count('K')
r_count = sequence.count('R')
# initiate value of pH and nq (any number above 0)
nq = 10
ph = 0
# define precision
delta = 0.01
while nq > 0:
if ph >= 14:
raise Exception("Could not calculate pI (pH reached above 14)")
# qn1, qn2, qn3, qn4, qn5, qp1, qp2, qp3, qp4
qn1 = -1 / (1 + 10 ** (pi_data['COOH'] - ph)) # C-terminus charge
qn2 = - d_count / (1 + 10 ** (pi_data['D'] - ph)) # D charge
qn3 = - e_count / (1 + 10 ** (pi_data['E'] - ph)) # E charge
qn4 = - c_count / (1 + 10 ** (pi_data['C'] - ph)) # C charge
qn5 = - y_count / (1 + 10 ** (pi_data['Y'] - ph)) # Y charge
qp1 = h_count / (1 + 10 ** (ph - pi_data['H'])) # H charge
qp2 = 1 / (1 + 10 ** (ph - pi_data['NH2'])) # N-terminus charge
qp3 = k_count / (1 + 10 ** (ph - pi_data['K'])) # K charge
qp4 = r_count / (1 + 10 ** (ph - pi_data['R'])) # R charge
nq = qn1 + qn2 + qn3 + qn4 + qn5 + qp1 + qp2 + qp3 + qp4
# update pH
ph += delta
return ph
[docs]def calculate_cdr(numbering, cdr_positions, framework_positions):
"""
:param numbering:
:param cdr_positions:
:param framework_positions:
:return:
"""
cdrs = {'CDR1': list(),
'CDR2': list(),
'CDR3': list()}
frameworks = {'FR1': list(),
'FR2': list(),
'FR3': list(),
'FR4': list()}
for cdr in cdrs.keys():
cdr_positions_i = cdr_positions[cdr]
for i, position in enumerate(numbering):
if position in cdr_positions_i:
cdrs[cdr].append(i)
for framework in frameworks.keys():
framework_position_i = framework_positions[framework]
for i, position in enumerate(numbering):
if position in framework_position_i:
frameworks[framework].append(i)
return cdrs, frameworks
[docs]def amino_acid_charge(amino_acid, ph, pka_values):
if amino_acid in ['D', 'E', 'C', 'Y']:
return -1 / (1 + 10 ** (pka_values[amino_acid] - ph))
elif amino_acid in ['K', 'R', 'H']:
return 1 / (1 + 10 ** (ph - pka_values[amino_acid]))
else:
return 0
[docs]def calculate_charge(sequence, ph, pka_values):
# This calculation would make more sense but is slower (~1.5-2x)
# cooh = -1 / (1 + 10 ** (pka_values['COOH'] - ph))
# nh2 = 1 / (1 + 10 ** (ph - pka_values['NH2']))
#
# return sum([amino_acid_charge(x, ph, pka_values) for x in list(sequence)]) + cooh + nh2
# Faster implementation
# count number of D, E, C, Y, H, K, R
d_count = sequence.count('D')
e_count = sequence.count('E')
c_count = sequence.count('C')
y_count = sequence.count('Y')
h_count = sequence.count('H')
k_count = sequence.count('K')
r_count = sequence.count('R')
# qn1, qn2, qn3, qn4, qn5, qp1, qp2, qp3, qp4
qn1 = -1 / (1 + 10 ** (pka_values['COOH'] - ph)) # C-terminus charge
qn2 = - d_count / (1 + 10 ** (pka_values['D'] - ph)) # D charge
qn3 = - e_count / (1 + 10 ** (pka_values['E'] - ph)) # E charge
qn4 = - c_count / (1 + 10 ** (pka_values['C'] - ph)) # C charge
qn5 = - y_count / (1 + 10 ** (pka_values['Y'] - ph)) # Y charge
qp1 = h_count / (1 + 10 ** (ph - pka_values['H'])) # H charge
qp2 = 1 / (1 + 10 ** (ph - pka_values['NH2'])) # N-terminus charge
qp3 = k_count / (1 + 10 ** (ph - pka_values['K'])) # K charge
qp4 = r_count / (1 + 10 ** (ph - pka_values['R'])) # R charge
nq = qn1 + qn2 + qn3 + qn4 + qn5 + qp1 + qp2 + qp3 + qp4
return nq