from .chain_collection import ChainCollection
import numpy as np
import pandas as pd
from .chain import calculate_charge
from abpytools.utils import DataLoader
from operator import itemgetter
from .fab import Fab
from .helper_functions import germline_identity_pd, to_numbering_table
from .base import CollectionBase
import os
import json
from .utils import (json_FabCollection_formatter, pb2_FabCollection_formatter, pb2_FabCollection_parser,
json_FabCollection_parser)
from .flags import *
if BACKEND_FLAGS.HAS_PROTO:
from abpytools.core.formats import FabCollectionProto
class FabCollection(CollectionBase):
    """Container for paired heavy/light chains (Fab fragments).

    The heavy and the light chains are kept in two parallel ``ChainCollection``
    objects: entry ``i`` of the heavy collection is paired with entry ``i`` of
    the light collection. Most methods combine the per-chain results of these
    two collections instead of going through individual ``Fab`` objects.
    """

    def __init__(self, fab=None, heavy_chains=None, light_chains=None, names=None):
        """
        Fab object container that handles combinations of light/heavy Chain pairs.

        Args:
            fab (list): list of Fab objects. When given, ``heavy_chains`` and
                ``light_chains`` are ignored.
            heavy_chains (list or ChainCollection): heavy chains, used together
                with ``light_chains`` when ``fab`` is None.
            light_chains (list or ChainCollection): light chains, same length
                as ``heavy_chains``.
            names (list): one string per pair. Defaults to
                ``"<heavy name> - <light name>"``.

        Raises:
            ValueError: if no usable input is provided, if the number of heavy
                and light chains differ, or if ``names`` is not a list of
                strings of the right length.
        """
        if fab is not None:
            # a Fab is indexed as (light chain, heavy chain)
            if not (isinstance(fab, list) and all(isinstance(fab_i, Fab) for fab_i in fab)):
                raise ValueError('fab must be a list of abpytools.Fab objects')
            self._fab = fab
            self._light_chains = ChainCollection([x[0] for x in fab])
            self._heavy_chains = ChainCollection([x[1] for x in fab])
        else:
            # both chain containers are required; failing here avoids an
            # AttributeError further down when only one (or none) is given
            if heavy_chains is None or light_chains is None:
                raise ValueError('Provide a list of Chain objects or an ChainCollection object')
            self._heavy_chains = self._as_chain_collection(heavy_chains)
            self._light_chains = self._as_chain_collection(light_chains)

        # an empty loading status is taken to mean the chains were not loaded yet
        if len(self._light_chains.loading_status()) == 0:
            self._light_chains.load()

        if len(self._heavy_chains.loading_status()) == 0:
            self._heavy_chains.load()

        if self._light_chains.n_ab != self._heavy_chains.n_ab:
            raise ValueError('Number of heavy chains must be the same of light chains')

        if isinstance(names, list) and all(isinstance(name, str) for name in names):
            if len(names) == self._heavy_chains.n_ab:
                self._names = names
            else:
                raise ValueError(
                    'Length of name list must be the same as length of heavy_chains/light chains lists')
        elif names is None:
            self._names = ['{} - {}'.format(heavy, light) for heavy, light in zip(self._heavy_chains.names,
                                                                                   self._light_chains.names)]
        else:
            raise ValueError("Names expected a list of strings, instead got {}".format(type(names)))

        self._n_ab = self._light_chains.n_ab

        # NOTE(review): the paired sequence is the light chain sequence followed
        # by the heavy chain sequence. The previous code produced this order
        # through swapped loop variable names; the order is kept as is - confirm
        # that light + heavy is the intended convention.
        self._pair_sequences = [light_seq + heavy_seq for heavy_seq, light_seq in
                                zip(self._heavy_chains.sequences, self._light_chains.sequences)]

        # keep the name of the heavy and light chains internally to keep everything in the right order
        self._internal_heavy_name = self._heavy_chains.names
        self._internal_light_name = self._light_chains.names

    # Even though it would make more sense to draw all the values below from
    # the base Fab objects, this is much slower. Whenever self._n_ab > 1 it
    # makes more sense to use the self._heavy_chains and self._light_chains
    # containers in all the methods. In essence the abpytools.Fab object is
    # just a representative building block that could in future cache data,
    # which would then represent a speed up in the calculations.

    @staticmethod
    def _as_chain_collection(chains):
        """Return ``chains`` as a ChainCollection, wrapping a plain list if needed."""
        if isinstance(chains, list):
            return ChainCollection(antibody_objects=chains)
        if isinstance(chains, ChainCollection):
            return chains
        raise ValueError('Provide a list of Chain objects or an ChainCollection object')

    @staticmethod
    def _require_proto():
        """Raise ImportError when the protobuf backend was not imported."""
        # FabCollectionProto is only imported when this flag is set
        if not BACKEND_FLAGS.HAS_PROTO:
            raise ImportError('The pb2 format is not available: protobuf backend not found')

    def molecular_weights(self, monoisotopic=False):
        """Return a list with the summed heavy + light molecular weight of each pair.

        Args:
            monoisotopic (bool): forwarded to the ``molecular_weights`` method
                of both chain collections.
        """
        heavy_mw = self._heavy_chains.molecular_weights(monoisotopic=monoisotopic)
        light_mw = self._light_chains.molecular_weights(monoisotopic=monoisotopic)
        return [heavy + light for heavy, light in zip(heavy_mw, light_mw)]

    def extinction_coefficients(self, extinction_coefficient_database='Standard', reduced=False, normalise=False,
                                **kwargs):
        """Return a list with the summed heavy + light extinction coefficient of each pair.

        Args:
            extinction_coefficient_database (str): forwarded to both chain collections.
            reduced (bool): forwarded to both chain collections.
            normalise (bool): if True, each sum is divided by the molecular
                weight of the pair.
            **kwargs: forwarded to ``molecular_weights`` when ``normalise`` is True.
        """
        heavy_ec = self._heavy_chains.extinction_coefficients(
            extinction_coefficient_database=extinction_coefficient_database,
            reduced=reduced)
        light_ec = self._light_chains.extinction_coefficients(
            extinction_coefficient_database=extinction_coefficient_database,
            reduced=reduced)

        if normalise:
            return [(heavy + light) / mw for heavy, light, mw in
                    zip(heavy_ec, light_ec, self.molecular_weights(**kwargs))]
        return [heavy + light for heavy, light in zip(heavy_ec, light_ec)]

    def hydrophobicity_matrix(self):
        """Return the heavy and light hydrophobicity matrices stacked column-wise (heavy first)."""
        return np.column_stack((self._heavy_chains.hydrophobicity_matrix(),
                                self._light_chains.hydrophobicity_matrix()))

    def charge(self):
        """Return the heavy and light ``charge`` arrays stacked column-wise (heavy first)."""
        return np.column_stack((self._heavy_chains.charge, self._light_chains.charge))

    def total_charge(self, ph=7.4, pka_database='Wikipedia'):
        """Return a list with the charge of every paired sequence.

        Args:
            ph (float): pH passed to ``calculate_charge``.
            pka_database (str): name of the pKa database to load; must be one
                of the databases listed in the method body.

        Raises:
            AssertionError: if ``pka_database`` is not an available database.
        """
        available_pi_databases = ["EMBOSS", "DTASetect", "Solomon", "Sillero", "Rodwell", "Wikipedia", "Lehninger",
                                  "Grimsley"]
        assert pka_database in available_pi_databases, \
            "Selected pI database {} not available. Available databases: {}".format(
                pka_database, ', '.join(available_pi_databases))

        data_loader = DataLoader(data_type='AminoAcidProperties', data=['pI', pka_database])
        pka_data = data_loader.get_data()

        return [calculate_charge(sequence=seq, ph=ph, pka_values=pka_data) for seq in self.sequences]

    def igblast_local_query(self, file_path, chain):
        """Load local IgBLAST results into the heavy or the light chain collection.

        Args:
            file_path (str): forwarded to the chain collection.
            chain (str): ``'light'`` or ``'heavy'`` (case insensitive).

        Raises:
            ValueError: if ``chain`` is neither ``'light'`` nor ``'heavy'``.
        """
        chain_type = chain.lower()
        if chain_type == 'light':
            self._light_chains.igblast_local_query(file_path=file_path)
        elif chain_type == 'heavy':
            self._heavy_chains.igblast_local_query(file_path=file_path)
        else:
            raise ValueError('Specify if the data being loaded is for the heavy or light chain')

    def igblast_server_query(self, **kwargs):
        """Run ``igblast_server_query`` on the light and then the heavy chain collection."""
        self._light_chains.igblast_server_query(**kwargs)
        self._heavy_chains.igblast_server_query(**kwargs)

    def numbering_table(self, as_array=False, region='all', chain='both', **kwargs):
        """Build the numbering table of the collection through ``to_numbering_table``.

        All arguments are forwarded, together with the ``numbering_table``
        methods of both chain collections and the names of the pairs.
        """
        return to_numbering_table(as_array=as_array, region=region, chain=chain,
                                  heavy_chains_numbering_table=self._heavy_chains.numbering_table,
                                  light_chains_numbering_table=self._light_chains.numbering_table,
                                  names=self.names, **kwargs)

    def _ensure_germline(self):
        """Run a server query for every chain collection lacking germline data."""
        # empty dictionaries are falsy, so ``not all(...)`` is True when at
        # least one chain has no germline information yet; by default this
        # triggers a web query
        for chains in (self._light_chains, self._heavy_chains):
            if not all(chains.germline_identity.values()):
                chains.igblast_server_query()

    def _germline_pd(self):
        """Return a DataFrame with the germline assignment and score of each pair."""
        self._ensure_germline()

        heavy_chain_germlines = self._heavy_chains.germline
        light_chain_germlines = self._light_chains.germline

        # one row per pair: heavy assignment, heavy score, light assignment, light score
        data = np.array([[heavy_chain_germlines[x][0] for x in self._internal_heavy_name],
                         [heavy_chain_germlines[x][1] for x in self._internal_heavy_name],
                         [light_chain_germlines[x][0] for x in self._internal_light_name],
                         [light_chain_germlines[x][1] for x in self._internal_light_name]]).T

        df = pd.DataFrame(data=data,
                          columns=pd.MultiIndex.from_tuples([('Heavy', 'Assignment'),
                                                             ('Heavy', 'Score'),
                                                             ('Light', 'Assignment'),
                                                             ('Light', 'Score')]),
                          index=self.names)

        df.loc[:, (slice(None), 'Score')] = df.loc[:, (slice(None), 'Score')].apply(pd.to_numeric)

        return df

    def save_to_json(self, path, update=True):
        """Write the collection to ``path + '.json'``.

        Args:
            path (str): file path without extension.
            update (bool): accepted for interface compatibility; not used, the
                file is always overwritten.
        """
        # format first so that a formatting error does not truncate an existing file
        fab_data = json_FabCollection_formatter(self)
        with open(path + '.json', 'w') as f:
            json.dump(fab_data, f, indent=2)

    def save_to_pb2(self, path, update=True):
        """Write the collection to ``path + '.pb2'``.

        Args:
            path (str): file path without extension.
            update (bool): if True and the file already exists, its content is
                parsed into the message before this collection is formatted
                into it (presumably the formatter appends - confirm against
                ``pb2_FabCollection_formatter``). If False the file is
                written from an empty message.

        Raises:
            ImportError: if the protobuf backend is not available.
        """
        self._require_proto()
        file_name = path + '.pb2'
        proto_parser = FabCollectionProto()

        if update:
            try:
                with open(file_name, 'rb') as f:
                    proto_parser.ParseFromString(f.read())
            except IOError:
                # no existing file, a new one is created below
                pass

        pb2_FabCollection_formatter(self, proto_parser)
        # serialise before opening so that a failure does not truncate an existing file
        serialised = proto_parser.SerializeToString()

        with open(file_name, 'wb') as f:
            f.write(serialised)

    def save_to_fasta(self, path, update=True):
        """Not implemented for Fab collections."""
        raise NotImplementedError

    @classmethod
    def load_from_json(cls, path, n_threads=20, verbose=True, show_progressbar=True):
        """Create a collection from a JSON file.

        Args:
            path (str): path of the file to read.
            n_threads, verbose, show_progressbar: accepted for interface
                compatibility; not used here.
        """
        with open(path, 'r') as f:
            data = json.load(f)

        return cls(fab=json_FabCollection_parser(data))

    @classmethod
    def load_from_pb2(cls, path, n_threads=20, verbose=True, show_progressbar=True):
        """Create a collection from a protobuf file.

        Args:
            path (str): path of the file to read.
            n_threads, verbose, show_progressbar: accepted for interface
                compatibility; not used here.

        Raises:
            ImportError: if the protobuf backend is not available.
        """
        cls._require_proto()
        proto_parser = FabCollectionProto()
        with open(path, 'rb') as f:
            proto_parser.ParseFromString(f.read())

        return cls(fab=pb2_FabCollection_parser(proto_parser))

    @classmethod
    def load_from_fasta(cls, path, numbering_scheme=NUMBERING_FLAGS.CHOTHIA, n_threads=20,
                        verbose=True, show_progressbar=True):
        """Not implemented for Fab collections."""
        raise NotImplementedError

    def _get_names_iter(self, chain='both'):
        """Yield names of the stored chains.

        Args:
            chain (str): ``'both'`` yields ``"<light name>-<heavy name>"``,
                ``'light'`` / ``'heavy'`` yield the names of that chain only.

        Raises:
            ValueError: for any other value of ``chain`` (raised when the
                generator is first iterated).
        """
        if chain == 'both':
            for light_chain, heavy_chain in zip(self._light_chains, self._heavy_chains):
                yield f"{light_chain.name}-{heavy_chain.name}"
        elif chain == 'light':
            for light_chain in self._light_chains:
                yield light_chain.name
        elif chain == 'heavy':
            for heavy_chain in self._heavy_chains:
                yield heavy_chain.name
        else:
            raise ValueError(f"Unknown chain type ({chain}), available options are: "
                             f"both, light or heavy.")

    @property
    def regions(self):
        """dict: for every pair name, the region indices of its heavy and light chain."""
        heavy_regions = self._heavy_chains.ab_region_index()
        light_regions = self._light_chains.ab_region_index()

        return {name: {CHAIN_FLAGS.HEAVY_CHAIN: heavy_regions[heavy],
                       CHAIN_FLAGS.LIGHT_CHAIN: light_regions[light]} for name, heavy, light in
                zip(self.names, self._internal_heavy_name, self._internal_light_name)}

    @property
    def names(self):
        """list: names of the pairs."""
        return self._names

    @property
    def sequences(self):
        """list: paired sequences computed at construction time."""
        return self._pair_sequences

    @property
    def aligned_sequences(self):
        """list: aligned light chain sequence followed by the aligned heavy chain sequence."""
        # NOTE(review): same light + heavy order as ``sequences``; the previous
        # code produced it through swapped loop variable names - confirm intent.
        return [light_seq + heavy_seq for heavy_seq, light_seq in
                zip(self._heavy_chains.aligned_sequences,
                    self._light_chains.aligned_sequences)]

    @property
    def n_ab(self):
        """int: number of pairs in the collection."""
        return self._n_ab

    @property
    def germline_identity(self):
        """Germline identity of the pairs, see ``_germline_identity``."""
        return self._germline_identity()

    @property
    def germline(self):
        """Germline assignment and score of the pairs, see ``_germline_pd``."""
        return self._germline_pd()

    def _string_summary_basic(self):
        """Return the short description used by ``__repr__``."""
        return "abpytools.FabCollection Number of sequences: {}".format(self._n_ab)

    def __len__(self):
        return self._n_ab

    def __repr__(self):
        return "<%s at 0x%02x>" % (self._string_summary_basic(), id(self))

    def __getitem__(self, indices):
        """Return one Fab (integer index) or a new FabCollection (slice or iterable of indices)."""
        if isinstance(indices, (int, np.integer)):
            index = int(indices)
            return Fab(heavy_chain=self._heavy_chains[index],
                       light_chain=self._light_chains[index],
                       name=self.names[index], load=False)

        if isinstance(indices, slice):
            indices = range(*indices.indices(self._n_ab))

        # explicit per-index lookup: itemgetter(*indices) returns a bare
        # element instead of a tuple when a single index is given, which
        # broke selections such as collection[[0]]
        positions = [int(i) for i in indices]
        return FabCollection(heavy_chains=[self._heavy_chains[i] for i in positions],
                             light_chains=[self._light_chains[i] for i in positions],
                             names=[self._names[i] for i in positions])

    def _germline_identity(self):
        """Return the germline identity of the pairs as built by ``germline_identity_pd``."""
        self._ensure_germline()

        return germline_identity_pd(self._heavy_chains.germline_identity,
                                    self._light_chains.germline_identity,
                                    self._internal_heavy_name,
                                    self._internal_light_name,
                                    self._names)

    def get_object(self, name):
        """Return the Fab whose name is ``name``.

        Args:
            name (str): name of the pair, as listed in ``names``.

        Raises:
            ValueError: if no pair has this name.
        """
        if name not in self.names:
            raise ValueError('Could not find sequence with specified name')
        return self[self.names.index(name)]