Source code for abpytools.core.fab_collection

from .chain_collection import ChainCollection
import numpy as np
import pandas as pd
from .chain import calculate_charge
from abpytools.utils import DataLoader
from operator import itemgetter
from .fab import Fab
from .helper_functions import germline_identity_pd, to_numbering_table
from .base import CollectionBase
import os
import json
from .utils import (json_FabCollection_formatter, pb2_FabCollection_formatter, pb2_FabCollection_parser,
                    json_FabCollection_parser)
from .flags import *

if BACKEND_FLAGS.HAS_PROTO:
    from abpytools.core.formats import FabCollectionProto


[docs]class FabCollection(CollectionBase): def __init__(self, fab=None, heavy_chains=None, light_chains=None, names=None): """ Fab object container that handles combinations of light/heavy Chain pairs. Args: fab (list): heavy_chains (ChainCollection): light_chains (ChainCollection): names (list): """ # check if it's a Chain object if heavy_chains is None and light_chains is None and fab is None: raise ValueError('Provide a list of Chain objects or an ChainCollection object') # check if fab object is a list and if all object are abpytools.Fab objects if isinstance(fab, list) and all(isinstance(fab_i, Fab) for fab_i in fab): self._fab = fab self._light_chains = ChainCollection([x[0] for x in self._fab]) self._heavy_chains = ChainCollection([x[1] for x in self._fab]) if fab is None and (heavy_chains is not None and light_chains is not None): if isinstance(heavy_chains, list): self._heavy_chains = ChainCollection(antibody_objects=heavy_chains) elif isinstance(heavy_chains, ChainCollection): self._heavy_chains = heavy_chains else: raise ValueError('Provide a list of Chain objects or an ChainCollection object') if isinstance(light_chains, list): self._light_chains = ChainCollection(antibody_objects=light_chains) elif isinstance(light_chains, ChainCollection): self._light_chains = light_chains else: raise ValueError('Provide a list of Chain objects or an ChainCollection object') if len(self._light_chains.loading_status()) == 0: self._light_chains.load() if len(self._heavy_chains.loading_status()) == 0: self._heavy_chains.load() if self._light_chains.n_ab != self._heavy_chains.n_ab: raise ValueError('Number of heavy chains must be the same of light chains') if isinstance(names, list) and all(isinstance(name, str) for name in names): if len(names) == self._heavy_chains.n_ab: self._names = names else: raise ValueError( 'Length of name list must be the same as length of heavy_chains/light chains lists') elif names is None: self._names = ['{} - {}'.format(heavy, light) for heavy, light in zip(self._heavy_chains.names, self._light_chains.names)] else: raise ValueError("Names expected a list of strings, instead got {}".format(type(names))) self._n_ab = self._light_chains.n_ab self._pair_sequences = [heavy + light for light, heavy in zip(self._heavy_chains.sequences, self._light_chains.sequences)] # keep the name of the heavy and light chains internally to keep everything in the right order self._internal_heavy_name = self._heavy_chains.names self._internal_light_name = self._light_chains.names # even though it makes more sense to draw all these values from the base Fab objects this is much slower # whenever self._n_ab > 1 it makes more sense to use the self._heavy_chain and self._light_chain containers # in all the methods # in essence the abpytools.Fab object is just a representative building block that could in future just # cache data and would then represent a speed up in the calculations
[docs] def molecular_weights(self, monoisotopic=False): return [heavy + light for heavy, light in zip(self._heavy_chains.molecular_weights(monoisotopic=monoisotopic), self._light_chains.molecular_weights(monoisotopic=monoisotopic))]
[docs] def extinction_coefficients(self, extinction_coefficient_database='Standard', reduced=False, normalise=False, **kwargs): heavy_ec = self._heavy_chains.extinction_coefficients( extinction_coefficient_database=extinction_coefficient_database, reduced=reduced) light_ec = self._light_chains.extinction_coefficients( extinction_coefficient_database=extinction_coefficient_database, reduced=reduced) if normalise: return [(heavy + light) / mw for heavy, light, mw in zip(heavy_ec, light_ec, self.molecular_weights(**kwargs))] else: return [heavy + light for heavy, light in zip(heavy_ec, light_ec)]
[docs] def hydrophobicity_matrix(self): return np.column_stack((self._heavy_chains.hydrophobicity_matrix(), self._light_chains.hydrophobicity_matrix()))
[docs] def charge(self): return np.column_stack((self._heavy_chains.charge, self._light_chains.charge))
[docs] def total_charge(self, ph=7.4, pka_database='Wikipedia'): available_pi_databases = ["EMBOSS", "DTASetect", "Solomon", "Sillero", "Rodwell", "Wikipedia", "Lehninger", "Grimsley"] assert pka_database in available_pi_databases, \ "Selected pI database {} not available. Available databases: {}".format(pka_database, ' ,'.join(available_pi_databases)) data_loader = DataLoader(data_type='AminoAcidProperties', data=['pI', pka_database]) pka_data = data_loader.get_data() return [calculate_charge(sequence=seq, ph=ph, pka_values=pka_data) for seq in self.sequences]
[docs] def igblast_local_query(self, file_path, chain): if chain.lower() == 'light': self._light_chains.igblast_local_query(file_path=file_path) elif chain.lower() == 'heavy': self._heavy_chains.igblast_local_query(file_path=file_path) else: raise ValueError('Specify if the data being loaded is for the heavy or light chain')
[docs] def igblast_server_query(self, **kwargs): self._light_chains.igblast_server_query(**kwargs) self._heavy_chains.igblast_server_query(**kwargs)
[docs] def numbering_table(self, as_array=False, region='all', chain='both', **kwargs): return to_numbering_table(as_array=as_array, region=region, chain=chain, heavy_chains_numbering_table=self._heavy_chains.numbering_table, light_chains_numbering_table=self._light_chains.numbering_table, names=self.names, **kwargs)
def _germline_pd(self): # empty dictionaries return false, so this condition checks if any of the values are False if all([x for x in self._light_chains.germline_identity.values()]) is False: # this means there is no information about the germline, # by default it will run a web query self._light_chains.igblast_server_query() if all([x for x in self._heavy_chains.germline_identity.values()]) is False: self._heavy_chains.igblast_server_query() heavy_chain_germlines = self._heavy_chains.germline light_chain_germlines = self._light_chains.germline data = np.array([[heavy_chain_germlines[x][0] for x in self._internal_heavy_name], [heavy_chain_germlines[x][1] for x in self._internal_heavy_name], [light_chain_germlines[x][0] for x in self._internal_light_name], [light_chain_germlines[x][1] for x in self._internal_light_name]]).T df = pd.DataFrame(data=data, columns=pd.MultiIndex.from_tuples([('Heavy', 'Assignment'), ('Heavy', 'Score'), ('Light', 'Assignment'), ('Light', 'Score')]), index=self.names) df.loc[:, (slice(None), 'Score')] = df.loc[:, (slice(None), 'Score')].apply(pd.to_numeric) return df
[docs] def save_to_json(self, path, update=True): with open(os.path.join(path + '.json'), 'w') as f: fab_data = json_FabCollection_formatter(self) json.dump(fab_data, f, indent=2)
[docs] def save_to_pb2(self, path, update=True): proto_parser = FabCollectionProto() try: with open(os.path.join(path + '.pb2'), 'rb') as f: proto_parser.ParseFromString(f.read()) except IOError: # Creating new file pass pb2_FabCollection_formatter(self, proto_parser) with open(os.path.join(path + '.pb2'), 'wb') as f: f.write(proto_parser.SerializeToString())
[docs] def save_to_fasta(self, path, update=True): raise NotImplementedError
[docs] @classmethod def load_from_json(cls, path, n_threads=20, verbose=True, show_progressbar=True): with open(path, 'r') as f: data = json.load(f) fab_objects = json_FabCollection_parser(data) fab_collection = cls(fab=fab_objects) return fab_collection
[docs] @classmethod def load_from_pb2(cls, path, n_threads=20, verbose=True, show_progressbar=True): with open(path, 'rb') as f: proto_parser = FabCollectionProto() proto_parser.ParseFromString(f.read()) fab_objects = pb2_FabCollection_parser(proto_parser) fab_collection = cls(fab=fab_objects) return fab_collection
[docs] @classmethod def load_from_fasta(cls, path, numbering_scheme=NUMBERING_FLAGS.CHOTHIA, n_threads=20, verbose=True, show_progressbar=True): raise NotImplementedError
def _get_names_iter(self, chain='both'): if chain == 'both': for light_chain, heavy_chain in zip(self._light_chains, self._heavy_chains): yield f"{light_chain.name}-{heavy_chain.name}" elif chain == 'light': for light_chain in self._light_chains: yield light_chain.name elif chain == 'heavy': for heavy_chain in self._heavy_chains: yield heavy_chain.name else: raise ValueError(f"Unknown chain type ({chain}), available options are:" f"both, light or heavy.") @property def regions(self): heavy_regions = self._heavy_chains.ab_region_index() light_regions = self._light_chains.ab_region_index() return {name: {CHAIN_FLAGS.HEAVY_CHAIN: heavy_regions[heavy], CHAIN_FLAGS.LIGHT_CHAIN: light_regions[light]} for name, heavy, light in zip(self.names, self._internal_heavy_name, self._internal_light_name)} @property def names(self): return self._names @property def sequences(self): return self._pair_sequences @property def aligned_sequences(self): return [heavy + light for light, heavy in zip(self._heavy_chains.aligned_sequences, self._light_chains.aligned_sequences)] @property def n_ab(self): return self._n_ab @property def germline_identity(self): return self._germline_identity() @property def germline(self): return self._germline_pd() def _string_summary_basic(self): return "abpytools.FabCollection Number of sequences: {}".format(self._n_ab) def __len__(self): return self._n_ab def __repr__(self): return "<%s at 0x%02x>" % (self._string_summary_basic(), id(self)) def __getitem__(self, indices): if isinstance(indices, int): return Fab(heavy_chain=self._heavy_chains[indices], light_chain=self._light_chains[indices], name=self.names[indices], load=False) else: return FabCollection(heavy_chains=list(itemgetter(*indices)(self._heavy_chains)), light_chains=list(itemgetter(*indices)(self._light_chains)), names=list(itemgetter(*indices)(self._names))) def _germline_identity(self): # empty dictionaries return false, so this condition checks if any of the values are False if all([x for x in self._light_chains.germline_identity.values()]) is False: # this means there is no information about the germline, # by default it will run a web query self._light_chains.igblast_server_query() if all([x for x in self._heavy_chains.germline_identity.values()]) is False: self._heavy_chains.igblast_server_query() return germline_identity_pd(self._heavy_chains.germline_identity, self._light_chains.germline_identity, self._internal_heavy_name, self._internal_light_name, self._names)
[docs] def get_object(self, name): """ :param name: str :return: """ if name in self.names: index = self.names.index(name) return self[index] else: raise ValueError('Could not find sequence with specified name')