from ..utils import DataLoader
import itertools
import pandas as pd
import numpy as np
# Region names accepted by numbering_table_region. The order of this list is also
# the order that function sorts a selection into.
available_regions = ['FR1', 'CDR1', 'FR2', 'CDR2', 'FR3', 'CDR3', 'FR4']
def numbering_table_sequences(region, numbering_scheme, chain):
    """Collect the numbering positions for the requested regions.

    Position data is loaded only for the region families (CDR and/or FR) that
    actually occur in ``region``; this keeps loading quick when only CDRs or
    only FRs are requested.

    :param region: list of region names (e.g. ``['CDR1', 'FR2']``), in the
        order in which their positions should be concatenated. It is expected
        to have been validated and sorted beforehand.
    :param numbering_scheme: passed to ``DataLoader`` as the first element of
        ``data``.
    :param chain: passed to ``DataLoader`` as the second element of ``data``.
    :return: tuple ``(whole_sequence_dict, whole_sequence)`` where
        ``whole_sequence_dict`` merges the loaded CDR and FR data and
        ``whole_sequence`` is a flat list with the entries of every requested
        region, concatenated in the order given by ``region``.
    :raises KeyError: if a name in ``region`` is not a key of the loaded data.
    """
    # NOTE(review): DataLoader.get_data() is assumed to return a mapping of
    # region name -> iterable of positions; confirm against DataLoader.
    if any(name.startswith('CDR') for name in region):
        cdrs = DataLoader(data_type='CDR_positions',
                          data=[numbering_scheme, chain]).get_data()
    else:
        cdrs = {}

    if any(name.startswith('FR') for name in region):
        frs = DataLoader(data_type='Framework_positions',
                         data=[numbering_scheme, chain]).get_data()
    else:
        frs = {}

    # pack it all up into a single dictionary
    whole_sequence_dict = {**cdrs, **frs}

    # flatten the per-region position lists, following the order of region
    whole_sequence = list(itertools.chain.from_iterable(
        whole_sequence_dict[name] for name in region))

    return whole_sequence_dict, whole_sequence
def numbering_table_region(region):
    """Validate a region selection and return it as an ordered list.

    :param region: ``'all'`` to select every available region, a single region
        name (e.g. ``'CDR1'``), or an iterable of region names.
    :return: a new list with the selected region names, sorted in the order in
        which they appear in ``available_regions``. The argument itself is
        never modified.
    :raises ValueError: if any selected name is not in ``available_regions``.
    """
    if isinstance(region, str):
        # 'all' expands to every available region; any other string is
        # treated as a single region name and validated below
        selected = list(available_regions) if region == 'all' else [region]
    else:
        # work on a copy so the caller's sequence is left untouched
        selected = list(region)

    # checks if the selected regions are all in the available_regions
    if not set(selected).issubset(available_regions):
        raise ValueError("The chosen region is not available. "
                         "Currently available regions: {}".format(available_regions))

    # make sure the regions are in a logical order
    return sorted(selected, key=available_regions.index)
def numbering_table_multiindex(region, whole_sequence_dict):
    """Build the two-level column index (region, numbering) of a numbering table.

    :param region: iterable of region names; each must be a key of
        ``whole_sequence_dict``.
    :param whole_sequence_dict: mapping of region name to an iterable of
        numbering labels for that region.
    :return: ``pd.MultiIndex`` with levels named ``'Region'`` and
        ``'Numbering'`` holding one ``(region, numbering)`` entry per label,
        grouped by region in the order given by ``region``.
    """
    # e.g. [(CDR1, L23), (CDR1, L24), ..., (CDR3, L100), ...]
    # a flat list of tuples that pd.MultiIndex interprets as a two layer column system
    region_map = [(region_i, numbering)
                  for region_i in region
                  for numbering in whole_sequence_dict[region_i]]
    return pd.MultiIndex.from_tuples(region_map, names=['Region', 'Numbering'])
def germline_identity_pd(heavy_identity, light_identity, internal_heavy, internal_light, names):
    """Arrange per-chain, per-region identity values into a DataFrame.

    :param heavy_identity: mapping keyed by the entries of ``internal_heavy``;
        each value is itself a mapping of region name to a value
        (presumably a germline identity score -- confirm with the caller).
    :param light_identity: same structure as ``heavy_identity``, keyed by the
        entries of ``internal_light``.
    :param internal_heavy: keys into ``heavy_identity``, one per row.
    :param internal_light: keys into ``light_identity``, one per row.
    :param names: row labels of the resulting frame; must have the same length
        as ``internal_heavy`` and ``internal_light``.
    :return: ``pd.DataFrame`` indexed by ``names`` with two-level columns
        ``(Chain, Region)``. ``Chain`` is ``'Light'``, ``'Heavy'`` or
        ``'Average'`` and ``Region`` is one of CDR1-3, FR1-3 or ``'Total'``.
        Regions missing for an entry are filled with NaN; ``'Average'`` is the
        mean of the light and heavy values of the same region.
    """
    regions = ['CDR1', 'CDR2', 'CDR3', 'FR1', 'FR2', 'FR3', 'Total']
    columns = pd.MultiIndex.from_tuples(
        [(chain_label, region_name)
         for chain_label in ('Light', 'Heavy', 'Average')
         for region_name in regions],
        names=['Chain', 'Region'])

    # creating the frame with every column up front fixes the column order
    df = pd.DataFrame(columns=columns, index=names)

    measured = (('Light', light_identity, internal_light),
                ('Heavy', heavy_identity, internal_heavy))
    for chain_label, identity, keys in measured:
        for region_name in regions:
            # np.nan (lowercase): the np.NaN alias was removed in NumPy 2.0
            df[(chain_label, region_name)] = [
                identity[key][region_name] if region_name in identity[key] else np.nan
                for key in keys]

    # averages are derived last, once both chains have been filled in
    for region_name in regions:
        df[('Average', region_name)] = (df[('Light', region_name)] +
                                        df[('Heavy', region_name)]) / 2

    return df
def to_numbering_table(as_array, region, chain, heavy_chains_numbering_table,
                       light_chains_numbering_table, names, **kwargs):
    """Build a numbering table for the heavy chains, the light chains or both.

    :param as_array: if true the result is an array, otherwise a DataFrame.
    :param region: region selection forwarded to the table builders.
    :param chain: ``'heavy'``, ``'light'`` or ``'both'``.
    :param heavy_chains_numbering_table: callable accepting ``as_array``,
        ``region`` and ``**kwargs`` that returns the heavy chain table.
    :param light_chains_numbering_table: same as above for the light chains.
    :param names: row labels assigned to the result when it is a DataFrame.
    :param kwargs: extra keyword arguments forwarded to the table builders.
    :return: for ``'both'`` the light and heavy tables are joined column-wise,
        light first: via ``np.concatenate`` for arrays, or via ``pd.concat``
        with the top-level column keys ``'Light'`` and ``'Heavy'`` for
        DataFrames. For a single chain the builder's result is returned; if it
        is a DataFrame its index is replaced by ``names`` on that same object.
    :raises ValueError: if ``chain`` is not one of the accepted values.
    """
    if chain == 'both':
        want_array = bool(as_array)
        t_heavy = heavy_chains_numbering_table(as_array=want_array, region=region, **kwargs)
        t_light = light_chains_numbering_table(as_array=want_array, region=region, **kwargs)
        if want_array:
            data = np.concatenate((t_light, t_heavy), axis=1)
        else:
            # align both tables on a fresh positional index before joining;
            # reset_index without inplace leaves the builders' frames untouched
            data = pd.concat([t_light.reset_index(drop=True),
                              t_heavy.reset_index(drop=True)],
                             axis=1, keys=['Light', 'Heavy'])
    elif chain == 'heavy':
        data = heavy_chains_numbering_table(as_array=as_array, region=region, **kwargs)
    elif chain == 'light':
        data = light_chains_numbering_table(as_array=as_array, region=region, **kwargs)
    else:
        raise ValueError("Unknown chain.")

    if not as_array:
        data.index = names

    return data