Commit a663af9 (root commit, 0 parents): 28 changed files with 372 additions and 0 deletions.
DeepTR_Prediction.py (new file, 295 lines):
# %%
import os
import re
import copy
import argparse

import numpy as np
import pandas as pd
import torch
from keras.models import Model, model_from_json
from allennlp.commands.elmo import ElmoEmbedder

# Run on CPU only: hide all CUDA devices from the backend.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
# %%
common_aa = "ARNDCQEGHILKMFPSTWYV"
# Row index of each amino acid in the BLOSUM matrices used below.
aa = {"A": 0, "R": 1, "N": 2, "D": 3, "C": 4, "Q": 5, "E": 6, "G": 7, "H": 8, "I": 9,
      "L": 10, "K": 11, "M": 12, "F": 13, "P": 14, "S": 15, "T": 16, "W": 17, "Y": 18, "V": 19}
path_dict = './supporting_file/'
model_dict = './models/'
pep_length = [8, 9, 10, 11, 12, 13]  # supported peptide lengths

# CDR loop sequences keyed by species, chain ('A'/'B') and V-gene name.
cdrs = np.load(path_dict + "create_cdr_dict.npy", allow_pickle=True).item()
# Register alias keys so V-gene names written in common alternative
# notations resolve to the same CDR entry.

# Pass 1: drop the allele suffix ('TRAV12-2*01' -> 'TRAV12-2').
cdrs1 = copy.copy(cdrs)
for key in list(cdrs1['human'].keys()):
    dict_q = cdrs1['human'][key]
    for key1 in list(dict_q.keys()):
        key2 = key1.split('*')[0]
        if key2 not in cdrs1['human'][key].keys():
            cdrs['human'][key][key2] = dict_q[key1]

# Pass 2: replace '-' with '.' ('TRAV12-2' -> 'TRAV12.2').
cdrs2 = copy.copy(cdrs)
for key in list(cdrs2['human'].keys()):
    dict_q = cdrs2['human'][key]
    for key1 in list(dict_q.keys()):
        key2 = key1.replace('-', '.')
        if key2 not in cdrs2['human'][key].keys():
            cdrs['human'][key][key2] = dict_q[key1]

# Pass 3: drop the allele suffix and any '/'.
cdrs3 = copy.copy(cdrs)
for key in list(cdrs3['human'].keys()):
    dict_q = cdrs3['human'][key]
    for key1 in list(dict_q.keys()):
        key2 = key1.split('*')[0].replace('/', '')
        if key2 not in cdrs3['human'][key].keys():
            cdrs['human'][key][key2] = dict_q[key1]
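# A hedged sketch of what the aliasing buys us (gene name taken from the
# bundled example data; the stored CDR loops come from create_cdr_dict.npy):
#
#   cdrs['human']['A']['TRAV12-2*01']  # original key
#   cdrs['human']['A']['TRAV12-2']     # allele suffix dropped (pass 1)
#   cdrs['human']['A']['TRAV12.2']     # '-' replaced by '.' (pass 2)
#
# All three lookups return the same [CDR1, CDR2, CDR2.5] list.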
def read_blosum(path, one_hot):
    """Read a tab-delimited substitution matrix.

    With one_hot == 0 the file is treated as BLOSUM50 and every score is
    rescaled by a factor of 1/10 to facilitate training; otherwise the
    values (e.g. a one-hot matrix) are read as-is.
    """
    blosum = []
    with open(path, "r") as f:
        for line in f:
            if one_hot == 0:  # BLOSUM50, rescaled
                blosum.append([float(i) / 10 for i in re.split("\t", line)])
            else:  # one-hot
                blosum.append([float(i) for i in re.split("\t", line)])
    return blosum
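# read_blosum expects a plain tab-separated 20x20 grid with no header, one
# row per amino acid in the order of `aa` above. A hypothetical two-line
# excerpt of blosum50.txt (the real file has 20 rows x 20 columns):
#
#   5\t-2\t-1\t-2\t...
#   -2\t7\t-1\t-2\t...
#
# With one_hot == 0 each raw score is divided by 10 (5 -> 0.5), matching the
# scale of the matrix hard-coded inside peptide_matrix() below.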
def pseudo_seq(seq_dict, blosum_matrix):
    """Encode each 34-residue MHC pseudo-sequence as 34 rows of BLOSUM values."""
    pseq_dict = {}
    for allele in seq_dict.keys():
        new_pseq = []
        for index in range(34):
            new_pseq.append(blosum_matrix[aa[seq_dict[allele][index]]])
        pseq_dict[allele] = new_pseq
    return pseq_dict
blosum_matrix = read_blosum(path_dict + 'blosum50.txt', 0)
pseq_dict = np.load(path_dict + 'pseq_dict_all.npy', allow_pickle=True).item()
pseq_dict_blosum_matrix = pseudo_seq(pseq_dict, blosum_matrix)
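# Usage sketch (the allele appears in the bundled example data; the exact
# pseudo-sequence comes from pseq_dict_all.npy):
#
#   enc = pseq_dict_blosum_matrix['HLA-A*02:01']
#   # 34 rows, one per MHC pseudo-sequence position, each a 20-dim
#   # BLOSUM50/10 vector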
def peptide_matrix(peptides):
    """Encode one peptide (8-13 aa) as 26 rows x 20 BLOSUM50/10 values:
    13 N-terminally aligned rows plus 13 C-terminally aligned rows, with
    all-zero rows padding positions beyond the peptide. Returns None for a
    non-standard residue or an unsupported length."""

    # BLOSUM50 rescaled by 1/10 (same scaling as read_blosum with one_hot == 0).
    blosum_matrix = np.array([
        [0.5, -0.2, -0.1, -0.2, -0.1, -0.1, -0.1, 0.0, -0.2, -0.1, -0.2, -0.1, -0.1, -0.3, -0.1, 0.1, 0.0, -0.3, -0.2, 0.0],
        [-0.2, 0.7, -0.1, -0.2, -0.4, 0.1, 0.0, -0.3, 0.0, -0.4, -0.3, 0.3, -0.2, -0.3, -0.3, -0.1, -0.1, -0.3, -0.1, -0.3],
        [-0.1, -0.1, 0.7, 0.2, -0.2, 0.0, 0.0, 0.0, 0.1, -0.3, -0.4, 0.0, -0.2, -0.4, -0.2, 0.1, 0.0, -0.4, -0.2, -0.3],
        [-0.2, -0.2, 0.2, 0.8, -0.4, 0.0, 0.2, -0.1, -0.1, -0.4, -0.4, -0.1, -0.4, -0.5, -0.1, 0.0, -0.1, -0.5, -0.3, -0.4],
        [-0.1, -0.4, -0.2, -0.4, 1.3, -0.3, -0.3, -0.3, -0.3, -0.2, -0.2, -0.3, -0.2, -0.2, -0.4, -0.1, -0.1, -0.5, -0.3, -0.1],
        [-0.1, 0.1, 0.0, 0.0, -0.3, 0.7, 0.2, -0.2, 0.1, -0.3, -0.2, 0.2, 0.0, -0.4, -0.1, 0.0, -0.1, -0.1, -0.1, -0.3],
        [-0.1, 0.0, 0.0, 0.2, -0.3, 0.2, 0.6, -0.3, 0.0, -0.4, -0.3, 0.1, -0.2, -0.3, -0.1, -0.1, -0.1, -0.3, -0.2, -0.3],
        [0.0, -0.3, 0.0, -0.1, -0.3, -0.2, -0.3, 0.8, -0.2, -0.4, -0.4, -0.2, -0.3, -0.4, -0.2, 0.0, -0.2, -0.3, -0.3, -0.4],
        [-0.2, 0.0, 0.1, -0.1, -0.3, 0.1, 0.0, -0.2, 1.0, -0.4, -0.3, 0.0, -0.1, -0.1, -0.2, -0.1, -0.2, -0.3, 0.2, -0.4],
        [-0.1, -0.4, -0.3, -0.4, -0.2, -0.3, -0.4, -0.4, -0.4, 0.5, 0.2, -0.3, 0.2, 0.0, -0.3, -0.3, -0.1, -0.3, -0.1, 0.4],
        [-0.2, -0.3, -0.4, -0.4, -0.2, -0.2, -0.3, -0.4, -0.3, 0.2, 0.5, -0.3, 0.3, 0.1, -0.4, -0.3, -0.1, -0.2, -0.1, 0.1],
        [-0.1, 0.3, 0.0, -0.1, -0.3, 0.2, 0.1, -0.2, 0.0, -0.3, -0.3, 0.6, -0.2, -0.4, -0.1, 0.0, -0.1, -0.3, -0.2, -0.3],
        [-0.1, -0.2, -0.2, -0.4, -0.2, 0.0, -0.2, -0.3, -0.1, 0.2, 0.3, -0.2, 0.7, 0.0, -0.3, -0.2, -0.1, -0.1, 0.0, 0.1],
        [-0.3, -0.3, -0.4, -0.5, -0.2, -0.4, -0.3, -0.4, -0.1, 0.0, 0.1, -0.4, 0.0, 0.8, -0.4, -0.3, -0.2, 0.1, 0.4, -0.1],
        [-0.1, -0.3, -0.2, -0.1, -0.4, -0.1, -0.1, -0.2, -0.2, -0.3, -0.4, -0.1, -0.3, -0.4, 1.0, -0.1, -0.1, -0.4, -0.3, -0.3],
        [0.1, -0.1, 0.1, 0.0, -0.1, 0.0, -0.1, 0.0, -0.1, -0.3, -0.3, 0.0, -0.2, -0.3, -0.1, 0.5, 0.2, -0.4, -0.2, -0.2],
        [0.0, -0.1, 0.0, -0.1, -0.1, -0.1, -0.1, -0.2, -0.2, -0.1, -0.1, -0.1, -0.1, -0.2, -0.1, 0.2, 0.5, -0.3, -0.2, 0.0],
        [-0.3, -0.3, -0.4, -0.5, -0.5, -0.1, -0.3, -0.3, -0.3, -0.3, -0.2, -0.3, -0.1, 0.1, -0.4, -0.4, -0.3, 1.5, 0.2, -0.3],
        [-0.2, -0.1, -0.2, -0.3, -0.3, -0.1, -0.2, -0.3, 0.2, -0.1, -0.1, -0.2, 0.0, 0.4, -0.3, -0.2, -0.2, 0.2, 0.8, -0.1],
        [0.0, -0.3, -0.3, -0.4, -0.1, -0.3, -0.3, -0.4, -0.4, 0.4, 0.1, -0.3, 0.1, -0.1, -0.3, -0.2, 0.0, -0.3, -0.1, 0.5]])

    aa = {"A": 0, "R": 1, "N": 2, "D": 3, "C": 4, "Q": 5, "E": 6, "G": 7, "H": 8, "I": 9,
          "L": 10, "K": 11, "M": 12, "F": 13, "P": 14, "S": 15, "T": 16, "W": 17, "Y": 18, "V": 19}
    pep_length = [8, 9, 10, 11, 12, 13]

    if set(peptides).difference('ACDEFGHIKLMNPQRSTVWY'):
        return None
    if len(peptides) not in pep_length:
        return None

    pep_blosum = []
    for residue_index in range(13):
        # N-terminal alignment: pad with zero rows past the peptide's end.
        if residue_index < len(peptides):
            pep_blosum.append(blosum_matrix[aa[peptides[residue_index]]])
        else:
            pep_blosum.append(np.zeros(20))
    for residue_index in range(13):
        # C-terminal alignment: pad with zero rows before the peptide starts.
        if 13 - residue_index > len(peptides):
            pep_blosum.append(np.zeros(20))
        else:
            pep_blosum.append(blosum_matrix[aa[peptides[len(peptides) - 13 + residue_index]]])
    return pep_blosum
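# Usage sketch ("GILGFVFTL" is a 9-mer from the bundled example data):
#
#   enc = peptide_matrix("GILGFVFTL")
#   # np.array(enc).shape == (26, 20): 13 N-aligned + 13 C-aligned rows,
#   # with 26 - 2*9 = 8 all-zero padding rows in between
#   peptide_matrix("PEPTIDEX")  # None: 'X' is not a standard amino acid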
# %%
def import_binding_model(model_dict, number):
    """Load the pMHC binding ensemble (one JSON + weight file per fold)."""
    models = []
    for i in range(number):
        with open(model_dict + "binding/model_" + str(i) + ".json", 'r') as json_f:
            loaded_model_json = json_f.read()
        loaded_model = model_from_json(loaded_model_json)
        loaded_model.load_weights(model_dict + "binding/model_" + str(i) + ".h5")
        models.append(loaded_model)
    return models


def import_prediction_model(model_dict, number):
    """Load the TCR-pMHC interaction ensemble: one shared architecture JSON,
    per-fold weight files."""
    models = []
    for i in range(number):
        with open(model_dict + "dnn/model_dnn.json", 'r') as json_f:
            loaded_model_json = json_f.read()
        loaded_model = model_from_json(loaded_model_json)
        loaded_model.load_weights(model_dict + "dnn/model_" + str(i) + ".h5")
        models.append(loaded_model)
    return models
def scoring(models, data):
    """Average the positive-class probability of every model in the ensemble."""
    probas_ = [np.transpose(model.predict(data))[0] for model in models]
    return [np.mean(scores) for scores in zip(*probas_)]
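# scoring() averages positive-class probabilities across the ensemble.
# A sketch with hypothetical per-model outputs for two samples:
#
#   model 0 -> [0.2, 0.9]
#   model 1 -> [0.4, 0.7]
#   scoring -> [0.3, 0.8]   # element-wise mean over models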
def embedding_model(model_dict):
    """Load the pan-binding model and expose an intermediate layer
    (layers[26]) whose output serves as the pMHC embedding."""
    with open(model_dict + 'pan_binding/model_pan_binding.json', 'r') as json_f:
        loaded_model_json = json_f.read()
    pan_ligand = model_from_json(loaded_model_json)
    pan_ligand.load_weights(model_dict + 'pan_binding/model_pan_binding.h5')
    layer_model = Model(inputs=pan_ligand.input, outputs=pan_ligand.layers[26].output)
    return layer_model


# SeqVec (ELMo trained on UniRef50) for embedding CDR sequences, on CPU.
weights = model_dict + 'uniref50_v2/weights.hdf5'
options = model_dict + 'uniref50_v2/options.json'
seqvec = ElmoEmbedder(options, weights, cuda_device=-1)
model_embedding = embedding_model(model_dict)
model_binding = import_binding_model(model_dict, 5)
model_prediction = import_prediction_model(model_dict, 5)

# LSTM autoencoder that compresses the eight CDR embeddings of a TCR
# (file name spelled "Audoencoder" on disk).
with open(model_dict + 'lstm/LSTM_Audoencoder_8.json', 'r') as json_file:
    loaded_model_json = json_file.read()
modelx = model_from_json(loaded_model_json)
modelx.load_weights(model_dict + "lstm/LSTM_Audoencoder_8.h5")
def s2v(seq):
    """Embed one CDR sequence with SeqVec: sum the three ELMo layers and
    average over residues to get a fixed-length vector."""
    embed1 = seqvec.embed_sentence(list(seq))
    protein_embd1 = torch.tensor(embed1).sum(dim=0).mean(dim=0)
    return list(protein_embd1.detach().numpy())
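# Pooling sketch: for the uniref50_v2 SeqVec model, embed_sentence() returns
# an array of shape (3, len(seq), 1024) -- three ELMo layers, one 1024-dim
# vector per residue. Summing over layers and averaging over residues yields
# one fixed 1024-dim vector per CDR, e.g.:
#
#   vec = s2v("CALNPFNAGNMLTF")  # CDR3a from the bundled example data
#   # len(vec) == 1024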
def pred_and_write_metrics_datatable(TCR_file, tmp_folder):
    df_tcrs = pd.read_csv(TCR_file)
    TCRs = df_tcrs.values.tolist()

    print("Loading TCRs information for prediction...")

    ac1, ac2, ac2_5, ac3 = [], [], [], []  # alpha-chain CDR1/CDR2/CDR2.5/CDR3
    bc1, bc2, bc2_5, bc3 = [], [], [], []  # beta-chain CDR1/CDR2/CDR2.5/CDR3
    epi = []  # antigen peptides
    hla = []  # HLA alleles

    L31 = []  # input rows whose V genes resolved successfully
    for out in TCRs:
        # Resolve both V genes first, so a failed lookup cannot leave the
        # per-column lists out of sync.
        try:
            TRAV = cdrs['human']['A'][out[1]]
            TRBV = cdrs['human']['B'][out[3]]
        except Exception as e:
            print(e)
            continue
        ac3.append(out[0])
        bc3.append(out[2])
        ac1.append(TRAV[0].replace('-', ''))
        ac2.append(TRAV[1].replace('-', ''))
        ac2_5.append(TRAV[2].replace('-', ''))
        bc1.append(TRBV[0].replace('-', ''))
        bc2.append(TRBV[1].replace('-', ''))
        bc2_5.append(TRBV[2].replace('-', ''))
        epi.append(out[-2])
        hla.append(out[-1])
        L31.append(out)
    # %%
    # Embed every unique CDR sequence once (SeqVec), and every unique
    # peptide/HLA once (BLOSUM encodings).
    embed_ac1 = {x: s2v(x) for x in set(ac1)}
    embed_ac2 = {x: s2v(x) for x in set(ac2)}
    embed_ac2_5 = {x: s2v(x) for x in set(ac2_5)}
    embed_ac3 = {x: s2v(x) for x in set(ac3)}

    embed_bc1 = {x: s2v(x) for x in set(bc1)}
    embed_bc2 = {x: s2v(x) for x in set(bc2)}
    embed_bc2_5 = {x: s2v(x) for x in set(bc2_5)}
    embed_bc3 = {x: s2v(x) for x in set(bc3)}

    embed_epi = {x: peptide_matrix(x) for x in set(epi)}
    embed_hla = {x: pseq_dict_blosum_matrix[x] for x in set(hla)}

    print("TCRs characterization is in progress...")
    # %%
    # Assemble per-TCR inputs: eight CDR embeddings per receptor, plus the
    # peptide and HLA encodings per pMHC.
    test_tcr, test_pmhc = [], []
    for i in range(len(L31)):
        test_tcr.append([embed_ac1[ac1[i]], embed_ac2[ac2[i]], embed_ac2_5[ac2_5[i]],
                         embed_ac3[ac3[i]], embed_bc1[bc1[i]], embed_bc2[bc2[i]],
                         embed_bc2_5[bc2_5[i]], embed_bc3[bc3[i]]])
        test_pmhc.append([embed_epi[epi[i]], embed_hla[hla[i]]])

    # Compress the eight CDR embeddings with the LSTM autoencoder.
    test_data = np.array(test_tcr)
    test_tcr1 = modelx.predict(test_data)

    # Peptide (26x20) and MHC (34x20) encodings have different shapes, so
    # keep them in two separate arrays rather than one ragged array.
    validation_pep = [pair[0] for pair in test_pmhc]
    validation_mhc = [pair[1] for pair in test_pmhc]
    mp_pairs1 = [np.array(validation_pep), np.array(validation_mhc)]

    # pMHC embedding and binding-affinity scores.
    layer_output_array = model_embedding.predict(mp_pairs1)
    binding_scores = scoring(model_binding, mp_pairs1)

    # Concatenate TCR and pMHC embeddings and score the interaction.
    X_test = np.concatenate((test_tcr1, layer_output_array), axis=1)
    interaction_scores = scoring(model_prediction, X_test)
    # %%
    columns = ['CDR3a', 'CDR3a_V', 'CDR3b', 'CDR3b_V', 'Antigens',
               'HLA_alleles', 'Score_BA', 'Score_interaction']
    rows = [line + [binding_scores[i], interaction_scores[i]]
            for i, line in enumerate(L31)]
    # Build the frame in one shot (DataFrame.append was removed in pandas 2.0).
    binding_predictions = pd.DataFrame(rows, columns=columns)

    if not os.path.isdir(tmp_folder):
        os.makedirs(tmp_folder)

    # Tab-separated output, matching the .tsv extension.
    binding_predictions.to_csv(tmp_folder + '/DeepTR_predictions.tsv',
                               sep='\t', index=False, header=True)

    print("Done! The prediction results have been written to file.")
def main(args):
    tcr_file = args.inputfile
    out_file = args.outfile
    pred_and_write_metrics_datatable(tcr_file, out_file)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="DeepTR: prediction of T cell response from TCR-antigen specificity")
    parser.add_argument('-inf', '--inputfile', type=str,
                        help='CSV file with the TCRs to predict (columns: CDR3a, '
                             'CDR3a_V, CDR3b, CDR3b_V, Antigens, HLA_alleles).')
    parser.add_argument('-out', '--outfile', type=str,
                        help='Output directory for the prediction results.')

    args = parser.parse_args()
    main(args)
# python DeepTR_Prediction.py -inf './data/test_tcr.txt' -out './prediction'
data/test_tcr.txt (new file, 24 lines):
CDR3a,CDR3a_V,CDR3b,CDR3b_V,Antigens,HLA_alleles
CALNPFNAGNMLTF,TRAV12-2*01,CSARVGDTGELFF,TRBV20-1*01,ALSPVIPHI,HLA-A*02:01
CAVGEAGGGNKLTF,TRAV8-3*01,CASSYDSTTGELFF,TRBV4-1*01,ALSPVIPHI,HLA-A*02:01
CALNPFNAGNMLTF,TRAV12-2*01,CSARVGDTGELFF,TRBV20-1*01,ALSPVIPHI,HLA-A*02:01
CAMRYNFNKFYF,TRAV12-3*01,CASRSSLMTGNNANEQYF,TRBV12-4*01,KLGGALQAK,HLA-A*03:01
CAVPGNTGKLIF,TRAV25*01,CASSLTSGNTEAFF,TRBV7-3*01,KLGGALQAK,HLA-A*03:01
CIVRVSYGGATNKLIF,TRAV26-1*01,CASSQEEAGTGELFF,TRBV4-1*01,KLGGALQAK,HLA-A*03:01
CAMKGNQFYF,TRAV12-3*01,CASSRLPGLAGVGNHEQYF,TRBV4-3*01,KLGGALQAK,HLA-A*03:01
CAVDLMKTSYDKVIF,TRAV21*01,CASSSRTGYDGYTF,TRBV28*01,IVTDFSVIK,HLA-A*11:01
CALSEDAGNMLTF,TRAV19*01,CASSWGGGSHYGYTF,TRBV11-2*01,IVTDFSVIK,HLA-A*11:01
CAVDLMKTSYDKVIF,TRAV21*01,CASSLAGAKEAFF,TRBV7-8*01,IVTDFSVIK,HLA-A*11:01
CAVDLMKTSYDKVIF,TRAV21*01,CSANVRQSNTGELFF,TRBV20-1*01,IVTDFSVIK,HLA-A*11:01
CAGLFSGYALNF,TRAV35*01,CASSLEPPGRGFAGYTF,TRBV7-3*01,GILGFVFTL,HLA-B*51:193
CAVNKGYSTLTF,TRAV12-2*01,CASSSSTSGYTDTQYF,TRBV7-8*01,GILGFVFTL,HLA-C*03:04
CAVSQAGTALIF,TRAV1-2*01,CASSHLSGVRSYNEQFF,TRBV3-1*01,GILGFVFTL,HLA-A*01:01
CAVRDWNAGNMLTF,TRAV1-2*01,CASSEASYGATYNEQFF,TRBV6-1*01,ELAGIGILTV,HLA-C*08:02
CAVKDTNAGKSTF,TRAV21*01,CSAREAGVWFNEQFF,TRBV20-1*01,ELAGIGILTV,HLA-B*35:01
CAVGGLVFGNEKLTF,TRAV8-3*01,CASSFSGNTGELFF,TRBV12-3*01,ELAGIGILTV,HLA-B*27:05
CAVTSGDDKIIF,TRAV1-1*01,CASSEGGGDTEAFF,TRBV25-1*01,ELAGIGILTV,HLA-B*07:02
CAGAGGGSQGNLIF,TRAV27*01,CASSFRSADTQYF,TRBV19*01,KLGGALQAK,HLA-C*05:01
CAGAGSQGNLIF,TRAV27*01,CASSIRSAYEQYF,TRBV19*01,KLGGALQAK,HLA-B*44:02
CAASGYDYKLSF,TRAV13-1*01,CASSARSSYEQYF,TRBV19*01,KLGGALQAK,HLA-B*27:05
CALRAGNQFYF,TRAV19*01,CASSLMRGGTYNSPLHF,TRBV13*01,KLGGALQAK,HLA-A*25:01
CAGAQGSSNTGKLIF,TRAV27*01,CASSIRSAYEQYF,TRBV19*01,KLGGALQAK,HLA-A*25:01