main.py
from timeit import default_timer as timer
from Preprocess import Preprocess
from WordEmb import WordEmb
from Corpus import Corpus
from utils import sec2hour_min_sec
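# Hedged sketch, not project code: sec2hour_min_sec is imported from the
# project's utils module and its implementation is not shown here. Judging
# from the timing prints below, it is assumed to format elapsed seconds as
# hours/minutes/seconds, roughly like the disabled reference below.
"""
def sec2hour_min_sec_sketch(seconds):
    #hypothetical stand-in; kept disabled so the real utils version is used
    minutes, sec = divmod(int(seconds), 60)
    hours, minutes = divmod(minutes, 60)
    return "{:02d}h:{:02d}m:{:02d}s".format(hours, minutes, sec)
"""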
### ###
### Interpro domains --> embeddings ###
### ###
### ###
### Setup Interpro Preprocess class ###
### ###
"""
#Local run (uncomment the second data_path for the server run)
data_path = "/home/damian/Documents/L3S/projects"
# data_path = "/data2/melidis" #server
prot_len_file_name = "prot_id_len.tab"
interpro_local_format = True #True for all data sets #False for protein2ipr.dat.gz (interpro_ftp)
#Domain selection modes (a hedged helper sketch follows this block):
#overlap -> with_overlap == True (the other flags are ignored)
#no_overlap -> with_overlap == False and with_redundant == False
#no_redundant -> with_overlap == False and with_redundant == True
with_overlap = False
with_redundant = False
with_gap = True
preprocess_protein2ipr = Preprocess(data_path, prot_len_file_name, with_overlap, with_redundant, with_gap, interpro_local_format)
"""
### ###
### Preprocess protein2ipr to ###
### get domain architecture corpus ###
### ###
"""
# print("=====")
# print("1) Parsing protein2ipr -> protein_id tab domains")
### ###
# Processing Interpro data to learn embeddings from
### ###
#input: place the tabular .gz file in data_path before running
#output: .tab file with each protein id and its domains
#uncomment exactly one file_name below; it must match the interpro_local_format flag set above
#file_name = "protein2ipr.dat.gz" ##protein2ipr (interpro ftp)##
#file_name = "prot6.tab.gz" ##TEST##
batch_num_lines = 1000000
batch_out_prot = 10000
#credits: https://stackoverflow.com/questions/7370801/measure-time-elapsed-in-python
time_start = timer()
preprocess_protein2ipr.parse_prot2in(file_name, batch_num_lines, batch_out_prot)
time_end = timer()
print("Elapsed CPU time for parsing: {}.".format(sec2hour_min_sec(time_end-time_start)))
print("\n=====")
print("2) Parse id_domains.tab -> domains_corpus.txt")
file_in_name = "id_domains_no_overlap_gap.tab"#"id_domains_no_redundant_gap.tab"#"id_domains_overlap_gap.tab"
file_corpus_name = "domains_corpus_no_overlap_gap.txt"#"domains_corpus_overlap_gap.txt"#"domains_corpus_no_redundant_gap.txt"#
batch_num_lines = 100000
time_start = timer()
preprocess_protein2ipr.create_domains_corpus(file_in_name,file_corpus_name,batch_num_lines)
time_end = timer()
print("Elapsed CPU time for creating corpus: {}.".format(sec2hour_min_sec(time_end-time_start)))
print("\n=====")
print("3) Plot corpus histogram")
file_in = file_corpus_name#"domains_corpus_no_redundant_gap.txt"#"domains_corpus_no_overlap_gap.txt"#"domains_corpus_example.txt"#
domains_corpus = Corpus(data_path,file_in)
domains_corpus.plot_line_histogram()
"""
"""
###
# Train word2vec embeddings using corpus
###
print("\n=====")
print("4) Train domains_copurs.txt -> dom2vec.txt")
file_in = "domains_corpus_no_redundant_gap.txt"#file_corpus_name#"domains_corpus_no_overlap.txt"#"domains_corpus_prep1.txt"
#Train step-wise (an equivalent gensim-style sketch follows this block)
### Word2vec Parameters ###
window = 10
use_cbow = 0
use_hierarchical_soft_max = 0
vec_dim = 50
cores = 8
epochs_step = 5
max_epochs = 50
### ###
time_start = timer()
dom2Vec = WordEmb(data_path,file_in)
dom2Vec.set_up(window,use_cbow,use_hierarchical_soft_max,vec_dim,cores)
dom2Vec.build_voc()
dom2Vec.train_stepwise(max_epochs,epochs_step)
time_end = timer()
print("Elapsed CPU time for initializing and training the model: {}.".format(sec2hour_min_sec(time_end-time_start)))
"""
### ###
### Extract domains from proteins ###
### in prediction data sets ###
### ###
"""
### ###
# Processing prediction data sets
# Get available domains for proteins
### ###
data_path = "/home/damian/Documents/L3S/projects"
prot_len_file_name = "prot_id_len.tab"
interpro_local_format = True #True for all data sets #False for protein2ipr.dat.gz (interpro_ftp)
#Domain selection modes (same convention as the setup block above):
#overlap -> with_overlap == True (the other flags are ignored)
#no_overlap -> with_overlap == False and with_redundant == False
#no_redundant -> with_overlap == False and with_redundant == True
with_overlap = True
with_redundant = False
with_gap = False
preprocess_domains4datasets = Preprocess(data_path, prot_len_file_name, with_overlap, with_redundant, with_gap, interpro_local_format)
print("======")
print("Prediction data sets A) map found domains to proteins of data set")
#file_name = "deeploc_remaining_seq.fasta_new.tsv.gz" ##DeepLoc##
file_name = "SP.715.rr.fasta.tsv.gz" #"nuc.1214.rr.fasta.tsv.gz" #"mTP.371.rr.fasta.tsv.gz" #"cyt.438.rr.fasta.tsv.gz" ##targetP##
# file_name = "targetp_remaining_seq_dataset_pos.fasta.tsv.gz" #"targetp_remaining_seq_dataset_pos.fasta.tsv.gz" #"targetp_remaining_seq_dataset_hard.fasta.tsv.gz" ##Toxin##
# file_name = "new_dataset_all.fasta.tsv.gz" ##NEW##
batch_num_lines = 1000000
batch_out_prot = 10000
#credits: https://stackoverflow.com/questions/7370801/measure-time-elapsed-in-python
time_start = timer()
preprocess_domains4datasets.parse_prot2in(file_name, batch_num_lines, batch_out_prot)
time_end = timer()
print("Elapsed CPU time for parsing: {}.".format(sec2hour_min_sec(time_end-time_start)))
### ###
# Processing prediction data sets
# Get unknown full-length domain for proteins without found domains
### ###
print("\n=====")
print("Prediction data sets B) parsing remaining fasta -> default domains tabular file")
## Input: fasta file and data_id_format
## all following fasta files should be placed in the data_path specified in the constructor of preprocess_domains4datasets
## Output: .tab file with 3 columns, protein ids, its domains and their evidence
## move this output to the respective dataset preprocessing subfolder
# fasta_name = "deeploc_remaining_seq2.fasta" ## DeepLoc ##
# fasta_name = "targetp_remaining_seq_Cytosole.fasta" #"targetp_remaining_seq_PathwaySignal.fasta" #"targetp_remaining_seq_Nuclear.fasta" #"targetp_remaining_seq_Mitochondrial.fasta" ##targetP##
# fasta_name = "targetp_remaining_seq_targetp_remaining_seq_dataset_pos.fasta" #"targetp_remaining_seq_targetp_remaining_seq_dataset_hard.fasta" #
# fasta_name = "new_remaining_seq.fasta" ## NEW ##
fasta_name = "targetp_remaining_seq_no_overlap.fasta" ## TargetP non overlapping ##
data_id_format = 1 #0 for DeepLoc and NEW #1 for TargetP #2 for Toxin
time_start = timer()
preprocess_domains4datasets.fasta2default_domains(fasta_name, data_id_format)
time_end = timer()
print("Elapsed CPU time for parsing: {}".format(sec2hour_min_sec(time_end-time_start)))
"""
print("=== * ===")
print("== *** ==")