-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path5_BuildInteractome_BinaryPPIwithExpansion.py
352 lines (270 loc) · 12.7 KB
/
5_BuildInteractome_BinaryPPIwithExpansion.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
#!/usr/bin/python
# manojmw
# 27 Jan, 2022
import sys, argparse
import logging
import gzip
###########################################################
# Parses the tab-seperated output files
# produced by the Interaction_parser.py (No headers)
#
# Required columns are:
# - Protein A UniProt PrimAC (column-1)
# - Protein B UniProt PrimAC (column-2)
# - Interaction Detection Method (column-3)
# - Pubmed Identifier (column-4)
# - Interaction type (column-5)
#
# Processes it by filtering based on Interaction Detection Method
# Final filtering - Each interaction has at least 2 experiments
# at least one of the experiments proved by any binary interaction method
#
# Returns a list with 5 items (in each sublist):
# - Protein A UniProt PrimAC,
# - Protein B UniProt PrimAC,
# - Publication_Count,
# - PMID(s) and
# - Experiment_count
def UniProtInteractome(inExpFile):
logging.info("Starting to run...")
# Dictionary for storing Interacting
# Proteins and Pubmed Identifier
PPI_PMID_dict = {}
# Dictionary for storing Interacting
# Proteins and Interaction Detection Method
PPI_IntDetMethod_dict = {}
# List of User input PPI interaction experiments file(s)
PPIExpFiles = inExpFile
# there can be multiple files
for file in PPIExpFiles:
PPIExpFile = open(file)
# Data lines
for line in PPIExpFile:
line = line.rstrip('\n')
ExpPPI_fields = line.split('\t')
IntDetMethod = ExpPPI_fields[2]
# MI:0254 -> genetic interference
# MI:0686 -> unspecified method
# Filtering out genetic interference & unspecified
# Interaction Detection Method
if IntDetMethod not in ['MI:0254', 'MI:0686']:
# ExpPPI_fields[0] -> Protein_A_UniprotPrimAC
# ExpPPI_fields[1] -> Protein_B_UniprotPrimAC
Interactors = ExpPPI_fields[0] + '_' + ExpPPI_fields[1]
# Key -> UniProt PrimAC of Protein A & B joined together by an '_'
# Value -> Pubmed Identifier (PMID) - ExpPPI_fields[3]
(Int_key, PMIDs) = (Interactors, ExpPPI_fields[3])
# Check if the Key exists in PPI_PMID_dict
# If yes, then store the values (PMIDs) as a list
if PPI_PMID_dict.get(Int_key, False):
# Avoiding duplicate PMIDs
if PMIDs not in PPI_PMID_dict[Int_key]:
PPI_PMID_dict[Int_key].append(PMIDs)
else:
PPI_PMID_dict[Int_key] = [PMIDs]
# Key -> UniProt PrimAC of Protein A & B joined together by an '_'
# Value -> Interaction Detection Method - ExpPPI_fields[2]
(Int_key, IntDetMeth) = (Interactors, ExpPPI_fields[2])
if PPI_IntDetMethod_dict.get(Int_key, False):
PPI_IntDetMethod_dict[Int_key].append(IntDetMeth)
else:
PPI_IntDetMethod_dict[Int_key] = [IntDetMeth]
# Closing the file
PPIExpFile.close()
# Initializing output list
Uniprot_Interactome_list = []
# Processing the dictionaries and returning a list
# Since both the dictionaries have the same keys,
# We can use the keys from PPI_PMID_dict to iterate through PPI_IntDetMethod_dict
for Int_key in PPI_PMID_dict:
Proteins = Int_key.split('_')
Protein_A = Proteins[0]
Protein_B = Proteins[1]
Pubmed_Identifier = ', '.join(PPI_PMID_dict[Int_key])
PMID_count = len(PPI_PMID_dict[Int_key])
Exp_count = len(PPI_IntDetMethod_dict[Int_key])
# Final Quality Control
# Each interaction proven by at least 2 experiments
if (Exp_count >= 2):
interaction_out_line = [Protein_A, Protein_B, str(PMID_count), Pubmed_Identifier, str(Exp_count)]
Uniprot_Interactome_list.append(interaction_out_line)
return Uniprot_Interactome_list
###########################################################
# Parses tab-seperated canonical transcripts file
# Required columns are: 'ENSG' and 'GENE' (can be in any order,
# but they MUST exist)
#
# Returns a dictionary:
# - Key -> ENSG
# - Value -> Gene
def ENSG_Gene(inCanonicalFile):
# Dictionary for storing ENSG
# and Gene data
ENSG_Gene_dict = {}
# Opening canonical transcript file (gzip or non-gzip)
try:
if inCanonicalFile.endswith('.gz'):
Canonical_File = gzip.open(inCanonicalFile, 'rt')
else:
Canonical_File = open(inCanonicalFile)
except IOError:
sys.exit("Error: Failed to read the Canonical transcript file: %s" % inCanonicalFile)
# Grabbing the header line
Canonical_header_line = Canonical_File.readline()
Canonical_header_fields = Canonical_header_line.split('\t')
# Check the column headers and grab indexes of our columns of interest
(ENSG_index, Gene_index) = (-1,-1)
for i in range(len(Canonical_header_fields)):
if Canonical_header_fields[i] == 'ENSG':
ENSG_index = i
elif Canonical_header_fields[i] == 'GENE':
Gene_index = i
if not ENSG_index >= 0:
sys.exit("Error: Missing required column title 'ENSG' in the file: %s \n" % inCanonicalFile)
elif not Gene_index >= 0:
sys.exit("Error: Missing required column title 'GENE' in the file: %s \n" % inCanonicalFile)
# else grabbed the required column indexes -> PROCEED
# Data lines
for line in Canonical_File:
line = line.rstrip('\n')
CanonicalTranscripts_fields = line.split('\t')
# Key -> ENSG
# Value -> Gene
ENSG_Gene_dict[CanonicalTranscripts_fields[ENSG_index]] = CanonicalTranscripts_fields[Gene_index]
# Closing the file
Canonical_File.close()
return ENSG_Gene_dict
###########################################################
# Parses the tab-seperated UniProt Primary Accession file
# produced by Uniprot_parser.py
#
# Required columns are: 'Primary_AC' and 'ENSGs' (can be in any order,
# but they MUST exist)
#
# Parses the dictionary ENSG_Gene_dict
# returned by the function ENSG_Gene
# Maps UniProt Primary Accession to ENSG
#
# Returns a dictionary:
# - Key: UniProt Primary Accession
# - Value: Corresponding ENSG
def Uniprot_ENSG(inUniProt, ENSG_Gene_dict):
# Dictionary for storing UniProt Primary
# Accession and ENSG data
Uniprot_ENSG_dict = {}
Uniprot_File = open(inUniProt)
# Grabbing the header line
Uniprot_header = Uniprot_File.readline()
Uniprot_header = Uniprot_header.rstrip('\n')
Uniprot_header_fields = Uniprot_header.split('\t')
# Check the column header and grab indexes of our columns of interest
(UniProt_PrimAC_index, ENSG_index) = (-1, -1)
for i in range(len(Uniprot_header_fields)):
if Uniprot_header_fields[i] == 'Primary_AC':
UniProt_PrimAC_index = i
elif Uniprot_header_fields[i] == 'ENSGs':
ENSG_index = i
# Sanity check
if not UniProt_PrimAC_index >= 0:
sys.exit("Error: Missing required column title 'Primary_AC' in the file: %s \n" % inUniProt)
elif not ENSG_index >= 0:
sys.exit("Error: Missing required column title 'ENSG' in the file: %s \n" % inUniProt)
# else grabbed the required column indices -> PROCEED
# Counter for canonical ENSGs
canonical_ENSG_count = 0
# Counter for accessions with no canonical human ENSG
no_CanonicalHumanENSG = 0
# Counter for accessions with single canonical human ENSG
single_CanonicalHumanENSG = 0
# Counter for accessions with multiple canonical human ENSGs
multiple_CanonicalHumanENSG = 0
# Data lines
for line in Uniprot_File:
line = line.rstrip('\n')
Uniprot_fields = line.split('\t')
# ENSG column - This is a single string containing comma-seperated ENSGs
# So we split it into a list that can be accessed later
UniProt_ENSGs = Uniprot_fields[ENSG_index].split(',')
# List to store Human ENSG(s)
# found in the canonical transcripts file
canonical_human_ENSGs = []
for ENSG in UniProt_ENSGs:
if ENSG in ENSG_Gene_dict.keys():
canonical_human_ENSGs.append(ENSG)
canonical_ENSG_count += 1
# Key -> Uniprot Primary accession
# Value -> Canonical_human_ENSG
if len(canonical_human_ENSGs) == 0:
no_CanonicalHumanENSG += 1
elif len(canonical_human_ENSGs) == 1:
Uniprot_ENSG_dict[Uniprot_fields[UniProt_PrimAC_index]] = ''.join(canonical_human_ENSGs)
single_CanonicalHumanENSG += 1
# Also counting accessions with multiple ENSGs
elif len(canonical_human_ENSGs) > 1:
multiple_CanonicalHumanENSG += 1
# logging.debug("Total no. of Human UniProt Primary Accessions: %d " % Count_HumanUniprotPrimAC)
# logging.debug("Total no. of ENSGs in the Canonical Transcripts file: %d " % len(ENSG_Gene_dict.keys()))
# logging.debug("Total no. of ENSGs in the UniProt Primary Accession file: %d " % canonical_ENSG_count)
# logging.debug("No. of UniProt primary accessions without canonical human ENSG: %d " % no_CanonicalHumanENSG)
# logging.debug("No. of UniProt primary accessions with single canonical human ENSG: %d " % single_CanonicalHumanENSG)
# logging.debug("No. of UniProt primary accessions with multiple canonical human ENSGs: %d " % multiple_CanonicalHumanENSG)
# Closing the file
Uniprot_File.close()
return Uniprot_ENSG_dict
###########################################################
# Parses the list - [Uniprot_Interactome_list]
# returned by the function: UniProtInteractome
# Also parses the dictionary - {Uniprot_ENSG_dict} returned by the function: Uniprot_ENSG
# and {ENSG_Gene_dict} returned by the function: ENSG_Gene
#
# Maps the UniProt Primary Accessions to ENSG
#
# Prints to STDOUT in .tsv format
# Output consists of 2 columns:
# - ENSG of Protein A
# - ENSG of Protein B
def Interactome_Uniprot2ENSG(args):
# Calling the functions
Uniprot_Interactome_list = UniProtInteractome(args.inExpFile)
ENSG_Gene_dict = ENSG_Gene(args.inCanonicalFile)
Uniprot_ENSG_dict = Uniprot_ENSG(args.inUniProt, ENSG_Gene_dict)
# Counter for UniProt Primary Accessions of proteins not mapping to ENSG
lost_Interaction = 0
for data in Uniprot_Interactome_list:
if data[0] in Uniprot_ENSG_dict.keys() and data[1] in Uniprot_ENSG_dict.keys():
ENSG_Interactome_out = (Uniprot_ENSG_dict.get(data[0]), Uniprot_ENSG_dict.get(data[1]))
print('\t'.join(ENSG_Interactome_out))
else:
lost_Interaction += 1
# logging.debug("Total no. of Interactions lost: %d " % lost_Interaction)
logging.info("All done, completed succesfully!")
return
###########################################################
# Taking and handling command-line arguments
def main():
file_parser = argparse.ArgumentParser(description =
"""
---------------------------------------------------------------------------------------------------------
Program: Parses the output file(s) produced by the interaction_parser.py to produce a high-quality
Interactome (containing both true binary PPIs and expansion), maps the UniProt Primary Accession
of interacting proteins to ENSG using the Canonical transcripts file and prints to STDOUT
---------------------------------------------------------------------------------------------------------
The output (High-quality Human Interactome) consists of two columns in .tsv format:
-> ENSG of Protein A
-> ENSG of Protein B
---------------------------------------------------------------------------------------------------------
Arguments [defaults] -> Can be abbreviated to shortest unambiguous prefixes
""",
formatter_class = argparse.RawDescriptionHelpFormatter)
required = file_parser.add_argument_group('Required arguments')
optional = file_parser.add_argument_group('Optional arguments')
required.add_argument('--inExpFile', metavar = "Input File", dest = "inExpFile", nargs = '+', help = 'Output files produced by interaction_parser.py', required = True)
required.add_argument('--inUniprot', metavar = "Input File", dest = "inUniProt", help = 'Uniprot Primary Accession File generated by the uniprot parser', required = True)
required.add_argument('--inCanonicalFile', metavar = "Input File", dest = "inCanonicalFile", help = 'Canonical Transcripts file (.gz or non .gz)', required = True)
args = file_parser.parse_args()
Interactome_Uniprot2ENSG(args)
if __name__ == "__main__":
# Logging to Standard Error
Log_Format = "%(levelname)s - %(asctime)s - %(message)s \n"
logging.basicConfig(stream = sys.stderr, format = Log_Format, level = logging.DEBUG)
main()