-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathresidue_clustering_determinator_ALL_proteins.py
565 lines (407 loc) · 24.6 KB
/
residue_clustering_determinator_ALL_proteins.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
#!/usr/bin/env python
#*******************************************************************************
##################################
# USER ADJUSTABLE VALUES #
##################################
#
amino_acid_to_examine_full_name = "comboKQE" # options are: 'lysine', 'glutamine', and 'glutamic_acid' for single. Or 'comboKQE'
protein_collection_name = "ALL_PROTEINS"
Window_Size = 50
step_to_move_window = 10
name_of_outputfileA = ("rankings_of_" + amino_acid_to_examine_full_name + "_raw_density_for_"+ protein_collection_name + str(Window_Size) + "_" + str(step_to_move_window )+ ".tsv")
name_of_outputfileB = ("rankings_of_" + amino_acid_to_examine_full_name + "_clustering_for_"+ protein_collection_name + "_" + str(Window_Size) + "_" + str(step_to_move_window )+ ".tsv")
name_of_outputfileC = ("rankings_of_" + amino_acid_to_examine_full_name + "_len_normalized_density_for_"+ protein_collection_name + "_" + str(Window_Size) + "_" + str(step_to_move_window )+ ".tsv")
name_of_outputfileD = ("rankings_of_" + amino_acid_to_examine_full_name + "_len_normalized_clustering_for_"+ protein_collection_name + "_" + str(Window_Size) + "_" + str(step_to_move_window )+ ".tsv")
name_of_outputfileE = ("highest_clusters_of_" + amino_acid_to_examine_full_name + "_per_gene_for_"+ protein_collection_name + "_" + str(Window_Size) + "_" + str(step_to_move_window )+ ".tsv")
adjancency_bonus_factor = 0.016999 #0.005799 from when developing but then worked out higher value while examining nuMRP proteins
#
#*******************************************************************************
#**********************END USER ADJUSTABLE VARIABLES****************************
# options are: 'lysine', 'glutamine', and 'glutamic_acid' for single. Or 'comboKQE'
analyze_triple_amino_acid_clustering_combo = False
if amino_acid_to_examine_full_name == "lysine":
amino_acid_to_examine = "K"
elif amino_acid_to_examine_full_name == "glutamine":
amino_acid_to_examine = "Q"
elif amino_acid_to_examine_full_name == "glutamic_acid":
amino_acid_to_examine = "E"
elif amino_acid_to_examine_full_name == "comboKQE":
amino_acid_to_examine = "J"
# from http://virology.wisc.edu/acp/Classes/DropFolders/Drop660_lectures/SingleLetterCode.html
# references to IUPAC and IUBMB and
# says no amino acids or ambiguity indicators assigned to single letters
# 'J', 'O', or 'U' and so I should be safe replacing all 'K', 'Q', and 'E's
# with one of those and counting those to assess clustering
analyze_triple_amino_acid_clustering_combo = True
###---------------------------HELPER FUNCTIONS---------------------------------###
def density_calc (sequence_string, residue):
'''
takes a string representation of a protein sequence and returns the density
of a residue
'''
residue_density = float(sequence_string.count(residue)) / len(sequence_string)
return residue_density
def adjancency_bonus_calc (sequence_string, residue):
'''
takes a string representation of a protein sequence and test if it contains
occurences of the residue of interest adjacent to the same residuec and then
returns the bonus score to assign to that string based on the number of
occurences.
'''
adjancency_bonus = 0.0
for idx,amino_acid in enumerate(sequence_string):
# Make sure there is a previous residue then only see about adding bonus
# if current amino_acid is same as residue of interest.
if idx > 0 and (amino_acid == residue) :
if sequence_string[idx-1] == residue:
adjancency_bonus += adjancency_bonus_factor
#plus if there is a residue before the previous one, add a
# add a bonus if that is also residue of interest to compound
# detection of chains of residue_density
if idx > 1 and sequence_string[idx-2] == residue:
adjancency_bonus += adjancency_bonus_factor
if idx > 2 and sequence_string[idx-3] == residue:
adjancency_bonus += adjancency_bonus_factor
if idx > 3 and sequence_string[idx-4] == residue:
adjancency_bonus += adjancency_bonus_factor
if idx > 5 and sequence_string[idx-6] == residue:
adjancency_bonus += adjancency_bonus_factor
if idx > 6 and sequence_string[idx-7] == residue:
adjancency_bonus += adjancency_bonus_factor
if idx > 7 and sequence_string[idx-8] == residue:
adjancency_bonus += adjancency_bonus_factor/2
return adjancency_bonus
def replace_KQE_with_dummy_amino_acid(text):
'''
Function takes text and replaces all occurences of 'K', 'Q', or 'E' with
'J'.
The choice of 'J' comes from fact
http://virology.wisc.edu/acp/Classes/DropFolders/Drop660_lectures/SingleLetterCode.html
which references to IUPAC and IUBMB says no amino acids or ambiguity
indicators assigned to single letters 'J', 'O', or 'U' and so I should be
safe replacing all 'K', 'Q', and 'E's
with one of those and counting those to assess clustering
The main portion of the function, i.e., replacing multiple characters,
was developed using information on page
http://stackoverflow.com/questions/3411771/multiple-character-replace-with-python
where they report the timing of various approaches with this (ba) as one of
fastest and most readable in Hugo's answer
'''
chars = "KQE"
for char in chars:
if char in text:
text = text.replace(char, "J")
return text
def generate_density_reportA (table_data):
'''
This function sorts the list of data contents based on density score
and outputs a tsv table with the results with labeled headers.
The generated outout file should be able to be opened in
Google Spreadsheets or Excel.
'''
# initialize output file stream
output_file = open(name_of_outputfileA, "w")
# generate the sorted lists to be used
sorted_on_density = sorted(
table_data, key=lambda table_data: table_data[3], reverse=True)
# generate the output
# for excel I find I can control the number of empty cells on left with tab,
# but once the text starts, you'll need an extra one for putting in exmpty
# cells because after text it just signals go to next cell
data_text = "\t\tsorted on density\n"
#write data row
output_file.write(data_text + '\n')
data_text = "ID\tsys\tpI\tdensity\tsequence of fragment analyzed"
output_file.write(data_text + '\n')
#write other rows of data
for data_row in sorted_on_density:
# CODE BEING USED TO POPULATE TABLE ROWS:
# table_data.append([row["symbol"], current_protein_gene, row["proteins.pI"], density_score, density_with_adjancency_bonus, length_normalized_density_score, length_normalized_density_and_adjancency, sequence_window])
# Added check for `None` because I was seeing error error `TypeError:
# unsupported operand type(s) for +: 'NoneType' and 'str'` and when I
# set to skip those, only ended up with 1141 and not 1208 expexted
if data_row[0] == None:
# originally was just using the alternate, which I had as
# the gene systematic name, when no gene)symbol present
# but some gene_symbols/name aliases overlap and cause
# problems if you try to go back into yeastmine, so I added
# systematic name to all lines.
data_text = "None\t" + data_row[1] + "\t" + str(data_row[2]) + "\t" + str(data_row[3]) + "\t" + str(data_row[7])
else:
# use the standard name, which was stored at index zero
data_text = data_row[0] + "\t" + data_row[1] + "\t" + str(data_row[2]) + "\t" + str(data_row[3]) + "\t" + str(data_row[7])
#write data row
output_file.write(data_text + '\n')
# close output file stream
output_file.close()
def generate_density_with_adjancency_bonus_reportB (table_data):
'''
This function sorts the list of data contents based on density score + bonus
for being adjacent another occurence of the residue of interest and outputs
a tsv table with the results with labeled headers.
The generated outout file should be able to be opened in
Google Spreadsheets or Excel.
'''
# initialize output file stream
output_file = open(name_of_outputfileB, "w")
# generate the sorted lists to be used
sorted_on_density_with_adjancency_bonus = sorted(
table_data, key=lambda table_data: table_data[4], reverse=True)
# generate the output
# for excel I find I can control the number of empty cells on left with tab,
# but once the text starts, you'll need an extra one for putting in exmpty
# cells because after text it just signals go to next cell
data_text = "\t\tsorted on density + adjancency bonus\n"
#write data row
output_file.write(data_text + '\n')
data_text = "ID\tsys\tpI\tdensity+adjancency bonus\tsequence of fragment analyzed"
output_file.write(data_text + '\n')
#write other rows of data
for data_row in sorted_on_density_with_adjancency_bonus:
# CODE BEING USED TO POPULATE TABLE ROWS:
# table_data.append([row["symbol"], current_protein_gene, row["proteins.pI"], density_score, density_with_adjancency_bonus, length_normalized_density_score, length_normalized_density_and_adjancency, sequence_window])
# Added check for `None` because I was seeing error error `TypeError:
# unsupported operand type(s) for +: 'NoneType' and 'str'` and when I
# set to skip those, only ended up with 1141 and not 1208 expexted
if data_row[0] == None:
# originally was just using the alternate, which I had as
# the gene systematic name, when no gene)symbol present
# but some gene_symbols/name aliases overlap and cause
# problems if you try to go back into yeastmine, so I added
# systematic name to all lines.
data_text = "None\t" + data_row[1] + "\t" + str(data_row[2]) + "\t" + str(data_row[4]) + "\t" + str(data_row[7])
else:
# use the standard name, which was stored at index zero
data_text = data_row[0] + "\t" + data_row[1] + "\t" + str(data_row[2]) + "\t" + str(data_row[4]) + "\t" + str(data_row[7])
#write data row
output_file.write(data_text + '\n')
# close output file stream
output_file.close()
def generate_len_normalized_density_reportC (table_data):
'''
This function sorts the list of data contents based on length-normalized
density and outputs a tsv table with the results with labeled headers.
The generated outout file should be able to be opened in
Google Spreadsheets or Excel.
'''
# initialize output file stream
output_file = open(name_of_outputfileC, "w")
# generate the sorted lists to be used
sorted_on_len_normalized_density = sorted(
table_data, key=lambda table_data: table_data[5], reverse=True)
# generate the output
# for excel I find I can control the number of empty cells on left with tab,
# but once the text starts, you'll need an extra one for putting in exmpty
# cells because after text it just signals go to next cell
data_text = "\t\tsorted on length-normalized density\n"
#write data row
output_file.write(data_text + '\n')
data_text = "ID\tsys\tpI\tlength-normalized density\tsequence of fragment analyzed"
output_file.write(data_text + '\n')
#write other rows of data
for data_row in sorted_on_len_normalized_density:
# CODE BEING USED TO POPULATE TABLE ROWS:
# table_data.append([row["symbol"], current_protein_gene, row["proteins.pI"], density_score, density_with_adjancency_bonus, length_normalized_density_score, length_normalized_density_and_adjancency, sequence_window])
# Added check for `None` because I was seeing error error `TypeError:
# unsupported operand type(s) for +: 'NoneType' and 'str'` and when I
# set to skip those, only ended up with 1141 and not 1208 expexted
if data_row[0] == None:
# originally was just using the alternate, which I had as
# the gene systematic name, when no gene)symbol present
# but some gene_symbols/name aliases overlap and cause
# problems if you try to go back into yeastmine, so I added
# systematic name to all lines.
data_text = "None\t" + data_row[1] + "\t" + str(data_row[2]) + "\t" + str(data_row[5]) + "\t" + str(data_row[7])
else:
# use the standard name, which was stored at index zero
data_text = data_row[0] + "\t" + data_row[1] + "\t" + str(data_row[2]) + "\t" + str(data_row[5]) + "\t" + str(data_row[7])
#write data row
output_file.write(data_text + '\n')
# close output file stream
output_file.close()
def generate_len_normalized_density_with_adjacency_bonus_reportD (table_data):
'''
This function sorts the list of data contents based on length-normalized
density + bonus for being adjacent another occurence of the residue of
interest and outputs a tsv table with the results with labeled headers.
The generated outout file should be able to be opened in
Google Spreadsheets or Excel.
'''
# initialize output file stream
output_file = open(name_of_outputfileD, "w")
# generate the sorted lists to be used
sorted_on_len_normalized_density_with_adjancency_bonus = sorted(
table_data, key=lambda table_data: table_data[6], reverse=True)
# generate the output
# for excel I find I can control the number of empty cells on left with tab,
# but once the text starts, you'll need an extra one for putting in exmpty
# cells because after text it just signals go to next cell
data_text = "\t\tsorted on length-normalized density + adjancency bonus\n"
#write data row
output_file.write(data_text + '\n')
data_text = "ID\tsys\tpI\tlength-normalized density + adjancency bonus\tsequence of fragment analyzed"
output_file.write(data_text + '\n')
#write other rows of data
for data_row in sorted_on_len_normalized_density_with_adjancency_bonus :
# CODE BEING USED TO POPULATE TABLE ROWS:
# table_data.append([row["symbol"], current_protein_gene, row["proteins.pI"], density_score, density_with_adjancency_bonus, length_normalized_density_score, length_normalized_density_and_adjancency, sequence_window])
# Added check for `None` because I was seeing error error `TypeError:
# unsupported operand type(s) for +: 'NoneType' and 'str'` and when I
# set to skip those, only ended up with 1141 and not 1208 expexted
if data_row[0] == None:
# originally was just using the alternate, which I had as
# the gene systematic name, when no gene)symbol present
# but some gene_symbols/name aliases overlap and cause
# problems if you try to go back into yeastmine, so I added
# systematic name to all lines.
data_text = "None\t" + data_row[1] + "\t" + str(data_row[2]) + "\t" + str(data_row[6]) + "\t" + str(data_row[7])
else:
# use the standard name, which was stored at index zero
data_text = data_row[0] + "\t" + data_row[1] + "\t" + str(data_row[2]) + "\t" + str(data_row[6]) + "\t" + str(data_row[7])
#write data row
output_file.write(data_text + '\n')
# close output file stream
output_file.close()
def generate_richest_chunks_per_gene_reportE (table_data):
'''
This function collects the best scoring fragment for each gene using the
combined length-normalized density scores and adjancency bonus score
and outputs a tsv table with the results with labeled headers.
The generated outout file should be able to be opened in
Google Spreadsheets or Excel.
'''
from collections import defaultdict
richest_fragment_dict = defaultdict(list) # see http://ludovf.net/blog/python-collections-defaultdict/
# initialize output file stream
output_file = open(name_of_outputfileE, "w")
# generate the dictionary to be used
# this will be a dictionary of the data lines for the highest score for
# each gene. They keys will be the protein_gene_name.
for i in table_data:
#check if anything listed already and if so whether lower or better score.
# if nothing is listed copy it in.
if len(richest_fragment_dict[i[1]]):
if (i[6] > richest_fragment_dict[i[1]][6]):
richest_fragment_dict[i[1]] = i
else:
richest_fragment_dict[i[1]] = i
# Convert dictionary with scores and information to a list to allow sorting the data.
# (see Akseli Palen's answer at http://stackoverflow.com/questions/1679384/converting-python-dictionary-to-list )
data_list = richest_fragment_dict.values()
# sort the data - want highest to lowest combined length-normalized density
# scores and adjancency bonus score
from operator import itemgetter
sorted_data_list = sorted(data_list, key=itemgetter(6), reverse=True)
# generate the output
# for excel I find I can control the number of empty cells on left with tab,
# but once the text starts, you'll need an extra one for putting in empty
# cells because after text it just signals go to next cell
data_text = "\t\tbest combined length-normalized density + adjancency bonus score for each gene\n"
#write data row
output_file.write(data_text + '\n')
data_text = "ID\tsys\tpI\tlength-normalized density + adjancency bonus\tsequence of fragment analyzed"
output_file.write(data_text + '\n')
#write other rows of data
for gene in sorted_data_list:
# CODE BEING USED TO POPULATE TABLE ROWS:
# table_data.append([row["symbol"], current_protein_gene, row["proteins.pI"], density_score, density_with_adjancency_bonus, length_normalized_density_score, length_normalized_density_and_adjancency, sequence_window])
# Added check for `None` because I was seeing error error `TypeError:
# unsupported operand type(s) for +: 'NoneType' and 'str'` and when I
# set to skip those, only ended up with 1141 and not 1208 expexted
if gene[0] == None:
# originally was just using the alternate, which I had as
# the gene systematic name, when no gene)symbol present
# but some gene_symbols/name aliases overlap and cause
# problems if you try to go back into yeastmine, so I added
# systematic name to all lines.
data_text = "None\t" + gene[1] + "\t" + str(gene[2]) + "\t" + str(gene[6]) + "\t" + str(gene[7])
else:
# use the standard name, which was stored at index zero
data_text = gene[0] + "\t" + gene[1] + "\t" + str(gene[2]) + "\t" + str(gene[6]) + "\t" + str(gene[7])
#write data row
output_file.write(data_text + '\n')
# close output file stream
output_file.close()
###--------------------------END OF HELPER FUNCTIONS---------------------------###
###-----------------Actual Main function of script---------------------------###
##################################################################
#############START OF CODE FROM YEASTMINE########################
#!/usr/bin/env python
# This is an automatically generated script to run your query
# to use it you will require the intermine python client.
# To install the client, run the following command from a terminal:
#
# sudo easy_install intermine
#
# For further documentation you can visit:
# http://intermine.readthedocs.org/en/latest/web-services/
# The following two lines will be needed in every python script:
from intermine.webservice import Service
service = Service("http://yeastmine.yeastgenome.org/yeastmine/service")
# Get a new query on the class (table) you will be querying:
query = service.new_query("Protein")
# The view specifies the output columns
query.add_view(
"genes.primaryIdentifier", "genes.secondaryIdentifier", "symbol", "length",
"molecularWeight", "pI", "genes.featureType", "genes.sgdAlias",
"genes.description", "sequence.residues"
)
# You can edit the constraint values below
query.add_constraint("genes.featureType", "=", "intein_encoding_region", code = "H")
query.add_constraint("genes.featureType", "=", "blocked_reading_frame", code = "E")
query.add_constraint("genes.qualifier", "!=", "Dubious", code = "B")
query.add_constraint("genes.qualifier", "IS NULL", code = "C")
query.add_constraint("genes.status", "=", "Active", code = "D")
query.add_constraint("genes.featureType", "=", "ORF", code = "F")
query.add_constraint("genes.featureType", "=", "transposable_element_gene", code = "G")
query.add_constraint("organism.name", "=", "Saccharomyces cerevisiae", code = "A")
# Your custom constraint logic is specified with the code below:
query.set_logic("A and (B or C) and D and (F or G or E or H)")
#############END OF CODE FROM YEASTMINE########################
##################################################################
total_gene_count = 0
table_data = []
for row in query.rows():
total_gene_count += 1
current_protein_gene = row["genes.secondaryIdentifier"]
current_sequence = row["sequence.residues"]
# For breaking up into chunks as a leapfrogging window, I used msw's answer on
# http://stackoverflow.com/questions/11636079/split-very-long-character-string-into-smaller-character-blocks-with-character-ov
# Split very long character string into smaller character blocks with character overlap
for i in range(0, len(current_sequence), step_to_move_window):
sequence_window = current_sequence[i:i+Window_Size]
# Karlin et al 2002 cites their earlier work that says, "a 'typical' protein of 400 residues and average composition, a run of an individual amino acid is statistically significant (at the 0.1% significance level) if it is five or more residues long." So I picked a fuzzy number two above that because stop codon represented in text sequence as asterisk and to allow cosideration of some short sequences towards end of proteins. Also this allows consideration of some of sequences where the break of 10 perhaps falls arbitraily less than optimally given the step of 10 amino acids (vs one or some other low number) I used to limit computation and redundancy.
if len(sequence_window) > 6:
if analyze_triple_amino_acid_clustering_combo:
unsubstituted_seq_window = sequence_window
sequence_window = replace_KQE_with_dummy_amino_acid(sequence_window)
density_score = density_calc (sequence_window, amino_acid_to_examine)
adjancency_bonus = adjancency_bonus_calc (sequence_window, amino_acid_to_examine)
density_with_adjancency_bonus = density_score + adjancency_bonus
length_normalized_density_score = density_score * (len(sequence_window)/float(Window_Size))
length_normalized_adjancency_bonus = adjancency_bonus * (len(sequence_window)/float(Window_Size))
length_normalized_density_and_adjancency = length_normalized_density_score + length_normalized_adjancency_bonus
if analyze_triple_amino_acid_clustering_combo:
table_data.append([row["symbol"], current_protein_gene, row["pI"], density_score, density_with_adjancency_bonus, length_normalized_density_score, length_normalized_density_and_adjancency, unsubstituted_seq_window])
else:
table_data.append([row["symbol"], current_protein_gene, row["pI"], density_score, density_with_adjancency_bonus, length_normalized_density_score, length_normalized_density_and_adjancency, sequence_window])
# generate reports using the data
generate_density_reportA (table_data)
generate_density_with_adjancency_bonus_reportB (table_data)
generate_len_normalized_density_reportC (table_data)
generate_len_normalized_density_with_adjacency_bonus_reportD (table_data)
generate_richest_chunks_per_gene_reportE(table_data)
#give user some stats and feeback
import sys
sys.stderr.write("Concluded. \n")
sys.stderr.write(str(total_gene_count) + " protein sequences analyzed. \n")
sys.stderr.write("The report on the amino acid density been saved as \n'"
+ name_of_outputfileA +"' in same directory as the script.\n")
sys.stderr.write("The report on the amino acid clustering has been saved as \n'")
sys.stderr.write(name_of_outputfileB + ".\n")
sys.stderr.write("The report on the length-normalized amino acid density has been saved as \n'")
sys.stderr.write(name_of_outputfileC + ".\n")
sys.stderr.write("The report on the length-normalized amino acid clustering has been saved as \n'")
sys.stderr.write(name_of_outputfileD + ".\n")
sys.stderr.write("A listing of the highest scoring clusters for each gene has been saved as \n'")
sys.stderr.write(name_of_outputfileE + ".\n")