-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathgraphanalyzer.py
executable file
·1298 lines (1045 loc) · 59 KB
/
graphanalyzer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python3
# Coded by Gioele Lazzari ([email protected])
software = "graphanalyzer.py"
version = "1.6.0"
# begin the memory tracing
import tracemalloc
tracemalloc.start()# at this point, get_traced_memory() it's (0, 0)
#from memory_profiler import profile
def consoleout(lvl, msg):
import sys
# define a function to communicate with the console with colors:
colors = {
'HEADER' : '\033[95m',
'OKBLUE' : '\033[94m',
'OKCYAN' : '\033[96m',
'OKGREEN' : '\033[92m',
'WARNING' : '\033[93m',
'FAIL' : '\033[91m',
'ENDC' : '\033[0m',
'BOLD' : '\033[1m',
'UNDERLINE' : '\033[4m'
}
if lvl == 'error':
msg = colors['FAIL'] + 'ERROR: ' + colors['ENDC'] + msg + '\n'
sys.stderr.write(msg)
sys.stderr.flush()
sys.exit(1)
elif lvl == 'warning':
msg = colors['WARNING'] + 'WARNING: ' + colors['ENDC'] + msg + '\n'
sys.stdout.write(msg)
sys.stdout.flush()
elif lvl == 'okay':
msg = colors['OKGREEN'] + 'OKAY: ' + colors['ENDC'] + msg + '\n'
sys.stdout.write(msg)
sys.stdout.flush()
elif lvl == 'meminfo':
msg = colors['OKCYAN'] + 'ALLOC: ' + colors['ENDC'] + msg + '\n'
sys.stdout.write(msg)
sys.stdout.flush()
else:
consoleout('error', 'consoleout() called with wrong lvl.')
def fillWithMetas(csv , metas):
# vConTACT2 used with the method described in https://github.com/RyanCook94/inphared.pl
# produces a c1.ntw with GenBank accessions as node labels, and a genome_by_genome_overview.csv
# with GenBank accessions as genome names. Moreover columns “Genus”, “Family” and “Order”
# are filled with "Unassigned" so they need to be filled with the correct taxonomy.
# Fortunately, method described at https://github.com/RyanCook94/inphared.pl provides a
# metadata table, {date}_data_excluding_refseq.tsv, containing the correct taxonomy for
# every GenBank accession.
# The following function update the void columns of genome_by_genome_overview.csv with the
# taxonomies contained in 26Jan2021_data_excluding_refseq.tsv.
# make an editable copy:
csv_edit = csv.copy(deep=True)
# rename some columns, so below they could easily be called as Series.name :
metas = metas.rename(columns={"Sub-family": "Subfamily"})
metas = metas.rename(columns={"Baltimore Group": "BaltimoreGroup"})
# Add some new columns:
csv_edit["Accession"] = csv_edit["Genome"] # copy to allow future edits of "Genome".
csv_edit["FullClass"] = None # to save the "full classification" as stored in metas.Classification.
# Host column will be imported from metas, matching the accession number:
csv_edit["Host"] = None
# Below we try to reconstruct the full taxonomy.
# "Genus", "Family", and "Order" are commented as they were already created in the original "csv".
# "Genus" and "Family" will anyway be filled with info from "metas", matching the accession number.
# Taxonomic levels signed with *, are not present in "metas" tables. We will try to fill
# these fields decomposing the metas.Classification string, that is a full taxonomy string in ascending order, like eg:
# "Escherichia phage phiX174 Sinsheimervirus Bullavirinae Microviridae Petitvirales Malgrandaviricetes Phixviricota Sangervirae Monodnaviria Viruses"
csv_edit["BaltimoreGroup"] = None # Not a real taxonomy level - to be filled with info from metas, matching the accession number.
csv_edit["Realm"] = None # Or "domain" - to be filled with info from metas, matching the accession number.
csv_edit["Subrealm"] = None # *
csv_edit["Kingdom"] = None # *
csv_edit["Subkingdom"] = None # *
csv_edit["Phylum"] = None # *
csv_edit["Subphylum"] = None # *
csv_edit["Class"] = None # *
csv_edit["Subclass"] = None # *
# Order # *
csv_edit["Suborder"] = None # *
# Family # to be filled with info from metas, matching the accession number.
csv_edit["Subfamily"] = None # to be filled with info from metas, matching the accession number.
# Genus # to be filled with info from metas, matching the accession number.
csv_edit["Species"] = None # to be filled with info from metas, matching the accession number.
# Viral taxonomy, defined by ICTV, has several taxonomy levels.
# Each taxonomic level is charaterized by a particular suffix.
# The species level has no suffix, can contain more words, and
# must not only contain the word virus and the host name.
# Below he report all levels-suffix pairs, in form of python dictionary.
# (level-suffix pairs taken from https://en.wikipedia.org/wiki/Virus_classification)
lvlsuffix = {"Realm": "viria",
"Subrealm": "vira", # extra field
"Kingdom": "virae",
"Subkingdom": "virites", # extra field
"Phylum": "viricota",
"Subphylum": "viricotina", # extra field
"Class": "viricetes",
"Subclass": "viricetidae", # extra field
"Order": "virales",
"Suborder": "virineae", # extra field
"Family": "viridae",
"Subfamily": "virinae",
"Genus": "virus",
"Subgenus": "virus" # again! Extra field. Not used below beacause it's indistinguishable from "Genus".
} # "Species" has no suffix (see above).
# for every row in csv_edit (that means: for every row in the vConTACT2 "csv" output)
for index, row in csv_edit.iterrows():
# find the corresponding GenBank accession in the metas table:
matches = metas[metas['Accession'] == row.Accession]
if len(matches) == 0:
# so this is a vOTU (that means: a viral scaffold in need to be classified)
pass # go on with the next row!
elif (len(matches) != 0 and len(matches) != 1):
# there should be only 1 exact match.
consoleout('error', "More than 1 match in filler() function!") # stop the program.
else:
# get first (and only) match:
match = matches.iloc[0]
# fill csv_edit with missing infos (Species, Genus, Family, Order, ...) from match (metas)
csv_edit.at[index,'Host'] = match.Host
csv_edit.at[index,'BaltimoreGroup'] = match.BaltimoreGroup
csv_edit.at[index,'Realm'] = match.Realm
# Subreal # -
# Kingdom # -
# Subkingdom # -
# Phylum # -
# Subphylum # -
# Class # -
# Subclass # -
# Order # -
# Suborder # -
csv_edit.at[index,'Family'] = match.Family
csv_edit.at[index,'Subfamily'] = match.Subfamily
csv_edit.at[index,'Genus'] = match.Genus
csv_edit.at[index,'Species'] = match.Description # match.Description contains only the species name.
# Below we try to catch the missing isolated fileds from metas (Kindom, Phylum, Class, ...)
# from match.Classification, that is a big string like containing the full known linalogy, for example:
# "Pseudomonas virus phiCTX Citexvirus Peduovirinae Myoviridae Caudovirales Caudoviricetes Uroviricota Heunggongvirae Duplodnaviria Viruses"
# Anyway, match.Classification (the linalogy) is rarely complete. Some levels are missing here or there.
# Fortunately, for Kindom, Phylum, Class etc there is a "suffix convention":
# eg: _virae for Kingdom, _viricota for Phylum, _viricetes for Class. E
# Please note that very suffix was before saved in lvlsuffix.
# Remove the last level (species) from linealogy, then split by ' ':
linealogy = match.Classification.replace(match.Description + " ","").split(" ")
def scroll_linealogy(lin, suff): # function to find term with correspondent suffix in linealogy (list).
cnt = 0
got = "n.a." # Maybe it could be better to replace "n.a." with somthing more specific.
for x in lin:
if x.endswith(suff):
cnt += 1
if cnt > 1: # There SHOULD be only one match for every suffix:
consoleout('warning', f'More than 1 term in linealogy with suffix "{suff}"! Taking the last.')
got = x
return got
csv_edit.at[index,'Subrealm'] = scroll_linealogy(linealogy, lvlsuffix['Subrealm'])
csv_edit.at[index,'Kingdom'] = scroll_linealogy(linealogy, lvlsuffix['Kingdom'])
csv_edit.at[index,'Subkingdom'] = scroll_linealogy(linealogy, lvlsuffix['Subkingdom'])
csv_edit.at[index,'Phylum'] = scroll_linealogy(linealogy, lvlsuffix['Phylum'])
csv_edit.at[index,'Subphylum'] = scroll_linealogy(linealogy, lvlsuffix['Subphylum'])
csv_edit.at[index,'Class'] = scroll_linealogy(linealogy, lvlsuffix['Class'])
csv_edit.at[index,'Subclass'] = scroll_linealogy(linealogy, lvlsuffix['Subclass'])
csv_edit.at[index,'Order'] = scroll_linealogy(linealogy, lvlsuffix['Order'])
csv_edit.at[index,'Suborder'] = scroll_linealogy(linealogy, lvlsuffix['Suborder'])
# Anyway store the full linealogy for following eventual uses.
csv_edit.at[index,'FullClass'] = match.Classification
# Note: after running the script, seems like metas 1Nov2021_data_excluding_refseq.tsv doesn not contain
# Subreal, Subkingdom, Subphylum, Subclass and Suborder informations in metas.Classification.
return csv_edit
def clusterExtractor(graph, csv_edit, output_path, string_suffix, prefix):
# prepare results dataframe:
results = [] # a list of dict to be later converted in dataframe
# Function to append the taxonomy result of a vOTU into the passed results dataframe.
# "closer" is the accession to search for in the 'Genome' column of "csv_edit" used as database.
# "scaffold" is a string passaed like "vOTU_123".
# "vc" is the viral subcluster as determined by vConTACT2.
def insertReference(csv_edit, results, scaffold, closer, weight, level, status, vc):
# extract taxonomy
baltim = "n.a."
realm = "n.a."
kingdom = "n.a."
phylum = "n.a."
classs = "n.a."
order = "n.a."
family = "n.a."
subfamily = "n.a."
genus = "n.a."
species = "n.a."
# extract other infos
host = "n.a."
# "G": not in the graph.
# "F": present in the graph but not assigned (completely isolated).
# "A": present in the graph but not assigned (indireclty connected to references).
if level != "F" and level != "G" and level != "A" and level != "AF":
baltim = csv_edit.loc[csv_edit['Genome'] == closer, 'BaltimoreGroup'].values[0]
realm = csv_edit.loc[csv_edit['Genome'] == closer, 'Realm'].values[0]
kingdom = csv_edit.loc[csv_edit['Genome'] == closer, 'Kingdom'].values[0]
phylum = csv_edit.loc[csv_edit['Genome'] == closer, 'Phylum'].values[0]
classs = csv_edit.loc[csv_edit['Genome'] == closer, 'Class'].values[0]
order = csv_edit.loc[csv_edit['Genome'] == closer, 'Order'].values[0]
family = csv_edit.loc[csv_edit['Genome'] == closer, 'Family'].values[0]
subfamily = csv_edit.loc[csv_edit['Genome'] == closer, 'Subfamily'].values[0]
genus = csv_edit.loc[csv_edit['Genome'] == closer, 'Genus'].values[0]
species = csv_edit.loc[csv_edit['Genome'] == closer, 'Species'].values[0]
host = csv_edit.loc[csv_edit['Genome'] == closer, 'Host'].values[0]
# format weight (cloud be float or "n.a."):
if type(weight) != str:
weight = str(int(round(weight,0)))
# append new row:
results.append({
'Scaffold': scaffold,
'Closer': species, # this is the putative species.
'Accession': closer,
'Status': status,
'VC': vc,
'Level': level,
'Weight': weight,
'Host': host,
# Below the putative taxonomy table exluding the species:
'BaltimoreGroup' : baltim,
'Realm' : realm,
'Kingdom' : kingdom,
'Phylum' : phylum,
'Class' : classs,
'Order' : order,
'Family' : family,
'Subfamily' : subfamily,
'Genus' : genus
})
return results
####################
# PREPARATORY PART #
####################
# create the .log file
import logging
logging.basicConfig(filename = output_path + 'graphanalyzer_' + string_suffix + '.log',
filemode='w', level = logging.INFO, # level sets the threshold
format = '%(asctime)s %(levelname)s: %(message)s',
datefmt = '%H:%M:%S')
logging.info('Processing of vConTACT2 output is started.\n')
# less problematic when called as variable:
csv_edit = csv_edit.rename(columns={"VC Subcluster": "VCSubcluster"})
csv_edit = csv_edit.rename(columns={"VC Status": "VCStatus"})
# sobstitute NA with '' to avoid future runtime errors:
csv_edit = csv_edit.fillna('')
###################
# PROCESSING PART #
###################
# extract scaffolds from reference genomes:
scaffolds_total = csv_edit[csv_edit['Genome'].str.startswith(prefix)]
# extract scaffolds contained in the graph (as pnd dataframe):
scaffolds_ingraph = scaffolds_total.copy(deep = True)
for index, row in scaffolds_total.iterrows():
if graph.has_node(row.Genome) == False :
# remove row from scaffolds_ingraph if scaffold is not in the graph:
scaffolds_ingraph = scaffolds_ingraph.drop(index)
# In this case "status" should be always "Singleton" (that is: never present in the graph).
status = row.VCStatus # SINGLETON
# append dropped row in results as level="G" (meaning: NOT IN GRAPH):
results = insertReference(
csv_edit=csv_edit,
results=results,
scaffold=row.Genome,
closer="n.a.",
weight="n.a.",
level="G",
status=status,
vc="n.a."
)
# log first stats:
logging.info("Viral scaffolds in total: %i" % len(scaffolds_total))
logging.info("Viral scaffolds in the graph: %i\n" % len(scaffolds_ingraph))
import operator
# for each viral scaffold in the graph:
for index, row in scaffolds_ingraph.iterrows():
# extract scaffold name and VCStatus:
scaffold = row.Genome
status = row.VCStatus
logging.info("Scaffold: %s" % (scaffold))
logging.info("VCStatus: %s" % (status))
# COMPUTE NEIGHBORS (that are: the directly connected nodes):
neighbors_list = list(graph.neighbors(scaffold))
logging.info("# Neighbors: %i" % len(neighbors_list))
if (status == "Clustered" or status == "Outlier" or status == "Clustered/Singleton" or "Overlap" in status):
vc = "" # scaffold's viral cluster or subcluster
sameclustered_list = [] # other genomes/scaffolds in the same cluster/subcluster
# EXTRACT SAMECLUSTERED LIST:
if status == "Outlier": # not clustered but connected
vc = "n.a."
sameclustered_list = []
elif status == "Clustered":
vc = row.VCSubcluster
# extract all scaffolds clustered in the same subcluster (hereafter: the "sameclustered")
sameclustered = csv_edit[csv_edit['VCSubcluster'] == vc]
sameclustered_list = sameclustered["Genome"].tolist()
sameclustered_list.remove(scaffold) # remove current scaffold
elif status == "Clustered/Singleton": # they have a subcluster of their own
# extract all the possible subclustered within the cluster
vc = "VC_" + row.VCSubcluster.split("_")[1] + "_"
sameclustered = csv_edit[csv_edit['VCSubcluster'].str.startswith(vc)]
sameclustered_list = sameclustered["Genome"].tolist()
sameclustered_list.remove(scaffold) # remove current scaffold
elif "Overlap" in status: # "Overlap" belong to n cluster
# extract the n cluster and consider every of thier subcluster
vc_list = status.replace("Overlap (", "").replace(")", "").split("/")
sameclustered = pnd.DataFrame() # empty df
for vc in vc_list: # extract the rows and glue them together with the others
sameclustered = pnd.concat([sameclustered, csv_edit[csv_edit['VCSubcluster'].str.startswith(vc + "_")]])
# handling of others Overlaps:
sameclustered = pnd.concat([sameclustered, csv_edit[
(csv_edit['VCStatus'].str.contains('\(' + vc + '/')) | \
(csv_edit['VCStatus'].str.contains('/' + vc + '/')) | \
(csv_edit['VCStatus'].str.contains('/' + vc + '\)')) ]])
sameclustered_list = sameclustered["Genome"].tolist()
sameclustered_list.remove(scaffold)
else:
consoleout("error", "Strange level found during the extraction of the cluster.")
# log some stats
logging.info("Status: %s" % status)
logging.info("Number of sameclustered (sc): %i" % len(sameclustered_list))
# check if all sameclustered are contained in the neighbours
logging.info("Are all sc connected to scaffold?: %s" % set(sameclustered_list).issubset(neighbors_list))
# GET SAMECLUSTERED-CONNECTED LIST ORDERED BY WEIGHT
# associate every CONNECTED sameclustered with its weight
connected_list = [] # this will be a list of DICT:
# {"scaffold": ___, "weight": ___}
for node in sameclustered_list:
try:
w = graph.get_edge_data(scaffold, node)["weight"]
connected_list.append({"scaffold": node, "weight": w})
except: # not every pair of nodes in the same cluster are directly connected in the graph
continue
# ordinate the list by weight
connected_list = sorted(connected_list, key=operator.itemgetter('weight'), reverse=True) # reverse for descending order
# EXTRACT 'CLOSER_NODE' (= the haviest connected-sameclustered)
# note that closer_node could be another scaffold!
try:
closer_node = connected_list[0]["scaffold"] # to store the node connected with the heavier edge
closer_node_w = connected_list[0]["weight"] # to store the heavier edge
except: # not always there is a best_bet: for example "Outlier" are not clustered
closer_node = "n.a."
closer_node_w = 0.0
logging.info("closer_node: %s" % closer_node)
logging.info("closer_node_w: %f" % closer_node_w)
# 'CLOSER_REF' SEARCH
closer_ref = "n.a." # store the reference genome connected with the heavier edge
closer_ref_w = 0.0 # store the heavier egde that connect to a reference genome
level = "n.a." # keep track of the cardinality (order of the list)
# iterate the list until the first reference genome is reached
for i in range(len(connected_list)):
# if it's connected a true reference genome or a scaffold assigned in the last iteration:
if connected_list[i]["scaffold"].startswith(prefix) == False:
closer_ref = connected_list[i]["scaffold"]
closer_ref_w = connected_list[i]["weight"]
level = "C" + str(i + 1) # first level is 1, not 0
# break the for-loop, because the first reference genome was reached
break
# if a reference genome was found within the connected-sameclustered:
if closer_ref != "n.a.":
logging.info("closer_ref: %s" % closer_ref)
logging.info("closer_ref_w: %f" % closer_ref_w)
logging.info("level: %s" % level)
# update results:
if status == "Clustered/Singleton":
# 'row.VCSubcluster' instead of 'vc', beacuse 'vc' changed during the algorithm
results = insertReference(csv_edit, results, scaffold, closer_ref, closer_ref_w, level, status, row.VCSubcluster)
elif "Overlap" in status:
# 'n.a.' instead of 'vc', beacuse 'vc' changed during the algorithm
results = insertReference(csv_edit, results, scaffold, closer_ref, closer_ref_w, level, status, vc="n.a.")
else:
results = insertReference(csv_edit, results, scaffold, closer_ref, closer_ref_w, level, status, vc)
else: # RETRY ITERATING THE WHOLE NEGHBORS
# associate every neighbors with its weight
connected_list = [] # this will be a list of DICT:
# {"scaffold": ___, "weight": ___}
for node in neighbors_list:
w = graph.get_edge_data(scaffold, node)["weight"]
connected_list.append({"scaffold": node, "weight": w})
# ordinate the list by weight
connected_list = sorted(connected_list, key=operator.itemgetter('weight'), reverse=True) # reverse for descending order
# 'CLOSER_REF' SEARCH
# iterate the list until the first reference genome is reached
for i in range(len(connected_list)):
if connected_list[i]["scaffold"].startswith(prefix) == False:
closer_ref = connected_list[i]["scaffold"]
closer_ref_w = connected_list[i]["weight"]
level = "N" + str(i + 1) # first level is 1, not 0
# break the for-loop, because the first reference genome was reached
break
# if a reference genome was found within the whole neighbors:
if closer_ref != "n.a.":
logging.info("closer_ref: %s" % closer_ref)
logging.info("closer_ref_w: %f" % closer_ref_w)
logging.info("level: %s" % level)
# update results
if status == "Clustered/Singleton":
# 'row.VCSubcluster' instead of 'vc', beacuse 'vc' changed during the algorithm
results = insertReference(csv_edit, results, scaffold, closer_ref, closer_ref_w, level, status, row.VCSubcluster)
elif "Overlap" in status:
# 'n.a.' instead of 'vc', beacuse 'vc' changed during the algorithm
results = insertReference(csv_edit, results, scaffold, closer_ref, closer_ref_w, level, status, vc="n.a.")
else:
results = insertReference(csv_edit, results, scaffold, closer_ref, closer_ref_w, level, status, vc)
else: # this means: NO reference in sameclustered AND NO reference in neighbors
logging.warning("NO reference in connected sameclustered AND NO reference in neighbors.")
# warning: node_connected_component is simgle core and computation-intensive with large graphs
currlev = "F" # F: completely isolated vOTUs; A: vOTUs distantly indirectly linked to references.
genomes_connected = list(net.node_connected_component(graph, row.Genome))
for genome in genomes_connected: # here "genome" is a string !
if genome.startswith(prefix)==False:
currlev = "A"
break
# From what we've seen, status for "F" can be:
# Clustered, Outlier, Overlap. But NOT "Singleton" (that is: never present in the graph).
# Please note: also "Clustered/Singleton" is valid VCStatus, though never found in 'F' here till now.
status = row.VCStatus # should NOT be a "Singleton".
vc = row.VCSubcluster
if status == "Outlier" or "Overlap" in status:
vc = "n.a."
results = insertReference(csv_edit, results, scaffold=row.Genome, closer="n.a.", weight="n.a.", level=currlev, status=status, vc=vc)
else: # "Singleton" never present in the graph
logging.consoleout('error', "Found a strange VCStatus! Maybe it's a Singleton? Singletons should not be present inside the graph.")
logging.info("##############################################\n")
logging.info('Processing of vConTACT2 output is ended.')
# convert the list of dict into a pandas dataframe
results = pnd.DataFrame.from_records(results)
# order results by scaffold name
results = results.sort_values(by=['Scaffold'])
# Cut taxonomy at a level dependent of the "confidence" of the assignment:
for index, row in results.iterrows():
# We decided to stop at:
# Genus for Subclusters;
# Subfamily (if exists, otherwise Family) for Clusters;
# Order for 'N' (Outliers).
if row.Level.startswith('C') and row.Status == "Clustered":
continue # the best case: keep everything
elif row.Level.startswith('C') and (row.Status == "Clustered/Singleton" or "Overlap" in row.Status):
results.at[index,'Genus'] = "O" # Please note: "O" stands for "omitted".
elif row.Level.startswith('N') :
results.at[index,'Genus'] = "O"
results.at[index,'Subfamily'] = "O"
# res.at[index,'Family'] = "O"
# Now we want to save results dataframe into 4 different formats:
# Formats are: 1) csv file; 2) Excel file; 3) MultiQC custom table.
# 1) save results as a csv file:
results.to_csv(output_path + "results_vcontact2_" + string_suffix + ".csv")
# 2) save results as a csv file:
results.to_excel(output_path + "results_vcontact2_" + string_suffix + ".xlsx")
# 3) Save results as a MultiQC custom table (return the object and save in 4th step):
# First, add the custom MultiQC header to the final string "content":
import textwrap
content = textwrap.dedent("""
# id: 'taxotab'
# plot_type: 'table'
# section_name: 'vConTACT2 extender table'
# description: 'Automatic processing of vConTACT2 outputs `c1.ntw` and `genome_by_genome_overview.csv`.'
<-- REPLACE -->
""")
# Crate a StringIO object to flush results.to_csv() into:
import io
string_eater = io.StringIO()
datas = results.to_csv(path_or_buf = string_eater, sep='\t', index=False)
# Get out the results.to_csv() as string, and insert it into "content":
content = content.replace("<-- REPLACE -->", string_eater.getvalue())
# Write the whole "content" string on file.
file_report = open(output_path + "custom_taxonomy_table_" + parameters.suffix + "_mqc.txt", "w")
file_report.write(content)
file_report.close()
return results
def addGraphAttributes(graph, csv_edit, results, prefix):
# less problematic when called as variable
csv_edit = csv_edit.rename(columns={"VC Subcluster": "VCSubcluster"})
csv_edit = csv_edit.rename(columns={"VC Status": "VCStatus"})
# sobstitute NA with '' to avoid future runtime errors
csv_edit = csv_edit.fillna('')
# Below we add attributes to each node in the graph. Thanks to this, the interactive
# visualization will be more informative (each attribute is shown when a node is hovered
# with the mouse).
attribs = {} # create empty dict:
# for each row in csv_edit, preparate a col and a dict to rename nodes
for index, row in csv_edit.iterrows():
# not all references in genome_by_genome_overview.csv are contained in the graph!
# for genomes/scaffolds not included in graph: ignore.
if graph.has_node(row.Genome) == False:
continue
# don't want to rename scaffold at this point
if row.Genome.startswith(prefix): # Discriminate accesions from vOTUs.
# find the corresponding vOTU in the results table:
matches = results[results['Scaffold'] == row.Genome]
if len(matches) != 1:
# there should be only 1 exact match
consoleout("error", "We have len(matches) != 1 in subgraphCreator().")
else:
# get first (and only) match:
match = matches.iloc[0]
# take infos from the results dataframe:
label= {"A0_Type": "Scaffold",
"A1_Species/Closer": match.Closer,
"A2_Accession": match.Accession,
"A3_Status": match.Status,
"A4_VC": match.VC,
"A5_Level": match.Level,
"A6_Weight": match.Weight,
"A7_Genus": match.Genus,
"A8_Family": match.Family,
"A9_Host": match.Host}
# fill the "dict of dicts":
attribs[row.Genome] = label
else: # here we have a reference genome:
# take infos from the csv_edit dataframe:
label= {"A0_Type": "Reference",
"A1_Species/Closer": row.Species,
"A2_Accession": row.Accession, # commentable because the index is already the accession.
"A3_Status": row.VCStatus,
"A4_VC": row.VCSubcluster,
"A5_Level": "-",
"A6_Weight": "-",
"A7_Genus": row.Genus,
"A8_Family": row.Family,
"A9_Host": row.Host}
# fill the "dict of dicts":
attribs[row.Genome] = label
# add attributes to nodes using the dict just prepared:
net.set_node_attributes(graph, attribs)
# extract scaffold that are in-graph
results_ingraph = results.copy(deep=True) # not G
for index, row in results.iterrows():
if row.Level == "G":
results_ingraph = results_ingraph.drop(index)
scaffolds_ingraph = results_ingraph['Scaffold'].tolist()
return graph, scaffolds_ingraph
def get_curr_color(scaff_Status, scaff_VC, scaff_Accession, curr_Status, curr_VC, curr_Accession, curr_Type, scaffold, node):
col_ref = '#C7A1D1'
col_inner = '#F76915' # orange
col_outer = '#EEDE04' # yellow
col_closer = '#2FA236' # green
col_current = '#FD0100' # red
#######################
# SCAFFOLD view point #
#######################
# Clustered
if scaff_Status == "Clustered":
valid_scaff_VCs = [scaff_VC] # eg ['VC_589_1']
# Clustered/Singleton
elif scaff_Status == "Clustered/Singleton":
valid_scaff_VCs = ["VC_" + scaff_VC.split("_")[1] + "_"] # eg ['VC_589_']
# Outlier
elif scaff_Status == "Outlier": # Outliers are never put inside a VC:
valid_scaff_VCs = []
# Overlap
elif "Overlap" in scaff_Status: # eg ['VC_589', 'VC_590', 'VC_591']
valid_scaff_VCs = scaff_Status.replace("Overlap (", "").replace(")", "").split("/")
###########################
# CURRENT NODE view point #
###########################
# Clustered, Clustered/Singleton
if curr_Status == "Clustered" or curr_Status == "Clustered/Singleton": # VC_z_k
# Overlap
if "Overlap" in scaff_Status: # 'valid_scaff_VCs' eg ['VC_589_', 'VC_590_', 'VC_591_']
valid_curr_VCs = ["VC_" + curr_VC.split("_")[1] + '_']
elif scaff_Status == "Clustered/Singleton": # 'valid_scaff_VCs' eg ['VC_589_']
valid_curr_VCs = ["VC_" + curr_VC.split("_")[1] + "_"]
else: # 'scaff_Status' is Clustered or Outlier
valid_curr_VCs = [curr_VC] # VC_z_k
# Outlier
elif curr_Status == "Outlier": # Outliers are never put inside a VC:
valid_curr_VCs = [] # Remember: ">>> [] in []" return 'False'
# Overlap
elif "Overlap" in curr_Status: # VC_z1, VC_z2, VC_z3
valid_curr_VCs = curr_Status.replace("Overlap (", "").replace(")", "").split("/")
if scaff_Status == "Clustered" or scaff_Status == "Clustered/Singleton":
valid_scaff_VCs = ["VC_" + vc.split("_")[1] for vc in valid_scaff_VCs]
curr_color = col_ref
# color for nodes in the same VC, distinguishing "real" VCs (Clustered) from
# "artifical" or "extended" VCs (Overlap and Clustered/Singleton):
if any(vc in valid_scaff_VCs for vc in valid_curr_VCs):
if "Overlap" in scaff_Status or scaff_Status == "Clustered/Singleton":
curr_color = col_outer
else:
curr_color = col_inner
# Add Cluster's nodes when in the Subcluster mode:
# So we have to consider the 3 cases: 'Clustered', 'Clustered/Singleton', 'Overlap'.
if scaff_Status == "Clustered" and (curr_Status == "Clustered" or curr_Status == "Clustered/Singleton"):
if (("VC_" + scaff_VC.split('_')[1] + "_") in curr_VC) and (curr_VC != scaff_VC):
curr_color = col_outer
elif scaff_Status == "Clustered" and ("Overlap" in curr_Status):
# Keep in mind that it's a string like "Overlap (VC_4/VC_412/VC_41)""
if (("VC_" + scaff_VC.split('_')[1] + "/") in curr_Status):
curr_color = col_outer
elif (("VC_" + scaff_VC.split('_')[1] + ")") in curr_Status):
curr_color = col_outer
# understand if this node determines the taxonomy of the current 'scaffold':
# this if statement picks up only 'References'. So we'll have just 1 green triangle.
if scaff_Accession == curr_Accession and curr_Type == "Reference":
curr_color = col_closer
# check if this is current vOTU:
if node == scaffold:
curr_color = col_current
return curr_color
#@profile
def subgraph_generation2(arguments):
# get allocated memory as soon as the child process starts:
alloc_child, _ = tracemalloc.get_traced_memory()
# get the arguments
scaffolds = arguments[0]
desired_path = arguments[1]
graph = arguments[2]
max_weight = arguments[3]
worker = arguments[4] +1 # just an ID for this worker.
view = arguments[5]
import os # needed by os.getpid()
#print(f"Worker {worker} (PID {os.getpid()}) started with {len(scaffolds)} vOTUs.")
# load external libraries:
import networkx as net # net.spring_layout() could require also scipy.
import plotly.offline as py
import plotly.graph_objects as go
# get allocated memory now that ext libraries are imported
alloc_libs, _ = tracemalloc.get_traced_memory()
for scaffold in scaffolds:
# stop-start combo is needed for python<3.9 in order to reset the 'peak' couter.
#tracemalloc.stop()
#tracemalloc.start()
# now 'alloc_start' should be 0 bytes.
alloc_start, _ = tracemalloc.get_traced_memory()
# create a new subgraph using 'scaffold' neighbors.
neigh = list(graph.neighbors(scaffold))
# recreate a subgraph with: nieghbors + scaffold
# we assume that neighbors() doesn't already include 'scaffold'.
neigh.append(scaffold)
sview_graph = graph.subgraph(neigh).copy()
nedges = sview_graph.size() # get the current number of edges
# Extract attributes of the current 'scaffold' (vOTU):
scaff_Type = sview_graph.nodes[scaffold]["A0_Type"]
scaff_Accession = sview_graph.nodes[scaffold]["A2_Accession"]
scaff_Status = sview_graph.nodes[scaffold]["A3_Status"]
scaff_VC = sview_graph.nodes[scaffold]["A4_VC"]
scaff_Level = sview_graph.nodes[scaffold]["A5_Level"]
# just a Status check
if scaff_Status == "Singleton" or scaff_Level == 'G':
consoleout("error", "There shouldn't be 'G' scaffolds at this point. Contact the developer.")
# Calculate position of all nodes and edges with spring_layout():
# (APPROXIMATELY, the higher is 'weight' attribute, the shorter is edge length)
if view=='3d' or view=='3D': sview_pos = net.drawing.layout.spring_layout(sview_graph, weight='weight', dim=3, seed=1)
else: sview_pos = net.drawing.layout.spring_layout(sview_graph, weight='weight', dim=2, seed=1)
# Create a plotly figure:
layout = go.Layout(
title=f'{scaffold} subnetwork.<br><sub>Interactive plot generated with <b>graphanalyzer.py</b> v{version}. Please wait the loading.</sub>',
annotations=[dict(text='User guide available at <a href="https://www.github.com/lazzarigioele/graphanalyzer/">github.com/lazzarigioele/graphanalyzer</a>.'+
'<br>Bugs can be reported to <a href= "mailto:[email protected]">[email protected]</a>.',
showarrow=False, xref="paper", yref="paper", x=0.0, y=0.0, align="left")])
fig = go.Figure(layout=layout)
# update the attribute 'A6_Weight' for every node. 'Weight' must become relative to the current scaffold.
for node in sview_graph: # here node is a string !
attrs = {} # a dict.
if node == scaffold: attrs = {node: {"A6_Weight": "origin"}}
else: attrs = {node: {"A6_Weight": str(round(sview_graph[scaffold][node]["weight"],1))}}
net.set_node_attributes(sview_graph, attrs)
# add traces for edges:
palette = ['#98A9D7', '#8ED2CD', '#C2ED98', '#F1F487', '#FED776', '#F59B7C']
for thickness in range(6):
edges_x = []
edges_y = []
edges_z = []
for edge in sview_graph.edges.data("weight"): # (nodeA, nodeB, weight)
# '*6' is a scaling factor. So 6 is the max.
width = edge[2] / max_weight *6
if thickness < width and width <= (thickness+1):
# save extremes positions:
if view=='3d' or view=='3D':
xA, yA, zA = sview_pos[edge[0]]
xB, yB, zB = sview_pos[edge[1]]
else:
xA, yA = sview_pos[edge[0]]
xB, yB = sview_pos[edge[1]]
edges_x.append(xA)
edges_x.append(xB)
edges_x.append(None)
edges_y.append(yA)
edges_y.append(yB)
edges_y.append(None)
if view=='3d' or view=='3D':
edges_z.append(zA)
edges_z.append(zB)
edges_z.append(None)
if view=='3d' or view=='3D':
edge_trace = go.Scatter3d(
x=edges_x,
y=edges_y,
z=edges_z,
line=dict(width=thickness+1, color=palette[thickness]),
hoverinfo='text',
mode='lines')
else:
edge_trace = go.Scatter(
x=edges_x,
y=edges_y,
#z=edges_z,
line=dict(width=thickness+1, color=palette[thickness]),
hoverinfo='text',
mode='lines')
fig.add_trace(edge_trace)
# add trace for nodes:
nodes_x = []
nodes_y = []
nodes_z = []
nodes_text = []
nodes_shape = []
nodes_color = []
for node in sview_graph: # here node is a string !
if view=='3d' or view=='3D': x, y, z = sview_pos[node]
else: x, y = sview_pos[node]
nodes_x.append(x)
nodes_y.append(y)
if view=='3d' or view=='3D': nodes_z.append(z)
# Extract attributes of the current node:
curr_Type = sview_graph.nodes[node]["A0_Type"]
curr_SpeClo = sview_graph.nodes[node]["A1_Species/Closer"]
curr_Accession = sview_graph.nodes[node]["A2_Accession"]
curr_Status = sview_graph.nodes[node]["A3_Status"]
curr_VC = sview_graph.nodes[node]["A4_VC"]
curr_Level = sview_graph.nodes[node]["A5_Level"]
curr_Weight = sview_graph.nodes[node]["A6_Weight"]
curr_Genus = sview_graph.nodes[node]["A7_Genus"]
curr_Family = sview_graph.nodes[node]["A8_Family"]
curr_Host = sview_graph.nodes[node]["A9_Host"]
# Initialize properties of this node:
curr_shape = ""
curr_color = ""
# Distinguish between "Scaffold" and "Reference":
if curr_Type == "Scaffold":
curr_shape = 'circle'
elif curr_Type == "Reference":
curr_shape = 'diamond' if view=='3d' or view=='3D' else 'star-triangle-up'
else: consoleout("error", "Strange A0_Type when determining the node's shape.")
nodes_shape.append(curr_shape)
# get the color of this node:
curr_color = get_curr_color(scaff_Status, scaff_VC, scaff_Accession, curr_Status, curr_VC, curr_Accession, curr_Type, scaffold, node)
nodes_color.append(curr_color)
# get the mouse-hover description:
nodes_text.append(
'<b><i>'+"index "+'</b></i>' + '<b>'+node+'</b>' +
'<br>' + '<b><i>'+"A0_Type "+'</b></i>' + curr_Type +
'<br>' + '<b><i>'+"A1_Species/Closer "+'</b></i>' + curr_SpeClo +
'<br>' + '<b><i>'+"A2_Accession "+'</b></i>' + curr_Accession +
'<br>' + '<b><i>'+"A3_Status "+'</b></i>' + curr_Status +
'<br>' + '<b><i>'+"A4_VC "+'</b></i>' + curr_VC +
'<br>' + '<b><i>'+"A5_Level "+'</b></i>' + curr_Level +
'<br>' + '<b><i>'+"A6_Weight "+'</b></i>' + curr_Weight +
'<br>' + '<b><i>'+"A7_Genus "+'</b></i>' + curr_Genus +
'<br>' + '<b><i>'+"A8_Family "+'</b></i>' + curr_Family +
'<br>' + '<b><i>'+"A9_Host "+'</b></i>' + curr_Host
)
if view=='3d' or view=='3D':
node_trace = go.Scatter3d(
x=nodes_x,
y=nodes_y,
z=nodes_z,
text=nodes_text,
mode='markers',
hoverinfo='text',
marker=dict(size=8, color=nodes_color, line_width=1, symbol=nodes_shape, line=dict(color='black')))
else:
node_trace = go.Scatter(
x=nodes_x,
y=nodes_y,
#z=nodes_z,
text=nodes_text,
mode='markers',
hoverinfo='text',
marker=dict(size=12, color=nodes_color, line_width=1, symbol=nodes_shape))
fig.add_trace(node_trace)
# Remove legend
fig.update_layout(showlegend=False)
# Remove tick labels
fig.update_xaxes(showticklabels=False)
fig.update_yaxes(showticklabels=False)
# Save to html
string_repr = fig.write_html(desired_path + scaffold + '.html')
alloc_end, _ = tracemalloc.get_traced_memory()
# free memory for the next cycle:
del string_repr, fig, layout, sview_graph, sview_pos
import gc
gc.collect()
alloc_free, _ = tracemalloc.get_traced_memory()
print(f"W#{worker}-PID {os.getpid()}: {scaffold} #E {nedges} "+
f"[Ch {int(alloc_child/10**6)} MB; In {int((alloc_free-alloc_child)/10**6)} MB] ", end='\r')
def subgraphCreator(graph, scaffolds_ingraph, output_path, string_suffix, max_weight, prefix, nthreads, view):
import os
# First of all create the subfolder for storing all the subplots.
desired_path = output_path + 'single-views_' + string_suffix + '/'
if (os.path.isdir(desired_path) == False): # if it's not been created yet:
try: os.mkdir(desired_path) # make it!
except: consoleout("error", "Can't create the output sub-folder '%s'. " % desired_path)
# setting up the multiprocessing part.
# splitting the 'scaffolds_ingraph' into chunks (one chunk for core/process)
nscaffodls_inchunk = int(len(scaffolds_ingraph) / nthreads)
if len(scaffolds_ingraph) % nthreads !=0: nscaffodls_inchunk += 1
chunks = [scaffolds_ingraph[x *nscaffodls_inchunk : (x+1)* nscaffodls_inchunk] for x in range(nthreads)]
import itertools
results = globalpool.imap(
subgraph_generation2,