@@ -381,9 +381,39 @@ def anchor_filenames(self):
381
381
ret += g .anchor_filenames
382
382
return ret
383
383
384
def bitmap_to_bins(self, bitmap, binlen):
    """Summarise a bitmap into fixed-width bins.

    Parameters
    ----------
    bitmap : pandas.DataFrame
        Table whose index supports ``// binlen`` (presumably integer
        positions, one column per genome -- confirm against caller).
    binlen : int
        Bin width; each row is assigned to bin ``index // binlen``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``pancount_bins``: rows are reindexed to ``self.bitsum_index``
        (row sums of ``bitmap``), columns are bin numbers, and each cell
        is how many rows of that bin had that row sum.  Missing
        combinations are filled with 0.

        ``paircount_bins``: one row per ``bitmap`` column, one column per
        bin labelled by ``bin * binlen``; each cell is the per-bin column
        sum divided by that row's maximum across bins.
    """
    # Relabel every row with the bin it falls in.
    binned = bitmap.set_index(bitmap.index // binlen)

    # Count (bin, row-sum) pairs, then pivot so that row sums become the
    # rows and bins become the columns.
    pancounts = binned.sum(axis=1).reset_index().value_counts()
    pancount_bins = pancounts.unstack(level=1, fill_value=0).T
    pancount_bins = pancount_bins.reindex(self.bitsum_index, fill_value=0)

    # Per-bin column totals, relabelled by bin start and transposed so
    # that each original column becomes a row.
    paircount_bins = binned.groupby(level=0).sum()
    paircount_bins = paircount_bins.set_index(paircount_bins.index * binlen).T

    # Scale every row by its own maximum.
    # NOTE(review): a row whose maximum is 0 yields NaN (0/0) here;
    # confirm that downstream consumers expect that.
    paircount_bins = paircount_bins.div(paircount_bins.max(axis=1), axis=0)

    return pancount_bins, paircount_bins
399
+
400
def bitmap_to_pancount(self, bitmap):
    """Return the row-wise sum of ``bitmap`` as a Series.

    The result keeps ``bitmap.index`` as its index; each value is the sum
    across all columns of the corresponding row.
    """
    row_totals = bitmap.to_numpy().sum(axis=1)
    return pd.Series(row_totals, index=bitmap.index)
402
+
403
def bitmap_to_paircount(self, bitmap):
    """Return the row-wise sum of ``bitmap`` as a Series indexed like it.

    NOTE(review): this computes exactly the same thing as
    ``bitmap_to_pancount`` (sum over ``axis=1``); confirm whether a
    different reduction was intended for pair counts.
    """
    summed = bitmap.to_numpy().sum(axis=1)
    return pd.Series(summed, index=bitmap.index)
405
+
406
def pancount_to_bins(self, pancnts, binlen):
    """Tabulate how often each count value occurs within each bin.

    Parameters
    ----------
    pancnts : pandas.Series
        Count values whose index supports ``// binlen``.
    binlen : int
        Bin width; each entry falls in bin ``index // binlen``.

    Returns
    -------
    pandas.DataFrame
        Rows follow ``self.bitsum_index`` (count values), columns are bin
        numbers, cells are occurrence tallies; absent combinations are 0.
    """
    pairs = pd.DataFrame({
        "count": pancnts.to_numpy(),
        "bin": pancnts.index // binlen,
    })
    # Tally each distinct (count, bin) pair without frequency sorting.
    tally = pairs.value_counts(sort=False)
    # Spread the "bin" level (level 1) out into columns.
    table = tally.unstack(level="bin", fill_value=0)
    return table.reindex(self.bitsum_index, fill_value=0)
412
+
384
413
385
414
class Genome :
386
415
def __init__ (self , idx , id , name , fasta = None , gff = None , anchor = None , write = False ):
416
+ self .index = idx
387
417
self .samples = idx .samples
388
418
self .params = idx .params
389
419
self .prefix = os .path .join (self .params ["prefix" ], ANCHOR_DIR , name )
@@ -586,11 +616,11 @@ def _init_anno_types(self):
586
616
self .gff_anno_types = set (self .params ["gff_anno_types" ])
587
617
return
588
618
589
- if os .path .exists (self .anno_types_fname ):
619
+ if os .path .exists (self .anno_types_fname ) and not self . write_mode :
590
620
with open (self .anno_types_fname ) as f :
591
621
self .gff_anno_types = {l .strip () for l in f }
592
622
else :
593
- self .gff_anno_types = set ()
623
+ self .gff_anno_types = None
594
624
595
625
def _write_anno_types (self ):
596
626
with open (self .anno_types_fname , "w" ) as f :
@@ -602,15 +632,20 @@ def init_gff(self, filename=None):
602
632
filename = self .gff
603
633
if pd .isna (filename ): return
604
634
635
+ if self .params ["gff_anno_types" ] is not None :
636
+ gff_anno_types = set (self .params ["gff_anno_types" ])
637
+ else :
638
+ gff_anno_types = None
639
+
605
640
genes = list ()
606
641
annos = list ()
607
642
608
643
for df in self ._iter_gff ():
609
644
gmask = df ["type" ].isin (self .params ["gff_gene_types" ])
610
645
genes .append (df [gmask ])
611
646
612
- if self . gff_anno_types is not None :
613
- annos .append (df [df ["type" ].isin (self . gff_anno_types )])
647
+ if gff_anno_types is not None :
648
+ annos .append (df [df ["type" ].isin (gff_anno_types )])
614
649
else :
615
650
annos .append (df [~ gmask ])
616
651
@@ -620,10 +655,10 @@ def _merge_dfs(dfs):
620
655
annos = _merge_dfs (annos )
621
656
self ._write_tabix (annos , "anno" )
622
657
623
- if self . gff_anno_types is None :
658
+ if gff_anno_types is None :
624
659
self .gff_anno_types = set (annos ["type" ].unique ())
625
660
else :
626
- self .gff_anno_types = self . gff_anno_types .intersection (annos ["type" ])
661
+ self .gff_anno_types = gff_anno_types .intersection (annos ["type" ])
627
662
self ._write_anno_types ()
628
663
629
664
genes = _merge_dfs (genes )
@@ -666,7 +701,8 @@ def query(self, name, start=None, end=None, step=1):
666
701
667
702
pac = self ._query_bytes (name , start , end - 1 , step , bstep )
668
703
bits = self ._bytes_to_bits (pac )
669
- idx = np .arange (start , end , step , dtype = int )#[:len(bits)-1]
704
+ #idx = np.arange(start, end, step, dtype=int)#[:len(bits)-1]
705
+ idx = pd .RangeIndex (start , end , step )
670
706
df = pd .DataFrame (bits , index = idx , columns = self .genome_names )
671
707
return df
672
708
@@ -821,7 +857,6 @@ def run_annotate(self, gff_file=None, logfile=None):
821
857
self .bitsum_genes .to_csv (self .chr_genes_fname , sep = "\t " )
822
858
823
859
gene_tabix = gene_df .reset_index ()[self .gene_tabix_cols ]
824
- print (gene_tabix )
825
860
self ._write_tabix (gene_tabix , "gene" )
826
861
827
862
@@ -873,7 +908,8 @@ def run_anchor(self, bitvecs, logfile=None):
873
908
t = time ()
874
909
875
910
if self .annotated :
876
- self ._write_tabix (gene_df .reset_index (), "gene" )
911
+ gene_tabix = gene_df .reset_index ()[self .gene_tabix_cols ]
912
+ self ._write_tabix (gene_tabix , "gene" )
877
913
self .bitsum_genes = gene_df .groupby ("chr" ,sort = False )[self .bitsum_index ].sum ()#.sort_index()
878
914
self .bitsum_genes .to_csv (self .chr_genes_fname , sep = "\t " )
879
915
0 commit comments