@@ -49,10 +49,10 @@ def init_logger(logfile):
 IDX_SUFFIX = "gzi"
 ANCHOR_DIR = "anchor"
 
-TABIX_COLS = ["chr","start","end","type","attr"]
-# TABIX_COLS = ["chr","start","end","type","name"]
+GFF_COLS = ["chr","start","end","type","attr"]
+TABIX_COLS = ["chr","start","end","type","name"]
 TABIX_TYPES = {"start" : int, "end" : int}
-GENE_TABIX_COLS = TABIX_COLS + ["unique","universal"]
+GENE_COLS = ["chr","start","end","name"] #,"unique","universal"]
 GENE_TABIX_TYPES = {"start" : int, "end" : int, "unique" : int, "universal" : int}
 TABIX_SUFFIX = ".gz"
 
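For reference, a minimal sketch (not part of the patch) of how the split between GFF_COLS and TABIX_COLS is meant to work: the raw `attr` string is read from the GFF, a `name` is derived from it, and only the tabix columns are kept. The attribute values here are made up.

```python
import pandas as pd

GFF_COLS   = ["chr", "start", "end", "type", "attr"]
TABIX_COLS = ["chr", "start", "end", "type", "name"]

row = pd.DataFrame(
    [["chr1", 1000, 2000, "gene", "ID=gene0001;Name=FOO1"]],
    columns=GFF_COLS,
)
# derive "name" from the raw attribute string, then keep only the tabix columns
row["name"] = row["attr"].str.extract(r"Name=([^;]+)")[0]
print(row[TABIX_COLS])  # -> one row: chr1, 1000, 2000, gene, FOO1
```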
@@ -95,7 +95,7 @@ class Index(Serializable):
 
     gff_gene_types: List[str] = field(default_factory=lambda: ["gene"], help="GFF features to store locations and conservation scores")
     gff_anno_types: List[str] = field(default=None, help="GFF features of which to store locations, but not conservation scores")
-    gff_attrs: List[str] = field(default_factory=lambda: ["ID","Name"], help="GFF attributes to store in annotation indexes")
+    gff_name: List[str] = field(default_factory=lambda: ["Name","ID"], help="GFF attributes to store in annotation indexes")
 
     #Subset of genome IDs to generate anchor genomes for. Will use all genomes as anchors if not specified
     anchor_genomes: List[str] = field(default=None)
@@ -473,12 +473,13 @@ def bitsum_index(self):
 
     @property
     def gene_tabix_cols(self):
-        return TABIX_COLS + list(self.bitsum_index)
+        return GENE_COLS + [1, self.ngenomes] #list(self.bitsum_index)
+        #return TABIX_COLS + list(self.bitsum_index)
 
     @property
     def gene_tabix_types(self):
        r = {"start" : int, "end" : int}
-        for i in self.bitsum_index:
+        for i in [1, self.ngenomes]: # self.bitsum_index:
            r[i] = int
        return r #TABIX_COLS + list(self.bitsum_index)
 
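A hypothetical stand-in for the two properties above, just to show the column layout they produce for an index over 4 genomes. `FakeIndex` and the genome count are invented for illustration; `1` and `ngenomes` appear to replace the old "unique"/"universal" columns (k-mers in exactly one genome vs. in all genomes).

```python
GENE_COLS = ["chr", "start", "end", "name"]

class FakeIndex:
    ngenomes = 4  # assumed genome count for the example

    @property
    def gene_tabix_cols(self):
        # gene BED-like columns plus the two tracked conservation levels
        return GENE_COLS + [1, self.ngenomes]

    @property
    def gene_tabix_types(self):
        r = {"start": int, "end": int}
        for i in [1, self.ngenomes]:
            r[i] = int
        return r

idx = FakeIndex()
print(idx.gene_tabix_cols)   # ['chr', 'start', 'end', 'name', 1, 4]
print(idx.gene_tabix_types)  # {'start': <class 'int'>, 'end': <class 'int'>, 1: <class 'int'>, 4: <class 'int'>}
```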
@@ -562,20 +563,21 @@ def seq_len(self, seq_name):
         return self.sizes.loc[seq_name]
 
     def _iter_gff(self):
-        gffattr = lambda df,name: df["attr"].str.extract(f"{name}=([^;]+)", re.IGNORECASE)
+        gffattr = lambda df,name: df["attr"].str.extract(f"{name}=([^;]+)", re.IGNORECASE)[0]
         for df in pd.read_csv(
                 self.gff,
                 sep="\t", comment="#", chunksize=10000,
                 names=["chr","source","type","start","end","score","strand","phase","attr"],
-                usecols=TABIX_COLS):
-
-            #df["name"] = pd.NA
-            #for attr in df,self.gff_attrs:
-            #    isna = df.index[df["name"].isna()]
-            #    if len(isna) > 0:
-            #        df.loc[isna,"name"] = gffattr(df.loc[isna], attr)
-            #    else:
-            #        break
+                usecols=GFF_COLS):
+
+            df["name"] = pd.NA
+            for attr in self.params["gff_name"]:
+                isna = df.index[df["name"].isna()]
+                if len(isna) > 0:
+                    names = gffattr(df.loc[isna], attr)
+                    df.loc[isna,"name"] = names
+                else:
+                    break
 
             yield df[TABIX_COLS]
 
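The new fallback loop can be exercised in isolation. This sketch mirrors it, with the attribute list hard-coded to the `gff_name` default `["Name","ID"]` and made-up attribute strings: rows with a `Name` use it, and only the rows still missing a name fall back to `ID`.

```python
import re
import pandas as pd

df = pd.DataFrame({
    "attr": [
        "ID=gene0001;Name=FOO1",   # has Name -> use it
        "ID=gene0002",             # no Name  -> fall back to ID
        "ID=gene0003;Name=BAR3",
    ]
})

gffattr = lambda df, name: df["attr"].str.extract(f"{name}=([^;]+)", re.IGNORECASE)[0]

df["name"] = pd.NA
for attr in ["Name", "ID"]:
    isna = df.index[df["name"].isna()]
    if len(isna) > 0:
        df.loc[isna, "name"] = gffattr(df.loc[isna], attr)
    else:
        break

print(df["name"].tolist())  # ['FOO1', 'gene0002', 'BAR3']
```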
@@ -595,8 +597,10 @@ def _write_anno_types(self):
             for t in self.gff_anno_types:
                 f.write(f"{t}\n")
 
-    def init_gff(self):
-        if pd.isna(self.gff): return
+    def init_gff(self, filename=None):
+        if filename is None:
+            filename = self.gff
+        if pd.isna(filename): return
 
         genes = list()
         annos = list()
@@ -660,9 +664,9 @@ def query(self, name, start=None, end=None, step=1):
         if end is None:
             end = self.seq_len(name)
 
-        pac = self._query_bytes(name, start, end, step, bstep)
+        pac = self._query_bytes(name, start, end-1, step, bstep)
         bits = self._bytes_to_bits(pac)
-        idx = np.arange(start, end+1, step, dtype=int) #[:len(bits)-1]
+        idx = np.arange(start, end, step, dtype=int) #[:len(bits)-1]
         df = pd.DataFrame(bits, index=idx, columns=self.genome_names)
         return df
 
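The coordinate change appears to make `end` exclusive, so the returned per-position index has exactly `end - start` rows instead of one extra. A quick check of the two `arange` forms:

```python
import numpy as np

start, end, step = 1000, 1010, 1

old_idx = np.arange(start, end + 1, step, dtype=int)  # 11 positions, one too many
new_idx = np.arange(start, end, step, dtype=int)      # 10 positions, matches end - start

print(len(old_idx), len(new_idx))  # 11 10
```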
@@ -720,14 +724,14 @@ def query_genes(self, chrom=None, start=None, end=None): #, attrs=["name","id"]):
         rows = list(rows)
         ret = pd.DataFrame(rows, columns=self.gene_tabix_cols).astype(self.gene_tabix_types)
 
-        if "attr" in ret.columns:
-            attr = lambda a: ret["attr"].str.extract(f"{a}=([^;]+)", re.IGNORECASE)
-            names = attr("Name")
-            ids = attr("ID")
-            names[names.isna()] = ids[names.isna()]
-            ret["name"] = names
-        else:
-            ret["name"] = ""
+        # if "attr" in ret.columns:
+        #     attr = lambda a: ret["attr"].str.extract(f"{a}=([^;]+)", re.IGNORECASE)
+        #     names = attr("Name")
+        #     ids = attr("ID")
+        #     names[names.isna()] = ids[names.isna()]
+        #     ret["name"] = names
+        # else:
+        #     ret["name"] = ""
 
         return ret
 
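Illustrative only: since the gene tabix index now carries a `name` column directly (plus the `1`/`ngenomes` counts), `query_genes` can build its frame straight from the tabix rows without parsing attributes. Column labels and values below are invented (ngenomes = 4).

```python
import pandas as pd

gene_tabix_cols  = ["chr", "start", "end", "name", 1, 4]
gene_tabix_types = {"start": int, "end": int, 1: int, 4: int}

# tabix returns rows of strings; astype() restores the integer columns
rows = [
    ["chr1", "1000", "2000", "FOO1", "120", "310"],
    ["chr1", "5000", "7500", "BAR2", "40", "900"],
]
ret = pd.DataFrame(rows, columns=gene_tabix_cols).astype(gene_tabix_types)
print(ret)
```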
@@ -789,6 +793,38 @@ def _write_bitmap(self, name, seq):
 
         return self._bytes_to_bits(arrs[1])
 
+    def run_annotate(self, gff_file=None, logfile=None):
+        logging.basicConfig(
+            filename=logfile, level=logging.INFO,
+            format='[ %(asctime)s %(levelname)7s ] %(message)s',
+            datefmt="%Y-%m-%d %H:%M:%S"
+        )
+
+        gene_df = self.init_gff(gff_file)
+
+        #for chrom,df in gene_df.groupby(level="chr"):
+        for chrom in gene_df.index.unique("chr"):
+            df = gene_df.loc[chrom]
+            st = df.index.get_level_values("start").min()
+            en = min(self.sizes[chrom], df.index.get_level_values("end").max())
+
+            # per-position conservation: number of genomes containing each k-mer
+            bitsum = self.query(chrom, st, en).sum(axis=1)
+
+            for start,end in df.index:
+                if end <= start or start < 0 or end-st > len(bitsum):
+                    logger.warning(f"Skipping gene at {chrom}:{start}-{end}, coordinates out-of-bounds")
+                    continue
+                # tally how many positions of the gene fall at each conservation level
+                occ, counts = np.unique(bitsum[start-st:end-st], return_counts=True)
+                gene_df.loc[(chrom,start,end),occ] += counts
+
+        self.bitsum_genes = gene_df.groupby("chr", sort=False)[self.bitsum_index].sum() #.sort_index()
+        self.bitsum_genes.to_csv(self.chr_genes_fname, sep="\t")
+
+        gene_tabix = gene_df.reset_index()[self.gene_tabix_cols]
+        print(gene_tabix)
+        self._write_tabix(gene_tabix, "gene")
+
+
 
     def run_anchor(self, bitvecs, logfile=None):
         logging.basicConfig(
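A standalone sketch of the per-gene tally at the heart of `run_annotate`: the bitsum (number of genomes containing the k-mer at each position) is sliced to the gene's coordinates and summarized with `np.unique`. All values here are made up; in the real method the resulting counts are accumulated into `gene_df` columns keyed by conservation level and then written to the gene tabix index.

```python
import numpy as np

st = 1000  # start of the queried window
# stands in for self.query(chrom, st, en).sum(axis=1): number of genomes
# containing the k-mer at each position st..en-1
bitsum = np.array([1, 1, 4, 4, 4, 2, 1, 4, 4, 3])

start, end = 1002, 1009  # one gene inside the window
occ, counts = np.unique(bitsum[start - st:end - st], return_counts=True)
print({int(k): int(v) for k, v in zip(occ, counts)})  # {1: 1, 2: 1, 4: 5}
```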