Skip to content

Commit 87d8f9c

Browse files
committed
Added paircounts
1 parent 4d9d982 commit 87d8f9c

File tree

2 files changed

+161
-164
lines changed

2 files changed

+161
-164
lines changed

panagram/index.py

+45-9
Original file line number | Diff line number | Diff line change
@@ -381,9 +381,39 @@ def anchor_filenames(self):
381381
ret += g.anchor_filenames
382382
return ret
383383

384+
def bitmap_to_bins(self, bitmap, binlen):
    """Summarise a bitmap into per-bin pan-count and pair-count tables.

    Args:
        bitmap: DataFrame with an integer index. Presumably one 0/1
            column per genome -- verify against the caller.
        binlen: Bin width; each row is assigned to bin ``index // binlen``.

    Returns:
        A ``(pancount_bins, paircount_bins)`` tuple of DataFrames.

        ``pancount_bins`` has one row per label in ``self.bitsum_index``
        and one column per bin number. Each cell counts the bitmap rows
        in that bin whose row sum equals the row label; combinations
        that never occur are 0.

        ``paircount_bins`` has one row per bitmap column and one column
        per bin start (``bin * binlen``). Each cell is that column's sum
        within the bin, divided by the row's maximum over all bins. A row
        whose maximum is 0 divides 0 by 0 and therefore comes out as NaN.
    """
    # Relabel every row with its bin number so rows can be grouped per bin.
    binned = bitmap.set_index(bitmap.index // binlen)

    # Count (bin, row-sum) combinations, then pivot to row-sum x bin.
    pancounts = binned.sum(axis=1).reset_index().value_counts()
    pancount_bins = pancounts.unstack(level=1, fill_value=0).T
    pancount_bins = pancount_bins.reindex(self.bitsum_index, fill_value=0)

    # Per-bin column totals, relabelled by bin start and transposed so
    # each original bitmap column becomes a row.
    paircount_bins = binned.groupby(level=0).sum()
    paircount_bins = paircount_bins.set_index(paircount_bins.index * binlen).T

    # Scale each row by its own maximum.
    paircount_bins = paircount_bins.div(paircount_bins.max(axis=1), axis=0)

    return pancount_bins, paircount_bins
399+
400+
def bitmap_to_pancount(self, bitmap):
    """Return the per-row sum of ``bitmap`` as a Series.

    Args:
        bitmap: DataFrame whose values are summed across columns.

    Returns:
        A Series of row sums that keeps ``bitmap``'s index.
    """
    # Sum on the raw NumPy array, then reattach the original index.
    row_totals = bitmap.to_numpy().sum(axis=1)
    return pd.Series(row_totals, index=bitmap.index)
402+
403+
def bitmap_to_paircount(self, bitmap):
    """Return the per-row sum of ``bitmap`` as a Series.

    Args:
        bitmap: DataFrame whose values are summed across columns.

    Returns:
        A Series of row sums that keeps ``bitmap``'s index.
    """
    # NOTE(review): this computes exactly the same row sums as
    # bitmap_to_pancount -- confirm that is the intended pair count.
    row_totals = bitmap.to_numpy().sum(axis=1)
    return pd.Series(row_totals, index=bitmap.index)
405+
406+
def pancount_to_bins(self, pancnts, binlen):
    """Tabulate how often each count value occurs in each bin.

    Args:
        pancnts: Series of counts with an integer index.
        binlen: Bin width; each entry is assigned to bin
            ``index // binlen``.

    Returns:
        A DataFrame with one row per label in ``self.bitsum_index`` and
        one column per bin number. Each cell is the number of entries in
        that bin whose count equals the row label; combinations that
        never occur are 0.
    """
    # Pair every count with its bin number and count the combinations.
    bin_counts = pd.DataFrame({
        "count": pancnts.to_numpy(),
        "bin": pancnts.index // binlen,
    }).value_counts(sort=False)

    # Pivot the "bin" level into columns, then align the rows.
    by_bin = bin_counts.unstack(level=1, fill_value=0)
    return by_bin.reindex(self.bitsum_index, fill_value=0)
412+
384413

385414
class Genome:
386415
def __init__(self, idx, id, name, fasta=None, gff=None, anchor=None, write=False):
416+
self.index = idx
387417
self.samples = idx.samples
388418
self.params = idx.params
389419
self.prefix = os.path.join(self.params["prefix"], ANCHOR_DIR, name)
@@ -586,11 +616,11 @@ def _init_anno_types(self):
586616
self.gff_anno_types = set(self.params["gff_anno_types"])
587617
return
588618

589-
if os.path.exists(self.anno_types_fname):
619+
if os.path.exists(self.anno_types_fname) and not self.write_mode:
590620
with open(self.anno_types_fname) as f:
591621
self.gff_anno_types = {l.strip() for l in f}
592622
else:
593-
self.gff_anno_types = set()
623+
self.gff_anno_types = None
594624

595625
def _write_anno_types(self):
596626
with open(self.anno_types_fname, "w") as f:
@@ -602,15 +632,20 @@ def init_gff(self, filename=None):
602632
filename = self.gff
603633
if pd.isna(filename): return
604634

635+
if self.params["gff_anno_types"] is not None:
636+
gff_anno_types = set(self.params["gff_anno_types"])
637+
else:
638+
gff_anno_types = None
639+
605640
genes = list()
606641
annos = list()
607642

608643
for df in self._iter_gff():
609644
gmask = df["type"].isin(self.params["gff_gene_types"])
610645
genes.append(df[gmask])
611646

612-
if self.gff_anno_types is not None:
613-
annos.append(df[df["type"].isin(self.gff_anno_types)])
647+
if gff_anno_types is not None:
648+
annos.append(df[df["type"].isin(gff_anno_types)])
614649
else:
615650
annos.append(df[~gmask])
616651

@@ -620,10 +655,10 @@ def _merge_dfs(dfs):
620655
annos = _merge_dfs(annos)
621656
self._write_tabix(annos, "anno")
622657

623-
if self.gff_anno_types is None:
658+
if gff_anno_types is None:
624659
self.gff_anno_types = set(annos["type"].unique())
625660
else:
626-
self.gff_anno_types = self.gff_anno_types.intersection(annos["type"])
661+
self.gff_anno_types = gff_anno_types.intersection(annos["type"])
627662
self._write_anno_types()
628663

629664
genes = _merge_dfs(genes)
@@ -666,7 +701,8 @@ def query(self, name, start=None, end=None, step=1):
666701

667702
pac = self._query_bytes(name, start, end-1, step, bstep)
668703
bits = self._bytes_to_bits(pac)
669-
idx = np.arange(start, end, step, dtype=int)#[:len(bits)-1]
704+
#idx = np.arange(start, end, step, dtype=int)#[:len(bits)-1]
705+
idx = pd.RangeIndex(start, end, step)
670706
df = pd.DataFrame(bits, index=idx, columns=self.genome_names)
671707
return df
672708

@@ -821,7 +857,6 @@ def run_annotate(self, gff_file=None, logfile=None):
821857
self.bitsum_genes.to_csv(self.chr_genes_fname, sep="\t")
822858

823859
gene_tabix = gene_df.reset_index()[self.gene_tabix_cols]
824-
print(gene_tabix)
825860
self._write_tabix(gene_tabix, "gene")
826861

827862

@@ -873,7 +908,8 @@ def run_anchor(self, bitvecs, logfile=None):
873908
t = time()
874909

875910
if self.annotated:
876-
self._write_tabix(gene_df.reset_index(), "gene")
911+
gene_tabix = gene_df.reset_index()[self.gene_tabix_cols]
912+
self._write_tabix(gene_tabix, "gene")
877913
self.bitsum_genes = gene_df.groupby("chr",sort=False)[self.bitsum_index].sum()#.sort_index()
878914
self.bitsum_genes.to_csv(self.chr_genes_fname, sep="\t")
879915

0 commit comments

Comments
 (0)