Skip to content

Commit 4d9d982

Browse files
committed
Wrote annotate command, storing only unique and univ genes, bitmap boundary bug fix
1 parent eb4e498 commit 4d9d982

File tree

3 files changed

+87
-34
lines changed

3 files changed

+87
-34
lines changed

panagram/__main__.py

+18-1
Original file line numberDiff line numberDiff line change
@@ -91,16 +91,33 @@ def run(self):
9191

9292
idx.close()
9393

94+
@dataclasses.dataclass
95+
class Annotate:
96+
"""(Re-)annotate an existing anchored genome using a GFF file """
97+
98+
index_dir: str = field(positional=True, metavar="index_dir")
99+
"""Panagram index directory"""
100+
101+
genome: str = field(positional=True, metavar="genome_name")
102+
103+
gff_file: str = field(positional=True, metavar="gff_file")
104+
105+
def run(self):
106+
idx = Index(self.index_dir)
107+
idx[self.genome].run_annotate(self.gff_file)
108+
idx.close()
109+
94110
@dataclasses.dataclass
95111
class Main:
96112
"""Alignment-free pan-genome viewer
97113
98114
Subcommands:
99115
index Anchor KMC bitvectors to reference FASTA files
100116
view Display panagram viewer in a browser window
117+
annotate Create or replace GFF annotation for anchored genome
101118
bitdump Query pan-kmer bitmap via the commandline"""
102119

103-
cmd: Union[View, Index, Bitdump]
120+
cmd: Union[View, Index, Bitdump, Annotate]
104121
cprof: str = field(default=None, help=argparse.SUPPRESS)
105122

106123
def run(self):

panagram/index.py

+64-28
Original file line numberDiff line numberDiff line change
@@ -49,10 +49,10 @@ def init_logger(logfile):
4949
IDX_SUFFIX = "gzi"
5050
ANCHOR_DIR = "anchor"
5151

52-
TABIX_COLS = ["chr","start","end","type","attr"]
53-
#TABIX_COLS = ["chr","start","end","type","name"]
52+
GFF_COLS = ["chr","start","end","type","attr"]
53+
TABIX_COLS = ["chr","start","end","type","name"]
5454
TABIX_TYPES = {"start" : int, "end" : int}
55-
GENE_TABIX_COLS = TABIX_COLS + ["unique","universal"]
55+
GENE_COLS = ["chr","start","end","name"]#,"unique","universal"]
5656
GENE_TABIX_TYPES = {"start" : int, "end" : int, "unique" : int, "universal" : int}
5757
TABIX_SUFFIX = ".gz"
5858

@@ -95,7 +95,7 @@ class Index(Serializable):
9595

9696
gff_gene_types: List[str] = field(default_factory=lambda: ["gene"], help="GFF features to store locations and conservation scores")
9797
gff_anno_types: List[str] = field(default=None, help="GFF features of which to store locations, but not conservation scores")
98-
gff_attrs: List[str] = field(default_factory=lambda: ["ID","Name"], help="GFF attributes to store in annotation indexes")
98+
gff_name: List[str] = field(default_factory=lambda: ["Name","ID"], help="GFF attributes to store in annotation indexes")
9999

100100
#Subset of genome IDs to generate anchor genomes for. Will use all genomes as anchors if not specified
101101
anchor_genomes: List[str] = field(default=None)
@@ -473,12 +473,13 @@ def bitsum_index(self):
473473

474474
@property
475475
def gene_tabix_cols(self):
476-
return TABIX_COLS + list(self.bitsum_index)
476+
return GENE_COLS + [1,self.ngenomes]#list(self.bitsum_index)
477+
#return TABIX_COLS + list(self.bitsum_index)
477478

478479
@property
479480
def gene_tabix_types(self):
480481
r = {"start" : int, "end" : int}
481-
for i in self.bitsum_index:
482+
for i in [1,self.ngenomes]:#self.bitsum_index:
482483
r[i] = int
483484
return r#TABIX_COLS + list(self.bitsum_index)
484485

@@ -562,20 +563,21 @@ def seq_len(self, seq_name):
562563
return self.sizes.loc[seq_name]
563564

564565
def _iter_gff(self):
565-
gffattr = lambda df,name: df["attr"].str.extract(f"{name}=([^;]+)", re.IGNORECASE)
566+
gffattr = lambda df,name: df["attr"].str.extract(f"{name}=([^;]+)", re.IGNORECASE)[0]
566567
for df in pd.read_csv(
567568
self.gff,
568569
sep="\t", comment="#", chunksize=10000,
569570
names = ["chr","source","type","start","end","score","strand","phase","attr"],
570-
usecols = TABIX_COLS):
571-
572-
#df["name"] = pd.NA
573-
#for attr in df,self.gff_attrs:
574-
# isna = df.index[df["name"].isna()]
575-
# if len(isna) > 0:
576-
# df.loc[isna,"name"] = gffattr(df.loc[isna], attr)
577-
# else:
578-
# break
571+
usecols = GFF_COLS):
572+
573+
df["name"] = pd.NA
574+
for attr in self.params["gff_name"]:
575+
isna = df.index[df["name"].isna()]
576+
if len(isna) > 0:
577+
names = gffattr(df.loc[isna], attr)
578+
df.loc[isna,"name"] = names
579+
else:
580+
break
579581

580582
yield df[TABIX_COLS]
581583

@@ -595,8 +597,10 @@ def _write_anno_types(self):
595597
for t in self.gff_anno_types:
596598
f.write(f"{t}\n")
597599

598-
def init_gff(self):
599-
if pd.isna(self.gff): return
600+
def init_gff(self, filename=None):
601+
if filename is None:
602+
filename = self.gff
603+
if pd.isna(filename): return
600604

601605
genes = list()
602606
annos = list()
@@ -660,9 +664,9 @@ def query(self, name, start=None, end=None, step=1):
660664
if end is None:
661665
end = self.seq_len(name)
662666

663-
pac = self._query_bytes(name, start, end, step, bstep)
667+
pac = self._query_bytes(name, start, end-1, step, bstep)
664668
bits = self._bytes_to_bits(pac)
665-
idx = np.arange(start, end+1, step, dtype=int)#[:len(bits)-1]
669+
idx = np.arange(start, end, step, dtype=int)#[:len(bits)-1]
666670
df = pd.DataFrame(bits, index=idx, columns=self.genome_names)
667671
return df
668672

@@ -720,14 +724,14 @@ def query_genes(self, chrom=None, start=None, end=None):#, attrs=["name","id"]):
720724
rows = list(rows)
721725
ret = pd.DataFrame(rows, columns=self.gene_tabix_cols).astype(self.gene_tabix_types)
722726

723-
if "attr" in ret.columns:
724-
attr = lambda a: ret["attr"].str.extract(f"{a}=([^;]+)", re.IGNORECASE)
725-
names = attr("Name")
726-
ids = attr("ID")
727-
names[names.isna()] = ids[names.isna()]
728-
ret["name"] = names
729-
else:
730-
ret["name"] = ""
727+
#if "attr" in ret.columns:
728+
# attr = lambda a: ret["attr"].str.extract(f"{a}=([^;]+)", re.IGNORECASE)
729+
# names = attr("Name")
730+
# ids = attr("ID")
731+
# names[names.isna()] = ids[names.isna()]
732+
# ret["name"] = names
733+
#else:
734+
# ret["name"] = ""
731735

732736
return ret
733737

@@ -789,6 +793,38 @@ def _write_bitmap(self, name, seq):
789793

790794
return self._bytes_to_bits(arrs[1])
791795

796+
def run_annotate(self, gff_file=None, logfile=None):
797+
logging.basicConfig(
798+
filename=logfile, level=logging.INFO,
799+
format='[ %(asctime)s %(levelname)7s ] %(message)s',
800+
datefmt="%Y-%m-%d %H:%M:%S"
801+
)
802+
803+
gene_df = self.init_gff(gff_file)
804+
805+
#for chrom,df in gene_df.groupby(level="chr"):
806+
for chrom in gene_df.index.unique("chr"):
807+
df = gene_df.loc[chrom]
808+
st = df.index.get_level_values("start").min()
809+
en = min(self.sizes[chrom],df.index.get_level_values("end").max())
810+
811+
bitsum = self.query(chrom, st, en).sum(axis=1)
812+
813+
for start,end in df.index:
814+
if end <= start or start < 0 or end-st > len(bitsum):
815+
logger.warning(f"Skipping gene at {chrom}:{start}-{end}, coordinates out-of-bounds")
816+
continue
817+
occ, counts = np.unique(bitsum[start-st:end-st], return_counts=True)
818+
gene_df.loc[(chrom,start,end),occ] += counts
819+
820+
self.bitsum_genes = gene_df.groupby("chr",sort=False)[self.bitsum_index].sum()#.sort_index()
821+
self.bitsum_genes.to_csv(self.chr_genes_fname, sep="\t")
822+
823+
gene_tabix = gene_df.reset_index()[self.gene_tabix_cols]
824+
print(gene_tabix)
825+
self._write_tabix(gene_tabix, "gene")
826+
827+
792828

793829
def run_anchor(self, bitvecs, logfile=None):
794830
logging.basicConfig(

panagram/view.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -665,9 +665,9 @@ def plot_interactive(n_skips, bitmap_counts, plot_rep, plot_gene, start_coord, e
665665
adjusted_bin_size = (bin_size/n_skips)
666666

667667
#Here we are filling in the bins for the main figure.
668-
bin_size_int = n_skips*(int(bin_size) // n_skips)+1
668+
bin_size_int = n_skips*(int(bin_size) // n_skips)#+1
669669

670-
adjusted_bin_size_init = int(adjusted_bin_size) + 1
670+
adjusted_bin_size_init = int(adjusted_bin_size) #+ 1
671671

672672
#t2 = time.perf_counter()
673673
#print(f"\tFigure init {t2 - t_start:0.4f} seconds")
@@ -1142,7 +1142,7 @@ def make_gene_per_genome_fig(anchor_name):
11421142
genes = index.query_genes(anchor_name)
11431143
tib = time.perf_counter()
11441144
sys.stderr.write(f"Queried genes 3 ({tib-tic})\n")
1145-
genes["name"] = genes["attr"].str.extract("Name=([^;]+)")
1145+
#genes["name"] = genes["attr"].str.extract("Name=([^;]+)")
11461146
genes["size"] = genes["end"] - genes["start"]
11471147

11481148
x = [i for i in range(0, len(genes))]
@@ -1569,8 +1569,8 @@ def update_gene_locals(local_gene_list, chrs, start_coord, end_coord, anchor_nam
15691569
printme = ""
15701570
elif len(local_gene_list)==1:
15711571
printme += "Genes: "
1572-
printme += local_gene_list['name'].iloc[0] + ": "
1573-
printme += local_gene_list['attr'].iloc[0] #index.query_anno(anchor_name, chrs, start_coord, end_coord)['attr']
1572+
printme += local_gene_list['name'].iloc[0] #+ ": "
1573+
#printme += local_gene_list['attr'].iloc[0] #index.query_anno(anchor_name, chrs, start_coord, end_coord)['attr']
15741574
elif len(local_gene_list)<=25:
15751575
printme += "Genes: "
15761576
print(local_gene_list)

0 commit comments

Comments
 (0)