Skip to content

Commit b1f7eda

Browse files
committed
Added snakemake resource requirements and fixed annotation issue
1 parent 2e3cc90 commit b1f7eda

File tree

3 files changed

+66
-21
lines changed

3 files changed

+66
-21
lines changed

MANIFEST.in

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
recursive-include panagram *py *so extra* workflow*
1+
recursive-include panagram *py *so extra* panagram/workflow
22
include panagram/workflow/Snakefile
33
include panagram/extra/kmc
44
include panagram/extra/kmc_tools

panagram/index.py

+15-5
Original file line numberDiff line numberDiff line change
@@ -589,8 +589,11 @@ def _read_bitsum_bins(self):
589589
def seq_len(self, seq_name):
590590
return self.sizes.loc[seq_name]
591591

592-
def _gffattr(self,df,name):
593-
return df["attr"].str.extract(f"{name}=([^;]+)", re.IGNORECASE)[0]
592+
def _gffattr(self,df,name,fill=None):
593+
attr = df["attr"].str.extract(f"{name}=([^;]+)", re.IGNORECASE)[0]
594+
if fill is not None:
595+
return attr.fillna(df[fill])
596+
return attr
594597

595598
def _iter_gff(self):
596599
for df in pd.read_csv(
@@ -668,11 +671,11 @@ def _merge_dfs(dfs):
668671

669672
annos = _merge_dfs(annos)#.set_index("type").sort_index()
670673
genes = _merge_dfs(genes)
671-
genes["name"] = self._gffattr(genes, self.params["gff_name"]).fillna(genes["id"])
674+
genes["name"] = self._gffattr(genes, self.params["gff_name"], "id")
672675

673676
parents = self._gffattr(annos, "Parent")#.set_index(annos["id"]).dropna()
674677
anno_ids = annos.reset_index().dropna().set_index("id")["index"]
675-
gene_names = genes[["id","name"]].dropna().set_index("id")["name"]
678+
gene_names = genes[["id","name"]].set_index("id")["name"]
676679

677680
p = parents.isin(anno_ids.index)
678681

@@ -682,7 +685,14 @@ def _merge_dfs(dfs):
682685
p = parents.isin(anno_ids.index)
683686
n += 1
684687

685-
annos["name"] = gene_names.loc[parents].to_numpy()
688+
roots = pd.isna(parents)
689+
childs = ~roots
690+
print(gene_names)
691+
print(gene_names.isna().mean())
692+
print(parents[childs])
693+
print(gene_names.loc[parents[childs]].isna().mean())
694+
annos.loc[childs,"name"] = gene_names.loc[parents[childs]].to_numpy()
695+
annos.loc[roots,"name"] = self._gffattr(annos[roots], self.params["gff_name"], "id")
686696

687697
annos = annos[annos["type"] != "transcript"][TABIX_COLS].drop_duplicates()
688698

panagram/workflow/Snakefile

+50-15
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,11 @@ def get_onehot_tag(wildcards):
2727
i = SAMPLES.index.get_loc(wildcards.sample) % 32
2828
return 1 << i
2929

30+
def get_anchor_mb(wc,input):
31+
kmc_bytes = sum([f.size for f in input.kmc_pre+input.kmc_suf])
32+
fa_bytes = input.fasta.size
33+
return 1000+(1.5*kmc_bytes + fa_bytes*len(input.kmc_pre))/(10**6)
34+
3035
rule anchor:
3136
input:
3237
kmc_pre=expand("kmc/bitvec{i}.kmc_pre", i=range(index.kmc_bitvec_count)),
@@ -36,29 +41,51 @@ rule anchor:
3641
expand("anchor/{{sample}}/bitmap.{step}.{ext}", step=index.steps, ext=["gz","gzi"]),
3742
"anchor/{sample}/chrs.tsv",
3843
log:
39-
logfile="logs/anchor.{sample}.txt"
44+
log="logs/anchor.{sample}.log.txt"
45+
benchmark:
46+
"logs/anchor.{sample}.benchmark.txt"
47+
resources:
48+
mem_mb=get_anchor_mb,
4049
run:
41-
index[wildcards.sample].run_anchor(index.bitvec_prefixes, log.logfile)
50+
index[wildcards.sample].run_anchor(index.bitvec_prefixes, log.log)
51+
52+
def get_bitvec_mb(wc,input):
53+
dbsize = max([d.size for d in input.dbs])
54+
return 1000 + dbsize/(10**6)
4255

4356
rule kmc_bitvec:
4457
input:
45-
"kmc/opdef{i}.txt",
58+
opdef="kmc/opdef{i}.txt",
59+
dbs=expand("kmc/{sample}.onehot.{ext}", sample=list(SAMPLES.index), ext=KMC_EXTS)
4660
output:
4761
expand("kmc/bitvec{{i}}.{ext}", ext=KMC_EXTS)
4862
log:
49-
"logs/kmc.bitvec{i}.txt"
63+
"logs/kmc.bitvec{i}.log.txt"
64+
benchmark:
65+
"logs/kmc.bitvec{i}.benchmark.txt"
66+
resources:
67+
mem_mb=get_bitvec_mb,
68+
threads: 2
5069
shell:
51-
f"{EXTRA_DIR}/kmc_tools complex kmc/opdef{{wildcards.i}}.txt > {{log}} 2>&1"
70+
f"{EXTRA_DIR}/kmc_tools complex {{input.opdef}} > {{log}} 2>&1"
71+
#kmc/opdef{{wildcards.i}}.txt
5272

5373
rule opdefs:
5474
input:
5575
expand("kmc/{sample}.onehot.{ext}", sample=list(SAMPLES.index), ext=KMC_EXTS)
5676
output:
5777
expand("kmc/opdef{i}.txt", i=range(index.kmc_bitvec_count))
78+
benchmark:
79+
"logs/kmc.opdef.benchmark.txt"
5880
run:
5981
index.init_opdefs()
6082

61-
rule kmc_sample:
83+
def get_kmc_mb(wc,input):
84+
dbsize = sum([d.size for d in input.dbs])
85+
config["kmc"]["memory"]*1000
86+
return 1000 + dbsize/(10**6)
87+
88+
rule kmc_count:
6289
input:
6390
get_fasta #"{fasta}"
6491
output:
@@ -67,15 +94,19 @@ rule kmc_sample:
6794
tag = get_onehot_tag
6895
log:
6996
"logs/kmc.{sample}.txt"
97+
benchmark:
98+
"logs/kmc.{sample}.benchmark.txt"
7099
threads : config["kmc"]["threads"]
100+
resources:
101+
mem_mb=500+config["kmc"]["memory"]*1000,
71102
shell:
72103
f"mkdir -p {TMPDIR}{{wildcards.sample}}; "
73104

74105
f"{EXTRA_DIR}/kmc -k{{config[k]}} -t{{threads}} -m{{config[kmc][memory]}} "
75106
f"-ci1 -cs1000 -fm {{input}} kmc/{{wildcards.sample}}.count {TMPDIR}{{wildcards.sample}} "
76107
"> {log} 2>&1;"
77108

78-
f"{EXTRA_DIR}/kmc_tools -t4 transform kmc/{{wildcards.sample}}.count "
109+
f"{EXTRA_DIR}/kmc_tools -t{{threads}} transform kmc/{{wildcards.sample}}.count "
79110
f"set_counts {{params.tag}} kmc/{{wildcards.sample}}.onehot "
80111
">> {log} 2>&1;"
81112

@@ -85,36 +116,40 @@ rule faidx:
85116
output:
86117
"{fasta}.fai"
87118
log:
88-
"logs/faidx.{fasta}.txt"
119+
"logs/faidx.{fasta}.log.txt"
120+
benchmark:
121+
"logs/faidx.{fasta}.benchmark.txt"
89122
shell:
90123
"samtools faidx {input} > {log} 2>&1"
91124

92125
rule mash_sample:
93126
input:
94127
get_fasta
95128
output:
96-
"{sample}.msh"
129+
"tmp/{sample}.msh"
97130
log:
98-
"logs/mash.sketch.{sample}.txt"
131+
"logs/mash.sketch.{sample}.log.txt"
132+
benchmark:
133+
"logs/mash.sketch.{sample}.benchmark.txt"
99134
shell:
100135
"{EXTRA_DIR}/mash "
101-
"sketch -C {wildcards.sample} -o {wildcards.sample}.msh -r -s 10000 {input} "
136+
"sketch -C {wildcards.sample} -o tmp/{wildcards.sample}.msh -r -s 10000 {input} "
102137
"> {log} 2>&1;"
103138

104-
105139
rule mash_triangle:
106140
input:
107-
expand("{sample}.msh", sample=SAMPLES.index)
141+
expand("tmp/{sample}.msh", sample=SAMPLES.index)
108142
output:
109143
"genome_dist.tsv"
110144
log:
111-
"logs/mash.triangle.txt"
145+
"logs/mash.triangle.log.txt"
146+
benchmark:
147+
"logs/mash.triangle.benchmark.txt"
112148
shell:
113149
"{EXTRA_DIR}/mash "
114150
"triangle -C -E {input} > {output} 2> {log}"
115151

116152
rule all:
117153
input:
118154
"genome_dist.tsv",
119-
#index.anchor_filenames
120155
expand("anchor/{sample}/chrs.tsv", sample=index.anchor_genomes)

0 commit comments

Comments
 (0)