Skip to content

Commit d684205

Browse files
committed
Added C++ anchoring code
1 parent 133f20d commit d684205

File tree

5 files changed

+557
-0
lines changed

5 files changed

+557
-0
lines changed

cpp/Makefile

+51
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
CC=gcc
2+
CFLAGS=-Wall -std=c++11 -g -fPIC -fopenmp -O3 $(FLAGS)
3+
4+
LIBS=-lstdc++ -lz -ldl -pthread -lbz2 -llzma -lm
5+
INCLUDE=./htslib
6+
7+
SRC=.
8+
BUILD=build
9+
10+
#ANCHOR_BIN=anchor
11+
#SW_BIN = $(BIN)/sw
12+
13+
KMC_API_DIR = ../KMC/kmc_api
14+
KMC_DUMP_DIR = ../KMC/kmc_dump
15+
16+
KMC_UTIL_OBJS = \
17+
$(KMC_DUMP_DIR)/nc_utils.o
18+
19+
KMC_API_OBJS = \
20+
$(KMC_API_DIR)/mmer.o \
21+
$(KMC_API_DIR)/kmc_file.o \
22+
$(KMC_API_DIR)/kmer_api.o
23+
24+
HTS_LIB=-L./htslib ./htslib/libhts.a
25+
26+
_OBJS=anchor.o
27+
OBJS = $(patsubst %, $(BUILD)/%, $(_OBJS))
28+
29+
DEPENDS := $(patsubst %.o, %.d, $(OBJS))
30+
31+
all: dirs anchor
32+
33+
anchor: build/anchor.o
34+
$(CC) $(KMC_API_OBJS) $(CFLAGS) $< -o $@ -lz $(HTS_LIB) $(LIBS)
35+
36+
-include $(DEPENDS)
37+
38+
$(BUILD)/%.o: $(SRC)/%.cpp
39+
$(CC) $(CFLAGS) -MMD -MP -c $< -o $@ -I$(INCLUDE)
40+
41+
.PHONY: dirs
42+
dirs: $(BUILD)/ #$(BIN)/
43+
44+
$(BUILD)/:
45+
mkdir -p $@
46+
47+
#$BIN)/:
48+
# mkdir -p $@
49+
50+
clean:
51+
rm -rf $(BUILD)

cpp/README.md

+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# Installation
2+
3+
Requires OpenMP, htslib (shown below), and compiled KMC
4+
5+
```
6+
#build KMC
7+
make -C ../KMC
8+
9+
#download and build htslib
10+
git clone https://github.com/samtools/htslib.git
11+
cd htslib
12+
autoreconf -i # Build the configure script and install files it uses
13+
./configure # Optional but recommended, for choosing extra functionality
14+
make
15+
make install
16+
17+
#compile anchor binary
18+
cd ..
19+
make anchor
20+
```
21+
22+
# Usage
23+
```
24+
panagram index --prepare samples.tsv
25+
cp ~/panagram/cpp/Snakefile . #copy Snakefile from panagram/cpp
26+
snakemake -c <cores> .
27+
```

cpp/Snakefile

+162
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
import pandas as pd
2+
from panagram.index import Index
3+
from panagram.index import EXTRA_DIR
4+
5+
configfile: "config.yaml"
6+
7+
index = Index(".", mode="w")
8+
index.load_config()
9+
10+
SAMPLES = index.samples #pd.read_table(config["samples"]).set_index("name")
11+
12+
TMPDIR = f"tmp/"
13+
KMC_EXTS = ["kmc_pre","kmc_suf"]
14+
ANCHOR_BIN="./run_anchor"
15+
16+
def get_fasta(wildcards):
17+
f = SAMPLES.loc[wildcards.sample, "fasta"]
18+
return f
19+
20+
def get_fai(wildcards):
21+
return get_fasta(wildcards)+".fai"
22+
23+
def get_genome_id(wildcards):
24+
f = SAMPLES.loc[wildcards.sample, "id"]
25+
return f
26+
27+
def get_onehot_tag(wildcards):
28+
i = SAMPLES.index.get_loc(wildcards.sample) % 32
29+
return 1 << i
30+
31+
def get_anchor_mb(wc,input):
32+
kmc_bytes = sum([f.size for f in input.kmc_pre+input.kmc_suf])
33+
fa_bytes = input.fasta.size
34+
return 1000+(1.5*kmc_bytes + fa_bytes*len(input.kmc_pre))/(10**6)
35+
36+
rule anchors:
37+
input:
38+
kmc_pre=expand("kmc/bitvec{i}.kmc_pre", i=range(index.kmc_bitvec_count)),
39+
kmc_suf=expand("kmc/bitvec{i}.kmc_suf", i=range(index.kmc_bitvec_count)),
40+
output:
41+
expand("anchor/{sample}/chrs.tsv", sample=list(SAMPLES.index)),
42+
log:
43+
log="logs/anchor.log.txt"
44+
benchmark:
45+
"logs/anchor.benchmark.txt"
46+
#resources:
47+
# mem_mb=get_anchor_mb,
48+
threads: workflow.cores
49+
run:
50+
args = list()
51+
for name,fasta in SAMPLES["fasta"].items():
52+
args += [name,fasta]
53+
shell("mkdir -p anchor/{name}")
54+
shell("touch anchor/{name}/chrs.tsv")
55+
ngenomes = len(SAMPLES)
56+
shell(f"OMP_NUM_THREADS={{workflow.cores}} {ANCHOR_BIN} {ngenomes} . " + " ".join(args))
57+
58+
def get_bitvec_mb(wc,input):
59+
dbsize = max([d.size for d in input.dbs])
60+
return 1000 + dbsize/(10**6)
61+
62+
rule kmc_bitvec:
63+
input:
64+
opdef="kmc/opdef{i}.txt",
65+
dbs=expand("kmc/{sample}.onehot.{ext}", sample=list(SAMPLES.index), ext=KMC_EXTS)
66+
output:
67+
expand("kmc/bitvec{{i}}.{ext}", ext=KMC_EXTS)
68+
log:
69+
"logs/kmc.bitvec{i}.log.txt"
70+
benchmark:
71+
"logs/kmc.bitvec{i}.benchmark.txt"
72+
resources:
73+
mem_mb=get_bitvec_mb,
74+
threads: 2
75+
shell:
76+
f"{EXTRA_DIR}/kmc_tools complex {{input.opdef}} > {{log}} 2>&1"
77+
#kmc/opdef{{wildcards.i}}.txt
78+
79+
80+
rule opdefs:
81+
input:
82+
expand("kmc/{sample}.onehot.{ext}", sample=list(SAMPLES.index), ext=KMC_EXTS)
83+
output:
84+
expand("kmc/opdef{i}.txt", i=range(index.kmc_bitvec_count))
85+
benchmark:
86+
"logs/kmc.opdef.benchmark.txt"
87+
run:
88+
index.init_opdefs()
89+
90+
def get_kmc_mb(wc,input):
91+
dbsize = sum([d.size for d in input.dbs])
92+
config["kmc"]["memory"]*1000
93+
return 1000 + dbsize/(10**6)
94+
95+
rule kmc_count:
96+
input:
97+
get_fasta #"{fasta}"
98+
output:
99+
expand("kmc/{{sample}}.{db}.{ext}", db=["count","onehot"], ext=KMC_EXTS)
100+
params:
101+
tag = get_onehot_tag
102+
log:
103+
"logs/kmc.{sample}.txt"
104+
benchmark:
105+
"logs/kmc.{sample}.benchmark.txt"
106+
threads : config["kmc"]["threads"]
107+
resources:
108+
mem_mb=500+config["kmc"]["memory"]*1000,
109+
shell:
110+
f"mkdir -p {TMPDIR}{{wildcards.sample}}; "
111+
112+
f"{EXTRA_DIR}/kmc -k{{config[k]}} -t{{threads}} -m{{config[kmc][memory]}} "
113+
f"-ci1 -cs1000 -fm {{input}} kmc/{{wildcards.sample}}.count {TMPDIR}{{wildcards.sample}} "
114+
"> {log} 2>&1;"
115+
116+
f"{EXTRA_DIR}/kmc_tools -t{{threads}} transform kmc/{{wildcards.sample}}.count "
117+
f"set_counts {{params.tag}} kmc/{{wildcards.sample}}.onehot "
118+
">> {log} 2>&1;"
119+
120+
rule faidx:
121+
input:
122+
"{fasta}"
123+
output:
124+
"{fasta}.fai"
125+
log:
126+
"logs/faidx.{fasta}.log.txt"
127+
benchmark:
128+
"logs/faidx.{fasta}.benchmark.txt"
129+
shell:
130+
"samtools faidx {input} > {log} 2>&1"
131+
132+
rule mash_sample:
133+
input:
134+
get_fasta
135+
output:
136+
"tmp/{sample}.msh"
137+
log:
138+
"logs/mash.sketch.{sample}.log.txt"
139+
benchmark:
140+
"logs/mash.sketch.{sample}.benchmark.txt"
141+
shell:
142+
"{EXTRA_DIR}/mash "
143+
"sketch -C {wildcards.sample} -o tmp/{wildcards.sample}.msh -r -s 10000 {input} "
144+
"> {log} 2>&1;"
145+
146+
rule mash_triangle:
147+
input:
148+
expand("tmp/{sample}.msh", sample=SAMPLES.index)
149+
output:
150+
"genome_dist.tsv"
151+
log:
152+
"logs/mash.triangle.log.txt"
153+
benchmark:
154+
"logs/mash.triangle.benchmark.txt"
155+
shell:
156+
"{EXTRA_DIR}/mash "
157+
"triangle -C -E {input} > {output} 2> {log}"
158+
159+
rule all:
160+
input:
161+
"genome_dist.tsv",
162+
expand("anchor/{sample}/chrs.tsv", sample=index.anchor_genomes)

0 commit comments

Comments
 (0)