kjenike
diff --git a/‎cpp/Makefile
+51 b/‎cpp/Makefile
+51
diff --git a/‎cpp/README.md
+27 b/‎cpp/README.md
+27
diff --git a/‎cpp/Snakefile
+162 b/‎cpp/Snakefile
+162
@@ -0,0 +1,51 @@
+# Compiler driver used for both compiling and linking. Extra flags can be
+# appended from the command line through FLAGS.
+CC=gcc
+CFLAGS=-Wall -std=c++11 -g -fPIC -fopenmp -O3 $(FLAGS)
+
+# Libraries for the link step and the include path for the htslib headers.
+LIBS=-lstdc++ -lz -ldl -pthread -lbz2 -llzma -lm
+INCLUDE=./htslib
+
+# Source directory and the directory that receives object/dependency files.
+SRC=.
+BUILD=build
+
+#ANCHOR_BIN=anchor
+#SW_BIN = $(BIN)/sw
+
+# Locations of the KMC object files, relative to this directory.
+KMC_API_DIR = ../KMC/kmc_api
+KMC_DUMP_DIR = ../KMC/kmc_dump
+
+KMC_UTIL_OBJS = $(addprefix $(KMC_DUMP_DIR)/,nc_utils.o)
+
+KMC_API_OBJS = $(addprefix $(KMC_API_DIR)/,mmer.o kmc_file.o kmer_api.o)
+
+# Static htslib archive, expected at ./htslib/libhts.a.
+HTS_LIB=-L./htslib ./htslib/libhts.a
+
+# Objects are listed by bare name and then mapped into $(BUILD).
+_OBJS=anchor.o
+OBJS = $(addprefix $(BUILD)/,$(_OBJS))
+
+# One .d file per object, matching what -MMD writes next to each object.
+DEPENDS := $(OBJS:.o=.d)
+
+# 'all', 'dirs' and 'clean' are commands, not files; declaring them phony keeps
+# a stray file of the same name from masking them.
+.PHONY: all dirs clean
+
+# Default goal: make sure the build directory exists, then build the binary.
+all: dirs anchor
+
+# Link the anchor binary from its object file, the KMC API objects and the
+# static htslib archive. The KMC objects are not prerequisites of this rule,
+# so they must already exist (see README.md: `make -C ../KMC`).
+anchor: $(BUILD)/anchor.o
+	$(CC) $(KMC_API_OBJS) $(CFLAGS) $< -o $@ -lz $(HTS_LIB) $(LIBS)
+
+# Pull in the header dependencies written by -MMD on earlier builds; the
+# leading '-' keeps the first build quiet when no .d files exist yet.
+-include $(DEPENDS)
+
+# Compile one source file and emit its .d dependency file next to the object.
+# The output directory is created here so that `make anchor` works on a clean
+# tree and under `make -j`, without relying on the 'dirs' target having run.
+$(BUILD)/%.o: $(SRC)/%.cpp
+	@mkdir -p $(@D)
+	$(CC) $(CFLAGS) -MMD -MP -c $< -o $@ -I$(INCLUDE)
+
+# Kept for callers that invoke `make dirs` directly.
+dirs: $(BUILD)/
+
+$(BUILD)/:
+	mkdir -p $@
+
+# Remove the build directory (objects and dependency files).
+clean:
+	rm -rf $(BUILD)
@@ -0,0 +1,27 @@
+# Installation
+
+Requires OpenMP, htslib (build steps shown below), and a compiled copy of KMC.
+
+```
+#build KMC
+make -C ../KMC
+
+#download and build htslib
+git clone https://github.com/samtools/htslib.git
+cd htslib
+autoreconf -i  # Build the configure script and install files it uses
+./configure    # Optional but recommended, for choosing extra functionality
+make
+make install
+
+#compile anchor binary
+cd ..
+make anchor
+```
+
+# Usage
+```
+panagram index --prepare samples.tsv
+cp ~/panagram/cpp/Snakefile . #copy Snakefile from panagram/cpp
+snakemake -c <cores> .
+```
@@ -0,0 +1,162 @@
+import pandas as pd
+from panagram.index import Index
+from panagram.index import EXTRA_DIR
+
+# Workflow settings are read from config.yaml and exposed as `config`.
+configfile: "config.yaml"
+
+# Construct the panagram Index for "." with mode="w" and call load_config();
+# attributes of this object (samples, kmc_bitvec_count, anchor_genomes) are
+# used by the rules below.
+index = Index(".", mode="w")
+index.load_config()
+
+# Sample table. It is accessed below via .loc[name, "fasta"/"id"], .index and
+# ["fasta"], so presumably a pandas DataFrame indexed by sample name -- TODO confirm.
+SAMPLES = index.samples #pd.read_table(config["samples"]).set_index("name")
+
+# Prefix of the per-sample scratch directories passed to kmc.
+TMPDIR = f"tmp/"
+# File extensions used for every KMC database path in this workflow.
+KMC_EXTS = ["kmc_pre","kmc_suf"]
+# Executable invoked by rule `anchors`.
+# NOTE(review): cpp/Makefile links a binary named `anchor`; confirm how
+# ./run_anchor is provided.
+ANCHOR_BIN="./run_anchor"
+
+def get_fasta(wildcards):
+    """Look up the ``fasta`` column value for ``wildcards.sample`` in SAMPLES."""
+    return SAMPLES.loc[wildcards.sample, "fasta"]
+
+def get_fai(wildcards):
+    """Return the sample's FASTA path (from ``get_fasta``) with ``.fai`` appended."""
+    fasta_path = get_fasta(wildcards)
+    return fasta_path + ".fai"
+
+def get_genome_id(wildcards):
+    """Look up the ``id`` column value for ``wildcards.sample`` in SAMPLES."""
+    return SAMPLES.loc[wildcards.sample, "id"]
+
+def get_onehot_tag(wildcards):
+    """Return an integer with exactly one bit set for the given sample.
+
+    The bit number is the sample's position in ``SAMPLES.index`` modulo 32,
+    so positions that differ by a multiple of 32 yield the same tag.
+    """
+    position = SAMPLES.index.get_loc(wildcards.sample)
+    return 1 << (position % 32)
+
+def get_anchor_mb(wc,input):
+    """Estimate a memory request in MB for the anchor step from input sizes.
+
+    The result is 1000 plus (1.5 x total bytes of the kmc_pre/kmc_suf inputs
+    + FASTA bytes x number of kmc_pre files) divided by 10**6.
+
+    NOTE(review): the only use visible in this file is the commented-out
+    ``resources`` entry of rule ``anchors``, and that rule declares no
+    ``fasta`` input.
+    """
+    kmc_files = input.kmc_pre + input.kmc_suf
+    kmc_bytes = sum(f.size for f in kmc_files)
+    fa_bytes = input.fasta.size
+    n_bitvecs = len(input.kmc_pre)
+    payload_bytes = 1.5*kmc_bytes + fa_bytes*n_bitvecs
+    return 1000 + payload_bytes/(10**6)
+
+# Invoke ANCHOR_BIN once for all samples. The rule takes every KMC bitvector
+# database as input and declares anchor/<sample>/chrs.tsv for every sample as
+# output; the run block itself creates those files (empty) before the binary
+# is started.
+rule anchors:
+    input:
+        kmc_pre=expand("kmc/bitvec{i}.kmc_pre", i=range(index.kmc_bitvec_count)),
+        kmc_suf=expand("kmc/bitvec{i}.kmc_suf", i=range(index.kmc_bitvec_count)),
+    output:
+        expand("anchor/{sample}/chrs.tsv", sample=list(SAMPLES.index)),
+    # NOTE(review): this log path is declared but the run block never writes to it.
+    log:
+        log="logs/anchor.log.txt"
+    benchmark:
+        "logs/anchor.benchmark.txt"
+    # Memory estimate is currently disabled.
+    #resources:
+    #    mem_mb=get_anchor_mb,
+    # Request all cores given to the workflow.
+    threads: workflow.cores
+    run:
+        # Positional arguments for the binary: name/fasta pairs, one per sample.
+        args = list()
+        for name,fasta in SAMPLES["fasta"].items():
+            args += [name,fasta]
+            # shell() substitutes {name} from this loop's local variable.
+            shell("mkdir -p anchor/{name}")
+            shell("touch anchor/{name}/chrs.tsv")
+        ngenomes = len(SAMPLES)
+        # The doubled braces survive the f-string as {workflow.cores}, which
+        # shell() then substitutes; the remaining fields are filled by the f-string.
+        shell(f"OMP_NUM_THREADS={{workflow.cores}} {ANCHOR_BIN} {ngenomes} . " + " ".join(args))
+
+def get_bitvec_mb(wc,input):
+    """Memory request in MB: 1000 plus the largest ``input.dbs`` file size / 10**6."""
+    largest_db = max(d.size for d in input.dbs)
+    return 1000 + largest_db/(10**6)
+
+# Run `kmc_tools complex` on kmc/opdef{i}.txt. The declared outputs are
+# kmc/bitvec{i}.kmc_pre and kmc/bitvec{i}.kmc_suf; the command line itself only
+# names the opdef file, so the output paths are presumably set inside it --
+# TODO confirm against index.init_opdefs().
+rule kmc_bitvec:
+    input:
+        opdef="kmc/opdef{i}.txt",
+        # Every sample's onehot database (both KMC extensions).
+        dbs=expand("kmc/{sample}.onehot.{ext}", sample=list(SAMPLES.index), ext=KMC_EXTS)
+    output:
+        # {{i}} stays a wildcard; only ext is expanded.
+        expand("kmc/bitvec{{i}}.{ext}", ext=KMC_EXTS)
+    log:
+        "logs/kmc.bitvec{i}.log.txt"
+    benchmark:
+        "logs/kmc.bitvec{i}.benchmark.txt"
+    resources:
+        # 1000 MB plus the size of the largest input database.
+        mem_mb=get_bitvec_mb,
+    threads: 2
+    # stdout and stderr of kmc_tools both go to the log file.
+    shell:
+        f"{EXTRA_DIR}/kmc_tools complex {{input.opdef}} > {{log}} 2>&1"
+        #kmc/opdef{{wildcards.i}}.txt
+
+
+# Declare all kmc/opdef{i}.txt files as outputs of a single call to
+# index.init_opdefs(), which is presumably what writes them -- confirm against
+# panagram.index. Every sample's onehot database is required first.
+rule opdefs:
+    input:
+        expand("kmc/{sample}.onehot.{ext}", sample=list(SAMPLES.index), ext=KMC_EXTS)
+    output:
+        expand("kmc/opdef{i}.txt", i=range(index.kmc_bitvec_count))
+    benchmark:
+        "logs/kmc.opdef.benchmark.txt"
+    run:
+        index.init_opdefs()
+
+def get_kmc_mb(wc,input):
+    """Memory request in MB: 1000 plus the summed ``input.dbs`` file sizes / 10**6.
+
+    Only database sizes are counted here; rule ``kmc_count`` derives its own
+    ``mem_mb`` from ``config["kmc"]["memory"]`` separately.
+
+    NOTE(review): no rule visible in this file references this helper.
+    """
+    dbsize = sum(d.size for d in input.dbs)
+    return 1000 + dbsize/(10**6)
+
+# Build two KMC databases per sample from its FASTA:
+#   1. `kmc` writes kmc/<sample>.count, using tmp/<sample> as working directory;
+#   2. `kmc_tools transform ... set_counts <tag>` writes kmc/<sample>.onehot,
+#      where <tag> is the sample's single-bit value from get_onehot_tag.
+rule kmc_count:
+    input:
+        get_fasta #"{fasta}"
+    output:
+        # {{sample}} stays a wildcard; db and ext are expanded (4 files).
+        expand("kmc/{{sample}}.{db}.{ext}", db=["count","onehot"], ext=KMC_EXTS)
+    params:
+        tag = get_onehot_tag
+    log:
+        "logs/kmc.{sample}.txt"
+    benchmark:
+        "logs/kmc.{sample}.benchmark.txt"
+    threads : config["kmc"]["threads"]
+    resources:
+        # config["kmc"]["memory"] is also passed to kmc via -m; the factor 1000
+        # presumably converts GB to MB -- TODO confirm units.
+        mem_mb=500+config["kmc"]["memory"]*1000,
+    # The first command truncates the log (>), the second appends to it (>>).
+    shell:
+        f"mkdir -p {TMPDIR}{{wildcards.sample}}; "
+
+        f"{EXTRA_DIR}/kmc -k{{config[k]}} -t{{threads}} -m{{config[kmc][memory]}} "
+        f"-ci1 -cs1000 -fm {{input}} kmc/{{wildcards.sample}}.count {TMPDIR}{{wildcards.sample}} "
+        "> {log} 2>&1;"
+
+        f"{EXTRA_DIR}/kmc_tools -t{{threads}} transform kmc/{{wildcards.sample}}.count " 
+        f"set_counts {{params.tag}} kmc/{{wildcards.sample}}.onehot "
+        ">> {log} 2>&1;"
+
+# Produce <fasta>.fai for any FASTA path by running `samtools faidx` on it.
+# The {fasta} wildcard is the input path itself and is reused verbatim in the
+# log and benchmark file names.
+rule faidx:
+    input:
+        "{fasta}"
+    output:
+        "{fasta}.fai"
+    log:
+        "logs/faidx.{fasta}.log.txt"
+    benchmark:
+        "logs/faidx.{fasta}.benchmark.txt"
+    # stdout and stderr of samtools both go to the log file.
+    shell:
+        "samtools faidx {input} > {log} 2>&1"
+
+# Run `mash sketch` on one sample's FASTA, writing tmp/<sample>.msh.
+# The sample name is passed via -C; stdout and stderr go to the log file.
+rule mash_sample:
+    input:
+        get_fasta
+    output:
+        "tmp/{sample}.msh"
+    log:
+        "logs/mash.sketch.{sample}.log.txt"
+    benchmark:
+        "logs/mash.sketch.{sample}.benchmark.txt"
+    # These are plain strings (not f-strings): {EXTRA_DIR} is left for
+    # Snakemake to substitute when it formats the command.
+    shell:
+        "{EXTRA_DIR}/mash "
+        "sketch -C {wildcards.sample} -o tmp/{wildcards.sample}.msh -r -s 10000 {input} "
+        "> {log} 2>&1;"
+
+# Run `mash triangle` over the sketches of all samples. Its stdout becomes
+# genome_dist.tsv and its stderr goes to the log file.
+rule mash_triangle:
+    input:
+        expand("tmp/{sample}.msh", sample=SAMPLES.index)
+    output:
+        "genome_dist.tsv"
+    log:
+        "logs/mash.triangle.log.txt"
+    benchmark:
+        "logs/mash.triangle.benchmark.txt"
+    shell:
+        "{EXTRA_DIR}/mash "
+        "triangle -C -E {input} > {output} 2> {log}"
+        
+# Aggregate target: the mash distance table plus chrs.tsv for every genome in
+# index.anchor_genomes.
+# NOTE(review): Snakemake's default target is the first rule in the file
+# (`anchors` here), so this rule runs only when requested explicitly -- confirm
+# that is intended.
+rule all:
+    input:
+        "genome_dist.tsv",
+        expand("anchor/{sample}/chrs.tsv", sample=index.anchor_genomes)