build-index

#!/bin/bash
#./deSAMBA-index test/refs/ taxonomy/nodes.dmp bin index

ref_file=$1
index_dir=$2
kmer_size=31
DE_SAMBA=./bin/deSAMBA

USAGE="
Program:   build_index
  Usage:
    `basename $0` <ref_file> <index_dir>
  Basic:
    <ref_file>  FILE/STR    \"all\"\\\"viral\" OR [file] of reference sequences.
                            - all: Using lastest NCBI RefSeq 
                                bacteria+viral+archaea database.
                            - viral: Using lastest NCBI RefSeq viral 
                                database.
                            - Otherwise using user defined reference 
                                sequences[.fa], storing them in one file.
                                When you have more than one files,	
                                combined them into one.Make sure all genomes start as following format:
				">tid|285013|ref|NC_006268.1" to running analysis based on taxnomy
				like "285013".
    <index_dir>   FOLDER  folder to store index.
"
if [ $# -lt 2 ]
then 
	echo "ERROR: Parameters are not enough"
	printf "$USAGE"
	exit 1
fi

WGS_FILE=${ref_file}

#make index dir
mkdir -p $index_dir

#-all download and store 
if [ $ref_file == 'all' ]
 then
 	DOWNLOAD=$index_dir/download
	#download bacteria/viral/archaea
	mkdir "$DOWNLOAD"
	bash ./download  -P 10 -o $DOWNLOAD -d viral refseq
	bash ./download  -P 10 -o $DOWNLOAD -d archaea refseq
	bash ./download  -P 10 -o $DOWNLOAD -d bacteria refseq
	
	#store all fna/fa/fastq file into wgs file
	WGS_FILE=${index_dir}/deSAMBA.wgs
	find $DOWNLOAD/ -name "*.fna"   | xargs -n 1 cat > $WGS_FILE
	#rm source
	rm -r "$DOWNLOAD"
	echo "downloading end"
fi

#-all download and store 
if [ $ref_file == 'viral' ]
 then
 	DOWNLOAD=$index_dir/download
	#download bacteria/viral/archaea
	mkdir "$DOWNLOAD"
	bash ./download  -P 10 -o $DOWNLOAD -d viral refseq

	#store all fna/fa/fastq file into wgs file
	WGS_FILE=${index_dir}/deSAMBA.wgs
	find $DOWNLOAD/ -name "*.fna"   | xargs -n 1 cat > $WGS_FILE
	#rm source
	rm -r "$DOWNLOAD"
	echo "downloading end"
fi

#jellyfish size
FILE_SIZE=$(ls -l $WGS_FILE | awk '{print $5}')
echo "$WGS_FILE file size" [$FILE_SIZE]
let JELLYFISH_HASH_SIZE=FILE_SIZE*115/100
if [ ${JELLYFISH_HASH_SIZE} -gt 12000000000 ]; then #at most 12G kmer, this will using 144G MEM
	JELLYFISH_HASH_SIZE=12000000000
fi
let MAX_JF_MEM=JELLYFISH_HASH_SIZE*12/1000000
echo "Jellyfish hash size:" [$JELLYFISH_HASH_SIZE] "this will using MEM at most" ${MAX_JF_MEM}"M"

#jellyfish count
jelly_fish_dir="${index_dir}/jelly_fish"
mkdir ${jelly_fish_dir}
jelly_fish_prefix="${jelly_fish_dir}/mer"

./bin/jellyfish count -m $kmer_size -s $JELLYFISH_HASH_SIZE -t 8 -o ${jelly_fish_prefix} $WGS_FILE

if [[ $ref_file == 'all' || $ref_file == 'viral' ]]; then
	rm ${WGS_FILE}
fi

#jellyfish merge
#when jellyfish output one more file
if [ -e "${jelly_fish_prefix}_1" ]
then
	#echo
	./bin/jellyfish merge -o ${index_dir}/database.jdb ${jelly_fish_prefix}*
	rm -r ${jelly_fish_dir}
else
	mv ${jelly_fish_prefix}_0 ${index_dir}/database.jdb
fi 

#sort
${DE_SAMBA} kmersort -k 31 -o ${index_dir}/kmer.srt ${index_dir}/database.jdb
rm ${index_dir}/database.jdb

#index
${DE_SAMBA} index ${index_dir}/kmer.srt $WGS_FILE ${index_dir}
rm ${index_dir}/kmer.srt

echo "finished building index!"