-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathbuild-index
113 lines (97 loc) · 3.21 KB
/
build-index
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#!/bin/bash
#./deSAMBA-index test/refs/ taxonomy/nodes.dmp bin index
ref_file=$1
index_dir=$2
kmer_size=31
DE_SAMBA=./bin/deSAMBA
USAGE="
Program: build_index
Usage:
`basename $0` <ref_file> <index_dir>
Basic:
<ref_file> FILE/STR \"all\"\\\"viral\" OR [file] of reference sequences.
- all: Using lastest NCBI RefSeq
bacteria+viral+archaea database.
- viral: Using lastest NCBI RefSeq viral
database.
- Otherwise using user defined reference
sequences[.fa], storing them in one file.
When you have more than one files,
combined them into one.Make sure all genomes start as following format:
">tid|285013|ref|NC_006268.1" to running analysis based on taxnomy
like "285013".
<index_dir> FOLDER folder to store index.
"
if [ $# -lt 2 ]
then
echo "ERROR: Parameters are not enough"
printf "$USAGE"
exit 1
fi
WGS_FILE=${ref_file}
#make index dir
mkdir -p $index_dir
#-all download and store
if [ $ref_file == 'all' ]
then
DOWNLOAD=$index_dir/download
#download bacteria/viral/archaea
mkdir "$DOWNLOAD"
bash ./download -P 10 -o $DOWNLOAD -d viral refseq
bash ./download -P 10 -o $DOWNLOAD -d archaea refseq
bash ./download -P 10 -o $DOWNLOAD -d bacteria refseq
#store all fna/fa/fastq file into wgs file
WGS_FILE=${index_dir}/deSAMBA.wgs
find $DOWNLOAD/ -name "*.fna" | xargs -n 1 cat > $WGS_FILE
#rm source
rm -r "$DOWNLOAD"
echo "downloading end"
fi
#-all download and store
if [ $ref_file == 'viral' ]
then
DOWNLOAD=$index_dir/download
#download bacteria/viral/archaea
mkdir "$DOWNLOAD"
bash ./download -P 10 -o $DOWNLOAD -d viral refseq
#store all fna/fa/fastq file into wgs file
WGS_FILE=${index_dir}/deSAMBA.wgs
find $DOWNLOAD/ -name "*.fna" | xargs -n 1 cat > $WGS_FILE
#rm source
rm -r "$DOWNLOAD"
echo "downloading end"
fi
#jellyfish size
FILE_SIZE=$(ls -l $WGS_FILE | awk '{print $5}')
echo "$WGS_FILE file size" [$FILE_SIZE]
let JELLYFISH_HASH_SIZE=FILE_SIZE*115/100
if [ ${JELLYFISH_HASH_SIZE} -gt 12000000000 ]; then #at most 12G kmer, this will using 144G MEM
JELLYFISH_HASH_SIZE=12000000000
fi
let MAX_JF_MEM=JELLYFISH_HASH_SIZE*12/1000000
echo "Jellyfish hash size:" [$JELLYFISH_HASH_SIZE] "this will using MEM at most" ${MAX_JF_MEM}"M"
#jellyfish count
jelly_fish_dir="${index_dir}/jelly_fish"
mkdir ${jelly_fish_dir}
jelly_fish_prefix="${jelly_fish_dir}/mer"
./bin/jellyfish count -m $kmer_size -s $JELLYFISH_HASH_SIZE -t 8 -o ${jelly_fish_prefix} $WGS_FILE
if [[ $ref_file == 'all' || $ref_file == 'viral' ]]; then
rm ${WGS_FILE}
fi
#jellyfish merge
#when jellyfish output one more file
if [ -e "${jelly_fish_prefix}_1" ]
then
#echo
./bin/jellyfish merge -o ${index_dir}/database.jdb ${jelly_fish_prefix}*
rm -r ${jelly_fish_dir}
else
mv ${jelly_fish_prefix}_0 ${index_dir}/database.jdb
fi
#sort
${DE_SAMBA} kmersort -k 31 -o ${index_dir}/kmer.srt ${index_dir}/database.jdb
rm ${index_dir}/database.jdb
#index
${DE_SAMBA} index ${index_dir}/kmer.srt $WGS_FILE ${index_dir}
rm ${index_dir}/kmer.srt
echo "finished building index!"