-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathrun_gwas_gemma_lmm4.sh
executable file
·329 lines (267 loc) · 13.7 KB
/
run_gwas_gemma_lmm4.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
#!/bin/bash
# AUTHOR: Johan Zicola
# DATE: 2018-01-26
# DESCRIPTION: This script performs GWAS using as input a vcf file and a phenotype file
# The phenotype file should contain as many values as individuals in the vcf file.
# The order of the accessions in the vcf file should be the same than the order of the
# phenotype in the phenotype file
# USAGE:
# run_gwas_gemma.sh <phenotype_file.tsv> <vcf_file.vcf> [covariate_file.txt]
# Output is a .assoc.clean.txt file which can be loaded in R to look at significant SNPs.
# All generated files are located where <phenotype_file.tsv> is and the subdirectory "output"
# Path to python script assoc2qqman.py (assumes the script is in the same
# directory as this script)
# Note that gemma output files will be generated in the directory
# in which the user launches the script (not forcibly the directory containing the script or the input files)
# We call this directory $current_path
current_path=$(pwd)
# Get directory of running bash script to identify where the python script assoc2qqman.py is
path_script=$(dirname $0)
path_script=$(cd $path_script && pwd)
# Assign path of the python script assoc2qqman.py to $assoc2qqman
assoc2qqman="${path_script}/assoc2qqman.py"
# Softwares required:
# python2.7
# gemma 0.94 (tested)
# vcftools 0.1.14 (tested)
# bcftools 1.2 (tested)
# PLINK v1.90b3.40 (tested)
#
####################################################################################
# Help command if no arguments or -h argument is given
if [ "$1" == "-h" -o "$#" -eq 0 ] ; then
echo -e "\n"
echo "Usage: `basename $0` <phenotype_file.tsv> <vcf_file.vcf> [covariate_file.txt] [-h]"
echo -e "\n"
echo "Description: Provide as main argument a phenotype file"
echo "generated in R (should have a .tsv extension) and a vcf files containing "
echo "the accessions described in the phenotype file (should have a .vcf extension)."
echo "Optional argument "covariate_file.txt" can be provided in third position"
echo "if covariate analysis is to be performed."
echo "`basename $0` generates several files derived from vcftools, p-link, and gemma"
echo "The file with ".assoc.clean.txt" as suffix contains the GWAS results and can be "
echo "uploaded in R for visualization (qqman library)"
echo -e "\n"
exit 0
fi
# Test if softwares are available
test_program(){
command -v $1 >/dev/null 2>&1 || { echo >&2 " $1 required but it's not installed. Aborting"; exit 1; }
}
test_program "python"
test_program "gemma"
test_program "vcftools"
test_program "bcftools"
test_program "plink"
# Time script
start=$(date +%s)
# Test if 2 arguments were provided
if [ "$#" -lt 2 ]; then
echo "Argument(s) missing"
exit 1
fi
# Phenotype file is generated by R and has a tsv extension (can have several phenotypes
# tab-separated in different columns. It should be in unix format and should not
# contai empty line
phenotype_file=$1
if [ ! -e $phenotype_file ]; then
echo "File $phenotype_file does not exist"
exit 1
elif [[ $phenotype_file != *.tsv ]]; then
echo "Provide a phenotype file with .tsv extension"
exit 1
fi
# Define location of vcf file to analyze
vcf_file=$2
if [ ! -e $vcf_file ]; then
echo "File $vcf_file does not exist"
exit 0
elif [[ $vcf_file != *.vcf ]] && [[ $vcf_file != *.vcf.gz ]]; then
echo "Provide a vcf file with .vcf or vcf.gz extension"
exit 0
fi
# Check if phenotype file contains as many lines as there are samples in the VCF file.
# If not, this will yield in segmentation fault when making the kinship matrix
nb_samples=$(bcftools query -l $vcf_file | wc -l)
nb_phenotypes=$(cat $phenotype_file | wc -l)
if [[ $nb_samples != $nb_phenotypes ]]; then
echo "Number of samples and number of phenotypes are not equal! verify files"
exit 0
fi
# Check if covariate file is provided
if [ "$#" -eq 3 ]; then
covariate_file=$3
# Check if file exists
if [ -e $covariate_file ]; then
# Check if number of lines equal number of samples
line_covariate_file=$(cat $covariate_file | wc -l)
if [[ $line_covariate_file != $nb_phenotypes ]]; then
echo "Number of lines in covariate file and number of phenotypes are not equal! verify files"
exit 0
else
# Get prefix of the covariate file to add as suffix in final output name
covariate_prefix="${covariate_file%.*}"
fi
else
echo "Covariate file $covariate_file does not exist"
exit 0
fi
fi
##################################################################################
# Directory containing the phenotype file and which will contain final GWAS results
# in the output/ directory
# Get prefix of phenotype file
dir_file=$(pwd $phenotype_file)
# Get prefix from phenotype name (assume the phenotype file has a .tsv extension)
# Prefix for GWAS results. Based on phenotype file name
prefix_gwas=$(basename -s .tsv $phenotype_file)
# Prefix to generate plink files from input VCF file
# Per default will be name of the VCF file before the first point
path_prefix_vcf=$(echo $vcf_file | cut -d'.' -f1)
# In case the file of the VCF file contains a path, get suffix of the vcf file
# for output name for gemma
prefix_vcf=$(basename $path_prefix_vcf)
echo -e "\n###################### INPUT VARIABLES ###################################\n"
echo "phenotype file: $phenotype_file"
echo "vcf file: $vcf_file"
echo "dir_file (directory where output/ will be): $dir_file"
echo "current_path: $current_path"
echo "prefix plink output files: $prefix_vcf"
echo "prefix gemma output files: $prefix_gwas"
echo -e "\n###################### CONVERT VCF TO PLINK FORMAT #######################\n"
# VCF into bed file => make .ped and .map files
# These files must be made only once, then only the fam file should be modified for the
# tested phenotype
# Check if plink files already exist
# Also check if input vcf file is compressed or not
echo -e "Generate ped and map files:"
if [ -e ${path_prefix_vcf}.ped ] && [ -e ${path_prefix_vcf}.map ]; then
echo -e "${path_prefix_vcf}.ped and ${path_prefix_vcf}.map already exists. Go to next step."
else
if [[ $vcf_file == *.vcf ]]; then
printf "vcftools --vcf $vcf_file --plink --out ${path_prefix_vcf}"
vcftools --vcf $vcf_file --plink --out ${path_prefix_vcf}
elif [[ $vcf_file == *.vcf.gz ]]; then
printf "vcftools --gzvcf $vcf_file --plink --out ${path_prefix_vcf}"
vcftools --gzvcf $vcf_file --plink --out ${path_prefix_vcf}
fi
fi
echo -e "\nGenerate bed, bim, and fam files:"
# Make bed files: 3 files are created => .bed, .bim, .fam
if [ -e ${path_prefix_vcf}.bed ] && [ -e ${path_prefix_vcf}.bim ] && [ -e ${path_prefix_vcf}.fam ]; then
echo "File ${path_prefix_vcf}.bed, ${path_prefix_vcf}.bim, ${path_prefix_vcf}.fam already exist. Go to next step."
else
printf "plink --file ${path_prefix_vcf} --make-bed --out ${path_prefix_vcf}"
plink --file ${path_prefix_vcf} --make-bed --out ${path_prefix_vcf}
fi
echo -e "\nKeep only 5 first column from .fam file (remove 6th column of -9)"
echo "cut -d' ' -f1,2,3,4,5 ${path_prefix_vcf}.fam > ${path_prefix_vcf}_modified.fam"
cut -d' ' -f1,2,3,4,5 ${path_prefix_vcf}.fam > ${path_prefix_vcf}_modified.fam
echo -e "\nPaste phenotype data to fam file and reformat it:"
# Paste to fam file
echo "paste -d ' ' ${path_prefix_vcf}_modified.fam $phenotype_file > ${path_prefix_vcf}.fam"
paste -d ' ' ${path_prefix_vcf}_modified.fam $phenotype_file > ${path_prefix_vcf}.fam
echo -e "\n############################# RUN GEMMA ##################################\n"
# Run Gemma
# Per default, gemma put the results in an "output" directory located in working directory ($current_path)
# There is apparently no way to change this (adding fullpath in -o /srv/biodata/dep_coupland/grp_hancock/johan/GWAS/rDNA_copy_number_MOR)
# does not work:
# error writing file: ./output//srv/biodata/dep_coupland/grp_hancock/johan/GWAS/rDNA_copy_number_MOR.cXX.txt
# Instead, just transfer the generated files into the ${dir_file}/output directory at the end
# Estimate relatedness matrix from genotypes (n x n individuals)
# Generate relatedness matrix based on centered genotypes (cXX)
# centered matrix preferred in general, accounts better for population structure
# If standardized genotype matrix is needed, change to -gk 2 (sXX)
# standardized matrix preferred if SNPs with lower MAF have larger effects
# Not that this file remains the same whatever phenotype is studied as it considers
# only the VCF file.
# Add flag (-maf 0.05) to set it at 5% for instance).
echo -e "Generate relatedness matrix:"
if [ -e ${current_path}/output/${prefix_vcf}.cXX.txt ]; then
echo -e "${current_path}/output/${prefix_vcf}.cXX.txt file already exists. Go to next step."
else
echo -e "\ngemma -bfile ${path_prefix_vcf} -gk 1 -o ${prefix_vcf}"
gemma -bfile ${path_prefix_vcf} -gk 1 -o $prefix_vcf
fi
## If needed, the relatedness matrix can be transformed into eigen values and eigen vectors
## Generates 3 files: log, eigen values (1 column of na elements) and eigen vectors (na x na matrix)
## Use of eigen transformation allows quicker analysis (if samples > 10 000)
# gemma -bfile ${prefix} -k ${current_path}/output/${prefix}.cXX.txt -eigen -o ${prefix}
## Association Tests with Univariate Linear Mixed Models
# Use lmm 2 to performs likelihood ratio test
# Note that minor allele frequency (MAF( threshold is set at 1% per default.
# Add flag (-maf 0.05) to set it at 5% for instance).
# prefix.log.txt contains PVE estimate and its standard error in the null linear mixed model.
# assoc.txt file contains the results
# If covariate file is provided as third argument, perform covariate analysis
echo -e "\nPerform the association test:"
# Rename variable $prefix_gwas if covariate file was provided so
# the output can be differentiated from same analysis without covariate file
if [ $covariate_file ]; then
prefix_gwas="${prefix_gwas}_${covariate_prefix}"
fi
if [ -e ${current_path}/output/${prefix_gwas}.assoc.txt ]; then
echo "${current_path}/output/${prefix_gwas}.assoc.txt already exists. Go to next step"
else
if [ $covariate_file ]; then
echo -e "\ngemma -bfile ${path_prefix_vcf} -k ${current_path}/output/${prefix_vcf}.cXX.txt -lmm 4 -maf 0.05 -o ${prefix_gwas} -c $covariate_file"
gemma -bfile ${path_prefix_vcf} -k ${current_path}/output/${prefix_vcf}.cXX.txt -lmm 4 -maf 0.05 -o ${prefix_gwas} -c $covariate_file
else
echo -e "\ngemma -bfile ${path_prefix_vcf} -k ${current_path}/output/${prefix_vcf}.cXX.txt -lmm 4 -maf 0.05 -o ${prefix_gwas}"
gemma -bfile ${path_prefix_vcf} -k ${current_path}/output/${prefix_vcf}.cXX.txt -lmm 4 -maf 0.05 -o ${prefix_gwas}
fi
fi
# # Association Tests with Multivariate Linear Mixed Models
# # several phenotypes can be given (for instance columns 1,2,3 of the phenotype file). Less than 10
# # phenotypes are recommended
# gemma -bfile ${path_prefix_vcf} -k ${current_path}/output/${prefix}.cXX.txt -lmm 4 -n 1 2 3 -o ${prefix}
## Bayesian Sparse Linear Mixed Model
## Use a standard linear BSLMM (-bslmm 1)
## Does not require a relatedness matrix (calculates it internally)
## Generates 5 output files: log, hyp.txt (estimated hyper-parameters), param.txt (posterior mean
## estimates for the effect size parameters), prefix.bv contains
# gemma -bfile ${path_prefix_vcf} -bslmm 1 -o ${prefix}
# To analyze the output of bslmm in R, one needs to plot the gamma value multiplied by beta value from the # param.txt (see more details on https://visoca.github.io/popgenomworkshop-gwas_gemma/)
# see R script gemma_param.R
echo -e "\n########################## POLISHING GEMMA OUTPUT ############################\n"
# Polish file for R
echo -e "Reformat assoc.txt file to be compatible with manhattan library in R:"
echo "python $assoc2qqman ${current_path}/output/${prefix_gwas}.assoc.txt > ${current_path}/output/${prefix_gwas}.assoc.clean.txt"
python $assoc2qqman ${current_path}/output/${prefix_gwas}.assoc.txt > ${current_path}/output/${prefix_gwas}.assoc.clean.txt
# Move the output data into dir_file
# Check if output directory is present in dir_file (case when script is $dir_file == $path_script
# If not, create it and move output/ in it
# Transfer output files if necessary (in case $current_path != $dir_path
if [[ $current_path != $dir_file ]]; then
echo "Transfer output files in ${dir_file}/output"
echo "mkdir ${dir_file}/output"
mkdir ${dir_file}/output
echo "mv ${current_path}/output/* ${dir_file}/output/"
mv ${current_path}/output/* ${dir_file}/output/
echo "rm -r ${current_path}/output"
rm -r ${current_path}/output
fi
end=$(date +%s)
runtime=$((end-start))
echo -e "\n############################ GENERATE LOG FILE #################################\n"
# Create a log output file and integrate gemma log file
# Check if log from gemma was successfully generated
if [ -e ${current_path}/output/${prefix_gwas}.log.txt ]; then
echo -e "\nLog generated in ${current_path}/output/${prefix_gwas}.log"
echo -e "LOG FILE\n" > ${current_path}/output/temp.txt
echo "Command: run_gwas_gemma.sh $phenotype_file $vcf_file" >> ${current_path}/output/temp.txt
echo "Phenotype file: $phenotype_file" >> ${current_path}/output/temp.txt
echo "VCF file: $vcf_file" >> ${current_path}/output/temp.txt
if [ $covariate_file ]; then
echo "Covariate file: $covariate_file" >> ${current_path}/output/temp.txt
fi
echo "Output file: ${dir_file}/output/${prefix_gwas}.assoc.clean.txt" >> ${current_path}/output/temp.txt
echo -e "Run finished on $(date)" >> ${current_path}/output/temp.txt
echo -e "Total time of the run: $runtime seconds \n" >> ${current_path}/output/temp.txt
echo "Log output from GEMMA:" >> ${current_path}/output/temp.txt
cat ${current_path}/output/temp.txt ${current_path}/output/${prefix_gwas}.log.txt > ${current_path}/output/${prefix_gwas}.log
rm ${current_path}/output/temp.txt ${current_path}/output/${prefix_gwas}.log.txt
else
echo -e "Gemma did not generate a log, a problem occured in the pipeline!"
fi
echo -e "\n############################ END OF THE PIPELINE #################################\n"