#!/usr/bin/env perl
# Forked from Ensembl/ensembl-vep; this is the executable "vep" script.
# Copyright [2016-2020] EMBL-European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
use Getopt::Long;
use FindBin qw($RealBin);
use lib $RealBin;
use lib $RealBin.'/modules';
use Bio::EnsEMBL::VEP::Runner;
use Bio::EnsEMBL::VEP::Utils qw(get_version_string);
# Main program: parse the command line into $config, print the usage message
# when asked to (or when no arguments were given), then hand $config to a
# Bio::EnsEMBL::VEP::Runner and run it.

# Every parsed option is stored in this hash, keyed by its primary name.
my $config = {};

# Count the arguments before GetOptions() removes the ones it recognises from
# @ARGV, so that an invocation without any arguments can be detected later.
my $arg_count = scalar @ARGV;

GetOptions(
    $config,
    'help',                        # displays help message

    # input options
    'config=s',                    # config file name
    'input_file|i=s',              # input file name
    'input_data|id=s',             # input data
    'format=s',                    # input file format
    'output_format=s',             # output file format
    'delimiter=s',                 # delimiter between fields in input
    'no_check_variants_order',     # skip check about the variants ordering within a region

    # DB options
    'species|s=s',                 # species e.g. human, homo_sapiens
    'registry=s',                  # registry file
    'host=s',                      # database host
    'port=s',                      # database port
    'user|u=s',                    # database user name
    'password|pass=s',             # database password
    'db_version=i',                # Ensembl database version to use e.g. 62
    'assembly|a=s',                # assembly version to use
    'grch37',                      # set for using GRCh37
    'genomes',                     # automatically sets DB params for e!Genomes
    'refseq',                      # use otherfeatures RefSeq DB instead of Ensembl
    'merged',                      # use merged cache
    'all_refseq',                  # report consequences on all transcripts in RefSeq cache, includes CCDS, EST etc
    'gencode_basic',               # limit to using just GenCode basic transcript set
    'is_multispecies=i',           # '1' for a multispecies database (e.g protists_euglenozoa1_collection_core_29_82_1)

    # runtime options
    # (an array-ref destination collects every occurrence of a repeated option)
    'transcript_filter=s' => ($config->{transcript_filter} ||= []), # filter transcripts
    'exclude_predicted',
    'minimal',                     # convert input alleles to minimal representation
    'most_severe',                 # only return most severe consequence
    'summary',                     # only return one line per variation with all consequence types
    'pick',                        # use defined criteria to return most severe line
    'pick_allele',                 # choose one con per allele
    'per_gene',                    # choose one con per gene
    'pick_allele_gene',            # choose one con per gene, allele
    'flag_pick',                   # flag one con per line
    'flag_pick_allele',            # flag one con per allele
    'flag_pick_allele_gene',       # flag one con per gene, allele
    'pick_order=s',                # define the order of categories used by the --*pick* flags
    'buffer_size=i',               # number of variations to read in before analysis
    'failed=i',                    # include failed variations when finding existing
    'gp',                          # read coords from GP part of INFO column in VCF (probably only relevant to 1KG)
    'chr=s',                       # analyse only these chromosomes, e.g. 1-5,10,MT
    'check_ref',                   # check supplied reference allele against DB/FASTA
    'lookup_ref',                  # replace supplied reference allele with allele from DB/FASTA
    'check_existing',              # find existing co-located variations
    'check_svs',                   # find overlapping structural variations
    'no_check_alleles',            # attribute co-located regardless of alleles
    'exclude_null_alleles',        # exclude variants with null alleles from co-located check (e.g COSMIC)
    'check_frequency',             # enable frequency checking
    'af',                          # add global AF of existing var
    'af_1kg',                      # add 1KG AFs of existing vars
    'af_esp',                      # add ESP AFs of existing vars
    'af_exac',                     # add ExAC AFs of existing vars
    'af_gnomad',                   # add gnomAD AFs of existing vars
    'old_maf',                     # report 1KG/ESP MAFs in the old way (no allele, always < 0.5)
    'max_af',                      # report maximum observed allele frequency in any 1KG, ESP, ExAC pop
    'pubmed',                      # add Pubmed IDs for publications that cite existing vars
    'freq_filter=s',               # exclude or include
    'freq_freq=f',                 # frequency to filter on
    'freq_gt_lt=s',                # gt or lt (greater than or less than)
    'freq_pop=s',                  # population to filter on
    'filter_common',               # shortcut to MAF filtering
    'allow_non_variant',           # allow non-variant VCF lines through
    'process_ref_homs',            # force processing of individuals with homozygous ref genotype
    'individual=s',                # give results by genotype for individuals
    'phased',                      # force VCF genotypes to be interpreted as phased
    'fork=i',                      # fork into N processes
    'dont_skip',                   # don't skip vars that fail validation
    'nearest=s',                   # get nearest transcript, gene or symbol (for gene)
    'distance=s',                  # set up/downstream distance
    'clin_sig_allele=i',           # use allele specific clinical significance data where it exists
    'overlaps',                    # report length and percent of a transcript or regulatory feature overlapped with a SV
    'max_sv_size=i',               # modify the size of structural variant to be handled (limited by default to reduce memory requirements)

    # verbosity options
    'verbose|v',                   # print out a bit more info while running
    'quiet|q',                     # print nothing to STDOUT (unless using -o stdout)
    'no_progress',                 # don't display progress bars

    # output options
    'everything|e',                # switch on EVERYTHING :-)
    'output_file|o=s',             # output file name
    'compress_output=s',           # compress output with e.g. bgzip, gzip
    'no_headers',                  # don't print headers
    'stats_file|sf=s',             # stats file name
    'stats_text',                  # write stats as text
    'stats_html',                  # write stats as html
    'no_stats',                    # don't write stats file
    'warning_file=s',              # file to write warnings to
    'force_overwrite|force',       # force overwrite of output file if already exists
    'terms|t=s',                   # consequence terms to use e.g. NCBI, SO
    'coding_only',                 # only return results for consequences in coding regions
    'canonical',                   # indicates if transcript is canonical
    'mane',                        # output mane transcript status
    'tsl',                         # output transcript support level
    'appris',                      # output APPRIS transcript annotation
    'ccds',                        # output CCDS identifier
    'xref_refseq',                 # output refseq mrna xref
    'uniprot',                     # output Uniprot identifiers (includes UniParc)
    'protein',                     # add e! protein ID to extra column
    'biotype',                     # add biotype of transcript to output
    'hgnc',                        # add HGNC gene ID to extra column
    'symbol',                      # add gene symbol (e.g. HGNC)
    'transcript_version',          # add transcript version to stable id in feature column
    'gene_phenotype',              # indicate if genes are phenotype-associated
    'mirna',                       # identify miRNA structural elements overlapped by variant
    'hgvs',                        # add HGVS names to extra column
    'hgvsg',                       # add HGVS g. also
    'hgvsg_use_accession',         # force HGVSg to return on chromosome accession instead of input chr name
    'shift_hgvs=i',                # disable/enable 3-prime shifting of HGVS indels to comply with standard
    'ambiguous_hgvs',              # allow input HGVSp. to resolve to many input variants
    'sift=s',                      # SIFT predictions
    'polyphen=s',                  # PolyPhen predictions
    'humdiv',                      # use humDiv instead of humVar for PolyPhen
    'condel=s',                    # Condel predictions
    'variant_class',               # get SO variant type
    'regulatory',                  # enable regulatory stuff
    'cell_type=s',                 # filter cell types for regfeats
    'convert=s',                   # DEPRECATED: convert input to another format (doesn't run VEP)
    'no_intergenic',               # don't print out INTERGENIC consequences
    'vcf',                         # produce vcf output
    'solr',                        # produce XML output for Solr
    'json',                        # produce JSON document output
    'tab',                         # produce tabulated output
    'vcf_info_field=s',            # allow user to change VCF info field name
    'keep_csq',                    # don't nuke existing CSQ fields in VCF
    'keep_ann',                    # synonym for keep_csq
    'lrg',                         # enable LRG-based features
    'fields=s',                    # define your own output fields
    'domains',                     # output overlapping protein features
    'numbers',                     # include exon and intron numbers
    'total_length',                # give total length alongside positions e.g. 14/203
    'allele_number',               # indicate allele by number to avoid confusion with VCF conversions
    'show_ref_allele',             # indicate reference allele
    'no_escape',                   # don't percent-escape HGVS strings
    'ambiguity',                   # Add allele ambiguity code
    'shift_3prime=i',              # enables shifting of all variants to 3prime
    'shift_genomic',               # adds genomic shifting to output, and provides shifting of intergenic variants
    'shift_length',                # adds the length of the transcript directional shift to output

    # cache stuff
    'database',                    # must specify this to use DB now
    'cache',                       # use cache
    'cache_version=i',             # specify a different cache version
    'show_cache_info',             # print cache info and quit
    'dir=s',                       # dir where cache is found (defaults to $HOME/.vep/)
    'dir_cache=s',                 # specific directory for cache
    'dir_plugins=s',               # specific directory for plugins
    'offline',                     # offline mode uses minimal set of modules installed in same dir, no DB connection
    'fasta|fa=s',                  # file or dir containing FASTA files with reference sequence
    'fasta_dir=s',                 # dir containing FASTA file (may contain multiple species/assemblies)
    'no_fasta',                    # don't autodetect FASTA file in cache dir
    'sereal',                      # use Sereal instead of Storable for the cache
    'synonyms=s',                  # file of chromosome synonyms

    # these flags are for use with RefSeq caches
    'bam=s',                       # bam file used to modify transcripts
    'use_transcript_ref',          # extract the reference allele from the transcript (or genome)
    'use_given_ref',               # override use_transcript_ref setting that may be set from cache info

    # custom file stuff
    'custom=s' => ($config->{custom} ||= []), # specify custom tabixed bgzipped or bigWig file with annotation
    'tmpdir=s',                    # tmp dir used for BigWig retrieval
    'gff=s',                       # shortcut to --custom [file],,gff
    'gtf=s',                       # shortcut to --custom [file],,gtf
    'bigwig=s',                    # shortcut to --custom [file],,bigwig,exact
    'phyloP=s' => ($config->{phyloP} ||= []), # shortcut to using remote phyloP, may use multiple
    'phastCons=s' => ($config->{phastCons} ||= []), # shortcut to using remote phastCons, may use multiple
    'ucsc_assembly=s',             # required for phyloP, phastCons, e.g. use hg19 for GRCh37, hg38 for GRCh38
    'ucsc_data_root=s',            # replace if you have the data locally, defaults to http://hgdownload.cse.ucsc.edu/goldenpath/
    'custom_multi_allelic',        # prevents filtering of custom annotation data when comma separated lists are assumed to be allele specific

    # plugins
    'plugin=s' => ($config->{plugin} ||= []), # specify a method in a module in the plugins directory
    'safe',                        # die if plugins don't compile or spit warnings

    # debug
    'debug',                       # print out debug info
) or die "ERROR: Failed to parse command-line flags\n";

# Print the usage message and stop when --help was given or when the script
# was started without any arguments. The exit is unconditional so that it
# does not depend on what usage() happens to return.
if (!$arg_count || $config->{help}) {
    usage();
    exit(0);
}

# Store an explicit 0 when --database was not given.
$config->{database} ||= 0;

my $runner = Bio::EnsEMBL::VEP::Runner->new($config);

# --show_cache_info: print the runner's version data as "key<TAB>value"
# lines and quit without running. Keys are sorted so the output order is
# the same on every run.
if ($config->{show_cache_info}) {
    my $info = $runner->get_output_header_info->{version_data};
    print "$_\t$info->{$_}\n" for sort keys %$info;
    exit(0);
}

$runner->run();
# Prints the banner, the version information read from the ".version" file
# next to this script, and a summary of the basic options.
# Takes no arguments. Returns whatever print returns (true on success).
sub usage {
    my $version_file = $RealBin . '/.version';
    my $version_info = get_version_string($version_file);

    # The heredoc interpolates: \@ keeps the literal "@" characters and
    # $version_info is expanded in place.
    return print <<"END";
#----------------------------------#
# ENSEMBL VARIANT EFFECT PREDICTOR #
#----------------------------------#
Versions:
$version_info
Help: dev\@ensembl.org , helpdesk\@ensembl.org
Twitter: \@ensembl
http://www.ensembl.org/info/docs/tools/vep/script/index.html
Usage:
./vep [--cache|--offline|--database] [arguments]
Basic options
=============
--help Display this message and quit
-i | --input_file Input file
-o | --output_file Output file
--force_overwrite Force overwriting of output file
--species [species] Species to use [default: "human"]
--everything Shortcut switch to turn on commonly used options. See web
documentation for details [default: off]
--fork [num_forks] Use forking to improve script runtime
For full option documentation see:
http://www.ensembl.org/info/docs/tools/vep/script/vep_options.html
END
}
# Trailing true value; as the last statement of the script it does nothing
# when the file is executed directly.
1;