Skip to content

Commit e4065f2

Browse files
committed
Merge remote-tracking branch 'sjfranklin/Add-HTTP-Method'
* sjfranklin/Add-HTTP-Method:
  - taxdump should work with http method
  - Added clarifying comments for some changes
  - Explained the regex, which finds the relevant file name from the raw http response
  - The main difference here is how it pulls the list of URLs and parses them; the actual download is essentially the same. If user mentions http as an argument, load the http rake. Ruby is quite over my head so this is a very quick-and-dirty fix
  - Load normal or http rakefile depending on option
Signed-off-by: Anurag Priyam <[email protected]>
2 parents 0d3a1df + b09d23d commit e4065f2

File tree

2 files changed

+95
-4
lines changed

2 files changed

+95
-4
lines changed

bin/ncbi-blast-dbs

+11-4
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,20 @@
11
#!/usr/bin/env ruby
22

33
require 'rake'
4-
import "#{File.dirname(__FILE__)}/../lib/ncbi-blast-dbs.rake"
54

65
# On Ctrl-C (SIGINT), say so and leave at once; exit! skips at_exit handlers.
Signal.trap('INT') do
  puts "Quitting ..."
  exit!
end
109

11-
Rake.application.init 'ncbi-blast-dbs'
12-
Rake.application.load_imports
13-
Rake.application.top_level
10+
# Pick the rakefile to run: the HTTP variant when "http" appears among the
# command-line arguments, the default one otherwise. Everything after the
# choice is identical for both, so it is written once.
rakefile = ARGV.include?('http') ? 'http-ncbi-blast-dbs' : 'ncbi-blast-dbs'
import "#{File.dirname(__FILE__)}/../lib/#{rakefile}.rake"
Rake.application.init rakefile
Rake.application.load_imports
Rake.application.top_level

lib/http-ncbi-blast-dbs.rake

+84
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
require 'net/http'
2+
require 'uri'
3+
# Announce which rakefile is in use (this file is http-ncbi-blast-dbs.rake).
puts "using http-ncbi-blast-dbs.rake"
4+
# Downloads the tarball at the given URL if a local copy does not exist, if the
# local copy is older than the one at the URL, or if the local copy is corrupt;
# then verifies it against its .md5 file and extracts it.
#
# url        - location of the tarball; interpolated into the wget commands.
# last_to_do - the volume (or volumes) to extract in full. May be a String or
#              an Array of Strings, each either a bare file name or a full
#              path/URL; only the base name is compared. Every other volume is
#              extracted without the files that are shared between volumes.
#              nil (or any non-matching value) means "no volume is special".
#
# NOTE: a checksum mismatch deletes the local files and starts over; there is
# no upper bound on the number of retries.
def download(url, last_to_do)
  file = File.basename(url)

  # Resume an interrupted download or fetch the file for the first time. If
  # the file on the server is newer, then it is downloaded from start.
  sh "wget -Nc --no-verbose #{url}"
  # If the local copy is already fully retrieved, then the previous command
  # ignores the timestamp. So we check with the server again if the file on
  # the server is newer and if so download the new copy.
  sh "wget -N --no-verbose #{url}"

  # Same two-step fetch for the checksum file.
  sh "wget -Nc --no-verbose #{url}.md5"
  sh "wget -N --no-verbose #{url}.md5"

  # Callers hand over either a single name or a list, possibly with host and
  # directory still attached, so normalise before comparing with our file.
  extract_fully = Array(last_to_do).any? { |name| File.basename(name.to_s) == file }

  # Verify the tarball. Re-download it if corrupt; extract otherwise.
  sh "md5sum -c #{file}.md5" do |matched, _|
    if !matched
      sh "rm #{file} #{file}.md5"
      download(url, last_to_do)
    elsif extract_fully
      # Too many tar instances unzipping the same file clutter the system, so
      # only the designated volume unpacks everything.
      sh "tar xfov #{file}"
    else
      # At least nr and nt tarballs have identical files .?al; unsure of others.
      sh "tar xfov #{file} --exclude='*.?al' --exclude='taxdb*'"
    end
  end
end
33+
34+
35+
# Fetches the BLAST database directory listing from NCBI over HTTPS and groups
# the tarball volumes found there by database name.
#
# Returns a Hash that maps a database name (the part of the file name before
# its first '.') to an Array of "host/dir/file.tar.gz" paths, in listing order.
# The paths carry no URL scheme.
# Raises RuntimeError if the server does not answer with a 2xx status, rather
# than silently reporting that no databases exist.
def databases
  host = 'ftp.ncbi.nlm.nih.gov'
  dir = 'blast/db'
  uri = URI.parse("https://#{host}/#{dir}/")

  response = Net::HTTP.get_response(uri)
  unless response.is_a?(Net::HTTPSuccess)
    raise "Could not list #{uri}: HTTP #{response.code} #{response.message}"
  end

  # The body is split on whitespace, so every link shows up as one token like
  #   href="tsa_nt.06.tar.gz">tsa_nt.06.tar.gz</a>
  # from which the link text (tsa_nt.06.tar.gz) is kept. Only .tar.gz entries
  # are collected; the .md5 links are not needed here.
  tarball_link = %r{\Ahref="[^"]*">([^<]*\.tar\.gz)</a>\z}
  tarballs = response.body.to_s.split.map { |token| token[tarball_link, 1] }.compact

  tarballs
    .map { |name| File.join(host, dir, name) }
    .group_by { |path| File.basename(path).split('.').first }
end
60+
61+
62+
# Create a user-facing task for each database to drive the download of its
# volumes in parallel.
databases.each do |name, files|
  # Only the final volume is unpacked in full. download works with the base
  # name of each URL, so the marker is handed over as a base name too.
  last_volume = File.basename(files.last)
  volume_tasks = files.map { |file| task(file) { download(file, last_volume) } }
  multitask(name => volume_tasks)
end

# List the names of all databases that can be downloaded if executed without
# any arguments. The listing is fetched once per invocation of this task.
task :default do
  puts databases.keys.push('taxdump').join(', ')
end

# Fetch the taxonomy dump. It is a single tarball, so there is no final volume
# to single out.
task :taxdump do
  download('https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz', nil)
end

# "http" on the command line selects this rakefile; since it is passed through
# to rake as a task name as well, a task of that name is defined so the
# argument is accepted.
task :http do
  puts "using http method"
end

0 commit comments

Comments
 (0)