-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathsetup.py
36 lines (31 loc) · 1.29 KB
/
setup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
from six.moves.urllib import request
from os.path import dirname, abspath, exists, join
from os import mkdir
import gzip
import shutil
import nltk
# Absolute path of the directory that contains this script.
PROJECT_ROOT = dirname(abspath(__file__))

# Every downloaded and extracted file is stored under <PROJECT_ROOT>/data.
data_path = join(PROJECT_ROOT, 'data')

# Create the data directory only when it is not there yet.
if not exists(data_path):
    mkdir(data_path)
# Remote archives fetched by this script. The PubMed template takes the
# four-digit, zero-padded file number.
_PUBMED_BASELINE_URL = (
    'ftp://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed18n{}.xml.gz')
_SNP_CITED_URL = (
    'ftp://ftp.ncbi.nlm.nih.gov/snp/Entrez/eLinks/snp_pubmed_cited.gz')

# File numbers to fetch: 1 .. 927 inclusive (the stop value is exclusive).
# NOTE(review): this stops at pubmed18n0927 -- confirm against the server
# listing that no higher-numbered baseline file exists.
_PUBMED_FILE_NUMBERS = range(1, 928)


def _download_and_extract(url, dest_dir):
    """Download the gzip archive at ``url`` into ``dest_dir`` and unpack it.

    The archive is saved under the last path component of ``url``; the
    decompressed copy is written next to it under the same name without the
    trailing ``.gz``. The downloaded archive is kept after extraction.

    Args:
        url: Location of a gzip-compressed file; must end in ``.gz``.
        dest_dir: Existing directory that receives both files.

    Raises:
        ValueError: If ``url`` does not end in ``.gz`` (the output name
            could not be derived from it).
    """
    archive_name = url.rsplit('/', 1)[-1]
    if not archive_name.endswith('.gz'):
        raise ValueError('expected a .gz archive URL, got {!r}'.format(url))

    archive_path = join(dest_dir, archive_name)
    extracted_path = archive_path[:-len('.gz')]

    print('Downloading {} ...'.format(archive_name))
    request.urlretrieve(url, archive_path)

    print('Extracting {} ...'.format(archive_name))
    # Stream the decompressed bytes so large archives never sit in memory.
    with gzip.open(archive_path, 'rb') as f_in:
        with open(extracted_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)


# PubMed baseline files first, in ascending order.
for i in _PUBMED_FILE_NUMBERS:
    _download_and_extract(
        _PUBMED_BASELINE_URL.format(str(i).zfill(4)), data_path)

# Then the SNP citation mapping.
_download_and_extract(_SNP_CITED_URL, data_path)
# Fetch the NLTK resources one after another, in this fixed order.
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)