Skip to content

Commit c84ba9d

Browse files
committed
reconfigure loader to process only 100 records at a time. simplify python libs in requirements. change target of es instance
1 parent 7462d62 commit c84ba9d

File tree

3 files changed

+27
-107
lines changed

3 files changed

+27
-107
lines changed

Diff for: foo.py

-14
This file was deleted.

Diff for: loader.py

+23-15
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88

99
import elasticsearch.helpers
10-
from elasticsearch import Elasticsearch, RequestsHttpConnection, serializer, compat, exceptions
10+
from elasticsearch import Elasticsearch, RequestsHttpConnection, serializer, compat, exceptions, helpers
1111

1212
if (not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None)):
1313
ssl._create_default_https_context = ssl._create_unverified_context
@@ -71,31 +71,39 @@ def load(self):
7171
def __load_file(self, file):
7272
doc_count = 0
7373
data = []
74+
chunk_size = 100 # Set chunk size to 100 records
7475

7576
with open(file) as f:
7677
print("Starting indexing on " + f.name)
7778
reader = csv.DictReader(f)
7879

7980
for row in reader:
8081
# gracefully handle empty locations
81-
if (row['decimalLatitude'] == '' or row['decimalLongitude'] == ''):
82+
if row['decimalLatitude'] == '' or row['decimalLongitude'] == '':
8283
row['location'] = ''
8384
else:
84-
row['location'] = row['decimalLatitude'] + "," + row['decimalLongitude']
85+
row['location'] = row['decimalLatitude'] + "," + row['decimalLongitude']
8586

86-
# pipeline code identifies null yearCollected values as 'unknown'. es_loader should be empty string
87-
if (row['yearCollected'] == 'unknown'):
88-
row['yearCollected'] = ''
89-
if (row['yearCollected'] == 'Unknown'):
87+
# handle 'unknown' values for yearCollected
88+
if row['yearCollected'].lower() == 'unknown':
9089
row['yearCollected'] = ''
9190

92-
data.append({k: v for k, v in row.items() if v}) # remove any empty values
91+
data.append({k: v for k, v in row.items() if v}) # remove empty values
92+
93+
# When chunk_size is reached, send bulk data to Elasticsearch
94+
if len(data) == chunk_size:
95+
helpers.bulk( client=self.es, index=self.index_name, actions=data, raise_on_error=True, request_timeout=60)
96+
doc_count += len(data)
97+
print(f"Indexed {len(data)} documents. Total indexed: {doc_count}")
98+
data = [] # Clear the data list for the next chunk
9399

94-
elasticsearch.helpers.bulk(client=self.es, index=self.index_name, actions=data,
95-
raise_on_error=True, chunk_size=10000, request_timeout=60)
96-
doc_count += len(data)
97-
print("Indexed {} documents in {}".format(doc_count, f.name))
100+
# Index remaining data if it’s less than chunk_size
101+
if data:
102+
helpers.bulk( client=self.es, index=self.index_name, actions=data, raise_on_error=True, request_timeout=60)
103+
doc_count += len(data)
104+
print(f"Indexed {len(data)} remaining documents. Total indexed: {doc_count}")
98105

106+
print("Finished indexing in", f.name)
99107
return doc_count
100108

101109
def __create_index(self):
@@ -144,7 +152,6 @@ def __create_index(self):
144152
"dayCollected": {"type":"text"},
145153
"verbatimEventDate": {"type":"text"},
146154
"collectorList": {"type": "text"},
147-
"Sample_bcid": {"type": "text"},
148155
"occurrenceID": {"type":"text"},
149156
"otherCatalogNumbers": {"type":"text"},
150157
"fieldNumber": {"type":"text"},
@@ -171,7 +178,8 @@ def __create_index(self):
171178
"zeScore": {"type":"text"},
172179
"diagnosticLab": {"type":"text"},
173180
"projectId": {"type":"text"},
174-
"projectURL": {"type":"text"}
181+
"projectURL": {"type":"text"},
182+
"Sample_bcid": {"type": "text"}
175183
}
176184
}
177185
}
@@ -190,7 +198,7 @@ def get_files(dir, ext='csv'):
190198
index = 'amphibiandisease'
191199
drop_existing = True
192200
alias = 'amphibiandisease'
193-
host = 'tarly.cyverse.org:80'
201+
host = '149.165.170.158:80'
194202
#file_location = 'test.csv'
195203
file_location = 'data/amphibian_disease_data_processed.csv'
196204

Diff for: requirements.txt

+4-78
Original file line numberDiff line numberDiff line change
@@ -1,78 +1,4 @@
1-
alabaster==0.7.12
2-
altgraph==0.16.1
3-
appnope==0.1.0
4-
atomicwrites==1.2.1
5-
attrs==18.2.0
6-
Babel==2.6.0
7-
certifi==2017.4.17
8-
chardet==3.0.4
9-
Click==7.0
10-
cycler==0.10.0
11-
decorator==4.0.11
12-
Django==2.1.5
13-
django-cors-headers==2.4.0
14-
django-filter==2.0.0
15-
django-haystack==2.8.1
16-
django-oauth2-provider==0.2.6.1
17-
djangorestframework==3.9.0
18-
djangorestframework-csv==2.1.0
19-
docopt==0.6.2
20-
docutils==0.14
21-
elasticsearch==5.4.0
22-
Flask==1.0.2
23-
future==0.17.1
24-
idna==2.5
25-
imagesize==1.1.0
26-
ipython-genutils==0.2.0
27-
isodate==0.5.4
28-
itsdangerous==1.1.0
29-
jedi==0.10.2
30-
Jinja2==2.10
31-
kiwisolver==1.0.1
32-
lxml==3.8.0
33-
macholib==1.11
34-
MarkupSafe==1.1.0
35-
matplotlib==3.0.3
36-
mock==2.0.0
37-
more-itertools==5.0.0
38-
numpy==1.13.1
39-
packaging==18.0
40-
pandas==0.20.3
41-
pathlib2==2.3.3
42-
pbr==5.1.1
43-
pefile==2018.8.8
44-
pexpect==4.2.1
45-
pickleshare==0.7.4
46-
pipreqs==0.4.9
47-
pluggy==0.8.1
48-
prompt-toolkit==1.0.14
49-
ptyprocess==0.5.2
50-
py==1.7.0
51-
Pygments==2.2.0
52-
PyInstaller==3.4
53-
pyparsing==2.2.0
54-
pytest==4.1.1
55-
python-dateutil==2.6.1
56-
pytz==2017.2
57-
requests==2.21.0
58-
rfc3987==1.3.7
59-
shortuuid==0.5.0
60-
simplegeneric==0.8.1
61-
simplejson==3.16.0
62-
six==1.10.0
63-
snowballstemmer==1.2.1
64-
Sphinx==1.8.3
65-
sphinx-bootstrap-theme==0.6.5
66-
sphinxcontrib-httpdomain==1.7.0
67-
sphinxcontrib-websupport==1.1.0
68-
testfixtures==5.2.0
69-
testontology-data-pipeline==0.0.1
70-
testontologydataipeline==0.0.1
71-
testontologydatapipeline==0.0.1
72-
traitlets==4.3.2
73-
unicodecsv==0.14.1
74-
urllib3==1.24.1
75-
wcwidth==0.1.7
76-
Werkzeug==0.14.1
77-
xlrd==1.2.0
78-
yarg==0.1.9
1+
certifi==2024.8.30
2+
elastic-transport==8.15.1
3+
elasticsearch==7.17.0
4+
urllib3==1.26.20

0 commit comments

Comments (0)