import elasticsearch.helpers
- from elasticsearch import Elasticsearch, RequestsHttpConnection, serializer, compat, exceptions
+ from elasticsearch import Elasticsearch, RequestsHttpConnection, serializer, compat, exceptions, helpers

if (not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None)):
    ssl._create_default_https_context = ssl._create_unverified_context
@@ -71,31 +71,39 @@ def load(self):
    def __load_file(self, file):
        doc_count = 0
        data = []
+         chunk_size = 100  # Set chunk size to 100 records

        with open(file) as f:
            print("Starting indexing on " + f.name)
            reader = csv.DictReader(f)

            for row in reader:
                # gracefully handle empty locations
-                 if (row['decimalLatitude'] == '' or row['decimalLongitude'] == ''):
+                 if row['decimalLatitude'] == '' or row['decimalLongitude'] == '':
                    row['location'] = ''
                else:
-                     row['location'] = row['decimalLatitude'] + "," + row['decimalLongitude']
+                     row['location'] = row['decimalLatitude'] + "," + row['decimalLongitude']

-                 # pipeline code identifies null yearCollected values as 'unknown'. es_loader should be empty string
-                 if (row['yearCollected'] == 'unknown'):
-                     row['yearCollected'] = ''
-                 if (row['yearCollected'] == 'Unknown'):
+                 # handle 'unknown' values for yearCollected
+                 if row['yearCollected'].lower() == 'unknown':
                    row['yearCollected'] = ''

-                 data.append({k: v for k, v in row.items() if v})  # remove any empty values
+                 data.append({k: v for k, v in row.items() if v})  # remove empty values
+
+                 # When chunk_size is reached, send bulk data to Elasticsearch
+                 if len(data) == chunk_size:
+                     helpers.bulk(client=self.es, index=self.index_name, actions=data, raise_on_error=True, request_timeout=60)
+                     doc_count += len(data)
+                     print(f"Indexed {len(data)} documents. Total indexed: {doc_count}")
+                     data = []  # Clear the data list for the next chunk

-             elasticsearch.helpers.bulk(client=self.es, index=self.index_name, actions=data,
-                                        raise_on_error=True, chunk_size=10000, request_timeout=60)
-             doc_count += len(data)
-             print("Indexed {} documents in {}".format(doc_count, f.name))
+             # Index remaining data if it's less than chunk_size
+             if data:
+                 helpers.bulk(client=self.es, index=self.index_name, actions=data, raise_on_error=True, request_timeout=60)
+                 doc_count += len(data)
+                 print(f"Indexed {len(data)} remaining documents. Total indexed: {doc_count}")

+             print("Finished indexing in", f.name)
        return doc_count

    def __create_index(self):
@@ -144,7 +152,6 @@ def __create_index(self):
                    "dayCollected": {"type": "text"},
                    "verbatimEventDate": {"type": "text"},
                    "collectorList": {"type": "text"},
-                     "Sample_bcid": {"type": "text"},
                    "occurrenceID": {"type": "text"},
                    "otherCatalogNumbers": {"type": "text"},
                    "fieldNumber": {"type": "text"},
@@ -171,7 +178,8 @@ def __create_index(self):
                    "zeScore": {"type": "text"},
                    "diagnosticLab": {"type": "text"},
                    "projectId": {"type": "text"},
-                     "projectURL": {"type": "text"}
+                     "projectURL": {"type": "text"},
+                     "Sample_bcid": {"type": "text"}
                }
            }
        }
@@ -190,7 +198,7 @@ def get_files(dir, ext='csv'):
index = 'amphibiandisease'
drop_existing = True
alias = 'amphibiandisease'
- host = 'tarly.cyverse.org:80'
+ host = '149.165.170.158:80'
#file_location = 'test.csv'
file_location = 'data/amphibian_disease_data_processed.csv'
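For reference, here is a minimal, self-contained sketch of the chunked bulk-indexing pattern the change above introduces. It is an illustration under assumptions, not code from the repository: the Elasticsearch host URL and the load_csv helper name are made up, while helpers.bulk, csv.DictReader, the index name, and the CSV path mirror the values used in the diff.

# chunked bulk-indexing sketch (illustrative only)
import csv

from elasticsearch import Elasticsearch, helpers

es = Elasticsearch(['http://localhost:9200'])  # assumed local cluster, not the project host
index_name = 'amphibiandisease'
chunk_size = 100  # same chunk size as the change above


def load_csv(path):
    """Index a CSV file in fixed-size chunks; return the number of documents sent."""
    doc_count = 0
    data = []
    with open(path) as f:
        for row in csv.DictReader(f):
            data.append({k: v for k, v in row.items() if v})  # drop empty values
            if len(data) == chunk_size:
                # a full chunk is ready: send it and reset the buffer
                helpers.bulk(client=es, index=index_name, actions=data,
                             raise_on_error=True, request_timeout=60)
                doc_count += len(data)
                data = []
    if data:
        # flush whatever is left over after the last full chunk
        helpers.bulk(client=es, index=index_name, actions=data,
                     raise_on_error=True, request_timeout=60)
        doc_count += len(data)
    return doc_count


print(load_csv('data/amphibian_disease_data_processed.csv'))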