covidgraph · Apr 14, 2020
diff --git a/‎Dockerfile
+4 b/‎Dockerfile
+4
diff --git a/‎README.md
+22-5 b/‎README.md
+22-5
diff --git a/‎covid_graph/download.py
-103 b/‎covid_graph/download.py
-103
diff --git a/‎covid_graph/helper.py
+37 b/‎covid_graph/helper.py
+37
diff --git a/‎covid_graph/load_to_neo4j.py ‎covid_graph/jhu.py
+35-59 b/‎covid_graph/load_to_neo4j.py ‎covid_graph/jhu.py
+35-59
diff --git a/‎covid_graph/unwpp.py
+95 b/‎covid_graph/unwpp.py
+95
diff --git a/‎docker-compose.yml
+1 b/‎docker-compose.yml
+1
diff --git a/‎requirements.txt
+2-1 b/‎requirements.txt
+2-1
diff --git a/‎run.py
+32-24 b/‎run.py
+32-24
diff --git a/‎tests/test_base.py
+18 b/‎tests/test_base.py
+18
@@ -3,9 +3,13 @@ FROM python:3
 RUN mkdir /src
 RUN mkdir /download
 
+# copy covid_graph package
 COPY covid_graph /src/covid_graph
+# copy run.py script
 COPY run.py /src/
 COPY requirements.txt /src/
+# copy tests
+COPY tests /src/test
 
 WORKDIR /src
 
 
@@ -1,12 +1,29 @@
-# A Knowledge Graph on Covid-19
+# CovidGraph data loading module for case statistics from JHU and UN World Population data 
 
-A knowledge graph that integrates case numbers reported by John Hopkins University and population data from the UN. Work in progress, looking for more datasources, PR welcome!
+Build Docker image:
 
-The graph is available in a Neo4j Sandbox: https://10-0-1-172-33065.neo4jsandbox.com/browser/
+```shell script
+docker build -t data_jhu_population .
+```
+
+You need to set the following environment variables in the Docker container to run it:
 
-**User:** public, **Password:** public
+```shell script
+GC_NEO4J_URL: URL of Neo4j instance
+GC_NEO4J_USER: Neo4j username
+GC_NEO4J_PASSWORD: Neo4j password
+RUN_MODE: test or full
+```
+### RUN_MODE
+The `test` mode runs some basic tests, including a check that the source files are available. It is meant to be executed at runtime
+in a data loading pipeline. The goal is to have some basic sanity checks and avoid long-running downloads if something is wrong.
+This is only a part of the full test suite that is executed as part of CI.
 
-You can add it to Neo4j Desktop with the bolt URL, same user/password: `bolt://100.24.206.62:33064`
+Run the container:
+
+```shell script
+docker run --env GC_NEO4J_URL=bolt://myhost:7687 --env GC_NEO4J_USER=neo4j --env GC_NEO4J_PASSWORD=password --env RUN_MODE=test data_jhu_population
+```
 
 ## Datamodel 
 ![Data Model](docs/datamodel.png)
 
@@ -1,3 +1,7 @@
+import os
+import shutil
+import zipfile
+
 from py2neo.database import ClientError
 
 
@@ -20,3 +24,36 @@ def setup_db(graph):
             graph.schema.create_index(index[0], index[1])
         except ClientError:
             pass
+
+
+def unzip_file(zip_file_path, skip_existing=False):
+    """
+    Unzip a zip file at the same directory. Return the path to the unzipped directory.
+
+    Note: By default the data is not overwritten. Remove target directory before unzipping.
+
+    :param zip_file_path: Path to the zip file.
+    :param skip_existing: Do not unzip if directory exists. Default is false, set to true for dev.
+    :return: Path to unzipped directory.
+    """
+    zip_file_directory = os.path.dirname(zip_file_path)
+    zip_file_name = os.path.basename(zip_file_path)
+
+    target_directory = os.path.join(zip_file_directory, zip_file_name.replace('.zip', ''))
+
+    if skip_existing:
+        if os.path.exists(target_directory):
+            log.info("Unzipped directory exists, skip_existing is True, do not download again.")
+            return target_directory
+
+    if os.path.exists(target_directory):
+        log.debug("Target directory exists already {}".format(target_directory))
+        log.debug("Delete to unzip again.")
+        shutil.rmtree(target_directory)
+
+    log.debug('Unzip {} to {}'.format(zip_file_path, target_directory))
+
+    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
+        zip_ref.extractall(target_directory)
+
+    return target_directory
@@ -1,13 +1,43 @@
+import csv
 import os
 import logging
-import csv
-from uuid import uuid4
+
+import requests
+from dateutil.parser import parse, ParserError
 from graphio import NodeSet, RelationshipSet
-from dateutil.parser import parse
-from dateutil.parser import ParserError
 
 log = logging.getLogger(__name__)
 
+JHU_GITHUB_ARCHIVE_LINK = 'https://codeload.github.com/CSSEGISandData/COVID-19/zip/master'
+JHU_FILE_NAME = 'jhu_covid19.zip'
+
+
+def download_jhu(target_dir, skip_existing=False):
+    """
+    Downlaod the data repository from JHU.
+
+    https://github.com/CSSEGISandData/COVID-19
+
+    :param target_dir: Target directory where to store files.
+    :param skip_existing: Do not download if file exists. Default is false, set to true for dev.
+    :return: Path to downloaded file.
+    """
+    log.info('Download JHU data.')
+    target_file = os.path.join(target_dir, JHU_FILE_NAME)
+
+    if skip_existing:
+        if os.path.exists(target_file):
+            log.info("File exists, skip_existing is True, do not download again.")
+            return target_file
+
+    log.info('Download to {}'.format(target_file))
+
+    r = requests.get(JHU_GITHUB_ARCHIVE_LINK, allow_redirects=True)
+
+    open(target_file, 'wb').write(r.content)
+
+    return target_file
+
 
 def read_daily_report_JHU(path_to_jhu, graph):
     """
@@ -182,58 +212,4 @@ def parse_jhu_new_file_row(row):
     lat = row[5]
     long = row[6]
 
-    return country, province, date, confirmed, death, recovered, lat, long
-
-
-def load_wpp_data(base_path, graph):
-    """
-    Load UN population data.
-
-    :param base_path: Path where file was downloaded.
-    """
-    un_wpp_csv_file = os.path.join(base_path, 'WPP2019_PopulationByAgeSex_Medium.csv')
-    log.info('Parse UN population data file: {}'.format(un_wpp_csv_file))
-
-    country = NodeSet(['Country'], ['name'])
-    age_group_nodes = NodeSet(['AgeGroup'], ['group'])
-    country_total_group = RelationshipSet('CURRENT_TOTAL', ['Country'], ['AgeGroup'], ['name'], ['group'])
-    country_male_group = RelationshipSet('CURRENT_MALE', ['Country'], ['AgeGroup'], ['name'], ['group'])
-    country_female_group = RelationshipSet('CURRENT_FEMALE', ['Country'], ['AgeGroup'], ['name'], ['group'])
-
-    countries_added = set()
-    age_groups_added = set()
-
-    with open(un_wpp_csv_file, 'rt') as f:
-        csv_file = csv.reader(f, delimiter=',', quotechar='"')
-        # skip header
-        next(csv_file)
-        for row in csv_file:
-            # LocID,Location,VarID,Variant,Time,MidPeriod,AgeGrp,AgeGrpStart,AgeGrpSpan,PopMale,PopFemale,PopTotal
-            loc_id = row[0]
-            location = row[1]
-            time = int(row[4])
-            age_group = row[6]
-            age_group_start = int(row[7])
-            age_group_span = row[8]
-            pop_male = int(float((row[9])) * 1000)
-            pop_female = int(float((row[10])) * 1000)
-            pop_total = int(float((row[11])) * 1000)
-
-            # only take 2019
-            if time == 2019:
-                if location not in countries_added:
-                    country.add_node({'name': location, 'un_id': loc_id})
-                    countries_added.add(location)
-                if age_group not in age_groups_added:
-                    age_group_nodes.add_node({'group': age_group, 'start': age_group_start, 'span': age_group_span})
-
-                country_total_group.add_relationship({'name': location}, {'group': age_group}, {'count': pop_total})
-                country_male_group.add_relationship({'name': location}, {'group': age_group}, {'count': pop_male})
-                country_female_group.add_relationship({'name': location}, {'group': age_group}, {'count': pop_female})
-
-    log.info('Load data to Neo4j')
-    country.merge(graph)
-    age_group_nodes.merge(graph)
-    country_total_group.merge(graph)
-    country_male_group.merge(graph)
-    country_female_group.merge(graph)
+    return country, province, date, confirmed, death, recovered, lat, long
@@ -0,0 +1,95 @@
+import csv
+import os
+import logging
+import requests
+from graphio import NodeSet, RelationshipSet
+
+
+log = logging.getLogger(__name__)
+
+WPP_AGE_CSV = 'https://population.un.org/wpp/Download/Files/1_Indicators%20(Standard)/CSV_FILES/WPP2019_PopulationByAgeSex_Medium.csv'
+WPP_FILENAME = 'WPP2019_PopulationByAgeSex_Medium.csv'
+
+
+def download_population_data(target_dir, skip_existing=False):
+    """
+    Download population data from the UN world population prospect.
+
+    The UN gathers data on world population statistics and publishes the
+    world population prospects: https://population.un.org/wpp/
+
+    The latest data set in CSV format can be found here: https://population.un.org/wpp/Download/Standard/CSV/
+
+    :param target_dir: Target directory where to store files.
+    :param skip_existing: Do not download if file exists. Default is false, set to true for dev.
+    :return: Path to downloaded file.
+    """
+    log.info('Download UN WPP data')
+    target_file = os.path.join(target_dir, WPP_FILENAME)
+
+    if skip_existing:
+        if os.path.exists(target_file):
+            log.info("File exists, skip_existing is True, do not download again.")
+            return target_file
+
+    log.info('Download to {}'.format(target_file))
+
+    r = requests.get(WPP_AGE_CSV, allow_redirects=True)
+
+    open(target_file, 'wb').write(r.content)
+
+    return target_file
+
+
+def load_wpp_data(base_path, graph):
+    """
+    Load UN population data.
+
+    :param base_path: Path where file was downloaded.
+    """
+    un_wpp_csv_file = os.path.join(base_path, 'WPP2019_PopulationByAgeSex_Medium.csv')
+    log.info('Parse UN population data file: {}'.format(un_wpp_csv_file))
+
+    country = NodeSet(['Country'], ['name'])
+    age_group_nodes = NodeSet(['AgeGroup'], ['group'])
+    country_total_group = RelationshipSet('CURRENT_TOTAL', ['Country'], ['AgeGroup'], ['name'], ['group'])
+    country_male_group = RelationshipSet('CURRENT_MALE', ['Country'], ['AgeGroup'], ['name'], ['group'])
+    country_female_group = RelationshipSet('CURRENT_FEMALE', ['Country'], ['AgeGroup'], ['name'], ['group'])
+
+    countries_added = set()
+    age_groups_added = set()
+
+    with open(un_wpp_csv_file, 'rt') as f:
+        csv_file = csv.reader(f, delimiter=',', quotechar='"')
+        # skip header
+        next(csv_file)
+        for row in csv_file:
+            # LocID,Location,VarID,Variant,Time,MidPeriod,AgeGrp,AgeGrpStart,AgeGrpSpan,PopMale,PopFemale,PopTotal
+            loc_id = row[0]
+            location = row[1]
+            time = int(row[4])
+            age_group = row[6]
+            age_group_start = int(row[7])
+            age_group_span = row[8]
+            pop_male = int(float((row[9])) * 1000)
+            pop_female = int(float((row[10])) * 1000)
+            pop_total = int(float((row[11])) * 1000)
+
+            # only take 2019
+            if time == 2019:
+                if location not in countries_added:
+                    country.add_node({'name': location, 'un_id': loc_id})
+                    countries_added.add(location)
+                if age_group not in age_groups_added:
+                    age_group_nodes.add_node({'group': age_group, 'start': age_group_start, 'span': age_group_span})
+
+                country_total_group.add_relationship({'name': location}, {'group': age_group}, {'count': pop_total})
+                country_male_group.add_relationship({'name': location}, {'group': age_group}, {'count': pop_male})
+                country_female_group.add_relationship({'name': location}, {'group': age_group}, {'count': pop_female})
+
+    log.info('Load data to Neo4j')
+    country.merge(graph)
+    age_group_nodes.merge(graph)
+    country_total_group.merge(graph)
+    country_male_group.merge(graph)
+    country_female_group.merge(graph)
@@ -7,4 +7,5 @@ services:
       - GC_NEO4J_URL=bolt://host.docker.internal:7687
       - GC_NEO4J_USER=neo4j
       - GC_NEO4J_PASSWORD=test
+      - RUN_MODE=test
     #command: python run.py
@@ -3,4 +3,5 @@ py2neo
 #git+https://github.com/technige/py2neo.git@v5#egg=py2neo
 requests
 python-dateutil
-graphio>=0.0.12
+graphio>=0.0.12
+pytest
@@ -1,6 +1,8 @@
 import os
+import sys
 import logging
 import py2neo
+import pytest
 
 logging.basicConfig(level=logging.DEBUG)
 logging.getLogger('py2neo.connect.bolt').setLevel(logging.WARNING)
@@ -11,41 +13,47 @@
 log = logging.getLogger(__name__)
 
 # import and setup
-from covid_graph import download, load_to_neo4j, helper, post
+from covid_graph import helper, post, jhu, unwpp
 
 ROOT_DIR = os.getenv('ROOT_DIR', '/download')
 GC_NEO4J_URL = os.getenv('GC_NEO4J_URL', 'bolt://localhost:7687')
 GC_NEO4J_USER = os.getenv('GC_NEO4J_USER', 'neo4j')
 GC_NEO4J_PASSWORD = os.getenv('GC_NEO4J_PASSWORD', 'test')
+RUN_MODE = os.getenv('RUN_MODE', 'test')
 
-for v in [ROOT_DIR, GC_NEO4J_URL, GC_NEO4J_USER, GC_NEO4J_PASSWORD]:
-    log.debug(v)
+if RUN_MODE.lower() == 'test':
+    log.info("Run tests")
+    pytest.main()
 
-graph = py2neo.Graph(GC_NEO4J_URL, user=GC_NEO4J_USER, password=GC_NEO4J_PASSWORD)
-log.debug(graph)
+else:
+    for v in [ROOT_DIR, GC_NEO4J_URL, GC_NEO4J_USER, GC_NEO4J_PASSWORD]:
+        log.debug(v)
 
-result = list(graph.run("MATCH (a) RETURN a LIMIT 1"))
-log.debug(result)
+    graph = py2neo.Graph(GC_NEO4J_URL, user=GC_NEO4J_USER, password=GC_NEO4J_PASSWORD)
+    log.debug(graph)
 
-# setup DB
-helper.setup_db(graph)
+    result = list(graph.run("MATCH (a) RETURN a LIMIT 1"))
+    log.debug(result)
 
-if not os.path.exists(ROOT_DIR):
-    os.makedirs(ROOT_DIR)
-###
+    # setup DB
+    helper.setup_db(graph)
 
-# download data
-jhu_zip_file = download.download_jhu(ROOT_DIR)
-jhu_dir = download.unzip_file(jhu_zip_file)
+    if not os.path.exists(ROOT_DIR):
+        os.makedirs(ROOT_DIR)
+    ###
 
-wpp_csv_file = download.download_population_data(ROOT_DIR, skip_existing=True)
-###
+    # download data
+    jhu_zip_file = covid_graph.jhu.download_jhu(ROOT_DIR)
+    jhu_dir = covid_graph.helper.unzip_file(jhu_zip_file)
 
-# load to Neo4j
-load_to_neo4j.read_daily_report_JHU(jhu_dir, graph)
-load_to_neo4j.load_wpp_data(ROOT_DIR, graph)
-###
+    wpp_csv_file = covid_graph.unwpp.download_population_data(ROOT_DIR, skip_existing=True)
+    ###
 
-# post process
-post.set_latest_update(graph)
-###
+    # load to Neo4j
+    covid_graph.jhu.read_daily_report_JHU(jhu_dir, graph)
+    covid_graph.unwpp.load_wpp_data(ROOT_DIR, graph)
+    ###
+
+    # post process
+    post.set_latest_update(graph)
+    ###
@@ -0,0 +1,18 @@
+import pytest
+import requests
+
+JHU_GITHUB_ARCHIVE_LINK = 'https://codeload.github.com/CSSEGISandData/COVID-19/zip/master'
+JHU_FILE_NAME = 'jhu_covid19.zip'
+WPP_AGE_CSV = 'https://population.un.org/wpp/Download/Files/1_Indicators%20(Standard)/CSV_FILES/WPP2019_PopulationByAgeSex_Medium.csv'
+
+@pytest.mark.runtest
+def test_jhu_available():
+    r = requests.head(JHU_GITHUB_ARCHIVE_LINK, allow_redirects=True)
+
+    assert r.status_code == 200
+
+
+def test_wpp_available():
+    r = requests.head(WPP_AGE_CSV, allow_redirects=True)
+
+    assert r.status_code == 200