Skip to content

Commit ced0783

Browse files
author
Martin
committedApr 14, 2020
basic tests
1 parent 99dab56 commit ced0783

10 files changed

+246
-192
lines changed
 

‎Dockerfile

+4
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,13 @@ FROM python:3
33
RUN mkdir /src
44
RUN mkdir /download
55

6+
# copy covid_graph package
67
COPY covid_graph /src/covid_graph
8+
# copy run.py script
79
COPY run.py /src/
810
COPY requirements.txt /src/
11+
# copy tests
12+
COPY tests /src/test
913

1014
WORKDIR /src
1115

‎README.md

+22-5
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,29 @@
1-
# A Knowledge Graph on Covid-19
1+
# CovidGraph data loading module for case statistics from JHU and UN World Population data
22

3-
A knowledge graph that integrates case numbers reported by John Hopkins University and population data from the UN. Work in progress, looking for more datasources, PR welcome!
3+
Build Docker image:
44

5-
The graph is available in a Neo4j Sandbox: https://10-0-1-172-33065.neo4jsandbox.com/browser/
5+
```shell script
6+
docker build -t data_jhu_population .
7+
```
8+
9+
You need to set the following environment variables in the Docker container to run it:
610

7-
**User:** public, **Password:** public
11+
```shell script
12+
GC_NEO4J_URL: URL of Neo4j instance
13+
GC_NEO4J_USER: Neo4j username
14+
GC_NEO4J_PASSWORD: Neo4j password
15+
RUN_MODE: test or full
16+
```
17+
### RUN_MODE
18+
The `test` mode runs some basic tests including availability of files. It is meant to be executed at runtime
19+
in a data loading pipeline. The goal is to have some basic sanity checks and avoid long-running downloads if something is wrong.
20+
This is only a part of the full test suite that is executed as part of CI.
821

9-
You can add it to Neo4j Desktop with the bolt URL, same user/password: `bolt://100.24.206.62:33064`
22+
Run the container:
23+
24+
```shell script
25+
docker run --env GC_NEO4J_URL=bolt://myhost:7687 --env GC_NEO4J_USER=neo4j --env GC_NEO4J_PASSWORD=password --env RUN_MODE=test data_jhu_population
26+
```
1027

1128
## Datamodel
1229
![Data Model](docs/datamodel.png)

‎covid_graph/download.py

-103
This file was deleted.

‎covid_graph/helper.py

+37
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
import os
2+
import shutil
3+
import zipfile
4+
15
from py2neo.database import ClientError
26

37

@@ -20,3 +24,36 @@ def setup_db(graph):
2024
graph.schema.create_index(index[0], index[1])
2125
except ClientError:
2226
pass
27+
28+
29+
def unzip_file(zip_file_path, skip_existing=False):
    """
    Unzip a zip file next to itself. Return the path to the unzipped directory.

    The archive is extracted into a directory located beside the zip file and named
    after it without the file extension (e.g. ``/data/jhu.zip`` -> ``/data/jhu``).

    Note: An existing target directory is deleted and extracted again unless
    ``skip_existing`` is true.

    :param zip_file_path: Path to the zip file.
    :param skip_existing: Do not unzip if directory exists. Default is false, set to true for dev.
    :return: Path to unzipped directory.
    """
    # Imported locally so this function does not rely on a module-level logger.
    import logging
    logger = logging.getLogger(__name__)

    zip_file_directory = os.path.dirname(zip_file_path)
    zip_file_name = os.path.basename(zip_file_path)

    # Strip only the trailing extension; str.replace('.zip', '') would also
    # remove a '.zip' that appears in the middle of the file name.
    target_name = os.path.splitext(zip_file_name)[0]
    target_directory = os.path.join(zip_file_directory, target_name)

    if os.path.exists(target_directory):
        if skip_existing:
            logger.info("Unzipped directory exists, skip_existing is True, do not unzip again.")
            return target_directory

        # Remove stale content so the result mirrors the archive exactly.
        logger.debug("Target directory exists already {}".format(target_directory))
        logger.debug("Delete to unzip again.")
        shutil.rmtree(target_directory)

    logger.debug('Unzip {} to {}'.format(zip_file_path, target_directory))

    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(target_directory)

    return target_directory

‎covid_graph/load_to_neo4j.py ‎covid_graph/jhu.py

+35-59
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,43 @@
1+
import csv
12
import os
23
import logging
3-
import csv
4-
from uuid import uuid4
4+
5+
import requests
6+
from dateutil.parser import parse, ParserError
57
from graphio import NodeSet, RelationshipSet
6-
from dateutil.parser import parse
7-
from dateutil.parser import ParserError
88

99
log = logging.getLogger(__name__)
1010

11+
JHU_GITHUB_ARCHIVE_LINK = 'https://codeload.github.com/CSSEGISandData/COVID-19/zip/master'
12+
JHU_FILE_NAME = 'jhu_covid19.zip'
13+
14+
15+
def download_jhu(target_dir, skip_existing=False):
    """
    Download the data repository from JHU.

    https://github.com/CSSEGISandData/COVID-19

    The archive is streamed to a temporary ``.part`` file and only moved to its
    final name once the download finished, so an interrupted download never
    leaves a truncated zip file that a later ``skip_existing`` run would reuse.

    :param target_dir: Target directory where to store files.
    :param skip_existing: Do not download if file exists. Default is false, set to true for dev.
    :return: Path to downloaded file.
    :raises requests.HTTPError: If the server answers with an error status.
    """
    log.info('Download JHU data.')
    target_file = os.path.join(target_dir, JHU_FILE_NAME)

    if skip_existing and os.path.exists(target_file):
        log.info("File exists, skip_existing is True, do not download again.")
        return target_file

    log.info('Download to {}'.format(target_file))

    partial_file = target_file + '.part'
    with requests.get(JHU_GITHUB_ARCHIVE_LINK, allow_redirects=True, stream=True) as r:
        # Fail loudly instead of saving an HTML error page as the zip archive.
        r.raise_for_status()
        with open(partial_file, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)

    os.replace(partial_file, target_file)

    return target_file
40+
1141

1242
def read_daily_report_JHU(path_to_jhu, graph):
1343
"""
@@ -182,58 +212,4 @@ def parse_jhu_new_file_row(row):
182212
lat = row[5]
183213
long = row[6]
184214

185-
return country, province, date, confirmed, death, recovered, lat, long
186-
187-
188-
def load_wpp_data(base_path, graph):
189-
"""
190-
Load UN population data.
191-
192-
:param base_path: Path where file was downloaded.
193-
"""
194-
un_wpp_csv_file = os.path.join(base_path, 'WPP2019_PopulationByAgeSex_Medium.csv')
195-
log.info('Parse UN population data file: {}'.format(un_wpp_csv_file))
196-
197-
country = NodeSet(['Country'], ['name'])
198-
age_group_nodes = NodeSet(['AgeGroup'], ['group'])
199-
country_total_group = RelationshipSet('CURRENT_TOTAL', ['Country'], ['AgeGroup'], ['name'], ['group'])
200-
country_male_group = RelationshipSet('CURRENT_MALE', ['Country'], ['AgeGroup'], ['name'], ['group'])
201-
country_female_group = RelationshipSet('CURRENT_FEMALE', ['Country'], ['AgeGroup'], ['name'], ['group'])
202-
203-
countries_added = set()
204-
age_groups_added = set()
205-
206-
with open(un_wpp_csv_file, 'rt') as f:
207-
csv_file = csv.reader(f, delimiter=',', quotechar='"')
208-
# skip header
209-
next(csv_file)
210-
for row in csv_file:
211-
# LocID,Location,VarID,Variant,Time,MidPeriod,AgeGrp,AgeGrpStart,AgeGrpSpan,PopMale,PopFemale,PopTotal
212-
loc_id = row[0]
213-
location = row[1]
214-
time = int(row[4])
215-
age_group = row[6]
216-
age_group_start = int(row[7])
217-
age_group_span = row[8]
218-
pop_male = int(float((row[9])) * 1000)
219-
pop_female = int(float((row[10])) * 1000)
220-
pop_total = int(float((row[11])) * 1000)
221-
222-
# only take 2019
223-
if time == 2019:
224-
if location not in countries_added:
225-
country.add_node({'name': location, 'un_id': loc_id})
226-
countries_added.add(location)
227-
if age_group not in age_groups_added:
228-
age_group_nodes.add_node({'group': age_group, 'start': age_group_start, 'span': age_group_span})
229-
230-
country_total_group.add_relationship({'name': location}, {'group': age_group}, {'count': pop_total})
231-
country_male_group.add_relationship({'name': location}, {'group': age_group}, {'count': pop_male})
232-
country_female_group.add_relationship({'name': location}, {'group': age_group}, {'count': pop_female})
233-
234-
log.info('Load data to Neo4j')
235-
country.merge(graph)
236-
age_group_nodes.merge(graph)
237-
country_total_group.merge(graph)
238-
country_male_group.merge(graph)
239-
country_female_group.merge(graph)
215+
return country, province, date, confirmed, death, recovered, lat, long

‎covid_graph/unwpp.py

+95
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
import csv
2+
import os
3+
import logging
4+
import requests
5+
from graphio import NodeSet, RelationshipSet
6+
7+
8+
log = logging.getLogger(__name__)
9+
10+
WPP_AGE_CSV = 'https://population.un.org/wpp/Download/Files/1_Indicators%20(Standard)/CSV_FILES/WPP2019_PopulationByAgeSex_Medium.csv'
11+
WPP_FILENAME = 'WPP2019_PopulationByAgeSex_Medium.csv'
12+
13+
14+
def download_population_data(target_dir, skip_existing=False):
    """
    Download population data from the UN world population prospect.

    The UN gathers data on world population statistics and publishes the
    world population prospects: https://population.un.org/wpp/

    The latest data set in CSV format can be found here: https://population.un.org/wpp/Download/Standard/CSV/

    The file is streamed to a temporary ``.part`` file and only moved to its
    final name once the download finished, so an interrupted download never
    leaves a truncated CSV file that a later ``skip_existing`` run would reuse.

    :param target_dir: Target directory where to store files.
    :param skip_existing: Do not download if file exists. Default is false, set to true for dev.
    :return: Path to downloaded file.
    :raises requests.HTTPError: If the server answers with an error status.
    """
    log.info('Download UN WPP data')
    target_file = os.path.join(target_dir, WPP_FILENAME)

    if skip_existing and os.path.exists(target_file):
        log.info("File exists, skip_existing is True, do not download again.")
        return target_file

    log.info('Download to {}'.format(target_file))

    partial_file = target_file + '.part'
    with requests.get(WPP_AGE_CSV, allow_redirects=True, stream=True) as r:
        # Fail loudly instead of saving an HTML error page as the CSV file.
        r.raise_for_status()
        with open(partial_file, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)

    os.replace(partial_file, target_file)

    return target_file
42+
43+
44+
def load_wpp_data(base_path, graph):
    """
    Load UN population data.

    Reads the WPP CSV file found in ``base_path``, keeps only the rows for the
    year 2019 and merges Country nodes, AgeGroup nodes and the CURRENT_TOTAL,
    CURRENT_MALE and CURRENT_FEMALE relationships between them into the graph.

    :param base_path: Path where file was downloaded.
    :param graph: Graph object handed to the ``merge`` calls of the node and relationship sets.
    """
    un_wpp_csv_file = os.path.join(base_path, WPP_FILENAME)
    log.info('Parse UN population data file: {}'.format(un_wpp_csv_file))

    country = NodeSet(['Country'], ['name'])
    age_group_nodes = NodeSet(['AgeGroup'], ['group'])
    country_total_group = RelationshipSet('CURRENT_TOTAL', ['Country'], ['AgeGroup'], ['name'], ['group'])
    country_male_group = RelationshipSet('CURRENT_MALE', ['Country'], ['AgeGroup'], ['name'], ['group'])
    country_female_group = RelationshipSet('CURRENT_FEMALE', ['Country'], ['AgeGroup'], ['name'], ['group'])

    countries_added = set()
    age_groups_added = set()

    # newline='' is what the csv module expects for files passed to csv.reader.
    with open(un_wpp_csv_file, 'rt', newline='') as f:
        csv_file = csv.reader(f, delimiter=',', quotechar='"')
        # skip header
        next(csv_file)
        for row in csv_file:
            # LocID,Location,VarID,Variant,Time,MidPeriod,AgeGrp,AgeGrpStart,AgeGrpSpan,PopMale,PopFemale,PopTotal
            # only take 2019; skip other years before parsing the remaining columns
            if int(row[4]) != 2019:
                continue

            loc_id = row[0]
            location = row[1]
            age_group = row[6]
            age_group_start = int(row[7])
            age_group_span = row[8]
            # Counts are scaled by 1000 (presumably the file reports thousands - confirm against the WPP docs).
            pop_male = int(float(row[9]) * 1000)
            pop_female = int(float(row[10]) * 1000)
            pop_total = int(float(row[11]) * 1000)

            if location not in countries_added:
                country.add_node({'name': location, 'un_id': loc_id})
                countries_added.add(location)
            if age_group not in age_groups_added:
                age_group_nodes.add_node({'group': age_group, 'start': age_group_start, 'span': age_group_span})
                # Remember the group so it is added once, not once per country row.
                age_groups_added.add(age_group)

            country_total_group.add_relationship({'name': location}, {'group': age_group}, {'count': pop_total})
            country_male_group.add_relationship({'name': location}, {'group': age_group}, {'count': pop_male})
            country_female_group.add_relationship({'name': location}, {'group': age_group}, {'count': pop_female})

    log.info('Load data to Neo4j')
    country.merge(graph)
    age_group_nodes.merge(graph)
    country_total_group.merge(graph)
    country_male_group.merge(graph)
    country_female_group.merge(graph)

‎docker-compose.yml

+1
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,5 @@ services:
77
- GC_NEO4J_URL=bolt://host.docker.internal:7687
88
- GC_NEO4J_USER=neo4j
99
- GC_NEO4J_PASSWORD=test
10+
- RUN_MODE=test
1011
#command: python run.py

‎requirements.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,5 @@ py2neo
33
#git+https://github.com/technige/py2neo.git@v5#egg=py2neo
44
requests
55
python-dateutil
6-
graphio>=0.0.12
6+
graphio>=0.0.12
7+
pytest

‎run.py

+32-24
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
import os
2+
import sys
23
import logging
34
import py2neo
5+
import pytest
46

57
logging.basicConfig(level=logging.DEBUG)
68
logging.getLogger('py2neo.connect.bolt').setLevel(logging.WARNING)
@@ -11,41 +13,47 @@
1113
log = logging.getLogger(__name__)
1214

1315
# import and setup
14-
from covid_graph import download, load_to_neo4j, helper, post
16+
from covid_graph import helper, post, jhu, unwpp
1517

1618
ROOT_DIR = os.getenv('ROOT_DIR', '/download')
1719
GC_NEO4J_URL = os.getenv('GC_NEO4J_URL', 'bolt://localhost:7687')
1820
GC_NEO4J_USER = os.getenv('GC_NEO4J_USER', 'neo4j')
1921
GC_NEO4J_PASSWORD = os.getenv('GC_NEO4J_PASSWORD', 'test')
22+
RUN_MODE = os.getenv('RUN_MODE', 'test')
2023

21-
for v in [ROOT_DIR, GC_NEO4J_URL, GC_NEO4J_USER, GC_NEO4J_PASSWORD]:
22-
log.debug(v)
24+
if RUN_MODE.lower() == 'test':
    log.info("Run tests")
    # Propagate pytest's exit code so a failed sanity check fails the run
    # instead of always exiting with status 0.
    sys.exit(pytest.main())

else:
    # Log the connection settings, but keep the password out of the logs.
    for v in [ROOT_DIR, GC_NEO4J_URL, GC_NEO4J_USER]:
        log.debug(v)

    graph = py2neo.Graph(GC_NEO4J_URL, user=GC_NEO4J_USER, password=GC_NEO4J_PASSWORD)
    log.debug(graph)

    # Run a trivial query and log its result.
    result = list(graph.run("MATCH (a) RETURN a LIMIT 1"))
    log.debug(result)

    # setup DB
    helper.setup_db(graph)

    os.makedirs(ROOT_DIR, exist_ok=True)
    ###

    # download data
    # The submodules are imported by name (`from covid_graph import ...`), so
    # they are referenced directly; the bare name `covid_graph` is not bound here.
    jhu_zip_file = jhu.download_jhu(ROOT_DIR)
    jhu_dir = helper.unzip_file(jhu_zip_file)

    wpp_csv_file = unwpp.download_population_data(ROOT_DIR, skip_existing=True)
    ###

    # load to Neo4j
    jhu.read_daily_report_JHU(jhu_dir, graph)
    unwpp.load_wpp_data(ROOT_DIR, graph)
    ###

    # post process
    post.set_latest_update(graph)
    ###

‎tests/test_base.py

+18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
import pytest
2+
import requests
3+
4+
JHU_GITHUB_ARCHIVE_LINK = 'https://codeload.github.com/CSSEGISandData/COVID-19/zip/master'
5+
JHU_FILE_NAME = 'jhu_covid19.zip'
6+
WPP_AGE_CSV = 'https://population.un.org/wpp/Download/Files/1_Indicators%20(Standard)/CSV_FILES/WPP2019_PopulationByAgeSex_Medium.csv'
7+
8+
@pytest.mark.runtest
def test_jhu_available():
    """Check that the JHU GitHub archive link answers a HEAD request with HTTP 200."""
    # The timeout keeps this sanity check from hanging on an unresponsive host.
    r = requests.head(JHU_GITHUB_ARCHIVE_LINK, allow_redirects=True, timeout=30)

    assert r.status_code == 200
13+
14+
15+
def test_wpp_available():
    """Check that the UN WPP CSV link answers a HEAD request with HTTP 200."""
    # The timeout keeps this sanity check from hanging on an unresponsive host.
    r = requests.head(WPP_AGE_CSV, allow_redirects=True, timeout=30)

    assert r.status_code == 200

0 commit comments

Comments
 (0)
Please sign in to comment.