
Commit 003b93e

upload fgb for previously ingested vector data (#83)
* fix: downgrade ubuntu to 22 for tippecanoe build
* adding script to upload fgb for past data...
* fix: implemented upload data script
* add readme
* overwrite=True for upload blob
1 parent 2e1abdb commit 003b93e

6 files changed: +284 −14

.gitignore (+2 −1)

@@ -6,4 +6,5 @@ Pipfile.lock
 *sample.dat
 test.tif
 *.pyc
-deployment/test
+deployment/test
+data/

dockerfiles/docker-compose_cli.yaml (+9 −6)

@@ -2,13 +2,16 @@ version: '3'
 services:
   geohub-data-pipeline:
     #user: 1000:1000
-    build: ./
-    command: "python -m ingest.cli.main"
-    # env_file:
-    #   - ../.env
+    build:
+      context: ..
+      dockerfile: ./Dockerfile
+    # command: "python -m ingest.cli.main"
+    command: "python -m ingest.fgb_upload.main -dst /data"
+    env_file:
+      - ../.env
     # environment:
     #   - user=1000
-    # volumes:
-    #   - /home/janf/Downloads/data:/data
+    volumes:
+      - ../data:/data
       # - /etc/passwd:/etc/passwd:ro
       # - /etc/group:/etc/group:ro

ingest/fgb_upload/README.md (+19)

@@ -0,0 +1,19 @@
+# fgb upload script
+
+This script scans all users, or a specific user's folder, in Azure Blob Storage for existing vector PMTiles datasets, then converts the original vector data to FlatGeobuf format and uploads the result.
+
+## Usage
+
+The easiest way to run this CLI tool is with docker compose.
+
+```shell
+docker compose -f dockerfiles/docker-compose_cli.yaml build
+
+# process all users
+docker compose -f dockerfiles/docker-compose_cli.yaml run geohub-data-pipeline python -m ingest.fgb_upload.main -dst /data
+
+# process a specific user
+docker compose -f dockerfiles/docker-compose_cli.yaml run geohub-data-pipeline python -m ingest.fgb_upload.main -u {user-email} -dst /data
+```
+
+If `-u {user-email}` is not specified, all users are scanned. If an fgb has already been uploaded by the data pipeline, the dataset is skipped.
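
For reference, the `{user-email}` passed with `-u` is mapped to the user's folder in blob storage by MD5-hashing the address (see `generate_userid` in `ingest/fgb_upload/main.py`). A minimal sketch of that mapping; the helper name and email below are illustrative only:

```python
import hashlib


def email_to_user_folder(email: str) -> str:
    # mirrors generate_userid(): the user's folder name in blob storage
    # is the MD5 hex digest of the email address
    return hashlib.md5(email.encode()).hexdigest()


# prints the 32-character hex folder name the script scans for this user
print(email_to_user_folder("user@example.org"))
```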

ingest/fgb_upload/__init__.py

Whitespace-only changes.

ingest/fgb_upload/main.py (+239)

@@ -0,0 +1,239 @@
+import asyncio
+import multiprocessing
+import argparse
+import logging
+import sys
+import os
+import hashlib
+from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient, generate_blob_sas, BlobSasPermissions
+from osgeo import gdal
+import aiohttp
+from pmtiles.reader import Reader, MemorySource
+from datetime import datetime, timedelta
+from ingest.azblob import upload_blob
+from ingest.processing import dataset2fgb
+from ingest.utils import prepare_arch_path
+
+logging.basicConfig()
+logger = logging.getLogger()
+sthandler = logging.StreamHandler()
+sthandler.setFormatter(
+    logging.Formatter('%(asctime)s-%(filename)s:%(funcName)s:%(lineno)d:%(levelname)s:%(message)s',
+                      "%Y-%m-%d %H:%M:%S"))
+logger.handlers.clear()
+logger.addHandler(sthandler)
+logger.name = __name__
+logger.setLevel(logging.INFO)
+
+logging.getLogger('azure').setLevel(logging.WARNING)
+
+
+AZURE_STORAGE_CONNECTION_STRING = os.getenv("AZURE_STORAGE_CONNECTION_STRING")
+
+# docker run --rm -it -v .:/data -v ./ingest:/usr/src/app/ingest dockerfiles-app python ./ingest/cli/upload_fgb.py -h
+
+
+def generate_userid(user_email):
+    """Return the user folder name: the MD5 hex digest of the email address."""
+    if user_email:
+        m = hashlib.md5(user_email.encode())
+        return m.hexdigest()
+    else:
+        return
+
+
+def get_blob_container(container_name):
+    blob_service_client = BlobServiceClient.from_connection_string(
+        conn_str=AZURE_STORAGE_CONNECTION_STRING
+    )
+    container_client = blob_service_client.get_container_client(
+        container=container_name
+    )
+    return container_client
+
+
+async def get_layer_names(file_path):
+    """Read PMTiles metadata over HTTP and return the vector layer ids."""
+    async with aiohttp.ClientSession() as session:
+        async with session.get(file_path) as response:
+            if response.status != 200:
+                raise Exception(f"Failed to retrieve PMTiles from {file_path}, status code: {response.status}")
+
+            data = await response.read()
+            source = MemorySource(data)
+            reader = Reader(source)
+            metadata = reader.metadata()
+            vector_layers = metadata.get("vector_layers", [])
+            layer_names = [layer.get("id") for layer in vector_layers if "id" in layer]
+            return layer_names
+
+
+def generate_sas_url(container_client, blob_name):
+    """Generate a read-only SAS token, valid for one hour, for the given blob."""
+    parts = dict(item.split("=", 1) for item in AZURE_STORAGE_CONNECTION_STRING.split(";") if "=" in item)
+    account_name = parts.get("AccountName")
+    account_key = parts.get("AccountKey")
+
+    container_name = container_client.container_name
+
+    sas_token = generate_blob_sas(
+        account_name=account_name,
+        container_name=container_name,
+        blob_name=blob_name,
+        account_key=account_key,
+        permission=BlobSasPermissions(read=True),
+        expiry=datetime.utcnow() + timedelta(hours=1)
+    )
+    return sas_token
+
+
+def download_blob(container_client, blob_name, download_path: str):
+    """Download a blob to a local file."""
+    blob_client = container_client.get_blob_client(blob_name)
+
+    logger.info(f"Downloading {blob_client.blob_name} to {download_path}")
+
+    download_dir = os.path.dirname(download_path)
+    if not os.path.exists(download_dir):
+        os.makedirs(download_dir)
+
+    with open(download_path, "wb") as f:
+        stream = blob_client.download_blob()
+        for chunk in stream.chunks():
+            f.write(chunk)
+    logger.info(f"Downloaded {blob_client.blob_name} to {download_path}")
+
+
+async def ingest_user_folder(user_id: str, container_client: ContainerClient, dist_dir: str, timeout_event: multiprocessing.Event = None):
+    # find pmtiles files in the datasets folder
+    for blob in container_client.list_blobs(name_starts_with=f"{user_id}/datasets"):
+        if blob.name.split(".")[-1] != 'pmtiles':
+            continue
+        pmtiles_path = blob.name
+
+        sas_url = generate_sas_url(container_client, pmtiles_path)
+        pmtiles_url = f"{container_client.url}/{pmtiles_path}?{sas_url}"
+
+        layers = await get_layer_names(pmtiles_url)
+        layer_count = len(layers)
+        if layer_count == 0:
+            continue
+        else:
+            # check if fgb has already been uploaded
+            fgb_blob_list = [blob for blob in container_client.list_blobs(name_starts_with=pmtiles_path) if
+                             blob.name.split(".")[-1] == "fgb"]
+            if len(fgb_blob_list) > 0:
+                logger.debug(f"{pmtiles_path} already has an fgb uploaded. Skipping this dataset.")
+                continue
+
+            parts = pmtiles_path.split('/')
+
+            join_vector_tiles = layer_count == 1
+            raw_blob_name = f"{user_id}/raw/{parts[2]}"
+            raw_file = f"{container_client.url}/{raw_blob_name}"
+            raw_file_path = os.path.join(dist_dir, f"{raw_blob_name}")
+
+            blob_list = [blob for blob in container_client.list_blobs(name_starts_with=raw_blob_name) if blob.name == raw_blob_name]
+
+            if not blob_list:
+                continue
+            blob_name = blob_list[0]
+
+            download_blob(container_client, blob_name, raw_file_path)
+            src_file_path = prepare_arch_path(src_path=raw_file_path)
+            try:
+                vdataset = gdal.OpenEx(src_file_path, gdal.OF_VECTOR)
+            except RuntimeError as ioe:
+                if 'supported' in str(ioe):
+                    vdataset = None
+                else:
+                    raise
+            if vdataset is not None:
+                logger.info(f'Opened {raw_file} with {vdataset.GetDriver().ShortName} vector driver')
+                nvector_layers = vdataset.GetLayerCount()
+                layer_names = [vdataset.GetLayerByIndex(i).GetName() for i in range(nvector_layers)]
+                fgb_dir = os.path.join(dist_dir, raw_blob_name.replace("/raw/", "/datasets/"))
+                if not os.path.exists(fgb_dir):
+                    os.makedirs(fgb_dir)
+
+                if nvector_layers > 0:
+                    if not join_vector_tiles:
+                        # multiple layers: convert and upload each layer as its own fgb
+                        for layer_name in layer_names:
+                            fgb_layers = dataset2fgb(fgb_dir=fgb_dir,
+                                                     src_ds=vdataset,
+                                                     layers=[layer_name],
+                                                     timeout_event=timeout_event,
+                                                     conn_string=AZURE_STORAGE_CONNECTION_STRING,
+                                                     blob_url=raw_file,
+                                                     silent_mode=True)
+
+                            if fgb_layers:
+                                for layer_name in fgb_layers:
+                                    fgb_layer_path = fgb_layers[layer_name]
+                                    upload_blob(src_path=fgb_layer_path, connection_string=AZURE_STORAGE_CONNECTION_STRING,
+                                                container_name=container_client.container_name,
+                                                dst_blob_path=f"{pmtiles_path}.{layer_name}.fgb",
+                                                overwrite=True)
+                    else:
+                        # single layer: convert and upload as one fgb
+                        fgb_layers = dataset2fgb(fgb_dir=fgb_dir,
+                                                 src_ds=vdataset,
+                                                 layers=layer_names,
+                                                 timeout_event=timeout_event,
+                                                 conn_string=AZURE_STORAGE_CONNECTION_STRING,
+                                                 blob_url=raw_file,
+                                                 silent_mode=True)
+                        if fgb_layers:
+                            for layer_name in fgb_layers:
+                                fgb_layer_path = fgb_layers[layer_name]
+                                logger.info(f"Uploading {fgb_layer_path} to {pmtiles_path}.fgb")
+                                upload_blob(src_path=fgb_layer_path, connection_string=AZURE_STORAGE_CONNECTION_STRING,
+                                            container_name=container_client.container_name,
+                                            dst_blob_path=f"{pmtiles_path}.fgb",
+                                            overwrite=True)
+
+
+async def main():
+    parser = argparse.ArgumentParser(
+        description='Convert previous vector data to flatgeobuf and upload them to blob storage',
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('-u', '--user',
+                        help='User email address to process. If not specified, process all users',
+                        type=str)
+    parser.add_argument('-c', '--container',
+                        help='Target container name of blob storage',
+                        type=str, default="userdata")
+    parser.add_argument('-dst', '--destination-directory',
+                        help='A full absolute path to a folder where the files will be written.',
+                        type=str)
+    parser.add_argument('-d', '--debug', action='store_true',
+                        help='Set log level to debug', default=False)
+
+    args = parser.parse_args(args=None if sys.argv[1:] else ['--help'])
+    if args.debug:
+        logger.setLevel(logging.DEBUG)
+    timeout_event = multiprocessing.Event()
+
+    dist_dir = args.destination_directory
+    if not os.path.exists(dist_dir):
+        os.mkdir(dist_dir)
+
+    container_client = get_blob_container(args.container)
+    if not args.user:
+        user_ids = list(
+            set([blob.name.split("/")[0] for blob in container_client.list_blobs() if
+                 blob.name.split("/")[0] != "test"])
+        )
+        for user_id in user_ids:
+            logger.info(f"Processing user: {user_id}")
+            await ingest_user_folder(user_id, container_client, dist_dir, timeout_event=timeout_event)
+    else:
+        user_id = generate_userid(args.user)
+        logger.info(f"Processing user: {user_id}")
+        await ingest_user_folder(user_id, container_client, dist_dir, timeout_event=timeout_event)
+
+
+if __name__ == '__main__':
+    asyncio.run(main())

ingest/processing.py (+15 −7)

@@ -100,7 +100,8 @@ def dataset2fgb(fgb_dir: str = None,
                 dst_prj_epsg: int = 4326,
                 conn_string: str = None,
                 blob_url: str = None,
-                timeout_event=None):
+                timeout_event=None,
+                silent_mode=False):
     """
     Convert one or more layers from src_ds into FlatGeobuf format in a (temporary) directory featuring dst_prj_epsg
     projection. The layer is possibly reprojected. In case errors are encountered an error blob is uploaded for now
@@ -112,6 +113,7 @@ def dataset2fgb(fgb_dir: str = None,
     @param conn_string: the connection string used to connect to the Azure storage account
     @param blob_url: the url of the blob to be ingested
     @param timeout_event:
+    @param silent_mode: if True, skip uploading the error file
     @return:
     """
     dst_srs = osr.SpatialReference()
@@ -168,9 +170,12 @@ def dataset2fgb(fgb_dir: str = None,
             error_blob_path = f'{"/".join(rest)}/{blob_name}.error'
             logger.info(f'Uploading error message to {error_blob_path}')
             error_message = f'There could be issues with layer "{lname}".\nOriginal number of features/geometries ={original_features} while converted={converted_features}'
-            upload_content_to_blob(content=error_message, connection_string=conn_string,
-                                   container_name=container_name,
-                                   dst_blob_path=error_blob_path)
+            if silent_mode:
+                logger.info("skipped uploading error file")
+            else:
+                upload_content_to_blob(content=error_message, connection_string=conn_string,
+                                       container_name=container_name,
+                                       dst_blob_path=error_blob_path)


@@ -194,9 +199,12 @@ def dataset2fgb(fgb_dir: str = None,
             container_name, *rest, blob_name = blob_name.split("/")
             error_blob_path = f'{"/".join(rest)}/{blob_name}.error'
             logger.info(f'Uploading error message to {error_blob_path}')
-            upload_content_to_blob(content=error_message, connection_string=conn_string,
-                                   container_name=container_name,
-                                   dst_blob_path=error_blob_path)
+            if silent_mode:
+                logger.info("skipped uploading error file")
+            else:
+                upload_content_to_blob(content=error_message, connection_string=conn_string,
+                                       container_name=container_name,
+                                       dst_blob_path=error_blob_path)

     return converted_layers
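
The new `silent_mode` flag only changes error reporting: conversion runs as before, but the `.error` blob is not written back to the container. A minimal usage sketch mirroring the call in `ingest/fgb_upload/main.py`; argument values here are illustrative placeholders, not the actual paths used by the pipeline:

```python
# Sketch only: assumes an open GDAL vector dataset `vdataset`, the source blob
# URL `raw_file`, and a valid AZURE_STORAGE_CONNECTION_STRING are already defined.
fgb_layers = dataset2fgb(fgb_dir="/data/<user-id>/datasets/example",  # local output dir (placeholder)
                         src_ds=vdataset,
                         layers=["layer_1"],
                         conn_string=AZURE_STORAGE_CONNECTION_STRING,
                         blob_url=raw_file,
                         silent_mode=True)  # log and skip the ".error" upload
```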
