Skip to content

Commit 67e3ea6

Browse files
authored
Parse GeoParquet metadata (#407)
Closes #406. This should allow a two-liner for visualization, using just:
```py
table = pyarrow.parquet.read_table('file.parquet')
lonboard.viz(table)
```
1 parent 7c948a7 commit 67e3ea6

File tree

2 files changed

+56
-3
lines changed

2 files changed

+56
-3
lines changed

lonboard/_geoarrow/parse_wkb.py

+44-3
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,15 @@
1-
"""Handle GeoArrow tables with WKB-encoded geometry
2-
"""
1+
"""Handle GeoArrow tables with WKB-encoded geometry"""
2+
3+
import json
34
from typing import Tuple
45

56
import pyarrow as pa
67
import shapely
78

8-
from lonboard._constants import EXTENSION_NAME
9+
from lonboard._constants import EXTENSION_NAME, OGC_84
910
from lonboard._geoarrow.crs import get_field_crs
1011
from lonboard._geoarrow.extension_types import construct_geometry_array
12+
from lonboard._utils import get_geometry_column_index
1113

1214

1315
def parse_wkb_table(table: pa.Table) -> pa.Table:
@@ -16,6 +18,8 @@ def parse_wkb_table(table: pa.Table) -> pa.Table:
1618
If no columns are WKB-encoded, returns the input. Note that WKB columns must be
1719
tagged with an extension name of `geoarrow.wkb` or `ogc.wkb`
1820
"""
21+
table = parse_geoparquet_table(table)
22+
1923
wkb_names = {EXTENSION_NAME.WKB, EXTENSION_NAME.OGC_WKB}
2024
for field_idx in range(len(table.schema)):
2125
field = table.field(field_idx)
@@ -32,6 +36,43 @@ def parse_wkb_table(table: pa.Table) -> pa.Table:
3236
return table
3337

3438

39+
def parse_geoparquet_table(table: pa.Table) -> pa.Table:
    """Parse GeoParquet table metadata, assigning it to GeoArrow metadata.

    Reads the JSON stored under the ``b"geo"`` key of the table's schema
    metadata and, when the primary geometry column is declared as WKB-encoded,
    replaces that column's field metadata with the GeoArrow WKB extension name
    and a JSON-encoded ``{"crs": ...}`` extension metadata.

    The table is returned unchanged when:

    - a column already carries GeoArrow metadata,
    - there is no ``b"geo"`` schema metadata,
    - the ``b"geo"`` value cannot be decoded as JSON, or does not have the
      expected structure (a JSON object with a string ``primary_column`` and a
      ``columns`` object containing an entry for that column),
    - the primary column's encoding is not ``"WKB"``.

    Args:
        table: The table to inspect.

    Returns:
        Either the input table, or a new table whose primary geometry column
        is tagged with GeoArrow WKB extension metadata.

    Raises:
        ValueError: If the table does not contain exactly one column named
            after the declared primary column.
    """
    # If a column already has geoarrow metadata, don't parse from GeoParquet metadata
    if get_geometry_column_index(table.schema) is not None:
        return table

    schema_metadata = table.schema.metadata or {}
    raw_geo_metadata = schema_metadata.get(b"geo")
    if not raw_geo_metadata:
        return table

    try:
        geo_metadata = json.loads(raw_geo_metadata)
    except (json.JSONDecodeError, UnicodeDecodeError):
        # Not valid JSON (or not decodable text): treat as "no GeoParquet metadata".
        return table

    # Structurally invalid metadata is ignored in the same way as unparseable
    # JSON, rather than surfacing as a KeyError/TypeError.
    if not isinstance(geo_metadata, dict):
        return table

    primary_column = geo_metadata.get("primary_column")
    columns_meta = geo_metadata.get("columns")
    if not isinstance(primary_column, str) or not isinstance(columns_meta, dict):
        return table

    column_meta = columns_meta.get(primary_column)
    if not isinstance(column_meta, dict):
        return table

    matching_indices = [
        idx for idx, name in enumerate(table.column_names) if name == primary_column
    ]
    # Explicit raise instead of `assert`, so the check survives `python -O`.
    if len(matching_indices) != 1:
        raise ValueError(f"Expected one column with name {primary_column}")
    column_idx = matching_indices[0]

    if column_meta.get("encoding") != "WKB":
        return table

    existing_field = table.schema.field(column_idx)
    existing_column = table.column(column_idx)
    # Fall back to OGC_84 only when the "crs" key is absent; an explicit
    # value (including null) is passed through as-is.
    crs_metadata = {"crs": column_meta.get("crs", OGC_84.to_json_dict())}
    metadata = {
        b"ARROW:extension:name": EXTENSION_NAME.WKB,
        b"ARROW:extension:metadata": json.dumps(crs_metadata),
    }
    new_field = existing_field.with_metadata(metadata)
    return table.set_column(column_idx, new_field, existing_column)
74+
75+
3576
def parse_wkb_column(
3677
field: pa.Field, column: pa.ChunkedArray
3778
) -> Tuple[pa.Field, pa.ChunkedArray]:

tests/test_geoarrow.py

+12
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
import json
2+
from tempfile import NamedTemporaryFile
23

34
import geodatasets
45
import geopandas as gpd
6+
import pyarrow.parquet as pq
57
from pyproj import CRS
68

79
from lonboard import SolidPolygonLayer
@@ -61,3 +63,13 @@ def test_reproject_sliced_array():
6163
sliced_table = table.slice(2)
6264
# This should work even with a sliced array.
6365
_reprojected = reproject_table(sliced_table, to_crs=OGC_84)
66+
67+
68+
def test_geoparquet_metadata():
    """Round-trip a GeoDataFrame through a Parquet file and build a layer from it.

    The table is read back with plain ``pyarrow.parquet.read_table`` and passed
    directly to ``SolidPolygonLayer``. No assertions follow; the test passes as
    long as constructing the layer does not raise.
    """
    nybb_frame = gpd.read_file(geodatasets.get_path("nybb"))

    with NamedTemporaryFile("+wb", suffix=".parquet") as parquet_file:
        nybb_frame.to_parquet(parquet_file)
        table = pq.read_table(parquet_file)

    _layer = SolidPolygonLayer(table=table)

0 commit comments

Comments
 (0)