Skip to content

Commit 655e55d

Browse files
authored
geoarrow-based multithreaded coordinate reprojection (#337)
### Change list - Include PROJJSON CRS on geometry array metadata when converting from GeoPandas - Move CRS checking and reprojection into GeoArrow level instead of GeoPandas level ### Example Low level example and benchmark. 4.6x reprojection speedup across 8 threads! ```py import geodatasets import pyarrow as pa import geopandas as gpd import numpy as np from lonboard._geoarrow.geopandas_interop import geopandas_to_geoarrow from lonboard._geoarrow.ops import reproject path = geodatasets.get_path('nybb') gdf = gpd.read_file(path) table = geopandas_to_geoarrow(gdf) field = table.schema.field(4) column = table['geometry'] chunk = column.chunk(0) new_chunked_array = pa.chunked_array([chunk] * 100) %timeit out = reproject(field, new_chunked_array, max_workers=8) # 693 ms ± 85.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) %timeit out = reproject(field, new_chunked_array, max_workers=1) # 3.22 s ± 22 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) ```
1 parent f5544e4 commit 655e55d

File tree

9 files changed

+388
-24
lines changed

9 files changed

+388
-24
lines changed

lonboard/_geoarrow/extension_types.py

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1+
import json
12
from enum import Enum
2-
from typing import Optional
3+
from typing import Dict, Optional, Tuple
34

45
import numpy as np
56
import pyarrow as pa
@@ -279,8 +280,11 @@ def __arrow_ext_deserialize__(cls, storage_type: pa.DataType, serialized: bytes)
279280

280281

281282
def construct_geometry_array(
282-
shapely_arr: NDArray[np.object_], include_z: Optional[bool] = None
283-
):
283+
shapely_arr: NDArray[np.object_],
284+
include_z: Optional[bool] = None,
285+
*,
286+
crs_str: Optional[str] = None,
287+
) -> Tuple[pa.Field, pa.Array]:
284288
# NOTE: this implementation returns a (field, array) pair so that it can set the
285289
# extension metadata on the field without instantiating extension types into the
286290
# global pyarrow registry
@@ -295,13 +299,18 @@ def construct_geometry_array(
295299
else:
296300
raise ValueError(f"Unexpected coords dimensions: {coords.shape}")
297301

302+
extension_metadata: Dict[str, str] = {}
303+
if crs_str is not None:
304+
extension_metadata["ARROW:extension:metadata"] = json.dumps({"crs": crs_str})
305+
298306
if geom_type == GeometryType.POINT:
299307
parr = pa.FixedSizeListArray.from_arrays(coords.flatten(), len(dims))
308+
extension_metadata["ARROW:extension:name"] = "geoarrow.point"
300309
field = pa.field(
301310
"geometry",
302311
parr.type,
303312
nullable=True,
304-
metadata={"ARROW:extension:name": "geoarrow.point"},
313+
metadata=extension_metadata,
305314
)
306315
return field, parr
307316

@@ -310,11 +319,12 @@ def construct_geometry_array(
310319
(geom_offsets,) = offsets
311320
_parr = pa.FixedSizeListArray.from_arrays(coords.flatten(), len(dims))
312321
parr = pa.ListArray.from_arrays(pa.array(geom_offsets), _parr)
322+
extension_metadata["ARROW:extension:name"] = "geoarrow.linestring"
313323
field = pa.field(
314324
"geometry",
315325
parr.type,
316326
nullable=True,
317-
metadata={"ARROW:extension:name": "geoarrow.linestring"},
327+
metadata=extension_metadata,
318328
)
319329
return field, parr
320330

@@ -324,11 +334,12 @@ def construct_geometry_array(
324334
_parr = pa.FixedSizeListArray.from_arrays(coords.flatten(), len(dims))
325335
_parr1 = pa.ListArray.from_arrays(pa.array(ring_offsets), _parr)
326336
parr = pa.ListArray.from_arrays(pa.array(geom_offsets), _parr1)
337+
extension_metadata["ARROW:extension:name"] = "geoarrow.polygon"
327338
field = pa.field(
328339
"geometry",
329340
parr.type,
330341
nullable=True,
331-
metadata={"ARROW:extension:name": "geoarrow.polygon"},
342+
metadata=extension_metadata,
332343
)
333344
return field, parr
334345

@@ -337,11 +348,12 @@ def construct_geometry_array(
337348
(geom_offsets,) = offsets
338349
_parr = pa.FixedSizeListArray.from_arrays(coords.flatten(), len(dims))
339350
parr = pa.ListArray.from_arrays(pa.array(geom_offsets), _parr)
351+
extension_metadata["ARROW:extension:name"] = "geoarrow.multipoint"
340352
field = pa.field(
341353
"geometry",
342354
parr.type,
343355
nullable=True,
344-
metadata={"ARROW:extension:name": "geoarrow.multipoint"},
356+
metadata=extension_metadata,
345357
)
346358
return field, parr
347359

@@ -351,11 +363,12 @@ def construct_geometry_array(
351363
_parr = pa.FixedSizeListArray.from_arrays(coords.flatten(), len(dims))
352364
_parr1 = pa.ListArray.from_arrays(pa.array(ring_offsets), _parr)
353365
parr = pa.ListArray.from_arrays(pa.array(geom_offsets), _parr1)
366+
extension_metadata["ARROW:extension:name"] = "geoarrow.multilinestring"
354367
field = pa.field(
355368
"geometry",
356369
parr.type,
357370
nullable=True,
358-
metadata={"ARROW:extension:name": "geoarrow.multilinestring"},
371+
metadata=extension_metadata,
359372
)
360373
return field, parr
361374

@@ -366,11 +379,12 @@ def construct_geometry_array(
366379
_parr1 = pa.ListArray.from_arrays(pa.array(ring_offsets), _parr)
367380
_parr2 = pa.ListArray.from_arrays(pa.array(polygon_offsets), _parr1)
368381
parr = pa.ListArray.from_arrays(pa.array(geom_offsets), _parr2)
382+
extension_metadata["ARROW:extension:name"] = "geoarrow.multipolygon"
369383
field = pa.field(
370384
"geometry",
371385
parr.type,
372386
nullable=True,
373-
metadata={"ARROW:extension:name": "geoarrow.multipolygon"},
387+
metadata=extension_metadata,
374388
)
375389
return field, parr
376390

lonboard/_geoarrow/geopandas_interop.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,5 +18,9 @@ def geopandas_to_geoarrow(
1818
df_attr = df_attr[columns]
1919

2020
table = pa.Table.from_pandas(df_attr, preserve_index=preserve_index)
21-
field, geom_arr = construct_geometry_array(np.array(gdf.geometry))
21+
field, geom_arr = construct_geometry_array(
22+
np.array(gdf.geometry),
23+
crs_str=gdf.crs.to_json() if gdf.crs is not None else None,
24+
)
25+
2226
return table.append_column(field, geom_arr)

lonboard/_geoarrow/ops/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,4 @@
33

44
from .bbox import total_bounds
55
from .centroid import weighted_centroid
6+
from .reproject import reproject_column, reproject_table

lonboard/_geoarrow/ops/reproject.py

Lines changed: 211 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,211 @@
1+
"""Reproject a GeoArrow array
2+
"""
3+
import json
4+
import warnings
5+
from concurrent.futures import ThreadPoolExecutor
6+
from functools import lru_cache, partial
7+
from typing import Callable, Optional, Tuple, Union
8+
9+
import numpy as np
10+
import pyarrow as pa
11+
from pyproj import CRS, Transformer
12+
13+
from lonboard._constants import EXTENSION_NAME, OGC_84
14+
from lonboard._geoarrow.extension_types import CoordinateDimension
15+
from lonboard._utils import get_geometry_column_index
16+
17+
TransformerFromCRS = lru_cache(Transformer.from_crs)
18+
19+
20+
def reproject_table(
21+
table: pa.Table,
22+
*,
23+
to_crs: Union[str, CRS] = OGC_84,
24+
max_workers: Optional[int] = None,
25+
) -> pa.Table:
26+
"""Reproject a GeoArrow table to a new CRS
27+
28+
Args:
29+
table: The table to reproject.
30+
to_crs: The target CRS. Defaults to OGC_84.
31+
max_workers: The maximum number of threads to use. Defaults to None.
32+
33+
Returns:
34+
A new table.
35+
"""
36+
geom_col_idx = get_geometry_column_index(table.schema)
37+
# No geometry column in table
38+
if geom_col_idx is None:
39+
return table
40+
41+
geom_field = table.schema.field(geom_col_idx)
42+
geom_column = table.column(geom_col_idx)
43+
44+
# geometry column exists in table but is not assigned a CRS
45+
if b"ARROW:extension:metadata" not in geom_field.metadata:
46+
return table
47+
48+
new_field, new_column = reproject_column(
49+
field=geom_field, column=geom_column, to_crs=to_crs, max_workers=max_workers
50+
)
51+
return table.set_column(geom_col_idx, new_field, new_column)
52+
53+
54+
def reproject_column(
55+
*,
56+
field: pa.Field,
57+
column: pa.ChunkedArray,
58+
to_crs: Union[str, CRS] = OGC_84,
59+
max_workers: Optional[int] = None,
60+
) -> Tuple[pa.Field, pa.ChunkedArray]:
61+
"""Reproject a GeoArrow array to a new CRS
62+
63+
Args:
64+
field: The field describing the column
65+
column: A ChunkedArray
66+
to_crs: The target CRS. Defaults to OGC_84.
67+
max_workers: The maximum number of threads to use. Defaults to None.
68+
"""
69+
extension_type_name = field.metadata[b"ARROW:extension:name"]
70+
extension_metadata = json.loads(field.metadata[b"ARROW:extension:metadata"])
71+
crs_str = extension_metadata["crs"]
72+
existing_crs = CRS(crs_str)
73+
74+
if existing_crs == to_crs:
75+
return field, column
76+
77+
# NOTE: Not sure the best place to put this warning
78+
warnings.warn("Input being reprojected to EPSG:4326 CRS")
79+
80+
transformer = TransformerFromCRS(existing_crs, to_crs, always_xy=True)
81+
82+
# Metadata inside metadata, bad naming
83+
new_extension_meta_meta = {"crs": CRS(to_crs).to_json()}
84+
new_extension_metadata = {
85+
b"ARROW:extension:name": extension_type_name,
86+
b"ARROW:extension:metadata": json.dumps(new_extension_meta_meta),
87+
}
88+
89+
new_chunked_array = _reproject_column(
90+
column,
91+
extension_type_name=extension_type_name,
92+
transformer=transformer,
93+
max_workers=max_workers,
94+
)
95+
return field.with_metadata(new_extension_metadata), new_chunked_array
96+
97+
98+
def _reproject_column(
99+
column: pa.ChunkedArray,
100+
*,
101+
extension_type_name: EXTENSION_NAME,
102+
transformer: Transformer,
103+
max_workers: Optional[int] = None,
104+
) -> pa.ChunkedArray:
105+
if extension_type_name == EXTENSION_NAME.POINT:
106+
func = partial(_reproject_chunk_nest_0, transformer=transformer)
107+
elif extension_type_name in [EXTENSION_NAME.LINESTRING, EXTENSION_NAME.MULTIPOINT]:
108+
func = partial(_reproject_chunk_nest_1, transformer=transformer)
109+
elif extension_type_name in [
110+
EXTENSION_NAME.POLYGON,
111+
EXTENSION_NAME.MULTILINESTRING,
112+
]:
113+
func = partial(_reproject_chunk_nest_2, transformer=transformer)
114+
115+
elif extension_type_name == EXTENSION_NAME.MULTIPOLYGON:
116+
func = partial(_reproject_chunk_nest_3, transformer=transformer)
117+
else:
118+
raise ValueError(f"Unexpected extension type name {extension_type_name}")
119+
120+
with ThreadPoolExecutor(max_workers=max_workers) as executor:
121+
return pa.chunked_array(executor.map(func, column.chunks))
122+
123+
124+
def _reproject_coords(arr: pa.FixedSizeListArray, transformer: Transformer):
125+
list_size = arr.type.list_size
126+
np_arr = arr.flatten().to_numpy().reshape(-1, list_size)
127+
128+
if list_size == 2:
129+
output_np_arr = np.column_stack(
130+
transformer.transform(np_arr[:, 0], np_arr[:, 1])
131+
)
132+
dims = CoordinateDimension.XY
133+
elif list_size == 3:
134+
output_np_arr = np.column_stack(
135+
transformer.transform(np_arr[:, 0], np_arr[:, 1], np_arr[:, 2])
136+
)
137+
dims = CoordinateDimension.XYZ
138+
else:
139+
raise ValueError(f"Unexpected list size {list_size}")
140+
141+
coord_field = pa.list_(pa.field(dims, pa.float64()), len(dims))
142+
return pa.FixedSizeListArray.from_arrays(
143+
output_np_arr.flatten("C"), type=coord_field
144+
)
145+
146+
147+
def _reproject_chunk_nest_0(arr: pa.ListArray, transformer: Transformer):
148+
callback = partial(_reproject_coords, transformer=transformer)
149+
return _map_coords_nest_0(arr, callback)
150+
151+
152+
def _reproject_chunk_nest_1(arr: pa.ListArray, transformer: Transformer):
153+
callback = partial(_reproject_coords, transformer=transformer)
154+
return _map_coords_nest_1(arr, callback)
155+
156+
157+
def _reproject_chunk_nest_2(arr: pa.ListArray, transformer: Transformer):
158+
callback = partial(_reproject_coords, transformer=transformer)
159+
return _map_coords_nest_2(arr, callback)
160+
161+
162+
def _reproject_chunk_nest_3(arr: pa.ListArray, transformer: Transformer):
163+
callback = partial(_reproject_coords, transformer=transformer)
164+
return _map_coords_nest_3(arr, callback)
165+
166+
167+
def _map_coords_nest_0(
168+
arr: pa.FixedSizeListArray,
169+
callback: Callable[[pa.FixedSizeListArray], pa.FixedSizeListArray],
170+
):
171+
new_coords = callback(arr)
172+
return new_coords
173+
174+
175+
def _map_coords_nest_1(
176+
arr: pa.ListArray,
177+
callback: Callable[[pa.FixedSizeListArray], pa.FixedSizeListArray],
178+
):
179+
geom_offsets = arr.offsets
180+
coords = arr.flatten()
181+
new_coords = callback(coords)
182+
new_geometry_array = pa.ListArray.from_arrays(geom_offsets, new_coords)
183+
return new_geometry_array
184+
185+
186+
def _map_coords_nest_2(
187+
arr: pa.ListArray,
188+
callback: Callable[[pa.FixedSizeListArray], pa.FixedSizeListArray],
189+
):
190+
geom_offsets = arr.offsets
191+
ring_offsets = arr.flatten().offsets
192+
coords = arr.flatten().flatten()
193+
new_coords = callback(coords)
194+
new_ring_array = pa.ListArray.from_arrays(ring_offsets, new_coords)
195+
new_geometry_array = pa.ListArray.from_arrays(geom_offsets, new_ring_array)
196+
return new_geometry_array
197+
198+
199+
def _map_coords_nest_3(
200+
arr: pa.ListArray,
201+
callback: Callable[[pa.FixedSizeListArray], pa.FixedSizeListArray],
202+
):
203+
geom_offsets = arr.offsets
204+
polygon_offsets = arr.flatten().offsets
205+
ring_offsets = arr.flatten().flatten().offsets
206+
coords = arr.flatten().flatten().flatten()
207+
new_coords = callback(coords)
208+
new_ring_array = pa.ListArray.from_arrays(ring_offsets, new_coords)
209+
new_polygon_array = pa.ListArray.from_arrays(polygon_offsets, new_ring_array)
210+
new_geometry_array = pa.ListArray.from_arrays(geom_offsets, new_polygon_array)
211+
return new_geometry_array

lonboard/_layer.py

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
from __future__ import annotations
1111

1212
import sys
13-
import warnings
1413
from typing import TYPE_CHECKING, List, Optional, Sequence, Tuple
1514

1615
import geopandas as gpd
@@ -19,8 +18,9 @@
1918
import traitlets
2019

2120
from lonboard._base import BaseExtension, BaseWidget
22-
from lonboard._constants import EPSG_4326, EXTENSION_NAME, OGC_84
21+
from lonboard._constants import EXTENSION_NAME, OGC_84
2322
from lonboard._geoarrow.geopandas_interop import geopandas_to_geoarrow
23+
from lonboard._geoarrow.ops import reproject_table
2424
from lonboard._geoarrow.ops.bbox import Bbox, total_bounds
2525
from lonboard._geoarrow.ops.centroid import WeightedCentroid, weighted_centroid
2626
from lonboard._serialization import infer_rows_per_chunk
@@ -182,9 +182,8 @@ def default_geoarrow_viewport(
182182
) -> Optional[Tuple[Bbox, WeightedCentroid]]:
183183
# Note: in the ArcLayer we won't necessarily have a column with a geoarrow
184184
# extension type/metadata
185-
try:
186-
geom_col_idx = get_geometry_column_index(table.schema)
187-
except ValueError:
185+
geom_col_idx = get_geometry_column_index(table.schema)
186+
if geom_col_idx is None:
188187
return None
189188

190189
geom_field = table.schema.field(geom_col_idx)
@@ -232,6 +231,10 @@ class BaseArrowLayer(BaseLayer):
232231
def __init__(
233232
self, *, table: pa.Table, _rows_per_chunk: Optional[int] = None, **kwargs
234233
):
234+
# Reproject table to WGS84 if needed
235+
# Note this must happen before calculating the default viewport
236+
table = reproject_table(table, to_crs=OGC_84)
237+
235238
default_viewport = default_geoarrow_viewport(table)
236239
if default_viewport is not None:
237240
self._bbox = default_viewport[0]
@@ -266,10 +269,6 @@ def from_geopandas(
266269
Returns:
267270
A Layer with the initialized data.
268271
"""
269-
if gdf.crs and gdf.crs not in [EPSG_4326, OGC_84]:
270-
warnings.warn("GeoDataFrame being reprojected to EPSG:4326")
271-
gdf = gdf.to_crs(OGC_84) # type: ignore
272-
273272
if auto_downcast:
274273
# Note: we don't deep copy because we don't need to clone geometries
275274
gdf = _auto_downcast(gdf.copy()) # type: ignore

0 commit comments

Comments
 (0)