Skip to content

Commit 1ced621

Browse files
committed
Implement generic MovingPandas to GeoArrow
1 parent fa65b87 commit 1ced621

File tree

2 files changed

+203
-57
lines changed

2 files changed

+203
-57
lines changed
Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,199 @@
1+
from __future__ import annotations
2+
3+
import json
4+
from typing import TYPE_CHECKING, Dict, List, Literal, Tuple
5+
6+
import numpy as np
7+
from arro3.core import (
8+
Array,
9+
ChunkedArray,
10+
DataType,
11+
Field,
12+
RecordBatch,
13+
Schema,
14+
Table,
15+
fixed_size_list_array,
16+
list_array,
17+
)
18+
19+
if TYPE_CHECKING:
20+
import movingpandas as mpd
21+
import pyarrow as pa
22+
from movingpandas import TrajectoryCollection
23+
24+
25+
# TODO (lonboard-specific):
26+
# - update timestamp serialization to cast to float32 at that point
27+
# # offset by earliest timestamp
28+
# timestamps -= timestamps.min()
29+
30+
# # Cast to float32
31+
# timestamps = timestamps.astype(np.float32)
32+
33+
34+
def movingpandas_to_geoarrow(
    traj_collection: TrajectoryCollection,
) -> Tuple[Table, ChunkedArray]:
    """Convert a MovingPandas TrajectoryCollection to GeoArrow.

    Args:
        traj_collection: The MovingPandas TrajectoryCollection to convert.
            All trajectories must share a single datetime index dtype, and
            geometries must be uniformly 2D or uniformly 3D.

    Returns:
        A two-tuple of:

        - a ``Table`` with one row per trajectory, whose attribute columns are
          nested into list arrays (one inner value per coordinate) plus a
          ``geometry`` column tagged with the ``geoarrow.linestring``
          extension metadata (and CRS, when available);
        - a ``ChunkedArray`` of list-typed timestamps parallel to the
          geometry coordinates.

    Raises:
        ValueError: If the datetime index has an unexpected time unit.
    """
    import pyarrow as pa
    import shapely

    crs = traj_collection.get_crs()
    crs_json = crs.to_json_dict() if crs is not None else None

    num_coords = 0
    num_trajectories = len(traj_collection)
    offsets = np.zeros(num_trajectories + 1, dtype=np.int32)
    datetime_dtypes = set()
    attr_schemas: List[pa.Schema] = []

    # Loop the first time to infer offsets for each trajectory
    for i, traj in enumerate(traj_collection.trajectories):
        traj: mpd.Trajectory

        num_coords += traj.size()
        offsets[i + 1] = num_coords
        datetime_dtypes.add(traj.df.index.dtype)

        geom_col_name = traj.get_geom_col()
        df_attr = traj.df.drop(columns=[geom_col_name])

        # Explicitly drop index because the index is a DatetimeIndex that we
        # convert manually later.
        arrow_schema = pa.Schema.from_pandas(df_attr, preserve_index=False)
        attr_schemas.append(arrow_schema)

    assert (
        len(datetime_dtypes) == 1
    ), "Expected only one datetime dtype across all trajectories."
    datetime_dtype = list(datetime_dtypes)[0]

    # We currently always provision space for XYZ coordinates, and then only
    # use 2d if the Z dimension is always NaN
    coords = np.zeros((num_coords, 3), dtype=np.float64)

    # Infer an arrow time unit from the numpy dtype
    time_unit, time_arrow_dtype = infer_time_unit(datetime_dtype)

    # TODO: switch this to just using `time_arrow_dtype.bit_width` once
    # https://github.com/kylebarron/arro3/pull/190 is released
    if time_unit in {"s", "ms"}:
        timestamps = np.zeros(num_coords, dtype=np.int32)
    elif time_unit in {"us", "ns"}:
        timestamps = np.zeros(num_coords, dtype=np.int64)
    else:
        raise ValueError(f"Unexpected time unit: {time_unit}.")

    attr_schema = pa.unify_schemas(attr_schemas, promote_options="permissive")
    attr_tables: List[pa.Table] = []

    # Loop second time to fill timestamps and coords
    for i, traj in enumerate(traj_collection.trajectories):
        start_offset = offsets[i]
        end_offset = offsets[i + 1]

        timestamps[start_offset:end_offset] = traj.df.index
        coords[start_offset:end_offset, 0] = shapely.get_x(traj.df.geometry)
        coords[start_offset:end_offset, 1] = shapely.get_y(traj.df.geometry)
        coords[start_offset:end_offset, 2] = shapely.get_z(traj.df.geometry)

        geom_col_name = traj.get_geom_col()
        df_attr = traj.df.drop(columns=[geom_col_name])

        # BUG FIX: convert the attribute-only frame (geometry dropped), not
        # the full ``traj.df`` — the unified ``attr_schema`` was built from the
        # geometry-free frames above and has no geometry field.
        attr_table = pa.Table.from_pandas(
            df_attr, schema=attr_schema, preserve_index=False
        )
        attr_tables.append(attr_table)

    attr_table = pa.concat_tables(attr_tables, promote_options="none")
    attr_table = Table.from_arrow(attr_table)

    offsets = Array.from_numpy(offsets)

    nested_attr_table = apply_offsets_to_table(attr_table, offsets=offsets)

    # ``np.alltrue`` was deprecated and removed in NumPy 2.0; ``np.all`` is
    # the supported equivalent.
    if np.all(np.isnan(coords[:, 2])):
        coord_list_size = 2
        # Cast to 2D coords
        coords = coords[:, :2]
    else:
        assert not np.any(
            np.isnan(coords[:, 2])
        ), "Mixed 2D and 3D coordinates not currently supported"
        coord_list_size = 3

    coords_arr = Array.from_numpy(coords.ravel("C"))
    coords_fixed_size_list = fixed_size_list_array(coords_arr, coord_list_size)
    linestrings_arr = list_array(offsets, coords_fixed_size_list)

    extension_metadata: Dict[str, str] = {"ARROW:extension:name": "geoarrow.linestring"}
    if crs_json is not None:
        extension_metadata["ARROW:extension:metadata"] = json.dumps({"crs": crs_json})

    linestrings_field = Field(
        "geometry",
        linestrings_arr.type,
        nullable=True,
        metadata=extension_metadata,
    )

    timestamp_values = Array.from_numpy(timestamps).cast(time_arrow_dtype)
    timestamp_arr = list_array(offsets, timestamp_values)
    timestamp_col = ChunkedArray([timestamp_arr])

    table = nested_attr_table.append_column(
        linestrings_field, ChunkedArray([linestrings_arr])
    )
    return table, timestamp_col
155+
156+
157+
def infer_time_unit(dtype: np.dtype) -> Tuple[Literal["s", "ms", "us", "ns"], DataType]:
    """Infer an Arrow time unit from a numpy datetime64 dtype.

    Args:
        dtype: A numpy dtype expected to be one of ``datetime64[s]``,
            ``datetime64[ms]``, ``datetime64[us]`` or ``datetime64[ns]``.

    Returns:
        A two-tuple of the unit code and the matching Arrow timestamp type.

    Raises:
        ValueError: If not a known numpy datetime dtype
    """
    for code in ("s", "ms", "us", "ns"):
        if dtype.name == f"datetime64[{code}]":
            return code, DataType.timestamp(code)

    raise ValueError(f"Unexpected datetime type: {dtype}")
181+
182+
183+
def apply_offsets_to_table(table: Table, offsets: Array) -> Table:
    """Nest every column of ``table`` into a list-typed column via ``offsets``.

    Args:
        table: The flat attribute table to nest.
        offsets: List offsets; output row ``i`` covers the input rows
            ``offsets[i]:offsets[i + 1]``.

    Returns:
        A single-batch table with the same column names, where each column is
        a list array sliced by ``offsets``. Schema metadata is preserved.
    """
    # Collapse to a single batch so each column is one contiguous array.
    flat_batch = table.combine_chunks().to_batches()[0]

    list_fields = []
    list_arrays = []

    for col_idx in range(flat_batch.num_columns):
        inner_field = flat_batch.schema.field(col_idx)
        outer_field = inner_field.with_type(DataType.list(inner_field))

        list_fields.append(outer_field)
        list_arrays.append(
            list_array(offsets, flat_batch[col_idx], type=outer_field)
        )

    out_schema = Schema(list_fields, metadata=flat_batch.schema.metadata)
    out_batch = RecordBatch(list_arrays, schema=out_schema)
    return Table.from_batches([out_batch])

lonboard/experimental/_layer.py

Lines changed: 4 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -8,16 +8,7 @@
88
import sys
99
from typing import TYPE_CHECKING, Optional
1010

11-
import numpy as np
1211
import traitlets
13-
from arro3.core import (
14-
Array,
15-
ChunkedArray,
16-
Field,
17-
Table,
18-
fixed_size_list_array,
19-
list_array,
20-
)
2112
from arro3.core.types import ArrowStreamExportable
2213

2314
from lonboard._constants import EXTENSION_NAME
@@ -519,54 +510,10 @@ def from_movingpandas(
519510
traj_collection: TrajectoryCollection,
520511
**kwargs: Unpack[TripsLayerKwargs],
521512
) -> Self:
522-
import shapely
523-
524-
num_coords = 0
525-
num_rows = len(traj_collection)
526-
offsets = np.zeros(num_rows + 1, dtype=np.int32)
527-
528-
for i, traj in enumerate(traj_collection.trajectories):
529-
num_coords += traj.size()
530-
offsets[i + 1] = num_coords
531-
532-
coords = np.zeros((num_coords, 2), dtype=np.float64)
533-
timestamps = np.zeros(num_coords, dtype=np.int64)
534-
535-
for i, traj in enumerate(traj_collection.trajectories):
536-
start_offset = offsets[i]
537-
end_offset = offsets[i + 1]
538-
539-
# millisecond-based timestamps
540-
int64_ms_timestamps = traj.df.index.to_series().astype(np.int64) // (
541-
1000**2
542-
)
543-
timestamps[start_offset:end_offset] = int64_ms_timestamps
513+
"""Construct from a MovingPandas TrajectoryCollection"""
514+
from lonboard._geoarrow.movingpandas_interop import movingpandas_to_geoarrow
544515

545-
coords[start_offset:end_offset, 0] = shapely.get_x(traj.df.geometry).values
546-
coords[start_offset:end_offset, 1] = shapely.get_y(traj.df.geometry).values
547-
548-
# offset by earliest timestamp
549-
timestamps -= timestamps.min()
550-
551-
# Cast to float32
552-
timestamps = timestamps.astype(np.float32)
553-
554-
coords_arr = Array.from_numpy(coords.ravel("C"))
555-
coords_fixed_size_list = fixed_size_list_array(coords_arr, 2)
556-
linestrings_arr = list_array(Array.from_numpy(offsets), coords_fixed_size_list)
557-
timestamp_arr = list_array(
558-
Array.from_numpy(offsets), Array.from_numpy(timestamps)
516+
(table, timestamp_col) = movingpandas_to_geoarrow(
517+
traj_collection=traj_collection
559518
)
560-
timestamp_col = ChunkedArray([timestamp_arr])
561-
562-
linestrings_field = Field(
563-
"geometry",
564-
linestrings_arr.type,
565-
nullable=True,
566-
metadata={"ARROW:extension:name": "geoarrow.linestring"},
567-
)
568-
569-
# TODO: don't add timestamps onto table
570-
table = Table.from_pydict({"timestamps": timestamp_col})
571-
table = table.append_column(linestrings_field, ChunkedArray([linestrings_arr]))
572519
return cls(table=table, get_timestamps=timestamp_col, **kwargs)

0 commit comments

Comments
 (0)