Skip to content

Commit ce6fcb9

Browse files
authored
Merge pull request #183 from knaaptime/main
2 parents 65a72aa + de2e9e5 commit ce6fcb9

File tree

1 file changed

+63
-65
lines changed

1 file changed

+63
-65
lines changed

tobler/area_weighted/area_interpolate_dask.py

Lines changed: 63 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,12 @@
1-
'''
1+
"""
22
Area Weighted Interpolation, out-of-core and parallel through Dask
3-
'''
3+
"""
44

55
import pandas
66
import geopandas
77
import numpy as np
88
from .area_interpolate import _area_interpolate_binning as area_interpolate
9-
try:
10-
import dask_geopandas
11-
from dask.base import tokenize
12-
from dask.highlevelgraph import HighLevelGraph
13-
except ImportError:
14-
raise ImportError(
15-
"Area interpolation with Dask requires `dask` and "
16-
"`dask_geopandas` installed to run. Please install them "
17-
"before importing this functionality."
18-
)
9+
1910

2011
def area_interpolate_dask(
2112
source_dgdf,
@@ -24,11 +15,11 @@ def area_interpolate_dask(
2415
extensive_variables=None,
2516
intensive_variables=None,
2617
categorical_variables=None,
27-
categorical_frequency=True
18+
categorical_frequency=True,
2819
):
29-
'''
20+
"""
3021
Out-of-core and parallel area interpolation for categorical variables.
31-
22+
3223
Parameters
3324
----------
3425
source_dgdf : dask_geopandas.GeoDataFrame
@@ -40,7 +31,7 @@ def area_interpolate_dask(
4031
Dask-geopandas GeoDataFrame
4132
IMPORTANT: the table needs to be spatially shuffled and with spatial partitions.
4233
This is required so only overlapping partitions are checked for interpolation. See
43-
more on spatial shuffling at: https://dask-geopandas.readthedocs.io/en/stable/guide/spatial-partitioning.html
34+
more on spatial shuffling at: https://dask-geopandas.readthedocs.io/en/stable/guide/spatial-partitioning.html
4435
id_col : str
4536
Name of the column in `target_dgdf` with unique IDs to be used in output table
4637
extensive_variables : list
@@ -50,7 +41,7 @@ def area_interpolate_dask(
5041
[Optional. Default=None] Columns in `source_dgdf` for intensive variables
5142
IMPORTANT: currently NOT implemented.
5243
categorical_variables : list
53-
[Optional. Default=None] Columns in `source_dgdf` for categorical variables
44+
[Optional. Default=None] Columns in `source_dgdf` for categorical variables
5445
IMPORTANT: categorical variables must be of type `'category[known]'`. This is so
5546
all categories are known ahead of time and Dask can run lazily.
5647
categorical_frequency : Boolean
@@ -65,42 +56,55 @@ def area_interpolate_dask(
6556
estimates : dask_geopandas.GeoDataFrame
6657
new dask-geopandas geodataframe with interpolated variables and `id_col` as
6758
columns and target_df geometry as output geometry
68-
69-
'''
59+
60+
"""
61+
try:
62+
import dask_geopandas
63+
from dask.base import tokenize
64+
from dask.highlevelgraph import HighLevelGraph
65+
except ImportError:
66+
raise ImportError(
67+
"Area interpolation with Dask requires `dask` and "
68+
"`dask_geopandas` installed to run. Please install them "
69+
"before importing this functionality."
70+
)
71+
7072
if intensive_variables is not None:
71-
raise NotImplementedError((
72-
"Dask-based interpolation of intensive variables is "
73-
"not implemented yet. Please remove intensive variables to "
74-
"be able to run the rest."
75-
))
73+
raise NotImplementedError(
74+
(
75+
"Dask-based interpolation of intensive variables is "
76+
"not implemented yet. Please remove intensive variables to "
77+
"be able to run the rest."
78+
)
79+
)
7680
if extensive_variables is not None:
77-
raise NotImplementedError((
78-
"Dask-based interpolation of extensive variables is "
79-
"not implemented yet. Please remove intensive variables to "
80-
"be able to run the rest."
81-
))
81+
raise NotImplementedError(
82+
(
83+
"Dask-based interpolation of extensive variables is "
84+
"not implemented yet. Please remove intensive variables to "
85+
"be able to run the rest."
86+
)
87+
)
8288
# Categoricals must be Dask's known categorical
8389
if categorical_variables is not None:
8490
category_vars = []
8591
for cat_var in categorical_variables:
86-
var_names = [f'{cat_var}_{c}' for c in source_dgdf[cat_var].cat.categories]
92+
var_names = [f"{cat_var}_{c}" for c in source_dgdf[cat_var].cat.categories]
8793
category_vars.extend(var_names)
8894
else:
8995
category_vars = None
9096
# Build tasks by joining pairs of chunks from left/right
9197
dsk = {}
9298
new_spatial_partitions = []
9399
parts = geopandas.sjoin(
94-
source_dgdf.spatial_partitions.to_frame('geometry'),
95-
target_dgdf.spatial_partitions.to_frame('geometry'),
96-
how='inner',
97-
predicate='intersects'
100+
source_dgdf.spatial_partitions.to_frame("geometry"),
101+
target_dgdf.spatial_partitions.to_frame("geometry"),
102+
how="inner",
103+
predicate="intersects",
98104
)
99105
parts_left = np.asarray(parts.index)
100-
parts_right = np.asarray(parts['index_right'].values)
101-
name = 'area_interpolate-' + tokenize(
102-
target_dgdf, source_dgdf
103-
)
106+
parts_right = np.asarray(parts["index_right"].values)
107+
name = "area_interpolate-" + tokenize(target_dgdf, source_dgdf)
104108
for i, (l, r) in enumerate(zip(parts_left, parts_right)):
105109
dsk[(name, i)] = (
106110
id_area_interpolate,
@@ -111,7 +115,7 @@ def area_interpolate_dask(
111115
intensive_variables,
112116
None,
113117
True,
114-
'auto',
118+
"auto",
115119
1,
116120
categorical_variables,
117121
category_vars,
@@ -137,23 +141,19 @@ def area_interpolate_dask(
137141
intensive_variables=intensive_variables,
138142
table=None,
139143
allocate_total=True,
140-
spatial_index='auto',
144+
spatial_index="auto",
141145
n_jobs=1,
142146
categorical_variables=categorical_variables,
143147
category_vars=category_vars,
144148
)
145149
# Build output table
146150
transferred = dask_geopandas.GeoDataFrame(
147-
graph,
148-
name,
149-
meta,
150-
[None] * (len(dsk) + 1),
151-
new_spatial_partitions
151+
graph, name, meta, [None] * (len(dsk) + 1), new_spatial_partitions
152152
)
153153
# Merge chunks
154-
out = target_dgdf[[id_col, 'geometry']]
154+
out = target_dgdf[[id_col, "geometry"]]
155155
## Extensive --> Not implemented (DAB: the below does not match single-core)
156-
'''
156+
"""
157157
if extensive_variables is not None:
158158
out_extensive = (
159159
transferred
@@ -162,25 +162,23 @@ def area_interpolate_dask(
162162
.agg({v: 'sum' for v in extensive_variables})
163163
)
164164
out = out.join(out_extensive, on=id_col)
165-
'''
165+
"""
166166
## Intensive --> Weight by area of the chunk (Not implemented)
167167
## Categorical --> Add up proportions
168168
if categorical_variables is not None:
169169
out_categorical = (
170-
transferred
171-
[category_vars]
170+
transferred[category_vars]
172171
.astype(float)
173172
.groupby(transferred[id_col])
174-
.agg({v: 'sum' for v in category_vars})
175-
)
173+
.agg({v: "sum" for v in category_vars})
174+
)
176175
out = out.join(out_categorical, on=id_col)
177176
if categorical_frequency is True:
178177
cols = out_categorical.columns.tolist()
179-
out[cols] = out[cols].div(
180-
out.area, axis='index'
181-
)
178+
out[cols] = out[cols].div(out.area, axis="index")
182179
return out
183180

181+
184182
def id_area_interpolate(
185183
source_df,
186184
target_df,
@@ -189,20 +187,20 @@ def id_area_interpolate(
189187
intensive_variables=None,
190188
table=None,
191189
allocate_total=True,
192-
spatial_index='auto',
190+
spatial_index="auto",
193191
n_jobs=1,
194192
categorical_variables=None,
195-
category_vars=None
193+
category_vars=None,
196194
):
197-
'''
195+
"""
198196
Light wrapper around single-core area interpolation to be run on distributed workers
199-
197+
200198
Parameters
201199
----------
202200
source_df : geopandas.GeoDataFrame
203201
target_df : geopandas.GeoDataFrame
204202
id_col : str
205-
Name of the column in `target_dgdf` with unique IDs to be used in output table
203+
Name of the column in `target_df` with unique IDs to be used in output table
206204
extensive_variables : list
207205
[Optional. Default=None] Columns in dataframes for extensive variables
208206
intensive_variables : list
@@ -241,8 +239,8 @@ def id_area_interpolate(
241239
estimates : geopandas.GeoDataFrame
242240
new geodataframe with interpolated variables as columns and target_df geometry
243241
as output geometry
244-
245-
'''
242+
243+
"""
246244
estimates = area_interpolate(
247245
source_df,
248246
target_df,
@@ -253,16 +251,16 @@ def id_area_interpolate(
253251
spatial_index=spatial_index,
254252
n_jobs=n_jobs,
255253
categorical_variables=categorical_variables,
256-
categorical_frequency=False
254+
categorical_frequency=False,
257255
)
258256
estimates[id_col] = target_df[id_col].values
259-
257+
260258
if categorical_variables is not None:
261259
category_vars_to_add = []
262260
for category_var in category_vars:
263261
if category_var not in estimates.columns:
264262
category_vars_to_add.append(category_var)
265263
estimates = estimates.join(
266264
pandas.DataFrame(index=estimates.index, columns=category_vars_to_add)
267-
)
265+
)
268266
return estimates

0 commit comments

Comments
 (0)