1- '''
1+ """
22Area Weighted Interpolation, out-of-core and parallel through Dask
3- '''
3+ """
44
55import pandas
66import geopandas
77import numpy as np
88from .area_interpolate import _area_interpolate_binning as area_interpolate
9- try :
10- import dask_geopandas
11- from dask .base import tokenize
12- from dask .highlevelgraph import HighLevelGraph
13- except ImportError :
14- raise ImportError (
15- "Area interpolation with Dask requires `dask` and "
16- "`dask_geopandas` installed to run. Please install them "
17- "before importing this functionality."
18- )
9+
1910
2011def area_interpolate_dask (
2112 source_dgdf ,
@@ -24,11 +15,11 @@ def area_interpolate_dask(
2415 extensive_variables = None ,
2516 intensive_variables = None ,
2617 categorical_variables = None ,
27- categorical_frequency = True
18+ categorical_frequency = True ,
2819):
29- '''
20+ """
3021 Out-of-core and parallel area interpolation for categorical variables.
31-
22+
3223 Parameters
3324 ----------
3425 source_dgdf : dask_geopandas.GeoDataFrame
@@ -40,7 +31,7 @@ def area_interpolate_dask(
4031 Dask-geopandas GeoDataFrame
4132 IMPORTANT: the table needs to be spatially shuffled and with spatial partitions.
4233 This is required so only overlapping partitions are checked for interpolation. See
43- more on spatial shuffling at: https://dask-geopandas.readthedocs.io/en/stable/guide/spatial-partitioning.html
34+ more on spatial shuffling at: https://dask-geopandas.readthedocs.io/en/stable/guide/spatial-partitioning.html
4435 id_col : str
4536 Name of the column in `target_dgdf` with unique IDs to be used in output table
4637 extensive_variables : list
@@ -50,7 +41,7 @@ def area_interpolate_dask(
5041 [Optional. Default=None] Columns in `source_dgdf` for intensive variables
5142 IMPORTANT: currently NOT implemented.
5243 categorical_variables : list
53- [Optional. Default=None] Columns in `source_dgdf` for categorical variables
44+ [Optional. Default=None] Columns in `source_dgdf` for categorical variables
5445 IMPORTANT: categorical variables must be of type `'category[known]'`. This is so
5546 all categories are known ahead of time and Dask can run lazily.
5647 categorical_frequency : Boolean
@@ -65,42 +56,55 @@ def area_interpolate_dask(
6556 estimates : dask_geopandas.GeoDataFrame
6657 new dask-geopandas geodataframe with interpolated variables and `id_col` as
6758 columns and target_df geometry as output geometry
68-
69- '''
59+
60+ """
61+ try :
62+ import dask_geopandas
63+ from dask .base import tokenize
64+ from dask .highlevelgraph import HighLevelGraph
65+ except ImportError :
66+ raise ImportError (
67+ "Area interpolation with Dask requires `dask` and "
68+ "`dask_geopandas` installed to run. Please install them "
69+ "before importing this functionality."
70+ )
71+
7072 if intensive_variables is not None :
71- raise NotImplementedError ((
72- "Dask-based interpolation of intensive variables is "
73- "not implemented yet. Please remove intensive variables to "
74- "be able to run the rest."
75- ))
73+ raise NotImplementedError (
74+ (
75+ "Dask-based interpolation of intensive variables is "
76+ "not implemented yet. Please remove intensive variables to "
77+ "be able to run the rest."
78+ )
79+ )
7680 if extensive_variables is not None :
77- raise NotImplementedError ((
78- "Dask-based interpolation of extensive variables is "
79- "not implemented yet. Please remove intensive variables to "
80- "be able to run the rest."
81- ))
81+ raise NotImplementedError (
82+ (
83+ "Dask-based interpolation of extensive variables is "
84+ "not implemented yet. Please remove intensive variables to "
85+ "be able to run the rest."
86+ )
87+ )
8288 # Categoricals must be Dask's known categorical
8389 if categorical_variables is not None :
8490 category_vars = []
8591 for cat_var in categorical_variables :
86- var_names = [f' { cat_var } _{ c } ' for c in source_dgdf [cat_var ].cat .categories ]
92+ var_names = [f" { cat_var } _{ c } " for c in source_dgdf [cat_var ].cat .categories ]
8793 category_vars .extend (var_names )
8894 else :
8995 category_vars = None
9096 # Build tasks by joining pairs of chunks from left/right
9197 dsk = {}
9298 new_spatial_partitions = []
9399 parts = geopandas .sjoin (
94- source_dgdf .spatial_partitions .to_frame (' geometry' ),
95- target_dgdf .spatial_partitions .to_frame (' geometry' ),
96- how = ' inner' ,
97- predicate = ' intersects'
100+ source_dgdf .spatial_partitions .to_frame (" geometry" ),
101+ target_dgdf .spatial_partitions .to_frame (" geometry" ),
102+ how = " inner" ,
103+ predicate = " intersects" ,
98104 )
99105 parts_left = np .asarray (parts .index )
100- parts_right = np .asarray (parts ['index_right' ].values )
101- name = 'area_interpolate-' + tokenize (
102- target_dgdf , source_dgdf
103- )
106+ parts_right = np .asarray (parts ["index_right" ].values )
107+ name = "area_interpolate-" + tokenize (target_dgdf , source_dgdf )
104108 for i , (l , r ) in enumerate (zip (parts_left , parts_right )):
105109 dsk [(name , i )] = (
106110 id_area_interpolate ,
@@ -111,7 +115,7 @@ def area_interpolate_dask(
111115 intensive_variables ,
112116 None ,
113117 True ,
114- ' auto' ,
118+ " auto" ,
115119 1 ,
116120 categorical_variables ,
117121 category_vars ,
@@ -137,23 +141,19 @@ def area_interpolate_dask(
137141 intensive_variables = intensive_variables ,
138142 table = None ,
139143 allocate_total = True ,
140- spatial_index = ' auto' ,
144+ spatial_index = " auto" ,
141145 n_jobs = 1 ,
142146 categorical_variables = categorical_variables ,
143147 category_vars = category_vars ,
144148 )
145149 # Build output table
146150 transferred = dask_geopandas .GeoDataFrame (
147- graph ,
148- name ,
149- meta ,
150- [None ] * (len (dsk ) + 1 ),
151- new_spatial_partitions
151+ graph , name , meta , [None ] * (len (dsk ) + 1 ), new_spatial_partitions
152152 )
153153 # Merge chunks
154- out = target_dgdf [[id_col , ' geometry' ]]
154+ out = target_dgdf [[id_col , " geometry" ]]
155155 ## Extensive --> Not implemented (DAB: the below does not match single-core)
156- '''
156+ """
157157 if extensive_variables is not None:
158158 out_extensive = (
159159 transferred
@@ -162,25 +162,23 @@ def area_interpolate_dask(
162162 .agg({v: 'sum' for v in extensive_variables})
163163 )
164164 out = out.join(out_extensive, on=id_col)
165- '''
165+ """
166166 ## Intensive --> Weight by area of the chunk (Not implemented)
167167 ## Categorical --> Add up proportions
168168 if categorical_variables is not None :
169169 out_categorical = (
170- transferred
171- [category_vars ]
170+ transferred [category_vars ]
172171 .astype (float )
173172 .groupby (transferred [id_col ])
174- .agg ({v : ' sum' for v in category_vars })
175- )
173+ .agg ({v : " sum" for v in category_vars })
174+ )
176175 out = out .join (out_categorical , on = id_col )
177176 if categorical_frequency is True :
178177 cols = out_categorical .columns .tolist ()
179- out [cols ] = out [cols ].div (
180- out .area , axis = 'index'
181- )
178+ out [cols ] = out [cols ].div (out .area , axis = "index" )
182179 return out
183180
181+
184182def id_area_interpolate (
185183 source_df ,
186184 target_df ,
@@ -189,20 +187,20 @@ def id_area_interpolate(
189187 intensive_variables = None ,
190188 table = None ,
191189 allocate_total = True ,
192- spatial_index = ' auto' ,
190+ spatial_index = " auto" ,
193191 n_jobs = 1 ,
194192 categorical_variables = None ,
195- category_vars = None
193+ category_vars = None ,
196194):
197- '''
195+ """
198196 Light wrapper around single-core area interpolation to be run on distributed workers
199-
197+
200198 Parameters
201199 ----------
202200 source_df : geopandas.GeoDataFrame
203201 target_df : geopandas.GeoDataFrame
204202 id_col : str
205- Name of the column in `target_dgdf` with unique IDs to be used in output table
203+ Name of the column in `target_df` with unique IDs to be used in output table
206204 extensive_variables : list
207205 [Optional. Default=None] Columns in dataframes for extensive variables
208206 intensive_variables : list
@@ -241,8 +239,8 @@ def id_area_interpolate(
241239 estimates : geopandas.GeoDataFrame
241239 new geodataframe with interpolated variables as columns and target_df geometry
243241 as output geometry
244-
245- '''
242+
243+ """
246244 estimates = area_interpolate (
247245 source_df ,
248246 target_df ,
@@ -253,16 +251,16 @@ def id_area_interpolate(
253251 spatial_index = spatial_index ,
254252 n_jobs = n_jobs ,
255253 categorical_variables = categorical_variables ,
256- categorical_frequency = False
254+ categorical_frequency = False ,
257255 )
258256 estimates [id_col ] = target_df [id_col ].values
259-
257+
260258 if categorical_variables is not None :
261259 category_vars_to_add = []
262260 for category_var in category_vars :
263261 if category_var not in estimates .columns :
264262 category_vars_to_add .append (category_var )
265263 estimates = estimates .join (
266264 pandas .DataFrame (index = estimates .index , columns = category_vars_to_add )
267- )
265+ )
268266 return estimates
0 commit comments