
Commit d719d9e

download results in parallel
1 parent 32b9a20 commit d719d9e

File tree

2 files changed: +48, -11 lines

pandas_gbq/core/sample.py

Lines changed: 47 additions & 10 deletions
@@ -8,6 +8,7 @@
 from typing import Optional, Sequence, cast

 import google.cloud.bigquery
+import google.cloud.bigquery.table
 import google.oauth2.credentials
 import psutil

@@ -124,6 +125,36 @@ def _estimate_row_bytes(fields: Sequence[google.cloud.bigquery.SchemaField]) ->
     return total_size


+def _download_results_in_parallel(
+    rows: google.cloud.bigquery.table.RowIterator,
+    *,
+    bqclient: google.cloud.bigquery.Client,
+    progress_bar_type: Optional[str] = None,
+    use_bqstorage_api: bool = True,
+):
+    table_reference = getattr(rows, "_table", None)
+    schema = getattr(rows, "_schema", None)
+
+    # If the results are large enough to materialize a table, break the
+    # connection to the original query that contains an ORDER BY clause to allow
+    # reading with multiple streams.
+    if table_reference is not None and schema is not None:
+        rows = bqclient.list_rows(
+            table_reference,
+            selected_fields=schema,
+        )
+
+    return pandas_gbq.core.read.download_results(
+        rows,
+        bqclient=bqclient,
+        progress_bar_type=progress_bar_type,
+        warn_on_large_results=False,
+        max_results=None,
+        user_dtypes=None,
+        use_bqstorage_api=use_bqstorage_api,
+    )
+
+
 def _sample_with_tablesample(
     table: google.cloud.bigquery.Table,
     *,
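Why re-listing the rows helps: results served through the original ORDER BY query come back as a single ordered stream, so the BigQuery Storage Read API cannot split the download. Listing the materialized destination table directly drops that ordering constraint and lets the client open several read streams, at the cost of row order. A minimal sketch of the same idea using only the public google-cloud-bigquery and google-cloud-bigquery-storage clients; the query and public table below are placeholders, not taken from this commit:

    import google.cloud.bigquery
    import google.cloud.bigquery_storage

    bqclient = google.cloud.bigquery.Client()
    bqstorage_client = google.cloud.bigquery_storage.BigQueryReadClient()

    # A query with ORDER BY; its results land in an anonymous destination table.
    job = bqclient.query(
        "SELECT * FROM `bigquery-public-data.samples.shakespeare` ORDER BY word"
    )
    job.result()  # wait for the query to finish

    # Read the destination table instead of the ordered result set. Row order
    # is no longer guaranteed, but the Storage Read API can now download with
    # multiple streams in parallel.
    rows = bqclient.list_rows(job.destination)
    df = rows.to_dataframe(bqstorage_client=bqstorage_client)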
@@ -141,13 +172,10 @@ def _sample_with_tablesample(
     LIMIT {int(target_row_count)};
     """
     rows = bqclient.query_and_wait(query)
-    return pandas_gbq.core.read.download_results(
+    return _download_results_in_parallel(
         rows,
         bqclient=bqclient,
         progress_bar_type=progress_bar_type,
-        warn_on_large_results=False,
-        max_results=None,
-        user_dtypes=None,
         use_bqstorage_api=use_bqstorage_api,
     )

@@ -167,13 +195,10 @@ def _sample_with_limit(
     LIMIT {int(target_row_count)};
     """
     rows = bqclient.query_and_wait(query)
-    return pandas_gbq.core.read.download_results(
+    return _download_results_in_parallel(
         rows,
         bqclient=bqclient,
         progress_bar_type=progress_bar_type,
-        warn_on_large_results=False,
-        max_results=None,
-        user_dtypes=None,
         use_bqstorage_api=use_bqstorage_api,
     )

@@ -192,7 +217,19 @@ def sample(
     This function attempts to sample a BigQuery table to a target size in
     memory. It prioritizes methods that minimize data scanned and downloaded.

-    The sampling strategy is as follows:
+    The target size is based on an estimate of the row size, so this method
+    may return more or less data than expected. If the table metadata doesn't
+    include a size, such as with views, an estimate based on the table schema
+    is used.
+
+    Sampling is based on the `BigQuery TABLESAMPLE
+    <https://docs.cloud.google.com/bigquery/docs/table-sampling>`_ feature,
+    which can provide a biased sample if data is not randomly distributed
+    among file blocks. For more control over sampling, use the BigQuery
+    DataFrames ``read_gbq_table`` and ``DataFrame.sample`` methods.
+
+    Specifically, the sampling strategy is as follows:
+
     1. If the table is small enough (based on `target_mb` or available memory)
        and eligible for the BigQuery Storage Read API, the entire table is
        downloaded.
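As an illustration of the TABLESAMPLE behaviour mentioned in the docstring: TABLESAMPLE SYSTEM selects a percentage of the table's storage blocks rather than individual rows, which is why data that is not randomly distributed among blocks can yield a biased sample. Below is a hedged sketch of the kind of query a helper such as _sample_with_tablesample issues; the percentage and row-count calculations are illustrative assumptions, not the module's actual logic:

    import google.cloud.bigquery

    bqclient = google.cloud.bigquery.Client()
    table = bqclient.get_table("bigquery-public-data.samples.shakespeare")

    # Assumed example values; the real helper derives these from target_mb and
    # an estimated row size.
    target_bytes = 100 * 1024 * 1024
    target_row_count = 10_000
    percent = min(100.0, 100.0 * target_bytes / max(table.num_bytes or 1, 1))

    # TABLESAMPLE SYSTEM samples whole storage blocks, so the result can be
    # biased if rows are clustered by value within blocks.
    query = f"""
    SELECT *
    FROM `{table.project}.{table.dataset_id}.{table.table_id}`
    TABLESAMPLE SYSTEM ({percent} PERCENT)
    LIMIT {int(target_row_count)};
    """
    rows = bqclient.query_and_wait(query)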
@@ -209,7 +246,7 @@
             "project.dataset.table" or "dataset.table".
         target_mb: Optional. The target size in megabytes for the sampled
             DataFrame. If not specified, it defaults to 1/4 of available
-            system memory, with a minimum of 100MB.
+            system memory, with a minimum of 100 MB and a maximum of 1 GB.
         credentials: Optional. The credentials to use for BigQuery access.
             If not provided, `pandas_gbq` will attempt to infer them.
         billing_project_id: Optional. The ID of the Google Cloud project to
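The target_mb default documented above reduces to a simple clamp. A sketch of that rule, assuming psutil (imported by this module) supplies the available-memory figure; the helper name is illustrative, not taken from the source:

    import psutil

    def _default_target_mb() -> int:
        # Illustrative only: 1/4 of currently available system memory,
        # clamped to the documented 100 MB minimum and 1 GB (1024 MB) maximum.
        available_mb = psutil.virtual_memory().available / (1024 * 1024)
        return int(min(max(available_mb / 4, 100), 1024))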

tests/system/test_sample.py

Lines changed: 1 addition & 1 deletion
@@ -2,8 +2,8 @@
 # Use of this source code is governed by a BSD-style
 # license that can be found in the LICENSE file.

-import google.oauth2.credentials
 import google.cloud.bigquery
+import google.oauth2.credentials

 import pandas_gbq