 from typing import Optional, Sequence, cast
 
 import google.cloud.bigquery
+import google.cloud.bigquery.table
 import google.oauth2.credentials
 import psutil
 
@@ -124,6 +125,36 @@ def _estimate_row_bytes(fields: Sequence[google.cloud.bigquery.SchemaField]) ->
     return total_size
 
 
+def _download_results_in_parallel(
+    rows: google.cloud.bigquery.table.RowIterator,
+    *,
+    bqclient: google.cloud.bigquery.Client,
+    progress_bar_type: Optional[str] = None,
+    use_bqstorage_api: bool = True,
+):
+    table_reference = getattr(rows, "_table", None)
+    schema = getattr(rows, "_schema", None)
+
+    # If the results were large enough for BigQuery to materialize a
+    # destination table, re-list the rows from that table. This breaks the
+    # connection to the original query's ORDER BY clause and allows reading
+    # with multiple streams.
+    if table_reference is not None and schema is not None:
+        rows = bqclient.list_rows(
+            table_reference,
+            selected_fields=schema,
+        )
+
+    return pandas_gbq.core.read.download_results(
+        rows,
+        bqclient=bqclient,
+        progress_bar_type=progress_bar_type,
+        warn_on_large_results=False,
+        max_results=None,
+        user_dtypes=None,
+        use_bqstorage_api=use_bqstorage_api,
+    )
+
+
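Why the re-listing above helps: a `RowIterator` tied to a query with an ORDER BY must preserve that ordering, which effectively limits the BigQuery Storage Read API to a single stream. Listing rows straight from the materialized destination table carries no ordering guarantee, so the read can be split across streams. A minimal sketch of the same idea outside this patch, assuming default credentials and a hypothetical table; it reads the same private `_table`/`_schema` attributes the helper does:

```python
from google.cloud import bigquery

client = bigquery.Client()
rows = client.query_and_wait(
    "SELECT * FROM `my-project.my_dataset.my_table` ORDER BY created_at"
)

# Private attributes, read defensively just as the helper above does.
table = getattr(rows, "_table", None)
schema = getattr(rows, "_schema", None)
if table is not None and schema is not None:
    # Re-listing drops the ORDER BY constraint, permitting multi-stream reads.
    rows = client.list_rows(table, selected_fields=schema)

df = rows.to_dataframe(create_bqstorage_client=True)
```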
 def _sample_with_tablesample(
     table: google.cloud.bigquery.Table,
     *,
@@ -141,13 +172,10 @@ def _sample_with_tablesample(
         LIMIT {int(target_row_count)};
     """
     rows = bqclient.query_and_wait(query)
-    return pandas_gbq.core.read.download_results(
+    return _download_results_in_parallel(
         rows,
         bqclient=bqclient,
         progress_bar_type=progress_bar_type,
-        warn_on_large_results=False,
-        max_results=None,
-        user_dtypes=None,
         use_bqstorage_api=use_bqstorage_api,
     )
 
@@ -167,13 +195,10 @@ def _sample_with_limit(
         LIMIT {int(target_row_count)};
     """
     rows = bqclient.query_and_wait(query)
-    return pandas_gbq.core.read.download_results(
+    return _download_results_in_parallel(
         rows,
         bqclient=bqclient,
         progress_bar_type=progress_bar_type,
-        warn_on_large_results=False,
-        max_results=None,
-        user_dtypes=None,
         use_bqstorage_api=use_bqstorage_api,
     )
 
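Both samplers cap their query with `LIMIT {int(target_row_count)}`. The row budget is derived from the target size and the estimated row width; a hypothetical sketch of that arithmetic using the module's `_estimate_row_bytes` helper (the function name `_target_row_count` and the exact formula are illustrative, not from this patch):

```python
import google.cloud.bigquery


def _target_row_count(
    table: google.cloud.bigquery.Table, target_mb: float
) -> int:
    # Rows that fit in the target budget, given the estimated row width.
    row_bytes = max(_estimate_row_bytes(table.schema), 1)
    target_bytes = target_mb * 1024 * 1024
    return max(1, int(target_bytes // row_bytes))
```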
@@ -192,7 +217,19 @@ def sample(
     This function attempts to sample a BigQuery table to a target size in
     memory. It prioritizes methods that minimize data scanned and downloaded.
 
-    The sampling strategy is as follows:
+    The target size is based on an estimate of the row size, so this method
+    may return more or less data than expected. If the table metadata doesn't
+    include a size, such as with views, an estimate based on the table schema
+    is used.
+
+    Sampling is based on the `BigQuery TABLESAMPLE
+    <https://docs.cloud.google.com/bigquery/docs/table-sampling>`_ feature,
+    which can provide a biased sample if data is not randomly distributed
+    among file blocks. For more control over sampling, use the BigQuery
+    DataFrames ``read_gbq_table`` and ``DataFrame.sample`` methods.
+
+    Specifically, the sampling strategy is as follows:
+
     1. If the table is small enough (based on `target_mb` or available memory)
        and eligible for the BigQuery Storage Read API, the entire table is
        downloaded.
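The TABLESAMPLE path referenced in the docstring issues a query of roughly this shape; the percent value would in practice be computed from the target size relative to the table's size (table name, percent, and row budget below are all hypothetical):

```python
import google.cloud.bigquery

bqclient = google.cloud.bigquery.Client()
percent = 10  # hypothetical; derived from target_mb vs. table size in practice
target_row_count = 100_000  # hypothetical row budget
query = f"""
SELECT *
FROM `my-project.my_dataset.my_table` TABLESAMPLE SYSTEM ({percent} PERCENT)
LIMIT {int(target_row_count)};
"""
rows = bqclient.query_and_wait(query)
```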
@@ -209,7 +246,7 @@ def sample(
209246 "project.dataset.table" or "dataset.table".
210247 target_mb: Optional. The target size in megabytes for the sampled
211248 DataFrame. If not specified, it defaults to 1/4 of available
212- system memory, with a minimum of 100MB.
249+ system memory, with a minimum of 100MB and maximum of 1 GB .
213250 credentials: Optional. The credentials to use for BigQuery access.
214251 If not provided, `pandas_gbq` will attempt to infer them.
215252 billing_project_id: Optional. The ID of the Google Cloud project to
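The default described for `target_mb` follows directly from `psutil`, which is already imported at the top of the module. A sketch under the stated bounds; the helper name is hypothetical:

```python
import psutil


def _default_target_mb() -> float:
    # A quarter of available system memory, clamped to [100 MB, 1 GB],
    # matching the docstring above. Name and rounding are illustrative.
    available_mb = psutil.virtual_memory().available / (1024 * 1024)
    return min(max(available_mb / 4, 100.0), 1024.0)
```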