Skip to content

Commit 1a68b40

Browse files
authored
BUG: Use object dtype for STRING, ARRAY, and STRUCT columns when there are zero rows. (#285)
* BUG: Use object dtype for STRING, ARRAY, and STRUCT columns when there are zero rows. If there are no rows, the default dtype is used (which is now float64; it must previously have been object). * Add PR number to changelog. * Blacken
1 parent 526ec32 commit 1a68b40

File tree

3 files changed

+27
-14
lines changed

3 files changed

+27
-14
lines changed

docs/source/changelog.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,12 @@ Changelog
1010
with the pandas package which dropped Python 2 support at the end of 2019.
1111
(:issue:`268`)
1212

13+
Implementation changes
14+
~~~~~~~~~~~~~~~~~~~~~~
15+
16+
- Use object dtype for ``STRING``, ``ARRAY``, and ``STRUCT`` columns when
17+
there are zero rows. (:issue:`285`)
18+
1319
.. _changelog-0.10.0:
1420

1521
0.10.0 / 2019-04-05

pandas_gbq/gbq.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -672,20 +672,26 @@ def _bqschema_to_nullsafe_dtypes(schema_fields):
672672
# If you update this mapping, also update the table at
673673
# `docs/source/reading.rst`.
674674
dtype_map = {
675+
"DATE": "datetime64[ns]",
676+
"DATETIME": "datetime64[ns]",
675677
"FLOAT": np.dtype(float),
678+
"GEOMETRY": "object",
679+
"RECORD": "object",
680+
"STRING": "object",
681+
"TIME": "datetime64[ns]",
676682
# pandas doesn't support timezone-aware dtype in DataFrame/Series
677683
# constructors. It's more idiomatic to localize after construction.
678684
# https://github.com/pandas-dev/pandas/issues/25843
679685
"TIMESTAMP": "datetime64[ns]",
680-
"TIME": "datetime64[ns]",
681-
"DATE": "datetime64[ns]",
682-
"DATETIME": "datetime64[ns]",
683686
}
684687

685688
dtypes = {}
686689
for field in schema_fields:
687690
name = str(field["name"])
691+
# Array BigQuery type is represented as an object column containing
692+
# list objects.
688693
if field["mode"].upper() == "REPEATED":
694+
dtypes[name] = "object"
689695
continue
690696

691697
dtype = dtype_map.get(field["type"].upper())

tests/system/test_gbq.py

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -577,24 +577,25 @@ def test_download_dataset_larger_than_200k_rows(self, project_id):
577577
def test_zero_rows(self, project_id):
578578
# Bug fix for https://github.com/pandas-dev/pandas/issues/10273
579579
df = gbq.read_gbq(
580-
"SELECT title, id, is_bot, "
581-
"SEC_TO_TIMESTAMP(timestamp) ts "
582-
"FROM [publicdata:samples.wikipedia] "
583-
"WHERE timestamp=-9999999",
580+
'SELECT name, number, (mlc_class = "HU") is_hurricane, iso_time '
581+
"FROM `bigquery-public-data.noaa_hurricanes.hurricanes` "
582+
'WHERE iso_time = TIMESTAMP("1900-01-01 00:00:00") ',
584583
project_id=project_id,
585584
credentials=self.credentials,
586-
dialect="legacy",
587585
)
588586
empty_columns = {
589-
"title": pandas.Series([], dtype=object),
590-
"id": pandas.Series([], dtype=np.dtype(int)),
591-
"is_bot": pandas.Series([], dtype=np.dtype(bool)),
592-
"ts": pandas.Series([], dtype="datetime64[ns]"),
587+
"name": pandas.Series([], dtype=object),
588+
"number": pandas.Series([], dtype=np.dtype(int)),
589+
"is_hurricane": pandas.Series([], dtype=np.dtype(bool)),
590+
"iso_time": pandas.Series([], dtype="datetime64[ns]"),
593591
}
594592
expected_result = DataFrame(
595-
empty_columns, columns=["title", "id", "is_bot", "ts"]
593+
empty_columns,
594+
columns=["name", "number", "is_hurricane", "iso_time"],
596595
)
597-
expected_result["ts"] = expected_result["ts"].dt.tz_localize("UTC")
596+
expected_result["iso_time"] = expected_result[
597+
"iso_time"
598+
].dt.tz_localize("UTC")
598599
tm.assert_frame_equal(df, expected_result, check_index_type=False)
599600

600601
def test_one_row_one_column(self, project_id):

0 commit comments

Comments
 (0)