BUG: Use object dtype for STRING, ARRAY, and STRUCT columns when there are zero rows. (#285)

tswast · web-flow · commit 1a68b40f1503 · 2019-07-26T11:29:57.000-07:00
* BUG: Use object dtype for STRING, ARRAY, and STRUCT columns when there are zero rows.

If there are no rows, the default dtype is used (which is now
float64, but must previously have been object).

* Add PR number to changelog.

* Blacken
diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
@@ -10,6 +10,12 @@ Changelog
   with the pandas package which dropped Python 2 support at the end of 2019.
   (:issue:`268`)
 
+Implementation changes
+~~~~~~~~~~~~~~~~~~~~~~
+
+- Use object dtype for ``STRING``, ``ARRAY``, and ``STRUCT`` columns when
+  there are zero rows. (:issue:`285`)
+
 .. _changelog-0.10.0:
 
 0.10.0 / 2019-04-05
diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py
@@ -672,20 +672,26 @@ def _bqschema_to_nullsafe_dtypes(schema_fields):
     # If you update this mapping, also update the table at
     # `docs/source/reading.rst`.
     dtype_map = {
+        "DATE": "datetime64[ns]",
+        "DATETIME": "datetime64[ns]",
         "FLOAT": np.dtype(float),
+        "GEOMETRY": "object",
+        "RECORD": "object",
+        "STRING": "object",
+        "TIME": "datetime64[ns]",
         # pandas doesn't support timezone-aware dtype in DataFrame/Series
         # constructors. It's more idiomatic to localize after construction.
         # https://github.com/pandas-dev/pandas/issues/25843
         "TIMESTAMP": "datetime64[ns]",
-        "TIME": "datetime64[ns]",
-        "DATE": "datetime64[ns]",
-        "DATETIME": "datetime64[ns]",
     }
 
     dtypes = {}
     for field in schema_fields:
         name = str(field["name"])
+        # Array BigQuery type is represented as an object column containing
+        # list objects.
         if field["mode"].upper() == "REPEATED":
+            dtypes[name] = "object"
             continue
 
         dtype = dtype_map.get(field["type"].upper())
diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py
@@ -577,24 +577,25 @@ def test_download_dataset_larger_than_200k_rows(self, project_id):
     def test_zero_rows(self, project_id):
         # Bug fix for https://github.com/pandas-dev/pandas/issues/10273
         df = gbq.read_gbq(
-            "SELECT title, id, is_bot, "
-            "SEC_TO_TIMESTAMP(timestamp) ts "
-            "FROM [publicdata:samples.wikipedia] "
-            "WHERE timestamp=-9999999",
+            'SELECT name, number, (mlc_class = "HU") is_hurricane, iso_time '
+            "FROM `bigquery-public-data.noaa_hurricanes.hurricanes` "
+            'WHERE iso_time = TIMESTAMP("1900-01-01 00:00:00") ',
             project_id=project_id,
             credentials=self.credentials,
-            dialect="legacy",
         )
         empty_columns = {
-            "title": pandas.Series([], dtype=object),
-            "id": pandas.Series([], dtype=np.dtype(int)),
-            "is_bot": pandas.Series([], dtype=np.dtype(bool)),
-            "ts": pandas.Series([], dtype="datetime64[ns]"),
+            "name": pandas.Series([], dtype=object),
+            "number": pandas.Series([], dtype=np.dtype(int)),
+            "is_hurricane": pandas.Series([], dtype=np.dtype(bool)),
+            "iso_time": pandas.Series([], dtype="datetime64[ns]"),
         }
         expected_result = DataFrame(
-            empty_columns, columns=["title", "id", "is_bot", "ts"]
+            empty_columns,
+            columns=["name", "number", "is_hurricane", "iso_time"],
         )
-        expected_result["ts"] = expected_result["ts"].dt.tz_localize("UTC")
+        expected_result["iso_time"] = expected_result[
+            "iso_time"
+        ].dt.tz_localize("UTC")
         tm.assert_frame_equal(df, expected_result, check_index_type=False)
 
     def test_one_row_one_column(self, project_id):