Fix BUG: read_sql tries to convert blob/varbinary to string with pyarrow backend #60105
base: main
Changes from 27 commits
@@ -4358,3 +4358,46 @@ def test_xsqlite_if_exists(sqlite_buildin):
        (5, "E"),
    ]
    drop_table(table_name, sqlite_buildin)


@pytest.mark.parametrize("con", all_connectable)
@pytest.mark.parametrize("dtype_backend", ["pyarrow", "numpy_nullable", lib.no_default])
def test_bytes_column(con, dtype_backend, request):
    # GitHub Issue #59242
    conn = request.getfixturevalue(con)
    pa = pytest.importorskip("pyarrow")

    dtype = "O"
Reviewer: Hmm, why is this defaulting to object out here? That seems like something to avoid for the pyarrow backend. I think you are doing that in this test anyway, but the structure is confusing. I would just set up the expected data type from the backend up front, so something like:

    if dtype_backend == "pyarrow":
        exp_dtype = ...
    else:
        ...

Author: Makes sense, let me know if the modified structure is less confusing!
    val = b"\x01#Eg\x89\xab\xcd\xef\x01#Eg\x89\xab\xcd\xef"

Reviewer: What does this value represent?
Author: This is the raw bytes returned from the hex string.
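For reference, the value is just the decoded form of the hex literal that the query below selects; a quick standalone check (plain Python, not part of the test):

    raw = bytes.fromhex("0123456789abcdef0123456789abcdef")
    assert raw == b"\x01#Eg\x89\xab\xcd\xef\x01#Eg\x89\xab\xcd\xef"
    # The repr shows '#', 'E', 'g' because 0x23, 0x45 and 0x67 are printable ASCII.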
if "postgres" in con: | ||
val = ( | ||
b"\x00\x00\x00\x80\x01#Eg\x89\xab\xcd\xef\x01#Eg\x89\xab\xcd\xef" | ||
if "adbc" in con | ||
else "0000000100100011010001010110011110001001101010" | ||
"11110011011110111100000001001000110100010101100" | ||
"11110001001101010111100110111101111" | ||
) | ||
if dtype_backend == "pyarrow": | ||
dtype = ( | ||
pd.ArrowDtype(pa.string()) | ||
if "adbc" not in con | ||
else pd.ArrowDtype(pa.opaque(pa.binary(), "bit", "PostgreSQL")) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why is this an opaque type instead of just binary? That seems like a bug somewhere in this implementation or the driver There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is coming from
|
||
) | ||
    if "psycopg2" in con:
        if dtype_backend == "numpy_nullable":
            dtype = pd.StringDtype()
        elif dtype_backend == lib.no_default and pd.options.future.infer_string:
            dtype = pd.StringDtype(storage="pyarrow", na_value=np.nan)

    if "postgres" not in con and dtype_backend == "pyarrow":
        dtype = pd.ArrowDtype(pa.binary())

    expected = DataFrame([{"a": val}], dtype=dtype)
    df = pd.read_sql(
        "select x'0123456789abcdef0123456789abcdef' a",
        conn,
        dtype_backend=dtype_backend,
    )
    tm.assert_frame_equal(df, expected)
Reviewer: Hmm, AFAIK the OpaqueType is returned when there is no suitable database type, but isn't binary already returned directly? It is a bit unclear what this additional isinstance check is for.
Author: I had to add this because adbc-driver-postgresql is returning an OpaqueType - without it, an exception is thrown.
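A minimal standalone sketch, under stated assumptions, of the kind of guard being discussed: if a column arrives as a pyarrow OpaqueType extension (as adbc-driver-postgresql does for the PostgreSQL bit column here), unwrap it to its storage array rather than attempting a string conversion. This is not the PR's actual change; the helper name is made up, and pa.opaque / pa.OpaqueType require a recent pyarrow release:

    import pyarrow as pa

    def unwrap_opaque(arr: pa.Array) -> pa.Array:
        # Hypothetical guard: extension arrays expose their underlying storage
        # array, so an OpaqueType-wrapped binary column falls back to binary.
        if isinstance(arr.type, pa.OpaqueType):
            return arr.storage
        return arr

    # Build an opaque-typed array the same way the test's expected dtype does.
    opaque_type = pa.opaque(pa.binary(), "bit", "PostgreSQL")
    storage = pa.array([b"\x01#Eg\x89\xab\xcd\xef"], type=pa.binary())
    ext_arr = pa.ExtensionArray.from_storage(opaque_type, storage)
    print(unwrap_opaque(ext_arr).type)  # binary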