From 6dd02166328361eb636e6f4e7dab87f31fa7d9f1 Mon Sep 17 00:00:00 2001 From: Jake Thomas Trevallion Date: Sun, 2 Feb 2025 09:35:22 +0000 Subject: [PATCH] Add clearer error messages for datatype mismatch in HDFStore.append. Raise ValueError when nan_rep too large for pytable column. Add and modify applicable test code. --- pandas/io/pytables.py | 11 +++++++++++ pandas/tests/io/pytables/test_append.py | 19 ++++++++++--------- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index e18db2e53113f..e387691d2ff64 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3524,6 +3524,14 @@ def validate(self, other) -> None: # Value of type "Optional[Any]" is not indexable [index] oax = ov[i] # type: ignore[index] if sax != oax: + ## Raise clearer error if mismatching type on values_axes + if c == "values_axes" and sax.kind != oax.kind: + raise TypeError( + f"Cannot serialize the column [{oax.values[0]}] " + f"because its data contents are not [{oax.kind}] " + f"but [{sax.kind}] object dtype" + ) + # Fallback if other source of difference raise ValueError( f"invalid combination of [{c}] on appending data " f"[{sax}] vs current table [{oax}]" @@ -5136,6 +5144,9 @@ def _maybe_convert_for_string_atom( data = bvalues.copy() data[mask] = nan_rep + if existing_col and mask.any() and len(nan_rep) > existing_col.itemsize: + raise ValueError("NaN representation is too large for existing column size") + # see if we have a valid string type inferred_type = lib.infer_dtype(data, skipna=False) if inferred_type != "string": diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py index 47658c0eb9012..eb0dd7cd9c142 100644 --- a/pandas/tests/io/pytables/test_append.py +++ b/pandas/tests/io/pytables/test_append.py @@ -421,6 +421,14 @@ def check_col(key, name, size): with pytest.raises(ValueError, match=msg): store.append("df_new", df_new) + # bigger NaN representation on next append + 
df_new = DataFrame([[124, "a"], [346, "b"]]) + store.append("df_new2", df_new) + df_new = DataFrame([[124, None], [346, "b"]]) + msg = "NaN representation is too large for existing column size" + with pytest.raises(ValueError, match=msg): + store.append("df_new2", df_new) + + # min_itemsize on Series index (GH 11412) df = DataFrame( { @@ -822,15 +830,8 @@ def test_append_raise(setup_path): df["foo"] = Timestamp("20130101") store.append("df", df) df["foo"] = "bar" - msg = re.escape( - "invalid combination of [values_axes] on appending data " - "[name->values_block_1,cname->values_block_1," - "dtype->bytes24,kind->string,shape->(1, 30)] " - "vs current table " - "[name->values_block_1,cname->values_block_1," - "dtype->datetime64[s],kind->datetime64[s],shape->None]" - ) - with pytest.raises(ValueError, match=msg): + msg = re.escape("Cannot serialize the column [foo] because its data contents are not ") + with pytest.raises(TypeError, match=msg): store.append("df", df)