Skip to content

Commit 1ff6970

Browse files
authored
BUG: hash_pandas_object ignores optional arguments when the input is a DataFrame. (#42049)
1 parent 648eb40 commit 1ff6970

File tree

3 files changed

+31
-1
lines changed

3 files changed

+31
-1
lines changed

doc/source/whatsnew/v1.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -1215,6 +1215,7 @@ Other
12151215
- Bug in :class:`Series` backed by :class:`DatetimeArray` or :class:`TimedeltaArray` sometimes failing to set the array's ``freq`` to ``None`` (:issue:`41425`)
12161216
- Bug in creating a :class:`Series` from a ``range`` object that does not fit in the bounds of ``int64`` dtype (:issue:`30173`)
12171217
- Bug in creating a :class:`Series` from a ``dict`` with all-tuple keys and an :class:`Index` that requires reindexing (:issue:`41707`)
1218+
- Bug in :func:`pandas.util.hash_pandas_object` ignoring the ``hash_key``, ``encoding`` and ``categorize`` arguments when the input is a :class:`DataFrame` (:issue:`41404`)
12181219

12191220
.. ---------------------------------------------------------------------------
12201221

pandas/core/util/hashing.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,10 @@ def hash_pandas_object(
139139
ser = Series(h, index=obj.index, dtype="uint64", copy=False)
140140

141141
elif isinstance(obj, ABCDataFrame):
142-
hashes = (hash_array(series._values) for _, series in obj.items())
142+
hashes = (
143+
hash_array(series._values, encoding, hash_key, categorize)
144+
for _, series in obj.items()
145+
)
143146
num_items = len(obj.columns)
144147
if index:
145148
index_hash_generator = (

pandas/tests/util/test_hashing.py

+26
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,32 @@ def test_hash_keys():
255255
assert (a != b).all()
256256

257257

258+
def test_df_hash_keys():
259+
# DataFrame version of test_hash_keys.
260+
# https://github.com/pandas-dev/pandas/issues/41404
261+
obj = DataFrame({"x": np.arange(3), "y": list("abc")})
262+
263+
a = hash_pandas_object(obj, hash_key="9876543210123456")
264+
b = hash_pandas_object(obj, hash_key="9876543210123465")
265+
266+
assert (a != b).all()
267+
268+
269+
def test_df_encoding():
270+
# Check that hash_pandas_object honors the optional encoding argument for DataFrame input.
271+
# https://github.com/pandas-dev/pandas/issues/41404
272+
# https://github.com/pandas-dev/pandas/pull/42049
273+
obj = DataFrame({"x": np.arange(3), "y": list("a+c")})
274+
275+
a = hash_pandas_object(obj, encoding="utf8")
276+
b = hash_pandas_object(obj, encoding="utf7")
277+
278+
# Note that the "+" is encoded as "+-" in utf-7.
279+
assert a[0] == b[0]
280+
assert a[1] != b[1]
281+
assert a[2] == b[2]
282+
283+
258284
def test_invalid_key():
259285
# This only matters for object dtypes.
260286
msg = "key should be a 16-byte string encoded"

0 commit comments

Comments
 (0)