Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

API: ignore empty range/object dtype in Index setop operations (string dtype compat) #60797

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
31 changes: 30 additions & 1 deletion pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,10 @@

import numpy as np

from pandas._config import get_option
from pandas._config import (
get_option,
using_string_dtype,
)

from pandas._libs import (
NaT,
Expand Down Expand Up @@ -6235,6 +6238,24 @@ def _find_common_type_compat(self, target) -> DtypeObj:
"""
target_dtype, _ = infer_dtype_from(target)

if using_string_dtype():
# special case: if left or right is a zero-length RangeIndex or
# Index[object], those can be created by the default empty constructors
# -> for that case ignore this dtype and always return the other
# (https://github.com/pandas-dev/pandas/pull/60797)
from pandas.core.indexes.range import RangeIndex

if len(self) == 0 and (
isinstance(self, RangeIndex) or self.dtype == np.object_
):
return target_dtype
if (
isinstance(target, Index)
and len(target) == 0
and (isinstance(target, RangeIndex) or target_dtype == np.object_)
):
return self.dtype

# special case: if one dtype is uint64 and the other a signed int, return object
# See https://github.com/pandas-dev/pandas/issues/26778 for discussion
# Now it's:
Expand Down Expand Up @@ -6888,6 +6909,14 @@ def insert(self, loc: int, item) -> Index:

arr = self._values

if using_string_dtype() and len(self) == 0 and self.dtype == np.object_:
# special case: if we are an empty object-dtype Index, also
# take into account the inserted item for the resulting dtype
# (https://github.com/pandas-dev/pandas/pull/60797)
dtype = self._find_common_type_compat(item)
if dtype != self.dtype:
return self.astype(dtype).insert(loc, item)

try:
if isinstance(arr, ExtensionArray):
res_values = arr.insert(loc, item)
Expand Down
6 changes: 4 additions & 2 deletions pandas/tests/dtypes/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def test_concat_periodarray_2d():
_concat.concat_compat([arr[:2], arr[2:]], axis=1)


def test_concat_series_between_empty_and_tzaware_series():
def test_concat_series_between_empty_and_tzaware_series(using_infer_string):
tzaware_time = pd.Timestamp("2020-01-01T00:00:00+00:00")
ser1 = Series(index=[tzaware_time], data=0, dtype=float)
ser2 = Series(dtype=float)
Expand All @@ -57,7 +57,9 @@ def test_concat_series_between_empty_and_tzaware_series():
data=[
(0.0, None),
],
index=pd.Index([tzaware_time], dtype=object),
index=[tzaware_time]
if using_infer_string
else pd.Index([tzaware_time], dtype=object),
columns=[0, 1],
dtype=float,
)
Expand Down
3 changes: 0 additions & 3 deletions pandas/tests/frame/constructors/test_from_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas import (
DataFrame,
Index,
Expand Down Expand Up @@ -44,7 +42,6 @@ def test_constructor_single_row(self):
)
tm.assert_frame_equal(result, expected)

@pytest.mark.xfail(using_string_dtype(), reason="columns inferring logic broken")
def test_constructor_list_of_series(self):
data = [
OrderedDict([["a", 1.5], ["b", 3.0], ["c", 4.0]]),
Expand Down
7 changes: 1 addition & 6 deletions pandas/tests/frame/indexing/test_coercion.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,12 +88,7 @@ def test_26395(indexer_al):
df["D"] = 0

indexer_al(df)["C", "D"] = 2
expected = DataFrame(
{"D": [0, 0, 2]},
index=["A", "B", "C"],
columns=pd.Index(["D"], dtype=object),
dtype=np.int64,
)
expected = DataFrame({"D": [0, 0, 2]}, index=["A", "B", "C"], dtype=np.int64)
tm.assert_frame_equal(df, expected)

with pytest.raises(TypeError, match="Invalid value"):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/indexing/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1138,7 +1138,7 @@ def test_loc_setitem_datetimelike_with_inference(self):
result = df.dtypes
expected = Series(
[np.dtype("timedelta64[ns]")] * 6 + [np.dtype("datetime64[ns]")] * 2,
index=Index(list("ABCDEFGH"), dtype=object),
index=list("ABCDEFGH"),
)
tm.assert_series_equal(result, expected)

Expand Down
3 changes: 1 addition & 2 deletions pandas/tests/frame/indexing/test_insert.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,7 @@ def test_insert_with_columns_dups(self):
df.insert(0, "A", ["d", "e", "f"], allow_duplicates=True)
df.insert(0, "A", ["a", "b", "c"], allow_duplicates=True)
exp = DataFrame(
[["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]],
columns=Index(["A", "A", "A"], dtype=object),
[["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]], columns=["A", "A", "A"]
)
tm.assert_frame_equal(df, exp)

Expand Down
32 changes: 22 additions & 10 deletions pandas/tests/frame/indexing/test_setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,18 +144,32 @@ def test_setitem_different_dtype(self):
)
tm.assert_series_equal(result, expected)

def test_setitem_empty_columns(self):
# GH 13522
def test_setitem_overwrite_index(self):
# GH 13522 - assign the index as a column and then overwrite the values
# -> should not affect the index
df = DataFrame(index=["A", "B", "C"])
df["X"] = df.index
df["X"] = ["x", "y", "z"]
exp = DataFrame(
data={"X": ["x", "y", "z"]},
index=["A", "B", "C"],
columns=Index(["X"], dtype=object),
data={"X": ["x", "y", "z"]}, index=["A", "B", "C"], columns=["X"]
)
tm.assert_frame_equal(df, exp)

def test_setitem_empty_columns(self):
# Starting from an empty DataFrame and setting a column should result
# in a default string dtype for the columns' Index
# https://github.com/pandas-dev/pandas/issues/60338

df = DataFrame()
df["foo"] = [1, 2, 3]
expected = DataFrame({"foo": [1, 2, 3]})
tm.assert_frame_equal(df, expected)

df = DataFrame(columns=Index([]))
df["foo"] = [1, 2, 3]
expected = DataFrame({"foo": [1, 2, 3]})
tm.assert_frame_equal(df, expected)

def test_setitem_dt64_index_empty_columns(self):
rng = date_range("1/1/2000 00:00:00", "1/1/2000 1:59:50", freq="10s")
df = DataFrame(index=np.arange(len(rng)))
Expand All @@ -169,9 +183,7 @@ def test_setitem_timestamp_empty_columns(self):
df["now"] = Timestamp("20130101", tz="UTC")

expected = DataFrame(
[[Timestamp("20130101", tz="UTC")]] * 3,
index=range(3),
columns=Index(["now"], dtype=object),
[[Timestamp("20130101", tz="UTC")]] * 3, index=range(3), columns=["now"]
)
tm.assert_frame_equal(df, expected)

Expand Down Expand Up @@ -210,7 +222,7 @@ def test_setitem_period_preserves_dtype(self):
result = DataFrame([])
result["a"] = data

expected = DataFrame({"a": data}, columns=Index(["a"], dtype=object))
expected = DataFrame({"a": data}, columns=["a"])

tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -930,7 +942,7 @@ def test_setitem_scalars_no_index(self):
# GH#16823 / GH#17894
df = DataFrame()
df["foo"] = 1
expected = DataFrame(columns=Index(["foo"], dtype=object)).astype(np.int64)
expected = DataFrame(columns=["foo"]).astype(np.int64)
tm.assert_frame_equal(df, expected)

def test_setitem_newcol_tuple_key(self, float_frame):
Expand Down
5 changes: 1 addition & 4 deletions pandas/tests/frame/methods/test_dropna.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,12 +182,9 @@ def test_dropna_multiple_axes(self):
with pytest.raises(TypeError, match="supplying multiple axes"):
inp.dropna(how="all", axis=(0, 1), inplace=True)

def test_dropna_tz_aware_datetime(self, using_infer_string):
def test_dropna_tz_aware_datetime(self):
# GH13407

df = DataFrame()
if using_infer_string:
df.columns = df.columns.astype("str")
dt1 = datetime.datetime(2015, 1, 1, tzinfo=dateutil.tz.tzutc())
dt2 = datetime.datetime(2015, 2, 2, tzinfo=dateutil.tz.tzutc())
df["Time"] = [dt1]
Expand Down
34 changes: 31 additions & 3 deletions pandas/tests/frame/methods/test_reset_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.core.dtypes.common import (
is_float_dtype,
is_integer_dtype,
Expand Down Expand Up @@ -644,7 +642,6 @@ def test_rest_index_multiindex_categorical_with_missing_values(self, codes):
tm.assert_frame_equal(res, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) - GH#60338")
@pytest.mark.parametrize(
"array, dtype",
[
Expand Down Expand Up @@ -781,3 +778,34 @@ def test_reset_index_false_index_name():
result_frame.reset_index()
expected_frame = DataFrame(range(5, 10), RangeIndex(range(5), name=False))
tm.assert_frame_equal(result_frame, expected_frame)


@pytest.mark.parametrize("columns", [None, Index([])])
def test_reset_index_with_empty_frame(columns):
# Currently an empty DataFrame has a RangeIndex or object-dtype Index, but when
# resetting the index we still want to end up with the default string dtype
# https://github.com/pandas-dev/pandas/issues/60338

index = Index([], name="foo")
df = DataFrame(index=index, columns=columns)
result = df.reset_index()
expected = DataFrame(columns=["foo"])
tm.assert_frame_equal(result, expected)

index = Index([1, 2, 3], name="foo")
df = DataFrame(index=index, columns=columns)
result = df.reset_index()
expected = DataFrame({"foo": [1, 2, 3]})
tm.assert_frame_equal(result, expected)

index = MultiIndex.from_tuples([], names=["foo", "bar"])
df = DataFrame(index=index, columns=columns)
result = df.reset_index()
expected = DataFrame(columns=["foo", "bar"])
tm.assert_frame_equal(result, expected)

index = MultiIndex.from_tuples([(1, 2), (2, 3)], names=["foo", "bar"])
df = DataFrame(index=index, columns=columns)
result = df.reset_index()
expected = DataFrame({"foo": [1, 2], "bar": [2, 3]})
tm.assert_frame_equal(result, expected)
3 changes: 0 additions & 3 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@
from numpy.ma import mrecords
import pytest

from pandas._config import using_string_dtype

from pandas._libs import lib
from pandas.compat.numpy import np_version_gt2
from pandas.errors import IntCastingNaNError
Expand Down Expand Up @@ -1974,7 +1972,6 @@ def test_constructor_with_datetimes4(self):
df = DataFrame({"value": dr})
assert str(df.iat[0, 0].tz) == "US/Eastern"

@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_constructor_with_datetimes5(self):
# GH 7822
# preserve an index with a tz on dict construction
Expand Down
1 change: 0 additions & 1 deletion pandas/tests/frame/test_query_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -791,7 +791,6 @@ def test_check_tz_aware_index_query(self, tz_aware_fixture):
tm.assert_frame_equal(result, expected)

expected = DataFrame(df_index)
expected.columns = expected.columns.astype(object)
result = df.reset_index().query('"2018-01-03 00:00:00+00" < time')
tm.assert_frame_equal(result, expected)

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1278,7 +1278,7 @@ def test_groupby_2d_malformed():
d["label"] = ["l1", "l2"]
tmp = d.groupby(["group"]).mean(numeric_only=True)
res_values = np.array([[0.0, 1.0], [0.0, 1.0]])
tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"], dtype=object))
tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"]))
tm.assert_numpy_array_equal(tmp.values, res_values)


Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/indexes/base_class/test_reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def test_insert(self):

# test empty
null_index = Index([])
tm.assert_index_equal(Index(["a"], dtype=object), null_index.insert(0, "a"))
tm.assert_index_equal(Index(["a"]), null_index.insert(0, "a"))

def test_insert_missing(self, nulls_fixture, using_infer_string):
# GH#22295
Expand Down
3 changes: 1 addition & 2 deletions pandas/tests/indexes/base_class/test_setops.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,6 @@ def test_tuple_union_bug(self, method, expected, sort):
def test_union_name_preservation(
self, first_list, second_list, first_name, second_name, expected_name, sort
):
expected_dtype = object if not first_list or not second_list else "str"
first = Index(first_list, name=first_name)
second = Index(second_list, name=second_name)
union = first.union(second, sort=sort)
Expand All @@ -251,7 +250,7 @@ def test_union_name_preservation(
expected = Index(sorted(vals), name=expected_name)
tm.assert_index_equal(union, expected)
else:
expected = Index(vals, name=expected_name, dtype=expected_dtype)
expected = Index(vals, name=expected_name)
tm.assert_index_equal(union.sort_values(), expected.sort_values())

@pytest.mark.parametrize(
Expand Down
10 changes: 7 additions & 3 deletions pandas/tests/indexes/datetimes/test_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,13 +70,17 @@ def test_join_utc_convert(self, join_type):
assert isinstance(result, DatetimeIndex)
assert result.tz is timezone.utc

def test_datetimeindex_union_join_empty(self, sort):
def test_datetimeindex_union_join_empty(self, sort, using_infer_string):
dti = date_range(start="1/1/2001", end="2/1/2001", freq="D")
empty = Index([])

result = dti.union(empty, sort=sort)
expected = dti.astype("O")
tm.assert_index_equal(result, expected)
if using_infer_string:
assert isinstance(result, DatetimeIndex)
tm.assert_index_equal(result, dti)
else:
expected = dti.astype("O")
tm.assert_index_equal(result, expected)

result = dti.join(empty)
assert isinstance(result, DatetimeIndex)
Expand Down
10 changes: 6 additions & 4 deletions pandas/tests/indexes/test_old_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -454,10 +454,12 @@ def test_insert_out_of_bounds(self, index, using_infer_string):
else:
msg = "slice indices must be integers or None or have an __index__ method"

if using_infer_string and (
index.dtype == "string" or index.dtype == "category"
):
msg = "loc must be an integer between"
if using_infer_string:
if index.dtype == "string" or index.dtype == "category":
msg = "loc must be an integer between"
elif index.dtype == "object" and len(index) == 0:
msg = "loc must be an integer between"
err = TypeError

with pytest.raises(err, match=msg):
index.insert(0.5, "foo")
Expand Down
14 changes: 13 additions & 1 deletion pandas/tests/indexes/test_setops.py
Original file line number Diff line number Diff line change
Expand Up @@ -530,7 +530,7 @@ def test_intersection_difference_match_empty(self, index, sort):
@pytest.mark.parametrize(
"method", ["intersection", "union", "difference", "symmetric_difference"]
)
def test_setop_with_categorical(index_flat, sort, method):
def test_setop_with_categorical(index_flat, sort, method, using_infer_string):
# MultiIndex tested separately in tests.indexes.multi.test_setops
index = index_flat

Expand All @@ -539,10 +539,22 @@ def test_setop_with_categorical(index_flat, sort, method):

result = getattr(index, method)(other, sort=sort)
expected = getattr(index, method)(index, sort=sort)
if (
using_infer_string
and index.empty
and method in ("union", "symmetric_difference")
):
expected = expected.astype("category")
tm.assert_index_equal(result, expected, exact=exact)

result = getattr(index, method)(other[:5], sort=sort)
expected = getattr(index, method)(index[:5], sort=sort)
if (
using_infer_string
and index.empty
and method in ("union", "symmetric_difference")
):
expected = expected.astype("category")
tm.assert_index_equal(result, expected, exact=exact)


Expand Down
Loading
Loading