Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

API: ignore empty range/object dtype in Index setop operations (string dtype compat) #60797

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
22 changes: 21 additions & 1 deletion pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,10 @@

import numpy as np

from pandas._config import get_option
from pandas._config import (
get_option,
using_string_dtype,
)

from pandas._libs import (
NaT,
Expand Down Expand Up @@ -6235,6 +6238,23 @@ def _find_common_type_compat(self, target) -> DtypeObj:
"""
target_dtype, _ = infer_dtype_from(target)

if using_string_dtype():
# special case: a zero-length RangeIndex or Index[object] is what the
# default empty constructors produce, so if either side is one of those,
# ignore its dtype and return the dtype of the other side
from pandas.core.indexes.range import RangeIndex

if len(self) == 0 and (
isinstance(self, RangeIndex) or self.dtype == np.object_
):
return target_dtype
if (
isinstance(target, Index)
and len(target) == 0
and (isinstance(target, RangeIndex) or target_dtype == np.object_)
):
return self.dtype

# special case: if one dtype is uint64 and the other a signed int, return object
# See https://github.com/pandas-dev/pandas/issues/26778 for discussion
# Now it's:
Expand Down
6 changes: 4 additions & 2 deletions pandas/tests/dtypes/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def test_concat_periodarray_2d():
_concat.concat_compat([arr[:2], arr[2:]], axis=1)


def test_concat_series_between_empty_and_tzaware_series():
def test_concat_series_between_empty_and_tzaware_series(using_infer_string):
tzaware_time = pd.Timestamp("2020-01-01T00:00:00+00:00")
ser1 = Series(index=[tzaware_time], data=0, dtype=float)
ser2 = Series(dtype=float)
Expand All @@ -57,7 +57,9 @@ def test_concat_series_between_empty_and_tzaware_series():
data=[
(0.0, None),
],
index=pd.Index([tzaware_time], dtype=object),
index=[tzaware_time]
if using_infer_string
else pd.Index([tzaware_time], dtype=object),
columns=[0, 1],
dtype=float,
)
Expand Down
3 changes: 0 additions & 3 deletions pandas/tests/frame/constructors/test_from_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas import (
DataFrame,
Index,
Expand Down Expand Up @@ -44,7 +42,6 @@ def test_constructor_single_row(self):
)
tm.assert_frame_equal(result, expected)

@pytest.mark.xfail(using_string_dtype(), reason="columns inferring logic broken")
def test_constructor_list_of_series(self):
data = [
OrderedDict([["a", 1.5], ["b", 3.0], ["c", 4.0]]),
Expand Down
7 changes: 1 addition & 6 deletions pandas/tests/frame/indexing/test_coercion.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,12 +88,7 @@ def test_26395(indexer_al):
df["D"] = 0

indexer_al(df)["C", "D"] = 2
expected = DataFrame(
{"D": [0, 0, 2]},
index=["A", "B", "C"],
columns=pd.Index(["D"], dtype=object),
dtype=np.int64,
)
expected = DataFrame({"D": [0, 0, 2]}, index=["A", "B", "C"], dtype=np.int64)
tm.assert_frame_equal(df, expected)

with pytest.raises(TypeError, match="Invalid value"):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/indexing/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1138,7 +1138,7 @@ def test_loc_setitem_datetimelike_with_inference(self):
result = df.dtypes
expected = Series(
[np.dtype("timedelta64[ns]")] * 6 + [np.dtype("datetime64[ns]")] * 2,
index=Index(list("ABCDEFGH"), dtype=object),
index=list("ABCDEFGH"),
)
tm.assert_series_equal(result, expected)

Expand Down
3 changes: 1 addition & 2 deletions pandas/tests/frame/indexing/test_insert.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,7 @@ def test_insert_with_columns_dups(self):
df.insert(0, "A", ["d", "e", "f"], allow_duplicates=True)
df.insert(0, "A", ["a", "b", "c"], allow_duplicates=True)
exp = DataFrame(
[["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]],
columns=Index(["A", "A", "A"], dtype=object),
[["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]], columns=["A", "A", "A"]
)
tm.assert_frame_equal(df, exp)

Expand Down
12 changes: 4 additions & 8 deletions pandas/tests/frame/indexing/test_setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,9 +150,7 @@ def test_setitem_empty_columns(self):
df["X"] = df.index
df["X"] = ["x", "y", "z"]
exp = DataFrame(
data={"X": ["x", "y", "z"]},
index=["A", "B", "C"],
columns=Index(["X"], dtype=object),
data={"X": ["x", "y", "z"]}, index=["A", "B", "C"], columns=["X"]
)
tm.assert_frame_equal(df, exp)

Expand All @@ -169,9 +167,7 @@ def test_setitem_timestamp_empty_columns(self):
df["now"] = Timestamp("20130101", tz="UTC")

expected = DataFrame(
[[Timestamp("20130101", tz="UTC")]] * 3,
index=range(3),
columns=Index(["now"], dtype=object),
[[Timestamp("20130101", tz="UTC")]] * 3, index=range(3), columns=["now"]
)
tm.assert_frame_equal(df, expected)

Expand Down Expand Up @@ -210,7 +206,7 @@ def test_setitem_period_preserves_dtype(self):
result = DataFrame([])
result["a"] = data

expected = DataFrame({"a": data}, columns=Index(["a"], dtype=object))
expected = DataFrame({"a": data}, columns=["a"])

tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -930,7 +926,7 @@ def test_setitem_scalars_no_index(self):
# GH#16823 / GH#17894
df = DataFrame()
df["foo"] = 1
expected = DataFrame(columns=Index(["foo"], dtype=object)).astype(np.int64)
expected = DataFrame(columns=["foo"]).astype(np.int64)
tm.assert_frame_equal(df, expected)

def test_setitem_newcol_tuple_key(self, float_frame):
Expand Down
3 changes: 0 additions & 3 deletions pandas/tests/frame/methods/test_dropna.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

import pandas as pd
from pandas import (
DataFrame,
Expand Down Expand Up @@ -184,7 +182,6 @@ def test_dropna_multiple_axes(self):
with pytest.raises(TypeError, match="supplying multiple axes"):
inp.dropna(how="all", axis=(0, 1), inplace=True)

@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_dropna_tz_aware_datetime(self):
# GH13407
df = DataFrame()
Expand Down
3 changes: 0 additions & 3 deletions pandas/tests/frame/methods/test_reset_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.core.dtypes.common import (
is_float_dtype,
is_integer_dtype,
Expand Down Expand Up @@ -644,7 +642,6 @@ def test_rest_index_multiindex_categorical_with_missing_values(self, codes):
tm.assert_frame_equal(res, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) - GH#60338")
@pytest.mark.parametrize(
"array, dtype",
[
Expand Down
3 changes: 0 additions & 3 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@
from numpy.ma import mrecords
import pytest

from pandas._config import using_string_dtype

from pandas._libs import lib
from pandas.compat.numpy import np_version_gt2
from pandas.errors import IntCastingNaNError
Expand Down Expand Up @@ -1974,7 +1972,6 @@ def test_constructor_with_datetimes4(self):
df = DataFrame({"value": dr})
assert str(df.iat[0, 0].tz) == "US/Eastern"

@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_constructor_with_datetimes5(self):
# GH 7822
# preserve an index with a tz on dict construction
Expand Down
1 change: 0 additions & 1 deletion pandas/tests/frame/test_query_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -791,7 +791,6 @@ def test_check_tz_aware_index_query(self, tz_aware_fixture):
tm.assert_frame_equal(result, expected)

expected = DataFrame(df_index)
expected.columns = expected.columns.astype(object)
result = df.reset_index().query('"2018-01-03 00:00:00+00" < time')
tm.assert_frame_equal(result, expected)

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1278,7 +1278,7 @@ def test_groupby_2d_malformed():
d["label"] = ["l1", "l2"]
tmp = d.groupby(["group"]).mean(numeric_only=True)
res_values = np.array([[0.0, 1.0], [0.0, 1.0]])
tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"], dtype=object))
tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"]))
tm.assert_numpy_array_equal(tmp.values, res_values)


Expand Down
3 changes: 1 addition & 2 deletions pandas/tests/indexes/base_class/test_setops.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,6 @@ def test_tuple_union_bug(self, method, expected, sort):
def test_union_name_preservation(
self, first_list, second_list, first_name, second_name, expected_name, sort
):
expected_dtype = object if not first_list or not second_list else "str"
first = Index(first_list, name=first_name)
second = Index(second_list, name=second_name)
union = first.union(second, sort=sort)
Expand All @@ -251,7 +250,7 @@ def test_union_name_preservation(
expected = Index(sorted(vals), name=expected_name)
tm.assert_index_equal(union, expected)
else:
expected = Index(vals, name=expected_name, dtype=expected_dtype)
expected = Index(vals, name=expected_name)
tm.assert_index_equal(union.sort_values(), expected.sort_values())

@pytest.mark.parametrize(
Expand Down
10 changes: 7 additions & 3 deletions pandas/tests/indexes/datetimes/test_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,13 +70,17 @@ def test_join_utc_convert(self, join_type):
assert isinstance(result, DatetimeIndex)
assert result.tz is timezone.utc

def test_datetimeindex_union_join_empty(self, sort):
def test_datetimeindex_union_join_empty(self, sort, using_infer_string):
dti = date_range(start="1/1/2001", end="2/1/2001", freq="D")
empty = Index([])

result = dti.union(empty, sort=sort)
expected = dti.astype("O")
tm.assert_index_equal(result, expected)
if using_infer_string:
assert isinstance(result, DatetimeIndex)
tm.assert_index_equal(result, dti)
else:
expected = dti.astype("O")
tm.assert_index_equal(result, expected)

result = dti.join(empty)
assert isinstance(result, DatetimeIndex)
Expand Down
14 changes: 13 additions & 1 deletion pandas/tests/indexes/test_setops.py
Original file line number Diff line number Diff line change
Expand Up @@ -530,7 +530,7 @@ def test_intersection_difference_match_empty(self, index, sort):
@pytest.mark.parametrize(
"method", ["intersection", "union", "difference", "symmetric_difference"]
)
def test_setop_with_categorical(index_flat, sort, method):
def test_setop_with_categorical(index_flat, sort, method, using_infer_string):
# MultiIndex tested separately in tests.indexes.multi.test_setops
index = index_flat

Expand All @@ -539,10 +539,22 @@ def test_setop_with_categorical(index_flat, sort, method):

result = getattr(index, method)(other, sort=sort)
expected = getattr(index, method)(index, sort=sort)
if (
using_infer_string
and index.empty
and method in ("union", "symmetric_difference")
):
expected = expected.astype("category")
tm.assert_index_equal(result, expected, exact=exact)

result = getattr(index, method)(other[:5], sort=sort)
expected = getattr(index, method)(index[:5], sort=sort)
if (
using_infer_string
and index.empty
and method in ("union", "symmetric_difference")
):
expected = expected.astype("category")
tm.assert_index_equal(result, expected, exact=exact)


Expand Down
7 changes: 1 addition & 6 deletions pandas/tests/indexing/test_at.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
CategoricalIndex,
DataFrame,
DatetimeIndex,
Index,
MultiIndex,
Series,
Timestamp,
Expand Down Expand Up @@ -67,11 +66,7 @@ def test_at_setitem_item_cache_cleared(self):
df.at[0, "x"] = 4
df.at[0, "cost"] = 789

expected = DataFrame(
{"x": [4], "cost": 789},
index=[0],
columns=Index(["x", "cost"], dtype=object),
)
expected = DataFrame({"x": [4], "cost": 789}, index=[0])
tm.assert_frame_equal(df, expected)

# And in particular, check that the _item_cache has updated correctly.
Expand Down
26 changes: 12 additions & 14 deletions pandas/tests/indexing/test_loc.py
Original file line number Diff line number Diff line change
Expand Up @@ -766,9 +766,9 @@ def test_loc_setitem_empty_frame(self):
# is inplace, so that dtype is retained
sera = Series(val1, index=keys1, dtype=np.float64)
serb = Series(val2, index=keys2)
expected = DataFrame(
{"A": sera, "B": serb}, columns=Index(["A", "B"], dtype=object)
).reindex(index=index)
expected = DataFrame({"A": sera, "B": serb}, columns=Index(["A", "B"])).reindex(
index=index
)
tm.assert_frame_equal(df, expected)

def test_loc_setitem_frame(self):
Expand Down Expand Up @@ -966,7 +966,7 @@ def test_setitem_new_key_tz(self, indexer_sl):
to_datetime(42).tz_localize("UTC"),
to_datetime(666).tz_localize("UTC"),
]
expected = Series(vals, index=Index(["foo", "bar"], dtype=object))
expected = Series(vals, index=Index(["foo", "bar"]))

ser = Series(dtype=object)
indexer_sl(ser)["foo"] = vals[0]
Expand Down Expand Up @@ -1966,15 +1966,11 @@ def test_loc_setitem_empty_series_str_idx(self):
# partially set with an empty object series
ser = Series(dtype=object)
ser.loc["foo"] = 1
tm.assert_series_equal(ser, Series([1], index=Index(["foo"], dtype=object)))
tm.assert_series_equal(ser, Series([1], index=Index(["foo"])))
ser.loc["bar"] = 3
tm.assert_series_equal(
ser, Series([1, 3], index=Index(["foo", "bar"], dtype=object))
)
tm.assert_series_equal(ser, Series([1, 3], index=Index(["foo", "bar"])))
ser.loc[3] = 4
tm.assert_series_equal(
ser, Series([1, 3, 4], index=Index(["foo", "bar", 3], dtype=object))
)
tm.assert_series_equal(ser, Series([1, 3, 4], index=Index(["foo", "bar", 3])))

def test_loc_setitem_incremental_with_dst(self):
# GH#20724
Expand All @@ -1996,7 +1992,7 @@ def test_loc_setitem_incremental_with_dst(self):
],
ids=["self", "to_datetime64", "to_pydatetime", "np.datetime64"],
)
def test_loc_setitem_datetime_keys_cast(self, conv):
def test_loc_setitem_datetime_keys_cast(self, conv, using_infer_string):
# GH#9516, GH#51363 changed in 3.0 to not cast on Index.insert
dt1 = Timestamp("20130101 09:00:00")
dt2 = Timestamp("20130101 10:00:00")
Expand All @@ -2006,8 +2002,10 @@ def test_loc_setitem_datetime_keys_cast(self, conv):

expected = DataFrame(
{"one": [100.0, 200.0]},
index=Index([conv(dt1), conv(dt2)], dtype=object),
columns=Index(["one"], dtype=object),
index=Index(
[conv(dt1), conv(dt2)], dtype=None if using_infer_string else object
),
columns=Index(["one"]),
)
tm.assert_frame_equal(df, expected)

Expand Down
Loading