Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

API: ignore empty range/object dtype in Index setop operations (string dtype compat) #60797

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions doc/source/whatsnew/v2.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,16 @@ These are bug fixes that might have notable behavior changes.
notable_bug_fix1
^^^^^^^^^^^^^^^^

.. _whatsnew_230.api_changes:

API changes
~~~~~~~~~~~

- When the ``future.infer_string`` option is enabled, Index set operations (like
union or intersection) now ignore the dtype of an empty ``RangeIndex`` or
empty ``Index`` with object dtype when determining the dtype of the resulting
Index (:issue:`60797`)

.. ---------------------------------------------------------------------------
.. _whatsnew_230.deprecations:

Expand Down
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,9 @@ Other API changes
- pickle and HDF (``.h5``) files created with Python 2 are no longer explicitly supported (:issue:`57387`)
- pickled objects from pandas version less than ``1.0.0`` are no longer supported (:issue:`57155`)
- when comparing the indexes in :func:`testing.assert_series_equal`, ``check_exact`` defaults to ``True`` if an :class:`Index` has an integer dtype. (:issue:`57386`)
- Index set operations (like union or intersection) will now ignore the dtype of
an empty ``RangeIndex`` or empty ``Index`` with object dtype when determining
the dtype of the resulting Index (:issue:`60797`)

.. ---------------------------------------------------------------------------
.. _whatsnew_300.deprecations:
Expand Down
31 changes: 30 additions & 1 deletion pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,10 @@

import numpy as np

from pandas._config import get_option
from pandas._config import (
get_option,
using_string_dtype,
)

from pandas._libs import (
NaT,
Expand Down Expand Up @@ -6235,6 +6238,24 @@ def _find_common_type_compat(self, target) -> DtypeObj:
"""
target_dtype, _ = infer_dtype_from(target)

if using_string_dtype():
# special case: if left or right is a zero-length RangeIndex or
# Index[object], those can be created by the default empty constructors
# -> for that case ignore this dtype and always return the other
# (https://github.com/pandas-dev/pandas/pull/60797)
from pandas.core.indexes.range import RangeIndex

if len(self) == 0 and (
isinstance(self, RangeIndex) or self.dtype == np.object_
):
return target_dtype
if (
isinstance(target, Index)
and len(target) == 0
and (isinstance(target, RangeIndex) or target_dtype == np.object_)
):
return self.dtype

# special case: if one dtype is uint64 and the other a signed int, return object
# See https://github.com/pandas-dev/pandas/issues/26778 for discussion
# Now it's:
Expand Down Expand Up @@ -6888,6 +6909,14 @@ def insert(self, loc: int, item) -> Index:

arr = self._values

if using_string_dtype() and len(self) == 0 and self.dtype == np.object_:
# special case: if we are an empty object-dtype Index, also
# take into account the inserted item for the resulting dtype
# (https://github.com/pandas-dev/pandas/pull/60797)
dtype = self._find_common_type_compat(item)
if dtype != self.dtype:
return self.astype(dtype).insert(loc, item)

try:
if isinstance(arr, ExtensionArray):
res_values = arr.insert(loc, item)
Expand Down
6 changes: 4 additions & 2 deletions pandas/tests/dtypes/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def test_concat_periodarray_2d():
_concat.concat_compat([arr[:2], arr[2:]], axis=1)


def test_concat_series_between_empty_and_tzaware_series():
def test_concat_series_between_empty_and_tzaware_series(using_infer_string):
tzaware_time = pd.Timestamp("2020-01-01T00:00:00+00:00")
ser1 = Series(index=[tzaware_time], data=0, dtype=float)
ser2 = Series(dtype=float)
Expand All @@ -57,7 +57,9 @@ def test_concat_series_between_empty_and_tzaware_series():
data=[
(0.0, None),
],
index=pd.Index([tzaware_time], dtype=object),
index=[tzaware_time]
if using_infer_string
else pd.Index([tzaware_time], dtype=object),
columns=[0, 1],
dtype=float,
)
Expand Down
3 changes: 0 additions & 3 deletions pandas/tests/frame/constructors/test_from_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas import (
DataFrame,
Index,
Expand Down Expand Up @@ -44,7 +42,6 @@ def test_constructor_single_row(self):
)
tm.assert_frame_equal(result, expected)

@pytest.mark.xfail(using_string_dtype(), reason="columns inferring logic broken")
def test_constructor_list_of_series(self):
data = [
OrderedDict([["a", 1.5], ["b", 3.0], ["c", 4.0]]),
Expand Down
7 changes: 1 addition & 6 deletions pandas/tests/frame/indexing/test_coercion.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,12 +88,7 @@ def test_26395(indexer_al):
df["D"] = 0

indexer_al(df)["C", "D"] = 2
expected = DataFrame(
{"D": [0, 0, 2]},
index=["A", "B", "C"],
columns=pd.Index(["D"], dtype=object),
dtype=np.int64,
)
expected = DataFrame({"D": [0, 0, 2]}, index=["A", "B", "C"], dtype=np.int64)
tm.assert_frame_equal(df, expected)

with pytest.raises(TypeError, match="Invalid value"):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/indexing/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1138,7 +1138,7 @@ def test_loc_setitem_datetimelike_with_inference(self):
result = df.dtypes
expected = Series(
[np.dtype("timedelta64[ns]")] * 6 + [np.dtype("datetime64[ns]")] * 2,
index=Index(list("ABCDEFGH"), dtype=object),
index=list("ABCDEFGH"),
)
tm.assert_series_equal(result, expected)

Expand Down
3 changes: 1 addition & 2 deletions pandas/tests/frame/indexing/test_insert.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,7 @@ def test_insert_with_columns_dups(self):
df.insert(0, "A", ["d", "e", "f"], allow_duplicates=True)
df.insert(0, "A", ["a", "b", "c"], allow_duplicates=True)
exp = DataFrame(
[["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]],
columns=Index(["A", "A", "A"], dtype=object),
[["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]], columns=["A", "A", "A"]
)
tm.assert_frame_equal(df, exp)

Expand Down
32 changes: 22 additions & 10 deletions pandas/tests/frame/indexing/test_setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,18 +144,32 @@ def test_setitem_different_dtype(self):
)
tm.assert_series_equal(result, expected)

def test_setitem_empty_columns(self):
# GH 13522
def test_setitem_overwrite_index(self):
# GH 13522 - assign the index as a column and then overwrite the values
# -> should not affect the index
df = DataFrame(index=["A", "B", "C"])
df["X"] = df.index
df["X"] = ["x", "y", "z"]
exp = DataFrame(
data={"X": ["x", "y", "z"]},
index=["A", "B", "C"],
columns=Index(["X"], dtype=object),
data={"X": ["x", "y", "z"]}, index=["A", "B", "C"], columns=["X"]
)
tm.assert_frame_equal(df, exp)

def test_setitem_empty_columns(self):
# Starting from an empty DataFrame and setting a column should result
# in a default string dtype for the columns' Index
# https://github.com/pandas-dev/pandas/issues/60338
#
# Both cases below assign the same "foo" column to a frame that has no
# columns yet and compare the result against a frame built directly
# from a dict keyed by "foo".

# Case 1: frame created with the no-argument constructor
df = DataFrame()
df["foo"] = [1, 2, 3]
expected = DataFrame({"foo": [1, 2, 3]})
tm.assert_frame_equal(df, expected)

# Case 2: frame created with an explicit empty Index as its columns
df = DataFrame(columns=Index([]))
df["foo"] = [1, 2, 3]
expected = DataFrame({"foo": [1, 2, 3]})
tm.assert_frame_equal(df, expected)

def test_setitem_dt64_index_empty_columns(self):
rng = date_range("1/1/2000 00:00:00", "1/1/2000 1:59:50", freq="10s")
df = DataFrame(index=np.arange(len(rng)))
Expand All @@ -169,9 +183,7 @@ def test_setitem_timestamp_empty_columns(self):
df["now"] = Timestamp("20130101", tz="UTC")

expected = DataFrame(
[[Timestamp("20130101", tz="UTC")]] * 3,
index=range(3),
columns=Index(["now"], dtype=object),
[[Timestamp("20130101", tz="UTC")]] * 3, index=range(3), columns=["now"]
)
tm.assert_frame_equal(df, expected)

Expand Down Expand Up @@ -210,7 +222,7 @@ def test_setitem_period_preserves_dtype(self):
result = DataFrame([])
result["a"] = data

expected = DataFrame({"a": data}, columns=Index(["a"], dtype=object))
expected = DataFrame({"a": data}, columns=["a"])

tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -930,7 +942,7 @@ def test_setitem_scalars_no_index(self):
# GH#16823 / GH#17894
df = DataFrame()
df["foo"] = 1
expected = DataFrame(columns=Index(["foo"], dtype=object)).astype(np.int64)
expected = DataFrame(columns=["foo"]).astype(np.int64)
tm.assert_frame_equal(df, expected)

def test_setitem_newcol_tuple_key(self, float_frame):
Expand Down
5 changes: 1 addition & 4 deletions pandas/tests/frame/methods/test_dropna.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,12 +182,9 @@ def test_dropna_multiple_axes(self):
with pytest.raises(TypeError, match="supplying multiple axes"):
inp.dropna(how="all", axis=(0, 1), inplace=True)

def test_dropna_tz_aware_datetime(self, using_infer_string):
def test_dropna_tz_aware_datetime(self):
# GH13407

df = DataFrame()
if using_infer_string:
df.columns = df.columns.astype("str")
dt1 = datetime.datetime(2015, 1, 1, tzinfo=dateutil.tz.tzutc())
dt2 = datetime.datetime(2015, 2, 2, tzinfo=dateutil.tz.tzutc())
df["Time"] = [dt1]
Expand Down
34 changes: 31 additions & 3 deletions pandas/tests/frame/methods/test_reset_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.core.dtypes.common import (
is_float_dtype,
is_integer_dtype,
Expand Down Expand Up @@ -644,7 +642,6 @@ def test_rest_index_multiindex_categorical_with_missing_values(self, codes):
tm.assert_frame_equal(res, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) - GH#60338")
@pytest.mark.parametrize(
"array, dtype",
[
Expand Down Expand Up @@ -781,3 +778,34 @@ def test_reset_index_false_index_name():
result_frame.reset_index()
expected_frame = DataFrame(range(5, 10), RangeIndex(range(5), name=False))
tm.assert_frame_equal(result_frame, expected_frame)


@pytest.mark.parametrize("columns", [None, Index([])])
def test_reset_index_with_empty_frame(columns):
# Currently empty DataFrame has RangeIndex or object dtype Index, but when
# resetting the index we still want to end up with the default string dtype
# https://github.com/pandas-dev/pandas/issues/60338
#
# ``columns`` is either None or an empty Index (see the parametrization).
# Each case builds a frame from only an index plus ``columns`` and checks
# the frame that reset_index() returns.

# empty single-level index named "foo" -> expect a frame with only a "foo" column
index = Index([], name="foo")
df = DataFrame(index=index, columns=columns)
result = df.reset_index()
expected = DataFrame(columns=["foo"])
tm.assert_frame_equal(result, expected)

# non-empty single-level index -> its values become the "foo" column
index = Index([1, 2, 3], name="foo")
df = DataFrame(index=index, columns=columns)
result = df.reset_index()
expected = DataFrame({"foo": [1, 2, 3]})
tm.assert_frame_equal(result, expected)

# empty two-level MultiIndex -> expect a frame with "foo" and "bar" columns
index = MultiIndex.from_tuples([], names=["foo", "bar"])
df = DataFrame(index=index, columns=columns)
result = df.reset_index()
expected = DataFrame(columns=["foo", "bar"])
tm.assert_frame_equal(result, expected)

# non-empty two-level MultiIndex -> each level becomes its own column
index = MultiIndex.from_tuples([(1, 2), (2, 3)], names=["foo", "bar"])
df = DataFrame(index=index, columns=columns)
result = df.reset_index()
expected = DataFrame({"foo": [1, 2], "bar": [2, 3]})
tm.assert_frame_equal(result, expected)
3 changes: 0 additions & 3 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@
from numpy.ma import mrecords
import pytest

from pandas._config import using_string_dtype

from pandas._libs import lib
from pandas.compat.numpy import np_version_gt2
from pandas.errors import IntCastingNaNError
Expand Down Expand Up @@ -1974,7 +1972,6 @@ def test_constructor_with_datetimes4(self):
df = DataFrame({"value": dr})
assert str(df.iat[0, 0].tz) == "US/Eastern"

@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_constructor_with_datetimes5(self):
# GH 7822
# preserve an index with a tz on dict construction
Expand Down
1 change: 0 additions & 1 deletion pandas/tests/frame/test_query_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -791,7 +791,6 @@ def test_check_tz_aware_index_query(self, tz_aware_fixture):
tm.assert_frame_equal(result, expected)

expected = DataFrame(df_index)
expected.columns = expected.columns.astype(object)
result = df.reset_index().query('"2018-01-03 00:00:00+00" < time')
tm.assert_frame_equal(result, expected)

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1278,7 +1278,7 @@ def test_groupby_2d_malformed():
d["label"] = ["l1", "l2"]
tmp = d.groupby(["group"]).mean(numeric_only=True)
res_values = np.array([[0.0, 1.0], [0.0, 1.0]])
tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"], dtype=object))
tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"]))
tm.assert_numpy_array_equal(tmp.values, res_values)


Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/indexes/base_class/test_reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def test_insert(self):

# test empty
null_index = Index([])
tm.assert_index_equal(Index(["a"], dtype=object), null_index.insert(0, "a"))
tm.assert_index_equal(Index(["a"]), null_index.insert(0, "a"))

def test_insert_missing(self, nulls_fixture, using_infer_string):
# GH#22295
Expand Down
3 changes: 1 addition & 2 deletions pandas/tests/indexes/base_class/test_setops.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,6 @@ def test_tuple_union_bug(self, method, expected, sort):
def test_union_name_preservation(
self, first_list, second_list, first_name, second_name, expected_name, sort
):
expected_dtype = object if not first_list or not second_list else "str"
first = Index(first_list, name=first_name)
second = Index(second_list, name=second_name)
union = first.union(second, sort=sort)
Expand All @@ -251,7 +250,7 @@ def test_union_name_preservation(
expected = Index(sorted(vals), name=expected_name)
tm.assert_index_equal(union, expected)
else:
expected = Index(vals, name=expected_name, dtype=expected_dtype)
expected = Index(vals, name=expected_name)
tm.assert_index_equal(union.sort_values(), expected.sort_values())

@pytest.mark.parametrize(
Expand Down
10 changes: 7 additions & 3 deletions pandas/tests/indexes/datetimes/test_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,13 +70,17 @@ def test_join_utc_convert(self, join_type):
assert isinstance(result, DatetimeIndex)
assert result.tz is timezone.utc

def test_datetimeindex_union_join_empty(self, sort):
def test_datetimeindex_union_join_empty(self, sort, using_infer_string):
dti = date_range(start="1/1/2001", end="2/1/2001", freq="D")
empty = Index([])

result = dti.union(empty, sort=sort)
expected = dti.astype("O")
tm.assert_index_equal(result, expected)
if using_infer_string:
assert isinstance(result, DatetimeIndex)
tm.assert_index_equal(result, dti)
else:
expected = dti.astype("O")
tm.assert_index_equal(result, expected)

result = dti.join(empty)
assert isinstance(result, DatetimeIndex)
Expand Down
10 changes: 6 additions & 4 deletions pandas/tests/indexes/test_old_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -454,10 +454,12 @@ def test_insert_out_of_bounds(self, index, using_infer_string):
else:
msg = "slice indices must be integers or None or have an __index__ method"

if using_infer_string and (
index.dtype == "string" or index.dtype == "category"
):
msg = "loc must be an integer between"
if using_infer_string:
if index.dtype == "string" or index.dtype == "category":
msg = "loc must be an integer between"
elif index.dtype == "object" and len(index) == 0:
msg = "loc must be an integer between"
err = TypeError

with pytest.raises(err, match=msg):
index.insert(0.5, "foo")
Expand Down
Loading
Loading