TST(string dtype): Resolve xfails in pytables #60795

Merged: 11 commits, Feb 10, 2025
3 changes: 3 additions & 0 deletions pandas/io/pytables.py
@@ -5118,6 +5118,9 @@ def _maybe_convert_for_string_atom(
errors,
columns: list[str],
):
if isinstance(bvalues.dtype, StringDtype):
# "ndarray[Any, Any]" has no attribute "to_numpy"
bvalues = bvalues.to_numpy() # type: ignore[union-attr]
if bvalues.dtype != object:
return bvalues
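For context, a minimal sketch (outside the pandas internals) of what the added conversion relies on: calling `to_numpy()` on a StringDtype-backed array yields a plain NumPy object array, so the existing object-dtype string-atom path below can handle it unchanged.

```python
import pandas as pd

# A StringDtype-backed array materialized as a plain NumPy array: the result
# has object dtype, which is what the string-atom handling below expects.
values = pd.array(["foo", "bar", pd.NA], dtype="string")
np_values = values.to_numpy()
print(np_values.dtype)  # object
print(np_values)        # ['foo' 'bar' <NA>]
```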

56 changes: 30 additions & 26 deletions pandas/tests/io/pytables/test_append.py
@@ -25,10 +25,7 @@
ensure_clean_store,
)

pytestmark = [
pytest.mark.single_cpu,
pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
]
pytestmark = [pytest.mark.single_cpu]

tables = pytest.importorskip("tables")

@@ -40,7 +37,7 @@ def test_append(setup_path):
# tables.NaturalNameWarning):
df = DataFrame(
np.random.default_rng(2).standard_normal((20, 4)),
columns=Index(list("ABCD"), dtype=object),
columns=Index(list("ABCD")),
index=date_range("2000-01-01", periods=20, freq="B"),
)
_maybe_remove(store, "df1")
@@ -203,7 +200,7 @@ def test_append_some_nans(setup_path):
tm.assert_frame_equal(store["df3"], df3, check_index_type=True)


def test_append_all_nans(setup_path):
def test_append_all_nans(setup_path, using_infer_string):
with ensure_clean_store(setup_path) as store:
df = DataFrame(
{
@@ -255,7 +252,13 @@ def test_append_all_nans(setup_path):
_maybe_remove(store, "df")
store.append("df", df[:10], dropna=True)
store.append("df", df[10:], dropna=True)
tm.assert_frame_equal(store["df"], df, check_index_type=True)
result = store["df"]
expected = df
if using_infer_string:
# TODO: Test is incorrect when not using_infer_string.
# Should take the last 4 rows unconditionally.
expected = expected[-4:]
tm.assert_frame_equal(result, expected, check_index_type=True)

_maybe_remove(store, "df2")
store.append("df2", df[:10], dropna=False)
@@ -294,7 +297,7 @@ def test_append_frame_column_oriented(setup_path, request):
# column oriented
df = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
columns=Index(list("ABCD")),
index=date_range("2000-01-01", periods=10, freq="B"),
)
df.index = df.index._with_freq(None) # freq doesn't round-trip
@@ -426,7 +429,7 @@ def check_col(key, name, size):
{
"A": [0.0, 1.0, 2.0, 3.0, 4.0],
"B": [0.0, 1.0, 0.0, 1.0, 0.0],
"C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"], dtype=object),
"C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"]),
"D": date_range("20130101", periods=5),
}
).set_index("C")
@@ -453,7 +456,7 @@ def check_col(key, name, size):
_maybe_remove(store, "df")
df = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
columns=Index(list("ABCD")),
index=date_range("2000-01-01", periods=10, freq="B"),
)
df["string"] = "foo"
@@ -513,11 +516,12 @@ def test_append_with_empty_string(setup_path):
tm.assert_frame_equal(store.select("df"), df)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_append_with_data_columns(setup_path):
with ensure_clean_store(setup_path) as store:
df = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
columns=Index(list("ABCD")),
index=date_range("2000-01-01", periods=10, freq="B"),
)
df.iloc[0, df.columns.get_loc("B")] = 1.0
@@ -693,8 +697,8 @@ def test_append_misc(setup_path):
with ensure_clean_store(setup_path) as store:
df = DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=Index(list("ABCD"), dtype=object),
index=Index([f"i-{i}" for i in range(30)], dtype=object),
columns=Index(list("ABCD")),
index=Index([f"i-{i}" for i in range(30)]),
)
store.append("df", df, chunksize=1)
result = store.select("df")
@@ -710,8 +714,8 @@ def test_append_misc_chunksize(setup_path, chunksize):
# more chunksize in append tests
df = DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=Index(list("ABCD"), dtype=object),
index=Index([f"i-{i}" for i in range(30)], dtype=object),
columns=Index(list("ABCD")),
index=Index([f"i-{i}" for i in range(30)]),
)
df["string"] = "foo"
df["float322"] = 1.0
@@ -747,15 +751,15 @@ def test_append_misc_empty_frame(setup_path):
tm.assert_frame_equal(store.select("df2"), df)


def test_append_raise(setup_path):
def test_append_raise(setup_path, using_infer_string):
with ensure_clean_store(setup_path) as store:
# test append with invalid input to get good error messages

# list in column
df = DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=Index(list("ABCD"), dtype=object),
index=Index([f"i-{i}" for i in range(30)], dtype=object),
columns=Index(list("ABCD")),
index=Index([f"i-{i}" for i in range(30)]),
)
df["invalid"] = [["a"]] * len(df)
assert df.dtypes["invalid"] == np.object_
@@ -775,8 +779,8 @@ def test_append_raise(setup_path):
# datetime with embedded nans as object
df = DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=Index(list("ABCD"), dtype=object),
index=Index([f"i-{i}" for i in range(30)], dtype=object),
columns=Index(list("ABCD")),
index=Index([f"i-{i}" for i in range(30)]),
)
s = Series(datetime.datetime(2001, 1, 2), index=df.index)
s = s.astype(object)
@@ -803,8 +807,8 @@ def test_append_raise(setup_path):
# appending an incompatible table
df = DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=Index(list("ABCD"), dtype=object),
index=Index([f"i-{i}" for i in range(30)], dtype=object),
columns=Index(list("ABCD")),
index=Index([f"i-{i}" for i in range(30)]),
)
store.append("df", df)

@@ -881,7 +885,7 @@ def test_append_with_timedelta(setup_path):
def test_append_to_multiple(setup_path):
df1 = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
columns=Index(list("ABCD")),
index=date_range("2000-01-01", periods=10, freq="B"),
)
df2 = df1.copy().rename(columns="{}_2".format)
@@ -918,12 +922,12 @@ def test_append_to_multiple(setup_path):
def test_append_to_multiple_dropna(setup_path):
df1 = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
columns=Index(list("ABCD")),
index=date_range("2000-01-01", periods=10, freq="B"),
)
df2 = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
columns=Index(list("ABCD")),
index=date_range("2000-01-01", periods=10, freq="B"),
).rename(columns="{}_2".format)
df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan
@@ -943,7 +947,7 @@ def test_append_to_multiple_dropna(setup_path):
def test_append_to_multiple_dropna_false(setup_path):
df1 = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
columns=Index(list("ABCD")),
index=date_range("2000-01-01", periods=10, freq="B"),
)
df2 = df1.copy().rename(columns="{}_2".format)
6 changes: 2 additions & 4 deletions pandas/tests/io/pytables/test_categorical.py
@@ -16,10 +16,7 @@
ensure_clean_store,
)

pytestmark = [
pytest.mark.single_cpu,
pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
]
pytestmark = [pytest.mark.single_cpu]


def test_categorical(setup_path):
@@ -143,6 +140,7 @@ def test_categorical(setup_path):
store.select("df3/meta/s/meta")


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_categorical_conversion(tmp_path, setup_path):
# GH13322
# Check that read_hdf with categorical columns doesn't return rows if
6 changes: 0 additions & 6 deletions pandas/tests/io/pytables/test_complex.py
@@ -1,8 +1,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

import pandas as pd
from pandas import (
DataFrame,
@@ -13,10 +11,6 @@

from pandas.io.pytables import read_hdf

pytestmark = pytest.mark.xfail(
using_string_dtype(), reason="TODO(infer_string)", strict=False
)


def test_complex_fixed(tmp_path, setup_path):
df = DataFrame(
18 changes: 9 additions & 9 deletions pandas/tests/io/pytables/test_errors.py
@@ -5,8 +5,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas import (
CategoricalIndex,
DataFrame,
@@ -24,10 +22,7 @@
_maybe_adjust_name,
)

pytestmark = [
pytest.mark.single_cpu,
pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
]
pytestmark = [pytest.mark.single_cpu]


def test_pass_spec_to_storer(setup_path):
@@ -93,9 +88,14 @@ def test_unimplemented_dtypes_table_columns(setup_path):

with ensure_clean_store(setup_path) as store:
# this fails because we have a date in the object block......
msg = re.escape(
"""Cannot serialize the column [datetime1]
because its data contents are not [string] but [date] object dtype"""
msg = "|".join(
[
re.escape(
"Cannot serialize the column [datetime1]\nbecause its data "
"contents are not [string] but [date] object dtype"
),
re.escape("[date] is not implemented as a table column"),
Member commented:

I think the original error message here is much clearer - is there no way to catch and raise that for the string types?

rhshadrach (Member, Author) commented on Feb 4, 2025:

Not sure what you mean, this error is not raised for string types. It's being raised for date types.

When infer_string=False, this function is passed a single object block with a mix of strings and dates. In this case, the data of the block is inferred as mixed, and then checked column-by-column. This is where the top message (which I think is confusing) is raised. When infer_string=True, each string array is fed into this function individually and does not raise. Then the object block is fed in containing only dates. This is inferred as dates, and the corresponding error message is raised.
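To make the two code paths concrete, here is a hedged sketch (the column name string1 is hypothetical, and it assumes the `future.infer_string` option available in recent pandas versions): with the option off, strings and dates share one mixed object block and the column-by-column check emits the longer message; with it on, the string column is split out and only the date column stays object, which produces the shorter "[date] is not implemented as a table column" error.

```python
import datetime
import pandas as pd

data = {
    "string1": ["a", "b"],
    "datetime1": [datetime.date(2001, 1, 2), datetime.date(2001, 1, 3)],
}

# Option off: both columns are object dtype and end up in one mixed block.
with pd.option_context("future.infer_string", False):
    df_legacy = pd.DataFrame(data)
print(df_legacy.dtypes)

# Option on: "string1" is inferred as the string dtype; only "datetime1"
# remains object, so the serializer sees a date-only object block.
with pd.option_context("future.infer_string", True):
    df_str = pd.DataFrame(data)
print(df_str.dtypes)
```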

]
)
with pytest.raises(TypeError, match=msg):
store.append("df_unimplemented", df)
10 changes: 2 additions & 8 deletions pandas/tests/io/pytables/test_file_handling.py
@@ -3,8 +3,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.compat import (
PY311,
is_ci_environment,
@@ -35,9 +33,7 @@
from pandas.io import pytables
from pandas.io.pytables import Term

pytestmark = [
pytest.mark.single_cpu,
]
pytestmark = [pytest.mark.single_cpu]


@pytest.mark.parametrize("mode", ["r", "r+", "a", "w"])
@@ -329,7 +325,6 @@ def test_complibs(tmp_path, lvl, lib, request):
assert node.filters.complib == lib


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.skipif(
not is_platform_little_endian(), reason="reason platform is not little endian"
)
@@ -347,7 +342,6 @@ def test_encoding(setup_path):
tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize(
"val",
[
@@ -362,7 +356,7 @@ def test_encoding(setup_path):
[b"A\xf8\xfc", np.nan, b"", b"b", b"c"],
],
)
@pytest.mark.parametrize("dtype", ["category", object])
@pytest.mark.parametrize("dtype", ["category", None])
def test_latin_encoding(tmp_path, setup_path, dtype, val):
enc = "latin-1"
nan_rep = ""
7 changes: 1 addition & 6 deletions pandas/tests/io/pytables/test_keys.py
@@ -1,8 +1,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas import (
DataFrame,
HDFStore,
@@ -15,10 +13,7 @@
tables,
)

pytestmark = [
pytest.mark.single_cpu,
pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
]
pytestmark = [pytest.mark.single_cpu]


def test_keys(setup_path):
4 changes: 1 addition & 3 deletions pandas/tests/io/pytables/test_put.py
@@ -22,9 +22,7 @@
)
from pandas.util import _test_decorators as td

pytestmark = [
pytest.mark.single_cpu,
]
pytestmark = [pytest.mark.single_cpu]


def test_format_type(tmp_path, setup_path):
16 changes: 10 additions & 6 deletions pandas/tests/io/pytables/test_read.py
@@ -26,10 +26,7 @@

from pandas.io.pytables import TableIterator

pytestmark = [
pytest.mark.single_cpu,
pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
]
pytestmark = [pytest.mark.single_cpu]


def test_read_missing_key_close_store(tmp_path, setup_path):
@@ -75,10 +72,11 @@ def test_read_missing_key_opened_store(tmp_path, setup_path):
read_hdf(store, "k1")


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_read_column(setup_path):
df = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
columns=Index(list("ABCD")),
index=date_range("2000-01-01", periods=10, freq="B"),
)

@@ -175,7 +173,7 @@ def test_pytables_native2_read(datapath):
assert isinstance(d1, DataFrame)


def test_read_hdf_open_store(tmp_path, setup_path):
def test_read_hdf_open_store(tmp_path, setup_path, using_infer_string):
# GH10330
# No check for non-string path_or-buf, and no test of open store
df = DataFrame(
@@ -187,6 +185,12 @@ def test_read_hdf_open_store(tmp_path, setup_path):
df = df.set_index(keys="E", append=True)

path = tmp_path / setup_path
if using_infer_string:
# TODO(infer_string) make this work for string dtype
msg = "Saving a MultiIndex with an extension dtype is not supported."
with pytest.raises(NotImplementedError, match=msg):
df.to_hdf(path, key="df", mode="w")
return
df.to_hdf(path, key="df", mode="w")
direct = read_hdf(path, "df")
with HDFStore(path, mode="r") as store:
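For reference, a hedged sketch of the user-facing behavior the hunk above asserts (hypothetical file name, requires the optional `tables` dependency, and assumes the appended index level carries an extension string dtype):

```python
import pandas as pd

df = pd.DataFrame({"A": [1.0, 2.0], "E": pd.array(["x", "y"], dtype="string")})
df = df.set_index("E", append=True)  # MultiIndex with a string-dtype level

try:
    # Writing a MultiIndex that contains an extension-dtype level in the
    # default fixed format is not implemented yet, so this is expected to raise.
    df.to_hdf("example.h5", key="df", mode="w")
except NotImplementedError as err:
    print(err)  # Saving a MultiIndex with an extension dtype is not supported.
```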
49 changes: 28 additions & 21 deletions pandas/tests/io/pytables/test_round_trip.py
@@ -4,8 +4,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas._libs.tslibs import Timestamp
from pandas.compat import is_platform_windows

@@ -26,10 +24,7 @@
)
from pandas.util import _test_decorators as td

pytestmark = [
pytest.mark.single_cpu,
pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
]
pytestmark = [pytest.mark.single_cpu]


def test_conv_read_write():
@@ -49,8 +44,8 @@ def roundtrip(key, obj, **kwargs):

o = DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=Index(list("ABCD"), dtype=object),
index=Index([f"i-{i}" for i in range(30)], dtype=object),
columns=Index(list("ABCD")),
index=Index([f"i-{i}" for i in range(30)]),
)
tm.assert_frame_equal(o, roundtrip("frame", o))

@@ -150,8 +145,8 @@ def test_api_invalid(tmp_path, setup_path):
# Invalid.
df = DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=Index(list("ABCD"), dtype=object),
index=Index([f"i-{i}" for i in range(30)], dtype=object),
columns=Index(list("ABCD")),
index=Index([f"i-{i}" for i in range(30)]),
)

msg = "Can only append to Tables"
@@ -201,7 +196,7 @@ def test_put_integer(setup_path):
_check_roundtrip(df, tm.assert_frame_equal, setup_path)


def test_table_values_dtypes_roundtrip(setup_path):
def test_table_values_dtypes_roundtrip(setup_path, using_infer_string):
with ensure_clean_store(setup_path) as store:
df1 = DataFrame({"a": [1, 2, 3]}, dtype="f8")
store.append("df_f8", df1)
@@ -246,6 +241,7 @@ def test_table_values_dtypes_roundtrip(setup_path):
store.append("df_mixed_dtypes1", df1)
result = store.select("df_mixed_dtypes1").dtypes.value_counts()
result.index = [str(i) for i in result.index]
str_dtype = "str" if using_infer_string else "object"
expected = Series(
{
"float32": 2,
@@ -255,7 +251,7 @@ def test_table_values_dtypes_roundtrip(setup_path):
"int16": 1,
"int8": 1,
"int64": 1,
"object": 1,
str_dtype: 1,
"datetime64[s]": 2,
"datetime64[ms]": 1,
"datetime64[ns]": 1,
@@ -277,10 +273,10 @@ def test_series(setup_path):
)
_check_roundtrip(ts, tm.assert_series_equal, path=setup_path)

ts2 = Series(ts.index, Index(ts.index, dtype=object))
ts2 = Series(ts.index, Index(ts.index))
_check_roundtrip(ts2, tm.assert_series_equal, path=setup_path)

ts3 = Series(ts.values, Index(np.asarray(ts.index, dtype=object), dtype=object))
ts3 = Series(ts.values, Index(np.asarray(ts.index)))
_check_roundtrip(
ts3, tm.assert_series_equal, path=setup_path, check_index_type=False
)
@@ -370,8 +366,8 @@ def test_timeseries_preepoch(setup_path, request):
def test_frame(compression, setup_path):
df = DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=Index(list("ABCD"), dtype=object),
index=Index([f"i-{i}" for i in range(30)], dtype=object),
columns=Index(list("ABCD")),
index=Index([f"i-{i}" for i in range(30)]),
)

# put in some random NAs
@@ -387,7 +383,7 @@ def test_frame(compression, setup_path):

tdf = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
columns=Index(list("ABCD")),
index=date_range("2000-01-01", periods=10, freq="B"),
)
_check_roundtrip(
@@ -402,7 +398,10 @@ def test_frame(compression, setup_path):
assert recons._mgr.is_consolidated()

# empty
_check_roundtrip(df[:0], tm.assert_frame_equal, path=setup_path)
df2 = df[:0]
# Prevent df2 from having index with inferred_type as string
df2.index = Index([])
_check_roundtrip(df2[:0], tm.assert_frame_equal, path=setup_path)


def test_empty_series_frame(setup_path):
@@ -434,9 +433,17 @@ def test_can_serialize_dates(setup_path):
_check_roundtrip(frame, tm.assert_frame_equal, path=setup_path)


def test_store_hierarchical(setup_path, multiindex_dataframe_random_data):
def test_store_hierarchical(
setup_path, using_infer_string, multiindex_dataframe_random_data
):
frame = multiindex_dataframe_random_data

if using_infer_string:
# TODO(infer_string) make this work for string dtype
msg = "Saving a MultiIndex with an extension dtype is not supported."
with pytest.raises(NotImplementedError, match=msg):
_check_roundtrip(frame, tm.assert_frame_equal, path=setup_path)
return
_check_roundtrip(frame, tm.assert_frame_equal, path=setup_path)
_check_roundtrip(frame.T, tm.assert_frame_equal, path=setup_path)
_check_roundtrip(frame["A"], tm.assert_series_equal, path=setup_path)
@@ -455,8 +462,8 @@ def test_store_mixed(compression, setup_path):
def _make_one():
df = DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=Index(list("ABCD"), dtype=object),
index=Index([f"i-{i}" for i in range(30)], dtype=object),
columns=Index(list("ABCD")),
index=Index([f"i-{i}" for i in range(30)]),
)
df["obj1"] = "foo"
df["obj2"] = "bar"
44 changes: 21 additions & 23 deletions pandas/tests/io/pytables/test_select.py
@@ -27,10 +27,7 @@

from pandas.io.pytables import Term

pytestmark = [
pytest.mark.single_cpu,
pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
]
pytestmark = [pytest.mark.single_cpu]


def test_select_columns_in_where(setup_path):
@@ -138,7 +135,7 @@ def test_select(setup_path):
# select with columns=
df = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
columns=Index(list("ABCD")),
index=date_range("2000-01-01", periods=10, freq="B"),
)
_maybe_remove(store, "df")
@@ -278,8 +275,8 @@ def test_select_dtypes(setup_path, request):
with ensure_clean_store(setup_path) as store:
df = DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=Index(list("ABCD"), dtype=object),
index=Index([f"i-{i}" for i in range(30)], dtype=object),
columns=Index(list("ABCD")),
index=Index([f"i-{i}" for i in range(30)]),
)

expected = df[df["A"] > 0]
@@ -350,7 +347,7 @@ def test_select_iterator(tmp_path, setup_path):
with ensure_clean_store(setup_path) as store:
df = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
columns=Index(list("ABCD")),
index=date_range("2000-01-01", periods=10, freq="B"),
)
_maybe_remove(store, "df")
@@ -375,7 +372,7 @@ def test_select_iterator(tmp_path, setup_path):

df = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
columns=Index(list("ABCD")),
index=date_range("2000-01-01", periods=10, freq="B"),
)
df.to_hdf(path, key="df_non_table")
@@ -391,7 +388,7 @@ def test_select_iterator(tmp_path, setup_path):

df = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
columns=Index(list("ABCD")),
index=date_range("2000-01-01", periods=10, freq="B"),
)
df.to_hdf(path, key="df", format="table")
@@ -408,7 +405,7 @@ def test_select_iterator(tmp_path, setup_path):
with ensure_clean_store(setup_path) as store:
df1 = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
columns=Index(list("ABCD")),
index=date_range("2000-01-01", periods=10, freq="B"),
)
store.append("df1", df1, data_columns=True)
@@ -436,7 +433,7 @@ def test_select_iterator_complete_8014(setup_path):
with ensure_clean_store(setup_path) as store:
expected = DataFrame(
np.random.default_rng(2).standard_normal((100064, 4)),
columns=Index(list("ABCD"), dtype=object),
columns=Index(list("ABCD")),
index=date_range("2000-01-01", periods=100064, freq="s"),
)
_maybe_remove(store, "df")
@@ -471,7 +468,7 @@ def test_select_iterator_complete_8014(setup_path):
with ensure_clean_store(setup_path) as store:
expected = DataFrame(
np.random.default_rng(2).standard_normal((100064, 4)),
columns=Index(list("ABCD"), dtype=object),
columns=Index(list("ABCD")),
index=date_range("2000-01-01", periods=100064, freq="s"),
)
_maybe_remove(store, "df")
@@ -513,7 +510,7 @@ def test_select_iterator_non_complete_8014(setup_path):
with ensure_clean_store(setup_path) as store:
expected = DataFrame(
np.random.default_rng(2).standard_normal((100064, 4)),
columns=Index(list("ABCD"), dtype=object),
columns=Index(list("ABCD")),
index=date_range("2000-01-01", periods=100064, freq="s"),
)
_maybe_remove(store, "df")
@@ -547,7 +544,7 @@ def test_select_iterator_non_complete_8014(setup_path):
with ensure_clean_store(setup_path) as store:
expected = DataFrame(
np.random.default_rng(2).standard_normal((100064, 4)),
columns=Index(list("ABCD"), dtype=object),
columns=Index(list("ABCD")),
index=date_range("2000-01-01", periods=100064, freq="s"),
)
_maybe_remove(store, "df")
@@ -571,7 +568,7 @@ def test_select_iterator_many_empty_frames(setup_path):
with ensure_clean_store(setup_path) as store:
expected = DataFrame(
np.random.default_rng(2).standard_normal((100064, 4)),
columns=Index(list("ABCD"), dtype=object),
columns=Index(list("ABCD")),
index=date_range("2000-01-01", periods=100064, freq="s"),
)
_maybe_remove(store, "df")
@@ -623,7 +620,7 @@ def test_select_iterator_many_empty_frames(setup_path):
def test_frame_select(setup_path, request):
df = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
columns=Index(list("ABCD")),
index=date_range("2000-01-01", periods=10, freq="B"),
)

@@ -655,7 +652,7 @@ def test_frame_select(setup_path, request):
# invalid terms
df = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
columns=Index(list("ABCD")),
index=date_range("2000-01-01", periods=10, freq="B"),
)
store.append("df_time", df)
@@ -669,12 +666,13 @@ def test_frame_select(setup_path, request):
# store.select('frame', [crit1, crit2])


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_frame_select_complex(setup_path):
# select via complex criteria

df = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
columns=Index(list("ABCD")),
index=date_range("2000-01-01", periods=10, freq="B"),
)
df["string"] = "foo"
@@ -791,7 +789,7 @@ def test_invalid_filtering(setup_path):

df = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
columns=Index(list("ABCD")),
index=date_range("2000-01-01", periods=10, freq="B"),
)

@@ -813,7 +811,7 @@ def test_string_select(setup_path):
with ensure_clean_store(setup_path) as store:
df = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
columns=Index(list("ABCD")),
index=date_range("2000-01-01", periods=10, freq="B"),
)

@@ -857,7 +855,7 @@ def test_string_select(setup_path):
def test_select_as_multiple(setup_path):
df1 = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
columns=Index(list("ABCD")),
index=date_range("2000-01-01", periods=10, freq="B"),
)
df2 = df1.copy().rename(columns="{}_2".format)
@@ -982,6 +980,7 @@ def test_query_long_float_literal(setup_path):
tm.assert_frame_equal(expected, result)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_query_compare_column_type(setup_path):
# GH 15492
df = DataFrame(
@@ -1058,7 +1057,6 @@ def test_select_large_integer(tmp_path):
),
columns=["x", "y"],
)
result = None
with HDFStore(path) as s:
s.append("data", df, data_columns=True, index=False)
result = s.select("data", where="y==-9223372036854775801").get("y").get(0)
66 changes: 35 additions & 31 deletions pandas/tests/io/pytables/test_store.py
@@ -25,6 +25,7 @@
timedelta_range,
)
import pandas._testing as tm
from pandas.conftest import has_pyarrow
from pandas.tests.io.pytables.common import (
_maybe_remove,
ensure_clean_store,
@@ -35,10 +36,7 @@
read_hdf,
)

pytestmark = [
pytest.mark.single_cpu,
pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
]
pytestmark = [pytest.mark.single_cpu]

tables = pytest.importorskip("tables")

@@ -110,7 +108,7 @@ def test_iter_empty(setup_path):
assert list(store) == []


def test_repr(setup_path, performance_warning):
def test_repr(setup_path, performance_warning, using_infer_string):
with ensure_clean_store(setup_path) as store:
repr(store)
store.info()
@@ -145,7 +143,9 @@ def test_repr(setup_path, performance_warning):
df.loc[df.index[3:6], ["obj1"]] = np.nan
df = df._consolidate()

with tm.assert_produces_warning(performance_warning):
warning = None if using_infer_string else performance_warning
msg = "cannot\nmap directly to c-types .* dtype='object'"
with tm.assert_produces_warning(warning, match=msg):
store["df"] = df

# make a random group in hdf space
@@ -316,7 +316,7 @@ def test_getattr(setup_path):

df = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
columns=Index(list("ABCD")),
index=date_range("2000-01-01", periods=10, freq="B"),
)
store["df"] = df
@@ -369,7 +369,7 @@ def test_to_hdf_with_min_itemsize(tmp_path, setup_path):
{
"A": [0.0, 1.0, 2.0, 3.0, 4.0],
"B": [0.0, 1.0, 0.0, 1.0, 0.0],
"C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"], dtype=object),
"C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"]),
"D": date_range("20130101", periods=5),
}
).set_index("C")
@@ -385,6 +385,10 @@ def test_to_hdf_with_min_itemsize(tmp_path, setup_path):
tm.assert_series_equal(read_hdf(path, "ss4"), concat([df["B"], df2["B"]]))


@pytest.mark.xfail(
using_string_dtype() and has_pyarrow,
reason="TODO(infer_string): can't encode '\ud800': surrogates not allowed",
)
Member commented on lines +388 to +391:

Do you know if the error happens when writing or reading? (I suppose reading because even if it was object dtype we try to read it as strings?)

This is probably not something we can fix? Wondering (not for this PR) if we should fall back to object dtype in such a case (or at least give that option), so the user can still load the data and inspect it.

Member Author commented:

I hadn't looked into this one - currently it raises on data creation. Fixing that by specifying object dtype, you're right it's the reading that raises here. The error happens in Series.str.decode(encoding, errors=errors). I am wondering if we should add a dtype argument to decode so that you can force object dtype - that seems generally useful to alleviate this pain-point, and then we could use it to provide a fallback in pytables.

Member Author commented:

PyArrow strings not supporting surrogates is another interesting case in regards to PDEP-13. #58455

Member commented:

> I am wondering if we should add a dtype argument to decode so that you can force object dtype - that seems generally useful to alleviate this pain-point, and then we could use it to provide a fallback in pytables.

+1

Member commented:

I listed this issue in #59328 as one of the breaking changes when users upgrade to the string dtype (in this case at least when using pyarrow).

But so we should also list this as one of the differences between the pyarrow and python engines.

Member commented:

I do wonder if we should start nudging users down the path of referring to our strings as utf8. For other encodings, the binary type is the best bet for now, and probably for a while (I'm not aware of any non-utf8 Arrow array work, but maybe it's out there).
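As a hedged illustration of the limitation discussed in this thread: a lone surrogate such as "\ud800" (the value used by test_to_hdf_errors below) is not valid UTF-8, so it cannot survive a strictly UTF-8 (Arrow-backed) string representation, while Python object strings tolerate it.

```python
s = "\ud800foo"  # same value as in test_to_hdf_errors below

# Strict UTF-8 encoding rejects lone surrogates.
try:
    s.encode("utf-8")
except UnicodeEncodeError as err:
    print(err)  # 'utf-8' codec can't encode character '\ud800' ... surrogates not allowed

# Python can round-trip the surrogate with the surrogatepass handler, which is
# why object-dtype strings can hold such values.
print(s.encode("utf-8", errors="surrogatepass"))  # b'\xed\xa0\x80foo'
```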

@pytest.mark.parametrize("format", ["fixed", "table"])
def test_to_hdf_errors(tmp_path, format, setup_path):
data = ["\ud800foo"]
@@ -406,7 +410,7 @@ def col(t, column):
# data columns
df = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
columns=Index(list("ABCD")),
index=date_range("2000-01-01", periods=10, freq="B"),
)
df["string"] = "foo"
@@ -441,7 +445,7 @@ def col(t, column):
# data columns
df = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
columns=Index(list("ABCD")),
index=date_range("2000-01-01", periods=10, freq="B"),
)
df["string"] = "foo"
@@ -483,8 +487,8 @@ def test_table_mixed_dtypes(setup_path):
# frame
df = DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=Index(list("ABCD"), dtype=object),
index=Index([f"i-{i}" for i in range(30)], dtype=object),
columns=Index(list("ABCD")),
index=Index([f"i-{i}" for i in range(30)]),
)
df["obj1"] = "foo"
df["obj2"] = "bar"
@@ -539,8 +543,8 @@ def test_remove(setup_path):
)
df = DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=Index(list("ABCD"), dtype=object),
index=Index([f"i-{i}" for i in range(30)], dtype=object),
columns=Index(list("ABCD")),
index=Index([f"i-{i}" for i in range(30)]),
)
store["a"] = ts
store["b"] = df
@@ -603,8 +607,8 @@ def test_same_name_scoping(setup_path):
def test_store_index_name(setup_path):
df = DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=Index(list("ABCD"), dtype=object),
index=Index([f"i-{i}" for i in range(30)], dtype=object),
columns=Index(list("ABCD")),
index=Index([f"i-{i}" for i in range(30)]),
)
df.index.name = "foo"

@@ -650,8 +654,8 @@ def test_store_index_name_numpy_str(tmp_path, table_format, setup_path, unit, tz
def test_store_series_name(setup_path):
df = DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=Index(list("ABCD"), dtype=object),
index=Index([f"i-{i}" for i in range(30)], dtype=object),
columns=Index(list("ABCD")),
index=Index([f"i-{i}" for i in range(30)]),
)
series = df["A"]

@@ -665,7 +669,7 @@ def test_overwrite_node(setup_path):
with ensure_clean_store(setup_path) as store:
store["a"] = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
columns=Index(list("ABCD")),
index=date_range("2000-01-01", periods=10, freq="B"),
)
ts = Series(
@@ -679,7 +683,7 @@ def test_overwrite_node(setup_path):
def test_coordinates(setup_path):
df = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
columns=Index(list("ABCD")),
index=date_range("2000-01-01", periods=10, freq="B"),
)

@@ -714,7 +718,7 @@ def test_coordinates(setup_path):
_maybe_remove(store, "df2")
df1 = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
columns=Index(list("ABCD")),
index=date_range("2000-01-01", periods=10, freq="B"),
)
df2 = df1.copy().rename(columns="{}_2".format)
@@ -870,8 +874,8 @@ def test_start_stop_fixed(setup_path):
# sparse; not implemented
df = DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=Index(list("ABCD"), dtype=object),
index=Index([f"i-{i}" for i in range(30)], dtype=object),
columns=Index(list("ABCD")),
index=Index([f"i-{i}" for i in range(30)]),
)
df.iloc[3:5, 1:3] = np.nan
df.iloc[8:10, -2] = np.nan
@@ -904,8 +908,8 @@ def test_select_filter_corner(setup_path, request):
def test_path_pathlib():
df = DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=Index(list("ABCD"), dtype=object),
index=Index([f"i-{i}" for i in range(30)], dtype=object),
columns=Index(list("ABCD")),
index=Index([f"i-{i}" for i in range(30)]),
)

result = tm.round_trip_pathlib(
@@ -934,8 +938,8 @@ def test_contiguous_mixed_data_table(start, stop, setup_path):
def test_path_pathlib_hdfstore():
df = DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=Index(list("ABCD"), dtype=object),
index=Index([f"i-{i}" for i in range(30)], dtype=object),
columns=Index(list("ABCD")),
index=Index([f"i-{i}" for i in range(30)]),
)

def writer(path):
@@ -953,8 +957,8 @@ def reader(path):
def test_pickle_path_localpath():
df = DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=Index(list("ABCD"), dtype=object),
index=Index([f"i-{i}" for i in range(30)], dtype=object),
columns=Index(list("ABCD")),
index=Index([f"i-{i}" for i in range(30)]),
)
result = tm.round_trip_pathlib(
lambda p: df.to_hdf(p, key="df"), lambda p: read_hdf(p, "df")
@@ -966,8 +970,8 @@ def test_pickle_path_localpath():
def test_copy(propindexes):
df = DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=Index(list("ABCD"), dtype=object),
index=Index([f"i-{i}" for i in range(30)], dtype=object),
columns=Index(list("ABCD")),
index=Index([f"i-{i}" for i in range(30)]),
)

with tm.ensure_clean() as path:
6 changes: 0 additions & 6 deletions pandas/tests/io/pytables/test_timezones.py
@@ -6,8 +6,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas._libs.tslibs.timezones import maybe_get_tz
import pandas.util._test_decorators as td

@@ -25,10 +23,6 @@
ensure_clean_store,
)

pytestmark = pytest.mark.xfail(
using_string_dtype(), reason="TODO(infer_string)", strict=False
)


def _compare_with_tz(a, b):
tm.assert_frame_equal(a, b)