-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
TST(string dtype): Resolve xfails in pytables #60795
Changes from all commits
120b0bd
adb0349
1413d27
70563ce
d12f66d
bc5b697
55ea98b
cb56243
733b123
dae84aa
4df00dc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25,6 +25,7 @@ | |
timedelta_range, | ||
) | ||
import pandas._testing as tm | ||
from pandas.conftest import has_pyarrow | ||
from pandas.tests.io.pytables.common import ( | ||
_maybe_remove, | ||
ensure_clean_store, | ||
|
@@ -35,10 +36,7 @@ | |
read_hdf, | ||
) | ||
|
||
pytestmark = [ | ||
pytest.mark.single_cpu, | ||
pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), | ||
] | ||
pytestmark = [pytest.mark.single_cpu] | ||
|
||
tables = pytest.importorskip("tables") | ||
|
||
|
@@ -110,7 +108,7 @@ def test_iter_empty(setup_path): | |
assert list(store) == [] | ||
|
||
|
||
def test_repr(setup_path, performance_warning): | ||
def test_repr(setup_path, performance_warning, using_infer_string): | ||
with ensure_clean_store(setup_path) as store: | ||
repr(store) | ||
store.info() | ||
|
@@ -145,7 +143,9 @@ def test_repr(setup_path, performance_warning): | |
df.loc[df.index[3:6], ["obj1"]] = np.nan | ||
df = df._consolidate() | ||
|
||
with tm.assert_produces_warning(performance_warning): | ||
warning = None if using_infer_string else performance_warning | ||
msg = "cannot\nmap directly to c-types .* dtype='object'" | ||
with tm.assert_produces_warning(warning, match=msg): | ||
store["df"] = df | ||
|
||
# make a random group in hdf space | ||
|
@@ -316,7 +316,7 @@ def test_getattr(setup_path): | |
|
||
df = DataFrame( | ||
np.random.default_rng(2).standard_normal((10, 4)), | ||
columns=Index(list("ABCD"), dtype=object), | ||
columns=Index(list("ABCD")), | ||
index=date_range("2000-01-01", periods=10, freq="B"), | ||
) | ||
store["df"] = df | ||
|
@@ -369,7 +369,7 @@ def test_to_hdf_with_min_itemsize(tmp_path, setup_path): | |
{ | ||
"A": [0.0, 1.0, 2.0, 3.0, 4.0], | ||
"B": [0.0, 1.0, 0.0, 1.0, 0.0], | ||
"C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"], dtype=object), | ||
"C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"]), | ||
"D": date_range("20130101", periods=5), | ||
} | ||
).set_index("C") | ||
|
@@ -385,6 +385,10 @@ def test_to_hdf_with_min_itemsize(tmp_path, setup_path): | |
tm.assert_series_equal(read_hdf(path, "ss4"), concat([df["B"], df2["B"]])) | ||
|
||
|
||
@pytest.mark.xfail( | ||
using_string_dtype() and has_pyarrow, | ||
reason="TODO(infer_string): can't encode '\ud800': surrogates not allowed", | ||
) | ||
Comment on lines
+388
to
+391
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do you know if the error happens when writing or reading? (I suppose reading because even if it was object dtype we try to read it as strings?) This is probably not something we can fix? Wondering (not for this PR) if we should fall back to object dtype in such a case (or at least give that option), so the user can still load the data and inspect it. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I hadn't looked into this one - currently it raises on data creation. Fixing that by specifying object dtype, you're right it's the reading that raises here. The error happens in There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. PyArrow strings not supporting surrogates is another interesting case in regard to PDEP-13. #58455 There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
+1 There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I listed this issue in #59328 as one of the breaking changes when users upgrade to the string dtype (in this case at least when using pyarrow). But so we should also list this as one of the differences between the pyarrow and python engines. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I do wonder if we should start nudging users down the path of referring to our strings as utf8. For other encodings, the binary type is the best bet for now, and probably for a while (I'm not aware of any non-utf8 Arrow array work, but maybe it's out there) |
||
@pytest.mark.parametrize("format", ["fixed", "table"]) | ||
def test_to_hdf_errors(tmp_path, format, setup_path): | ||
data = ["\ud800foo"] | ||
|
@@ -406,7 +410,7 @@ def col(t, column): | |
# data columns | ||
df = DataFrame( | ||
np.random.default_rng(2).standard_normal((10, 4)), | ||
columns=Index(list("ABCD"), dtype=object), | ||
columns=Index(list("ABCD")), | ||
index=date_range("2000-01-01", periods=10, freq="B"), | ||
) | ||
df["string"] = "foo" | ||
|
@@ -441,7 +445,7 @@ def col(t, column): | |
# data columns | ||
df = DataFrame( | ||
np.random.default_rng(2).standard_normal((10, 4)), | ||
columns=Index(list("ABCD"), dtype=object), | ||
columns=Index(list("ABCD")), | ||
index=date_range("2000-01-01", periods=10, freq="B"), | ||
) | ||
df["string"] = "foo" | ||
|
@@ -483,8 +487,8 @@ def test_table_mixed_dtypes(setup_path): | |
# frame | ||
df = DataFrame( | ||
1.1 * np.arange(120).reshape((30, 4)), | ||
columns=Index(list("ABCD"), dtype=object), | ||
index=Index([f"i-{i}" for i in range(30)], dtype=object), | ||
columns=Index(list("ABCD")), | ||
index=Index([f"i-{i}" for i in range(30)]), | ||
) | ||
df["obj1"] = "foo" | ||
df["obj2"] = "bar" | ||
|
@@ -539,8 +543,8 @@ def test_remove(setup_path): | |
) | ||
df = DataFrame( | ||
1.1 * np.arange(120).reshape((30, 4)), | ||
columns=Index(list("ABCD"), dtype=object), | ||
index=Index([f"i-{i}" for i in range(30)], dtype=object), | ||
columns=Index(list("ABCD")), | ||
index=Index([f"i-{i}" for i in range(30)]), | ||
) | ||
store["a"] = ts | ||
store["b"] = df | ||
|
@@ -603,8 +607,8 @@ def test_same_name_scoping(setup_path): | |
def test_store_index_name(setup_path): | ||
df = DataFrame( | ||
1.1 * np.arange(120).reshape((30, 4)), | ||
columns=Index(list("ABCD"), dtype=object), | ||
index=Index([f"i-{i}" for i in range(30)], dtype=object), | ||
columns=Index(list("ABCD")), | ||
index=Index([f"i-{i}" for i in range(30)]), | ||
) | ||
df.index.name = "foo" | ||
|
||
|
@@ -650,8 +654,8 @@ def test_store_index_name_numpy_str(tmp_path, table_format, setup_path, unit, tz | |
def test_store_series_name(setup_path): | ||
df = DataFrame( | ||
1.1 * np.arange(120).reshape((30, 4)), | ||
columns=Index(list("ABCD"), dtype=object), | ||
index=Index([f"i-{i}" for i in range(30)], dtype=object), | ||
columns=Index(list("ABCD")), | ||
index=Index([f"i-{i}" for i in range(30)]), | ||
) | ||
series = df["A"] | ||
|
||
|
@@ -665,7 +669,7 @@ def test_overwrite_node(setup_path): | |
with ensure_clean_store(setup_path) as store: | ||
store["a"] = DataFrame( | ||
np.random.default_rng(2).standard_normal((10, 4)), | ||
columns=Index(list("ABCD"), dtype=object), | ||
columns=Index(list("ABCD")), | ||
index=date_range("2000-01-01", periods=10, freq="B"), | ||
) | ||
ts = Series( | ||
|
@@ -679,7 +683,7 @@ def test_overwrite_node(setup_path): | |
def test_coordinates(setup_path): | ||
df = DataFrame( | ||
np.random.default_rng(2).standard_normal((10, 4)), | ||
columns=Index(list("ABCD"), dtype=object), | ||
columns=Index(list("ABCD")), | ||
index=date_range("2000-01-01", periods=10, freq="B"), | ||
) | ||
|
||
|
@@ -714,7 +718,7 @@ def test_coordinates(setup_path): | |
_maybe_remove(store, "df2") | ||
df1 = DataFrame( | ||
np.random.default_rng(2).standard_normal((10, 4)), | ||
columns=Index(list("ABCD"), dtype=object), | ||
columns=Index(list("ABCD")), | ||
index=date_range("2000-01-01", periods=10, freq="B"), | ||
) | ||
df2 = df1.copy().rename(columns="{}_2".format) | ||
|
@@ -870,8 +874,8 @@ def test_start_stop_fixed(setup_path): | |
# sparse; not implemented | ||
df = DataFrame( | ||
1.1 * np.arange(120).reshape((30, 4)), | ||
columns=Index(list("ABCD"), dtype=object), | ||
index=Index([f"i-{i}" for i in range(30)], dtype=object), | ||
columns=Index(list("ABCD")), | ||
index=Index([f"i-{i}" for i in range(30)]), | ||
) | ||
df.iloc[3:5, 1:3] = np.nan | ||
df.iloc[8:10, -2] = np.nan | ||
|
@@ -904,8 +908,8 @@ def test_select_filter_corner(setup_path, request): | |
def test_path_pathlib(): | ||
df = DataFrame( | ||
1.1 * np.arange(120).reshape((30, 4)), | ||
columns=Index(list("ABCD"), dtype=object), | ||
index=Index([f"i-{i}" for i in range(30)], dtype=object), | ||
columns=Index(list("ABCD")), | ||
index=Index([f"i-{i}" for i in range(30)]), | ||
) | ||
|
||
result = tm.round_trip_pathlib( | ||
|
@@ -934,8 +938,8 @@ def test_contiguous_mixed_data_table(start, stop, setup_path): | |
def test_path_pathlib_hdfstore(): | ||
df = DataFrame( | ||
1.1 * np.arange(120).reshape((30, 4)), | ||
columns=Index(list("ABCD"), dtype=object), | ||
index=Index([f"i-{i}" for i in range(30)], dtype=object), | ||
columns=Index(list("ABCD")), | ||
index=Index([f"i-{i}" for i in range(30)]), | ||
) | ||
|
||
def writer(path): | ||
|
@@ -953,8 +957,8 @@ def reader(path): | |
def test_pickle_path_localpath(): | ||
df = DataFrame( | ||
1.1 * np.arange(120).reshape((30, 4)), | ||
columns=Index(list("ABCD"), dtype=object), | ||
index=Index([f"i-{i}" for i in range(30)], dtype=object), | ||
columns=Index(list("ABCD")), | ||
index=Index([f"i-{i}" for i in range(30)]), | ||
) | ||
result = tm.round_trip_pathlib( | ||
lambda p: df.to_hdf(p, key="df"), lambda p: read_hdf(p, "df") | ||
|
@@ -966,8 +970,8 @@ def test_pickle_path_localpath(): | |
def test_copy(propindexes): | ||
df = DataFrame( | ||
1.1 * np.arange(120).reshape((30, 4)), | ||
columns=Index(list("ABCD"), dtype=object), | ||
index=Index([f"i-{i}" for i in range(30)], dtype=object), | ||
columns=Index(list("ABCD")), | ||
index=Index([f"i-{i}" for i in range(30)]), | ||
) | ||
|
||
with tm.ensure_clean() as path: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think the original error message here is much clearer - is there no way to catch and raise that for the string types?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not sure what you mean, this error is not raised for string types. It's being raised for date types. When `infer_string=False`, this function is passed a single object block with a mix of strings and dates. In this case, the data of the block is inferred as mixed, and then checked column-by-column. This is where the top message (which I think is confusing) is raised. When `infer_string=True`, each string array is fed into this function individually and does not raise. Then the object block is fed in containing only dates. This is inferred as dates, and the corresponding error message is raised.