From ddd729a2715b6cf8da41c8a6ad30db7968956262 Mon Sep 17 00:00:00 2001 From: Ashar Date: Mon, 20 Jan 2025 17:32:26 -0500 Subject: [PATCH 1/4] BUG: Fix factorize to ensure proper use of null_encoding parameter --- pandas/core/arrays/arrow/array.py | 7 +++---- pandas/tests/extension/test_arrow.py | 13 +++++++++++++ 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 0b546bed1c2b7..402466a490f89 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1207,10 +1207,9 @@ def factorize( # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323 data = data.cast(pa.int64()) - if pa.types.is_dictionary(data.type): - encoded = data - else: - encoded = data.dictionary_encode(null_encoding=null_encoding) + if pa.types.is_dictionary(data.type) and null_encoding == "encode": + data = data.cast(data.type.value_type) + encoded = data.dictionary_encode(null_encoding=null_encoding) if encoded.length() == 0: indices = np.array([], dtype=np.intp) uniques = type(self)(pa.chunked_array([], type=encoded.type.value_type)) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index f4a63ff4c92ec..ade600765c126 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3329,6 +3329,19 @@ def test_factorize_chunked_dictionary(): tm.assert_index_equal(res_uniques, exp_uniques) +def test_factorize_dictionary_with_na(): + # Test that factorize properly handles NA values in dictionary arrays + arr = pd.array( + ["a1", pd.NA], dtype=ArrowDtype(pa.dictionary(pa.int32(), pa.utf8())) + ) + # Test with use_na_sentinel=False + indices, uniques = arr.factorize(use_na_sentinel=False) + expected_indices = np.array([0, 1], dtype=np.intp) + expected_uniques = pd.array(["a1", None], dtype=ArrowDtype(pa.string())) + tm.assert_numpy_array_equal(indices, expected_indices) + tm.assert_extension_array_equal(uniques, expected_uniques) + + def test_dictionary_astype_categorical(): # GH#56672 arrs = [ From 126d96343c17631e63ac279953a9883ef4aa95a5 Mon Sep 17 00:00:00 2001 From: Ashar Date: Thu, 23 Jan 2025 14:26:09 -0500 Subject: [PATCH 2/4] DOC: Add whatsnew entry for dictionary array NA handling fix --- doc/source/whatsnew/v3.0.0.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index edd205860b4e4..e6e827c011522 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -625,6 +625,8 @@ Performance improvements Bug fixes ~~~~~~~~~ +- Fixed bug in :meth:`ArrowExtensionArray.factorize` where NA values were dropped when input was dictionary-encoded even when dropna was set to False(:issue:`60567`) + Categorical ^^^^^^^^^^^ From a463415206f11869cda829689aa709608485e745 Mon Sep 17 00:00:00 2001 From: Ashar Date: Mon, 27 Jan 2025 16:56:54 -0500 Subject: [PATCH 3/4] BUG: Fix factorize to ensure proper use of null_encoding parameter and backwards compatibility maintained --- pandas/core/arrays/arrow/array.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 402466a490f89..e2feda495c103 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1207,9 +1207,15 @@ def factorize( # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323 data = data.cast(pa.int64()) - if pa.types.is_dictionary(data.type) and null_encoding == "encode": - data = data.cast(data.type.value_type) - encoded = data.dictionary_encode(null_encoding=null_encoding) + if pa.types.is_dictionary(data.type): + if null_encoding == "encode": + # dictionary encode does nothing if an already encoded array is given + data = data.cast(data.type.value_type) + encoded = data.dictionary_encode(null_encoding=null_encoding) + else: + encoded = data + else: + encoded = data.dictionary_encode(null_encoding=null_encoding) if encoded.length() == 0: indices = np.array([], dtype=np.intp) uniques = type(self)(pa.chunked_array([], type=encoded.type.value_type)) From 67b48d73a5c6fcfb6dc292fdce35dc52ba616ed3 Mon Sep 17 00:00:00 2001 From: Ashar Date: Fri, 31 Jan 2025 14:50:26 +0500 Subject: [PATCH 4/4] DOC: Improve rst file and test case comments for arrow groupby NA fix --- doc/source/whatsnew/v3.0.0.rst | 3 +-- pandas/tests/extension/test_arrow.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index e6e827c011522..7ebcf18a36a96 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -625,8 +625,6 @@ Performance improvements Bug fixes ~~~~~~~~~ -- Fixed bug in :meth:`ArrowExtensionArray.factorize` where NA values were dropped when input was dictionary-encoded even when dropna was set to False(:issue:`60567`) - Categorical ^^^^^^^^^^^ @@ -792,6 +790,7 @@ ExtensionArray ^^^^^^^^^^^^^^ - Bug in :class:`Categorical` when constructing with an :class:`Index` with :class:`ArrowDtype` (:issue:`60563`) - Bug in :meth:`.arrays.ArrowExtensionArray.__setitem__` which caused wrong behavior when using an integer array with repeated values as a key (:issue:`58530`) +- Bug in :meth:`ArrowExtensionArray.factorize` where NA values were dropped when input was dictionary-encoded even when dropna was set to False(:issue:`60567`) - Bug in :meth:`api.types.is_datetime64_any_dtype` where a custom :class:`ExtensionDtype` would return ``False`` for array-likes (:issue:`57055`) - Bug in comparison between object with :class:`ArrowDtype` and incompatible-dtyped (e.g. string vs bool) incorrectly raising instead of returning all-``False`` (for ``==``) or all-``True`` (for ``!=``) (:issue:`59505`) - Bug in constructing pandas data structures when passing into ``dtype`` a string of the type followed by ``[pyarrow]`` while PyArrow is not installed would raise ``NameError`` rather than ``ImportError`` (:issue:`57928`) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index ade600765c126..fbd3868f62899 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3330,11 +3330,10 @@ def test_factorize_chunked_dictionary(): def test_factorize_dictionary_with_na(): - # Test that factorize properly handles NA values in dictionary arrays + # GH#60567 arr = pd.array( ["a1", pd.NA], dtype=ArrowDtype(pa.dictionary(pa.int32(), pa.utf8())) ) - # Test with use_na_sentinel=False indices, uniques = arr.factorize(use_na_sentinel=False) expected_indices = np.array([0, 1], dtype=np.intp) expected_uniques = pd.array(["a1", None], dtype=ArrowDtype(pa.string()))