From df900b594d5ed88023601450a51f5f08fd6fdcbb Mon Sep 17 00:00:00 2001 From: Samuel Oranyeli Date: Wed, 9 Nov 2022 06:33:26 +1100 Subject: [PATCH] [ENH] generic select function (#1187) * fix blank note output * return whatever user passes, even if they r duplicates * add DropLabel for dropping columns * generic select with tests * update test_select.py * update changelog * update docs in select * add version notifications * update docs for DropLabel class * update admonition * ensure booleans are converted into arrays --- CHANGELOG.md | 2 +- janitor/functions/__init__.py | 2 +- janitor/functions/case_when.py | 5 ++ janitor/functions/conditional_join.py | 8 ++- janitor/functions/pivot.py | 16 ++++- janitor/functions/select.py | 86 +++++++++++++++++++++++-- janitor/functions/utils.py | 87 +++++++++++++++++++++++--- tests/functions/test_select.py | 60 ++++++++++++++++++ tests/functions/test_select_columns.py | 32 +++++++++- 9 files changed, 276 insertions(+), 22 deletions(-) create mode 100644 tests/functions/test_select.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 4ecea609f0..bddb10cd93 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,7 +29,7 @@ - [ENH] Fix error when `sort_by_appearance=True` is combined with `dropna=True`. Issue #1168 @samukweku - [ENH] Add explicit default parameter to `case_when` function. Issue #1159 @samukweku - [BUG] pandas 1.5.x `_MergeOperation` doesn't have `copy` keyword anymore. Issue #1174 @Zeroto521 -- [ENH] `select_rows` function added for flexible row selection. Add support for MultiIndex selection via dictionary. Issue #1124 @samukweku +- [ENH] `select_rows` function added for flexible row selection. Generic `select` function added as well. Add support for MultiIndex selection via dictionary. Issue #1124 @samukweku - [TST] Compat with macos and window, to fix `FailedHealthCheck` Issue #1181 @Zeroto521 - [INF] Merge two docs CIs (`docs-preview.yml` and `docs.yml`) to one. And add `documentation` pytest mark. 
PR #1183 @Zeroto521 - [INF] Merge `codecov.yml` (only works for the dev branch pushing event) into `tests.yml` (only works for PR event). PR #1185 @Zeroto521 diff --git a/janitor/functions/__init__.py b/janitor/functions/__init__.py index 860a38e238..57a1eaad4f 100644 --- a/janitor/functions/__init__.py +++ b/janitor/functions/__init__.py @@ -75,4 +75,4 @@ from .transform_columns import transform_column, transform_columns from .truncate_datetime import truncate_datetime_dataframe from .update_where import update_where -from .utils import patterns, unionize_dataframe_categories +from .utils import patterns, unionize_dataframe_categories, DropLabel diff --git a/janitor/functions/case_when.py b/janitor/functions/case_when.py index 7812fc9e84..628a5660a7 100644 --- a/janitor/functions/case_when.py +++ b/janitor/functions/case_when.py @@ -90,6 +90,11 @@ def case_when( else: default ``` + !!! abstract "Version Changed" + + - 0.24.0 + - Added `default` parameter. + :param df: A pandas DataFrame. :param args: Variable argument of conditions and expected values. diff --git a/janitor/functions/conditional_join.py b/janitor/functions/conditional_join.py index d3da6f8e01..1ff628e322 100644 --- a/janitor/functions/conditional_join.py +++ b/janitor/functions/conditional_join.py @@ -115,6 +115,12 @@ def conditional_join( 3 4 3 5 4 4 3 6 + !!! abstract "Version Changed" + + - 0.24.0 + - Added `df_columns`, `right_columns`, `keep` and `use_numba` parameters. + + :param df: A pandas DataFrame. :param right: Named Series or DataFrame to join to. @@ -145,7 +151,7 @@ def conditional_join( :param use_numba: Use numba, if installed, to accelerate the computation. Applicable only to strictly non-equi joins. Default is `False`. :returns: A pandas DataFrame of the two merged Pandas objects. 
- """ + """ # noqa: E501 return _conditional_join_compute( df, diff --git a/janitor/functions/pivot.py b/janitor/functions/pivot.py index 9761cd7217..5a8e211188 100644 --- a/janitor/functions/pivot.py +++ b/janitor/functions/pivot.py @@ -220,6 +220,13 @@ def pivot_longer( 7 Austin Texas Watermelon 99 None NaN 8 Hoover Alabama Watermelon 43 None NaN + + !!! abstract "Version Changed" + + - 0.24.0 + - Added `dropna` parameter. + + :param df: A pandas DataFrame. :param index: Name(s) of columns to use as identifier variables. Should be either a single column name, or a list/tuple of @@ -1259,6 +1266,13 @@ def pivot_wider( 0 5.5 20 25 30 37 1 6.1 22 18 19 29 + + !!! abstract "Version Changed" + + - 0.24.0 + - Added `reset_index`, `names_expand` and `index_expand` parameters. + + :param df: A pandas DataFrame. :param index: Name(s) of columns to use as identifier variables. It should be either a single column name, or a list of column names. @@ -1293,7 +1307,7 @@ def pivot_wider( Applies only if `index` is a categorical column. Default is `False`. :returns: A pandas DataFrame that has been unpivoted from long to wide form. - """ + """ # noqa: E501 df = df.copy() diff --git a/janitor/functions/select.py b/janitor/functions/select.py index 4c14809345..1fc9101092 100644 --- a/janitor/functions/select.py +++ b/janitor/functions/select.py @@ -1,7 +1,7 @@ import pandas_flavor as pf import pandas as pd from janitor.utils import deprecated_alias -from janitor.functions.utils import _select +from janitor.functions.utils import _select, DropLabel # noqa: F401 @pf.register_dataframe_method @@ -24,7 +24,8 @@ def select_columns( Optional ability to invert selection of columns available as well. - !!! Note + !!!note + The preferred option when selecting columns or rows in a Pandas DataFrame is with `.loc` or `.iloc` methods, as they are generally performant. `select_columns` is primarily for convenience. 
@@ -57,7 +58,7 @@ def select_columns( :returns: A pandas DataFrame with the specified columns selected. """ # noqa: E501 - return _select(df, args, invert, axis="columns") + return _select(df, args=args, invert=invert, axis="columns") @pf.register_dataframe_method @@ -79,11 +80,17 @@ def select_rows( Optional ability to invert selection of rows available as well. - !!! Note + + !!! info "New in version 0.24.0" + + + !!!note + The preferred option when selecting columns or rows in a Pandas DataFrame is with `.loc` or `.iloc` methods, as they are generally performant. `select_rows` is primarily for convenience. + Example: >>> import pandas as pd @@ -113,5 +120,74 @@ def select_rows( provided. :returns: A pandas DataFrame with the specified rows selected. """ # noqa: E501 + return _select(df, args=args, invert=invert, axis="index") + + +@pf.register_dataframe_method +def select(df: pd.DataFrame, *, rows=None, columns=None) -> pd.DataFrame: + """ + Method-chainable selection of rows and columns. + + It accepts a string, shell-like glob strings `(*string*)`, + regex, slice, array-like object, or a list of the previous options. + + Selection on a MultiIndex on a level, or multiple levels, + is possible with a dictionary. + + This method does not mutate the original DataFrame. + + Selection can be inverted with the `DropLabel` class. + + + !!! info "New in version 0.24.0" + + + !!!note + + The preferred option when selecting columns or rows in a Pandas DataFrame + is with `.loc` or `.iloc` methods, as they are generally performant. + `select` is primarily for convenience. + + + Example: + + >>> import pandas as pd + >>> import janitor + >>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]], + ... index=['cobra', 'viper', 'sidewinder'], + ... 
columns=['max_speed', 'shield']) + >>> df + max_speed shield + cobra 1 2 + viper 4 5 + sidewinder 7 8 + >>> df.select(rows='cobra', columns='shield') + shield + cobra 2 + + Labels can be dropped with the `DropLabel` class: + + >>> df.select(rows=DropLabel('cobra')) + max_speed shield + viper 4 5 + sidewinder 7 8 + + :param df: A pandas DataFrame. + :param rows: Valid inputs include: an exact label to look for, + a shell-style glob string (e.g. `*_thing_*`), + a regular expression, + a callable, + or variable arguments of all the aforementioned. + A sequence of booleans is also acceptable. + A dictionary can be used for selection on a MultiIndex on different levels. + :param columns: Valid inputs include: an exact label to look for, + a shell-style glob string (e.g. `*_thing_*`), + a regular expression, + a callable, + or variable arguments of all the aforementioned. + A sequence of booleans is also acceptable. + A dictionary can be used for selection on a MultiIndex on different levels. + :returns: A pandas DataFrame with the specified rows and/or columns selected. + """ # noqa: E501 - return _select(df, args, invert, axis="index") + return _select(df, args=None, rows=rows, columns=columns, axis="both") diff --git a/janitor/functions/utils.py b/janitor/functions/utils.py index e87c33d60a..f51fe9962d 100644 --- a/janitor/functions/utils.py +++ b/janitor/functions/utils.py @@ -11,10 +11,11 @@ Pattern, Union, Callable, + Any, ) from pandas.core.dtypes.generic import ABCPandasArray, ABCExtensionArray from pandas.core.common import is_bool_indexer - +from dataclasses import dataclass import pandas as pd from janitor.utils import check, _expand_grid @@ -269,6 +270,23 @@ def _select_callable(arg, func: Callable, axis=None): return bools +@dataclass +class DropLabel: + """ + Helper class for removing labels within the `select` syntax. + `label` can be any of the types supported in the `select`, + `select_rows` and `select_columns` functions. 
+ An array of integers not matching the labels is returned. + + !!! info "New in version 0.24.0" + + :param label: Label(s) to be dropped from the index. + :returns: A dataclass. + """ + + label: Any + + @singledispatch def _select_index(arg, df, axis): """ @@ -284,6 +302,27 @@ def _select_index(arg, df, axis): raise KeyError(f"No match was returned for {arg}") from exc +@_select_index.register(DropLabel) # noqa: F811 +def _column_sel_dispatch(cols, df, axis): # noqa: F811 + """ + Base function for selection on a Pandas Index object. + Returns the inverse of the passed label(s). + + Returns an array of integers. + """ + arr = _select_index(cols.label, df, axis) + index = np.arange(getattr(df, axis).size) + if isinstance(arr, int): + arr = [arr] + elif isinstance(arr, slice): + arr = index[arr] + elif is_list_like(arr): + arr = np.asanyarray(arr) + if is_bool_dtype(arr): + return index[~arr] + return np.setdiff1d(index, arr) + + @_select_index.register(str) # noqa: F811 def _index_dispatch(arg, df, axis): # noqa: F811 """ @@ -437,7 +476,7 @@ def _index_dispatch(arg, df, axis): # noqa: F811 f"{arg} is a boolean dtype and has wrong length: " f"{len(arg)} instead of {len(index)}" ) - return arg + return np.asanyarray(arg) try: if isinstance(arg, pd.Series): @@ -486,17 +525,27 @@ def _index_dispatch(arg, df, axis): # noqa: F811 return arg + # treat multiple DropLabel instances as a single unit + checks = (isinstance(entry, DropLabel) for entry in arg) + if sum(checks) > 1: + drop_labels = (entry for entry in arg if isinstance(entry, DropLabel)) + drop_labels = [entry.label for entry in drop_labels] + drop_labels = DropLabel(drop_labels) + arg = [entry for entry in arg if not isinstance(entry, DropLabel)] + arg.append(drop_labels) + indices = [_select_index(entry, df, axis) for entry in arg] # single entry does not need to be combined # or materialized if possible; # this offers more performance if len(indices) == 1: - if isinstance(indices[0], int): + if 
is_scalar(indices[0]): return indices - if is_list_like(indices[0]): - return np.asanyarray(indices[0]) - return indices[0] + indices = indices[0] + if is_list_like(indices): + indices = np.asanyarray(indices) + return indices contents = [] for arr in indices: if is_list_like(arr): @@ -508,19 +557,37 @@ def _index_dispatch(arg, df, axis): # noqa: F811 elif isinstance(arr, int): arr = [arr] contents.append(arr) - contents = np.concatenate(contents) - # remove possible duplicates - return pd.unique(contents) + return np.concatenate(contents) def _select( - df: pd.DataFrame, args: tuple, invert: bool, axis: str + df: pd.DataFrame, + args: tuple, + invert: bool = False, + axis: str = "index", + rows=None, + columns=None, ) -> pd.DataFrame: """ Index DataFrame on the index or columns. Returns a DataFrame. """ + assert axis in {"both", "index", "columns"} + if axis == "both": + if rows is None: + rows = slice(None) + else: + if not is_list_like(rows): + rows = [rows] + rows = _select_index(rows, df, axis="index") + if columns is None: + columns = slice(None) + else: + if not is_list_like(columns): + columns = [columns] + columns = _select_index(columns, df, axis="columns") + return df.iloc[rows, columns] indices = _select_index(list(args), df, axis) if invert: rev = np.ones(getattr(df, axis).size, dtype=np.bool8) diff --git a/tests/functions/test_select.py b/tests/functions/test_select.py new file mode 100644 index 0000000000..09571a8e95 --- /dev/null +++ b/tests/functions/test_select.py @@ -0,0 +1,60 @@ +import pandas as pd +import numpy as np +import pytest +from pandas.testing import assert_frame_equal + +from janitor.functions.utils import DropLabel + + +@pytest.fixture +def dataframe(): + """Base DataFrame""" + arrays = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] + tuples = list(zip(*arrays)) + index = pd.MultiIndex.from_tuples(tuples, names=["A", "B"]) + return pd.DataFrame( + 
np.random.randint(9, size=(8, 2)), + index=index, + columns=["col1", "col2"], + ) + + +def test_select_rows_only(dataframe): + """Test output for rows only""" + actual = dataframe.select(rows={"B": "two"}) + expected = dataframe.loc(axis=0)[(slice(None), "two")] + assert_frame_equal(actual, expected) + + +def test_select_rows_scalar_(dataframe): + """Test output for rows only""" + actual = dataframe.select(rows="bar") + expected = dataframe.xs("bar", axis=0, level=0, drop_level=False) + assert_frame_equal(actual, expected) + + +def test_select_columns_only(dataframe): + """Test output for columns only""" + actual = dataframe.select(columns=["col1", "col2"]) + expected = dataframe.loc[:, :] + assert_frame_equal(actual, expected) + + +def test_select_columns_scalar(dataframe): + """Test output for columns only""" + actual = dataframe.select(columns="col*") + expected = dataframe.loc[:, :] + assert_frame_equal(actual, expected) + + +def test_select_rows_and_columns(dataframe): + """Test output for both rows and columns""" + actual = dataframe.select( + rows=DropLabel(lambda df: df.eval('A == "foo"')), + columns=DropLabel(slice("col2", None)), + ) + expected = dataframe.loc[["bar", "baz", "qux"], ["col1"]] + assert_frame_equal(actual, expected) diff --git a/tests/functions/test_select_columns.py b/tests/functions/test_select_columns.py index cde32079a1..4faeb5f61e 100644 --- a/tests/functions/test_select_columns.py +++ b/tests/functions/test_select_columns.py @@ -6,7 +6,7 @@ from pandas.testing import assert_frame_equal from itertools import product -from janitor.functions.utils import patterns +from janitor.functions.utils import patterns, DropLabel @pytest.mark.functions @@ -25,6 +25,32 @@ def test_select_column_names(dataframe, invert, expected): assert_frame_equal(df, dataframe[expected]) +@pytest.mark.functions +@pytest.mark.parametrize( + "invert,expected", + [ + (True, ["a", "Bell__Chart", "cities"]), + (False, ["decorated-elephant", "animals@#$%^"]), + ], +) 
+def test_select_column_names_droplabel(dataframe, invert, expected): + "Test select_columns with a DropLabel, with and without invert" + columns = ["a", "Bell__Chart", "cities"] + df = dataframe.select_columns(DropLabel(columns), invert=invert) + + assert_frame_equal(df, dataframe[expected]) + + +@pytest.mark.functions +def test_select_column_names_droplabel_multiple(dataframe): + "Test select_columns with multiple DropLabel instances" + columns = ["a", "Bell__Chart", "cities"] + cols = [DropLabel(ent) for ent in columns] + df = dataframe.select_columns(*cols) + + assert_frame_equal(df, dataframe.drop(columns=columns)) + + @pytest.mark.functions @pytest.mark.parametrize( "invert,expected", @@ -57,6 +83,7 @@ def test_select_column_names_missing_columns(dataframe, columns): dataframe.select_columns(columns) +@pytest.mark.xfail(reason="return whatever user passes") @pytest.mark.functions @pytest.mark.parametrize( "invert,expected", @@ -394,8 +421,7 @@ def test_boolean_list_multi(multiindex): def test_series_multi(multiindex): """Test pd.Series output on a MultiIndex""" - mapp = pd.Series(["bar"]) - expected = multiindex.select_columns(mapp, slice("foo")) + expected = multiindex.select_columns(pd.Series("bar"), slice("baz", "foo")) actual = multiindex.loc(axis=1)["bar":"foo"] assert_frame_equal(expected, actual)