From df900b594d5ed88023601450a51f5f08fd6fdcbb Mon Sep 17 00:00:00 2001
From: Samuel Oranyeli <samueloranyeli@gmail.com>
Date: Wed, 9 Nov 2022 06:33:26 +1100
Subject: [PATCH] [ENH] generic select function (#1187)

* fix blank note output

* return whatever user passes, even if they are duplicates

* add DropLabel for dropping columns

* generic select with tests

* update test_select.py

* update changelog

* update docs in select

* add version notifications

* update docs for DropLabel class

* update admonition

* ensure booleans are converted into arrays
---
 CHANGELOG.md                           |  2 +-
 janitor/functions/__init__.py          |  2 +-
 janitor/functions/case_when.py         |  5 ++
 janitor/functions/conditional_join.py  |  8 ++-
 janitor/functions/pivot.py             | 16 ++++-
 janitor/functions/select.py            | 86 +++++++++++++++++++++++--
 janitor/functions/utils.py             | 87 +++++++++++++++++++++++---
 tests/functions/test_select.py         | 60 ++++++++++++++++++
 tests/functions/test_select_columns.py | 32 +++++++++-
 9 files changed, 276 insertions(+), 22 deletions(-)
 create mode 100644 tests/functions/test_select.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4ecea609f..bddb10cd9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -29,7 +29,7 @@
 -   [ENH] Fix error when `sort_by_appearance=True` is combined with `dropna=True`. Issue #1168 @samukweku
 -   [ENH] Add explicit default parameter to `case_when` function. Issue #1159 @samukweku
 -   [BUG] pandas 1.5.x `_MergeOperation` doesn't have `copy` keyword anymore. Issue #1174 @Zeroto521
--   [ENH] `select_rows` function added for flexible row selection. Add support for MultiIndex selection via dictionary. Issue #1124 @samukweku
+-   [ENH] `select_rows` function added for flexible row selection. Generic `select` function added as well. Add support for MultiIndex selection via dictionary. Issue #1124 @samukweku
 -   [TST] Compat with macos and window, to fix `FailedHealthCheck` Issue #1181 @Zeroto521
 -   [INF] Merge two docs CIs (`docs-preview.yml` and `docs.yml`) to one. And add `documentation` pytest mark. PR #1183 @Zeroto521
 -   [INF] Merge `codecov.yml` (only works for the dev branch pushing event) into `tests.yml` (only works for PR event). PR #1185 @Zeroto521
diff --git a/janitor/functions/__init__.py b/janitor/functions/__init__.py
index 860a38e23..57a1eaad4 100644
--- a/janitor/functions/__init__.py
+++ b/janitor/functions/__init__.py
@@ -75,4 +75,4 @@
 from .transform_columns import transform_column, transform_columns
 from .truncate_datetime import truncate_datetime_dataframe
 from .update_where import update_where
-from .utils import patterns, unionize_dataframe_categories
+from .utils import patterns, unionize_dataframe_categories, DropLabel
diff --git a/janitor/functions/case_when.py b/janitor/functions/case_when.py
index 7812fc9e8..628a5660a 100644
--- a/janitor/functions/case_when.py
+++ b/janitor/functions/case_when.py
@@ -90,6 +90,11 @@ def case_when(
     else:
         default
     ```
+    !!! abstract "Version Changed"
+
+        - 0.24.0
+            - Added `default` parameter.
+
 
     :param df: A pandas DataFrame.
     :param args: Variable argument of conditions and expected values.
diff --git a/janitor/functions/conditional_join.py b/janitor/functions/conditional_join.py
index d3da6f8e0..1ff628e32 100644
--- a/janitor/functions/conditional_join.py
+++ b/janitor/functions/conditional_join.py
@@ -115,6 +115,12 @@ def conditional_join(
         3        4         3         5
         4        4         3         6
 
+    !!! abstract "Version Changed"
+
+        - 0.24.0
+            - Added `df_columns`, `right_columns`, `keep` and `use_numba` parameters.
+
+
 
     :param df: A pandas DataFrame.
     :param right: Named Series or DataFrame to join to.
@@ -145,7 +151,7 @@ def conditional_join(
     :param use_numba: Use numba, if installed, to accelerate the computation.
         Applicable only to strictly non-equi joins. Default is `False`.
     :returns: A pandas DataFrame of the two merged Pandas objects.
-    """
+    """  # noqa: E501
 
     return _conditional_join_compute(
         df,
diff --git a/janitor/functions/pivot.py b/janitor/functions/pivot.py
index 9761cd721..5a8e21118 100644
--- a/janitor/functions/pivot.py
+++ b/janitor/functions/pivot.py
@@ -220,6 +220,13 @@ def pivot_longer(
         7   Austin    Texas  Watermelon      99   None     NaN
         8   Hoover  Alabama  Watermelon      43   None     NaN
 
+
+    !!! abstract "Version Changed"
+
+        - 0.24.0
+            - Added `dropna` parameter.
+
+
     :param df: A pandas DataFrame.
     :param index: Name(s) of columns to use as identifier variables.
         Should be either a single column name, or a list/tuple of
@@ -1259,6 +1266,13 @@ def pivot_wider(
         0  5.5       20       25       30       37
         1  6.1       22       18       19       29
 
+
+    !!! abstract "Version Changed"
+
+        - 0.24.0
+            - Added `reset_index`, `names_expand` and `index_expand` parameters.
+
+
     :param df: A pandas DataFrame.
     :param index: Name(s) of columns to use as identifier variables.
         It should be either a single column name, or a list of column names.
@@ -1293,7 +1307,7 @@ def pivot_wider(
         Applies only if `index` is a categorical column. Default is `False`.
     :returns: A pandas DataFrame that has been unpivoted from long to wide
         form.
-    """
+    """  # noqa: E501
 
     df = df.copy()
 
diff --git a/janitor/functions/select.py b/janitor/functions/select.py
index 4c1480934..1fc910109 100644
--- a/janitor/functions/select.py
+++ b/janitor/functions/select.py
@@ -1,7 +1,7 @@
 import pandas_flavor as pf
 import pandas as pd
 from janitor.utils import deprecated_alias
-from janitor.functions.utils import _select
+from janitor.functions.utils import _select, DropLabel  # noqa: F401
 
 
 @pf.register_dataframe_method
@@ -24,7 +24,8 @@ def select_columns(
 
     Optional ability to invert selection of columns available as well.
 
-    !!! Note
+    !!!note
+
     The preferred option when selecting columns or rows in a Pandas DataFrame
     is with `.loc` or `.iloc` methods, as they are generally performant.
     `select_columns` is primarily for convenience.
@@ -57,7 +58,7 @@ def select_columns(
     :returns: A pandas DataFrame with the specified columns selected.
     """  # noqa: E501
 
-    return _select(df, args, invert, axis="columns")
+    return _select(df, args=args, invert=invert, axis="columns")
 
 
 @pf.register_dataframe_method
@@ -79,11 +80,17 @@ def select_rows(
 
     Optional ability to invert selection of rows available as well.
 
-    !!! Note
+
+    !!! info "New in version 0.24.0"
+
+
+    !!!note
+
     The preferred option when selecting columns or rows in a Pandas DataFrame
     is with `.loc` or `.iloc` methods, as they are generally performant.
     `select_rows` is primarily for convenience.
 
+
     Example:
 
         >>> import pandas as pd
@@ -113,5 +120,74 @@ def select_rows(
         provided.
     :returns: A pandas DataFrame with the specified rows selected.
     """  # noqa: E501
+    return _select(df, args=args, invert=invert, axis="index")
+
+
+@pf.register_dataframe_method
+def select(df: pd.DataFrame, *, rows=None, columns=None) -> pd.DataFrame:
+    """
+    Method-chainable selection of rows and columns.
+
+    It accepts a string, shell-like glob strings `(*string*)`,
+    regex, slice, array-like object, or a list of the previous options.
+
+    Selection on a MultiIndex on a level, or multiple levels,
+    is possible with a dictionary.
+
+    This method does not mutate the original DataFrame.
+
+    Selection can be inverted with the `DropLabel` class.
+
+
+    !!! info "New in version 0.24.0"
+
+
+    !!!note
+
+    The preferred option when selecting columns or rows in a Pandas DataFrame
+    is with `.loc` or `.iloc` methods, as they are generally performant.
+    `select` is primarily for convenience.
+
+
+    Example:
+
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]],
+        ...      index=['cobra', 'viper', 'sidewinder'],
+        ...      columns=['max_speed', 'shield'])
+        >>> df
+                    max_speed  shield
+        cobra               1       2
+        viper               4       5
+        sidewinder          7       8
+        >>> df.select(rows='cobra', columns='shield')
+               shield
+        cobra       2
+
+    Labels can be dropped with the `DropLabel` class:
+
+        >>> df.select(rows=DropLabel('cobra'))
+                    max_speed  shield
+        viper               4       5
+        sidewinder          7       8
+
+    :param df: A pandas DataFrame.
+    :param rows: Valid inputs include: an exact label to look for,
+        a shell-style glob string (e.g. `*_thing_*`),
+        a regular expression,
+        a callable,
+        or a list of all the aforementioned.
+        A sequence of booleans is also acceptable.
+        A dictionary can be used for selection on a MultiIndex on different levels.
+    :param columns: Valid inputs include: an exact label to look for,
+        a shell-style glob string (e.g. `*_thing_*`),
+        a regular expression,
+        a callable,
+        or a list of all the aforementioned.
+        A sequence of booleans is also acceptable.
+        A dictionary can be used for selection on a MultiIndex on different levels.
+    :returns: A pandas DataFrame with the specified rows and/or columns selected.
+    """  # noqa: E501
 
-    return _select(df, args, invert, axis="index")
+    return _select(df, args=None, rows=rows, columns=columns, axis="both")
diff --git a/janitor/functions/utils.py b/janitor/functions/utils.py
index e87c33d60..f51fe9962 100644
--- a/janitor/functions/utils.py
+++ b/janitor/functions/utils.py
@@ -11,10 +11,11 @@
     Pattern,
     Union,
     Callable,
+    Any,
 )
 from pandas.core.dtypes.generic import ABCPandasArray, ABCExtensionArray
 from pandas.core.common import is_bool_indexer
-
+from dataclasses import dataclass
 
 import pandas as pd
 from janitor.utils import check, _expand_grid
@@ -269,6 +270,23 @@ def _select_callable(arg, func: Callable, axis=None):
     return bools
 
 
+@dataclass
+class DropLabel:
+    """
+    Helper class for removing labels within the `select` syntax.
+    `label` can be any of the types supported in the `select`,
+    `select_rows` and `select_columns` functions.
+    An array of integers not matching the labels is returned.
+
+    !!! info "New in version 0.24.0"
+
+    :param label: Label(s) to be dropped from the index.
+    :returns: A dataclass.
+    """
+
+    label: Any
+
+
 @singledispatch
 def _select_index(arg, df, axis):
     """
@@ -284,6 +302,27 @@ def _select_index(arg, df, axis):
         raise KeyError(f"No match was returned for {arg}") from exc
 
 
+@_select_index.register(DropLabel)  # noqa: F811
+def _column_sel_dispatch(cols, df, axis):  # noqa: F811
+    """
+    Base function for selection on a Pandas Index object.
+    Returns the inverse of the passed label(s).
+
+    Returns an array of integers.
+    """
+    arr = _select_index(cols.label, df, axis)
+    index = np.arange(getattr(df, axis).size)
+    if isinstance(arr, int):
+        arr = [arr]
+    elif isinstance(arr, slice):
+        arr = index[arr]
+    elif is_list_like(arr):
+        arr = np.asanyarray(arr)
+    if is_bool_dtype(arr):
+        return index[~arr]
+    return np.setdiff1d(index, arr)
+
+
 @_select_index.register(str)  # noqa: F811
 def _index_dispatch(arg, df, axis):  # noqa: F811
     """
@@ -437,7 +476,7 @@ def _index_dispatch(arg, df, axis):  # noqa: F811
                 f"{arg} is a boolean dtype and has wrong length: "
                 f"{len(arg)} instead of {len(index)}"
             )
-        return arg
+        return np.asanyarray(arg)
     try:
 
         if isinstance(arg, pd.Series):
@@ -486,17 +525,27 @@ def _index_dispatch(arg, df, axis):  # noqa: F811
 
         return arg
 
+    # treat multiple DropLabel instances as a single unit
+    checks = (isinstance(entry, DropLabel) for entry in arg)
+    if sum(checks) > 1:
+        drop_labels = (entry for entry in arg if isinstance(entry, DropLabel))
+        drop_labels = [entry.label for entry in drop_labels]
+        drop_labels = DropLabel(drop_labels)
+        arg = [entry for entry in arg if not isinstance(entry, DropLabel)]
+        arg.append(drop_labels)
+
     indices = [_select_index(entry, df, axis) for entry in arg]
 
     # single entry does not need to be combined
     # or materialized if possible;
     # this offers more performance
     if len(indices) == 1:
-        if isinstance(indices[0], int):
+        if is_scalar(indices[0]):
             return indices
-        if is_list_like(indices[0]):
-            return np.asanyarray(indices[0])
-        return indices[0]
+        indices = indices[0]
+        if is_list_like(indices):
+            indices = np.asanyarray(indices)
+        return indices
     contents = []
     for arr in indices:
         if is_list_like(arr):
@@ -508,19 +557,37 @@ def _index_dispatch(arg, df, axis):  # noqa: F811
         elif isinstance(arr, int):
             arr = [arr]
         contents.append(arr)
-    contents = np.concatenate(contents)
-    # remove possible duplicates
-    return pd.unique(contents)
+    return np.concatenate(contents)
 
 
 def _select(
-    df: pd.DataFrame, args: tuple, invert: bool, axis: str
+    df: pd.DataFrame,
+    args: tuple,
+    invert: bool = False,
+    axis: str = "index",
+    rows=None,
+    columns=None,
 ) -> pd.DataFrame:
     """
     Index DataFrame on the index or columns.
 
     Returns a DataFrame.
     """
+    assert axis in {"both", "index", "columns"}
+    if axis == "both":
+        if rows is None:
+            rows = slice(None)
+        else:
+            if not is_list_like(rows):
+                rows = [rows]
+            rows = _select_index(rows, df, axis="index")
+        if columns is None:
+            columns = slice(None)
+        else:
+            if not is_list_like(columns):
+                columns = [columns]
+            columns = _select_index(columns, df, axis="columns")
+        return df.iloc[rows, columns]
     indices = _select_index(list(args), df, axis)
     if invert:
         rev = np.ones(getattr(df, axis).size, dtype=np.bool8)
diff --git a/tests/functions/test_select.py b/tests/functions/test_select.py
new file mode 100644
index 000000000..09571a8e9
--- /dev/null
+++ b/tests/functions/test_select.py
@@ -0,0 +1,60 @@
+import pandas as pd
+import numpy as np
+import pytest
+from pandas.testing import assert_frame_equal
+
+from janitor.functions.utils import DropLabel
+
+
+@pytest.fixture
+def dataframe():
+    """Base DataFrame"""
+    arrays = [
+        ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
+        ["one", "two", "one", "two", "one", "two", "one", "two"],
+    ]
+    tuples = list(zip(*arrays))
+    index = pd.MultiIndex.from_tuples(tuples, names=["A", "B"])
+    return pd.DataFrame(
+        np.random.randint(9, size=(8, 2)),
+        index=index,
+        columns=["col1", "col2"],
+    )
+
+
+def test_select_rows_only(dataframe):
+    """Test output for rows only"""
+    actual = dataframe.select(rows={"B": "two"})
+    expected = dataframe.loc(axis=0)[(slice(None), "two")]
+    assert_frame_equal(actual, expected)
+
+
+def test_select_rows_scalar_(dataframe):
+    """Test output for a scalar row label"""
+    actual = dataframe.select(rows="bar")
+    expected = dataframe.xs("bar", axis=0, level=0, drop_level=False)
+    assert_frame_equal(actual, expected)
+
+
+def test_select_columns_only(dataframe):
+    """Test output for columns only"""
+    actual = dataframe.select(columns=["col1", "col2"])
+    expected = dataframe.loc[:, :]
+    assert_frame_equal(actual, expected)
+
+
+def test_select_columns_scalar(dataframe):
+    """Test output for a glob string on columns"""
+    actual = dataframe.select(columns="col*")
+    expected = dataframe.loc[:, :]
+    assert_frame_equal(actual, expected)
+
+
+def test_select_rows_and_columns(dataframe):
+    """Test output for both rows and columns"""
+    actual = dataframe.select(
+        rows=DropLabel(lambda df: df.eval('A == "foo"')),
+        columns=DropLabel(slice("col2", None)),
+    )
+    expected = dataframe.loc[["bar", "baz", "qux"], ["col1"]]
+    assert_frame_equal(actual, expected)
diff --git a/tests/functions/test_select_columns.py b/tests/functions/test_select_columns.py
index cde32079a..4faeb5f61 100644
--- a/tests/functions/test_select_columns.py
+++ b/tests/functions/test_select_columns.py
@@ -6,7 +6,7 @@
 from pandas.testing import assert_frame_equal
 from itertools import product
 
-from janitor.functions.utils import patterns
+from janitor.functions.utils import patterns, DropLabel
 
 
 @pytest.mark.functions
@@ -25,6 +25,32 @@ def test_select_column_names(dataframe, invert, expected):
     assert_frame_equal(df, dataframe[expected])
 
 
+@pytest.mark.functions
+@pytest.mark.parametrize(
+    "invert,expected",
+    [
+        (True, ["a", "Bell__Chart", "cities"]),
+        (False, ["decorated-elephant", "animals@#$%^"]),
+    ],
+)
+def test_select_column_names_droplabel(dataframe, invert, expected):
+    "Test DropLabel output, with and without invert"
+    columns = ["a", "Bell__Chart", "cities"]
+    df = dataframe.select_columns(DropLabel(columns), invert=invert)
+
+    assert_frame_equal(df, dataframe[expected])
+
+
+@pytest.mark.functions
+def test_select_column_names_droplabel_multiple(dataframe):
+    "Test output for multiple DropLabel instances"
+    columns = ["a", "Bell__Chart", "cities"]
+    cols = [DropLabel(ent) for ent in columns]
+    df = dataframe.select_columns(*cols)
+
+    assert_frame_equal(df, dataframe.drop(columns=columns))
+
+
 @pytest.mark.functions
 @pytest.mark.parametrize(
     "invert,expected",
@@ -57,6 +83,7 @@ def test_select_column_names_missing_columns(dataframe, columns):
         dataframe.select_columns(columns)
 
 
+@pytest.mark.xfail(reason="return whatever user passes")
 @pytest.mark.functions
 @pytest.mark.parametrize(
     "invert,expected",
@@ -394,8 +421,7 @@ def test_boolean_list_multi(multiindex):
 
 def test_series_multi(multiindex):
     """Test pd.Series output on a MultiIndex"""
-    mapp = pd.Series(["bar"])
-    expected = multiindex.select_columns(mapp, slice("foo"))
+    expected = multiindex.select_columns(pd.Series("bar"), slice("baz", "foo"))
     actual = multiindex.loc(axis=1)["bar":"foo"]
     assert_frame_equal(expected, actual)