[ENH] generic select function (#1187)

* fix blank note output * return whatever the user passes, even if they are duplicates * add DropLabel for dropping columns * generic select with tests * update test_select.py * update changelog * update docs in select * add version notifications * update docs for DropLabel class * update admonition * ensure booleans are converted into arrays
pyjanitor-devs · Nov 28, 2022 · df900b5 · df900b5
1 parent 08c2c8b
commit df900b5
Show file tree

Hide file tree

Showing 9 changed files with 276 additions and 22 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -29,7 +29,7 @@
 -   [ENH] Fix error when `sort_by_appearance=True` is combined with `dropna=True`. Issue #1168 @samukweku
 -   [ENH] Add explicit default parameter to `case_when` function. Issue #1159 @samukweku
 -   [BUG] pandas 1.5.x `_MergeOperation` doesn't have `copy` keyword anymore. Issue #1174 @Zeroto521
--   [ENH] `select_rows` function added for flexible row selection. Add support for MultiIndex selection via dictionary. Issue #1124 @samukweku
+-   [ENH] `select_rows` function added for flexible row selection. Generic `select` function added as well. Add support for MultiIndex selection via dictionary. Issue #1124 @samukweku
 -   [TST] Compat with macos and window, to fix `FailedHealthCheck` Issue #1181 @Zeroto521
 -   [INF] Merge two docs CIs (`docs-preview.yml` and `docs.yml`) to one. And add `documentation` pytest mark. PR #1183 @Zeroto521
 -   [INF] Merge `codecov.yml` (only works for the dev branch pushing event) into `tests.yml` (only works for PR event). PR #1185 @Zeroto521

diff --git a/janitor/functions/__init__.py b/janitor/functions/__init__.py
@@ -75,4 +75,4 @@
 from .transform_columns import transform_column, transform_columns
 from .truncate_datetime import truncate_datetime_dataframe
 from .update_where import update_where
-from .utils import patterns, unionize_dataframe_categories
+from .utils import patterns, unionize_dataframe_categories, DropLabel
diff --git a/janitor/functions/case_when.py b/janitor/functions/case_when.py
@@ -90,6 +90,11 @@ def case_when(
     else:
         default
     ```
+    !!! abstract "Version Changed"
+
+        - 0.24.0
+            - Added `default` parameter.
+
 
     :param df: A pandas DataFrame.
     :param args: Variable argument of conditions and expected values.

diff --git a/janitor/functions/conditional_join.py b/janitor/functions/conditional_join.py
@@ -115,6 +115,12 @@ def conditional_join(
         3        4         3         5
         4        4         3         6
 
+    !!! abstract "Version Changed"
+
+        - 0.24.0
+            - Added `df_columns`, `right_columns`, `keep` and `use_numba` parameters.
+
+
 
     :param df: A pandas DataFrame.
     :param right: Named Series or DataFrame to join to.
@@ -145,7 +151,7 @@ def conditional_join(
     :param use_numba: Use numba, if installed, to accelerate the computation.
         Applicable only to strictly non-equi joins. Default is `False`.
     :returns: A pandas DataFrame of the two merged Pandas objects.
-    """
+    """  # noqa: E501
 
     return _conditional_join_compute(
         df,

diff --git a/janitor/functions/pivot.py b/janitor/functions/pivot.py
@@ -220,6 +220,13 @@ def pivot_longer(
         7   Austin    Texas  Watermelon      99   None     NaN
         8   Hoover  Alabama  Watermelon      43   None     NaN
 
+
+    !!! abstract "Version Changed"
+
+        - 0.24.0
+            - Added `dropna` parameter.
+
+
     :param df: A pandas DataFrame.
     :param index: Name(s) of columns to use as identifier variables.
         Should be either a single column name, or a list/tuple of
@@ -1259,6 +1266,13 @@ def pivot_wider(
         0  5.5       20       25       30       37
         1  6.1       22       18       19       29
 
+
+    !!! abstract "Version Changed"
+
+        - 0.24.0
+            - Added `reset_index`, `names_expand` and `index_expand` parameters.
+
+
     :param df: A pandas DataFrame.
     :param index: Name(s) of columns to use as identifier variables.
         It should be either a single column name, or a list of column names.
@@ -1293,7 +1307,7 @@ def pivot_wider(
         Applies only if `index` is a categorical column. Default is `False`.
     :returns: A pandas DataFrame that has been unpivoted from long to wide
         form.
-    """
+    """  # noqa: E501
 
     df = df.copy()
 

diff --git a/janitor/functions/select.py b/janitor/functions/select.py
@@ -1,7 +1,7 @@
 import pandas_flavor as pf
 import pandas as pd
 from janitor.utils import deprecated_alias
-from janitor.functions.utils import _select
+from janitor.functions.utils import _select, DropLabel  # noqa: F401
 
 
 @pf.register_dataframe_method
@@ -24,7 +24,8 @@ def select_columns(
 
     Optional ability to invert selection of columns available as well.
 
-    !!! Note
+    !!!note
+
     The preferred option when selecting columns or rows in a Pandas DataFrame
     is with `.loc` or `.iloc` methods, as they are generally performant.
     `select_columns` is primarily for convenience.
@@ -57,7 +58,7 @@ def select_columns(
     :returns: A pandas DataFrame with the specified columns selected.
     """  # noqa: E501
 
-    return _select(df, args, invert, axis="columns")
+    return _select(df, args=args, invert=invert, axis="columns")
 
 
 @pf.register_dataframe_method
@@ -79,11 +80,17 @@ def select_rows(
 
     Optional ability to invert selection of rows available as well.
 
-    !!! Note
+
+    !!! info "New in version 0.24.0"
+
+
+    !!!note
+
     The preferred option when selecting columns or rows in a Pandas DataFrame
     is with `.loc` or `.iloc` methods, as they are generally performant.
     `select_rows` is primarily for convenience.
 
+
     Example:
 
         >>> import pandas as pd
@@ -113,5 +120,74 @@ def select_rows(
         provided.
     :returns: A pandas DataFrame with the specified rows selected.
     """  # noqa: E501
+    return _select(df, args=args, invert=invert, axis="index")
+
+
+@pf.register_dataframe_method
+def select(df: pd.DataFrame, *, rows=None, columns=None) -> pd.DataFrame:
+    """
+    Method-chainable selection of rows and columns.
+
+    It accepts a string, shell-like glob strings `(*string*)`,
+    regex, slice, array-like object, or a list of the previous options.
+
+    Selection on a MultiIndex on a level, or multiple levels,
+    is possible with a dictionary.
+
+    This method does not mutate the original DataFrame.
+
+    Selection can be inverted with the `DropLabel` class.
+
+
+    !!! info "New in version 0.24.0"
+
+
+    !!!note
+
+    The preferred option when selecting columns or rows in a Pandas DataFrame
+    is with `.loc` or `.iloc` methods, as they are generally performant.
+    `select` is primarily for convenience.
+
+
+    Example:
+
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]],
+        ...      index=['cobra', 'viper', 'sidewinder'],
+        ...      columns=['max_speed', 'shield'])
+        >>> df
+                    max_speed  shield
+        cobra               1       2
+        viper               4       5
+        sidewinder          7       8
+        >>> df.select(rows='cobra', columns='shield')
+               shield
+        cobra       2
+
+    Labels can be dropped with the `DropLabel` class:
+
+        >>> df.select(rows=DropLabel('cobra'))
+                    max_speed  shield
+        viper               4       5
+        sidewinder          7       8
+
+    :param df: A pandas DataFrame.
+    :param rows: Valid inputs include: an exact label to look for,
+        a shell-style glob string (e.g. `*_thing_*`),
+        a regular expression,
+        a callable,
+        or a list of all the aforementioned.
+        A sequence of booleans is also acceptable.
+        A dictionary can be used for selection on a MultiIndex on different levels.
+    :param columns: Valid inputs include: an exact label to look for,
+        a shell-style glob string (e.g. `*_thing_*`),
+        a regular expression,
+        a callable,
+        or a list of all the aforementioned.
+        A sequence of booleans is also acceptable.
+        A dictionary can be used for selection on a MultiIndex on different levels.
+    :returns: A pandas DataFrame with the specified rows and/or columns selected.
+    """  # noqa: E501
 
-    return _select(df, args, invert, axis="index")
+    return _select(df, args=None, rows=rows, columns=columns, axis="both")  # args is not used when axis="both"
diff --git a/janitor/functions/utils.py b/janitor/functions/utils.py
@@ -11,10 +11,11 @@
     Pattern,
     Union,
     Callable,
+    Any,
 )
 from pandas.core.dtypes.generic import ABCPandasArray, ABCExtensionArray
 from pandas.core.common import is_bool_indexer
-
+from dataclasses import dataclass
 
 import pandas as pd
 from janitor.utils import check, _expand_grid
@@ -269,6 +270,23 @@ def _select_callable(arg, func: Callable, axis=None):
     return bools
 
 
+@dataclass
+class DropLabel:
+    """
+    Helper class for removing labels within the `select` syntax.
+    `label` can be any of the types supported in the `select`,
+    `select_rows` and `select_columns` functions.
+    When resolved by `_select_index`, the integer positions not matching `label` are returned.
+
+    !!! info "New in version 0.24.0"
+
+    :param label: Label(s) to be dropped from the index.
+    :returns: A dataclass.
+    """
+
+    label: Any  # the wrapped selection; it is resolved later by the `_select_index` dispatch
+
+
 @singledispatch
 def _select_index(arg, df, axis):
     """
@@ -284,6 +302,27 @@ def _select_index(arg, df, axis):
         raise KeyError(f"No match was returned for {arg}") from exc
 
 
+@_select_index.register(DropLabel)  # noqa: F811
+def _column_sel_dispatch(cols, df, axis):  # noqa: F811
+    """
+    `_select_index` dispatch for `DropLabel` instances.
+    Returns the inverse of the passed label(s).
+
+    Returns an array of integers.
+    """
+    arr = _select_index(cols.label, df, axis)  # resolve the wrapped label(s) first
+    index = np.arange(getattr(df, axis).size)  # every integer position on this axis
+    if isinstance(arr, int):
+        arr = [arr]  # single position
+    elif isinstance(arr, slice):
+        arr = index[arr]  # materialize the slice into explicit positions
+    elif is_list_like(arr):
+        arr = np.asanyarray(arr)
+    if is_bool_dtype(arr):
+        return index[~arr]  # boolean mask: keep the positions where the mask is False
+    return np.setdiff1d(index, arr)  # integer positions: keep those not present in `arr`
+
+
 @_select_index.register(str)  # noqa: F811
 def _index_dispatch(arg, df, axis):  # noqa: F811
     """
@@ -437,7 +476,7 @@ def _index_dispatch(arg, df, axis):  # noqa: F811
                 f"{arg} is a boolean dtype and has wrong length: "
                 f"{len(arg)} instead of {len(index)}"
             )
-        return arg
+        return np.asanyarray(arg)
     try:
 
         if isinstance(arg, pd.Series):
@@ -486,17 +525,27 @@ def _index_dispatch(arg, df, axis):  # noqa: F811
 
         return arg
 
+    # treat multiple DropLabel instances as a single unit
+    checks = (isinstance(entry, DropLabel) for entry in arg)
+    if sum(checks) > 1:
+        drop_labels = (entry for entry in arg if isinstance(entry, DropLabel))
+        drop_labels = [entry.label for entry in drop_labels]
+        drop_labels = DropLabel(drop_labels)
+        arg = [entry for entry in arg if not isinstance(entry, DropLabel)]
+        arg.append(drop_labels)
+
     indices = [_select_index(entry, df, axis) for entry in arg]
 
     # single entry does not need to be combined
     # or materialized if possible;
     # this offers more performance
     if len(indices) == 1:
-        if isinstance(indices[0], int):
+        if is_scalar(indices[0]):
             return indices
-        if is_list_like(indices[0]):
-            return np.asanyarray(indices[0])
-        return indices[0]
+        indices = indices[0]
+        if is_list_like(indices):
+            indices = np.asanyarray(indices)
+        return indices
     contents = []
     for arr in indices:
         if is_list_like(arr):
@@ -508,19 +557,37 @@ def _index_dispatch(arg, df, axis):  # noqa: F811
         elif isinstance(arr, int):
             arr = [arr]
         contents.append(arr)
-    contents = np.concatenate(contents)
-    # remove possible duplicates
-    return pd.unique(contents)
+    return np.concatenate(contents)
 
 
 def _select(
-    df: pd.DataFrame, args: tuple, invert: bool, axis: str
+    df: pd.DataFrame,
+    args: tuple,
+    invert: bool = False,
+    axis: str = "index",
+    rows=None,
+    columns=None,
 ) -> pd.DataFrame:
     """
     Index DataFrame on the index or columns.
 
     Returns a DataFrame.
     """
+    assert axis in {"both", "index", "columns"}
+    if axis == "both":
+        if rows is None:
+            rows = slice(None)
+        else:
+            if not is_list_like(rows):
+                rows = [rows]
+            rows = _select_index(rows, df, axis="index")
+        if columns is None:
+            columns = slice(None)
+        else:
+            if not is_list_like(columns):
+                columns = [columns]
+            columns = _select_index(columns, df, axis="columns")
+        return df.iloc[rows, columns]
     indices = _select_index(list(args), df, axis)
     if invert:
         rev = np.ones(getattr(df, axis).size, dtype=np.bool8)