pyjanitor-devs · ericmjl · Nov 8, 2022 · Nov 1, 2022 · Nov 2, 2022 · Nov 2, 2022
diff --git a/janitor/functions/__init__.py b/janitor/functions/__init__.py
@@ -75,4 +75,4 @@
 from .transform_columns import transform_column, transform_columns
 from .truncate_datetime import truncate_datetime_dataframe
 from .update_where import update_where
-from .utils import patterns, unionize_dataframe_categories
+from .utils import patterns, unionize_dataframe_categories, DropLabel
diff --git a/janitor/functions/select.py b/janitor/functions/select.py
@@ -1,7 +1,7 @@
 import pandas_flavor as pf
 import pandas as pd
 from janitor.utils import deprecated_alias
-from janitor.functions.utils import _select
+from janitor.functions.utils import _select, DropLabel  # noqa: F401
 
 
 @pf.register_dataframe_method
@@ -24,7 +24,8 @@ def select_columns(
 
     Optional ability to invert selection of columns available as well.
 
-    !!! Note
+    !!!note
+
     The preferred option when selecting columns or rows in a Pandas DataFrame
     is with `.loc` or `.iloc` methods, as they are generally performant.
     `select_columns` is primarily for convenience.
@@ -57,7 +58,7 @@ def select_columns(
     :returns: A pandas DataFrame with the specified columns selected.
     """  # noqa: E501
 
-    return _select(df, args, invert, axis="columns")
+    return _select(df, args=args, invert=invert, axis="columns")
 
 
 @pf.register_dataframe_method
@@ -79,7 +80,8 @@ def select_rows(
 
     Optional ability to invert selection of rows available as well.
 
-    !!! Note
+    !!!note
+
     The preferred option when selecting columns or rows in a Pandas DataFrame
     is with `.loc` or `.iloc` methods, as they are generally performant.
     `select_rows` is primarily for convenience.
@@ -113,5 +115,68 @@ def select_rows(
         provided.
     :returns: A pandas DataFrame with the specified rows selected.
     """  # noqa: E501
+    return _select(df, args=args, invert=invert, axis="index")
+
+
+@pf.register_dataframe_method
+def select(df: pd.DataFrame, *, rows=None, columns=None) -> pd.DataFrame:
+    """
+    Method-chainable selection of rows and columns.
+
+    It accepts a string, shell-like glob strings `(*string*)`,
+    regex, slice, array-like object, or a list of the previous options.
+
+    Selection on a MultiIndex on a level, or multiple levels,
+    is possible with a dictionary.
+
+    This method does not mutate the original DataFrame.
+
+    Selection can be inverted with the `DropLabel` class.
+
+    !!!note
+
+        The preferred option when selecting columns or rows in a Pandas DataFrame
+        is with `.loc` or `.iloc` methods, as they are generally performant.
+        `select` is primarily for convenience.
+
+    Example:
+
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]],
+        ...      index=['cobra', 'viper', 'sidewinder'],
+        ...      columns=['max_speed', 'shield'])
+        >>> df
+                    max_speed  shield
+        cobra               1       2
+        viper               4       5
+        sidewinder          7       8
+        >>> df.select(rows='cobra', columns='shield')
+               shield
+        cobra       2
+
+    Labels can be dropped with the `DropLabel` class:
+        >>> df.select(rows=DropLabel('cobra'))
+                    max_speed  shield
+        viper               4       5
+        sidewinder          7       8
+
+    :param df: A pandas DataFrame.
+    :param rows: Valid inputs include: an exact label to look for,
+        a shell-style glob string (e.g. `*_thing_*`),
+        a regular expression,
+        a callable,
+        or a list of all the aforementioned.
+        A sequence of booleans is also acceptable.
+        A dictionary can be used for selection on a MultiIndex on different levels.
+    :param columns: Valid inputs include: an exact label to look for,
+        a shell-style glob string (e.g. `*_thing_*`),
+        a regular expression,
+        a callable,
+        or a list of all the aforementioned.
+        A sequence of booleans is also acceptable.
+        A dictionary can be used for selection on a MultiIndex on different levels.
+    :returns: A pandas DataFrame with the specified rows and/or columns selected.
+    """  # noqa: E501
 
-    return _select(df, args, invert, axis="index")
+    return _select(df, args=None, rows=rows, columns=columns, axis="both")
diff --git a/janitor/functions/utils.py b/janitor/functions/utils.py
@@ -11,10 +11,11 @@
     Pattern,
     Union,
     Callable,
+    Any,
 )
 from pandas.core.dtypes.generic import ABCPandasArray, ABCExtensionArray
 from pandas.core.common import is_bool_indexer
-
+from dataclasses import dataclass
 
 import pandas as pd
 from janitor.utils import check, _expand_grid
@@ -269,6 +270,19 @@ def _select_callable(arg, func: Callable, axis=None):
     return bools
 
 
+@dataclass
+class DropLabel:
+    """
+    Helper class for removing labels within the `select` syntax.
+    `label` can be any of the types supported in `_select_index`.
+    Selection then returns the integer positions of all labels not matched.
+    :param label: Label(s) to be dropped from the index.
+    :returns: A dataclass.
+    """
+
+    label: Any
+
+
 @singledispatch
 def _select_index(arg, df, axis):
     """
@@ -284,6 +298,27 @@ def _select_index(arg, df, axis):
         raise KeyError(f"No match was returned for {arg}") from exc
 
 
+@_select_index.register(DropLabel)  # noqa: F811
+def _column_sel_dispatch(cols, df, axis):  # noqa: F811
+    """
+    Dispatch for selection with a `DropLabel` on a Pandas Index object.
+    Returns the inverse of the passed label(s).
+
+    Returns an array of integers.
+    """
+    arr = _select_index(cols.label, df, axis)
+    index = np.arange(getattr(df, axis).size)
+    if isinstance(arr, int):
+        arr = [arr]
+    elif isinstance(arr, slice):
+        arr = index[arr]
+    elif is_list_like(arr):
+        arr = np.asanyarray(arr)
+    if is_bool_dtype(arr):
+        return index[~arr]
+    return np.setdiff1d(index, arr)
+
+
 @_select_index.register(str)  # noqa: F811
 def _index_dispatch(arg, df, axis):  # noqa: F811
     """
@@ -486,6 +521,15 @@ def _index_dispatch(arg, df, axis):  # noqa: F811
 
         return arg
 
+    # treat multiple DropLabel instances as a single unit
+    checks = (isinstance(entry, DropLabel) for entry in arg)
+    if sum(checks) > 1:
+        drop_labels = (entry for entry in arg if isinstance(entry, DropLabel))
+        drop_labels = [entry.label for entry in drop_labels]
+        drop_labels = DropLabel(drop_labels)
+        arg = [entry for entry in arg if not isinstance(entry, DropLabel)]
+        arg.append(drop_labels)
+
     indices = [_select_index(entry, df, axis) for entry in arg]
 
     # single entry does not need to be combined
@@ -508,19 +552,37 @@ def _index_dispatch(arg, df, axis):  # noqa: F811
         elif isinstance(arr, int):
             arr = [arr]
         contents.append(arr)
-    contents = np.concatenate(contents)
-    # remove possible duplicates
-    return pd.unique(contents)
+    return np.concatenate(contents)
 
 
 def _select(
-    df: pd.DataFrame, args: tuple, invert: bool, axis: str
+    df: pd.DataFrame,
+    args: tuple,
+    invert: bool = False,
+    axis: str = "index",
+    rows=None,
+    columns=None,
 ) -> pd.DataFrame:
     """
     Index DataFrame on the index or columns.
 
     Returns a DataFrame.
     """
+    assert axis in {"both", "index", "columns"}
+    if axis == "both":
+        if rows is None:
+            rows = slice(None)
+        else:
+            if not is_list_like(rows):
+                rows = [rows]
+            rows = _select_index(rows, df, axis="index")
+        if columns is None:
+            columns = slice(None)
+        else:
+            if not is_list_like(columns):
+                columns = [columns]
+            columns = _select_index(columns, df, axis="columns")
+        return df.iloc[rows, columns]
     indices = _select_index(list(args), df, axis)
     if invert:
         rev = np.ones(getattr(df, axis).size, dtype=np.bool8)

diff --git a/tests/functions/test_select.py b/tests/functions/test_select.py
@@ -0,0 +1,60 @@
+import pandas as pd
+import numpy as np
+import pytest
+from pandas.testing import assert_frame_equal
+
+from janitor.functions.utils import DropLabel
+
+
+@pytest.fixture
+def dataframe():
+    """Base DataFrame"""
+    arrays = [
+        ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
+        ["one", "two", "one", "two", "one", "two", "one", "two"],
+    ]
+    tuples = list(zip(*arrays))
+    index = pd.MultiIndex.from_tuples(tuples, names=["A", "B"])
+    return pd.DataFrame(
+        np.random.randint(9, size=(8, 2)),
+        index=index,
+        columns=["col1", "col2"],
+    )
+
+
+def test_select_rows_only(dataframe):
+    """Test output for rows only"""
+    actual = dataframe.select(rows={"B": "two"})
+    expected = dataframe.loc(axis=0)[(slice(None), "two")]
+    assert_frame_equal(actual, expected)
+
+
+def test_select_rows_scalar_(dataframe):
+    """Test output for a scalar row label"""
+    actual = dataframe.select(rows="bar")
+    expected = dataframe.xs("bar", axis=0, level=0, drop_level=False)
+    assert_frame_equal(actual, expected)
+
+
+def test_select_columns_only(dataframe):
+    """Test output for columns only"""
+    actual = dataframe.select(columns=["col1", "col2"])
+    expected = dataframe.loc[:, :]
+    assert_frame_equal(actual, expected)
+
+
+def test_select_columns_scalar(dataframe):
+    """Test output for a glob string on columns"""
+    actual = dataframe.select(columns="col*")
+    expected = dataframe.loc[:, :]
+    assert_frame_equal(actual, expected)
+
+
+def test_select_rows_and_columns(dataframe):
+    """Test output for both rows and columns"""
+    actual = dataframe.select(
+        rows=DropLabel({"A": lambda df: df == "foo"}),
+        columns=DropLabel(slice("col2", None)),
+    )
+    expected = dataframe.loc[["bar", "baz", "qux"], ["col1"]]
+    assert_frame_equal(actual, expected)
diff --git a/tests/functions/test_select_columns.py b/tests/functions/test_select_columns.py
@@ -6,7 +6,7 @@
 from pandas.testing import assert_frame_equal
 from itertools import product
 
-from janitor.functions.utils import patterns
+from janitor.functions.utils import patterns, DropLabel
 
 
 @pytest.mark.functions
@@ -25,6 +25,32 @@ def test_select_column_names(dataframe, invert, expected):
     assert_frame_equal(df, dataframe[expected])
 
 
+@pytest.mark.functions
+@pytest.mark.parametrize(
+    "invert,expected",
+    [
+        (True, ["a", "Bell__Chart", "cities"]),
+        (False, ["decorated-elephant", "animals@#$%^"]),
+    ],
+)
+def test_select_column_names_droplabel(dataframe, invert, expected):
+    "Test DropLabel in select_columns, with and without invert"
+    columns = ["a", "Bell__Chart", "cities"]
+    df = dataframe.select_columns(DropLabel(columns), invert=invert)
+
+    assert_frame_equal(df, dataframe[expected])
+
+
+@pytest.mark.functions
+def test_select_column_names_droplabel_multiple(dataframe):
+    "Test multiple DropLabel arguments in select_columns"
+    columns = ["a", "Bell__Chart", "cities"]
+    cols = [DropLabel(ent) for ent in columns]
+    df = dataframe.select_columns(*cols)
+
+    assert_frame_equal(df, dataframe.drop(columns=columns))
+
+
 @pytest.mark.functions
 @pytest.mark.parametrize(
     "invert,expected",
@@ -57,6 +83,7 @@ def test_select_column_names_missing_columns(dataframe, columns):
         dataframe.select_columns(columns)
 
 
+@pytest.mark.xfail(reason="return whatever user passes")
 @pytest.mark.functions
 @pytest.mark.parametrize(
     "invert,expected",
@@ -394,8 +421,7 @@ def test_boolean_list_multi(multiindex):
 
 def test_series_multi(multiindex):
     """Test pd.Series output on a MultiIndex"""
-    mapp = pd.Series(["bar"])
-    expected = multiindex.select_columns(mapp, slice("foo"))
+    expected = multiindex.select_columns(pd.Series("bar"), slice("baz", "foo"))
     actual = multiindex.loc(axis=1)["bar":"foo"]
     assert_frame_equal(expected, actual)