Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] generic select function #1187

Merged
merged 13 commits into from
Nov 8, 2022
2 changes: 1 addition & 1 deletion janitor/functions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,4 +75,4 @@
from .transform_columns import transform_column, transform_columns
from .truncate_datetime import truncate_datetime_dataframe
from .update_where import update_where
from .utils import patterns, unionize_dataframe_categories
from .utils import patterns, unionize_dataframe_categories, DropLabel
75 changes: 70 additions & 5 deletions janitor/functions/select.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pandas_flavor as pf
import pandas as pd
from janitor.utils import deprecated_alias
from janitor.functions.utils import _select
from janitor.functions.utils import _select, DropLabel # noqa: F401


@pf.register_dataframe_method
Expand All @@ -24,7 +24,8 @@ def select_columns(

Optional ability to invert selection of columns available as well.

!!! Note
!!!note

The preferred option when selecting columns or rows in a Pandas DataFrame
is with `.loc` or `.iloc` methods, as they are generally performant.
`select_columns` is primarily for convenience.
Expand Down Expand Up @@ -57,7 +58,7 @@ def select_columns(
:returns: A pandas DataFrame with the specified columns selected.
""" # noqa: E501

return _select(df, args, invert, axis="columns")
return _select(df, args=args, invert=invert, axis="columns")


@pf.register_dataframe_method
Expand All @@ -79,7 +80,8 @@ def select_rows(

Optional ability to invert selection of rows available as well.

!!! Note
!!!note

The preferred option when selecting columns or rows in a Pandas DataFrame
is with `.loc` or `.iloc` methods, as they are generally performant.
`select_rows` is primarily for convenience.
Expand Down Expand Up @@ -113,5 +115,68 @@ def select_rows(
provided.
:returns: A pandas DataFrame with the specified rows selected.
""" # noqa: E501
return _select(df, args=args, invert=invert, axis="index")


@pf.register_dataframe_method
def select(df: pd.DataFrame, *, rows=None, columns=None) -> pd.DataFrame:
"""
Method-chainable selection of rows and columns.

It accepts a string, shell-like glob strings `(*string*)`,
regex, slice, array-like object, or a list of the previous options.

Selection on a MultiIndex on a level, or multiple levels,
is possible with a dictionary.

This method does not mutate the original DataFrame.

Selection can be inverted with the `DropLabel` class.

!!!note

The preferred option when selecting columns or rows in a Pandas DataFrame
is with `.loc` or `.iloc` methods, as they are generally performant.
`select` is primarily for convenience.

Example:

>>> import pandas as pd
>>> import janitor
>>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]],
... index=['cobra', 'viper', 'sidewinder'],
... columns=['max_speed', 'shield'])
>>> df
max_speed shield
cobra 1 2
viper 4 5
sidewinder 7 8
>>> df.select(rows='cobra', columns='shield')
shield
cobra 2

Labels can be dropped with the `DropLabel` class:
>>> df.select(rows=DropLabel('cobra'))
max_speed shield
viper 4 5
sidewinder 7 8

:param df: A pandas DataFrame.
:param rows: Valid inputs include: an exact label to look for,
a shell-style glob string (e.g. `*_thing_*`),
a regular expression,
a callable,
or a list of all the aforementioned.
A sequence of booleans is also acceptable.
A dictionary can be used for selection on a MultiIndex on different levels.
:param columns: Valid inputs include: an exact label to look for,
a shell-style glob string (e.g. `*_thing_*`),
a regular expression,
a callable,
or a list of all the aforementioned.
A sequence of booleans is also acceptable.
A dictionary can be used for selection on a MultiIndex on different levels.
:returns: A pandas DataFrame with the specified rows and/or columns selected.
""" # noqa: E501

return _select(df, args, invert, axis="index")
return _select(df, args=None, rows=rows, columns=columns, axis="both")
72 changes: 67 additions & 5 deletions janitor/functions/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,11 @@
Pattern,
Union,
Callable,
Any,
)
from pandas.core.dtypes.generic import ABCPandasArray, ABCExtensionArray
from pandas.core.common import is_bool_indexer

from dataclasses import dataclass

import pandas as pd
from janitor.utils import check, _expand_grid
Expand Down Expand Up @@ -269,6 +270,19 @@ def _select_callable(arg, func: Callable, axis=None):
return bools


@dataclass
class DropLabel:
    """
    Helper class for removing labels within the `select` syntax.

    `label` can be any of the types supported in `_select_index`.
    The instance only stores the label(s); when it is passed to
    `_select_index`, the dispatch registered for `DropLabel` returns
    an array of integer positions that do not match the label(s).

    :param label: Label(s) to be dropped from the index.
    """

    # Label(s) to exclude; forwarded unchanged to `_select_index`.
    label: Any


@singledispatch
def _select_index(arg, df, axis):
"""
Expand All @@ -284,6 +298,27 @@ def _select_index(arg, df, axis):
raise KeyError(f"No match was returned for {arg}") from exc


@_select_index.register(DropLabel)  # noqa: F811
def _column_sel_dispatch(cols, df, axis):  # noqa: F811
    """
    Inverse selection for a `DropLabel` on the index or columns.

    The wrapped label(s) are first resolved with `_select_index`;
    every position along `axis` that is *not* part of that match
    is then returned.

    Returns an array of integers.
    """
    matched = _select_index(cols.label, df, axis)
    positions = np.arange(getattr(df, axis).size)

    # Normalise whatever `_select_index` returned into something
    # that can be compared against the integer positions.
    if isinstance(matched, int):
        matched = [matched]
    elif isinstance(matched, slice):
        matched = positions[matched]
    elif is_list_like(matched):
        matched = np.asanyarray(matched)

    # A boolean mask is inverted directly; anything else is removed
    # from the full set of positions.
    if not is_bool_dtype(matched):
        return np.setdiff1d(positions, matched)
    return positions[~matched]


@_select_index.register(str) # noqa: F811
def _index_dispatch(arg, df, axis): # noqa: F811
"""
Expand Down Expand Up @@ -486,6 +521,15 @@ def _index_dispatch(arg, df, axis): # noqa: F811

return arg

# treat multiple DropLabel instances as a single unit
checks = (isinstance(entry, DropLabel) for entry in arg)
if sum(checks) > 1:
drop_labels = (entry for entry in arg if isinstance(entry, DropLabel))
drop_labels = [entry.label for entry in drop_labels]
drop_labels = DropLabel(drop_labels)
arg = [entry for entry in arg if not isinstance(entry, DropLabel)]
arg.append(drop_labels)

indices = [_select_index(entry, df, axis) for entry in arg]

# single entry does not need to be combined
Expand All @@ -508,19 +552,37 @@ def _index_dispatch(arg, df, axis): # noqa: F811
elif isinstance(arr, int):
arr = [arr]
contents.append(arr)
contents = np.concatenate(contents)
# remove possible duplicates
return pd.unique(contents)
return np.concatenate(contents)


def _select(
df: pd.DataFrame, args: tuple, invert: bool, axis: str
df: pd.DataFrame,
args: tuple,
invert: bool = False,
axis: str = "index",
rows=None,
columns=None,
) -> pd.DataFrame:
"""
Index DataFrame on the index or columns.

Returns a DataFrame.
"""
assert axis in {"both", "index", "columns"}
if axis == "both":
if rows is None:
rows = slice(None)
else:
if not is_list_like(rows):
rows = [rows]
rows = _select_index(rows, df, axis="index")
if columns is None:
columns = slice(None)
else:
if not is_list_like(columns):
columns = [columns]
columns = _select_index(columns, df, axis="columns")
return df.iloc[rows, columns]
indices = _select_index(list(args), df, axis)
if invert:
rev = np.ones(getattr(df, axis).size, dtype=np.bool8)
Expand Down
60 changes: 60 additions & 0 deletions tests/functions/test_select.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import pandas as pd
import numpy as np
import pytest
from pandas.testing import assert_frame_equal

from janitor.functions.utils import DropLabel


@pytest.fixture
def dataframe():
    """
    Base DataFrame with a two-level row MultiIndex (`A`, `B`)
    and two integer columns (`col1`, `col2`).

    The values are drawn from a seeded generator so that every run
    sees the same data and any failure is reproducible.
    """
    arrays = [
        ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
        ["one", "two", "one", "two", "one", "two", "one", "two"],
    ]
    index = pd.MultiIndex.from_arrays(arrays, names=["A", "B"])
    # Same value range as `np.random.randint(9, ...)`: integers in [0, 9).
    values = np.random.default_rng(0).integers(0, 9, size=(8, 2))
    return pd.DataFrame(
        values,
        index=index,
        columns=["col1", "col2"],
    )


def test_select_rows_only(dataframe):
    """Test row selection on a single MultiIndex level via a dictionary."""
    result = dataframe.select(rows={"B": "two"})
    wanted = dataframe.loc(axis=0)[pd.IndexSlice[:, "two"]]
    assert_frame_equal(result, wanted)


def test_select_rows_scalar_(dataframe):
    """Test row selection with a scalar label on a MultiIndex."""
    assert_frame_equal(
        dataframe.select(rows="bar"),
        dataframe.xs("bar", axis=0, level=0, drop_level=False),
    )


def test_select_columns_only(dataframe):
    """Test column selection with a list of exact labels."""
    result = dataframe.select(columns=["col1", "col2"])
    wanted = dataframe.loc[:, :]
    assert_frame_equal(result, wanted)


def test_select_columns_scalar(dataframe):
    """Test column selection with a single shell-style glob string."""
    result = dataframe.select(columns="col*")
    wanted = dataframe.loc[:, :]
    assert_frame_equal(result, wanted)


def test_select_rows_and_columns(dataframe):
    """Test `DropLabel` applied to rows and columns in the same call."""
    drop_foo_rows = DropLabel({"A": lambda df: df == "foo"})
    drop_from_col2 = DropLabel(slice("col2", None))
    result = dataframe.select(rows=drop_foo_rows, columns=drop_from_col2)
    wanted = dataframe.loc[["bar", "baz", "qux"], ["col1"]]
    assert_frame_equal(result, wanted)
32 changes: 29 additions & 3 deletions tests/functions/test_select_columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from pandas.testing import assert_frame_equal
from itertools import product

from janitor.functions.utils import patterns
from janitor.functions.utils import patterns, DropLabel


@pytest.mark.functions
Expand All @@ -25,6 +25,32 @@ def test_select_column_names(dataframe, invert, expected):
assert_frame_equal(df, dataframe[expected])


@pytest.mark.functions
@pytest.mark.parametrize(
    "invert,expected",
    [
        (True, ["a", "Bell__Chart", "cities"]),
        (False, ["decorated-elephant", "animals@#$%^"]),
    ],
)
def test_select_column_names_droplabel(dataframe, invert, expected):
    """Test `select_columns` with a `DropLabel`, with and without `invert`."""
    dropped = DropLabel(["a", "Bell__Chart", "cities"])
    result = dataframe.select_columns(dropped, invert=invert)

    assert_frame_equal(result, dataframe[expected])


@pytest.mark.functions
def test_select_column_names_droplabel_multiple(dataframe):
    """Test `select_columns` with several `DropLabel` arguments at once."""
    columns = ["a", "Bell__Chart", "cities"]
    result = dataframe.select_columns(*map(DropLabel, columns))

    assert_frame_equal(result, dataframe.drop(columns=columns))


@pytest.mark.functions
@pytest.mark.parametrize(
"invert,expected",
Expand Down Expand Up @@ -57,6 +83,7 @@ def test_select_column_names_missing_columns(dataframe, columns):
dataframe.select_columns(columns)


@pytest.mark.xfail(reason="return whatever user passes")
@pytest.mark.functions
@pytest.mark.parametrize(
"invert,expected",
Expand Down Expand Up @@ -394,8 +421,7 @@ def test_boolean_list_multi(multiindex):

def test_series_multi(multiindex):
"""Test pd.Series output on a MultiIndex"""
mapp = pd.Series(["bar"])
expected = multiindex.select_columns(mapp, slice("foo"))
expected = multiindex.select_columns(pd.Series("bar"), slice("baz", "foo"))
actual = multiindex.loc(axis=1)["bar":"foo"]
assert_frame_equal(expected, actual)

Expand Down