Skip to content

Commit

Permalink
[DOC] MWE for label_encode, factorize_columns, `encode_categorical` (#1028)
Browse files Browse the repository at this point in the history

* Add MWE for label_encode and factorize_columns

* Fix some formatting for encode_categorical

* Simplify encode_categorical logic slightly

* Fix formatting

* Make factorize_columns non-mutating
  • Loading branch information
thatlittleboy authored Mar 2, 2022
1 parent 270498e commit 92251b5
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 68 deletions.
31 changes: 14 additions & 17 deletions janitor/functions/encode_categorical.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import warnings
from enum import Enum
from typing import Hashable, Iterable, Union

import pandas_flavor as pf
import pandas as pd
from pandas.api.types import is_list_like
import warnings

from janitor.utils import check, check_column, deprecated_alias
from enum import Enum


@pf.register_dataframe_method
Expand Down Expand Up @@ -90,17 +92,15 @@ def encode_categorical(
>>> enc_df["foo"].cat.ordered
True
:param df: A pandas DataFrame object.
:param column_names: A column name or an iterable (list or tuple)
of column names.
:param kwargs: A mapping from column name to either `None`,
`sort` or `appearance`, or a 1-D array. This is useful
:param **kwargs: A mapping from column name to either `None`,
`'sort'` or `'appearance'`, or a 1-D array. This is useful
in creating categorical columns that are ordered, or
if the user needs to explicitly specify the categories.
:returns: A pandas DataFrame.
:raises ValueError: if both `column_names` and `kwargs` are provided.
:raises ValueError: If both `column_names` and `kwargs` are provided.
""" # noqa: E501

if all((column_names, kwargs)):
Expand All @@ -112,13 +112,11 @@ def encode_categorical(
# or user supplies specific categories to create the categorical
if column_names is not None:
check("column_names", column_names, [list, tuple, Hashable])
if isinstance(column_names, (list, tuple)):
check_column(df, column_names)
dtypes = {col: "category" for col in column_names}
return df.astype(dtypes)
if isinstance(column_names, Hashable):
check_column(df, [column_names])
return df.astype({column_names: "category"})
column_names = [column_names]
check_column(df, column_names)
dtypes = {col: "category" for col in column_names}
return df.astype(dtypes)

return _computations_as_categorical(df, **kwargs)

Expand Down Expand Up @@ -167,21 +165,20 @@ def _as_categorical_checks(df: pd.DataFrame, **kwargs) -> dict:
This function raises errors if columns in `kwargs` are
absent from the dataframe's columns.
It also raises errors if the value in `kwargs`
is not a string (`appearance` or `sort`), or a 1D array.
is not a string (`'appearance'` or `'sort'`), or a 1D array.
This function is executed before proceeding to the computation phase.
If all checks pass, a dictionary of column names and value is returned.
:param df: The pandas DataFrame object.
:param kwargs: A pairing of column name and value.
:param **kwargs: A pairing of column name and value.
:returns: A dictionary.
:raises TypeError: If `value` is not a 1-D array, or a string.
:raises ValueError: If `value` is a 1-D array, and contains nulls,
or is non-unique.
"""

# column checks
check_column(df, kwargs)

categories_dict = {}
Expand Down Expand Up @@ -255,7 +252,7 @@ def _as_categorical_checks(df: pd.DataFrame, **kwargs) -> dict:
category_order_types = {ent.value for ent in _CategoryOrder}
if value.lower() not in category_order_types:
raise ValueError(
"argument should be one of `appearance` or `sort`."
"Argument should be one of 'appearance' or 'sort'."
)

categories_dict[column_name] = value
Expand Down
64 changes: 32 additions & 32 deletions janitor/functions/factorize_columns.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
"""Implementation of the `factorize_columns` function"""
from typing import Hashable, Iterable, Union
import pandas_flavor as pf
import pandas as pd
Expand All @@ -13,52 +14,51 @@ def factorize_columns(
**kwargs,
) -> pd.DataFrame:
"""
Converts labels into numerical data
Converts labels into numerical data.
This method will create a new column with the string `_enc` appended
after the original column's name.
This can be overridden with the suffix parameter.
Internally this method uses pandas `factorize` method.
Internally, this method uses pandas `factorize` method.
It also accepts an optional suffix and keyword arguments.
An empty string as suffix will override the existing column.
This method mutates the original DataFrame.
This method does not mutate the original DataFrame.
Functional usage syntax:
Example:
```python
df = factorize_columns(
df,
column_names="my_categorical_column",
suffix="_enc"
) # one way
```
Method chaining syntax:
```python
import pandas as pd
import janitor
categorical_cols = ['col1', 'col2', 'col4']
df = (
pd.DataFrame(...)
.factorize_columns(
column_names=categorical_cols,
suffix="_enc"
)
)
```
>>> import pandas as pd
>>> import janitor
>>> df = pd.DataFrame({
... "foo": ["b", "b", "a", "c", "b"],
... "bar": range(4, 9),
... })
>>> df
foo bar
0 b 4
1 b 5
2 a 6
3 c 7
4 b 8
>>> df.factorize_columns(column_names="foo")
foo bar foo_enc
0 b 4 0
1 b 5 0
2 a 6 1
3 c 7 2
4 b 8 0
:param df: The pandas DataFrame object.
:param column_names: A column name or an iterable (list
or tuple) of column names.
:param suffix: Suffix to be used for the new column. Default value is _enc.
An empty string suffix means, it will override the existing column
:param column_names: A column name or an iterable (list or tuple) of
column names.
:param suffix: Suffix to be used for the new column.
An empty string suffix means it will override the existing column.
:param **kwargs: Keyword arguments. It takes any of the keyword arguments,
which the pandas factorize method takes like sort,na_sentinel,size_hint
which the pandas factorize method takes like `sort`, `na_sentinel`,
`size_hint`.
:returns: A pandas DataFrame.
"""
df = _factorize(df, column_names, suffix, **kwargs)
df = _factorize(df.copy(), column_names, suffix, **kwargs)
return df
58 changes: 39 additions & 19 deletions janitor/functions/label_encode.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
"""Implementation of `label_encode` function"""
from typing import Hashable, Iterable, Union
import warnings
import pandas_flavor as pf
Expand All @@ -10,43 +11,62 @@
@pf.register_dataframe_method
@deprecated_alias(columns="column_names")
def label_encode(
df: pd.DataFrame, column_names: Union[str, Iterable[str], Hashable]
df: pd.DataFrame,
column_names: Union[str, Iterable[str], Hashable],
) -> pd.DataFrame:
"""
Convert labels into numerical data.
This method will create a new column with the string `_enc` appended
after the original column's name. Consider this to be syntactic sugar.
after the original column's name.
Consider this to be syntactic sugar.
This function uses the `factorize` pandas function under the hood.
This method behaves differently from `encode_categorical`. This method
creates a new column of numeric data. `encode_categorical` replaces the
dtype of the original column with a *categorical* dtype.
This method behaves differently from
[`encode_categorical`][janitor.functions.encode_categorical.encode_categorical].
This method creates a new column of numeric data.
[`encode_categorical`][janitor.functions.encode_categorical.encode_categorical]
replaces the dtype of the original column with a *categorical* dtype.
This method mutates the original DataFrame.
Functional usage syntax:
Example:
```python
df = label_encode(df, column_names="my_categorical_column") # one way
```
>>> import pandas as pd
>>> import janitor
>>> df = pd.DataFrame({
... "foo": ["b", "b", "a", "c", "b"],
... "bar": range(4, 9),
... })
>>> df
foo bar
0 b 4
1 b 5
2 a 6
3 c 7
4 b 8
>>> df.label_encode(column_names="foo")
foo bar foo_enc
0 b 4 0
1 b 5 0
2 a 6 1
3 c 7 2
4 b 8 0
Method chaining syntax:
!!!note
```python
import pandas as pd
import janitor
categorical_cols = ['col1', 'col2', 'col4']
df = pd.DataFrame(...).label_encode(column_names=categorical_cols)
```
This function will be deprecated in a 1.x release.
Please use [`factorize_columns`][janitor.functions.factorize_columns.factorize_columns]
instead.
:param df: The pandas DataFrame object.
:param column_names: A column name or an iterable (list
or tuple) of column names.
:returns: A pandas DataFrame.
"""
""" # noqa: E501
warnings.warn(
"label_encode will be deprecated in a 1.x release. \
Please use factorize_columns instead"
"`label_encode` will be deprecated in a 1.x release. "
"Please use `factorize_columns` instead."
)
df = _factorize(df, column_names, "_enc")
return df

0 comments on commit 92251b5

Please sign in to comment.