Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DOC] MWE for label_encode, factorize_columns, encode_categorical #1028

Merged
merged 5 commits into from
Mar 2, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 14 additions & 17 deletions janitor/functions/encode_categorical.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import warnings
from enum import Enum
from typing import Hashable, Iterable, Union

import pandas_flavor as pf
import pandas as pd
from pandas.api.types import is_list_like
import warnings

from janitor.utils import check, check_column, deprecated_alias
from enum import Enum


@pf.register_dataframe_method
Expand Down Expand Up @@ -90,17 +92,15 @@ def encode_categorical(
>>> enc_df["foo"].cat.ordered
True



:param df: A pandas DataFrame object.
:param column_names: A column name or an iterable (list or tuple)
of column names.
:param kwargs: A mapping from column name to either `None`,
`sort` or `appearance`, or a 1-D array. This is useful
:param **kwargs: A mapping from column name to either `None`,
`'sort'` or `'appearance'`, or a 1-D array. This is useful
in creating categorical columns that are ordered, or
if the user needs to explicitly specify the categories.
:returns: A pandas DataFrame.
:raises ValueError: if both `column_names` and `kwargs` are provided.
:raises ValueError: If both `column_names` and `kwargs` are provided.
""" # noqa: E501

if all((column_names, kwargs)):
Expand All @@ -112,13 +112,11 @@ def encode_categorical(
# or user supplies specific categories to create the categorical
if column_names is not None:
check("column_names", column_names, [list, tuple, Hashable])
if isinstance(column_names, (list, tuple)):
check_column(df, column_names)
dtypes = {col: "category" for col in column_names}
return df.astype(dtypes)
if isinstance(column_names, Hashable):
check_column(df, [column_names])
return df.astype({column_names: "category"})
column_names = [column_names]
check_column(df, column_names)
dtypes = {col: "category" for col in column_names}
return df.astype(dtypes)

return _computations_as_categorical(df, **kwargs)

Expand Down Expand Up @@ -167,21 +165,20 @@ def _as_categorical_checks(df: pd.DataFrame, **kwargs) -> dict:
This function raises errors if columns in `kwargs` are
absent from the dataframe's columns.
It also raises errors if the value in `kwargs`
is not a string (`appearance` or `sort`), or a 1D array.
is not a string (`'appearance'` or `'sort'`), or a 1D array.

This function is executed before proceeding to the computation phase.

If all checks pass, a dictionary of column names and value is returned.

:param df: The pandas DataFrame object.
:param kwargs: A pairing of column name and value.
:param **kwargs: A pairing of column name and value.
:returns: A dictionary.
:raises TypeError: If `value` is not a 1-D array, or a string.
:raises ValueError: If `value` is a 1-D array, and contains nulls,
or is non-unique.
"""

# column checks
check_column(df, kwargs)

categories_dict = {}
Expand Down Expand Up @@ -255,7 +252,7 @@ def _as_categorical_checks(df: pd.DataFrame, **kwargs) -> dict:
category_order_types = {ent.value for ent in _CategoryOrder}
if value.lower() not in category_order_types:
raise ValueError(
"argument should be one of `appearance` or `sort`."
"Argument should be one of 'appearance' or 'sort'."
)

categories_dict[column_name] = value
Expand Down
64 changes: 32 additions & 32 deletions janitor/functions/factorize_columns.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
"""Implementation of the `factorize_columns` function"""
from typing import Hashable, Iterable, Union
import pandas_flavor as pf
import pandas as pd
Expand All @@ -13,52 +14,51 @@ def factorize_columns(
**kwargs,
) -> pd.DataFrame:
"""
Converts labels into numerical data
Converts labels into numerical data.

This method will create a new column with the string `_enc` appended
after the original column's name.
This can be overridden with the suffix parameter.

Internally this method uses pandas `factorize` method.
Internally, this method uses pandas `factorize` method.
It also accepts an optional suffix and additional keyword arguments.
An empty string as suffix will override the existing column.

This method mutates the original DataFrame.
This method does not mutate the original DataFrame.

Functional usage syntax:
Example:

```python
df = factorize_columns(
df,
column_names="my_categorical_column",
suffix="_enc"
) # one way
```

Method chaining syntax:

```python
import pandas as pd
import janitor
categorical_cols = ['col1', 'col2', 'col4']
df = (
pd.DataFrame(...)
.factorize_columns(
column_names=categorical_cols,
suffix="_enc"
)
)
```
>>> import pandas as pd
>>> import janitor
>>> df = pd.DataFrame({
... "foo": ["b", "b", "a", "c", "b"],
... "bar": range(4, 9),
... })
>>> df
foo bar
0 b 4
1 b 5
2 a 6
3 c 7
4 b 8
>>> df.factorize_columns(column_names="foo")
foo bar foo_enc
0 b 4 0
1 b 5 0
2 a 6 1
3 c 7 2
4 b 8 0

:param df: The pandas DataFrame object.
:param column_names: A column name or an iterable (list
or tuple) of column names.
:param suffix: Suffix to be used for the new column. Default value is _enc.
An empty string suffix means, it will override the existing column
:param column_names: A column name or an iterable (list or tuple) of
column names.
:param suffix: Suffix to be used for the new column.
An empty string suffix means the existing column will be overwritten.
:param **kwargs: Keyword arguments. It takes any of the keyword arguments,
which the pandas factorize method takes like sort,na_sentinel,size_hint
which the pandas factorize method takes like `sort`, `na_sentinel`,
`size_hint`.

:returns: A pandas DataFrame.
"""
df = _factorize(df, column_names, suffix, **kwargs)
df = _factorize(df.copy(), column_names, suffix, **kwargs)
return df
58 changes: 39 additions & 19 deletions janitor/functions/label_encode.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
"""Implementation of `label_encode` function"""
from typing import Hashable, Iterable, Union
import warnings
import pandas_flavor as pf
Expand All @@ -10,43 +11,62 @@
@pf.register_dataframe_method
@deprecated_alias(columns="column_names")
def label_encode(
df: pd.DataFrame, column_names: Union[str, Iterable[str], Hashable]
df: pd.DataFrame,
column_names: Union[str, Iterable[str], Hashable],
) -> pd.DataFrame:
"""
Convert labels into numerical data.

This method will create a new column with the string `_enc` appended
after the original column's name. Consider this to be syntactic sugar.
after the original column's name.
Consider this to be syntactic sugar.
This function uses the `factorize` pandas function under the hood.

This method behaves differently from `encode_categorical`. This method
creates a new column of numeric data. `encode_categorical` replaces the
dtype of the original column with a *categorical* dtype.
This method behaves differently from
[`encode_categorical`][janitor.functions.encode_categorical.encode_categorical].
This method creates a new column of numeric data.
[`encode_categorical`][janitor.functions.encode_categorical.encode_categorical]
replaces the dtype of the original column with a *categorical* dtype.

This method mutates the original DataFrame.

Functional usage syntax:
Example:

```python
df = label_encode(df, column_names="my_categorical_column") # one way
```
>>> import pandas as pd
>>> import janitor
>>> df = pd.DataFrame({
... "foo": ["b", "b", "a", "c", "b"],
... "bar": range(4, 9),
... })
>>> df
foo bar
0 b 4
1 b 5
2 a 6
3 c 7
4 b 8
>>> df.label_encode(column_names="foo")
foo bar foo_enc
0 b 4 0
1 b 5 0
2 a 6 1
3 c 7 2
4 b 8 0

Method chaining syntax:
!!!note

```python
import pandas as pd
import janitor
categorical_cols = ['col1', 'col2', 'col4']
df = pd.DataFrame(...).label_encode(column_names=categorical_cols)
```
This function will be deprecated in a 1.x release.
Please use [`factorize_columns`][janitor.functions.factorize_columns.factorize_columns]
instead.

:param df: The pandas DataFrame object.
:param column_names: A column name or an iterable (list
or tuple) of column names.
:returns: A pandas DataFrame.
"""
""" # noqa: E501
warnings.warn(
"label_encode will be deprecated in a 1.x release. \
Please use factorize_columns instead"
"`label_encode` will be deprecated in a 1.x release. "
"Please use `factorize_columns` instead."
)
df = _factorize(df, column_names, "_enc")
return df