From 92251b55462a4c494b2c4c14de75a95625cb666a Mon Sep 17 00:00:00 2001 From: Jeremy Goh <30731072+thatlittleboy@users.noreply.github.com> Date: Wed, 2 Mar 2022 22:45:56 +0800 Subject: [PATCH] [DOC] MWE for `label_encode`, `factorize_columns`, `encode_categorical` (#1028) * Add MWE for label_encode and factorize_columns * Fix some formatting for encode_categorical * Simplify encode_categorical logic slightly * Fix formatting * Make factorize_columns non-mutating --- janitor/functions/encode_categorical.py | 31 ++++++------ janitor/functions/factorize_columns.py | 64 ++++++++++++------------- janitor/functions/label_encode.py | 58 ++++++++++++++-------- 3 files changed, 85 insertions(+), 68 deletions(-) diff --git a/janitor/functions/encode_categorical.py b/janitor/functions/encode_categorical.py index 016b61e23..0800ab365 100644 --- a/janitor/functions/encode_categorical.py +++ b/janitor/functions/encode_categorical.py @@ -1,10 +1,12 @@ +import warnings +from enum import Enum from typing import Hashable, Iterable, Union + import pandas_flavor as pf import pandas as pd from pandas.api.types import is_list_like -import warnings + from janitor.utils import check, check_column, deprecated_alias -from enum import Enum @pf.register_dataframe_method @@ -90,17 +92,15 @@ def encode_categorical( >>> enc_df["foo"].cat.ordered True - - :param df: A pandas DataFrame object. :param column_names: A column name or an iterable (list or tuple) of column names. - :param kwargs: A mapping from column name to either `None`, - `sort` or `appearance`, or a 1-D array. This is useful + :param **kwargs: A mapping from column name to either `None`, + `'sort'` or `'appearance'`, or a 1-D array. This is useful in creating categorical columns that are ordered, or if the user needs to explicitly specify the categories. :returns: A pandas DataFrame. - :raises ValueError: if both `column_names` and `kwargs` are provided. + :raises ValueError: If both `column_names` and `kwargs` are provided. """ # noqa: E501 if all((column_names, kwargs)): @@ -112,13 +112,11 @@ def encode_categorical( # or user supplies specific categories to create the categorical if column_names is not None: check("column_names", column_names, [list, tuple, Hashable]) - if isinstance(column_names, (list, tuple)): - check_column(df, column_names) - dtypes = {col: "category" for col in column_names} - return df.astype(dtypes) if isinstance(column_names, Hashable): - check_column(df, [column_names]) - return df.astype({column_names: "category"}) + column_names = [column_names] + check_column(df, column_names) + dtypes = {col: "category" for col in column_names} + return df.astype(dtypes) return _computations_as_categorical(df, **kwargs) @@ -167,21 +165,20 @@ def _as_categorical_checks(df: pd.DataFrame, **kwargs) -> dict: This function raises errors if columns in `kwargs` are absent from the dataframe's columns. It also raises errors if the value in `kwargs` - is not a string (`appearance` or `sort`), or a 1D array. + is not a string (`'appearance'` or `'sort'`), or a 1D array. This function is executed before proceeding to the computation phase. If all checks pass, a dictionary of column names and value is returned. :param df: The pandas DataFrame object. - :param kwargs: A pairing of column name and value. + :param **kwargs: A pairing of column name and value. :returns: A dictionary. :raises TypeError: If `value` is not a 1-D array, or a string. :raises ValueError: If `value` is a 1-D array, and contains nulls, or is non-unique. """ - # column checks check_column(df, kwargs) categories_dict = {} @@ -255,7 +252,7 @@ def _as_categorical_checks(df: pd.DataFrame, **kwargs) -> dict: category_order_types = {ent.value for ent in _CategoryOrder} if value.lower() not in category_order_types: raise ValueError( - "argument should be one of `appearance` or `sort`." + "Argument should be one of 'appearance' or 'sort'." ) categories_dict[column_name] = value diff --git a/janitor/functions/factorize_columns.py b/janitor/functions/factorize_columns.py index 6d340a72e..1ecdf88fa 100644 --- a/janitor/functions/factorize_columns.py +++ b/janitor/functions/factorize_columns.py @@ -1,3 +1,4 @@ +"""Implementation of the `factorize_columns` function""" from typing import Hashable, Iterable, Union import pandas_flavor as pf import pandas as pd @@ -13,52 +14,51 @@ def factorize_columns( **kwargs, ) -> pd.DataFrame: """ - Converts labels into numerical data + Converts labels into numerical data. This method will create a new column with the string `_enc` appended after the original column's name. This can be overriden with the suffix parameter. - Internally this method uses pandas `factorize` method. + Internally, this method uses pandas `factorize` method. It takes in an optional suffix and keyword arguments also. An empty string as suffix will override the existing column. - This method mutates the original DataFrame. + This method does not mutate the original DataFrame. - Functional usage syntax: + Example: - ```python - df = factorize_columns( - df, - column_names="my_categorical_column", - suffix="_enc" - ) # one way - ``` - - Method chaining syntax: - - ```python - import pandas as pd - import janitor - categorical_cols = ['col1', 'col2', 'col4'] - df = ( - pd.DataFrame(...) - .factorize_columns( - column_names=categorical_cols, - suffix="_enc" - ) - ) - ``` + >>> import pandas as pd + >>> import janitor + >>> df = pd.DataFrame({ + ... "foo": ["b", "b", "a", "c", "b"], + ... "bar": range(4, 9), + ... }) + >>> df + foo bar + 0 b 4 + 1 b 5 + 2 a 6 + 3 c 7 + 4 b 8 + >>> df.factorize_columns(column_names="foo") + foo bar foo_enc + 0 b 4 0 + 1 b 5 0 + 2 a 6 1 + 3 c 7 2 + 4 b 8 0 :param df: The pandas DataFrame object. - :param column_names: A column name or an iterable (list - or tuple) of column names. - :param suffix: Suffix to be used for the new column. Default value is _enc. - An empty string suffix means, it will override the existing column + :param column_names: A column name or an iterable (list or tuple) of + column names. + :param suffix: Suffix to be used for the new column. + An empty string suffix means, it will override the existing column. :param **kwargs: Keyword arguments. It takes any of the keyword arguments, - which the pandas factorize method takes like sort,na_sentinel,size_hint + which the pandas factorize method takes like `sort`, `na_sentinel`, + `size_hint`. :returns: A pandas DataFrame. """ - df = _factorize(df, column_names, suffix, **kwargs) + df = _factorize(df.copy(), column_names, suffix, **kwargs) return df diff --git a/janitor/functions/label_encode.py b/janitor/functions/label_encode.py index a192d3767..a8b39950d 100644 --- a/janitor/functions/label_encode.py +++ b/janitor/functions/label_encode.py @@ -1,3 +1,4 @@ +"""Implementation of `label_encode` function""" from typing import Hashable, Iterable, Union import warnings import pandas_flavor as pf @@ -10,43 +11,62 @@ @pf.register_dataframe_method @deprecated_alias(columns="column_names") def label_encode( - df: pd.DataFrame, column_names: Union[str, Iterable[str], Hashable] + df: pd.DataFrame, + column_names: Union[str, Iterable[str], Hashable], ) -> pd.DataFrame: """ Convert labels into numerical data. This method will create a new column with the string `_enc` appended - after the original column's name. Consider this to be syntactic sugar. + after the original column's name. + Consider this to be syntactic sugar. + This function uses the `factorize` pandas function under the hood. - This method behaves differently from `encode_categorical`. This method - creates a new column of numeric data. `encode_categorical` replaces the - dtype of the original column with a *categorical* dtype. + This method behaves differently from + [`encode_categorical`][janitor.functions.encode_categorical.encode_categorical]. + This method creates a new column of numeric data. + [`encode_categorical`][janitor.functions.encode_categorical.encode_categorical] + replaces the dtype of the original column with a *categorical* dtype. This method mutates the original DataFrame. - Functional usage syntax: + Example: - ```python - df = label_encode(df, column_names="my_categorical_column") # one way - ``` + >>> import pandas as pd + >>> import janitor + >>> df = pd.DataFrame({ + ... "foo": ["b", "b", "a", "c", "b"], + ... "bar": range(4, 9), + ... }) + >>> df + foo bar + 0 b 4 + 1 b 5 + 2 a 6 + 3 c 7 + 4 b 8 + >>> df.label_encode(column_names="foo") + foo bar foo_enc + 0 b 4 0 + 1 b 5 0 + 2 a 6 1 + 3 c 7 2 + 4 b 8 0 - Method chaining syntax: + !!!note - ```python - import pandas as pd - import janitor - categorical_cols = ['col1', 'col2', 'col4'] - df = pd.DataFrame(...).label_encode(column_names=categorical_cols) - ``` + This function will be deprecated in a 1.x release. + Please use [`factorize_columns`][janitor.functions.factorize_columns.factorize_columns] + instead. :param df: The pandas DataFrame object. :param column_names: A column name or an iterable (list or tuple) of column names. :returns: A pandas DataFrame. - """ + """ # noqa: E501 warnings.warn( - "label_encode will be deprecated in a 1.x release. \ - Please use factorize_columns instead" + "`label_encode` will be deprecated in a 1.x release. " + "Please use `factorize_columns` instead." ) df = _factorize(df, column_names, "_enc") return df