[DOC] MWE for label_encode, factorize_columns, `encode_categorical` (#1028)

* Add MWE for label_encode and factorize_columns
* Fix some formatting for encode_categorical
* Simplify encode_categorical logic slightly
* Fix formatting
* Make factorize_columns non-mutating
pyjanitor-devs · Mar 2, 2022 · 92251b5 · 92251b5
1 parent 270498e
commit 92251b5
Show file tree

Hide file tree

Showing 3 changed files with 85 additions and 68 deletions.
diff --git a/janitor/functions/encode_categorical.py b/janitor/functions/encode_categorical.py
@@ -1,10 +1,12 @@
+import warnings
+from enum import Enum
 from typing import Hashable, Iterable, Union
+
 import pandas_flavor as pf
 import pandas as pd
 from pandas.api.types import is_list_like
-import warnings
+
 from janitor.utils import check, check_column, deprecated_alias
-from enum import Enum
 
 
 @pf.register_dataframe_method
@@ -90,17 +92,15 @@ def encode_categorical(
         >>> enc_df["foo"].cat.ordered
         True
 
-
-
     :param df: A pandas DataFrame object.
     :param column_names: A column name or an iterable (list or tuple)
         of column names.
-    :param kwargs: A mapping from column name to either `None`,
-        `sort` or `appearance`, or a 1-D array. This is useful
+    :param **kwargs: A mapping from column name to either `None`,
+        `'sort'` or `'appearance'`, or a 1-D array. This is useful
         in creating categorical columns that are ordered, or
         if the user needs to explicitly specify the categories.
     :returns: A pandas DataFrame.
-    :raises ValueError: if both `column_names` and `kwargs` are provided.
+    :raises ValueError: If both `column_names` and `kwargs` are provided.
     """  # noqa: E501
 
     if all((column_names, kwargs)):
@@ -112,13 +112,11 @@ def encode_categorical(
     # or user supplies specific categories to create the categorical
     if column_names is not None:
         check("column_names", column_names, [list, tuple, Hashable])
-        if isinstance(column_names, (list, tuple)):
-            check_column(df, column_names)
-            dtypes = {col: "category" for col in column_names}
-            return df.astype(dtypes)
         if isinstance(column_names, Hashable):
-            check_column(df, [column_names])
-            return df.astype({column_names: "category"})
+            column_names = [column_names]
+        check_column(df, column_names)
+        dtypes = {col: "category" for col in column_names}
+        return df.astype(dtypes)
 
     return _computations_as_categorical(df, **kwargs)
 
@@ -167,21 +165,20 @@ def _as_categorical_checks(df: pd.DataFrame, **kwargs) -> dict:
     This function raises errors if columns in `kwargs` are
     absent from the dataframe's columns.
     It also raises errors if the value in `kwargs`
-    is not a string (`appearance` or `sort`), or a 1D array.
+    is not a string (`'appearance'` or `'sort'`), or a 1D array.
 
     This function is executed before proceeding to the computation phase.
 
     If all checks pass, a dictionary of column names and value is returned.
 
     :param df: The pandas DataFrame object.
-    :param kwargs: A pairing of column name and value.
+    :param **kwargs: A pairing of column name and value.
     :returns: A dictionary.
     :raises TypeError: If `value` is not a 1-D array, or a string.
     :raises ValueError: If `value` is a 1-D array, and contains nulls,
         or is non-unique.
     """
 
-    # column checks
     check_column(df, kwargs)
 
     categories_dict = {}
@@ -255,7 +252,7 @@ def _as_categorical_checks(df: pd.DataFrame, **kwargs) -> dict:
             category_order_types = {ent.value for ent in _CategoryOrder}
             if value.lower() not in category_order_types:
                 raise ValueError(
-                    "argument should be one of `appearance` or `sort`."
+                    "Argument should be one of 'appearance' or 'sort'."
                 )
 
         categories_dict[column_name] = value

diff --git a/janitor/functions/factorize_columns.py b/janitor/functions/factorize_columns.py
@@ -1,3 +1,4 @@
+"""Implementation of the `factorize_columns` function"""
 from typing import Hashable, Iterable, Union
 import pandas_flavor as pf
 import pandas as pd
@@ -13,52 +14,51 @@ def factorize_columns(
     **kwargs,
 ) -> pd.DataFrame:
     """
-    Converts labels into numerical data
+    Converts labels into numerical data.
 
     This method will create a new column with the string `_enc` appended
     after the original column's name.
     This can be overridden with the suffix parameter.
 
-    Internally this method uses pandas `factorize` method.
+    Internally, this method uses pandas `factorize` method.
     It takes in an optional suffix and keyword arguments also.
     An empty string as suffix will override the existing column.
 
-    This method mutates the original DataFrame.
+    This method does not mutate the original DataFrame.
 
-    Functional usage syntax:
+    Example:
 
-    ```python
-    df = factorize_columns(
-        df,
-        column_names="my_categorical_column",
-        suffix="_enc"
-    )  # one way
-    ```
-
-    Method chaining syntax:
-
-    ```python
-    import pandas as pd
-    import janitor
-    categorical_cols = ['col1', 'col2', 'col4']
-    df = (
-        pd.DataFrame(...)
-        .factorize_columns(
-            column_names=categorical_cols,
-            suffix="_enc"
-        )
-    )
-    ```
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({
+        ...     "foo": ["b", "b", "a", "c", "b"],
+        ...     "bar": range(4, 9),
+        ... })
+        >>> df
+          foo  bar
+        0   b    4
+        1   b    5
+        2   a    6
+        3   c    7
+        4   b    8
+        >>> df.factorize_columns(column_names="foo")
+          foo  bar  foo_enc
+        0   b    4        0
+        1   b    5        0
+        2   a    6        1
+        3   c    7        2
+        4   b    8        0
 
     :param df: The pandas DataFrame object.
-    :param column_names: A column name or an iterable (list
-        or tuple) of column names.
-    :param suffix: Suffix to be used for the new column. Default value is _enc.
-        An empty string suffix means, it will override the existing column
+    :param column_names: A column name or an iterable (list or tuple) of
+        column names.
+    :param suffix: Suffix to be used for the new column.
+        An empty string suffix means it will override the existing column.
     :param **kwargs: Keyword arguments. It takes any of the keyword arguments,
-        which the pandas factorize method takes like sort,na_sentinel,size_hint
+        which the pandas factorize method takes like `sort`, `na_sentinel`,
+        `size_hint`.
 
     :returns: A pandas DataFrame.
     """
-    df = _factorize(df, column_names, suffix, **kwargs)
+    df = _factorize(df.copy(), column_names, suffix, **kwargs)
     return df
diff --git a/janitor/functions/label_encode.py b/janitor/functions/label_encode.py
@@ -1,3 +1,4 @@
+"""Implementation of `label_encode` function"""
 from typing import Hashable, Iterable, Union
 import warnings
 import pandas_flavor as pf
@@ -10,43 +11,62 @@
 @pf.register_dataframe_method
 @deprecated_alias(columns="column_names")
 def label_encode(
-    df: pd.DataFrame, column_names: Union[str, Iterable[str], Hashable]
+    df: pd.DataFrame,
+    column_names: Union[str, Iterable[str], Hashable],
 ) -> pd.DataFrame:
     """
     Convert labels into numerical data.
 
     This method will create a new column with the string `_enc` appended
-    after the original column's name. Consider this to be syntactic sugar.
+    after the original column's name.
+    Consider this to be syntactic sugar.
+    This function uses the `factorize` pandas function under the hood.
 
-    This method behaves differently from `encode_categorical`. This method
-    creates a new column of numeric data. `encode_categorical` replaces the
-    dtype of the original column with a *categorical* dtype.
+    This method behaves differently from
+    [`encode_categorical`][janitor.functions.encode_categorical.encode_categorical].
+    This method creates a new column of numeric data.
+    [`encode_categorical`][janitor.functions.encode_categorical.encode_categorical]
+    replaces the dtype of the original column with a *categorical* dtype.
 
     This method mutates the original DataFrame.
 
-    Functional usage syntax:
+    Example:
 
-    ```python
-    df = label_encode(df, column_names="my_categorical_column")  # one way
-    ```
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({
+        ...     "foo": ["b", "b", "a", "c", "b"],
+        ...     "bar": range(4, 9),
+        ... })
+        >>> df
+          foo  bar
+        0   b    4
+        1   b    5
+        2   a    6
+        3   c    7
+        4   b    8
+        >>> df.label_encode(column_names="foo")
+          foo  bar  foo_enc
+        0   b    4        0
+        1   b    5        0
+        2   a    6        1
+        3   c    7        2
+        4   b    8        0
 
-    Method chaining syntax:
+    !!!note
 
-    ```python
-    import pandas as pd
-    import janitor
-    categorical_cols = ['col1', 'col2', 'col4']
-    df = pd.DataFrame(...).label_encode(column_names=categorical_cols)
-    ```
+        This function will be deprecated in a 1.x release.
+        Please use [`factorize_columns`][janitor.functions.factorize_columns.factorize_columns]
+        instead.
 
     :param df: The pandas DataFrame object.
     :param column_names: A column name or an iterable (list
         or tuple) of column names.
     :returns: A pandas DataFrame.
-    """
+    """  # noqa: E501
     warnings.warn(
-        "label_encode will be deprecated in a 1.x release. \
-        Please use factorize_columns instead"
+        "`label_encode` will be deprecated in a 1.x release. "
+        "Please use `factorize_columns` instead."
     )
     df = _factorize(df, column_names, "_enc")
     return df