Add polars support for janitor.io.xlsx_table (#1357)

* add make_clean_names function that can be applied to polars
* add examples for make_clean_names
* changelog
* limit import location for polars
* limit import location for polars
* fix polars in environment-dev.yml
* install polars in doctest
* limit polars imports - user should have polars already installed
* use subprocess.run
* add subprocess.devnull to docstrings
* add subprocess.devnull to docstrings
* add subprocess.devnull to docstrings
* add subprocess.devnull to docstrings
* add os.devnull
* add polars as requirement for docs
* add polars to tests requirements
* delete irrelevant folder
* changelog
* create submodule for polars
* fix doctests
* fix tests; add polars to documentation
* fix tests; add polars to documentation
* import janitor.polars
* control docs output for polars submodule
* exclude functions in docs rendering
* exclude functions in docs rendering
* show_submodules=true
* fix docstring rendering for polars
* Expression -> expression
* rename functions.py
* pivot_longer implemented for polars
* changelog
* keep changes related only to pivot_longer
* pd -> pl
* pd -> pl
* df.pivot_longer -> df.janitor.pivot_longer
* df.pivot_longer -> df.janitor.pivot_longer
* pd -> pl
* pd -> pl
* add >>> df
* add >>> df
* keep changes related only to polars pivot_longer
* add polars support to read_commandline
* remove irrelevant files
* minor edit to docs
* xlsx_table now supports polars

---------

Co-authored-by: samuel.oranyeli <[email protected]>
Co-authored-by: Eric Ma <[email protected]>
pyjanitor-devs · Jun 3, 2024 · 46ab4d8 · 46ab4d8
1 parent 891b711
commit 46ab4d8
Show file tree

Hide file tree

Showing 2 changed files with 44 additions and 7 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,7 @@
 # Changelog
 
 ## [Unreleased]
+-  [ENH] `xlsx_table` function now supports polars - Issue #1352
 
 -  [ENH] Improved performance for non-equi joins when using numba - @samukweku PR #1341
 -  [ENH] Added a `clean_names` method for polars - it can be used to clean the column names, or clean column values. Issue #1343

diff --git a/janitor/io.py b/janitor/io.py
@@ -8,7 +8,7 @@
 from glob import glob
 from io import StringIO
 from itertools import chain
-from typing import IO, TYPE_CHECKING, Any, Iterable, Union
+from typing import IO, TYPE_CHECKING, Any, Iterable, Mapping, Union
 
 import pandas as pd
 
@@ -142,21 +142,23 @@ def xlsx_table(
     path: Union[str, IO, Workbook],
     sheetname: str = None,
     table: Union[str, list, tuple] = None,
-) -> Union[pd.DataFrame, dict]:
+    engine: str = "pandas",
+) -> Mapping:
     """Returns a DataFrame of values in a table in the Excel file.
 
     This applies to an Excel file, where the data range is explicitly
     specified as a Microsoft Excel table.
 
     If there is a single table in the sheet, or a string is provided
-    as an argument to the `table` parameter, a pandas DataFrame is returned;
+    as an argument to the `table` parameter, a DataFrame is returned;
     if there is more than one table in the sheet,
     and the `table` argument is `None`, or a list/tuple of names,
     a dictionary of DataFrames is returned, where the keys of the dictionary
     are the table names.
 
     Examples:
         >>> import pandas as pd
+        >>> import polars as pl
         >>> from janitor import xlsx_table
         >>> filename="../pyjanitor/tests/test_data/016-MSPTDA-Excel.xlsx"
 
@@ -170,6 +172,20 @@ def xlsx_table(
         3           4    Competition
         4           5  Long Distance
 
+        >>> xlsx_table(filename, table='dCategory', engine='polars')
+        shape: (5, 2)
+        ┌────────────┬───────────────┐
+        │ CategoryID ┆ Category      │
+        │ ---        ┆ ---           │
+        │ i64        ┆ str           │
+        ╞════════════╪═══════════════╡
+        │ 1          ┆ Beginner      │
+        │ 2          ┆ Advanced      │
+        │ 3          ┆ Freestyle     │
+        │ 4          ┆ Competition   │
+        │ 5          ┆ Long Distance │
+        └────────────┴───────────────┘
+
         Multiple tables:
 
         >>> out=xlsx_table(filename, table=["dCategory", "dSalesReps"])
@@ -189,14 +205,16 @@ def xlsx_table(
     Args:
           path: Path to the Excel File. It can also be an openpyxl Workbook.
           table: Name of a table, or list of tables in the sheet.
+          engine: DataFrame engine. Should be either `pandas` or `polars`.
+            Defaults to `pandas`.
 
     Raises:
         AttributeError: If a workbook is provided, and is a ReadOnlyWorksheet.
         ValueError: If there are no tables in the sheet.
         KeyError: If the provided table does not exist in the sheet.
 
     Returns:
-        A pandas DataFrame, or a dictionary of DataFrames,
+        A DataFrame, or a dictionary of DataFrames,
             if there are multiple arguments for the `table` parameter,
             or the argument to `table` is `None`.
     """  # noqa : E501
@@ -219,6 +237,22 @@ def xlsx_table(
             DeprecationWarning,
             stacklevel=find_stack_level(),
         )
+    if engine not in {"pandas", "polars"}:
+        raise ValueError("engine should be one of pandas or polars.")
+    base_engine = pd
+    if engine == "polars":
+        try:
+            import polars as pl
+
+            base_engine = pl
+        except ImportError:
+            import_message(
+                submodule="polars",
+                package="polars",
+                conda_channel="conda-forge",
+                pip_install=True,
+            )
+
     if table is not None:
         check("table", table, [str, list, tuple])
         if isinstance(table, (list, tuple)):
@@ -245,13 +279,15 @@ def _create_dataframe_or_dictionary_from_table(
             header_exist = contents.headerRowCount
             coordinates = contents.ref
             data = worksheet[coordinates]
-            data = [[entry.value for entry in cell] for cell in data]
             if header_exist:
                 header, *data = data
+                header = [cell.value for cell in header]
             else:
                 header = [f"C{num}" for num in range(len(data[0]))]
-            data = pd.DataFrame(data, columns=header)
-            dictionary[table_name] = data
+            data = zip(*data)
+            data = ([entry.value for entry in cell] for cell in data)
+            data = dict(zip(header, data))
+            dictionary[table_name] = base_engine.DataFrame(data)
         return dictionary
 
     worksheets = [worksheet for worksheet in ws if worksheet.tables.items()]