From 46ab4d8a5521305d45ff771184862d047959b1a1 Mon Sep 17 00:00:00 2001
From: Samuel Oranyeli <samueloranyeli@gmail.com>
Date: Mon, 3 Jun 2024 10:16:17 +1000
Subject: [PATCH] Add polars support for `janitor.io.xlsx_table` (#1357)

* add make_clean_names function that can be applied to polars

* add examples for make_clean_names

* changelog

* limit import location for polars

* limit import location for polars

* fix polars in environment-dev.yml

* install polars in doctest

* limit polars imports - user should have polars already installed

* use subprocess.run

* add subprocess.devnull to docstrings

* add subprocess.devnull to docstrings

* add subprocess.devnull to docstrings

* add subprocess.devnull to docstrings

* add os.devnull

* add polars as requirement for docs

* add polars to tests requirements

* delete irrelevant folder

* changelog

* create submodule for polars

* fix doctests

* fix tests; add polars to documentation

* fix tests; add polars to documentation

* import janitor.polars

* control docs output for polars submodule

* exclude functions in docs rendering

* exclude functions in docs rendering

* show_submodules=true

* fix docstring rendering for polars

* Expression -> expression

* rename functions.py

* pivot_longer implemented for polars

* changelog

* keep changes related only to pivot_longer

* pd -> pl

* pd -> pl

* df.pivot_longer -> df.janitor.pivot_longer

* df.pivot_longer -> df.janitor.pivot_longer

* pd -> pl

* pd -> pl

* add >>> df

* add >>> df

* keep changes related only to polars pivot_longer

* add polars support to read_commandline

* remove irrelevant files

* minor edit to docs

* xlsx_table now supports polars

---------

Co-authored-by: samuel.oranyeli <samuel.oranyeli@grow.inc>
Co-authored-by: Eric Ma <ericmjl@users.noreply.github.com>
---
 CHANGELOG.md  |  1 +
 janitor/io.py | 50 +++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 44 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 207b130b8..47739249b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,7 @@
 # Changelog
 
 ## [Unreleased]
+-  [ENH] `xlsx_table` function now supports polars - Issue #1352
 
 -  [ENH] Improved performance for non-equi joins when using numba - @samukweku PR #1341
 -  [ENH] Added a `clean_names` method for polars - it can be used to clean the column names, or clean column values . Issue #1343
diff --git a/janitor/io.py b/janitor/io.py
index 1912afe8c..4829b3e1c 100644
--- a/janitor/io.py
+++ b/janitor/io.py
@@ -8,7 +8,7 @@
 from glob import glob
 from io import StringIO
 from itertools import chain
-from typing import IO, TYPE_CHECKING, Any, Iterable, Union
+from typing import IO, TYPE_CHECKING, Any, Iterable, Mapping, Union
 
 import pandas as pd
 
@@ -142,14 +142,15 @@ def xlsx_table(
     path: Union[str, IO, Workbook],
     sheetname: str = None,
     table: Union[str, list, tuple] = None,
-) -> Union[pd.DataFrame, dict]:
+    engine: str = "pandas",
+) -> Mapping:
     """Returns a DataFrame of values in a table in the Excel file.
 
     This applies to an Excel file, where the data range is explicitly
     specified as a Microsoft Excel table.
 
     If there is a single table in the sheet, or a string is provided
-    as an argument to the `table` parameter, a pandas DataFrame is returned;
+    as an argument to the `table` parameter, a DataFrame is returned;
     if there is more than one table in the sheet,
     and the `table` argument is `None`, or a list/tuple of names,
     a dictionary of DataFrames is returned, where the keys of the dictionary
@@ -157,6 +158,7 @@ def xlsx_table(
 
     Examples:
         >>> import pandas as pd
+        >>> import polars as pl
         >>> from janitor import xlsx_table
         >>> filename="../pyjanitor/tests/test_data/016-MSPTDA-Excel.xlsx"
 
@@ -170,6 +172,20 @@ def xlsx_table(
         3           4    Competition
         4           5  Long Distance
 
+        >>> xlsx_table(filename, table='dCategory', engine='polars')
+        shape: (5, 2)
+        ┌────────────┬───────────────┐
+        │ CategoryID ┆ Category      │
+        │ ---        ┆ ---           │
+        │ i64        ┆ str           │
+        ╞════════════╪═══════════════╡
+        │ 1          ┆ Beginner      │
+        │ 2          ┆ Advanced      │
+        │ 3          ┆ Freestyle     │
+        │ 4          ┆ Competition   │
+        │ 5          ┆ Long Distance │
+        └────────────┴───────────────┘
+
         Multiple tables:
 
         >>> out=xlsx_table(filename, table=["dCategory", "dSalesReps"])
@@ -189,6 +205,8 @@ def xlsx_table(
     Args:
           path: Path to the Excel File. It can also be an openpyxl Workbook.
           table: Name of a table, or list of tables in the sheet.
+          engine: DataFrame engine. Should be either `pandas` or `polars`.
+            Defaults to `pandas`.
 
     Raises:
         AttributeError: If a workbook is provided, and is a ReadOnlyWorksheet.
@@ -196,7 +214,7 @@ def xlsx_table(
         KeyError: If the provided table does not exist in the sheet.
 
     Returns:
-        A pandas DataFrame, or a dictionary of DataFrames,
+        A DataFrame, or a dictionary of DataFrames,
             if there are multiple arguments for the `table` parameter,
             or the argument to `table` is `None`.
     """  # noqa : E501
@@ -219,6 +237,22 @@ def xlsx_table(
             DeprecationWarning,
             stacklevel=find_stack_level(),
         )
+    if engine not in {"pandas", "polars"}:
+        raise ValueError("engine should be one of pandas or polars.")
+    base_engine = pd
+    if engine == "polars":
+        try:
+            import polars as pl
+
+            base_engine = pl
+        except ImportError:
+            import_message(
+                submodule="polars",
+                package="polars",
+                conda_channel="conda-forge",
+                pip_install=True,
+            )
+
     if table is not None:
         check("table", table, [str, list, tuple])
         if isinstance(table, (list, tuple)):
@@ -245,13 +279,15 @@ def _create_dataframe_or_dictionary_from_table(
             header_exist = contents.headerRowCount
             coordinates = contents.ref
             data = worksheet[coordinates]
-            data = [[entry.value for entry in cell] for cell in data]
             if header_exist:
                 header, *data = data
+                header = [cell.value for cell in header]
             else:
                 header = [f"C{num}" for num in range(len(data[0]))]
-            data = pd.DataFrame(data, columns=header)
-            dictionary[table_name] = data
+            data = zip(*data)
+            data = ([entry.value for entry in cell] for cell in data)
+            data = dict(zip(header, data))
+            dictionary[table_name] = base_engine.DataFrame(data)
         return dictionary
 
     worksheets = [worksheet for worksheet in ws if worksheet.tables.items()]