From 46ab4d8a5521305d45ff771184862d047959b1a1 Mon Sep 17 00:00:00 2001 From: Samuel Oranyeli Date: Mon, 3 Jun 2024 10:16:17 +1000 Subject: [PATCH] Add polars support for `janitor.io.xlsx_table` (#1357) * add make_clean_names function that can be applied to polars * add examples for make_clean_names * changelog * limit import location for polars * limit import location for polars * fix polars in environment-dev.yml * install polars in doctest * limit polars imports - user should have polars already installed * use subprocess.run * add subprocess.devnull to docstrings * add subprocess.devnull to docstrings * add subprocess.devnull to docstrings * add subprocess.devnull to docstrings * add os.devnull * add polars as requirement for docs * add polars to tests requirements * delete irrelevant folder * changelog * create submodule for polars * fix doctests * fix tests; add polars to documentation * fix tests; add polars to documentation * import janitor.polars * control docs output for polars submodule * exclude functions in docs rendering * exclude functions in docs rendering * show_submodules=true * fix docstring rendering for polars * Expression -> expression * rename functions.py * pivot_longer implemented for polars * changelog * keep changes related only to pivot_longer * pd -> pl * pd -> pl * df.pivot_longer -> df.janitor.pivot_longer * df.pivot_longer -> df.janitor.pivot_longer * pd -> pl * pd -> pl * add >>> df * add >>> df * keep changes related only to polars pivot_longer * add polars support to read_commandline * remove irrelevant files * minor edit to docs * xlsx_table now supports polars --------- Co-authored-by: samuel.oranyeli Co-authored-by: Eric Ma --- CHANGELOG.md | 1 + janitor/io.py | 50 +++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 44 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 207b130b8..47739249b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ # Changelog ## [Unreleased] +- [ENH] 
`xlsx_table` function now supports polars - Issue #1352 - [ENH] Improved performance for non-equi joins when using numba - @samukweku PR #1341 - [ENH] Added a `clean_names` method for polars - it can be used to clean the column names, or clean column values . Issue #1343 diff --git a/janitor/io.py b/janitor/io.py index 1912afe8c..4829b3e1c 100644 --- a/janitor/io.py +++ b/janitor/io.py @@ -8,7 +8,7 @@ from glob import glob from io import StringIO from itertools import chain -from typing import IO, TYPE_CHECKING, Any, Iterable, Union +from typing import IO, TYPE_CHECKING, Any, Iterable, Mapping, Union import pandas as pd @@ -142,14 +142,15 @@ def xlsx_table( path: Union[str, IO, Workbook], sheetname: str = None, table: Union[str, list, tuple] = None, -) -> Union[pd.DataFrame, dict]: + engine: str = "pandas", +) -> Mapping: """Returns a DataFrame of values in a table in the Excel file. This applies to an Excel file, where the data range is explicitly specified as a Microsoft Excel table. If there is a single table in the sheet, or a string is provided - as an argument to the `table` parameter, a pandas DataFrame is returned; + as an argument to the `table` parameter, a DataFrame is returned; if there is more than one table in the sheet, and the `table` argument is `None`, or a list/tuple of names, a dictionary of DataFrames is returned, where the keys of the dictionary @@ -157,6 +158,7 @@ def xlsx_table( Examples: >>> import pandas as pd + >>> import polars as pl >>> from janitor import xlsx_table >>> filename="../pyjanitor/tests/test_data/016-MSPTDA-Excel.xlsx" @@ -170,6 +172,20 @@ def xlsx_table( 3 4 Competition 4 5 Long Distance + >>> xlsx_table(filename, table='dCategory', engine='polars') + shape: (5, 2) + ┌────────────┬───────────────┐ + │ CategoryID ┆ Category │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞════════════╪═══════════════╡ + │ 1 ┆ Beginner │ + │ 2 ┆ Advanced │ + │ 3 ┆ Freestyle │ + │ 4 ┆ Competition │ + │ 5 ┆ Long Distance │ + └────────────┴───────────────┘ 
+ Multiple tables: >>> out=xlsx_table(filename, table=["dCategory", "dSalesReps"]) @@ -189,6 +205,8 @@ def xlsx_table( Args: path: Path to the Excel File. It can also be an openpyxl Workbook. table: Name of a table, or list of tables in the sheet. + engine: DataFrame engine. Should be either pandas or polars. + Defaults to pandas. Raises: AttributeError: If a workbook is provided, and is a ReadOnlyWorksheet. @@ -196,7 +214,7 @@ def xlsx_table( KeyError: If the provided table does not exist in the sheet. Returns: - A pandas DataFrame, or a dictionary of DataFrames, + A DataFrame, or a dictionary of DataFrames, if there are multiple arguments for the `table` parameter, or the argument to `table` is `None`. """ # noqa : E501 @@ -219,6 +237,22 @@ def xlsx_table( DeprecationWarning, stacklevel=find_stack_level(), ) + if engine not in {"pandas", "polars"}: + raise ValueError("engine should be one of pandas or polars.") + base_engine = pd + if engine == "polars": + try: + import polars as pl + + base_engine = pl + except ImportError: + import_message( + submodule="polars", + package="polars", + conda_channel="conda-forge", + pip_install=True, + ) + if table is not None: check("table", table, [str, list, tuple]) if isinstance(table, (list, tuple)): @@ -245,13 +279,15 @@ def _create_dataframe_or_dictionary_from_table( header_exist = contents.headerRowCount coordinates = contents.ref data = worksheet[coordinates] - data = [[entry.value for entry in cell] for cell in data] if header_exist: header, *data = data + header = [cell.value for cell in header] else: header = [f"C{num}" for num in range(len(data[0]))] - data = pd.DataFrame(data, columns=header) - dictionary[table_name] = data + data = zip(*data) + data = ([entry.value for entry in cell] for cell in data) + data = dict(zip(header, data)) + dictionary[table_name] = base_engine.DataFrame(data) return dictionary worksheets = [worksheet for worksheet in ws if worksheet.tables.items()]