Skip to content

Commit

Permalink
Add polars support for janitor.io.xlsx_table (#1357)
Browse files Browse the repository at this point in the history
* add make_clean_names function that can be applied to polars

* add examples for make_clean_names

* changelog

* limit import location for polars

* limit import location for polars

* fix polars in environment-dev.yml

* install polars in doctest

* limit polars imports - user should have polars already installed

* use subprocess.run

* add subprocess.devnull to docstrings

* add subprocess.devnull to docstrings

* add subprocess.devnull to docstrings

* add subprocess.devnull to docstrings

* add os.devnull

* add polars as requirement for docs

* add polars to tests requirements

* delete irrelevant folder

* changelog

* create submodule for polars

* fix doctests

* fix tests; add polars to documentation

* fix tests; add polars to documentation

* import janitor.polars

* control docs output for polars submodule

* exclude functions in docs rendering

* exclude functions in docs rendering

* show_submodules=true

* fix docstring rendering for polars

* Expression -> expression

* rename functions.py

* pivot_longer implemented for polars

* changelog

* keep changes related only to pivot_longer

* pd -> pl

* pd -> pl

* df.pivot_longer -> df.janitor.pivot_longer

* df.pivot_longer -> df.janitor.pivot_longer

* pd -> pl

* pd -> pl

* add >>> df

* add >>> df

* keep changes related only to polars pivot_longer

* add polars support to read_commandline

* remove irrelevant files

* minor edit to docs

* xlsx_table now supports polars

---------

Co-authored-by: samuel.oranyeli <[email protected]>
Co-authored-by: Eric Ma <[email protected]>
  • Loading branch information
3 people authored Jun 3, 2024
1 parent 891b711 commit 46ab4d8
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 7 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Changelog

## [Unreleased]
- [ENH] `xlsx_table` function now supports polars - Issue #1352

- [ENH] Improved performance for non-equi joins when using numba - @samukweku PR #1341
- [ENH] Added a `clean_names` method for polars - it can be used to clean the column names, or clean column values. Issue #1343
Expand Down
50 changes: 43 additions & 7 deletions janitor/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from glob import glob
from io import StringIO
from itertools import chain
from typing import IO, TYPE_CHECKING, Any, Iterable, Union
from typing import IO, TYPE_CHECKING, Any, Iterable, Mapping, Union

import pandas as pd

Expand Down Expand Up @@ -142,21 +142,23 @@ def xlsx_table(
path: Union[str, IO, Workbook],
sheetname: str = None,
table: Union[str, list, tuple] = None,
) -> Union[pd.DataFrame, dict]:
engine: str = "pandas",
) -> Mapping:
"""Returns a DataFrame of values in a table in the Excel file.
This applies to an Excel file, where the data range is explicitly
specified as a Microsoft Excel table.
If there is a single table in the sheet, or a string is provided
as an argument to the `table` parameter, a pandas DataFrame is returned;
as an argument to the `table` parameter, a DataFrame is returned;
if there is more than one table in the sheet,
and the `table` argument is `None`, or a list/tuple of names,
a dictionary of DataFrames is returned, where the keys of the dictionary
are the table names.
Examples:
>>> import pandas as pd
>>> import polars as pl
>>> from janitor import xlsx_table
>>> filename="../pyjanitor/tests/test_data/016-MSPTDA-Excel.xlsx"
Expand All @@ -170,6 +172,20 @@ def xlsx_table(
3 4 Competition
4 5 Long Distance
>>> xlsx_table(filename, table='dCategory', engine='polars')
shape: (5, 2)
┌────────────┬───────────────┐
│ CategoryID ┆ Category │
│ --- ┆ --- │
│ i64 ┆ str │
╞════════════╪═══════════════╡
│ 1 ┆ Beginner │
│ 2 ┆ Advanced │
│ 3 ┆ Freestyle │
│ 4 ┆ Competition │
│ 5 ┆ Long Distance │
└────────────┴───────────────┘
Multiple tables:
>>> out=xlsx_table(filename, table=["dCategory", "dSalesReps"])
Expand All @@ -189,14 +205,16 @@ def xlsx_table(
Args:
path: Path to the Excel File. It can also be an openpyxl Workbook.
table: Name of a table, or list of tables in the sheet.
engine: DataFrame engine. Should be either pandas or polars.
Defaults to pandas.
Raises:
AttributeError: If a workbook is provided, and is a ReadOnlyWorksheet.
ValueError: If there are no tables in the sheet.
KeyError: If the provided table does not exist in the sheet.
Returns:
A pandas DataFrame, or a dictionary of DataFrames,
A DataFrame, or a dictionary of DataFrames,
if there are multiple arguments for the `table` parameter,
or the argument to `table` is `None`.
""" # noqa : E501
Expand All @@ -219,6 +237,22 @@ def xlsx_table(
DeprecationWarning,
stacklevel=find_stack_level(),
)
if engine not in {"pandas", "polars"}:
raise ValueError("engine should be one of pandas or polars.")
base_engine = pd
if engine == "polars":
try:
import polars as pl

base_engine = pl
except ImportError:
import_message(
submodule="polars",
package="polars",
conda_channel="conda-forge",
pip_install=True,
)

if table is not None:
check("table", table, [str, list, tuple])
if isinstance(table, (list, tuple)):
Expand All @@ -245,13 +279,15 @@ def _create_dataframe_or_dictionary_from_table(
header_exist = contents.headerRowCount
coordinates = contents.ref
data = worksheet[coordinates]
data = [[entry.value for entry in cell] for cell in data]
if header_exist:
header, *data = data
header = [cell.value for cell in header]
else:
header = [f"C{num}" for num in range(len(data[0]))]
data = pd.DataFrame(data, columns=header)
dictionary[table_name] = data
data = zip(*data)
data = ([entry.value for entry in cell] for cell in data)
data = dict(zip(header, data))
dictionary[table_name] = base_engine.DataFrame(data)
return dictionary

worksheets = [worksheet for worksheet in ws if worksheet.tables.items()]
Expand Down

0 comments on commit 46ab4d8

Please sign in to comment.