Skip to content

Commit

Permalink
row_to_names improvement (#1379)
Browse files Browse the repository at this point in the history
This change improves `row_to_names` for polars dataframes, primarily through speed enhancements.
  • Loading branch information
samukweku authored Jul 13, 2024
1 parent bbb5891 commit a14061c
Show file tree
Hide file tree
Showing 6 changed files with 284 additions and 171 deletions.
187 changes: 141 additions & 46 deletions janitor/functions/row_to_names.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from __future__ import annotations

import warnings
from functools import singledispatch

import numpy as np
import pandas as pd
Expand All @@ -15,7 +15,7 @@
@deprecated_alias(row_number="row_numbers", remove_row="remove_rows")
def row_to_names(
df: pd.DataFrame,
row_numbers: int | list = 0,
row_numbers: int | list | slice = 0,
remove_rows: bool = False,
remove_rows_above: bool = False,
reset_index: bool = False,
Expand Down Expand Up @@ -47,7 +47,7 @@ def row_to_names(
1 9 y
>>> df.row_to_names([0,1], remove_rows=True, reset_index=True)
nums chars
6 x
6 x
0 9 y
Remove rows above the elevated row and the elevated row itself.
Expand All @@ -72,8 +72,7 @@ def row_to_names(
Args:
df: A pandas DataFrame.
row_numbers: Position of the row(s) containing the variable names.
Note that indexing starts from 0. It can also be a list,
in which case, a MultiIndex column is created.
It can be an integer, a list or a slice.
Defaults to 0 (first row).
remove_rows: Whether the row(s) should be removed from the DataFrame.
remove_rows_above: Whether the row(s) above the selected row should
Expand All @@ -83,53 +82,149 @@ def row_to_names(
Returns:
A pandas DataFrame with set column names.
""" # noqa: E501
if not pd.options.mode.copy_on_write:
df = df.copy()

check("row_numbers", row_numbers, [int, list])
if isinstance(row_numbers, list):
for entry in row_numbers:
check("entry in the row_numbers argument", entry, [int])

warnings.warn(
"The function row_to_names will, in the official 1.0 release, "
"change its behaviour to reset the dataframe's index by default. "
"You can prepare for this change right now by explicitly setting "
"`reset_index=True` when calling on `row_to_names`."

return _row_to_names(
row_numbers,
df=df,
remove_rows=remove_rows,
remove_rows_above=remove_rows_above,
reset_index=reset_index,
)


@singledispatch
def _row_to_names(
row_numbers, df, remove_rows, remove_rows_above, reset_index
) -> pd.DataFrame:
"""
Base function for row_to_names.
"""
raise TypeError(
"row_numbers should be either an integer, "
"a slice or a list; "
f"instead got type {type(row_numbers).__name__}"
)
# should raise if positional indexers are missing
# IndexError: positional indexers are out-of-bounds
headers = df.iloc[row_numbers]


@_row_to_names.register(int)  # noqa: F811
def _row_to_names_dispatch(  # noqa: F811
    row_numbers, df, remove_rows, remove_rows_above, reset_index
):
    """
    Promote a single row, given by integer position, to column names.

    Args:
        row_numbers: Position of the row holding the column names.
            Negative positions count from the end, as with `.iloc`.
        df: A pandas DataFrame.
        remove_rows: Whether the header row should be dropped.
        remove_rows_above: Whether the rows above the header row
            should be dropped.
        reset_index: Whether the result should get a fresh
            0..n-1 index.

    Raises:
        IndexError: If `row_numbers` is out of bounds
            (raised by the `.iloc` lookup).

    Returns:
        A pandas DataFrame with set column names.
    """
    # Relabel a slice of the frame rather than the object passed in.
    df_ = df[:]
    headers = df_.iloc[row_numbers]
    df_.columns = headers
    # The header Series carries the row label as its name; drop it.
    df_.columns.name = None
    if not remove_rows and not remove_rows_above:
        return df_.reset_index(drop=True) if reset_index else df_

    len_df = len(df_)
    # `.iloc` accepted the position, so a negative value is a valid
    # from-the-end offset. Convert it to its non-negative equivalent;
    # otherwise np.arange below would start at a negative number and
    # pick up rows that should have been removed.
    if row_numbers < 0:
        row_numbers += len_df

    if remove_rows_above:
        # Keep everything from the header row onwards,
        # skipping the header row itself if it is removed as well.
        first_kept = row_numbers + 1 if remove_rows else row_numbers
        indexer = np.arange(first_kept, len_df)
    else:
        # Only the header row is removed.
        mask = np.ones(len_df, dtype=np.bool_)
        mask[row_numbers] = False
        indexer = np.arange(len_df)[mask]

    # Subset column by column and rebuild the frame from the pieces.
    arrays = {
        num: arr._values[indexer]
        for num, (_, arr) in enumerate(df_.items())
    }
    if reset_index:
        df_index = pd.RangeIndex(start=0, stop=indexer.size)
    else:
        df_index = df_.index[indexer]
    _df = pd.DataFrame(data=arrays, index=df_index, copy=False)
    _df.columns = df_.columns
    return _df


@_row_to_names.register(slice)  # noqa: F811
def _row_to_names_dispatch(  # noqa: F811
    row_numbers, df, remove_rows, remove_rows_above, reset_index
):
    """
    Promote the row(s) selected by a slice to column names.

    A slice covering exactly one row yields a flat column index;
    any other selection yields MultiIndex columns, one level
    per selected row.

    Args:
        row_numbers: Slice of row positions holding the column names.
            A step is not supported.
        df: A pandas DataFrame.
        remove_rows: Whether the header row(s) should be dropped.
        remove_rows_above: Whether the rows above the header row(s)
            should be dropped.
        reset_index: Whether the result should get a fresh
            0..n-1 index.

    Raises:
        ValueError: If the slice has a step.

    Returns:
        A pandas DataFrame with set column names.
    """
    if row_numbers.step is not None:
        raise ValueError(
            "The step argument for slice is not supported in row_to_names."
        )
    # Relabel a slice of the frame rather than the object passed in.
    df_ = df[:]
    headers = df_.iloc[row_numbers]
    if isinstance(headers, pd.DataFrame) and (len(headers) == 1):
        # Take the single row as a Series. `.squeeze()` would collapse
        # a one-column frame to a scalar, which cannot be used
        # as column labels.
        headers = headers.iloc[0]
        df_.columns = headers
        df_.columns.name = None
    else:
        # One tuple per column, holding that column's header values.
        headers = [array._values for _, array in headers.items()]
        headers = pd.MultiIndex.from_tuples(headers)
        df_.columns = headers
    if not remove_rows and not remove_rows_above:
        return df_.reset_index(drop=True) if reset_index else df_

    len_df = len(df_)
    # Resolve open-ended (None) and negative bounds to concrete
    # non-negative positions before doing arithmetic with them.
    start, stop, _ = row_numbers.indices(len_df)
    if remove_rows_above:
        # Keep everything from the header rows onwards,
        # skipping the header rows themselves if they are removed too.
        first_kept = stop if remove_rows else start
        indexer = np.arange(first_kept, len_df)
    else:
        # Only the header rows are removed.
        mask = np.ones(len_df, dtype=np.bool_)
        mask[row_numbers] = False
        indexer = np.arange(len_df)[mask]

    # Subset column by column and rebuild the frame from the pieces.
    arrays = {
        num: arr._values[indexer]
        for num, (_, arr) in enumerate(df_.items())
    }
    if reset_index:
        df_index = pd.RangeIndex(start=0, stop=indexer.size)
    else:
        df_index = df_.index[indexer]
    _df = pd.DataFrame(data=arrays, index=df_index, copy=False)
    _df.columns = df_.columns
    return _df

df.columns = headers
df.columns.name = None

df_index = df.index
@_row_to_names.register(list)  # noqa: F811
def _row_to_names_dispatch(  # noqa: F811
    row_numbers, df, remove_rows, remove_rows_above, reset_index
):
    """
    Promote the row(s) at the listed positions to column names.

    A list with exactly one position yields a flat column index;
    otherwise MultiIndex columns are created, one level per
    listed row.

    Args:
        row_numbers: List of integer row positions holding
            the column names.
        df: A pandas DataFrame.
        remove_rows: Whether the header row(s) should be dropped.
        remove_rows_above: Not supported for a list;
            must be False.
        reset_index: Whether the result should get a fresh
            0..n-1 index.

    Raises:
        ValueError: If `remove_rows_above` is True.

    Returns:
        A pandas DataFrame with set column names.
    """
    if remove_rows_above:
        raise ValueError(
            "The remove_rows_above argument is applicable "
            "only if the row_numbers argument is an integer "
            "or a slice."
        )

    # Every entry is validated with the project's `check` helper.
    for entry in row_numbers:
        check("entry in the row_numbers argument", entry, [int])

    # Relabel a slice of the frame rather than the object passed in.
    df_ = df[:]
    headers = df_.iloc[row_numbers]
    if isinstance(headers, pd.DataFrame) and (len(headers) == 1):
        # Take the single row as a Series. `.squeeze()` would collapse
        # a one-column frame to a scalar, which cannot be used
        # as column labels.
        headers = headers.iloc[0]
        df_.columns = headers
        df_.columns.name = None
    else:
        # One tuple per column, holding that column's header values.
        headers = [array._values for _, array in headers.items()]
        headers = pd.MultiIndex.from_tuples(headers)
        df_.columns = headers

    if not remove_rows:
        return df_.reset_index(drop=True) if reset_index else df_

    # Drop the header rows: mask them out and keep what is left.
    len_df = len(df_)
    mask = np.ones(len_df, dtype=np.bool_)
    mask[row_numbers] = False
    indexer = np.arange(len_df)[mask]

    # Subset column by column and rebuild the frame from the pieces.
    arrays = {
        num: arr._values[indexer]
        for num, (_, arr) in enumerate(df_.items())
    }
    if reset_index:
        df_index = pd.RangeIndex(start=0, stop=indexer.size)
    else:
        df_index = df_.index[indexer]
    _df = pd.DataFrame(data=arrays, index=df_index, copy=False)
    _df.columns = df_.columns
    return _df
2 changes: 1 addition & 1 deletion janitor/polars/complete.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
try:
import polars as pl
import polars.selectors as cs
from polars.type_aliases import ColumnNameOrSelector
from polars._typing import ColumnNameOrSelector
except ImportError:
import_message(
submodule="polars",
Expand Down
2 changes: 1 addition & 1 deletion janitor/polars/pivot_longer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

try:
import polars as pl
from polars.type_aliases import ColumnNameOrSelector
from polars._typing import ColumnNameOrSelector
except ImportError:
import_message(
submodule="polars",
Expand Down
Loading

0 comments on commit a14061c

Please sign in to comment.