ENH: Enabled skipna argument on groupby reduction ops (pandas-dev#15675)
Added a skipna argument to the groupby reduction ops sum, prod, min, max, mean, median, var, std and sem
Added relevant tests
Updated whatsnew to reflect changes

Co-authored-by: Tiago Firmino <[email protected]>
andremcorreia and tiago-firmino committed May 27, 2024
1 parent b162331 commit 3118e60
Showing 11 changed files with 428 additions and 109 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v3.0.0.rst
@@ -39,13 +39,13 @@ Other enhancements
- Users can globally disable any ``PerformanceWarning`` by setting the option ``mode.performance_warnings`` to ``False`` (:issue:`56920`)
- :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`)
- :class:`.errors.DtypeWarning` improved to include column names when mixed data types are detected (:issue:`58174`)
- :meth:`.DataFrameGroupBy.sum`, :meth:`.DataFrameGroupBy.prod`, :meth:`.DataFrameGroupBy.min`, :meth:`.DataFrameGroupBy.max`, :meth:`.DataFrameGroupBy.mean`, :meth:`.DataFrameGroupBy.median`, :meth:`.DataFrameGroupBy.sem`, :meth:`.DataFrameGroupBy.std` and :meth:`.DataFrameGroupBy.var` now accept a ``skipna`` argument (:issue:`15675`)
- :meth:`DataFrame.corrwith` now accepts ``min_periods`` as an optional argument, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`)
- :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`)
- :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`)
- :meth:`Series.cummin` and :meth:`Series.cummax` now support :class:`CategoricalDtype` (:issue:`52335`)
- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
- Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`)
-

.. ---------------------------------------------------------------------------
.. _whatsnew_300.notable_bug_fixes:
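A minimal usage sketch of the `skipna` argument described in the whatsnew entry above; the annotated results are expectations based on this change, not captured output.

import numpy as np
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1.0, np.nan, 3.0]})

# Default behavior: NA values are skipped within each group.
df.groupby("key")["val"].sum()              # expected: a -> 1.0, b -> 3.0

# With skipna=False, any NA in a group propagates to that group's result.
df.groupby("key")["val"].sum(skipna=False)  # expected: a -> NaN, b -> 3.0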
103 changes: 73 additions & 30 deletions pandas/_libs/groupby.pyx
@@ -104,7 +104,8 @@ cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept n
cdef float64_t median_linear(
float64_t* a,
int n,
bint is_datetimelike=False
bint is_datetimelike=False,
bint skipna=True
) noexcept nogil:
cdef:
int i, j, na_count = 0
@@ -118,10 +119,14 @@ cdef float64_t median_linear(
if is_datetimelike:
for i in range(n):
if a[i] == NPY_NAT:
if not skipna:
return NaN
na_count += 1
else:
for i in range(n):
if a[i] != a[i]:
if not skipna:
return NaN
na_count += 1

if na_count:
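The early `return NaN` added above short-circuits the scan as soon as the first NA is seen with skipna=False. A rough Python rendering of that control flow (a sketch only: the real routine selects the median in place rather than sorting, and uses NPY_NAT rather than NaN for datetime-like data):

import math

def median_linear_sketch(a, skipna=True):
    vals = []
    for x in a:
        if math.isnan(x):
            if not skipna:
                return math.nan   # first NA decides the whole result
            continue              # skipna=True: count and skip, as above
        vals.append(x)
    if not vals:                  # all-NA group
        return math.nan
    vals.sort()
    n, mid = len(vals), len(vals) // 2
    return vals[mid] if n % 2 else (vals[mid - 1] + vals[mid]) / 2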
@@ -186,6 +191,7 @@ def group_median_float64(
const uint8_t[:, :] mask=None,
uint8_t[:, ::1] result_mask=None,
bint is_datetimelike=False,
bint skipna=True,
) -> None:
"""
Only aggregates on axis=0
@@ -244,7 +250,7 @@
ptr += _counts[0]
for j in range(ngroups):
size = _counts[j + 1]
out[j, i] = median_linear(ptr, size, is_datetimelike)
out[j, i] = median_linear(ptr, size, is_datetimelike, skipna)
ptr += size


@@ -694,6 +700,7 @@ def group_sum(
uint8_t[:, ::1] result_mask=None,
Py_ssize_t min_count=0,
bint is_datetimelike=False,
bint skipna=True,
) -> None:
"""
Only aggregates on axis=0 using Kahan summation
@@ -733,32 +740,39 @@
else:
isna_entry = _treat_as_na(val, is_datetimelike)

if not isna_entry:
nobs[lab, j] += 1
if isna_entry:
if skipna:
continue
else:
sumx[lab, j] = val
compensation[lab, j] = 0
break

if sum_t is object:
# NB: this does not use 'compensation' like the non-object
# track does.
if nobs[lab, j] == 1:
# i.e. we haven't added anything yet; avoid TypeError
# if e.g. val is a str and sumx[lab, j] is 0
t = val
else:
t = sumx[lab, j] + val
sumx[lab, j] = t
nobs[lab, j] += 1

if sum_t is object:
# NB: this does not use 'compensation' like the non-object
# track does.
if nobs[lab, j] == 1:
# i.e. we haven't added anything yet; avoid TypeError
# if e.g. val is a str and sumx[lab, j] is 0
t = val
else:
y = val - compensation[lab, j]
t = sumx[lab, j] + y
compensation[lab, j] = t - sumx[lab, j] - y
if compensation[lab, j] != compensation[lab, j]:
# GH#53606
# If val is +/- infinity compensation is NaN
# which would lead to results being NaN instead
# of +/- infinity. We cannot use util.is_nan
# because of no gil
compensation[lab, j] = 0
sumx[lab, j] = t
t = sumx[lab, j] + val
sumx[lab, j] = t

else:
y = val - compensation[lab, j]
t = sumx[lab, j] + y
compensation[lab, j] = t - sumx[lab, j] - y
if compensation[lab, j] != compensation[lab, j]:
# GH#53606
# If val is +/- infinity compensation is NaN
# which would lead to results being NaN instead
# of +/- infinity. We cannot use util.is_nan
# because of no gil
compensation[lab, j] = 0
sumx[lab, j] = t

_check_below_mincount(
out, uses_mask, result_mask, ncounts, K, nobs, min_count, sumx
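Restructuring the loop so the NA check comes first lets an NA either be skipped (skipna=True) or poison the group's sum and stop further accumulation (skipna=False) before the Kahan update runs. A one-column Python sketch of the accumulation, using a hypothetical helper name:

import math

def kahan_group_sum(values, labels, ngroups, skipna=True):
    sums = [0.0] * ngroups
    comp = [0.0] * ngroups    # Kahan compensation term per group
    dead = [False] * ngroups  # groups already poisoned by an NA
    for val, lab in zip(values, labels):
        if lab < 0 or dead[lab]:
            continue
        if math.isnan(val):
            if skipna:
                continue
            sums[lab], comp[lab], dead[lab] = math.nan, 0.0, True
            continue
        y = val - comp[lab]   # compensated (Kahan) summation
        t = sums[lab] + y
        comp[lab] = t - sums[lab] - y
        sums[lab] = t
    return sums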
@@ -775,6 +789,7 @@ def group_prod(
const uint8_t[:, ::1] mask,
uint8_t[:, ::1] result_mask=None,
Py_ssize_t min_count=0,
bint skipna=True,
) -> None:
"""
Only aggregates on axis=0
@@ -813,6 +828,10 @@
if not isna_entry:
nobs[lab, j] += 1
prodx[lab, j] *= val
elif not skipna:
prodx[lab, j] = val
nobs[lab, j] = 0
break

_check_below_mincount(
out, uses_mask, result_mask, ncounts, K, nobs, min_count, prodx
@@ -832,6 +851,7 @@ def group_var(
const uint8_t[:, ::1] mask=None,
uint8_t[:, ::1] result_mask=None,
bint is_datetimelike=False,
bint skipna=True,
str name="var",
) -> None:
cdef:
@@ -877,7 +897,12 @@
else:
isna_entry = _treat_as_na(val, is_datetimelike)

if not isna_entry:
if not skipna and isna_entry:
out[lab, j] = val
nobs[lab, j] = 0
break

elif not isna_entry:
nobs[lab, j] += 1
oldmean = mean[lab, j]
mean[lab, j] += (val - oldmean) / nobs[lab, j]
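The running-mean update above is the first half of Welford's online variance algorithm. The hunk is truncated here, so the sketch below reconstructs the standard recurrence under that assumption (hypothetical helper, float-only):

import math

def grouped_var_sketch(values, labels, ngroups, skipna=True, ddof=1):
    count = [0] * ngroups
    mean = [0.0] * ngroups
    m2 = [0.0] * ngroups      # running sum of squared deviations
    dead = [False] * ngroups
    for val, lab in zip(values, labels):
        if lab < 0 or dead[lab]:
            continue
        if math.isnan(val):
            if skipna:
                continue
            m2[lab], dead[lab] = math.nan, True  # poison the group
            continue
        count[lab] += 1
        delta = val - mean[lab]
        mean[lab] += delta / count[lab]          # the oldmean update above
        m2[lab] += delta * (val - mean[lab])
    return [m2[g] / (count[g] - ddof) if count[g] > ddof else math.nan
            for g in range(ngroups)]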
@@ -998,6 +1023,7 @@ def group_mean(
const intp_t[::1] labels,
Py_ssize_t min_count=-1,
bint is_datetimelike=False,
bint skipna=True,
const uint8_t[:, ::1] mask=None,
uint8_t[:, ::1] result_mask=None,
) -> None:
@@ -1021,6 +1047,8 @@
Only used in sum and prod. Always -1.
is_datetimelike : bool
True if `values` contains datetime-like entries.
skipna : bool, default True
Exclude NA/null values when computing the result.
mask : ndarray[bool, ndim=2], optional
Mask of the input values.
result_mask : ndarray[bool, ndim=2], optional
@@ -1078,7 +1106,12 @@
else:
isna_entry = _treat_as_na(val, is_datetimelike)

if not isna_entry:
if not skipna and isna_entry:
sumx[lab, j] = nan_val
nobs[lab, j] = 0
break

elif not isna_entry:
nobs[lab, j] += 1
y = val - compensation[lab, j]
t = sumx[lab, j] + y
Expand All @@ -1096,12 +1129,10 @@ def group_mean(
for j in range(K):
count = nobs[i, j]
if nobs[i, j] == 0:

if uses_mask:
result_mask[i, j] = True
else:
out[i, j] = nan_val

else:
out[i, j] = sumx[i, j] / count
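In this post-processing, an empty group sets result_mask on the masked (nullable-dtype) path and writes nan_val otherwise; a group poisoned by skipna=False lands in the same nobs == 0 branch. Assuming the masked path behaves as the diff suggests, nullable dtypes should therefore surface the poisoned result as <NA> rather than NaN:

import pandas as pd

s = pd.Series([1, None, 3], dtype="Int64")
key = ["a", "a", "b"]

s.groupby(key).mean()              # expected: a -> 1.0, b -> 3.0
s.groupby(key).mean(skipna=False)  # expected: a -> <NA>, b -> 3.0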

@@ -1660,6 +1691,7 @@ cdef group_min_max(
Py_ssize_t min_count=-1,
bint is_datetimelike=False,
bint compute_max=True,
bint skipna=True,
const uint8_t[:, ::1] mask=None,
uint8_t[:, ::1] result_mask=None,
):
@@ -1683,6 +1715,8 @@
True if `values` contains datetime-like entries.
compute_max : bint, default True
True to compute group-wise max, False to compute min
skipna : bool, default True
Exclude NA/null values when computing the result.
mask : ndarray[bool, ndim=2], optional
If not None, indices represent missing values,
otherwise the mask will not be used
@@ -1729,7 +1763,12 @@
else:
isna_entry = _treat_as_na(val, is_datetimelike)

if not isna_entry:
if not skipna and isna_entry:
group_min_or_max[lab, j] = val
nobs[lab, j] = 0
break

elif not isna_entry:
nobs[lab, j] += 1
if compute_max:
if val > group_min_or_max[lab, j]:
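group_min_max applies the same poison-and-break pattern. A one-column Python sketch for the max case (hypothetical helper; the Cython version also honors min_count via nobs):

import math

def grouped_max_sketch(values, labels, ngroups, skipna=True):
    out = [-math.inf] * ngroups
    nobs = [0] * ngroups
    dead = [False] * ngroups
    for val, lab in zip(values, labels):
        if lab < 0 or dead[lab]:
            continue
        if math.isnan(val):
            if skipna:
                continue
            out[lab], dead[lab] = math.nan, True
            continue
        nobs[lab] += 1
        if val > out[lab]:
            out[lab] = val
    # groups that never saw a valid value also come back as NaN
    return [out[g] if nobs[g] and not dead[g] else math.nan
            for g in range(ngroups)]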
@@ -1866,6 +1905,7 @@ def group_max(
const intp_t[::1] labels,
Py_ssize_t min_count=-1,
bint is_datetimelike=False,
bint skipna=True,
const uint8_t[:, ::1] mask=None,
uint8_t[:, ::1] result_mask=None,
) -> None:
@@ -1880,6 +1920,7 @@
compute_max=True,
mask=mask,
result_mask=result_mask,
skipna=skipna,
)


@@ -1892,6 +1933,7 @@ def group_min(
const intp_t[::1] labels,
Py_ssize_t min_count=-1,
bint is_datetimelike=False,
bint skipna=True,
const uint8_t[:, ::1] mask=None,
uint8_t[:, ::1] result_mask=None,
) -> None:
@@ -1906,6 +1948,7 @@
compute_max=False,
mask=mask,
result_mask=result_mask,
skipna=skipna,
)


15 changes: 11 additions & 4 deletions pandas/core/_numba/executor.py
@@ -69,13 +69,20 @@ def column_looper(
labels: np.ndarray,
ngroups: int,
min_periods: int,
skipna: bool = True,
*args,
):
result = np.empty((values.shape[0], ngroups), dtype=result_dtype)
na_positions = {}
for i in numba.prange(values.shape[0]):
output, na_pos = func(
values[i], result_dtype, labels, ngroups, min_periods, *args
values[i],
result_dtype,
labels,
ngroups,
min_periods,
*args,
skipna,
)
result[i] = output
if len(na_pos) > 0:
@@ -162,6 +169,7 @@ def generate_shared_aggregator(
nopython: bool,
nogil: bool,
parallel: bool,
skipna: bool = True,
):
"""
Generate a Numba function that loops over the columns 2D object and applies
@@ -190,7 +198,6 @@
-------
Numba function
"""

# A wrapper around the looper function,
# to dispatch based on dtype since numba is unable to do that in nopython mode

@@ -214,11 +221,11 @@ def looper_wrapper(
# Need to unpack kwargs since numba only supports *args
if is_grouped_kernel:
result, na_positions = column_looper(
values, labels, ngroups, min_periods, *kwargs.values()
values, labels, ngroups, min_periods, skipna, *kwargs.values()
)
else:
result, na_positions = column_looper(
values, start, end, min_periods, *kwargs.values()
values, start, end, min_periods, skipna, *kwargs.values()
)
if result.dtype.kind == "i":
# Look if na_positions is not empty
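Note the argument threading in column_looper above: skipna is received before *args but forwarded to the kernel after *args, because the grouped kernels take skipna as their final positional parameter (grouped_mean and grouped_min_max below follow this convention). A hypothetical stub showing the shape of a conforming kernel:

def grouped_kernel_stub(values, result_dtype, labels, ngroups,
                        min_periods,   # fixed leading arguments
                        is_max,        # kernel-specific extras arrive via *args
                        skipna=True):  # filled last by column_looper's dispatch
    ...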
6 changes: 4 additions & 2 deletions pandas/core/_numba/kernels/mean_.py
@@ -169,9 +169,10 @@ def grouped_mean(
labels: npt.NDArray[np.intp],
ngroups: int,
min_periods: int,
skipna: bool = True,
) -> tuple[np.ndarray, list[int]]:
output, nobs_arr, comp_arr, consecutive_counts, prev_vals = grouped_kahan_sum(
values, result_dtype, labels, ngroups
values, result_dtype, labels, ngroups, skipna
)

# Post-processing, replace sums that don't satisfy min_periods
@@ -187,7 +188,8 @@
result = sum_x
else:
result = np.nan
result /= nobs
if nobs != 0:
result /= nobs
output[lab] = result

# na_position is empty list since float64 can already hold nans
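The new `if nobs != 0` guard matters for skipna=False: a poisoned group reaches this post-processing with nobs == 0 and result already NaN, so the division is skipped rather than evaluated against a zero count. These kernels are exercised through the numba engine; a hedged usage sketch (requires numba, and assumes this change threads skipna through as shown in executor.py):

import numpy as np
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1.0, np.nan, 3.0]})

# expected: a -> NaN, b -> 3.0, matching the non-numba path
df.groupby("key")["val"].mean(engine="numba", skipna=False)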
4 changes: 4 additions & 0 deletions pandas/core/_numba/kernels/min_max_.py
@@ -88,6 +88,7 @@ def grouped_min_max(
ngroups: int,
min_periods: int,
is_max: bool,
skipna: bool = True,
) -> tuple[np.ndarray, list[int]]:
N = len(labels)
nobs = np.zeros(ngroups, dtype=np.int64)
@@ -102,6 +103,9 @@

if values.dtype.kind == "i" or not np.isnan(val):
nobs[lab] += 1
elif not skipna and np.isnan(val):
output[lab] = np.nan
continue
else:
# NaN value cannot be a min/max value
continue
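One subtlety in grouped_min_max above: there is no dead-group flag, yet the NaN written for skipna=False survives later valid values. That works because every ordered comparison against NaN is False, so an update of the form `if val > output[lab]` (assumed from the elided remainder of the loop) can never replace the poisoned value:

import math

# ordered comparisons with NaN are always False, so a poisoned group's
# NaN is never displaced by a later candidate min/max
print(5.0 > math.nan, 5.0 < math.nan)   # False False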