ENH: Enabled skipna argument on groupby reduction ops (pandas-dev#15675)
Added a skipna argument to the groupby reduction ops sum, prod, min, max, mean, median, var, std and sem
Added relevant tests
Updated whatsnew to reflect changes

Co-authored-by: Tiago Firmino <[email protected]>
andremcorreia and tiago-firmino committed May 27, 2024
1 parent b162331 commit 3118e60
Showing 11 changed files with 428 additions and 109 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v3.0.0.rst
@@ -39,13 +39,13 @@ Other enhancements
- Users can globally disable any ``PerformanceWarning`` by setting the option ``mode.performance_warnings`` to ``False`` (:issue:`56920`)
- :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`)
- :class:`.errors.DtypeWarning` improved to include column names when mixed data types are detected (:issue:`58174`)
- :meth:`.DataFrameGroupBy.sum`, :meth:`.DataFrameGroupBy.prod`, :meth:`.DataFrameGroupBy.min`, :meth:`.DataFrameGroupBy.max`, :meth:`.DataFrameGroupBy.mean`, :meth:`.DataFrameGroupBy.median`, :meth:`.DataFrameGroupBy.sem`, :meth:`.DataFrameGroupBy.std` and :meth:`.DataFrameGroupBy.var` now accept a ``skipna`` argument (:issue:`15675`)
- :meth:`DataFrame.corrwith` now accepts ``min_periods`` as an optional argument, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`)
- :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`)
- :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`)
- :meth:`Series.cummin` and :meth:`Series.cummax` now support :class:`CategoricalDtype` (:issue:`52335`)
- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
- Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`)
-

.. ---------------------------------------------------------------------------
.. _whatsnew_300.notable_bug_fixes:
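A minimal usage sketch of the `skipna` argument described in the whatsnew entry above; the annotated results are expectations based on this change, not captured output.

import numpy as np
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1.0, np.nan, 3.0]})

# Default behavior: NA values are skipped within each group.
df.groupby("key")["val"].sum()              # expected: a -> 1.0, b -> 3.0

# With skipna=False, any NA in a group propagates to that group's result.
df.groupby("key")["val"].sum(skipna=False)  # expected: a -> NaN, b -> 3.0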
103 changes: 73 additions & 30 deletions pandas/_libs/groupby.pyx
@@ -104,7 +104,8 @@ cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept n
cdef float64_t median_linear(
float64_t* a,
int n,
bint is_datetimelike=False
bint is_datetimelike=False,
bint skipna=True
) noexcept nogil:
cdef:
int i, j, na_count = 0
@@ -118,10 +119,14 @@ cdef float64_t median_linear(
if is_datetimelike:
for i in range(n):
if a[i] == NPY_NAT:
if not skipna:
return NaN
na_count += 1
else:
for i in range(n):
if a[i] != a[i]:
if not skipna:
return NaN
na_count += 1

if na_count:
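The early `return NaN` added above short-circuits the scan as soon as the first NA is seen with skipna=False. A rough Python rendering of that control flow (a sketch only: the real routine selects the median in place rather than sorting, and uses NPY_NAT rather than NaN for datetime-like data):

import math

def median_linear_sketch(a, skipna=True):
    vals = []
    for x in a:
        if math.isnan(x):
            if not skipna:
                return math.nan   # first NA decides the whole result
            continue              # skipna=True: count and skip, as above
        vals.append(x)
    if not vals:                  # all-NA group
        return math.nan
    vals.sort()
    n, mid = len(vals), len(vals) // 2
    return vals[mid] if n % 2 else (vals[mid - 1] + vals[mid]) / 2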
@@ -186,6 +191,7 @@ def group_median_float64(
const uint8_t[:, :] mask=None,
uint8_t[:, ::1] result_mask=None,
bint is_datetimelike=False,
bint skipna=True,
) -> None:
"""
Only aggregates on axis=0
@@ -244,7 +250,7 @@
ptr += _counts[0]
for j in range(ngroups):
size = _counts[j + 1]
out[j, i] = median_linear(ptr, size, is_datetimelike)
out[j, i] = median_linear(ptr, size, is_datetimelike, skipna)
ptr += size


@@ -694,6 +700,7 @@ def group_sum(
uint8_t[:, ::1] result_mask=None,
Py_ssize_t min_count=0,
bint is_datetimelike=False,
bint skipna=True,
) -> None:
"""
Only aggregates on axis=0 using Kahan summation
@@ -733,32 +740,39 @@
else:
isna_entry = _treat_as_na(val, is_datetimelike)

if not isna_entry:
nobs[lab, j] += 1
if isna_entry:
if skipna:
continue
else:
sumx[lab, j] = val
compensation[lab, j] = 0
break

if sum_t is object:
# NB: this does not use 'compensation' like the non-object
# track does.
if nobs[lab, j] == 1:
# i.e. we haven't added anything yet; avoid TypeError
# if e.g. val is a str and sumx[lab, j] is 0
t = val
else:
t = sumx[lab, j] + val
sumx[lab, j] = t
nobs[lab, j] += 1

if sum_t is object:
# NB: this does not use 'compensation' like the non-object
# track does.
if nobs[lab, j] == 1:
# i.e. we haven't added anything yet; avoid TypeError
# if e.g. val is a str and sumx[lab, j] is 0
t = val
else:
y = val - compensation[lab, j]
t = sumx[lab, j] + y
compensation[lab, j] = t - sumx[lab, j] - y
if compensation[lab, j] != compensation[lab, j]:
# GH#53606
# If val is +/- infinity compensation is NaN
# which would lead to results being NaN instead
# of +/- infinity. We cannot use util.is_nan
# because of no gil
compensation[lab, j] = 0
sumx[lab, j] = t
t = sumx[lab, j] + val
sumx[lab, j] = t

else:
y = val - compensation[lab, j]
t = sumx[lab, j] + y
compensation[lab, j] = t - sumx[lab, j] - y
if compensation[lab, j] != compensation[lab, j]:
# GH#53606
# If val is +/- infinity compensation is NaN
# which would lead to results being NaN instead
# of +/- infinity. We cannot use util.is_nan
# because of no gil
compensation[lab, j] = 0
sumx[lab, j] = t

_check_below_mincount(
out, uses_mask, result_mask, ncounts, K, nobs, min_count, sumx
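Restructuring the loop so the NA check comes first lets an NA either be skipped (skipna=True) or poison the group's sum and stop further accumulation (skipna=False) before the Kahan update runs. A one-column Python sketch of the accumulation, using a hypothetical helper name:

import math

def kahan_group_sum(values, labels, ngroups, skipna=True):
    sums = [0.0] * ngroups
    comp = [0.0] * ngroups    # Kahan compensation term per group
    dead = [False] * ngroups  # groups already poisoned by an NA
    for val, lab in zip(values, labels):
        if lab < 0 or dead[lab]:
            continue
        if math.isnan(val):
            if skipna:
                continue
            sums[lab], comp[lab], dead[lab] = math.nan, 0.0, True
            continue
        y = val - comp[lab]   # compensated (Kahan) summation
        t = sums[lab] + y
        comp[lab] = t - sums[lab] - y
        sums[lab] = t
    return sums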
@@ -775,6 +789,7 @@ def group_prod(
const uint8_t[:, ::1] mask,
uint8_t[:, ::1] result_mask=None,
Py_ssize_t min_count=0,
bint skipna=True,
) -> None:
"""
Only aggregates on axis=0
@@ -813,6 +828,10 @@
if not isna_entry:
nobs[lab, j] += 1
prodx[lab, j] *= val
elif not skipna:
prodx[lab, j] = val
nobs[lab, j] = 0
break

_check_below_mincount(
out, uses_mask, result_mask, ncounts, K, nobs, min_count, prodx
@@ -832,6 +851,7 @@ def group_var(
const uint8_t[:, ::1] mask=None,
uint8_t[:, ::1] result_mask=None,
bint is_datetimelike=False,
bint skipna=True,
str name="var",
) -> None:
cdef:
@@ -877,7 +897,12 @@
else:
isna_entry = _treat_as_na(val, is_datetimelike)

if not isna_entry:
if not skipna and isna_entry:
out[lab, j] = val
nobs[lab, j] = 0
break

elif not isna_entry:
nobs[lab, j] += 1
oldmean = mean[lab, j]
mean[lab, j] += (val - oldmean) / nobs[lab, j]
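The running-mean update above is the first half of Welford's online variance algorithm. The hunk is truncated here, so the sketch below reconstructs the standard recurrence under that assumption (hypothetical helper, float-only):

import math

def grouped_var_sketch(values, labels, ngroups, skipna=True, ddof=1):
    count = [0] * ngroups
    mean = [0.0] * ngroups
    m2 = [0.0] * ngroups      # running sum of squared deviations
    dead = [False] * ngroups
    for val, lab in zip(values, labels):
        if lab < 0 or dead[lab]:
            continue
        if math.isnan(val):
            if skipna:
                continue
            m2[lab], dead[lab] = math.nan, True  # poison the group
            continue
        count[lab] += 1
        delta = val - mean[lab]
        mean[lab] += delta / count[lab]          # the oldmean update above
        m2[lab] += delta * (val - mean[lab])
    return [m2[g] / (count[g] - ddof) if count[g] > ddof else math.nan
            for g in range(ngroups)]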
@@ -998,6 +1023,7 @@ def group_mean(
const intp_t[::1] labels,
Py_ssize_t min_count=-1,
bint is_datetimelike=False,
bint skipna=True,
const uint8_t[:, ::1] mask=None,
uint8_t[:, ::1] result_mask=None,
) -> None:
@@ -1021,6 +1047,8 @@
Only used in sum and prod. Always -1.
is_datetimelike : bool
True if `values` contains datetime-like entries.
skipna : bool, default True
Exclude NA/null values when computing the result.
mask : ndarray[bool, ndim=2], optional
Mask of the input values.
result_mask : ndarray[bool, ndim=2], optional
@@ -1078,7 +1106,12 @@
else:
isna_entry = _treat_as_na(val, is_datetimelike)

if not isna_entry:
if not skipna and isna_entry:
sumx[lab, j] = nan_val
nobs[lab, j] = 0
break

elif not isna_entry:
nobs[lab, j] += 1
y = val - compensation[lab, j]
t = sumx[lab, j] + y
Expand All @@ -1096,12 +1129,10 @@ def group_mean(
for j in range(K):
count = nobs[i, j]
if nobs[i, j] == 0:

if uses_mask:
result_mask[i, j] = True
else:
out[i, j] = nan_val

else:
out[i, j] = sumx[i, j] / count
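In this post-processing, an empty group sets result_mask on the masked (nullable-dtype) path and writes nan_val otherwise; a group poisoned by skipna=False lands in the same nobs == 0 branch. Assuming the masked path behaves as the diff suggests, nullable dtypes should therefore surface the poisoned result as <NA> rather than NaN:

import pandas as pd

s = pd.Series([1, None, 3], dtype="Int64")
key = ["a", "a", "b"]

s.groupby(key).mean()              # expected: a -> 1.0, b -> 3.0
s.groupby(key).mean(skipna=False)  # expected: a -> <NA>, b -> 3.0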

@@ -1660,6 +1691,7 @@ cdef group_min_max(
Py_ssize_t min_count=-1,
bint is_datetimelike=False,
bint compute_max=True,
bint skipna=True,
const uint8_t[:, ::1] mask=None,
uint8_t[:, ::1] result_mask=None,
):
@@ -1683,6 +1715,8 @@
True if `values` contains datetime-like entries.
compute_max : bint, default True
True to compute group-wise max, False to compute min
skipna : bool, default True
Exclude NA/null values when computing the result.
mask : ndarray[bool, ndim=2], optional
If not None, indices represent missing values,
otherwise the mask will not be used
@@ -1729,7 +1763,12 @@
else:
isna_entry = _treat_as_na(val, is_datetimelike)

if not isna_entry:
if not skipna and isna_entry:
group_min_or_max[lab, j] = val
nobs[lab, j] = 0
break

elif not isna_entry:
nobs[lab, j] += 1
if compute_max:
if val > group_min_or_max[lab, j]:
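group_min_max applies the same poison-and-break pattern. A one-column Python sketch for the max case (hypothetical helper; the Cython version also honors min_count via nobs):

import math

def grouped_max_sketch(values, labels, ngroups, skipna=True):
    out = [-math.inf] * ngroups
    nobs = [0] * ngroups
    dead = [False] * ngroups
    for val, lab in zip(values, labels):
        if lab < 0 or dead[lab]:
            continue
        if math.isnan(val):
            if skipna:
                continue
            out[lab], dead[lab] = math.nan, True
            continue
        nobs[lab] += 1
        if val > out[lab]:
            out[lab] = val
    # groups that never saw a valid value also come back as NaN
    return [out[g] if nobs[g] and not dead[g] else math.nan
            for g in range(ngroups)]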
@@ -1866,6 +1905,7 @@ def group_max(
const intp_t[::1] labels,
Py_ssize_t min_count=-1,
bint is_datetimelike=False,
bint skipna=True,
const uint8_t[:, ::1] mask=None,
uint8_t[:, ::1] result_mask=None,
) -> None:
@@ -1880,6 +1920,7 @@
compute_max=True,
mask=mask,
result_mask=result_mask,
skipna=skipna,
)


@@ -1892,6 +1933,7 @@ def group_min(
const intp_t[::1] labels,
Py_ssize_t min_count=-1,
bint is_datetimelike=False,
bint skipna=True,
const uint8_t[:, ::1] mask=None,
uint8_t[:, ::1] result_mask=None,
) -> None:
@@ -1906,6 +1948,7 @@
compute_max=False,
mask=mask,
result_mask=result_mask,
skipna=skipna,
)


15 changes: 11 additions & 4 deletions pandas/core/_numba/executor.py
@@ -69,13 +69,20 @@ def column_looper(
labels: np.ndarray,
ngroups: int,
min_periods: int,
skipna: bool = True,
*args,
):
result = np.empty((values.shape[0], ngroups), dtype=result_dtype)
na_positions = {}
for i in numba.prange(values.shape[0]):
output, na_pos = func(
values[i], result_dtype, labels, ngroups, min_periods, *args
values[i],
result_dtype,
labels,
ngroups,
min_periods,
*args,
skipna,
)
result[i] = output
if len(na_pos) > 0:
@@ -162,6 +169,7 @@ def generate_shared_aggregator(
nopython: bool,
nogil: bool,
parallel: bool,
skipna: bool = True,
):
"""
Generate a Numba function that loops over the columns 2D object and applies
@@ -190,7 +198,6 @@
-------
Numba function
"""

# A wrapper around the looper function,
# to dispatch based on dtype since numba is unable to do that in nopython mode

@@ -214,11 +221,11 @@ def looper_wrapper(
# Need to unpack kwargs since numba only supports *args
if is_grouped_kernel:
result, na_positions = column_looper(
values, labels, ngroups, min_periods, *kwargs.values()
values, labels, ngroups, min_periods, skipna, *kwargs.values()
)
else:
result, na_positions = column_looper(
values, start, end, min_periods, *kwargs.values()
values, start, end, min_periods, skipna, *kwargs.values()
)
if result.dtype.kind == "i":
# Look if na_positions is not empty
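Note the argument threading in column_looper above: skipna is received before *args but forwarded to the kernel after *args, because the grouped kernels take skipna as their final positional parameter (grouped_mean and grouped_min_max below follow this convention). A hypothetical stub showing the shape of a conforming kernel:

def grouped_kernel_stub(values, result_dtype, labels, ngroups,
                        min_periods,   # fixed leading arguments
                        is_max,        # kernel-specific extras arrive via *args
                        skipna=True):  # filled last by column_looper's dispatch
    ...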
6 changes: 4 additions & 2 deletions pandas/core/_numba/kernels/mean_.py
@@ -169,9 +169,10 @@ def grouped_mean(
labels: npt.NDArray[np.intp],
ngroups: int,
min_periods: int,
skipna: bool = True,
) -> tuple[np.ndarray, list[int]]:
output, nobs_arr, comp_arr, consecutive_counts, prev_vals = grouped_kahan_sum(
values, result_dtype, labels, ngroups
values, result_dtype, labels, ngroups, skipna
)

# Post-processing, replace sums that don't satisfy min_periods
@@ -187,7 +188,8 @@
result = sum_x
else:
result = np.nan
result /= nobs
if nobs != 0:
result /= nobs
output[lab] = result

# na_position is empty list since float64 can already hold nans
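The new `if nobs != 0` guard matters for skipna=False: a poisoned group reaches this post-processing with nobs == 0 and result already NaN, so the division is skipped rather than evaluated against a zero count. These kernels are exercised through the numba engine; a hedged usage sketch (requires numba, and assumes this change threads skipna through as shown in executor.py):

import numpy as np
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1.0, np.nan, 3.0]})

# expected: a -> NaN, b -> 3.0, matching the non-numba path
df.groupby("key")["val"].mean(engine="numba", skipna=False)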
4 changes: 4 additions & 0 deletions pandas/core/_numba/kernels/min_max_.py
@@ -88,6 +88,7 @@ def grouped_min_max(
ngroups: int,
min_periods: int,
is_max: bool,
skipna: bool = True,
) -> tuple[np.ndarray, list[int]]:
N = len(labels)
nobs = np.zeros(ngroups, dtype=np.int64)
@@ -102,6 +103,9 @@

if values.dtype.kind == "i" or not np.isnan(val):
nobs[lab] += 1
elif not skipna and np.isnan(val):
output[lab] = np.nan
continue
else:
# NaN value cannot be a min/max value
continue
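One subtlety in grouped_min_max above: there is no dead-group flag, yet the NaN written for skipna=False survives later valid values. That works because every ordered comparison against NaN is False, so an update of the form `if val > output[lab]` (assumed from the elided remainder of the loop) can never replace the poisoned value:

import math

# ordered comparisons with NaN are always False, so a poisoned group's
# NaN is never displaced by a later candidate min/max
print(5.0 > math.nan, 5.0 < math.nan)   # False False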