Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
OliEfr committed Mar 29, 2021
2 parents 1834e83 + ab599f3 commit 31260a3
Show file tree
Hide file tree
Showing 11 changed files with 94 additions and 20 deletions.
2 changes: 1 addition & 1 deletion doc/source/user_guide/window.rst
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ be calculated with :meth:`~Rolling.apply` by specifying a separate column of wei
All windowing operations support a ``min_periods`` argument that dictates the minimum number of
non-``np.nan`` values a window must have; otherwise, the resulting value is ``np.nan``.
``min_peridos`` defaults to 1 for time-based windows and ``window`` for fixed windows
``min_periods`` defaults to 1 for time-based windows and ``window`` for fixed windows.

.. ipython:: python
Expand Down
32 changes: 32 additions & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,38 @@ cast to ``dtype=object`` (:issue:`38709`)
    ser2

.. _whatsnew_130.notable_bug_fixes.rolling_groupby_column:

GroupBy.rolling no longer returns grouped-by column in values
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The group-by column will now be dropped from the result of a
``groupby.rolling`` operation (:issue:`32262`)

.. ipython:: python

    df = pd.DataFrame({"A": [1, 1, 2, 3], "B": [0, 1, 2, 3]})
    df

*Previous behavior*:

.. code-block:: ipython

    In [1]: df.groupby("A").rolling(2).sum()
    Out[1]:
           A    B
    A
    1 0  NaN  NaN
      1  2.0  1.0
    2 2  NaN  NaN
    3 3  NaN  NaN

*New behavior*:

.. ipython:: python

    df.groupby("A").rolling(2).sum()

.. _whatsnew_130.notable_bug_fixes.rolling_var_precision:

Removed artificial truncation in rolling variance and standard deviation
Expand Down
17 changes: 6 additions & 11 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -681,18 +681,17 @@ group_mean_float64 = _group_mean['double']

@cython.wraparound(False)
@cython.boundscheck(False)
def _group_ohlc(floating[:, ::1] out,
int64_t[::1] counts,
ndarray[floating, ndim=2] values,
const intp_t[:] labels,
Py_ssize_t min_count=-1):
def group_ohlc(floating[:, ::1] out,
int64_t[::1] counts,
ndarray[floating, ndim=2] values,
const intp_t[:] labels,
Py_ssize_t min_count=-1):
"""
Only aggregates on axis=0
"""
cdef:
Py_ssize_t i, j, N, K, lab
floating val, count
Py_ssize_t ngroups = len(counts)
floating val

assert min_count == -1, "'min_count' only used in add and prod"

Expand Down Expand Up @@ -727,10 +726,6 @@ def _group_ohlc(floating[:, ::1] out,
out[lab, 3] = val


group_ohlc_float32 = _group_ohlc['float']
group_ohlc_float64 = _group_ohlc['double']


@cython.boundscheck(False)
@cython.wraparound(False)
def group_quantile(ndarray[float64_t] out,
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -528,7 +528,7 @@ class DataFrame(NDFrame, OpsMixin):
>>> from dataclasses import make_dataclass
>>> Point = make_dataclass("Point", [("x", int), ("y", int)])
>>> pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)])
x y
x y
0 0 0
1 0 3
2 2 3
Expand Down
6 changes: 6 additions & 0 deletions pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -486,6 +486,12 @@ def _get_cython_func_and_vals(
func = _get_cython_function(kind, how, values.dtype, is_numeric)
else:
raise
else:
if values.dtype.kind in ["i", "u"]:
if how in ["ohlc"]:
# The output may still include nans, so we have to cast
values = ensure_float64(values)

return func, values

@final
Expand Down
5 changes: 1 addition & 4 deletions pandas/core/missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -861,7 +861,4 @@ def _rolling_window(a: np.ndarray, window: int):
# https://stackoverflow.com/a/6811241
shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
strides = a.strides + (a.strides[-1],)
# error: Module has no attribute "stride_tricks"
return np.lib.stride_tricks.as_strided( # type: ignore[attr-defined]
a, shape=shape, strides=strides
)
return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
4 changes: 2 additions & 2 deletions pandas/core/strings/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -1925,13 +1925,13 @@ def get_dummies(self, sep="|"):
Examples
--------
>>> pd.Series(['a|b', 'a', 'a|c']).str.get_dummies()
a b c
a b c
0 1 1 0
1 1 0 0
2 1 0 1
>>> pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies()
a b c
a b c
0 1 1 0
1 0 0 0
2 1 0 1
Expand Down
4 changes: 4 additions & 0 deletions pandas/core/window/rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -558,6 +558,10 @@ def __init__(
if _grouper is None:
raise ValueError("Must pass a Grouper object.")
self._grouper = _grouper
# GH 32262: It's convention to keep the grouping column in
# groupby.<agg_func>, but unexpected to users in
# groupby.rolling.<agg_func>
obj = obj.drop(columns=self._grouper.names, errors="ignore")
super().__init__(obj, *args, **kwargs)

def _apply(
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/groupby/test_libgroupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def _check(dtype):
counts = np.zeros(len(out), dtype=np.int64)
labels = ensure_platform_int(np.repeat(np.arange(3), np.diff(np.r_[0, bins])))

func = getattr(libgroupby, f"group_ohlc_{dtype}")
func = libgroupby.group_ohlc
func(out, counts, obj[:, None], labels)

def _ohlc(group):
Expand Down
37 changes: 37 additions & 0 deletions pandas/tests/window/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@ def test_rolling(self, f):

result = getattr(r, f)()
expected = g.apply(lambda x: getattr(x.rolling(4), f)())
# groupby.apply doesn't drop the grouped-by column
expected = expected.drop("A", axis=1)
# GH 39732
expected_index = MultiIndex.from_arrays([self.frame["A"], range(40)])
expected.index = expected_index
Expand All @@ -95,6 +97,8 @@ def test_rolling_ddof(self, f):

result = getattr(r, f)(ddof=1)
expected = g.apply(lambda x: getattr(x.rolling(4), f)(ddof=1))
# groupby.apply doesn't drop the grouped-by column
expected = expected.drop("A", axis=1)
# GH 39732
expected_index = MultiIndex.from_arrays([self.frame["A"], range(40)])
expected.index = expected_index
Expand All @@ -111,6 +115,8 @@ def test_rolling_quantile(self, interpolation):
expected = g.apply(
lambda x: x.rolling(4).quantile(0.4, interpolation=interpolation)
)
# groupby.apply doesn't drop the grouped-by column
expected = expected.drop("A", axis=1)
# GH 39732
expected_index = MultiIndex.from_arrays([self.frame["A"], range(40)])
expected.index = expected_index
Expand Down Expand Up @@ -147,6 +153,8 @@ def test_rolling_apply(self, raw):
# reduction
result = r.apply(lambda x: x.sum(), raw=raw)
expected = g.apply(lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw))
# groupby.apply doesn't drop the grouped-by column
expected = expected.drop("A", axis=1)
# GH 39732
expected_index = MultiIndex.from_arrays([self.frame["A"], range(40)])
expected.index = expected_index
Expand Down Expand Up @@ -442,6 +450,8 @@ def test_groupby_rolling_empty_frame(self):
# GH 36197
expected = DataFrame({"s1": []})
result = expected.groupby("s1").rolling(window=1).sum()
# GH 32262
expected = expected.drop(columns="s1")
# GH-38057 from_tuples gives empty object dtype, we now get float/int levels
# expected.index = MultiIndex.from_tuples([], names=["s1", None])
expected.index = MultiIndex.from_product(
Expand All @@ -451,6 +461,8 @@ def test_groupby_rolling_empty_frame(self):

expected = DataFrame({"s1": [], "s2": []})
result = expected.groupby(["s1", "s2"]).rolling(window=1).sum()
# GH 32262
expected = expected.drop(columns=["s1", "s2"])
expected.index = MultiIndex.from_product(
[
Index([], dtype="float64"),
Expand Down Expand Up @@ -503,6 +515,8 @@ def test_groupby_rolling_no_sort(self):
columns=["foo", "bar"],
index=MultiIndex.from_tuples([(2, 0), (1, 1)], names=["foo", None]),
)
# GH 32262
expected = expected.drop(columns="foo")
tm.assert_frame_equal(result, expected)

def test_groupby_rolling_count_closed_on(self):
Expand Down Expand Up @@ -553,6 +567,8 @@ def test_groupby_rolling_sem(self, func, kwargs):
[("a", 0), ("a", 1), ("b", 2), ("b", 3), ("b", 4)], names=["a", None]
),
)
# GH 32262
expected = expected.drop(columns="a")
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize(
Expand Down Expand Up @@ -666,6 +682,19 @@ def test_groupby_rolling_object_doesnt_affect_groupby_apply(self):
assert not g.mutated
assert not g.grouper.mutated

@pytest.mark.parametrize(
    "columns", [MultiIndex.from_tuples([("A", ""), ("B", "C")]), ["A", "B"]]
)
def test_by_column_not_in_values(self, columns):
    # GH 32262
    # Three groups keyed by column "A" (sizes 20, 12 and 8); the second
    # column is constant zero.
    rows = [[1, 0]] * 20 + [[2, 0]] * 12 + [[3, 0]] * 8
    frame = DataFrame(rows, columns=columns)
    grouped = frame.groupby("A")
    # Deep copy taken before rolling so the final comparison can detect any
    # change made to the frame held by the groupby object.
    snapshot = grouped.obj.copy(deep=True)
    result = grouped.rolling(4).sum()
    # The group-by key must not appear among the result's columns ...
    assert "A" not in result.columns
    # ... and the frame referenced by the groupby object must be unchanged.
    tm.assert_frame_equal(grouped.obj, snapshot)


class TestExpanding:
def setup_method(self):
Expand All @@ -680,6 +709,8 @@ def test_expanding(self, f):

result = getattr(r, f)()
expected = g.apply(lambda x: getattr(x.expanding(), f)())
# groupby.apply doesn't drop the grouped-by column
expected = expected.drop("A", axis=1)
# GH 39732
expected_index = MultiIndex.from_arrays([self.frame["A"], range(40)])
expected.index = expected_index
Expand All @@ -692,6 +723,8 @@ def test_expanding_ddof(self, f):

result = getattr(r, f)(ddof=0)
expected = g.apply(lambda x: getattr(x.expanding(), f)(ddof=0))
# groupby.apply doesn't drop the grouped-by column
expected = expected.drop("A", axis=1)
# GH 39732
expected_index = MultiIndex.from_arrays([self.frame["A"], range(40)])
expected.index = expected_index
Expand All @@ -708,6 +741,8 @@ def test_expanding_quantile(self, interpolation):
expected = g.apply(
lambda x: x.expanding().quantile(0.4, interpolation=interpolation)
)
# groupby.apply doesn't drop the grouped-by column
expected = expected.drop("A", axis=1)
# GH 39732
expected_index = MultiIndex.from_arrays([self.frame["A"], range(40)])
expected.index = expected_index
Expand Down Expand Up @@ -748,6 +783,8 @@ def test_expanding_apply(self, raw):
# reduction
result = r.apply(lambda x: x.sum(), raw=raw)
expected = g.apply(lambda x: x.expanding().apply(lambda y: y.sum(), raw=raw))
# groupby.apply doesn't drop the grouped-by column
expected = expected.drop("A", axis=1)
# GH 39732
expected_index = MultiIndex.from_arrays([self.frame["A"], range(40)])
expected.index = expected_index
Expand Down
3 changes: 3 additions & 0 deletions pandas/tests/window/test_rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -719,6 +719,9 @@ def scaled_sum(*args):
df = DataFrame(data={"X": range(5)}, index=[0, 0, 1, 1, 1])

expected = DataFrame(data={"X": [0.0, 0.5, 1.0, 1.5, 2.0]}, index=_index)
# GH 40341
if "by" in grouping:
expected = expected.drop(columns="X", errors="ignore")
result = df.groupby(**grouping).rolling(1).apply(scaled_sum, raw=raw, args=(2,))
tm.assert_frame_equal(result, expected)

Expand Down

0 comments on commit 31260a3

Please sign in to comment.