Skip to content

Commit

Permalink
Enable taking the mean of dask-backed cftime arrays (pydata#6940)
Browse files Browse the repository at this point in the history
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Deepak Cherian <[email protected]>
  • Loading branch information
3 people authored Sep 9, 2022
1 parent abe1e61 commit 2553762
Show file tree
Hide file tree
Showing 4 changed files with 62 additions and 25 deletions.
4 changes: 2 additions & 2 deletions ci/requirements/min-all-deps.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ dependencies:
- cfgrib=0.9
- cftime=1.4
- coveralls
- dask-core=2021.04
- distributed=2021.04
- dask-core=2021.08.0
- distributed=2021.08.0
- flox=0.5
- h5netcdf=0.11
- h5py=3.1
Expand Down
8 changes: 7 additions & 1 deletion doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,10 @@ v2022.07.0 (unreleased)

New Features
~~~~~~~~~~~~

- Enable taking the mean of dask-backed :py:class:`cftime.datetime` arrays
(:pull:`6556`, :pull:`6940`). By `Deepak Cherian
<https://github.com/dcherian>`_ and `Spencer Clark
<https://github.com/spencerkclark>`_.

Breaking changes
~~~~~~~~~~~~~~~~
Expand Down Expand Up @@ -53,6 +56,9 @@ Bug fixes
By `Oliver Lopez <https://github.com/lopezvoliver>`_.
- Fix bug where index variables would be changed inplace (:issue:`6931`, :pull:`6938`)
By `Michael Niklas <https://github.com/headtr1ck>`_.
- Allow taking the mean over non-time dimensions of datasets containing
dask-backed cftime arrays (:issue:`5897`, :pull:`6950`). By `Spencer Clark
<https://github.com/spencerkclark>`_.
- Harmonize returned multi-indexed indexes when applying ``concat`` along new dimension (:issue:`6881`, :pull:`6889`)
By `Fabian Hofmann <https://github.com/FabianHofmann>`_.
- Fix step plots with ``hue`` arg. (:pull:`6944`)
Expand Down
12 changes: 4 additions & 8 deletions xarray/core/duck_array_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,7 +426,6 @@ def datetime_to_numeric(array, offset=None, datetime_unit=None, dtype=float):
though some calendars would allow for them (e.g. no_leap). This is because there
is no `cftime.timedelta` object.
"""
# TODO: make this function dask-compatible?
# Set offset to minimum if not given
if offset is None:
if array.dtype.kind in "Mm":
Expand Down Expand Up @@ -531,7 +530,10 @@ def pd_timedelta_to_float(value, datetime_unit):


def _timedelta_to_seconds(array):
return np.reshape([a.total_seconds() for a in array.ravel()], array.shape) * 1e6
if isinstance(array, datetime.timedelta):
return array.total_seconds() * 1e6
else:
return np.reshape([a.total_seconds() for a in array.ravel()], array.shape) * 1e6


def py_timedelta_to_float(array, datetime_unit):
Expand Down Expand Up @@ -565,12 +567,6 @@ def mean(array, axis=None, skipna=None, **kwargs):
+ offset
)
elif _contains_cftime_datetimes(array):
if is_duck_dask_array(array):
raise NotImplementedError(
"Computing the mean of an array containing "
"cftime.datetime objects is not yet implemented on "
"dask arrays."
)
offset = min(array)
timedeltas = datetime_to_numeric(array, offset, datetime_unit="us")
mean_timedeltas = _mean(timedeltas, axis=axis, skipna=skipna, **kwargs)
Expand Down
63 changes: 49 additions & 14 deletions xarray/tests/test_duck_array_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,18 +323,51 @@ def test_datetime_mean(dask: bool) -> None:


@requires_cftime
def test_cftime_datetime_mean():
@pytest.mark.parametrize("dask", [False, True])
def test_cftime_datetime_mean(dask):
if dask and not has_dask:
pytest.skip("requires dask")

times = cftime_range("2000", periods=4)
da = DataArray(times, dims=["time"])
da_2d = DataArray(times.values.reshape(2, 2))

assert da.isel(time=0).mean() == da.isel(time=0)
if dask:
da = da.chunk({"time": 2})
da_2d = da_2d.chunk({"dim_0": 2})

expected = da.isel(time=0)
# one compute needed to check the array contains cftime datetimes
with raise_if_dask_computes(max_computes=1):
result = da.isel(time=0).mean()
assert_dask_array(result, dask)
assert_equal(result, expected)

expected = DataArray(times.date_type(2000, 1, 2, 12))
result = da.mean()
with raise_if_dask_computes(max_computes=1):
result = da.mean()
assert_dask_array(result, dask)
assert_equal(result, expected)

da_2d = DataArray(times.values.reshape(2, 2))
result = da_2d.mean()
with raise_if_dask_computes(max_computes=1):
result = da_2d.mean()
assert_dask_array(result, dask)
assert_equal(result, expected)


@requires_cftime
@requires_dask
def test_mean_over_non_time_dim_of_dataset_with_dask_backed_cftime_data():
# Regression test for part two of GH issue 5897: averaging over a non-time
# dimension still fails if the time variable is dask-backed.
ds = Dataset(
{
"var1": (("time",), cftime_range("2021-10-31", periods=10, freq="D")),
"var2": (("x",), list(range(10))),
}
)
expected = ds.mean("x")
result = ds.chunk({}).mean("x")
assert_equal(result, expected)


Expand Down Expand Up @@ -372,15 +405,6 @@ def test_cftime_datetime_mean_long_time_period():
assert_equal(result, expected)


@requires_cftime
@requires_dask
def test_cftime_datetime_mean_dask_error():
times = cftime_range("2000", periods=4)
da = DataArray(times, dims=["time"]).chunk()
with pytest.raises(NotImplementedError):
da.mean()


def test_empty_axis_dtype():
ds = Dataset()
ds["pos"] = [1, 2, 3]
Expand Down Expand Up @@ -742,6 +766,17 @@ def test_datetime_to_numeric_cftime(dask):
expected = 24 * np.arange(0, 35, 7).astype(dtype)
np.testing.assert_array_equal(result, expected)

with raise_if_dask_computes():
if dask:
time = dask.array.asarray(times[1])
else:
time = np.asarray(times[1])
result = duck_array_ops.datetime_to_numeric(
time, offset=times[0], datetime_unit="h", dtype=int
)
expected = np.array(24 * 7).astype(int)
np.testing.assert_array_equal(result, expected)


@requires_cftime
def test_datetime_to_numeric_potential_overflow():
Expand Down

0 comments on commit 2553762

Please sign in to comment.