diff --git a/ci/requirements/min-all-deps.yml b/ci/requirements/min-all-deps.yml index 34879af730b..8ba7e901973 100644 --- a/ci/requirements/min-all-deps.yml +++ b/ci/requirements/min-all-deps.yml @@ -15,8 +15,8 @@ dependencies: - cfgrib=0.9 - cftime=1.4 - coveralls - - dask-core=2021.04 - - distributed=2021.04 + - dask-core=2021.08.0 + - distributed=2021.08.0 - flox=0.5 - h5netcdf=0.11 - h5py=3.1 diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 9ce51e48983..d1910a70523 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -21,7 +21,10 @@ v2022.07.0 (unreleased) New Features ~~~~~~~~~~~~ - +- Enable taking the mean of dask-backed :py:class:`cftime.datetime` arrays + (:pull:`6556`, :pull:`6940`). By `Deepak Cherian + `_ and `Spencer Clark + `_. Breaking changes ~~~~~~~~~~~~~~~~ @@ -53,6 +56,9 @@ Bug fixes By `Oliver Lopez `_. - Fix bug where index variables would be changed inplace (:issue:`6931`, :pull:`6938`) By `Michael Niklas `_. +- Allow taking the mean over non-time dimensions of datasets containing + dask-backed cftime arrays (:issue:`5897`, :pull:`6950`). By `Spencer Clark + `_. - Harmonize returned multi-indexed indexes when applying ``concat`` along new dimension (:issue:`6881`, :pull:`6889`) By `Fabian Hofmann `_. - Fix step plots with ``hue`` arg. (:pull:`6944`) diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index 2cd2fb3af04..2bf05abb96a 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -426,7 +426,6 @@ def datetime_to_numeric(array, offset=None, datetime_unit=None, dtype=float): though some calendars would allow for them (e.g. no_leap). This is because there is no `cftime.timedelta` object. """ - # TODO: make this function dask-compatible? # Set offset to minimum if not given if offset is None: if array.dtype.kind in "Mm": @@ -531,7 +530,10 @@ def pd_timedelta_to_float(value, datetime_unit): def _timedelta_to_seconds(array): - return np.reshape([a.total_seconds() for a in array.ravel()], array.shape) * 1e6 + if isinstance(array, datetime.timedelta): + return array.total_seconds() * 1e6 + else: + return np.reshape([a.total_seconds() for a in array.ravel()], array.shape) * 1e6 def py_timedelta_to_float(array, datetime_unit): @@ -565,12 +567,6 @@ def mean(array, axis=None, skipna=None, **kwargs): + offset ) elif _contains_cftime_datetimes(array): - if is_duck_dask_array(array): - raise NotImplementedError( - "Computing the mean of an array containing " - "cftime.datetime objects is not yet implemented on " - "dask arrays." - ) offset = min(array) timedeltas = datetime_to_numeric(array, offset, datetime_unit="us") mean_timedeltas = _mean(timedeltas, axis=axis, skipna=skipna, **kwargs) diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index 37fd13c8d3e..7faf706a3b8 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -323,18 +323,51 @@ def test_datetime_mean(dask: bool) -> None: @requires_cftime -def test_cftime_datetime_mean(): +@pytest.mark.parametrize("dask", [False, True]) +def test_cftime_datetime_mean(dask): + if dask and not has_dask: + pytest.skip("requires dask") + times = cftime_range("2000", periods=4) da = DataArray(times, dims=["time"]) + da_2d = DataArray(times.values.reshape(2, 2)) - assert da.isel(time=0).mean() == da.isel(time=0) + if dask: + da = da.chunk({"time": 2}) + da_2d = da_2d.chunk({"dim_0": 2}) + + expected = da.isel(time=0) + # one compute needed to check the array contains cftime datetimes + with raise_if_dask_computes(max_computes=1): + result = da.isel(time=0).mean() + assert_dask_array(result, dask) + assert_equal(result, expected) expected = DataArray(times.date_type(2000, 1, 2, 12)) - result = da.mean() + with raise_if_dask_computes(max_computes=1): + result = da.mean() + assert_dask_array(result, dask) assert_equal(result, expected) - da_2d = DataArray(times.values.reshape(2, 2)) - result = da_2d.mean() + with raise_if_dask_computes(max_computes=1): + result = da_2d.mean() + assert_dask_array(result, dask) + assert_equal(result, expected) + + +@requires_cftime +@requires_dask +def test_mean_over_non_time_dim_of_dataset_with_dask_backed_cftime_data(): + # Regression test for part two of GH issue 5897: averaging over a non-time + # dimension still fails if the time variable is dask-backed. + ds = Dataset( + { + "var1": (("time",), cftime_range("2021-10-31", periods=10, freq="D")), + "var2": (("x",), list(range(10))), + } + ) + expected = ds.mean("x") + result = ds.chunk({}).mean("x") assert_equal(result, expected) @@ -372,15 +405,6 @@ def test_cftime_datetime_mean_long_time_period(): assert_equal(result, expected) -@requires_cftime -@requires_dask -def test_cftime_datetime_mean_dask_error(): - times = cftime_range("2000", periods=4) - da = DataArray(times, dims=["time"]).chunk() - with pytest.raises(NotImplementedError): - da.mean() - - def test_empty_axis_dtype(): ds = Dataset() ds["pos"] = [1, 2, 3] @@ -742,6 +766,17 @@ def test_datetime_to_numeric_cftime(dask): expected = 24 * np.arange(0, 35, 7).astype(dtype) np.testing.assert_array_equal(result, expected) + with raise_if_dask_computes(): + if dask: + time = dask.array.asarray(times[1]) + else: + time = np.asarray(times[1]) + result = duck_array_ops.datetime_to_numeric( + time, offset=times[0], datetime_unit="h", dtype=int + ) + expected = np.array(24 * 7).astype(int) + np.testing.assert_array_equal(result, expected) + @requires_cftime def test_datetime_to_numeric_potential_overflow():