diff --git a/doc/recipe/preprocessor.rst b/doc/recipe/preprocessor.rst index 694c75f57a..74229e08eb 100644 --- a/doc/recipe/preprocessor.rst +++ b/doc/recipe/preprocessor.rst @@ -27,7 +27,7 @@ roughly following the default order in which preprocessor functions are applied: * :ref:`Detrend` * :ref:`Rolling window statistics` * :ref:`Unit conversion` -* :ref:`Bias` +* :ref:`comparison_with_ref` * :ref:`Other` See :ref:`preprocessor_functions` for implementation details and the exact default order. @@ -91,13 +91,13 @@ supported too if proper keyword arguments are specified: .. _supported_stat_operator: ============================== ================================================= ===================================== -`operator` Corresponding :class:`~iris.analysis.Aggregator` Weighted? [1]_ +`operator` Corresponding :class:`~iris.analysis.Aggregator` Weighted? [#f1]_ ============================== ================================================= ===================================== ``gmean`` :const:`iris.analysis.GMEAN` no ``hmean`` :const:`iris.analysis.HMEAN` no ``max`` :const:`iris.analysis.MAX` no ``mean`` :const:`iris.analysis.MEAN` yes -``median`` :const:`iris.analysis.MEDIAN` [2]_ no +``median`` :const:`iris.analysis.MEDIAN` [#f2]_ no ``min`` :const:`iris.analysis.MIN` no ``peak`` :const:`iris.analysis.PEAK` no ``percentile`` :const:`iris.analysis.PERCENTILE` no @@ -108,7 +108,7 @@ supported too if proper keyword arguments are specified: ``wpercentile`` :const:`iris.analysis.WPERCENTILE` yes ============================== ================================================= ===================================== -.. [1] The following preprocessor support weighted statistics by default: +.. [#f1] The following preprocessor support weighted statistics by default: :func:`~esmvalcore.preprocessor.area_statistics`: weighted by grid cell areas (see also :ref:`preprocessors_using_supplementary_variables`); :func:`~esmvalcore.preprocessor.climate_statistics`: weighted by lengths of @@ -117,7 +117,7 @@ supported too if proper keyword arguments are specified: :ref:`preprocessors_using_supplementary_variables`); :func:`~esmvalcore.preprocessor.axis_statistics`: weighted by corresponding coordinate bounds. -.. [2] :const:`iris.analysis.MEDIAN` is not lazy, but much faster than +.. [#f2] :const:`iris.analysis.MEDIAN` is not lazy, but much faster than :const:`iris.analysis.PERCENTILE`. For a lazy median, use ``percentile`` with the keyword argument ``percent: 50``. @@ -301,23 +301,29 @@ or to perform their computations. In ESMValCore we call both types of variables "supplementary variables". 
-============================================================== ============================== =====================================
-Preprocessor Variable short name Variable standard name
-============================================================== ============================== =====================================
-:ref:`area_statistics` ``areacella``, ``areacello`` cell_area
-:ref:`mask_landsea` ``sftlf``, ``sftof`` land_area_fraction, sea_area_fraction
-:ref:`mask_landseaice` ``sftgif`` land_ice_area_fraction
-:ref:`volume_statistics` ``volcello``, ``areacello`` ocean_volume, cell_area
-:ref:`weighting_landsea_fraction` ``sftlf``, ``sftof`` land_area_fraction, sea_area_fraction
-============================================================== ============================== =====================================
+===================================================================== ============================== =====================================
+Preprocessor Variable short name Variable standard name
+===================================================================== ============================== =====================================
+:ref:`area_statistics` [#f4]_ ``areacella``, ``areacello`` cell_area
+:ref:`mask_landsea` [#f4]_ ``sftlf``, ``sftof`` land_area_fraction, sea_area_fraction
+:ref:`mask_landseaice` [#f3]_ ``sftgif`` land_ice_area_fraction
+:ref:`volume_statistics` [#f4]_ ``volcello``, ``areacello`` ocean_volume, cell_area
+:ref:`weighting_landsea_fraction` [#f3]_ ``sftlf``, ``sftof`` land_area_fraction, sea_area_fraction
+:ref:`distance_metric` [#f5]_ ``areacella``, ``areacello`` cell_area
+:ref:`histogram` [#f5]_ ``areacella``, ``areacello`` cell_area
+===================================================================== ============================== =====================================
+
+.. [#f3] This preprocessor requires at least one of the mentioned supplementary
+   variables. If none is defined in the recipe, they are automatically looked
+   for. If none is found, an error will be raised.
+.. [#f4] This preprocessor prefers at least one of the mentioned supplementary
+   variables. If none is defined in the recipe, they are automatically looked
+   for. If none is found, a warning will be raised (but no error).
+.. [#f5] This preprocessor optionally takes one of the mentioned supplementary
+   variables. If none is defined in the recipe, none is added.

 Only one of the listed variables is required. Supplementary variables can be defined in the recipe as described in :ref:`supplementary_variables`.
-In some cases, preprocessor functions may work without supplementary variables,
-this is documented case by case in the preprocessor function definition.
-If a preprocessor function requiring supplementary variables is used
-without specifying these in the recipe, these will be automatically
-added.
 If the automatic selection does not give the desired result, specify the supplementary variables in the recipe as described in :ref:`supplementary_variables`.

@@ -2510,15 +2516,17 @@ the time units in the variable.

 See also :func:`esmvalcore.preprocessor.accumulate_coordinate.`

-.. _bias:
+.. _comparison_with_ref:

-Bias
-====
+Comparison with reference dataset
+=================================

-The bias module contains the following preprocessor functions:
+This module contains the following preprocessor functions:

 * ``bias``: Calculate absolute or relative biases with respect to a reference
-  dataset
+  dataset.
+* ``distance_metric``: Calculate distance metrics (e.g., RMSE, Pearson's r,
+  Earth mover's distance) with respect to a reference dataset.

 ``bias``
 --------

@@ -2583,6 +2591,178 @@ Example:

 See also :func:`esmvalcore.preprocessor.bias`.

+.. _distance_metric:
+
+``distance_metric``
+-------------------
+
+This function calculates a distance metric with respect to a given reference
+dataset.
+For this, exactly one input dataset needs to be declared as
+``reference_for_metric: true`` in the recipe, e.g.,
+
+.. code-block:: yaml
+
+  datasets:
+    - {dataset: CanESM5, project: CMIP6, ensemble: r1i1p1f1, grid: gn}
+    - {dataset: CESM2, project: CMIP6, ensemble: r1i1p1f1, grid: gn}
+    - {dataset: MIROC6, project: CMIP6, ensemble: r1i1p1f1, grid: gn}
+    - {dataset: ERA-Interim, project: OBS6, tier: 3, type: reanaly, version: 1,
+       reference_for_metric: true}
+
+In the example above, ERA-Interim is used as the reference dataset for the
+distance metric calculation.
+All datasets need to have the same shape and coordinates.
+To ensure this, the preprocessors :func:`esmvalcore.preprocessor.regrid` and/or
+:func:`esmvalcore.preprocessor.regrid_time` might be helpful.
+
+The ``distance_metric`` preprocessor supports the following arguments in the
+recipe:
+
+.. _list_of_distance_metrics:
+
+* ``metric`` (:obj:`str`): Distance metric that is calculated.
+  Must be one of
+
+  * ``'rmse'``: `Unweighted root mean square error`_.
+
+    .. math::
+
+      RMSE = \sqrt{\frac{1}{N} \sum_{i=1}^N \left( x_i - r_i \right)^2}
+
+  * ``'weighted_rmse'``: `Weighted root mean square error`_.
+
+    .. math::
+
+      WRMSE = \sqrt{\sum_{i=1}^N w_i \left( x_i - r_i \right)^2}
+
+  * ``'pearsonr'``: `Unweighted Pearson correlation coefficient`_.
+
+    .. math::
+
+      r = \frac{
+        \sum_{i=1}^N
+        \left( x_i - \bar{x} \right) \left( r_i - \bar{r} \right)
+      }{
+        \sqrt{\sum_{i=1}^N \left( x_i - \bar{x} \right)^2}
+        \sqrt{\sum_{i=1}^N \left( r_i - \bar{r} \right)^2}
+      }
+
+  * ``'weighted_pearsonr'``: `Weighted Pearson correlation coefficient`_.
+
+    .. math::
+
+      r = \frac{
+        \sum_{i=1}^N
+        w_i \left( x_i - \bar{x} \right) \left( r_i - \bar{r} \right)
+      }{
+        \sqrt{\sum_{i=1}^N w_i \left( x_i - \bar{x} \right)^2}
+        \sqrt{\sum_{i=1}^N w_i \left( r_i - \bar{r} \right)^2}
+      }
+
+
+  * ``'emd'``: `Unweighted Earth mover's distance`_ (EMD).
+    The EMD is also known as the first Wasserstein metric `W`\ :sub:`1`, a
+    metric that measures the distance between two probability distributions.
+    For this, discrete probability distributions of the input data are created
+    through binning, which are then used as input for the Wasserstein metric.
+    The metric is called `Earth mover's distance` since, intuitively, it can be
+    seen as the minimum "cost" of turning one pile of earth into another one
+    (pile of earth = probability distribution).
+    This is also known as the `optimal transport` problem.
+    Formally, this can be described with a joint probability distribution (or
+    `optimal transport matrix`) γ (whose marginals are the input distributions)
+    that minimizes the "transportation cost":
+
+    .. math::

+      W_1 = \min_{\gamma \in \mathbb{R}^{n \times n}_{+}} \sum_{i,j}^{n}
+      \gamma_{ij} \lvert X_i - R_j \rvert \\
+      \textrm{with} ~~ \gamma 1 = p_X(X);~ \gamma^T 1 = p_R(R)
+
+  * ``'weighted_emd'``: `Weighted Earth mover's distance`_.
+    Similar to the unweighted EMD (see above), but here weights are considered
+    when calculating the probability distributions (i.e., instead of 1, each
+    element provides a weight in the bin count; see also the `weights`
+    argument of :func:`numpy.histogram`).
+ + Here, `x`\ :sub:`i` and `r`\ :sub:`i` are samples of a variable of interest + and a corresponding reference, respectively (a bar over a variable denotes + its arithmetic/weighted mean [the latter for weighted metrics]). + Capital letters (`X`\ :sub:`i` and `R`\ :sub:`i`) refer to bin centers of a + discrete probability distribution with values `p`\ :sub:`X`\ (`X`\ :sub:`i`) + or `p`\ :sub:`R`\ (`R`\ :sub:`i`) and a number of bins `n` (see the argument + ``n_bins`` below) that has been derived for the variables `x` and `r` through + binning. + `w`\ :sub:`i` are weights that sum to one (see note below) and `N` is the + total number of samples. + + .. note:: + Metrics starting with `weighted_` will calculate weighted distance metrics + if possible. + Currently, the following `coords` (or any combinations that include them) + will trigger weighting: `time` (will use lengths of time intervals as + weights) and `latitude` (will use cell area weights). + Time weights are always calculated from the input data. + Area weights can be given as supplementary variables to the recipe + (`areacella` or `areacello`, see :ref:`supplementary_variables`) or + calculated from the input data (this only works for regular grids). + By default, **NO** supplementary variables will be used; they need to be + explicitly requested in the recipe. +* ``coords`` (:obj:`list` of :obj:`str`, default: ``None``): Coordinates over + which the distance metric is calculated. + If ``None``, calculate the metric over all coordinates, which results in a + scalar cube. +* ``keep_reference_dataset`` (:obj:`bool`, default: ``True``): If ``True``, + also calculate the distance of the reference dataset with itself. + If ``False``, drop the reference dataset. +* ``exclude`` (:obj:`list` of :obj:`str`): Exclude specific datasets from + this preprocessor. + Note that this option is only available in the recipe, not when using + :func:`esmvalcore.preprocessor.distance_metric` directly (e.g., in another + python script). + If the reference dataset has been excluded, an error is raised. +* Other parameters are directly used for the metric calculation. + The following keyword arguments are supported: + + * `rmse` and `weighted_rmse`: none. + * `pearsonr` and `weighted_pearsonr`: ``mdtol``, ``common_mask`` (all keyword + arguments are passed to :func:`iris.analysis.stats.pearsonr`, see that link + for more details on these arguments). + Note: in contrast to :func:`~iris.analysis.stats.pearsonr`, + ``common_mask=True`` by default. + * `emd` and `weighted_emd`: ``n_bins`` = number of bins used to create + discrete probability distribution of data before calculating the EMD + (:obj:`int`, default: 100). + +Example: + +.. code-block:: yaml + + preprocessors: + preproc_pearsonr: + distance_metric: + metric: weighted_pearsonr + coords: [latitude, longitude] + keep_reference_dataset: true + exclude: [CanESM2] + common_mask: true + +See also :func:`esmvalcore.preprocessor.distance_metric`. + +.. _Unweighted root mean square error: https://en.wikipedia.org/wiki/ + Root-mean-square_deviation +.. _Weighted root mean square error: https://en.wikipedia.org/wiki/ + Root-mean-square_deviation +.. _Unweighted Pearson correlation coefficient: https://en.wikipedia.org/ + wiki/Pearson_correlation_coefficient +.. _Weighted Pearson correlation coefficient: https://en.wikipedia.org/ + wiki/Pearson_correlation_coefficient +.. _Unweighted Earth mover's distance: https://pythonot.github.io/ + quickstart.html#computing-wasserstein-distance +.. 
_Weighted Earth mover's distance: https://pythonot.github.io/ + quickstart.html#computing-wasserstein-distance + .. _Memory use: @@ -2649,3 +2829,73 @@ The example below shows how to set all values below zero to zero. clip: minimum: 0 maximum: null + +.. _histogram: + +``histogram`` +------------------- + +This function calculates histograms. + +The ``histogram`` preprocessor supports the following arguments in the +recipe: + +* ``coords`` (:obj:`list` of :obj:`str`, default: ``None``): Coordinates over + which the histogram is calculated. + If ``None``, calculate the histogram over all coordinates. + The shape of the output cube will be `(x1, x2, ..., n_bins)`, where `xi` are + the dimensions of the input cube not appearing in `coords` and `n_bins` is + the number of bins. +* ``bins`` (:obj:`int` or sequence of :obj:`float`, default: 10): If `bins` is + an :obj:`int`, it defines the number of equal-width bins in the given + `bin_range`. + If `bins` is a sequence, it defines a monotonically increasing array of bin + edges, including the rightmost edge, allowing for non-uniform bin widths. +* ``bin_range`` (:obj:`tuple` of :obj:`float` or ``None``, default: ``None``): + The lower and upper range of the bins. + If ``None``, `bin_range` is simply (``cube.core_data().min(), + cube.core_data().max()``). + Values outside the range are ignored. + The first element of the range must be less than or equal to the second. + `bin_range` affects the automatic bin computation as well if `bins` is an + :obj:`int` (see description for `bins` above). +* ``weights`` (array-like, :obj:`bool`, or ``None``, default: ``None``): + Weights for the histogram calculation. + Each value in the input data only contributes its associated weight towards + the bin count (instead of 1). + Weights are normalized before entering the calculation if `normalization` is + ``'integral'`` or ``'sum'``. + Can be an array of the same shape as the input data, ``False`` or ``None`` + (no weighting), or ``True``. + In the latter case, weighting will depend on `coords`, and the following + coordinates will trigger weighting: `time` (will use lengths of time + intervals as weights) and `latitude` (will use cell area weights). + Time weights are always calculated from the input data. + Area weights can be given as supplementary variables in the recipe + (`areacella` or `areacello`, see :ref:`supplementary_variables`) or + calculated from the input data (this only works for regular grids). + By default, **NO** supplementary variables will be used; they need to be + explicitly requested in the recipe. +* ``normalization`` (``None``, ``'sum'``, or ``'integral'``, default: + ``None``): If ``None``, the result will contain the number of samples in each + bin. + If ``'integral'``, the result is the value of the probability `density` + function at the bin, normalized such that the integral over the range is 1. + If ``'sum'``, the result is the value of the probability `mass` function at + the bin, normalized such that the sum over the whole range is 1. + Normalization will be applied across `coords`, not the entire cube. + +Example: + +.. code-block:: yaml + + preprocessors: + preproc_histogram: + histogram: + coords: [latitude, longitude] + bins: 12 + bin_range: [100.0, 150.0] + weights: true + normalization: sum + +See also :func:`esmvalcore.preprocessor.histogram`. 
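+
+To make the relationship between these preprocessors concrete, the weighted
+metrics from :ref:`list_of_distance_metrics` and the binned probability
+distributions used by the EMD (cf. ``histogram`` with ``normalization: sum``)
+can be written out with plain NumPy/SciPy.
+The following sketch is purely illustrative and is **not** the ESMValCore
+implementation; the sample arrays ``x``, ``r``, and ``w`` are made up.
+
+.. code-block:: python
+
+   import numpy as np
+   from scipy.stats import wasserstein_distance
+
+   x = np.array([1.0, 2.0, 4.0, 3.0])  # dataset samples x_i (made up)
+   r = np.array([1.5, 2.5, 3.5, 2.5])  # reference samples r_i (made up)
+   w = np.array([1.0, 1.0, 2.0, 2.0])
+   w = w / w.sum()  # weights w_i must sum to one
+
+   # Weighted RMSE: sqrt(sum_i w_i (x_i - r_i)^2)
+   wrmse = np.sqrt(np.sum(w * (x - r) ** 2))
+
+   # Weighted Pearson correlation coefficient (weighted means and covariances)
+   x_mean = np.sum(w * x)
+   r_mean = np.sum(w * r)
+   wpearsonr = np.sum(w * (x - x_mean) * (r - r_mean)) / (
+       np.sqrt(np.sum(w * (x - x_mean) ** 2))
+       * np.sqrt(np.sum(w * (r - r_mean) ** 2))
+   )
+
+   # EMD: bin both samples into discrete probability distributions that sum
+   # to 1 (this is what the histogram step with normalization 'sum' does),
+   # then compare the two distributions with the first Wasserstein metric.
+   bin_edges = np.linspace(0.0, 5.0, 11)
+   bin_centers = 0.5 * (bin_edges[:-1] + bin_edges[1:])
+   p_x, _ = np.histogram(x, bins=bin_edges, weights=w)
+   p_r, _ = np.histogram(r, bins=bin_edges, weights=w)
+   emd = wasserstein_distance(bin_centers, bin_centers, p_x, p_r)
+
+   print(wrmse, wpearsonr, emd)
+
+In a recipe, the corresponding quantities are obtained by requesting the
+``weighted_rmse``, ``weighted_pearsonr``, or ``emd`` metrics of the
+``distance_metric`` preprocessor described above.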
diff --git a/esmvalcore/_recipe/check.py b/esmvalcore/_recipe/check.py index abbfc7f307..940a578ae2 100644 --- a/esmvalcore/_recipe/check.py +++ b/esmvalcore/_recipe/check.py @@ -5,6 +5,7 @@ import logging import os import subprocess +from functools import partial from pprint import pformat from shutil import which from typing import Any, Iterable @@ -395,47 +396,86 @@ def differing_timeranges(timeranges, required_vars): "Set `timerange` to a common value.") -def bias_type(settings: dict) -> None: - """Check that bias_type for bias preprocessor is valid.""" - if 'bias' not in settings: +def _check_literal( + settings: dict, + *, + step: str, + option: str, + allowed_values: tuple[str], +) -> None: + """Check that an option for a preprocessor has a valid value.""" + if step not in settings: return - valid_options = ('absolute', 'relative') - user_bias_type = settings['bias'].get('bias_type', 'absolute') - if user_bias_type not in valid_options: + user_value = settings[step].get(option, allowed_values[0]) + if user_value not in allowed_values: raise RecipeError( - f"Expected one of {valid_options} for `bias_type`, got " - f"'{user_bias_type}'" + f"Expected one of {allowed_values} for `{option}`, got " + f"'{user_value}'" ) -def reference_for_bias_preproc(products): - """Check that exactly one reference dataset for bias preproc is given.""" - step = 'bias' +bias_type = partial( + _check_literal, + step='bias', + option='bias_type', + allowed_values=('absolute', 'relative'), +) + + +metric_type = partial( + _check_literal, + step='distance_metric', + option='metric', + allowed_values=( + 'rmse', + 'weighted_rmse', + 'pearsonr', + 'weighted_pearsonr', + 'emd', + 'weighted_emd', + ), +) + + +def _check_ref_attributes(products: set, *, step: str, attr_name: str) -> None: + """Check that exactly one reference dataset is given.""" products = {p for p in products if step in p.settings} if not products: return - # Check that exactly one dataset contains the facet ``reference_for_bias: - # true`` + # Check that exactly one dataset contains the specified facet reference_products = [] for product in products: - if product.attributes.get('reference_for_bias', False): + if product.attributes.get(attr_name, False): reference_products.append(product) if len(reference_products) != 1: products_str = [p.filename for p in products] if not reference_products: ref_products_str = ". 
" else: - ref_products_str = [p.filename for p in reference_products] - ref_products_str = f":\n{pformat(ref_products_str)}.\n" + ref_products_str = ( + f":\n{pformat([p.filename for p in reference_products])}.\n" + ) raise RecipeError( - f"Expected exactly 1 dataset with 'reference_for_bias: true' in " + f"Expected exactly 1 dataset with '{attr_name}: true' in " f"products\n{pformat(products_str)},\nfound " f"{len(reference_products):d}{ref_products_str}Please also " f"ensure that the reference dataset is not excluded with the " f"'exclude' option") +reference_for_bias_preproc = partial( + _check_ref_attributes, step='bias', attr_name='reference_for_bias' +) + + +reference_for_distance_metric_preproc = partial( + _check_ref_attributes, + step='distance_metric', + attr_name='reference_for_metric', +) + + def statistics_preprocessors(settings: dict) -> None: """Check options of statistics preprocessors.""" mm_stats = ( diff --git a/esmvalcore/_recipe/recipe.py b/esmvalcore/_recipe/recipe.py index e0423fd49a..676e56cb52 100644 --- a/esmvalcore/_recipe/recipe.py +++ b/esmvalcore/_recipe/recipe.py @@ -37,13 +37,13 @@ ) from esmvalcore.preprocessor._area import _update_shapefile_path from esmvalcore.preprocessor._multimodel import _get_stat_identifier -from esmvalcore.preprocessor._other import _group_products from esmvalcore.preprocessor._regrid import ( _spec_to_latlonvals, get_cmor_levels, get_reference_levels, parse_cell_spec, ) +from esmvalcore.preprocessor._shared import _group_products from . import check from .from_datasets import datasets_to_recipe @@ -555,6 +555,7 @@ def _get_preprocessor_products( f'{separator.join(sorted(missing_vars))}') check.reference_for_bias_preproc(products) + check.reference_for_distance_metric_preproc(products) _configure_multi_product_preprocessor( products=products, @@ -656,6 +657,7 @@ def _update_preproc_functions(settings, dataset, datasets, missing_vars): check.statistics_preprocessors(settings) check.regridding_schemes(settings) check.bias_type(settings) + check.metric_type(settings) def _get_preprocessor_task(datasets, profiles, task_name): diff --git a/esmvalcore/iris_helpers.py b/esmvalcore/iris_helpers.py index 3e59932697..1191587546 100644 --- a/esmvalcore/iris_helpers.py +++ b/esmvalcore/iris_helpers.py @@ -236,7 +236,7 @@ def rechunk_cube( Input cube. complete_coords: (Names of) coordinates along which the output cubes should not be - chunked. The given coordinates must span exactly 1 dimension. + chunked. remaining_dims: Chunksize of the remaining dimensions. 
@@ -248,17 +248,11 @@ def rechunk_cube( """ cube = cube.copy() # do not modify input cube - # Make sure that complete_coords span exactly 1 dimension complete_dims = [] for coord in complete_coords: coord = cube.coord(coord) - dims = cube.coord_dims(coord) - if len(dims) != 1: - raise CoordinateMultiDimError( - f"Complete coordinates must be 1D coordinates, got " - f"{len(dims):d}D coordinate '{coord.name()}'" - ) - complete_dims.append(dims[0]) + complete_dims.extend(cube.coord_dims(coord)) + complete_dims = list(set(complete_dims)) # Rechunk data if cube.has_lazy_data(): diff --git a/esmvalcore/preprocessor/__init__.py b/esmvalcore/preprocessor/__init__.py index db55ce4f17..3800fb1413 100644 --- a/esmvalcore/preprocessor/__init__.py +++ b/esmvalcore/preprocessor/__init__.py @@ -22,7 +22,7 @@ meridional_statistics, zonal_statistics, ) -from ._bias import bias +from ._compare_with_refs import bias, distance_metric from ._cycles import amplitude from ._derive import derive from ._detrend import detrend @@ -46,7 +46,7 @@ mask_outside_range, ) from ._multimodel import ensemble_statistics, multi_model_statistics -from ._other import clip +from ._other import clip, histogram from ._regrid import ( extract_coordinate_points, extract_levels, @@ -175,12 +175,15 @@ 'linear_trend_stderr', # Convert units 'convert_units', + # Histograms + 'histogram', # Ensemble statistics 'ensemble_statistics', # Multi model statistics 'multi_model_statistics', - # Bias calculation + # Comparison with reference datasets 'bias', + 'distance_metric', # Remove supplementary variables from cube 'remove_supplementary_variables', # Save to file @@ -215,6 +218,7 @@ MULTI_MODEL_FUNCTIONS = { 'bias', + 'distance_metric', 'ensemble_statistics', 'multi_model_statistics', 'mask_multimodel', diff --git a/esmvalcore/preprocessor/_bias.py b/esmvalcore/preprocessor/_bias.py deleted file mode 100644 index a8e0f4f2c0..0000000000 --- a/esmvalcore/preprocessor/_bias.py +++ /dev/null @@ -1,188 +0,0 @@ -"""Preprocessor functions to calculate biases from data.""" -from __future__ import annotations - -import logging -from collections.abc import Iterable -from typing import TYPE_CHECKING, Literal, Optional - -import dask.array as da -from iris.cube import Cube, CubeList - -from ._io import concatenate - -if TYPE_CHECKING: - from esmvalcore.preprocessor import PreprocessorFile - -logger = logging.getLogger(__name__) - - -BiasType = Literal['absolute', 'relative'] - - -def bias( - products: set[PreprocessorFile] | Iterable[Cube], - ref_cube: Optional[Cube] = None, - bias_type: BiasType = 'absolute', - denominator_mask_threshold: float = 1e-3, - keep_reference_dataset: bool = False, -) -> set[PreprocessorFile] | CubeList: - """Calculate biases relative to a reference dataset. - - The reference dataset needs to be broadcastable to all input `products`. - This supports `iris' rich broadcasting abilities - `__. To ensure this, the preprocessors - :func:`esmvalcore.preprocessor.regrid` and/or - :func:`esmvalcore.preprocessor.regrid_time` might be helpful. - - Notes - ----- - The reference dataset can be specified with the `ref_cube` argument. If - `ref_cube` is ``None``, exactly one input dataset in the `products` set - needs to have the facet ``reference_for_bias: true`` defined in the recipe. - Please do **not** specify the option `ref_cube` when using this - preprocessor function in a recipe. - - Parameters - ---------- - products: - Input datasets/cubes for which the bias is calculated relative to a - reference dataset/cube. 
- ref_cube: - Cube which is used as reference for the bias calculation. If ``None``, - `products` needs to be a :obj:`set` of - `~esmvalcore.preprocessor.PreprocessorFile` objects and exactly one - dataset in `products` needs the facet ``reference_for_bias: true``. - bias_type: - Bias type that is calculated. Must be one of ``'absolute'`` (dataset - - ref) or ``'relative'`` ((dataset - ref) / ref). - denominator_mask_threshold: - Threshold to mask values close to zero in the denominator (i.e., the - reference dataset) during the calculation of relative biases. All - values in the reference dataset with absolute value less than the given - threshold are masked out. This setting is ignored when ``bias_type`` is - set to ``'absolute'``. Please note that for some variables with very - small absolute values (e.g., carbon cycle fluxes, which are usually - :math:`< 10^{-6}` kg m :math:`^{-2}` s :math:`^{-1}`) it is absolutely - essential to change the default value in order to get reasonable - results. - keep_reference_dataset: - If ``True``, keep the reference dataset in the output. If ``False``, - drop the reference dataset. Ignored if `ref_cube` is given. - - Returns - ------- - set[PreprocessorFile] | CubeList - Output datasets/cubes. Will be a :obj:`set` of - :class:`~esmvalcore.preprocessor.PreprocessorFile` objects if - `products` is also one, a :class:`~iris.cube.CubeList` otherwise. - - Raises - ------ - ValueError - Not exactly one input datasets contains the facet - ``reference_for_bias: true`` if ``ref_cube=None``; ``ref_cube=None`` - and the input products are given as iterable of - :class:`~iris.cube.Cube` objects; ``bias_type`` is not one of - ``'absolute'`` or ``'relative'``. - - """ - ref_product = None - all_cubes_given = all(isinstance(p, Cube) for p in products) - - # Get reference cube if not explicitly given - if ref_cube is None: - if all_cubes_given: - raise ValueError( - "A list of Cubes is given to this preprocessor; please " - "specify a `ref_cube`" - ) - (ref_cube, ref_product) = _get_ref(products, 'reference_for_bias') - else: - ref_product = None - - # Mask reference cube appropriately for relative biases - if bias_type == 'relative': - ref_cube = ref_cube.copy() - ref_cube.data = da.ma.masked_inside( - ref_cube.core_data(), - -denominator_mask_threshold, - denominator_mask_threshold, - ) - - # If input is an Iterable of Cube objects, calculate bias for each element - if all_cubes_given: - cubes = [_calculate_bias(c, ref_cube, bias_type) for c in products] - return CubeList(cubes) - - # Otherwise, iterate over all input products, calculate bias and adapt - # metadata and provenance information accordingly - output_products = set() - for product in products: - if product == ref_product: - continue - cube = concatenate(product.cubes) - - # Calculate bias - cube = _calculate_bias(cube, ref_cube, bias_type) - - # Adapt metadata and provenance information - product.attributes['units'] = str(cube.units) - if ref_product is not None: - product.wasderivedfrom(ref_product) - - product.cubes = CubeList([cube]) - output_products.add(product) - - # Add reference dataset to output if desired - if keep_reference_dataset and ref_product is not None: - output_products.add(ref_product) - - return output_products - - -def _get_ref(products, ref_tag: str) -> tuple[Cube, PreprocessorFile]: - """Get reference cube and product.""" - ref_products = [] - for product in products: - if product.attributes.get(ref_tag, False): - ref_products.append(product) - if len(ref_products) != 1: - raise 
ValueError( - f"Expected exactly 1 dataset with '{ref_tag}: true', found " - f"{len(ref_products):d}" - ) - ref_product = ref_products[0] - - # Extract reference cube - # Note: For technical reasons, product objects contain the member - # ``cubes``, which is a list of cubes. However, this is expected to be a - # list with exactly one element due to the call of concatenate earlier in - # the preprocessing chain of ESMValTool. To make sure that this - # preprocessor can also be used outside the ESMValTool preprocessing chain, - # an additional concatenate call is added here. - ref_cube = concatenate(ref_product.cubes) - - return (ref_cube, ref_product) - - -def _calculate_bias(cube: Cube, ref_cube: Cube, bias_type: BiasType) -> Cube: - """Calculate bias for a single cube relative to a reference cube.""" - cube_metadata = cube.metadata - - if bias_type == 'absolute': - cube = cube - ref_cube - new_units = cube.units - elif bias_type == 'relative': - cube = (cube - ref_cube) / ref_cube - new_units = '1' - else: - raise ValueError( - f"Expected one of ['absolute', 'relative'] for bias_type, got " - f"'{bias_type}'" - ) - - cube.metadata = cube_metadata - cube.units = new_units - - return cube diff --git a/esmvalcore/preprocessor/_compare_with_refs.py b/esmvalcore/preprocessor/_compare_with_refs.py new file mode 100644 index 0000000000..1634fc1752 --- /dev/null +++ b/esmvalcore/preprocessor/_compare_with_refs.py @@ -0,0 +1,573 @@ +"""Preprocessor functions for comparisons of data with reference datasets.""" +from __future__ import annotations + +import logging +from collections.abc import Iterable +from functools import partial +from typing import TYPE_CHECKING, Literal, Optional + +import dask +import dask.array as da +import iris.analysis +import iris.analysis.stats +import numpy as np +from iris.common.metadata import CubeMetadata +from iris.coords import CellMethod, Coord +from iris.cube import Cube, CubeList +from scipy.stats import wasserstein_distance + +from esmvalcore.iris_helpers import rechunk_cube +from esmvalcore.preprocessor._io import concatenate +from esmvalcore.preprocessor._other import histogram +from esmvalcore.preprocessor._shared import ( + get_all_coord_dims, + get_all_coords, + get_array_module, + get_weights, + preserve_float_dtype, +) + +if TYPE_CHECKING: + from esmvalcore.preprocessor import PreprocessorFile + +logger = logging.getLogger(__name__) + + +BiasType = Literal['absolute', 'relative'] + + +def bias( + products: set[PreprocessorFile] | Iterable[Cube], + reference: Optional[Cube] = None, + bias_type: BiasType = 'absolute', + denominator_mask_threshold: float = 1e-3, + keep_reference_dataset: bool = False, +) -> set[PreprocessorFile] | CubeList: + """Calculate biases relative to a reference dataset. + + The reference dataset needs to be broadcastable to all input `products`. + This supports `iris' rich broadcasting abilities + `__. To ensure this, the preprocessors + :func:`esmvalcore.preprocessor.regrid` and/or + :func:`esmvalcore.preprocessor.regrid_time` might be helpful. + + Notes + ----- + The reference dataset can be specified with the `reference` argument. If + `reference` is ``None``, exactly one input dataset in the `products` set + needs to have the facet ``reference_for_bias: true`` defined in the recipe. + Please do **not** specify the option `reference` when using this + preprocessor function in a recipe. + + Parameters + ---------- + products: + Input datasets/cubes for which the bias is calculated relative to a + reference dataset/cube. 
+ reference: + Cube which is used as reference for the bias calculation. If ``None``, + `products` needs to be a :obj:`set` of + `~esmvalcore.preprocessor.PreprocessorFile` objects and exactly one + dataset in `products` needs the facet ``reference_for_bias: true``. Do + not specify this argument in a recipe. + bias_type: + Bias type that is calculated. Must be one of ``'absolute'`` (dataset - + ref) or ``'relative'`` ((dataset - ref) / ref). + denominator_mask_threshold: + Threshold to mask values close to zero in the denominator (i.e., the + reference dataset) during the calculation of relative biases. All + values in the reference dataset with absolute value less than the given + threshold are masked out. This setting is ignored when ``bias_type`` is + set to ``'absolute'``. Please note that for some variables with very + small absolute values (e.g., carbon cycle fluxes, which are usually + :math:`< 10^{-6}` kg m :math:`^{-2}` s :math:`^{-1}`) it is absolutely + essential to change the default value in order to get reasonable + results. + keep_reference_dataset: + If ``True``, keep the reference dataset in the output. If ``False``, + drop the reference dataset. Ignored if `reference` is given. + + Returns + ------- + set[PreprocessorFile] | CubeList + Output datasets/cubes. Will be a :obj:`set` of + :class:`~esmvalcore.preprocessor.PreprocessorFile` objects if + `products` is also one, a :class:`~iris.cube.CubeList` otherwise. + + Raises + ------ + ValueError + Not exactly one input datasets contains the facet ``reference_for_bias: + true`` if ``reference=None``; ``reference=None`` and the input products + are given as iterable of :class:`~iris.cube.Cube` objects; + ``bias_type`` is not one of ``'absolute'`` or ``'relative'``. + + """ + ref_product = None + all_cubes_given = all(isinstance(p, Cube) for p in products) + + # Get reference cube if not explicitly given + if reference is None: + if all_cubes_given: + raise ValueError( + "A list of Cubes is given to this preprocessor; please " + "specify a `reference`" + ) + (reference, ref_product) = _get_ref(products, 'reference_for_bias') + else: + ref_product = None + + # Mask reference cube appropriately for relative biases + if bias_type == 'relative': + reference = reference.copy() + npx = get_array_module(reference.core_data()) + reference.data = npx.ma.masked_inside( + reference.core_data(), + -denominator_mask_threshold, + denominator_mask_threshold, + ) + + # If input is an Iterable of Cube objects, calculate bias for each element + if all_cubes_given: + cubes = [_calculate_bias(c, reference, bias_type) for c in products] + return CubeList(cubes) + + # Otherwise, iterate over all input products, calculate bias and adapt + # metadata and provenance information accordingly + output_products = set() + for product in products: + if product == ref_product: + continue + cube = concatenate(product.cubes) + + # Calculate bias + cube = _calculate_bias(cube, reference, bias_type) + + # Adapt metadata and provenance information + product.attributes['units'] = str(cube.units) + if ref_product is not None: + product.wasderivedfrom(ref_product) + + product.cubes = CubeList([cube]) + output_products.add(product) + + # Add reference dataset to output if desired + if keep_reference_dataset and ref_product is not None: + output_products.add(ref_product) + + return output_products + + +def _get_ref(products, ref_tag: str) -> tuple[Cube, PreprocessorFile]: + """Get reference cube and product.""" + ref_products = [] + for product in products: + if 
product.attributes.get(ref_tag, False): + ref_products.append(product) + if len(ref_products) != 1: + raise ValueError( + f"Expected exactly 1 dataset with '{ref_tag}: true', found " + f"{len(ref_products):d}" + ) + ref_product = ref_products[0] + + # Extract reference cube + # Note: For technical reasons, product objects contain the member + # ``cubes``, which is a list of cubes. However, this is expected to be a + # list with exactly one element due to the call of concatenate earlier in + # the preprocessing chain of ESMValTool. To make sure that this + # preprocessor can also be used outside the ESMValTool preprocessing chain, + # an additional concatenate call is added here. + reference = concatenate(ref_product.cubes) + + return (reference, ref_product) + + +def _calculate_bias(cube: Cube, reference: Cube, bias_type: BiasType) -> Cube: + """Calculate bias for a single cube relative to a reference cube.""" + cube_metadata = cube.metadata + + if bias_type == 'absolute': + cube = cube - reference + new_units = cube.units + elif bias_type == 'relative': + cube = (cube - reference) / reference + new_units = '1' + else: + raise ValueError( + f"Expected one of ['absolute', 'relative'] for bias_type, got " + f"'{bias_type}'" + ) + + cube.metadata = cube_metadata + cube.units = new_units + + return cube + + +MetricType = Literal[ + 'rmse', + 'weighted_rmse', + 'pearsonr', + 'weighted_pearsonr', + 'emd', + 'weighted_emd', +] + + +def distance_metric( + products: set[PreprocessorFile] | Iterable[Cube], + metric: MetricType, + reference: Optional[Cube] = None, + coords: Iterable[Coord] | Iterable[str] | None = None, + keep_reference_dataset: bool = True, + **kwargs, +) -> set[PreprocessorFile] | CubeList: + r"""Calculate distance metrics. + + All input datasets need to have identical dimensional coordinates. This can + for example be ensured with the preprocessors + :func:`esmvalcore.preprocessor.regrid` and/or + :func:`esmvalcore.preprocessor.regrid_time`. + + Notes + ----- + This preprocessor requires a reference dataset, which can be specified with + the `reference` argument. If `reference` is ``None``, exactly one input + dataset in the `products` set needs to have the facet + ``reference_for_metric: true`` defined in the recipe. Please do **not** + specify the option `reference` when using this preprocessor function in a + recipe. + + Parameters + ---------- + products: + Input datasets/cubes for which the distance metric is calculated + relative to a reference dataset/cube. + metric: + Distance metric that is calculated. Must be one of + + * ``'rmse'``: Unweighted root mean square error + * ``'weighted_rmse'``: Weighted root mean square error + * ``'pearsonr'``: Unweighted Pearson correlation coefficient + * ``'weighted_pearsonr'``: Weighted Pearson correlation coefficient + * ``'emd'``: Unweighted Earth mover's distance + * ``'weighted_emd'``: Weighted Earth mover's distance + + The Earth mover's distance is also known as first Wasserstein metric + `W`\ :sub:`1`. + + A detailed description of these metrics can be found :ref:`here + `. + + .. note:: + Metrics starting with `weighted_` will calculate weighted distance + metrics if possible. Currently, the following `coords` (or any + combinations that include them) will trigger weighting: `time` + (will use lengths of time intervals as weights) and `latitude` + (will use cell area weights). Time weights are always calculated + from the input data. 
Area weights can be given as supplementary + variables to the recipe (`areacella` or `areacello`, see + :ref:`supplementary_variables`) or calculated from the input data + (this only works for regular grids). By default, **NO** + supplementary variables will be used; they need to be explicitly + requested in the recipe. + reference: + Cube which is used as reference for the distance metric calculation. If + ``None``, `products` needs to be a :obj:`set` of + :class:`~esmvalcore.preprocessor.PreprocessorFile` objects and exactly + one dataset in `products` needs the facet ``reference_for_metric: + true``. Do not specify this argument in a recipe. + coords: + Coordinates over which the distance metric is calculated. If ``None``, + calculate the metric over all coordinates, which results in a scalar + cube. + keep_reference_dataset: + If ``True``, also calculate the distance of the reference dataset with + itself. If ``False``, drop the reference dataset. + **kwargs: + Additional options for the metric calculation. The following keyword + arguments are supported: + + * `rmse` and `weighted_rmse`: none. + * `pearsonr` and `weighted_pearsonr`: ``mdtol``, ``common_mask`` (all + keyword arguments are passed to :func:`iris.analysis.stats.pearsonr`, + see that link for more details on these arguments). Note: in contrast + to :func:`~iris.analysis.stats.pearsonr`, ``common_mask=True`` by + default. + * `emd` and `weighted_emd`: ``n_bins`` = number of bins used to create + discrete probability distribition of data before calculating the EMD + (:obj:`int`, default: 100). + + Returns + ------- + set of esmvalcore.preprocessor.PreprocessorFile or iris.cube.CubeList + Output datasets/cubes. Will be a :obj:`set` of + :class:`~esmvalcore.preprocessor.PreprocessorFile` objects if + `products` is also one, a :class:`~iris.cube.CubeList` otherwise. + + Raises + ------ + ValueError + Shape and coordinates of products and reference data does not match; + not exactly one input datasets contains the facet + ``reference_for_metric: true`` if ``reference=None`; ``reference=None`` + and the input products are given as iterable of + :class:`~iris.cube.Cube` objects; an invalid ``metric`` has been given. + iris.exceptions.CoordinateNotFoundError + `longitude` is not found in cube if a weighted metric shall be + calculated, `latitude` is in `coords`, and no `cell_area` is given + as :ref:`supplementary_variables`. + + """ + reference_product = None + all_cubes_given = all(isinstance(p, Cube) for p in products) + + # Get reference cube if not explicitly given + if reference is None: + if all_cubes_given: + raise ValueError( + "A list of Cubes is given to this preprocessor; please " + "specify a `reference`" + ) + reference_products = [] + for product in products: + if product.attributes.get('reference_for_metric', False): + reference_products.append(product) + if len(reference_products) != 1: + raise ValueError( + f"Expected exactly 1 dataset with 'reference_for_metric: " + f"true', found {len(reference_products):d}" + ) + reference_product = reference_products[0] + + # Extract reference cube + # Note: For technical reasons, product objects contain the member + # ``cubes``, which is a list of cubes. However, this is expected to be + # a list with exactly one element due to the call of concatenate + # earlier in the preprocessing chain of ESMValTool. To make sure that + # this preprocessor can also be used outside the ESMValTool + # preprocessing chain, an additional concatenate call is added here. 
+ reference = concatenate(reference_product.cubes) + + # If input is an Iterable of Cube objects, calculate distance metric for + # each element + if all_cubes_given: + cubes = [ + _calculate_metric(c, reference, metric, coords, **kwargs) + for c in products + ] + return CubeList(cubes) + + # Otherwise, iterate over all input products, calculate metric and adapt + # metadata and provenance information accordingly + output_products = set() + for product in products: + if not keep_reference_dataset and product == reference_product: + continue + cube = concatenate(product.cubes) + + # Calculate distance metric + cube = _calculate_metric(cube, reference, metric, coords, **kwargs) + + # Adapt metadata and provenance information + product.attributes['standard_name'] = cube.standard_name + product.attributes['long_name'] = cube.long_name + product.attributes['short_name'] = cube.var_name + product.attributes['units'] = str(cube.units) + if product != reference_product: + product.wasderivedfrom(reference_product) + + product.cubes = CubeList([cube]) + output_products.add(product) + + return output_products + + +@preserve_float_dtype +def _calculate_metric( + cube: Cube, + reference: Cube, + metric: MetricType, + coords: Iterable[Coord] | Iterable[str] | None, + **kwargs, +) -> Cube: + """Calculate metric for a single cube relative to a reference cube.""" + # Make sure that dimensional metadata of data and ref data is compatible + if cube.shape != reference.shape: + raise ValueError( + f"Expected identical shapes of cube and reference cube for " + f"distance metric calculation, got {cube.shape} and " + f"{reference.shape}, respectively" + ) + try: + cube + reference # dummy operation to check if cubes are compatible + except Exception as exc: + raise ValueError( + "Cannot calculate distance metric between cube and reference cube " + ) from exc + + # Perform the actual calculation of the distance metric + # Note: we work on arrays here instead of cube to stay as flexible as + # possible since some operations (e.g., sqrt()) are not available for cubes + coords = get_all_coords(cube, coords) + metrics_funcs = { + 'rmse': partial(_calculate_rmse, weighted=False, **kwargs), + 'weighted_rmse': partial(_calculate_rmse, weighted=True, **kwargs), + 'pearsonr': partial(_calculate_pearsonr, weighted=False, **kwargs), + 'weighted_pearsonr': partial( + _calculate_pearsonr, weighted=True, **kwargs + ), + 'emd': partial(_calculate_emd, weighted=False, **kwargs), + 'weighted_emd': partial(_calculate_emd, weighted=True, **kwargs), + } + if metric not in metrics_funcs: + raise ValueError( + f"Expected one of {list(metrics_funcs)} for metric, got '{metric}'" + ) + (res_data, res_metadata) = metrics_funcs[metric](cube, reference, coords) + + # Get result cube with correct dimensional metadata by using dummy + # operation (max) + res_cube = cube.collapsed(coords, iris.analysis.MAX) + res_cube.data = res_data + res_cube.metadata = res_metadata + res_cube.cell_methods = [*cube.cell_methods, CellMethod(metric, coords)] + + return res_cube + + +def _calculate_rmse( + cube: Cube, + reference: Cube, + coords: Iterable[Coord] | Iterable[str], + *, + weighted: bool, +) -> tuple[np.ndarray | da.Array, CubeMetadata]: + """Calculate root mean square error.""" + # Data + axis = get_all_coord_dims(cube, coords) + weights = get_weights(cube, coords) if weighted else None + squared_error = (cube.core_data() - reference.core_data())**2 + npx = get_array_module(squared_error) + rmse = npx.sqrt(npx.ma.average(squared_error, axis=axis, 
weights=weights)) + + # Metadata + metadata = CubeMetadata( + None, + 'RMSE' if cube.long_name is None else f'RMSE of {cube.long_name}', + 'rmse' if cube.var_name is None else f'rmse_{cube.var_name}', + cube.units, + cube.attributes, + cube.cell_methods, + ) + + return (rmse, metadata) + + +def _calculate_pearsonr( + cube: Cube, + reference: Cube, + coords: Iterable[Coord] | Iterable[str], + *, + weighted: bool, + **kwargs, +) -> tuple[np.ndarray | da.Array, CubeMetadata]: + """Calculate Pearson correlation coefficient.""" + # Here, we want to use common_mask=True in iris.analysis.stats.pearsonr + # (iris' default is common_mask=False) + kwargs.setdefault('common_mask', True) + + # Data + weights = get_weights(cube, coords) if weighted else None + res_cube = iris.analysis.stats.pearsonr( + cube, reference, corr_coords=coords, weights=weights, **kwargs + ) + + # Metadata + metadata = CubeMetadata( + None, + ( + "Pearson's r" if cube.long_name is None + else f"Pearson's r of {cube.long_name}" + ), + 'pearsonr' if cube.var_name is None else f'pearsonr_{cube.var_name}', + '1', + cube.attributes, + cube.cell_methods, + ) + + return (res_cube.core_data(), metadata) + + +def _calculate_emd( + cube: Cube, + reference: Cube, + coords: Iterable[Coord] | Iterable[str], + *, + n_bins: int = 100, + weighted: bool, +) -> tuple[np.ndarray | da.Array, CubeMetadata]: + """Calculate Earth mover's distance.""" + weights = get_weights(cube, coords) if weighted else None + + # Get probability mass functions (using histogram preprocessor) + all_data = da.stack([cube.core_data(), reference.core_data()]) + bin_range = dask.compute(all_data.min(), all_data.max()) + pmf = histogram( + cube, + coords=coords, + bins=n_bins, + bin_range=bin_range, + weights=weights, + normalization='sum', + ) + pmf_ref = histogram( + reference, + coords=coords, + bins=n_bins, + bin_range=bin_range, + weights=weights, + normalization='sum', + ) + bin_centers = pmf.coord(cube.name()).points + + # Make sure that data is not chunked along `coords` + pmf = rechunk_cube(pmf, [cube.name()]) + pmf_ref = rechunk_cube(pmf_ref, [reference.name()]) + + # Data + if cube.has_lazy_data() and reference.has_lazy_data(): + emd = da.apply_gufunc( + _get_emd, + '(i),(i),(i)->()', + pmf.lazy_data(), + pmf_ref.lazy_data(), + bin_centers, + axes=[(-1,), (-1,), (-1,), ()], + output_dtypes=pmf.dtype, + vectorize=True, + ) + else: + v_get_emd = np.vectorize(_get_emd, signature='(n),(n),(n)->()') + emd = v_get_emd(pmf.data, pmf_ref.data, bin_centers) + + # Metadata + metadata = CubeMetadata( + None, + 'EMD' if cube.long_name is None else f'EMD of {cube.long_name}', + 'emd' if cube.var_name is None else f'emd_{cube.var_name}', + cube.units, + cube.attributes, + cube.cell_methods, + ) + + return (emd, metadata) + + +def _get_emd(arr, ref_arr, bin_centers): + """Calculate Earth mover's distance (non-lazy).""" + if np.ma.is_masked(arr) or np.ma.is_masked(ref_arr): + return np.ma.masked # this is safe because PMFs will be masked arrays + return wasserstein_distance(bin_centers, bin_centers, arr, ref_arr) diff --git a/esmvalcore/preprocessor/_multimodel.py b/esmvalcore/preprocessor/_multimodel.py index 5390517a26..d1e0d90e74 100644 --- a/esmvalcore/preprocessor/_multimodel.py +++ b/esmvalcore/preprocessor/_multimodel.py @@ -26,13 +26,14 @@ from iris.util import equalise_attributes, new_axis from esmvalcore.iris_helpers import date2num +from esmvalcore.preprocessor._shared import ( + _group_products, + get_iris_aggregator, +) from 
esmvalcore.preprocessor._supplementary_vars import ( remove_supplementary_variables, ) -from ._other import _group_products -from ._shared import get_iris_aggregator - if TYPE_CHECKING: from esmvalcore.preprocessor import PreprocessorFile diff --git a/esmvalcore/preprocessor/_other.py b/esmvalcore/preprocessor/_other.py index 14594ba7cf..3d047a4e24 100644 --- a/esmvalcore/preprocessor/_other.py +++ b/esmvalcore/preprocessor/_other.py @@ -1,10 +1,26 @@ """Preprocessor functions that do not fit into any of the categories.""" +from __future__ import annotations import logging -from collections import defaultdict +import string +from collections.abc import Iterable, Sequence +from typing import Literal +import dask import dask.array as da +import iris.analysis import numpy as np +from iris.coords import Coord, DimCoord +from iris.cube import Cube + +from esmvalcore.iris_helpers import rechunk_cube +from esmvalcore.preprocessor._shared import ( + get_all_coord_dims, + get_all_coords, + get_array_module, + get_weights, + preserve_float_dtype, +) logger = logging.getLogger(__name__) @@ -39,49 +55,377 @@ def clip(cube, minimum=None, maximum=None): return cube -def _groupby(iterable, keyfunc): - """Group iterable by key function. +@preserve_float_dtype +def histogram( + cube: Cube, + coords: Iterable[Coord] | Iterable[str] | None = None, + bins: int | Sequence[float] = 10, + bin_range: tuple[float, float] | None = None, + weights: np.ndarray | da.Array | bool | None = None, + normalization: Literal['sum', 'integral'] | None = None, +) -> Cube: + """Calculate histogram. + + Very similar to :func:`numpy.histogram`, but calculates histogram only over + the given coordinates. - The items are grouped by the value that is returned by the `keyfunc` + Handles lazy data and masked data. Parameters ---------- - iterable : list, tuple or iterable - List of items to group - keyfunc : callable - Used to determine the group of each item. These become the keys - of the returned dictionary + cube: + Input cube. + coords: + Coordinates over which the histogram is calculated. If ``None``, + calculate the histogram over all coordinates, which results in a scalar + cube. + bins: + If `bins` is an :obj:`int`, it defines the number of equal-width bins + in the given `bin_range`. If `bins` is a sequence, it defines a + monotonically increasing array of bin edges, including the rightmost + edge, allowing for non-uniform bin widths. + bin_range: + The lower and upper range of the bins. If ``None``, `bin_range` is + simply (``cube.core_data().min(), cube.core_data().max()``). Values + outside the range are ignored. The first element of the range must be + less than or equal to the second. `bin_range` affects the automatic bin + computation as well if `bins` is an :obj:`int` (see description for + `bins` above). + weights: + Weights for the histogram calculation. Each value in the input data + only contributes its associated weight towards the bin count (instead + of 1). Weights are normalized before entering the calculation if + `normalization` is ``'integral'`` or ``'sum'``. Can be an array of the + same shape as the input data, ``False`` or ``None`` (no weighting), or + ``True``. In the latter case, weighting will depend on `coords`, and + the following coordinates will trigger weighting: `time` (will use + lengths of time intervals as weights) and `latitude` (will use cell + area weights). Time weights are always calculated from the input data. 
+ Area weights can be given as supplementary variables to the recipe + (`areacella` or `areacello`, see :ref:`supplementary_variables`) or + calculated from the input data (this only works for regular grids). By + default, **NO** supplementary variables will be used; they need to be + explicitly requested in the recipe. + normalization: + If ``None``, the result will contain the number of samples in each bin. + If ``'integral'``, the result is the value of the probability `density` + function at the bin, normalized such that the integral over the range + is 1. If ``'sum'``, the result is the value of the probability + `mass` function at the bin, normalized such that the sum over + the range is 1. Normalization will be applied across `coords`, not the + entire cube. Returns ------- - dict - Returns a dictionary with the grouped values. + Cube + Histogram cube. The shape of this cube will be `(x1, x2, ..., n_bins)`, + where `xi` are the dimensions of the input cube not appearing in + `coords` and `n_bins` is the number of bins. + + Raises + ------ + TypeError + Invalid `bin` type given + ValueError + Invalid `normalization` or `bin_range` given or `bin_range` is ``None`` + and data is fully masked. + iris.exceptions.CoordinateNotFoundError + `longitude` is not found in cube if `weights=True`, `latitude` is in + `coords`, and no `cell_area` is given as + :ref:`supplementary_variables`. + + """ + # Check arguments + if isinstance(bins, str): + raise TypeError( + f"bins cannot be a str (got '{bins}'), must be int or Sequence of " + f"int" + ) + allowed_norms = (None, 'sum', 'integral') + if normalization is not None and normalization not in allowed_norms: + raise ValueError( + f"Expected one of {allowed_norms} for normalization, got " + f"'{normalization}'" + ) + + # If histogram is calculated over all coordinates, we can use + # dask.array.histogram and do not need to worry about chunks; otherwise, + # make sure that the cube is not chunked along the given coordinates + coords = get_all_coords(cube, coords) + axes = get_all_coord_dims(cube, coords) + if cube.has_lazy_data() and len(axes) != cube.ndim: + cube = rechunk_cube(cube, coords) + + # Calculate histogram + weights = _get_histogram_weights(cube, coords, weights, normalization) + (bin_range, bin_edges) = _get_bins(cube, bins, bin_range) + if cube.has_lazy_data(): + func = _calculate_histogram_lazy # type: ignore + else: + func = _calculate_histogram_eager # type: ignore + hist_data = func( + cube.core_data(), + weights, # type: ignore + along_axes=axes, + bin_edges=bin_edges, + bin_range=bin_range, + normalization=normalization, + ) + + # Get final cube + hist_cube = _get_histogram_cube( + cube, hist_data, coords, bin_edges, normalization + ) + + return hist_cube + + +def _get_bins( + cube: Cube, + bins: int | Sequence[float], + bin_range: tuple[float, float] | None, +) -> tuple[tuple[float, float], np.ndarray]: + """Calculate bin range and edges.""" + if bin_range is None: + bin_range = dask.compute( + cube.core_data().min(), cube.core_data().max() + ) + if isinstance(bins, int): + bin_edges = np.linspace( + bin_range[0], bin_range[1], bins + 1, dtype=np.float64 + ) + else: + bin_edges = np.array(bins, dtype=np.float64) + + finite_bin_range = [bool(np.isfinite(r)) for r in bin_range] + if not all(finite_bin_range): + raise ValueError( + f"Cannot calculate histogram for bin_range={bin_range} (or for " + f"fully masked data when `bin_range` is not given)" + ) + + return (bin_range, bin_edges) + + +def _get_histogram_weights( + cube: Cube, 
+ coords: Iterable[Coord] | Iterable[str], + weights: np.ndarray | da.Array | bool | None, + normalization: Literal['sum', 'integral'] | None, +) -> np.ndarray | da.Array: + """Get histogram weights.""" + axes = get_all_coord_dims(cube, coords) + npx = get_array_module(cube.core_data()) + + weights_array: np.ndarray | da.Array + if weights is None or weights is False: + weights_array = npx.ones_like(cube.core_data()) + elif weights is True: + weights_array = get_weights(cube, coords) + else: + weights_array = weights + + if normalization is not None: + norm = npx.sum(weights_array, axis=axes, keepdims=True) + weights_array = weights_array / norm + + # For lazy arrays, make sure that the chunks of the cube data and weights + # match + if isinstance(weights_array, da.Array): + weights_array = weights_array.rechunk(cube.lazy_data().chunks) + + return weights_array + + +def _calculate_histogram_lazy( + data: da.Array, + weights: da.Array, + *, + along_axes: tuple[int, ...], + bin_edges: np.ndarray, + bin_range: tuple[float, float], + normalization: Literal['sum', 'integral'] | None = None, +) -> da.Array: + """Calculate histogram over data along axes (lazy version). + + This will return an array of shape `(x1, x2, ..., n_bins)` where `xi` are + the dimensions of `data` not appearing in `axes` and `n_bins` is the number + of bins. + """ - grouped = defaultdict(set) - for item in iterable: - key = keyfunc(item) - grouped[key].add(item) + n_axes = len(along_axes) + + # (1) If histogram is calculated over all axes, use the efficient + # da.histogram function + if n_axes == data.ndim: + data = data.ravel() + weights = weights.ravel() + mask = da.ma.getmaskarray(data) + data = data[~mask] + weights = weights[~mask] + hist = da.histogram( + data, bins=bin_edges, range=bin_range, weights=weights + )[0] + hist_sum = hist.sum() + hist = da.ma.masked_array(hist, mask=da.allclose(hist_sum, 0.0)) + if normalization == 'sum': + hist = hist / hist_sum + elif normalization == 'integral': + diffs = np.array(np.diff(bin_edges), dtype=data.dtype) + hist = hist / hist_sum / diffs + hist = da.ma.masked_invalid(hist) - return grouped + # (2) Otherwise, use da.apply_gufunc with the eager version + # _calculate_histogram_eager + else: + # da.apply_gufunc transposes the input array so that the axes given by + # the `axes` argument to da.apply_gufunc are the rightmost dimensions. + # Thus, we need to use `along_axes=(ndim-n_axes, ..., ndim-2, ndim-1)` + # for _calculate_histogram_eager here. + axes_in_chunk = tuple(range(data.ndim - n_axes, data.ndim)) + # The call signature depends also on the number of axes in `axes`, and + # will be (a,b,...)->(nbins) where a,b,... 
are the data dimensions that + # are collapsed, and nbins the number of bin centers + in_signature = f"({','.join(list(string.ascii_lowercase)[:n_axes])})" + hist = da.apply_gufunc( + _calculate_histogram_eager, + f"{in_signature},{in_signature}->(nbins)", + data, + weights, + axes=[along_axes, along_axes, (-1,)], + output_sizes={'nbins': len(bin_edges) - 1}, + along_axes=axes_in_chunk, + bin_edges=bin_edges, + bin_range=bin_range, + normalization=normalization, + ) -def _group_products(products, by_key): - """Group products by the given list of attributes.""" - def grouper(product): - return product.group(by_key) + return hist - grouped = _groupby(products, keyfunc=grouper) - return grouped.items() +def _calculate_histogram_eager( + data: np.ndarray, + weights: np.ndarray, + *, + along_axes: tuple[int, ...], + bin_edges: np.ndarray, + bin_range: tuple[float, float], + normalization: Literal['sum', 'integral'] | None = None, +) -> np.ndarray: + """Calculate histogram over data along axes (eager version). -def get_array_module(*args): - """Return the best matching array module. + This will return an array of shape `(x1, x2, ..., n_bins)` where `xi` are + the dimensions of `data` not appearing in `axes` and `n_bins` is the number + of bins. - If at least one of the arguments is a :class:`dask.array.Array` object, - the :mod:`dask.array` module is returned. In all other cases the - :mod:`numpy` module is returned. """ - for arg in args: - if isinstance(arg, da.Array): - return da - return np + # Create array with shape (x1, x2, ..., y) where `y` is the product of all + # dimensions in `axes` and the `xi` are the remaining dimensions + remaining_dims = tuple(a for a in range(data.ndim) if a not in along_axes) + reshaped_data = np.transpose(data, axes=(*remaining_dims, *along_axes)) + reshaped_weights = np.transpose( + weights, axes=(*remaining_dims, *along_axes) + ) + shape_rem_dims = tuple(data.shape[a] for a in remaining_dims) + reshaped_data = reshaped_data.reshape(*shape_rem_dims, -1) + reshaped_weights = reshaped_weights.reshape(*shape_rem_dims, -1) + + # Apply vectorized version of np.histogram + def _get_hist_values(arr, wgts): + mask = np.ma.getmaskarray(arr) + arr = arr[~mask] + wgts = wgts[~mask] + return np.histogram( + arr, bins=bin_edges, range=bin_range, weights=wgts + )[0] + + v_histogram = np.vectorize(_get_hist_values, signature='(n),(n)->(m)') + hist = v_histogram(reshaped_data, reshaped_weights) + + # Mask points where all input data were masked (these are the ones where + # the histograms sums to 0) + hist_sum = hist.sum(axis=-1, keepdims=True) + mask = np.isclose(hist_sum, 0.0) + hist = np.ma.array(hist, mask=np.broadcast_to(mask, hist.shape)) + + # Apply normalization + if normalization == 'sum': + hist = hist / np.ma.array(hist_sum, mask=mask) + elif normalization == 'integral': + hist = ( + hist / + np.ma.array(hist_sum, mask=mask) / + np.ma.array(np.diff(bin_edges), dtype=data.dtype) + ) + + return hist + + +def _get_histogram_cube( + cube: Cube, + data: np.ndarray | da.Array, + coords: Iterable[Coord] | Iterable[str], + bin_edges: np.ndarray, + normalization: Literal['sum', 'integral'] | None, +): + """Get cube with correct metadata for histogram.""" + # Calculate bin centers using 2-window running mean and get corresponding + # coordinate + bin_centers = np.convolve(bin_edges, np.ones(2), 'valid') / 2.0 + bin_coord = DimCoord( + bin_centers, + bounds=np.stack((bin_edges[:-1], bin_edges[1:]), axis=-1), + standard_name=cube.standard_name, + long_name=cube.long_name, + 
var_name=cube.var_name, + units=cube.units, + ) + + # Get result cube with correct dimensional metadata by using dummy + # operation (max) + cell_methods = cube.cell_methods + cube = cube.collapsed(coords, iris.analysis.MAX) + + # Get histogram cube + long_name_suffix = ( + '' if cube.long_name is None else f' of {cube.long_name}' + ) + var_name_suffix = '' if cube.var_name is None else f'_{cube.var_name}' + dim_spec = ( + [(d, cube.coord_dims(d)) for d in cube.dim_coords] + + [(bin_coord, cube.ndim)] + ) + if normalization == 'sum': + long_name = f"Relative Frequency{long_name_suffix}" + var_name = f"relative_frequency{var_name_suffix}" + units = '1' + elif normalization == 'integral': + long_name = f"Density{long_name_suffix}" + var_name = f"density{var_name_suffix}" + units = cube.units**-1 + else: + long_name = f"Frequency{long_name_suffix}" + var_name = f"frequency{var_name_suffix}" + units = '1' + hist_cube = Cube( + data, + standard_name=None, + long_name=long_name, + var_name=var_name, + units=units, + attributes=cube.attributes, + cell_methods=cell_methods, + dim_coords_and_dims=dim_spec, + aux_coords_and_dims=[(a, cube.coord_dims(a)) for a in cube.aux_coords], + aux_factories=cube.aux_factories, + ancillary_variables_and_dims=[ + (a, cube.ancillary_variable_dims(a)) for a in + cube.ancillary_variables() + ], + cell_measures_and_dims=[ + (c, cube.cell_measure_dims(c)) for c in cube.cell_measures() + ], + ) + + return hist_cube diff --git a/esmvalcore/preprocessor/_regrid.py b/esmvalcore/preprocessor/_regrid.py index a3fbc7032e..c7913df0d7 100644 --- a/esmvalcore/preprocessor/_regrid.py +++ b/esmvalcore/preprocessor/_regrid.py @@ -11,7 +11,6 @@ import warnings from copy import deepcopy from decimal import Decimal -from functools import partial from pathlib import Path from typing import TYPE_CHECKING, Any, Optional @@ -30,8 +29,11 @@ from esmvalcore.cmor.table import CMOR_TABLES from esmvalcore.exceptions import ESMValCoreDeprecationWarning from esmvalcore.iris_helpers import has_irregular_grid, has_unstructured_grid -from esmvalcore.preprocessor._other import get_array_module -from esmvalcore.preprocessor._shared import preserve_float_dtype +from esmvalcore.preprocessor._shared import ( + broadcast_to_shape, + get_array_module, + preserve_float_dtype, +) from esmvalcore.preprocessor._supplementary_vars import ( add_ancillary_variable, add_cell_measure, @@ -1033,52 +1035,6 @@ def _create_cube(src_cube, data, src_levels, levels): return result -def is_lazy_masked_data(array): - """Similar to `iris._lazy_data.is_lazy_masked_data`.""" - return isinstance(array, da.Array) and isinstance( - da.utils.meta_from_array(array), np.ma.MaskedArray) - - -def broadcast_to_shape(array, shape, dim_map, chunks=None): - """Copy of `iris.util.broadcast_to_shape` that allows specifying chunks.""" - if isinstance(array, da.Array): - if chunks is not None: - chunks = list(chunks) - for src_idx, tgt_idx in enumerate(dim_map): - # Only use the specified chunks along new dimensions or on - # dimensions that have size 1 in the source array. - if array.shape[src_idx] != 1: - chunks[tgt_idx] = array.chunks[src_idx] - broadcast = partial(da.broadcast_to, shape=shape, chunks=chunks) - else: - broadcast = partial(np.broadcast_to, shape=shape) - - n_orig_dims = len(array.shape) - n_new_dims = len(shape) - n_orig_dims - array = array.reshape(array.shape + (1,) * n_new_dims) - - # Get dims in required order. 
- array = np.moveaxis(array, range(n_orig_dims), dim_map) - new_array = broadcast(array) - - if np.ma.isMA(array): - # broadcast_to strips masks so we need to handle them explicitly. - mask = np.ma.getmask(array) - if mask is np.ma.nomask: - new_mask = np.ma.nomask - else: - new_mask = broadcast(mask) - new_array = np.ma.array(new_array, mask=new_mask) - - elif is_lazy_masked_data(array): - # broadcast_to strips masks so we need to handle them explicitly. - mask = da.ma.getmaskarray(array) - new_mask = broadcast(mask) - new_array = da.ma.masked_array(new_array, new_mask) - - return new_array - - def _vertical_interpolate(cube, src_levels, levels, interpolation, extrapolation): """Perform vertical interpolation.""" diff --git a/esmvalcore/preprocessor/_shared.py b/esmvalcore/preprocessor/_shared.py index 0a5f5627dd..addd3617ac 100644 --- a/esmvalcore/preprocessor/_shared.py +++ b/esmvalcore/preprocessor/_shared.py @@ -8,23 +8,25 @@ import logging import re import warnings -from collections.abc import Callable -from functools import wraps +from collections import defaultdict +from collections.abc import Callable, Iterable +from functools import partial, wraps from typing import Any, Literal, Optional import dask.array as da import iris.analysis import numpy as np -from iris.coords import DimCoord +from iris.coords import CellMeasure, Coord, DimCoord from iris.cube import Cube +from iris.exceptions import CoordinateMultiDimError, CoordinateNotFoundError from esmvalcore.exceptions import ESMValCoreDeprecationWarning +from esmvalcore.iris_helpers import has_regular_grid from esmvalcore.typing import DataType logger = logging.getLogger(__name__) -# guess bounds tool def guess_bounds(cube, coords): """Guess bounds of a cube, or not.""" # check for bounds just in case @@ -266,3 +268,310 @@ def wrapper(data: DataType, *args: Any, **kwargs: Any) -> DataType: return result return wrapper + + +def _groupby(iterable, keyfunc): + """Group iterable by key function. + + The items are grouped by the value that is returned by the `keyfunc` + + Parameters + ---------- + iterable : list, tuple or iterable + List of items to group + keyfunc : callable + Used to determine the group of each item. These become the keys + of the returned dictionary + + Returns + ------- + dict + Returns a dictionary with the grouped values. + """ + grouped = defaultdict(set) + for item in iterable: + key = keyfunc(item) + grouped[key].add(item) + + return grouped + + +def _group_products(products, by_key): + """Group products by the given list of attributes.""" + def grouper(product): + return product.group(by_key) + + grouped = _groupby(products, keyfunc=grouper) + return grouped.items() + + +def get_array_module(*args): + """Return the best matching array module. + + If at least one of the arguments is a :class:`dask.array.Array` object, + the :mod:`dask.array` module is returned. In all other cases the + :mod:`numpy` module is returned. + """ + for arg in args: + if isinstance(arg, da.Array): + return da + return np + + +def broadcast_to_shape(array, shape, dim_map, chunks=None): + """Copy of `iris.util.broadcast_to_shape` that allows specifying chunks.""" + if isinstance(array, da.Array): + if chunks is not None: + chunks = list(chunks) + for src_idx, tgt_idx in enumerate(dim_map): + # Only use the specified chunks along new dimensions or on + # dimensions that have size 1 in the source array. 
+ if array.shape[src_idx] != 1: + chunks[tgt_idx] = array.chunks[src_idx] + broadcast = partial(da.broadcast_to, shape=shape, chunks=chunks) + else: + broadcast = partial(np.broadcast_to, shape=shape) + + n_orig_dims = len(array.shape) + n_new_dims = len(shape) - n_orig_dims + array = array.reshape(array.shape + (1,) * n_new_dims) + + # Get dims in required order. + array = np.moveaxis(array, range(n_orig_dims), dim_map) + new_array = broadcast(array) + + if np.ma.isMA(array): + # broadcast_to strips masks so we need to handle them explicitly. + mask = np.ma.getmask(array) + if mask is np.ma.nomask: + new_mask = np.ma.nomask + else: + new_mask = broadcast(mask) + new_array = np.ma.array(new_array, mask=new_mask) + + elif _is_lazy_masked_data(array): + # broadcast_to strips masks so we need to handle them explicitly. + mask = da.ma.getmaskarray(array) + new_mask = broadcast(mask) + new_array = da.ma.masked_array(new_array, new_mask) + + return new_array + + +def _is_lazy_masked_data(array): + """Similar to `iris._lazy_data.is_lazy_masked_data`.""" + return isinstance(array, da.Array) and isinstance( + da.utils.meta_from_array(array), np.ma.MaskedArray) + + +def get_weights( + cube: Cube, + coords: Iterable[Coord] | Iterable[str], +) -> np.ndarray | da.Array: + """Calculate suitable weights for given coordinates.""" + npx = get_array_module(cube.core_data()) + weights = npx.ones_like(cube.core_data()) + + # Time weights: lengths of time interval + if 'time' in coords: + weights *= broadcast_to_shape( + npx.array(get_time_weights(cube)), + cube.shape, + cube.coord_dims('time'), + ) + + # Latitude weights: cell areas + if 'latitude' in coords: + cube = cube.copy() # avoid overwriting input cube + if ( + not cube.cell_measures('cell_area') and + not cube.coords('longitude') + ): + raise CoordinateNotFoundError( + f"Cube {cube.summary(shorten=True)} needs a `longitude` " + f"coordinate to calculate cell area weights for weighted " + f"distance metric over coordinates {coords} (alternatively, " + f"a `cell_area` can be given to the cube as supplementary " + f"variable)" + ) + try_adding_calculated_cell_area(cube) + weights *= broadcast_to_shape( + cube.cell_measure('cell_area').core_data(), + cube.shape, + cube.cell_measure_dims('cell_area'), + ) + + return weights + + +def get_time_weights(cube: Cube) -> np.ndarray | da.core.Array: + """Compute the weighting of the time axis. + + Parameters + ---------- + cube: + Input cube. + + Returns + ------- + np.ndarray or da.Array + Array of time weights for averaging. Returns a + :class:`dask.array.Array` if the input cube has lazy data; a + :class:`numpy.ndarray` otherwise. + + """ + time = cube.coord('time') + coord_dims = cube.coord_dims('time') + + # Multidimensional time coordinates are not supported: In this case, + # weights cannot be simply calculated as difference between the bounds + if len(coord_dims) > 1: + raise ValueError( + f"Weighted statistical operations are not supported for " + f"{len(coord_dims):d}D time coordinates, expected 0D or 1D" + ) + + # Extract 1D time weights (= lengths of time intervals) + time_weights = time.lazy_bounds()[:, 1] - time.lazy_bounds()[:, 0] + if cube.has_lazy_data(): + # Align the weight chunks with the data chunks to avoid excessively + # large chunks as a result of broadcasting. 
+ time_chunks = cube.lazy_data().chunks[coord_dims[0]] + time_weights = time_weights.rechunk(time_chunks) + else: + time_weights = time_weights.compute() + return time_weights + + +def try_adding_calculated_cell_area(cube: Cube) -> None: + """Try to add calculated cell measure 'cell_area' to cube (in-place).""" + if cube.cell_measures('cell_area'): + return + + logger.debug( + "Found no cell measure 'cell_area' in cube %s. Check availability of " + "supplementary variables", + cube.summary(shorten=True), + ) + logger.debug("Attempting to calculate grid cell area") + + rotated_pole_grid = all([ + cube.coord('latitude').core_points().ndim == 2, + cube.coord('longitude').core_points().ndim == 2, + cube.coords('grid_latitude'), + cube.coords('grid_longitude'), + ]) + + # For regular grids, calculate grid cell areas with iris function + if has_regular_grid(cube): + cube = guess_bounds(cube, ['latitude', 'longitude']) + logger.debug("Calculating grid cell areas for regular grid") + cell_areas = _compute_area_weights(cube) + + # For rotated pole grids, use grid_latitude and grid_longitude to calculate + # grid cell areas + elif rotated_pole_grid: + cube = guess_bounds(cube, ['grid_latitude', 'grid_longitude']) + cube_tmp = cube.copy() + cube_tmp.remove_coord('latitude') + cube_tmp.coord('grid_latitude').rename('latitude') + cube_tmp.remove_coord('longitude') + cube_tmp.coord('grid_longitude').rename('longitude') + logger.debug("Calculating grid cell areas for rotated pole grid") + cell_areas = _compute_area_weights(cube_tmp) + + # For all other cases, grid cell areas cannot be calculated + else: + logger.error( + "Supplementary variables are needed to calculate grid cell " + "areas for irregular or unstructured grid of cube %s", + cube.summary(shorten=True), + ) + raise CoordinateMultiDimError(cube.coord('latitude')) + + # Add new cell measure + cell_measure = CellMeasure( + cell_areas, standard_name='cell_area', units='m2', measure='area', + ) + cube.add_cell_measure(cell_measure, np.arange(cube.ndim)) + + +def _compute_area_weights(cube): + """Compute area weights.""" + with warnings.catch_warnings(record=True) as caught_warnings: + warnings.filterwarnings( + 'always', + message="Using DEFAULT_SPHERICAL_EARTH_RADIUS.", + category=UserWarning, + module='iris.analysis.cartography', + ) + # TODO: replace the following line with + # weights = iris.analysis.cartography.area_weights( + # cube, compute=not cube.has_lazy_data() + # ) + # once https://github.com/SciTools/iris/pull/5658 is available + weights = _get_area_weights(cube) + + for warning in caught_warnings: + logger.debug( + "%s while computing area weights of the following cube:\n%s", + warning.message, cube) + return weights + + +def _get_area_weights(cube: Cube) -> np.ndarray | da.Array: + """Get area weights. + + For non-lazy data, simply use the according iris function. For lazy data, + calculate area weights for a single lat-lon slice and broadcast it to the + correct shape. + + Note + ---- + This is a temporary workaround to get lazy area weights. Can be removed + once https://github.com/SciTools/iris/pull/5658 is available. 
+ + """ + if not cube.has_lazy_data(): + return iris.analysis.cartography.area_weights(cube) + + lat_lon_dims = sorted( + tuple(set(cube.coord_dims('latitude') + cube.coord_dims('longitude'))) + ) + lat_lon_slice = next(cube.slices(['latitude', 'longitude'], ordered=False)) + weights_2d = iris.analysis.cartography.area_weights(lat_lon_slice) + weights = broadcast_to_shape( + da.array(weights_2d), + cube.shape, + lat_lon_dims, + chunks=cube.lazy_data().chunks, + ) + return weights + + +def get_all_coords( + cube: Cube, + coords: Iterable[Coord] | Iterable[str] | None, +) -> Iterable[Coord] | Iterable[str]: + """Get all desired coordinates in a cube.""" + if coords is None: + coords = [c.name() for c in cube.dim_coords] + if len(coords) != cube.ndim: + raise ValueError( + f"If coords=None is specified, the cube " + f"{cube.summary(shorten=True)} must not have unnamed " + f"dimensions" + ) + return coords + + +def get_all_coord_dims( + cube: Cube, + coords: Iterable[Coord] | Iterable[str], +) -> tuple[int, ...]: + """Get sorted list of all coordinate dimensions from coordinates.""" + all_coord_dims = [] + for coord in coords: + all_coord_dims.extend(cube.coord_dims(coord)) + sorted_all_coord_dims = sorted(list(set(all_coord_dims))) + return tuple(sorted_all_coord_dims) diff --git a/esmvalcore/preprocessor/_time.py b/esmvalcore/preprocessor/_time.py index 635861b86c..b89cc92b0a 100644 --- a/esmvalcore/preprocessor/_time.py +++ b/esmvalcore/preprocessor/_time.py @@ -32,6 +32,7 @@ from esmvalcore.iris_helpers import date2num, rechunk_cube from esmvalcore.preprocessor._shared import ( get_iris_aggregator, + get_time_weights, preserve_float_dtype, update_weights_kwargs, ) @@ -383,45 +384,6 @@ def extract_month(cube: Cube, month: int) -> Cube: return result -def get_time_weights(cube: Cube) -> np.ndarray | da.core.Array: - """Compute the weighting of the time axis. - - Parameters - ---------- - cube: - Input cube. - - Returns - ------- - np.ndarray or da.Array - Array of time weights for averaging. Returns a - :class:`dask.array.Array` if the input cube has lazy data; a - :class:`numpy.ndarray` otherwise. - - """ - time = cube.coord('time') - coord_dims = cube.coord_dims('time') - - # Multidimensional time coordinates are not supported: In this case, - # weights cannot be simply calculated as difference between the bounds - if len(coord_dims) > 1: - raise ValueError( - f"Weighted statistical operations are not supported for " - f"{len(coord_dims):d}D time coordinates, expected 0D or 1D" - ) - - # Extract 1D time weights (= lengths of time intervals) - time_weights = time.lazy_bounds()[:, 1] - time.lazy_bounds()[:, 0] - if cube.has_lazy_data(): - # Align the weight chunks with the data chunks to avoid excessively - # large chunks as a result of broadcasting. 
- time_chunks = cube.lazy_data().chunks[coord_dims[0]] - time_weights = time_weights.rechunk(time_chunks) - else: - time_weights = time_weights.compute() - return time_weights - - def _aggregate_time_fx(result_cube, source_cube): time_dim = set(source_cube.coord_dims(source_cube.coord('time'))) if source_cube.cell_measures(): diff --git a/esmvalcore/preprocessor/_volume.py b/esmvalcore/preprocessor/_volume.py index 239fa99758..3d04f23500 100644 --- a/esmvalcore/preprocessor/_volume.py +++ b/esmvalcore/preprocessor/_volume.py @@ -15,11 +15,11 @@ from iris.coords import AuxCoord, CellMeasure from iris.cube import Cube -from ._area import _try_adding_calculated_cell_area from ._shared import ( get_iris_aggregator, get_normalized_cube, preserve_float_dtype, + try_adding_calculated_cell_area, update_weights_kwargs, ) from ._supplementary_vars import register_supplementaries @@ -155,7 +155,7 @@ def calculate_volume(cube: Cube) -> da.core.Array: # Get or calculate the horizontal areas of the cube has_cell_measure = bool(cube.cell_measures('cell_area')) - _try_adding_calculated_cell_area(cube) + try_adding_calculated_cell_area(cube) area = cube.cell_measure('cell_area').copy() area_dim = cube.cell_measure_dims(area) diff --git a/tests/integration/recipe/test_recipe.py b/tests/integration/recipe/test_recipe.py index bcee6eac5b..c50d837b68 100644 --- a/tests/integration/recipe/test_recipe.py +++ b/tests/integration/recipe/test_recipe.py @@ -2983,7 +2983,7 @@ def test_bias_two_refs(tmp_path, patched_datafinder, session): assert "found 2" in exc.value.failed_tasks[0].message -def test_inlvaid_bias_type(tmp_path, patched_datafinder, session): +def test_invalid_bias_type(tmp_path, patched_datafinder, session): content = dedent(""" preprocessors: test_bias: @@ -3185,3 +3185,108 @@ def test_wildcard_derived_var( assert dataset.facets['dataset'] == 'BBB' assert dataset.facets['institute'] == 'B' assert dataset.facets['short_name'] == 'swcre' + + +def test_distance_metric_no_ref(tmp_path, patched_datafinder, session): + content = dedent(""" + preprocessors: + test_distance_metric: + distance_metric: + metric: emd + + diagnostics: + diagnostic_name: + variables: + ta: + preprocessor: test_distance_metric + project: CMIP6 + mip: Amon + exp: historical + timerange: '20000101/20001231' + ensemble: r1i1p1f1 + grid: gn + additional_datasets: + - {dataset: CanESM5} + - {dataset: CESM2} + + scripts: null + """) + msg = ( + "Expected exactly 1 dataset with 'reference_for_metric: true' in " + "products" + ) + with pytest.raises(RecipeError) as exc: + get_recipe(tmp_path, content, session) + assert str(exc.value) == INITIALIZATION_ERROR_MSG + assert msg in exc.value.failed_tasks[0].message + assert "found 0" in exc.value.failed_tasks[0].message + + +def test_distance_metric_two_refs(tmp_path, patched_datafinder, session): + content = dedent(""" + preprocessors: + test_distance_metric: + distance_metric: + metric: emd + + diagnostics: + diagnostic_name: + variables: + ta: + preprocessor: test_distance_metric + project: CMIP6 + mip: Amon + exp: historical + timerange: '20000101/20001231' + ensemble: r1i1p1f1 + grid: gn + additional_datasets: + - {dataset: CanESM5, reference_for_metric: true} + - {dataset: CESM2, reference_for_metric: true} + + scripts: null + """) + msg = ( + "Expected exactly 1 dataset with 'reference_for_metric: true' in " + "products" + ) + with pytest.raises(RecipeError) as exc: + get_recipe(tmp_path, content, session) + assert str(exc.value) == INITIALIZATION_ERROR_MSG + assert msg in 
exc.value.failed_tasks[0].message + assert "found 2" in exc.value.failed_tasks[0].message + + +def test_invalid_metric(tmp_path, patched_datafinder, session): + content = dedent(""" + preprocessors: + test_distance_metric: + distance_metric: + metric: INVALID + + diagnostics: + diagnostic_name: + variables: + ta: + preprocessor: test_distance_metric + project: CMIP6 + mip: Amon + exp: historical + timerange: '20000101/20001231' + ensemble: r1i1p1f1 + grid: gn + additional_datasets: + - {dataset: CanESM5} + - {dataset: CESM2, reference_for_metric: true} + + scripts: null + """) + msg = ( + "Expected one of ('rmse', 'weighted_rmse', 'pearsonr', " + "'weighted_pearsonr', 'emd', 'weighted_emd') for `metric`, got " + "'INVALID'" + ) + with pytest.raises(RecipeError) as exc: + get_recipe(tmp_path, content, session) + assert str(exc.value) == INITIALIZATION_ERROR_MSG + assert exc.value.failed_tasks[0].message == msg diff --git a/tests/unit/preprocessor/_area/test_area.py b/tests/unit/preprocessor/_area/test_area.py index a7374a5e8b..e7c3f998c5 100644 --- a/tests/unit/preprocessor/_area/test_area.py +++ b/tests/unit/preprocessor/_area/test_area.py @@ -2,7 +2,6 @@ import unittest from pathlib import Path -import dask.array as da import fiona import iris import numpy as np @@ -18,7 +17,6 @@ import tests from esmvalcore.preprocessor._area import ( _crop_cube, - _get_area_weights, _get_requested_geometries, _update_shapefile_path, area_statistics, @@ -1447,22 +1445,5 @@ def test_time_dependent_volcello(): assert cube.shape == cube.cell_measure('ocean_volume').shape -@pytest.mark.parametrize('lazy', [True, False]) -def test_get_area_weights(lazy): - """Test _get_area_weights.""" - cube = _create_sample_full_cube() - if lazy: - cube.data = cube.lazy_data() - weights = _get_area_weights(cube) - if lazy: - assert isinstance(weights, da.Array) - assert weights.chunks == cube.lazy_data().chunks - else: - assert isinstance(weights, np.ndarray) - np.testing.assert_allclose( - weights, iris.analysis.cartography.area_weights(cube) - ) - - if __name__ == '__main__': unittest.main() diff --git a/tests/unit/preprocessor/_bias/test_bias.py b/tests/unit/preprocessor/_bias/test_bias.py deleted file mode 100644 index 9adab3872e..0000000000 --- a/tests/unit/preprocessor/_bias/test_bias.py +++ /dev/null @@ -1,333 +0,0 @@ -"""Unit tests for :mod:`esmvalcore.preprocessor._bias`.""" - -import iris -import numpy as np -import pytest -from cf_units import Unit -from iris.cube import Cube, CubeList - -from esmvalcore.preprocessor._bias import bias -from tests import PreprocessorFile - - -def assert_array_equal(array_1, array_2): - """Assert that (masked) array 1 equals (masked) array 2.""" - if np.ma.is_masked(array_1) or np.ma.is_masked(array_2): - np.testing.assert_array_equal(np.ma.getmaskarray(array_1), - np.ma.getmaskarray(array_2)) - mask = np.ma.getmaskarray(array_1) - np.testing.assert_array_equal(array_1[~mask], array_2[~mask]) - else: - np.testing.assert_array_equal(array_1, array_2) - - -def products_set_to_dict(products): - """Convert :obj:`set` of products to :obj:`dict`.""" - new_dict = {} - for product in products: - new_dict[product.filename] = product - return new_dict - - -def get_3d_cube(data, **cube_kwargs): - """Create 3D cube.""" - time_units = Unit('days since 1850-01-01 00:00:00') - times = iris.coords.DimCoord([0.0, 1.0], standard_name='time', - var_name='time', long_name='time', - units=time_units) - lats = iris.coords.DimCoord([0.0, 10.0], standard_name='latitude', - var_name='lat', 
long_name='latitude', - units='degrees_north') - lons = iris.coords.DimCoord([20.0, 30.0], standard_name='longitude', - var_name='lon', long_name='longitude', - units='degrees_east') - coord_specs = [(times, 0), (lats, 1), (lons, 2)] - cube = Cube(data.astype('float32'), - dim_coords_and_dims=coord_specs, **cube_kwargs) - return cube - - -@pytest.fixture -def regular_cubes(): - """Regular cube.""" - cube_data = np.arange(8.0).reshape(2, 2, 2) - cube = get_3d_cube(cube_data, standard_name='air_temperature', - var_name='tas', units='K') - return CubeList([cube]) - - -@pytest.fixture -def ref_cubes(): - """Reference cube.""" - cube_data = np.full((2, 2, 2), 2.0) - cube_data[1, 1, 1] = 4.0 - cube = get_3d_cube(cube_data, standard_name='air_temperature', - var_name='tas', units='K') - return CubeList([cube]) - - -TEST_BIAS = [ - ('absolute', [[[-2.0, -1.0], [0.0, 1.0]], [[2.0, 3.0], [4.0, 3.0]]], 'K'), - ('relative', [[[-1.0, -0.5], [0.0, 0.5]], [[1.0, 1.5], [2.0, 0.75]]], '1'), -] - - -@pytest.mark.parametrize('bias_type,data,units', TEST_BIAS) -def test_bias_products(regular_cubes, ref_cubes, bias_type, data, units): - """Test calculation of bias with products.""" - ref_product = PreprocessorFile(ref_cubes, 'REF', - {'reference_for_bias': True}) - products = { - PreprocessorFile(regular_cubes, 'A', {'dataset': 'a'}), - PreprocessorFile(regular_cubes, 'B', {'dataset': 'b'}), - ref_product, - } - out_products = bias(products, bias_type=bias_type) - - assert isinstance(out_products, set) - out_dict = products_set_to_dict(out_products) - assert len(out_dict) == 2 - - product_a = out_dict['A'] - assert product_a.filename == 'A' - assert product_a.attributes == {'units': units, 'dataset': 'a'} - assert len(product_a.cubes) == 1 - out_cube = product_a.cubes[0] - assert_array_equal(out_cube.data, data) - assert out_cube.var_name == 'tas' - assert out_cube.standard_name == 'air_temperature' - assert out_cube.units == units - assert out_cube.dim_coords == regular_cubes[0].dim_coords - assert out_cube.aux_coords == regular_cubes[0].aux_coords - product_a.wasderivedfrom.assert_called_once() - assert product_a.mock_ancestors == {ref_product} - - product_b = out_dict['B'] - assert product_b.filename == 'B' - assert product_b.attributes == {'units': units, 'dataset': 'b'} - assert len(product_b.cubes) == 1 - out_cube = product_b.cubes[0] - assert_array_equal(out_cube.data, data) - assert out_cube.var_name == 'tas' - assert out_cube.standard_name == 'air_temperature' - assert out_cube.units == units - assert out_cube.dim_coords == regular_cubes[0].dim_coords - assert out_cube.aux_coords == regular_cubes[0].aux_coords - product_b.wasderivedfrom.assert_called_once() - assert product_b.mock_ancestors == {ref_product} - - -@pytest.mark.parametrize('bias_type,data,units', TEST_BIAS) -def test_bias_cubes(regular_cubes, ref_cubes, bias_type, data, units): - """Test calculation of bias with cubes.""" - ref_cube = ref_cubes[0] - out_cubes = bias(regular_cubes, ref_cube, bias_type=bias_type) - - assert isinstance(out_cubes, CubeList) - assert len(out_cubes) == 1 - out_cube = out_cubes[0] - - assert_array_equal(out_cube.data, data) - assert out_cube.var_name == 'tas' - assert out_cube.standard_name == 'air_temperature' - assert out_cube.units == units - assert out_cube.dim_coords == regular_cubes[0].dim_coords - assert out_cube.aux_coords == regular_cubes[0].aux_coords - - -TEST_BIAS_BROADCASTABLE = [ - ('absolute', [[[-2.0, -1.0], [0.0, 1.0]], [[2.0, 3.0], [4.0, 5.0]]], 'K'), - ('relative', [[[-1.0, -0.5], [0.0, 
0.5]], [[1.0, 1.5], [2.0, 2.5]]], '1'), -] - - -@pytest.mark.parametrize('bias_type,data,units', TEST_BIAS_BROADCASTABLE) -def test_bias_cubes_broadcastable( - regular_cubes, ref_cubes, bias_type, data, units -): - """Test calculation of bias with cubes.""" - ref_cube = ref_cubes[0][0] # only select one time step - out_cubes = bias(regular_cubes, ref_cube, bias_type=bias_type) - - assert isinstance(out_cubes, CubeList) - assert len(out_cubes) == 1 - out_cube = out_cubes[0] - - assert_array_equal(out_cube.data, data) - assert out_cube.var_name == 'tas' - assert out_cube.standard_name == 'air_temperature' - assert out_cube.units == units - assert out_cube.dim_coords == regular_cubes[0].dim_coords - assert out_cube.aux_coords == regular_cubes[0].aux_coords - - -def test_denominator_mask_threshold_products(regular_cubes, ref_cubes): - """Test denominator_mask_threshold argument with products.""" - ref_product = PreprocessorFile(ref_cubes, 'REF', - {'reference_for_bias': True}) - products = { - PreprocessorFile(regular_cubes, 'A', {'dataset': 'a'}), - ref_product, - } - out_products = bias( - products, bias_type='relative', denominator_mask_threshold=3.0 - ) - - assert isinstance(out_products, set) - out_dict = products_set_to_dict(out_products) - assert len(out_dict) == 1 - - product_a = out_dict['A'] - assert product_a.filename == 'A' - assert product_a.attributes == {'units': '1', 'dataset': 'a'} - assert len(product_a.cubes) == 1 - out_cube = product_a.cubes[0] - expected_data = np.ma.masked_equal([[[42.0, 42.0], - [42.0, 42.0]], - [[42.0, 42.0], - [42.0, 0.75]]], 42.0) - assert_array_equal(out_cube.data, expected_data) - assert out_cube.var_name == 'tas' - assert out_cube.standard_name == 'air_temperature' - assert out_cube.units == '1' - assert out_cube.dim_coords == regular_cubes[0].dim_coords - assert out_cube.aux_coords == regular_cubes[0].aux_coords - product_a.wasderivedfrom.assert_called_once() - assert product_a.mock_ancestors == {ref_product} - - -def test_denominator_mask_threshold_cubes(regular_cubes, ref_cubes): - """Test denominator_mask_threshold argument with cubes.""" - ref_cube = ref_cubes[0] - out_cubes = bias( - regular_cubes, - ref_cube, - bias_type='relative', - denominator_mask_threshold=3.0, - ) - - assert isinstance(out_cubes, CubeList) - assert len(out_cubes) == 1 - out_cube = out_cubes[0] - - expected_data = np.ma.masked_equal([[[42.0, 42.0], - [42.0, 42.0]], - [[42.0, 42.0], - [42.0, 0.75]]], 42.0) - assert_array_equal(out_cube.data, expected_data) - assert out_cube.var_name == 'tas' - assert out_cube.standard_name == 'air_temperature' - assert out_cube.units == '1' - assert out_cube.dim_coords == regular_cubes[0].dim_coords - assert out_cube.aux_coords == regular_cubes[0].aux_coords - - -@pytest.mark.parametrize('bias_type', ['absolute', 'relative']) -def test_keep_reference_dataset(regular_cubes, ref_cubes, bias_type): - """Test denominator_mask_threshold argument.""" - products = { - PreprocessorFile(regular_cubes, 'A', {'dataset': 'a'}), - PreprocessorFile(ref_cubes, 'REF', {'reference_for_bias': True}) - } - out_products = bias( - products, bias_type=bias_type, keep_reference_dataset=True - ) - - assert isinstance(out_products, set) - out_dict = products_set_to_dict(out_products) - assert len(out_dict) == 2 - - product_ref = out_dict['REF'] - assert product_ref.filename == 'REF' - assert product_ref.attributes == {'reference_for_bias': True} - assert len(product_ref.cubes) == 1 - out_cube = product_ref.cubes[0] - expected_data = [[[2.0, 2.0], [2.0, 2.0]], 
[[2.0, 2.0], [2.0, 4.0]]] - assert_array_equal(out_cube.data, expected_data) - assert out_cube.var_name == 'tas' - assert out_cube.standard_name == 'air_temperature' - assert out_cube.units == 'K' - assert out_cube.dim_coords == ref_cubes[0].dim_coords - assert out_cube.aux_coords == ref_cubes[0].aux_coords - - -@pytest.mark.parametrize('bias_type,data,units', TEST_BIAS) -@pytest.mark.parametrize('keep_ref', [True, False]) -def test_bias_products_and_ref_cube( - regular_cubes, ref_cubes, keep_ref, bias_type, data, units -): - """Test calculation of bias with products and ref_cube given.""" - ref_cube = ref_cubes[0] - products = set([PreprocessorFile(regular_cubes, 'A', {'dataset': 'a'})]) - - out_products = bias( - products, - ref_cube=ref_cube, - bias_type=bias_type, - keep_reference_dataset=keep_ref, - ) - - assert isinstance(out_products, set) - out_dict = products_set_to_dict(out_products) - assert len(out_dict) == 1 - - product_a = out_dict['A'] - assert product_a.filename == 'A' - assert product_a.attributes == {'units': units, 'dataset': 'a'} - assert len(product_a.cubes) == 1 - out_cube = product_a.cubes[0] - assert_array_equal(out_cube.data, data) - assert out_cube.var_name == 'tas' - assert out_cube.standard_name == 'air_temperature' - assert out_cube.units == units - assert out_cube.dim_coords == regular_cubes[0].dim_coords - assert out_cube.aux_coords == regular_cubes[0].aux_coords - product_a.wasderivedfrom.assert_not_called() - assert product_a.mock_ancestors == set() - - -def test_no_reference_for_bias(regular_cubes, ref_cubes): - """Test fail when no reference_for_bias is given.""" - products = { - PreprocessorFile(regular_cubes, 'A', {}), - PreprocessorFile(regular_cubes, 'B', {}), - PreprocessorFile(ref_cubes, 'REF', {}), - } - msg = "Expected exactly 1 dataset with 'reference_for_bias: true', found 0" - with pytest.raises(ValueError, match=msg): - bias(products) - - -def test_two_references_for_bias(regular_cubes, ref_cubes): - """Test fail when two reference_for_bias products is given.""" - products = { - PreprocessorFile(regular_cubes, 'A', {'reference_for_bias': False}), - PreprocessorFile(ref_cubes, 'REF1', {'reference_for_bias': True}), - PreprocessorFile(ref_cubes, 'REF2', {'reference_for_bias': True}), - } - msg = "Expected exactly 1 dataset with 'reference_for_bias: true', found 2" - with pytest.raises(ValueError, match=msg): - bias(products) - - -def test_invalid_bias_type(regular_cubes, ref_cubes): - """Test fail when invalid bias_type is given.""" - products = { - PreprocessorFile(regular_cubes, 'A', {}), - PreprocessorFile(regular_cubes, 'B', {}), - PreprocessorFile(ref_cubes, 'REF', {'reference_for_bias': True}), - } - msg = (r"Expected one of \['absolute', 'relative'\] for bias_type, got " - r"'invalid_bias_type'") - with pytest.raises(ValueError, match=msg): - bias(products, bias_type='invalid_bias_type') - - -def test_ref_cube_non_cubes(regular_cubes): - """Test ref_cube=None with with cubes.""" - msg = ( - "A list of Cubes is given to this preprocessor; please specify a " - "`ref_cube`" - ) - with pytest.raises(ValueError, match=msg): - bias(regular_cubes) diff --git a/tests/unit/preprocessor/_compare_with_refs/test_compare_with_refs.py b/tests/unit/preprocessor/_compare_with_refs/test_compare_with_refs.py new file mode 100644 index 0000000000..7def9afe1b --- /dev/null +++ b/tests/unit/preprocessor/_compare_with_refs/test_compare_with_refs.py @@ -0,0 +1,809 @@ +"""Unit tests for :mod:`esmvalcore.preprocessor._compare_with_refs`.""" + +import contextlib + 
+import dask.array as da +import iris +import numpy as np +import pytest +from cf_units import Unit +from iris.coords import CellMeasure, CellMethod +from iris.cube import Cube, CubeList +from iris.exceptions import CoordinateNotFoundError + +from esmvalcore.preprocessor._compare_with_refs import bias, distance_metric +from tests import PreprocessorFile + + +def assert_allclose(array_1, array_2): + """Assert that (masked) array 1 is close to (masked) array 2.""" + if np.ma.is_masked(array_1) or np.ma.is_masked(array_2): + mask_1 = np.ma.getmaskarray(array_1) + mask_2 = np.ma.getmaskarray(array_2) + np.testing.assert_equal(mask_1, mask_2) + np.testing.assert_allclose(array_1[~mask_1], array_2[~mask_2]) + else: + np.testing.assert_allclose(array_1, array_2) + + +def products_set_to_dict(products): + """Convert :obj:`set` of products to :obj:`dict`.""" + new_dict = {} + for product in products: + new_dict[product.filename] = product + return new_dict + + +def get_3d_cube(data, **cube_kwargs): + """Create 3D cube.""" + time_units = Unit('days since 1850-01-01 00:00:00') + times = iris.coords.DimCoord([3.0, 7.0], + bounds=[[0.0, 6.0], [6.0, 8.0]], + standard_name='time', + var_name='time', long_name='time', + units=time_units) + lats = iris.coords.DimCoord([0.0, 10.0], standard_name='latitude', + var_name='lat', long_name='latitude', + units='degrees_north') + lons = iris.coords.DimCoord([20.0, 30.0], standard_name='longitude', + var_name='lon', long_name='longitude', + units='degrees_east') + coord_specs = [(times, 0), (lats, 1), (lons, 2)] + cube = Cube(data.astype('float32'), + dim_coords_and_dims=coord_specs, **cube_kwargs) + return cube + + +@pytest.fixture +def regular_cubes(): + """Regular cubes.""" + cube_data = np.arange(8.0).reshape(2, 2, 2) + cube = get_3d_cube( + cube_data, standard_name='air_temperature', var_name='tas', units='K' + ) + return CubeList([cube]) + + +@pytest.fixture +def ref_cubes(): + """Reference cubes.""" + cube_data = np.full((2, 2, 2), 2.0) + cube_data[1, 1, 1] = 4.0 + cube = get_3d_cube( + cube_data, standard_name='air_temperature', var_name='tas', units='K' + ) + return CubeList([cube]) + + +TEST_BIAS = [ + ('absolute', [[[-2.0, -1.0], [0.0, 1.0]], [[2.0, 3.0], [4.0, 3.0]]], 'K'), + ('relative', [[[-1.0, -0.5], [0.0, 0.5]], [[1.0, 1.5], [2.0, 0.75]]], '1'), +] + + +@pytest.mark.parametrize('bias_type,data,units', TEST_BIAS) +def test_bias_products(regular_cubes, ref_cubes, bias_type, data, units): + """Test calculation of bias with products.""" + ref_product = PreprocessorFile(ref_cubes, 'REF', + {'reference_for_bias': True}) + products = { + PreprocessorFile(regular_cubes, 'A', {'dataset': 'a'}), + PreprocessorFile(regular_cubes, 'B', {'dataset': 'b'}), + ref_product, + } + out_products = bias(products, bias_type=bias_type) + + assert isinstance(out_products, set) + out_dict = products_set_to_dict(out_products) + assert len(out_dict) == 2 + + product_a = out_dict['A'] + assert product_a.filename == 'A' + assert product_a.attributes == {'units': units, 'dataset': 'a'} + assert len(product_a.cubes) == 1 + out_cube = product_a.cubes[0] + assert out_cube.dtype == np.float32 + assert_allclose(out_cube.data, data) + assert out_cube.var_name == 'tas' + assert out_cube.standard_name == 'air_temperature' + assert out_cube.units == units + assert out_cube.dim_coords == regular_cubes[0].dim_coords + assert out_cube.aux_coords == regular_cubes[0].aux_coords + product_a.wasderivedfrom.assert_called_once() + assert product_a.mock_ancestors == {ref_product} + + product_b = 
out_dict['B'] + assert product_b.filename == 'B' + assert product_b.attributes == {'units': units, 'dataset': 'b'} + assert len(product_b.cubes) == 1 + out_cube = product_b.cubes[0] + assert out_cube.dtype == np.float32 + assert_allclose(out_cube.data, data) + assert out_cube.var_name == 'tas' + assert out_cube.standard_name == 'air_temperature' + assert out_cube.units == units + assert out_cube.dim_coords == regular_cubes[0].dim_coords + assert out_cube.aux_coords == regular_cubes[0].aux_coords + product_b.wasderivedfrom.assert_called_once() + assert product_b.mock_ancestors == {ref_product} + + +@pytest.mark.parametrize('bias_type,data,units', TEST_BIAS) +def test_bias_cubes(regular_cubes, ref_cubes, bias_type, data, units): + """Test calculation of bias with cubes.""" + ref_cube = ref_cubes[0] + out_cubes = bias(regular_cubes, ref_cube, bias_type=bias_type) + + assert isinstance(out_cubes, CubeList) + assert len(out_cubes) == 1 + out_cube = out_cubes[0] + + assert out_cube.dtype == np.float32 + assert_allclose(out_cube.data, data) + assert out_cube.var_name == 'tas' + assert out_cube.standard_name == 'air_temperature' + assert out_cube.units == units + assert out_cube.dim_coords == regular_cubes[0].dim_coords + assert out_cube.aux_coords == regular_cubes[0].aux_coords + + +TEST_BIAS_BROADCASTABLE = [ + ('absolute', [[[-2.0, -1.0], [0.0, 1.0]], [[2.0, 3.0], [4.0, 5.0]]], 'K'), + ('relative', [[[-1.0, -0.5], [0.0, 0.5]], [[1.0, 1.5], [2.0, 2.5]]], '1'), +] + + +@pytest.mark.parametrize('bias_type,data,units', TEST_BIAS_BROADCASTABLE) +def test_bias_cubes_broadcastable( + regular_cubes, ref_cubes, bias_type, data, units +): + """Test calculation of bias with cubes.""" + ref_cube = ref_cubes[0][0] # only select one time step + out_cubes = bias(regular_cubes, ref_cube, bias_type=bias_type) + + assert isinstance(out_cubes, CubeList) + assert len(out_cubes) == 1 + out_cube = out_cubes[0] + + assert out_cube.dtype == np.float32 + assert_allclose(out_cube.data, data) + assert out_cube.var_name == 'tas' + assert out_cube.standard_name == 'air_temperature' + assert out_cube.units == units + assert out_cube.dim_coords == regular_cubes[0].dim_coords + assert out_cube.aux_coords == regular_cubes[0].aux_coords + + +def test_denominator_mask_threshold_products(regular_cubes, ref_cubes): + """Test denominator_mask_threshold argument with products.""" + ref_product = PreprocessorFile(ref_cubes, 'REF', + {'reference_for_bias': True}) + products = { + PreprocessorFile(regular_cubes, 'A', {'dataset': 'a'}), + ref_product, + } + out_products = bias( + products, bias_type='relative', denominator_mask_threshold=3.0 + ) + + assert isinstance(out_products, set) + out_dict = products_set_to_dict(out_products) + assert len(out_dict) == 1 + + product_a = out_dict['A'] + assert product_a.filename == 'A' + assert product_a.attributes == {'units': '1', 'dataset': 'a'} + assert len(product_a.cubes) == 1 + out_cube = product_a.cubes[0] + assert out_cube.dtype == np.float32 + expected_data = np.ma.masked_equal([[[42.0, 42.0], + [42.0, 42.0]], + [[42.0, 42.0], + [42.0, 0.75]]], 42.0) + assert_allclose(out_cube.data, expected_data) + assert out_cube.var_name == 'tas' + assert out_cube.standard_name == 'air_temperature' + assert out_cube.units == '1' + assert out_cube.dim_coords == regular_cubes[0].dim_coords + assert out_cube.aux_coords == regular_cubes[0].aux_coords + product_a.wasderivedfrom.assert_called_once() + assert product_a.mock_ancestors == {ref_product} + + +def 
test_denominator_mask_threshold_cubes(regular_cubes, ref_cubes):
+    """Test denominator_mask_threshold argument with cubes."""
+    ref_cube = ref_cubes[0]
+    out_cubes = bias(
+        regular_cubes,
+        ref_cube,
+        bias_type='relative',
+        denominator_mask_threshold=3.0,
+    )
+
+    assert isinstance(out_cubes, CubeList)
+    assert len(out_cubes) == 1
+    out_cube = out_cubes[0]
+    assert out_cube.dtype == np.float32
+    expected_data = np.ma.masked_equal([[[42.0, 42.0],
+                                         [42.0, 42.0]],
+                                        [[42.0, 42.0],
+                                         [42.0, 0.75]]], 42.0)
+    assert_allclose(out_cube.data, expected_data)
+    assert out_cube.var_name == 'tas'
+    assert out_cube.standard_name == 'air_temperature'
+    assert out_cube.units == '1'
+    assert out_cube.dim_coords == regular_cubes[0].dim_coords
+    assert out_cube.aux_coords == regular_cubes[0].aux_coords
+
+
+@pytest.mark.parametrize('bias_type', ['absolute', 'relative'])
+def test_keep_reference_dataset(regular_cubes, ref_cubes, bias_type):
+    """Test keep_reference_dataset argument."""
+    products = {
+        PreprocessorFile(regular_cubes, 'A', {'dataset': 'a'}),
+        PreprocessorFile(ref_cubes, 'REF', {'reference_for_bias': True})
+    }
+    out_products = bias(
+        products, bias_type=bias_type, keep_reference_dataset=True
+    )
+
+    assert isinstance(out_products, set)
+    out_dict = products_set_to_dict(out_products)
+    assert len(out_dict) == 2
+
+    product_ref = out_dict['REF']
+    assert product_ref.filename == 'REF'
+    assert product_ref.attributes == {'reference_for_bias': True}
+    assert len(product_ref.cubes) == 1
+    out_cube = product_ref.cubes[0]
+    assert out_cube.dtype == np.float32
+    expected_data = [[[2.0, 2.0], [2.0, 2.0]], [[2.0, 2.0], [2.0, 4.0]]]
+    assert_allclose(out_cube.data, expected_data)
+    assert out_cube.var_name == 'tas'
+    assert out_cube.standard_name == 'air_temperature'
+    assert out_cube.units == 'K'
+    assert out_cube.dim_coords == ref_cubes[0].dim_coords
+    assert out_cube.aux_coords == ref_cubes[0].aux_coords
+
+
+@pytest.mark.parametrize('bias_type,data,units', TEST_BIAS)
+@pytest.mark.parametrize('keep_ref', [True, False])
+def test_bias_products_and_ref_cube(
+    regular_cubes, ref_cubes, keep_ref, bias_type, data, units
+):
+    """Test calculation of bias with products and ref_cube given."""
+    ref_cube = ref_cubes[0]
+    products = set([PreprocessorFile(regular_cubes, 'A', {'dataset': 'a'})])
+
+    out_products = bias(
+        products,
+        reference=ref_cube,
+        bias_type=bias_type,
+        keep_reference_dataset=keep_ref,
+    )
+
+    assert isinstance(out_products, set)
+    out_dict = products_set_to_dict(out_products)
+    assert len(out_dict) == 1
+
+    product_a = out_dict['A']
+    assert product_a.filename == 'A'
+    assert product_a.attributes == {'units': units, 'dataset': 'a'}
+    assert len(product_a.cubes) == 1
+    out_cube = product_a.cubes[0]
+    assert out_cube.dtype == np.float32
+    assert_allclose(out_cube.data, data)
+    assert out_cube.var_name == 'tas'
+    assert out_cube.standard_name == 'air_temperature'
+    assert out_cube.units == units
+    assert out_cube.dim_coords == regular_cubes[0].dim_coords
+    assert out_cube.aux_coords == regular_cubes[0].aux_coords
+    product_a.wasderivedfrom.assert_not_called()
+    assert product_a.mock_ancestors == set()
+
+
+def test_no_reference_for_bias(regular_cubes, ref_cubes):
+    """Test fail when no reference_for_bias is given."""
+    products = {
+        PreprocessorFile(regular_cubes, 'A', {}),
+        PreprocessorFile(regular_cubes, 'B', {}),
+        PreprocessorFile(ref_cubes, 'REF', {}),
+    }
+    msg = "Expected exactly 1 dataset with 'reference_for_bias: true', found 0"
+    with pytest.raises(ValueError, match=msg):
+        bias(products)
+
+
+def test_two_references_for_bias(regular_cubes, ref_cubes):
+    """Test fail when two reference_for_bias products are given."""
+    products = {
+        PreprocessorFile(regular_cubes, 'A', {'reference_for_bias': False}),
+        PreprocessorFile(ref_cubes, 'REF1', {'reference_for_bias': True}),
+        PreprocessorFile(ref_cubes, 'REF2', {'reference_for_bias': True}),
+    }
+    msg = "Expected exactly 1 dataset with 'reference_for_bias: true', found 2"
+    with pytest.raises(ValueError, match=msg):
+        bias(products)
+
+
+def test_invalid_bias_type(regular_cubes, ref_cubes):
+    """Test fail when invalid bias_type is given."""
+    products = {
+        PreprocessorFile(regular_cubes, 'A', {}),
+        PreprocessorFile(regular_cubes, 'B', {}),
+        PreprocessorFile(ref_cubes, 'REF', {'reference_for_bias': True}),
+    }
+    msg = (r"Expected one of \['absolute', 'relative'\] for bias_type, got "
+           r"'invalid_bias_type'")
+    with pytest.raises(ValueError, match=msg):
+        bias(products, bias_type='invalid_bias_type')
+
+
+def test_reference_none_cubes(regular_cubes):
+    """Test reference=None with cubes."""
+    msg = (
+        "A list of Cubes is given to this preprocessor; please specify a "
+        "`reference`"
+    )
+    with pytest.raises(ValueError, match=msg):
+        bias(regular_cubes)
+
+
+TEST_DISTANCE_METRICS = [
+    ('rmse', 2.34520788, 0.0, 'RMSE', 'rmse_tas', 'K'),
+    ('weighted_rmse', 2.0, 0.0, 'RMSE', 'rmse_tas', 'K'),
+    ('pearsonr', 0.57735026, 1.0, "Pearson's r", 'pearsonr_tas', '1'),
+    ('weighted_pearsonr', np.nan, 1.0, "Pearson's r", 'pearsonr_tas', '1'),
+    ('emd', 1.98625, 0.0, 'EMD', 'emd_tas', 'K'),
+    ('weighted_emd', 0.9975, 0.0, 'EMD', 'emd_tas', 'K'),
+]
+AREA_WEIGHTS = CellMeasure(
+    np.array([0.0, 0.0, 2.0, 0.0]).reshape(2, 2),
+    standard_name='cell_area',
+    units='m2',
+)
+
+
+@pytest.mark.parametrize(
+    'metric,data,ref_data,long_name,var_name,units', TEST_DISTANCE_METRICS
+)
+def test_distance_metric(
+    regular_cubes,
+    ref_cubes,
+    metric,
+    data,
+    ref_data,
+    long_name,
+    var_name,
+    units,
+):
+    """Test `distance_metric`."""
+    regular_cubes[0].add_cell_measure(AREA_WEIGHTS, (1, 2))
+    ref_product = PreprocessorFile(
+        ref_cubes, 'REF', {'reference_for_metric': True}
+    )
+    products = {
+        PreprocessorFile(regular_cubes, 'A', {'dataset': 'a'}),
+        PreprocessorFile(regular_cubes, 'B', {'dataset': 'b'}),
+        ref_product,
+    }
+
+    out_products = distance_metric(products, metric)
+
+    assert isinstance(out_products, set)
+    out_dict = products_set_to_dict(out_products)
+    assert len(out_dict) == 3
+    expected_attrs = {
+        'standard_name': None,
+        'long_name': long_name,
+        'short_name': var_name,
+        'units': units,
+    }
+
+    product_a = out_dict['A']
+    assert product_a.filename == 'A'
+    assert product_a.attributes == {'dataset': 'a', **expected_attrs}
+    assert len(product_a.cubes) == 1
+    out_cube = product_a.cubes[0]
+    assert out_cube.shape == ()
+    assert out_cube.dtype == np.float32
+    assert_allclose(out_cube.data, np.array(data, dtype=np.float32))
+    assert out_cube.var_name == var_name
+    assert out_cube.long_name == long_name
+    assert out_cube.standard_name is None
+    assert out_cube.units == units
+    assert out_cube.cell_methods == (
+        CellMethod(metric, ['time', 'latitude', 'longitude']),
+    )
+    product_a.wasderivedfrom.assert_called_once()
+    assert product_a.mock_ancestors == {ref_product}
+
+    product_b = out_dict['B']
+    assert product_b.filename == 'B'
+    assert product_b.attributes == {'dataset': 'b', **expected_attrs}
+    assert len(product_b.cubes) == 1
+    out_cube = product_b.cubes[0]
+    assert out_cube.shape == ()
+
assert out_cube.dtype == np.float32 + assert_allclose(out_cube.data, np.array(data, dtype=np.float32)) + assert out_cube.var_name == var_name + assert out_cube.long_name == long_name + assert out_cube.standard_name is None + assert out_cube.units == units + assert out_cube.cell_methods == ( + CellMethod(metric, ['time', 'latitude', 'longitude']), + ) + product_b.wasderivedfrom.assert_called_once() + assert product_b.mock_ancestors == {ref_product} + + product_ref = out_dict['REF'] + assert product_ref.filename == 'REF' + assert product_ref.attributes == { + 'reference_for_metric': True, **expected_attrs + } + assert len(product_ref.cubes) == 1 + out_cube = product_ref.cubes[0] + assert out_cube.shape == () + assert out_cube.dtype == np.float32 + assert_allclose(out_cube.data, ref_data) + assert out_cube.var_name == var_name + assert out_cube.long_name == long_name + assert out_cube.standard_name is None + assert out_cube.units == units + assert out_cube.cell_methods == ( + CellMethod(metric, ['time', 'latitude', 'longitude']), + ) + product_ref.wasderivedfrom.assert_not_called() + assert product_ref.mock_ancestors == set() + + +TEST_DISTANCE_METRICS_LAZY = [ + ('rmse', [1.224744871, 3.082207001], 'RMSE', 'rmse_tas', 'K'), + ('weighted_rmse', [1.2278657, 3.0784798], 'RMSE', 'rmse_tas', 'K'), + ('pearsonr', [np.nan, 0.77459663], "Pearson's r", 'pearsonr_tas', '1'), + ( + 'weighted_pearsonr', + [np.nan, 0.7745946], + "Pearson's r", + 'pearsonr_tas', + '1', + ), + ('emd', [0.98, 2.9925], 'EMD', 'emd_tas', 'K'), + ('weighted_emd', [0.9837506, 2.9888833], 'EMD', 'emd_tas', 'K'), +] + + +@pytest.mark.parametrize( + 'metric,data,long_name,var_name,units', TEST_DISTANCE_METRICS_LAZY +) +def test_distance_metric_lazy( + regular_cubes, ref_cubes, metric, data, long_name, var_name, units +): + """Test `distance_metric` with lazy data.""" + regular_cubes[0].data = da.array(regular_cubes[0].data) + ref_cubes[0].data = da.array(ref_cubes[0].data) + ref_product = PreprocessorFile( + ref_cubes, 'REF', {'reference_for_metric': True} + ) + products = { + PreprocessorFile(regular_cubes, 'A', {'dataset': 'a'}), + ref_product, + } + + out_products = distance_metric( + products, + metric, + coords=['latitude', 'longitude'], + keep_reference_dataset=False, + ) + + assert isinstance(out_products, set) + out_dict = products_set_to_dict(out_products) + assert len(out_dict) == 1 + + product_a = out_dict['A'] + assert product_a.filename == 'A' + assert product_a.attributes == { + 'dataset': 'a', + 'standard_name': None, + 'long_name': long_name, + 'short_name': var_name, + 'units': units, + } + assert len(product_a.cubes) == 1 + out_cube = product_a.cubes[0] + assert out_cube.shape == (2,) + assert out_cube.dtype == np.float32 + assert out_cube.has_lazy_data() + assert_allclose( + out_cube.data, + np.ma.masked_invalid(np.array(data, dtype=np.float32)), + ) + assert out_cube.coord('time') == regular_cubes[0].coord('time') + assert out_cube.var_name == var_name + assert out_cube.long_name == long_name + assert out_cube.standard_name is None + assert out_cube.units == units + assert out_cube.cell_methods == ( + CellMethod(metric, ['latitude', 'longitude']), + ) + product_a.wasderivedfrom.assert_called_once() + assert product_a.mock_ancestors == {ref_product} + + +@pytest.mark.parametrize( + 'metric,data,_,long_name,var_name,units', TEST_DISTANCE_METRICS +) +def test_distance_metric_cubes( + regular_cubes, ref_cubes, metric, data, _, long_name, var_name, units +): + """Test `distance_metric` with cubes.""" + 
regular_cubes[0].add_cell_measure(AREA_WEIGHTS, (1, 2)) + out_cubes = distance_metric(regular_cubes, metric, reference=ref_cubes[0]) + + assert isinstance(out_cubes, CubeList) + assert len(out_cubes) == 1 + out_cube = out_cubes[0] + + assert out_cube.shape == () + assert out_cube.dtype == np.float32 + assert_allclose(out_cube.data, np.array(data, dtype=np.float32)) + assert out_cube.var_name == var_name + assert out_cube.long_name == long_name + assert out_cube.standard_name is None + assert out_cube.units == units + assert out_cube.cell_methods == ( + CellMethod(metric, ['time', 'latitude', 'longitude']), + ) + + +@pytest.mark.parametrize('lazy', [True, False]) +@pytest.mark.parametrize( + 'metric,data,_,long_name,var_name,units', TEST_DISTANCE_METRICS +) +def test_distance_metric_masked_data( + regular_cubes, ref_cubes, metric, data, _, long_name, var_name, units, lazy +): + """Test `distance_metric` with masked data.""" + # Test cube + time_units = Unit('days since 1850-01-01 00:00:00') + times = iris.coords.DimCoord([3.0, 7.0, 9.0], + bounds=[[0.0, 6.0], [6.0, 8.0], [8.0, 10.0]], + standard_name='time', + var_name='time', long_name='time', + units=time_units) + lats = regular_cubes[0].coord('latitude') + lons = regular_cubes[0].coord('longitude') + coord_specs = [(times, 0), (lats, 1), (lons, 2)] + cube_data = np.pad( + regular_cubes[0].data, + ((0, 1), (0, 0), (0, 0)), + 'constant', + constant_values=np.nan, + ) + cube = Cube( + np.ma.masked_invalid(cube_data), dim_coords_and_dims=coord_specs + ) + cube.metadata = regular_cubes[0].metadata + cube.add_cell_measure(AREA_WEIGHTS, (1, 2)) + + # Ref cube + ref_cube = cube.copy() + ref_data = np.pad( + ref_cubes[0].data, + ((0, 1), (0, 0), (0, 0)), + 'constant', + constant_values=np.nan, + ) + ref_cube.data = np.ma.masked_invalid(ref_data) + ref_cube.metadata = ref_cubes[0].metadata + + if lazy: + cube.data = da.array(cube.data) + ref_cube.data = da.array(ref_cube.data) + + out_cubes = distance_metric([cube], metric, reference=ref_cube) + + assert isinstance(out_cubes, CubeList) + assert len(out_cubes) == 1 + out_cube = out_cubes[0] + + assert out_cube.shape == () + if lazy: + assert out_cube.has_lazy_data() + else: + assert not out_cube.has_lazy_data() + assert out_cube.dtype == np.float32 + + # Mask handling differs for dask and numpy + if np.isnan(data) and not lazy: + expected_data = np.ma.masked_invalid(data) + else: + expected_data = np.array(data, dtype=np.float32) + assert_allclose(out_cube.data, expected_data) + assert out_cube.var_name == var_name + assert out_cube.long_name == long_name + assert out_cube.standard_name is None + assert out_cube.units == units + assert out_cube.cell_methods == ( + CellMethod(metric, ['time', 'latitude', 'longitude']), + ) + + +@pytest.mark.parametrize('lazy', [True, False]) +@pytest.mark.parametrize( + 'metric,_,__,long_name,var_name,units', TEST_DISTANCE_METRICS +) +def test_distance_metric_fully_masked_data( + regular_cubes, ref_cubes, metric, _, __, long_name, var_name, units, lazy +): + """Test `distance_metric` with fully_masked data.""" + cube = regular_cubes[0] + cube.data = np.ma.masked_invalid(np.full(cube.shape, np.nan)) + cube.add_cell_measure(AREA_WEIGHTS, (1, 2)) + ref_cube = ref_cubes[0] + + if lazy: + cube.data = da.array(cube.data) + ref_cube.data = da.array(ref_cube.data) + + out_cubes = distance_metric([cube], metric, reference=ref_cube) + + assert isinstance(out_cubes, CubeList) + assert len(out_cubes) == 1 + out_cube = out_cubes[0] + + assert out_cube.shape == () + if lazy: + 
+        assert out_cube.has_lazy_data()
+    else:
+        assert not out_cube.has_lazy_data()
+    assert out_cube.dtype == np.float64
+
+    expected_data = np.ma.masked_all(())
+    assert_allclose(out_cube.data, expected_data)
+    assert out_cube.var_name == var_name
+    assert out_cube.long_name == long_name
+    assert out_cube.standard_name is None
+    assert out_cube.units == units
+    assert out_cube.cell_methods == (
+        CellMethod(metric, ['time', 'latitude', 'longitude']),
+    )
+
+
+TEST_METRICS = [
+    'rmse',
+    'weighted_rmse',
+    'pearsonr',
+    'weighted_pearsonr',
+    'emd',
+    'weighted_emd',
+]
+
+
+@pytest.mark.parametrize('metric', TEST_METRICS)
+def test_no_reference_for_metric(regular_cubes, ref_cubes, metric):
+    """Test fail when no reference_for_metric is given."""
+    products = {
+        PreprocessorFile(regular_cubes, 'A', {}),
+        PreprocessorFile(regular_cubes, 'B', {}),
+        PreprocessorFile(ref_cubes, 'REF', {}),
+    }
+    msg = (
+        "Expected exactly 1 dataset with 'reference_for_metric: true', found 0"
+    )
+    with pytest.raises(ValueError, match=msg):
+        distance_metric(products, metric)
+
+
+@pytest.mark.parametrize('metric', TEST_METRICS)
+def test_two_references_for_metric(regular_cubes, ref_cubes, metric):
+    """Test fail when two reference_for_metric products are given."""
+    products = {
+        PreprocessorFile(regular_cubes, 'A', {'reference_for_metric': False}),
+        PreprocessorFile(ref_cubes, 'REF1', {'reference_for_metric': True}),
+        PreprocessorFile(ref_cubes, 'REF2', {'reference_for_metric': True}),
+    }
+    msg = (
+        "Expected exactly 1 dataset with 'reference_for_metric: true', found 2"
+    )
+    with pytest.raises(ValueError, match=msg):
+        distance_metric(products, 'rmse')
+
+
+def test_invalid_metric(regular_cubes, ref_cubes):
+    """Test fail when invalid metric is given."""
+    products = {
+        PreprocessorFile(regular_cubes, 'A', {}),
+        PreprocessorFile(regular_cubes, 'B', {}),
+        PreprocessorFile(ref_cubes, 'REF', {'reference_for_metric': True}),
+    }
+    msg = (
+        r"Expected one of \['rmse', 'weighted_rmse', 'pearsonr', "
+        r"'weighted_pearsonr', 'emd', 'weighted_emd'\] for metric, got "
+        r"'invalid'"
+    )
+    with pytest.raises(ValueError, match=msg):
+        distance_metric(products, 'invalid')
+
+
+@pytest.mark.parametrize('metric', TEST_METRICS)
+def test_distance_metric_reference_none_cubes(regular_cubes, metric):
+    """Test distance metric with reference=None and cubes."""
+    msg = (
+        "A list of Cubes is given to this preprocessor; please specify a "
+        "`reference`"
+    )
+    with pytest.raises(ValueError, match=msg):
+        distance_metric(regular_cubes, metric)
+
+
+@pytest.mark.parametrize('metric', TEST_METRICS)
+def test_distance_metric_no_named_dimensions(metric):
+    """Test distance metric with a cube that has unnamed dimensions."""
+    ref_cube = Cube([0, 1])
+    cubes = CubeList([ref_cube])
+    msg = (
+        "If coords=None is specified, the cube .* must not have unnamed "
+        "dimensions"
+    )
+    with pytest.raises(ValueError, match=msg):
+        distance_metric(cubes, metric, reference=ref_cube)
+
+
+@pytest.mark.parametrize('metric', TEST_METRICS)
+def test_distance_metric_non_matching_shapes(regular_cubes, metric):
+    """Test distance metric with cubes of different shapes."""
+    ref_cube = Cube(0)
+    msg = (
+        r"Expected identical shapes of cube and reference cube for distance "
+        r"metric calculation, got \(2, 2, 2\) and \(\), respectively"
+    )
+    with pytest.raises(ValueError, match=msg):
+        distance_metric(regular_cubes, metric, reference=ref_cube)
+
+
+@pytest.mark.parametrize('metric', TEST_METRICS)
+def
test_distance_metric_non_matching_dims(regular_cubes, metric): + """Test distance metric with cubes with difference dimensions.""" + ref_cube = regular_cubes[0].copy() + ref_cube.remove_coord('time') + new_coord = iris.coords.DimCoord([0.0, 1.0], var_name='not_time') + ref_cube.add_dim_coord(new_coord, 0) + msg = "Cannot calculate distance metric between cube and reference cube" + with pytest.raises(ValueError, match=msg): + distance_metric(regular_cubes, metric, reference=ref_cube) + + +@pytest.mark.parametrize( + 'metric,error', + [ + ('rmse', False), + ('weighted_rmse', True), + ('pearsonr', False), + ('weighted_pearsonr', True), + ('emd', False), + ('weighted_emd', True), + ] +) +def test_distance_metric_no_lon_for_area_weights(regular_cubes, metric, error): + """Test distance metric with cubes that have no longitude.""" + regular_cubes[0].remove_coord('longitude') + ref_cube = regular_cubes[0].copy() + msg = ( + r"Cube .* needs a `longitude` coordinate to calculate cell area " + r"weights for weighted distance metric over coordinates \['time', " + r"'latitude'\] \(alternatively, a `cell_area` can be given to the " + r"cube as supplementary variable\)" + ) + if error: + context = pytest.raises(CoordinateNotFoundError, match=msg) + else: + context = contextlib.nullcontext() + with context: + distance_metric( + regular_cubes, + metric, + reference=ref_cube, + coords=['time', 'latitude'] + ) diff --git a/tests/unit/preprocessor/_other/test_other.py b/tests/unit/preprocessor/_other/test_other.py index 803b66d997..e5d6a871e1 100644 --- a/tests/unit/preprocessor/_other/test_other.py +++ b/tests/unit/preprocessor/_other/test_other.py @@ -6,15 +6,22 @@ import iris.coord_categorisation import iris.coords import numpy as np +import pytest from cf_units import Unit +from iris.aux_factory import AtmosphereSigmaFactory +from iris.coords import ( + AncillaryVariable, + AuxCoord, + CellMeasure, + CellMethod, + DimCoord, +) from iris.cube import Cube from numpy.testing import assert_array_equal -from esmvalcore.preprocessor import PreprocessorFile -from esmvalcore.preprocessor._other import ( - _group_products, - clip, - get_array_module, +from esmvalcore.preprocessor._other import clip, histogram +from tests.unit.preprocessor._compare_with_refs.test_compare_with_refs import ( + get_3d_cube, ) @@ -50,45 +57,376 @@ def test_clip(self): clip(cube, 10, 8) -def test_group_products_string_list(): - products = [ - PreprocessorFile( - filename='A_B.nc', - attributes={ - 'project': 'A', - 'dataset': 'B', - }, - ), - PreprocessorFile( - filename='A_C.nc', - attributes={ - 'project': 'A', - 'dataset': 'C', - } - ), +@pytest.fixture +def cube(): + """Regular cube.""" + cube_data = np.ma.masked_inside( + np.arange(8.0, dtype=np.float32).reshape(2, 2, 2), 1.5, 3.5 + ) + cube_data = np.swapaxes(cube_data, 0, -1) + cube = get_3d_cube( + cube_data, standard_name='air_temperature', var_name='tas', units='K' + ) + return cube + + +def assert_metadata(cube, normalization=None): + """Assert correct metadata.""" + assert cube.standard_name is None + if normalization == 'sum': + assert cube.long_name == 'Relative Frequency' + assert cube.var_name == 'relative_frequency_tas' + assert cube.units == '1' + elif normalization == 'integral': + assert cube.long_name == 'Density' + assert cube.var_name == 'density_tas' + assert cube.units == 'K-1' + else: + assert cube.long_name == 'Frequency' + assert cube.var_name == 'frequency_tas' + assert cube.units == '1' + assert cube.attributes == {} + assert cube.cell_methods == () + 
+    assert cube.coords('air_temperature')
+    bin_coord = cube.coord('air_temperature')
+    assert bin_coord.standard_name == 'air_temperature'
+    assert bin_coord.var_name == 'tas'
+    assert bin_coord.long_name is None
+    assert bin_coord.units == 'K'
+    assert bin_coord.attributes == {}
+
+
+@pytest.mark.parametrize('lazy', [False, True])
+def test_histogram_defaults(cube, lazy):
+    """Test `histogram` with default arguments."""
+    if lazy:
+        cube.data = cube.lazy_data()
+    input_cube = cube.copy()
+
+    result = histogram(input_cube)
+
+    assert input_cube == cube
+    assert_metadata(result)
+    assert result.shape == (10,)
+    if lazy:
+        assert result.has_lazy_data()
+    else:
+        assert not result.has_lazy_data()
+    assert result.dtype == np.float32
+    np.testing.assert_allclose(
+        result.data, [1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0]
+    )
+    np.testing.assert_allclose(result.data.mask, [False] * 10)
+    bin_coord = result.coord('air_temperature')
+    assert bin_coord.shape == (10,)
+    assert bin_coord.dtype == np.float64
+    assert bin_coord.bounds_dtype == np.float64
+    np.testing.assert_allclose(
+        bin_coord.points,
+        [0.35, 1.05, 1.75, 2.45, 3.15, 3.85, 4.55, 5.25, 5.95, 6.65],
+    )
+    np.testing.assert_allclose(
+        bin_coord.bounds,
+        [
+            [0.0, 0.7],
+            [0.7, 1.4],
+            [1.4, 2.1],
+            [2.1, 2.8],
+            [2.8, 3.5],
+            [3.5, 4.2],
+            [4.2, 4.9],
+            [4.9, 5.6],
+            [5.6, 6.3],
+            [6.3, 7.0],
+        ],
+    )
+
+
+@pytest.mark.parametrize('normalization', [None, 'sum', 'integral'])
+@pytest.mark.parametrize('weights', [False, None])
+@pytest.mark.parametrize('lazy', [False, True])
+def test_histogram_over_time(cube, lazy, weights, normalization):
+    """Test `histogram` over the time dimension."""
+    if lazy:
+        cube.data = cube.lazy_data()
+    input_cube = cube.copy()
+
+    result = histogram(
+        input_cube,
+        coords=['time'],
+        bins=[4.5, 6.5, 8.5, 10.5],
+        bin_range=(4.5, 10.5),
+        weights=weights,
+        normalization=normalization,
+    )
+
+    assert input_cube == cube
+    assert_metadata(result, normalization=normalization)
+    assert result.coord('latitude') == input_cube.coord('latitude')
+    assert result.coord('longitude') == input_cube.coord('longitude')
+    assert result.shape == (2, 2, 3)
+    if lazy:
+        assert result.has_lazy_data()
+    else:
+        assert not result.has_lazy_data()
+    assert result.dtype == np.float32
+    if normalization == 'integral':
+        expected_data = np.ma.masked_invalid([
+            [[np.nan, np.nan, np.nan], [0.5, 0.0, 0.0]],
+            [[np.nan, np.nan, np.nan], [0.25, 0.25, 0.0]],
+        ])
+    elif normalization == 'sum':
+        expected_data = np.ma.masked_invalid([
+            [[np.nan, np.nan, np.nan], [1.0, 0.0, 0.0]],
+            [[np.nan, np.nan, np.nan], [0.5, 0.5, 0.0]],
+        ])
+    else:
+        expected_data = np.ma.masked_invalid([
+            [[np.nan, np.nan, np.nan], [1.0, 0.0, 0.0]],
+            [[np.nan, np.nan, np.nan], [1.0, 1.0, 0.0]],
+        ])
+    np.testing.assert_allclose(result.data, expected_data)
+    np.testing.assert_allclose(result.data.mask, expected_data.mask)
+    bin_coord = result.coord('air_temperature')
+    assert bin_coord.shape == (3,)
+    assert bin_coord.dtype == np.float64
+    assert bin_coord.bounds_dtype == np.float64
+    np.testing.assert_allclose(bin_coord.points, [5.5, 7.5, 9.5])
+    np.testing.assert_allclose(
+        bin_coord.bounds, [[4.5, 6.5], [6.5, 8.5], [8.5, 10.5]],
+    )
+
+
+@pytest.mark.parametrize('normalization', [None, 'sum', 'integral'])
+@pytest.mark.parametrize('lazy', [False, True])
+def test_histogram_fully_masked(cube, lazy, normalization):
+    """Test `histogram` with fully masked data."""
+    cube.data = np.ma.masked_all((2, 2, 2), dtype=np.float32)
+    if lazy:
+        cube.data = cube.lazy_data()
+
+    result = histogram(cube, bin_range=(0, 10), normalization=normalization)
+
+    assert_metadata(result, normalization=normalization)
+    assert result.shape == (10,)
+    if lazy:
+        assert result.has_lazy_data()
+    else:
+        assert not result.has_lazy_data()
+    assert result.dtype == np.float32
+    np.testing.assert_allclose(result.data, np.ma.masked_all(10,))
+    np.testing.assert_equal(result.data.mask, [True] * 10)
+    bin_coord = result.coord('air_temperature')
+    assert bin_coord.shape == (10,)
+    assert bin_coord.dtype == np.float64
+    assert bin_coord.bounds_dtype == np.float64
+    np.testing.assert_allclose(
+        bin_coord.points,
+        [0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5],
+    )
+    np.testing.assert_allclose(
+        bin_coord.bounds,
+        [
+            [0.0, 1.0],
+            [1.0, 2.0],
+            [2.0, 3.0],
+            [3.0, 4.0],
+            [4.0, 5.0],
+            [5.0, 6.0],
+            [6.0, 7.0],
+            [7.0, 8.0],
+            [8.0, 9.0],
+            [9.0, 10.0],
+        ],
+    )
+
+
+@pytest.mark.parametrize('normalization', [None, 'sum', 'integral'])
+@pytest.mark.parametrize(
+    'weights',
+    [
+        True,
+        np.array([[[6, 6], [6, 6]], [[2, 2], [2, 2]]]),
+        da.array([[[6, 6], [6, 6]], [[2, 2], [2, 2]]]),
     ]
-    grouped_by_string = _group_products(products, 'project')
-    grouped_by_list = _group_products(products, ['project'])
+)
+@pytest.mark.parametrize('lazy', [False, True])
+def test_histogram_weights(cube, lazy, weights, normalization):
+    """Test `histogram` with weights."""
+    if lazy:
+        cube.data = cube.lazy_data()
+    input_cube = cube.copy()
+
+    result = histogram(
+        input_cube,
+        coords=['time', 'longitude'],
+        bins=[0.0, 2.0, 4.0, 8.0],
+        weights=weights,
+        normalization=normalization,
+    )
+
+    assert input_cube == cube
+    assert_metadata(result, normalization=normalization)
+    assert result.coord('latitude') == input_cube.coord('latitude')
+    assert result.shape == (2, 3)
+    if lazy:
+        assert result.has_lazy_data()
+    else:
+        assert not result.has_lazy_data()
+    assert result.dtype == np.float32
+    if normalization == 'integral':
+        expected_data = np.ma.masked_invalid(
+            [[0.25, 0.0, 0.125], [0.0, 0.0, 0.25]]
+        )
+    elif normalization == 'sum':
+        expected_data = np.ma.masked_invalid(
+            [[0.5, 0.0, 0.5], [0.0, 0.0, 1.0]]
+        )
+    else:
+        expected_data = np.ma.masked_invalid(
+            [[8.0, 0.0, 8.0], [0.0, 0.0, 8.0]]
+        )
+    np.testing.assert_allclose(result.data, expected_data)
+    np.testing.assert_allclose(result.data.mask, expected_data.mask)
+    bin_coord = result.coord('air_temperature')
+    assert bin_coord.shape == (3,)
+    assert bin_coord.dtype == np.float64
+    assert bin_coord.bounds_dtype == np.float64
+    np.testing.assert_allclose(bin_coord.points, [1.0, 3.0, 6.0])
+    np.testing.assert_allclose(
+        bin_coord.bounds, [[0.0, 2.0], [2.0, 4.0], [4.0, 8.0]],
+    )
+
 
-    assert grouped_by_list == grouped_by_string
+@pytest.fixture
+def cube_with_rich_metadata():
+    """Cube with rich metadata."""
+    time = DimCoord([0], bounds=[[-1, 1]], var_name='time', units='s')
+    sigma = DimCoord([0], var_name='sigma', units='1')
+    lat = DimCoord([0], var_name='lat', units='degrees')
+    lon = DimCoord([0], var_name='lon', units='degrees')
+    ptop = AuxCoord(0, var_name='ptop', units='Pa')
+    psur = AuxCoord([[0]], var_name='ps', units='Pa')
+    sigma_factory = AtmosphereSigmaFactory(ptop, sigma, psur)
+    cell_area = CellMeasure([[1]], var_name='area', units='m2', measure='area')
+    anc = AncillaryVariable([0], var_name='anc')
+    cube = Cube(
+        np.ones((1, 1, 1, 1), dtype=np.float32),
+        standard_name=None,
+        long_name='Air Temperature',
+        var_name=None,
+        units='K',
+        attributes={'test': '1'},
+        cell_methods=(CellMethod('point', 'sigma'),),
+        dim_coords_and_dims=[(time, 0), (sigma, 1), (lat, 2), (lon, 3)],
+        aux_coords_and_dims=[(ptop, ()), (psur, (2, 3))],
aux_factories=[sigma_factory], + ancillary_variables_and_dims=[(anc, 1)], + cell_measures_and_dims=[(cell_area, (2, 3))], + ) + return cube -def test_get_array_module_da(): +@pytest.mark.parametrize('normalization', [None, 'sum', 'integral']) +@pytest.mark.parametrize('weights', [True, False, None]) +@pytest.mark.parametrize('lazy', [False, True]) +def test_histogram_metadata( + cube_with_rich_metadata, lazy, weights, normalization +): + """Test `histogram`.""" + if lazy: + cube_with_rich_metadata.data = cube_with_rich_metadata.lazy_data() + input_cube = cube_with_rich_metadata.copy() + + result = histogram( + input_cube, + coords=['time'], + bins=[0.0, 1.0, 2.0], + bin_range=(0.0, 2.0), + weights=weights, + normalization=normalization, + ) + + assert input_cube == cube_with_rich_metadata + assert result.shape == (1, 1, 1, 2) + + assert result.standard_name is None + if normalization == 'sum': + assert result.long_name == 'Relative Frequency of Air Temperature' + assert result.var_name == 'relative_frequency' + assert result.units == '1' + elif normalization == 'integral': + assert result.long_name == 'Density of Air Temperature' + assert result.var_name == 'density' + assert result.units == 'K-1' + else: + assert result.long_name == 'Frequency of Air Temperature' + assert result.var_name == 'frequency' + assert result.units == '1' + assert result.attributes == {'test': '1'} + assert result.cell_methods == (CellMethod('point', 'sigma'),) + + assert not result.coords('time', dim_coords=True) + for dim_coord in ('sigma', 'lat', 'lon'): + assert ( + result.coord(dim_coord, dim_coords=True) == + input_cube.coord(dim_coord, dim_coords=True) + ) + assert ( + result.coord_dims(dim_coord) == + (input_cube.coord_dims(dim_coord)[0] - 1,) + ) + assert result.coords('Air Temperature', dim_coords=True) + bin_coord = result.coord('Air Temperature') + assert result.coord_dims(bin_coord) == (3,) + assert bin_coord.standard_name is None + assert bin_coord.long_name == 'Air Temperature' + assert bin_coord.var_name is None + assert bin_coord.units == 'K' + assert bin_coord.attributes == {} + + assert result.coords('time', dim_coords=False) + assert result.coord_dims('time') == () + assert result.coord('ptop') == input_cube.coord('ptop') + assert result.coord('ps') == input_cube.coord('ps') + assert len(result.aux_factories) == 1 + assert isinstance(result.aux_factories[0], AtmosphereSigmaFactory) + assert result.ancillary_variables() == input_cube.ancillary_variables() + assert result.cell_measures() == input_cube.cell_measures() - npx = get_array_module(da.array([1, 2])) - assert npx is da +@pytest.mark.parametrize('lazy', [False, True]) +def test_histogram_fully_masked_no_bin_range(cube, lazy): + """Test `histogram`.""" + cube.data = np.ma.masked_all((2, 2, 2), dtype=np.float32) + if lazy: + cube.data = cube.lazy_data() -def test_get_array_module_np(): + msg = ( + r"Cannot calculate histogram for bin_range=\(masked, masked\) \(or " + r"for fully masked data when `bin_range` is not given\)" + ) + with pytest.raises(ValueError, match=msg): + histogram(cube) - npx = get_array_module(np.array([1, 2])) - assert npx is np +def test_histogram_invalid_bins(cube): + """Test `histogram`.""" + msg = ( + r"bins cannot be a str \(got 'auto'\), must be int or Sequence of int" + ) + with pytest.raises(TypeError, match=msg): + histogram(cube, bins='auto') -def test_get_array_module_mixed(): - npx = get_array_module(da.array([1]), np.array([1])) - assert npx is da +def test_histogram_invalid_normalization(cube): + """Test 
`histogram`.""" + msg = ( + r"Expected one of \(None, 'sum', 'integral'\) for normalization, got " + r"'invalid'" + ) + with pytest.raises(ValueError, match=msg): + histogram(cube, normalization='invalid') if __name__ == '__main__': diff --git a/tests/unit/preprocessor/test_shared.py b/tests/unit/preprocessor/test_shared.py index f931f449a9..4f70b2b419 100644 --- a/tests/unit/preprocessor/test_shared.py +++ b/tests/unit/preprocessor/test_shared.py @@ -6,13 +6,20 @@ import iris.analysis import numpy as np import pytest +from cf_units import Unit +from iris.coords import AuxCoord from iris.cube import Cube from esmvalcore.exceptions import ESMValCoreDeprecationWarning +from esmvalcore.preprocessor import PreprocessorFile from esmvalcore.preprocessor._shared import ( + _get_area_weights, + _group_products, aggregator_accept_weights, + get_array_module, get_iris_aggregator, preserve_float_dtype, + try_adding_calculated_cell_area, ) @@ -236,3 +243,107 @@ def test_preserve_float_dtype(data, dtype): assert _dummy_func.__name__ == '_dummy_func' signature = inspect.signature(_dummy_func) assert list(signature.parameters) == ['obj', 'arg', 'kwarg'] + + +def test_get_array_module_da(): + npx = get_array_module(da.array([1, 2])) + assert npx is da + + +def test_get_array_module_np(): + npx = get_array_module(np.array([1, 2])) + assert npx is np + + +def test_get_array_module_mixed(): + npx = get_array_module(da.array([1]), np.array([1])) + assert npx is da + + +def _create_sample_full_cube(): + cube = Cube(np.zeros((4, 180, 360)), var_name='co2', units='J') + cube.add_dim_coord( + iris.coords.DimCoord( + np.array([10., 40., 70., 110.]), + standard_name='time', + units=Unit('days since 1950-01-01 00:00:00', calendar='gregorian'), + ), + 0, + ) + cube.add_dim_coord( + iris.coords.DimCoord( + np.arange(-90., 90., 1.), + standard_name='latitude', + units='degrees', + ), + 1, + ) + cube.add_dim_coord( + iris.coords.DimCoord( + np.arange(0., 360., 1.), + standard_name='longitude', + units='degrees', + ), + 2, + ) + + cube.coord("time").guess_bounds() + cube.coord("longitude").guess_bounds() + cube.coord("latitude").guess_bounds() + + return cube + + +@pytest.mark.parametrize('lazy', [True, False]) +def test_get_area_weights(lazy): + """Test _get_area_weights.""" + cube = _create_sample_full_cube() + if lazy: + cube.data = cube.lazy_data() + weights = _get_area_weights(cube) + if lazy: + assert isinstance(weights, da.Array) + assert weights.chunks == cube.lazy_data().chunks + else: + assert isinstance(weights, np.ndarray) + np.testing.assert_allclose( + weights, iris.analysis.cartography.area_weights(cube) + ) + + +def test_group_products_string_list(): + products = [ + PreprocessorFile( + filename='A_B.nc', + attributes={ + 'project': 'A', + 'dataset': 'B', + }, + ), + PreprocessorFile( + filename='A_C.nc', + attributes={ + 'project': 'A', + 'dataset': 'C', + } + ), + ] + grouped_by_string = _group_products(products, 'project') + grouped_by_list = _group_products(products, ['project']) + + assert grouped_by_list == grouped_by_string + + +def test_try_adding_calculated_cell_area(): + """Test ``try_adding_calculated_cell_area``.""" + cube = _create_sample_full_cube() + cube.coord('latitude').rename('grid_latitude') + cube.coord('longitude').rename('grid_longitude') + lat = AuxCoord(np.zeros((180, 360)), standard_name='latitude') + lon = AuxCoord(np.zeros((180, 360)), standard_name='longitude') + cube.add_aux_coord(lat, (1, 2)) + cube.add_aux_coord(lon, (1, 2)) + + try_adding_calculated_cell_area(cube) + + 
assert cube.cell_measures('cell_area') diff --git a/tests/unit/test_iris_helpers.py b/tests/unit/test_iris_helpers.py index a9abafc15b..e7b18e4c67 100644 --- a/tests/unit/test_iris_helpers.py +++ b/tests/unit/test_iris_helpers.py @@ -303,7 +303,8 @@ def test_rechunk_cube_fully_lazy(cube_3d): assert result.ancillary_variable('anc_var').core_data().chunksize == (3, 2) -def test_rechunk_cube_partly_lazy(cube_3d): +@pytest.mark.parametrize('complete_dims', [['x', 'y'], ['xy']]) +def test_rechunk_cube_partly_lazy(cube_3d, complete_dims): """Test ``rechunk_cube``.""" input_cube = cube_3d.copy() @@ -313,7 +314,7 @@ def test_rechunk_cube_partly_lazy(cube_3d): input_cube.coord('xyz').bounds input_cube.cell_measure('cell_measure').data - result = rechunk_cube(input_cube, ['x', 'y'], remaining_dims=2) + result = rechunk_cube(input_cube, complete_dims, remaining_dims=2) assert input_cube == cube_3d assert result == cube_3d @@ -334,15 +335,6 @@ def test_rechunk_cube_partly_lazy(cube_3d): assert result.ancillary_variable('anc_var').core_data().chunksize == (3, 2) -def test_rechunk_cube_invalid_coord_fail(cube_3d): - """Test ``rechunk_cube``.""" - msg = ( - "Complete coordinates must be 1D coordinates, got 2D coordinate 'xy'" - ) - with pytest.raises(CoordinateMultiDimError, match=msg): - rechunk_cube(cube_3d, ['xy']) - - @pytest.fixture def lat_coord_1d(): """1D latitude coordinate."""