-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathtest_roundtrips_ixi_xix.py
314 lines (272 loc) · 11.7 KB
/
test_roundtrips_ixi_xix.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
"""
Test ncdata.iris by checking roundtrips for standard testcases.
Testcases start as netcdf files.
(1) check equivalence of cubes : iris.load(file) VS iris.load(ncdata(file))
(2) check equivalence of files : iris -> file VS iris->ncdata->file
"""
from subprocess import check_output
import dask.array as da
import netCDF4
import iris
import numpy as np
import pytest
import xarray
from ncdata import NcAttribute
from ncdata.iris import from_iris
from ncdata.iris_xarray import cubes_from_xarray, cubes_to_xarray
from ncdata.netcdf4 import from_nc4, to_nc4
from ncdata.threadlock_sharing import lockshare_context
from ncdata.xarray import from_xarray
from tests._compare_nc_datasets import compare_nc_datasets
from tests.data_testcase_schemas import (
session_testdir,
standard_testcase,
BAD_LOADSAVE_TESTCASES,
)
from tests.integration.roundtrips_utils import (
adjust_chunks,
cubes_equal__corrected,
set_tiny_chunks,
nanmask_cube,
prune_cube_varproperties,
remove_cube_nounits,
namesort_cubes,
)
# Avoid complaints that imported fixtures are "unused"
# (a no-op expression statement that "uses" the imported pytest fixtures).
standard_testcase, session_testdir, adjust_chunks

# import iris.fileformats.netcdf._thread_safe_nc as ifnt

# Debug switch: when True, install the session-scoped 'use_irislock' fixture
# below, which wraps tests in lockshare_context(iris=True, xarray=True)
# (presumably sharing netCDF thread-locks between iris and xarray — see
# ncdata.threadlock_sharing).
_FIX_LOCKS = True
# _FIX_LOCKS = False
if _FIX_LOCKS:

    @pytest.fixture(scope="session")
    def use_irislock():
        """
        Session-scoped fixture enabling shared netCDF thread-locking.

        Holds a lockshare_context(iris=True, xarray=True) open for the whole
        test session, yielding control to the tests while it is active.
        """
        with lockshare_context(iris=True, xarray=True):
            yield
# Debug switch: when True, configure very small dask chunks for the tests
# (see roundtrips_utils.set_tiny_chunks), to exercise multi-chunk behaviour.
# _TINY_CHUNKS = True
_TINY_CHUNKS = False
set_tiny_chunks(_TINY_CHUNKS)
def test_roundtrip_ixi(standard_testcase, use_irislock, adjust_chunks):
    """
    Roundtrip test: iris.load(file) vs iris <- ncdata <- xarray <- iris.

    Loads each standard testcase file directly with Iris, converts the cubes
    to an xarray Dataset and back (cubes_to_xarray -> from_xarray -> to_iris),
    and asserts the two sets of cubes are equivalent, after applying a number
    of known-acceptable normalisations (attribute pruning, 'no_units' removal,
    name-sorting, Conventions removal, time-unit string unification, NaN
    masking).

    NOTE(review): indentation was reconstructed from a flattened source;
    nesting is believed correct but should be confirmed against the original.
    """
    source_filepath = standard_testcase.filepath
    # Skip cases where there are no data variables for Iris to load.
    exclude_case_keys = (
        # unloadable by iris
        BAD_LOADSAVE_TESTCASES["iris"]["load"]
        # unsaveable by iris (= can't convert to xarray)
        + BAD_LOADSAVE_TESTCASES["iris"]["save"]
        # unloadable by xarray
        + BAD_LOADSAVE_TESTCASES["xarray"]["load"]
        # TODO: remaining unresolved problems ...
        + [
            # string dimension problem
            "ds__dtype__string",
            # outstanding dims-mismatch problems.
            "testing__small_theta_colpex",
            # coordinate attributes on mesh coordinate variables
            "testdata____unstructured_grid__data_C4",
            "testdata____ugrid__21_triangle_example",
            # Problem with units on time bounds
            "label_and_climate__small_FC_167",
        ]
    )
    if any(key in standard_testcase.name for key in exclude_case_keys):
        pytest.skip("excluded testcase")

    # Developer debug switch: print the testcase spec and an ncdump of the file.
    # _Debug = True
    _Debug = False
    if _Debug:
        print(f"\ntestcase: {standard_testcase.name}")
        print("spec =")
        print(standard_testcase.spec)
        print("\nncdump =")
        # NOTE(review): shell=True with an interpolated path is acceptable in
        # debug-only test code, but would be unsafe on untrusted paths.
        txt = check_output([f"ncdump {source_filepath}"], shell=True).decode()
        print(txt)

    # Load the testcase with Iris.
    iris_cubes = iris.load(source_filepath)

    # Remove any attributes that match properties of a netCDF4.Variable
    # Since, due to an Iris bug, these cannot be saved to a file variable.
    # N.B. a bit subtle, as mostly doesn't apply to
    _PRUNE_CFVAR_PROPNAMES = True
    # _PRUNE_CFVAR_PROPNAMES = False
    if _PRUNE_CFVAR_PROPNAMES:
        removed_props = prune_cube_varproperties(iris_cubes)
        # Only a "name" attribute (if any) is expected to have been removed.
        assert removed_props == set() or removed_props == {
            "name",
        }

    # Remove any 'no_units' units, since these do not save correctly
    # see : https://github.com/SciTools/iris/issues/5368
    remove_cube_nounits(iris_cubes)

    # Unfortunately, cube order is not guaranteed to be stable.
    iris_cubes = namesort_cubes(iris_cubes)

    # Convert to xarray, and back again.
    ds = cubes_to_xarray(iris_cubes)
    ncds_fromxr = from_xarray(ds)
    from ncdata.iris import to_iris

    iris_xr_cubes = to_iris(ncds_fromxr)

    # Unfortunately, cube order is not guaranteed to be stable.
    iris_xr_cubes = namesort_cubes(iris_xr_cubes)

    # Because Iris awkwardly special-cases the units of variables with flag_values etc,
    # we need to also ignore 'no_units' in the re-loaded data.
    remove_cube_nounits(iris_xr_cubes)

    # N.B. Conventions are not preserved from original, since Iris re-writes them.
    # So for now, just remove them all
    for cube in iris_cubes + iris_xr_cubes:
        cube.attributes.pop("Conventions", None)

    # correct for xarray handling of time units.
    # not entirely clear yet where this happens,
    # - probably in the loading phase, which therefore can maybe be controlled ??
    _UNIFY_TIME_UNITS = True
    # _UNIFY_TIME_UNITS = False
    if _UNIFY_TIME_UNITS:
        for iris_cube, iris_xr_cube in zip(iris_cubes, iris_xr_cubes):
            for xr_coord in iris_xr_cube.coords():
                iris_coords = iris_cube.coords(xr_coord.name())
                if len(iris_coords) != 1:
                    # Coords don't match, which is nasty!
                    # Just skip out + let the test fail
                    break
                (iris_coord,) = iris_coords
                # Detecting differently constructed time units is awkward,
                # because you can have unit1==unit2, but still str(unit1) != str(unit2)
                xr_ut, iris_ut = (co.units for co in (xr_coord, iris_coord))
                xr_utstr, iris_utstr = (str(ut) for ut in (xr_ut, iris_ut))
                if (
                    iris_coord == xr_coord
                    and iris_coord.units == xr_coord.units
                    and xr_utstr != iris_utstr
                ):
                    # Fix "equal" units to have identical **string representations**.
                    xr_coord.units = iris_ut

    # Normalise masking on both sides (via nanmask_cube helper).
    iris_cubes = [nanmask_cube(cube) for cube in iris_cubes]
    iris_xr_cubes = [nanmask_cube(cube) for cube in iris_xr_cubes]

    # Pairwise-compare the (name-sorted) cube lists, pairing by position.
    results = [
        (c1.name(), cubes_equal__corrected(c1, c2))
        for c1, c2 in zip(iris_cubes, iris_xr_cubes)
    ]
    expected = [(cube.name(), True) for cube in iris_cubes]
    result = results == expected
    if not result:
        # FOR NOW: compare with experimental ncdata comparison.
        # I know this is a bit circular, but it is useful for debugging, for now ...
        result = compare_nc_datasets(
            from_iris(iris_cubes), from_iris(iris_xr_cubes)
        )
        assert result == []
# N.B. FOR NOW skip this test entirely.
# There are lots of problems here, mostly caused by dimension naming.
# It's not currently clear if we *can* find a simple view of what "ought" to work here,
# or whether we should be attempting to test this at all.
@pytest.mark.skip("Roundtrip testing xarray-iris-xarray : currently disabled.")
def test_roundtrip_xix(
    standard_testcase, use_irislock, adjust_chunks, tmp_path
):
    """
    Roundtrip test: xarray -> (ncdata) -> iris -> xarray, compared via files.

    Loads each testcase with xarray, converts to Iris cubes and back, saves
    both datasets to netCDF files in tmp_path, re-reads them as ncdata
    datasets, sanitises known encoding disagreements, and compares.

    Currently skipped entirely (see module-level comment above the decorator).

    NOTE(review): indentation was reconstructed from a flattened source; the
    nesting of the sanitisation loop is believed correct but should be
    confirmed against the original.
    """
    source_filepath = standard_testcase.filepath
    # Skip some cases
    excluded_casename_keys = [
        # these won't load (either Iris or Xarray)
        "ds_Empty",
        "ds__singleattr",
        "ds__dimonly",
        # these ones have a type of bounds variable that Xarray can't handle :
        # xarray.core.variable.MissingDimensionsError: 'time_bnd' has more than
        # 1-dimension and the same name as one of its dimensions
        # ('time', 'time_bnd').
        # xarray disallows such variables because they conflict with the
        # coordinates used to label dimensions.
        # E.G.
        # (dims include 'time', 'time_bnds')
        # float time(time) ;
        # time:bounds = 'time_bnds'
        # float time_bnds(time, time_bnds) ;
        "label_and_climate__small_FC_167",
        "rotated__xyt__small_rotPole_precipitation",
        # This one fails to load in xarray, for somewhat unclear reasons
        # NotImplementedError: Can not use auto rechunking with object dtype.
        # We are unable to estimate the size in bytes of object data
        "unstructured_grid__lfric_surface_mean",
        # Iris loses the name of the unstructured dimension, causing multiple problems
        "unstructured_grid__data_C4",
    ]
    if any(key in standard_testcase.name for key in excluded_casename_keys):
        pytest.skip("excluded testcase")

    # Developer debug switch: print the testcase spec and an ncdump of the file.
    # _Debug = True
    _Debug = False
    if _Debug:
        print(f"\ntestcase: {standard_testcase.name}")
        print("spec =")
        print(standard_testcase.spec)
        print("\nncdump =")
        # NOTE(review): shell=True with an interpolated path is acceptable in
        # debug-only test code, but would be unsafe on untrusted paths.
        txt = check_output([f"ncdump {source_filepath}"], shell=True).decode()
        print(txt)

    # Load the testcase with Xarray.
    xrds = xarray.open_dataset(source_filepath, chunks="auto")

    # Convert to iris cubes, and back again.
    # TODO: reinstate un-staged xarray-->iris conversion.
    # but for now, pull out intermediate stage for debugging
    from ncdata.iris import to_iris

    ncds = from_xarray(xrds)
    iris_cubes = to_iris(ncds)
    # iris_cubes = cubes_from_xarray(xrds)
    xrds_iris = cubes_to_xarray(iris_cubes)

    # # Check equivalence
    # result = xrds_iris.identical(xrds)
    #
    # if not result:
    #     # FOR NOW: compare with experimental ncdata comparison.
    #     # I know this is a bit circular, but it is useful for debugging, for now ...

    # Check equivalence (a less old but still obsolete way, via "from_xarray")
    # ncds_xr = from_xarray(xrds)
    # ncds_xr_iris = from_xarray(xrds_iris)

    # Check equivalence of Xarray datasets : going via file
    # This is a useful stopgap as it is flexible + diffs are explainable
    # TODO: replace by fixes **to xarray datasets**, and ds.identical()
    fp_xr = tmp_path / "tmp_xr.nc"
    fp_xr_iris = tmp_path / "tmp_xr_iris.nc"
    xrds.to_netcdf(fp_xr)
    xrds_iris.to_netcdf(fp_xr_iris)
    ncds_xr = from_nc4(fp_xr)
    ncds_xr_iris = from_nc4(fp_xr_iris)

    # Sanitise results in various ways to avoid common encoding disagreements
    # (for now, at least)
    for ds in (ncds_xr, ncds_xr_iris):
        ds.attributes.pop("Conventions", None)
        for var in ds.variables.values():
            # Handy no-op hook for setting a conditional debugger breakpoint.
            if var.name == "data":
                pass
            if "grid_mapping_name" in var.attributes:
                # Fix datatypes of grid-mapping variables.
                # Iris creates all these as floats, but int is more common in inputs.
                FIXED_GRIDMAPPING_DTYPE = np.dtype("i4")
                var.dtype = FIXED_GRIDMAPPING_DTYPE
                var.data = var.data.astype(FIXED_GRIDMAPPING_DTYPE)
                # Remove any coordinates of grid-mapping variables : Xarray adds these.
                var.attributes.pop("coordinates", None)
            # Normalise fill-value handling: strip the _FillValue attribute and
            # fold fill-value points (and NaNs, for floats) into the data mask.
            fv = var.attributes.pop("_FillValue", None)
            if fv is None:
                # No explicit fill : use the netCDF default for this dtype.
                dt = var.data.dtype
                nn = f"{dt.kind}{dt.itemsize}"
                fv = netCDF4.default_fillvals[nn]
            else:
                fv = fv.as_python_value()
            data = da.ma.getdata(var.data)
            mask = da.ma.getmaskarray(var.data)
            if data.dtype.kind == "f":
                mask |= da.isnan(data)
            # NOTE(review): reconstructed at this level — fill-value masking
            # assumed to apply to all dtypes, not only floats; confirm.
            mask |= data == fv
            data = da.ma.masked_array(data, mask=mask)
            var.data = data
            if "calendar" in var.attributes:
                if var.attributes["calendar"].as_python_value() == "gregorian":
                    # the calendar name 'gregorian' is now deprecated, so Iris replaces it.
                    var.attributes["calendar"] = NcAttribute(
                        "calendar", "standard"
                    )

    # Compare the sanitised file contents; expect an empty list of differences.
    result = compare_nc_datasets(
        ncds_xr, ncds_xr_iris
    )  # , check_var_data=False)
    assert result == []

    # TODO: check equivalence, in Xarray terms
    # xr_result = xrds_iris.equals(xrds)
    # ncd_result = compare_nc_datasets(
    #     ncds_xr, ncds_xr_iris
    # )  # , check_var_data=False)
    # print("\nDATASET COMPARE RESULTS:\n" + "\n".join(ncd_result))
    # assert (xr_result, ncd_result) == (True, [])