diff --git a/conftest.py b/conftest.py index 32b3581f..e9c631d9 100644 --- a/conftest.py +++ b/conftest.py @@ -12,6 +12,57 @@ def pytest_addoption(parser): ) +def generate_test_filepath_dict() -> dict: + return { + "full_time_netcdf4": "virtualizarr/tests/data/test_ds_netcdf4_full_time.nc", + "time_1_netcdf4": "virtualizarr/tests/data/test_ds_netcdf4_split_time1.nc", + "time_2_netcdf4": "virtualizarr/tests/data/test_ds_netcdf4_split_time2.nc", + "netcdf3": "virtualizarr/tests/data/test_ds_netcdf3.nc", + "netcdf4_group": "virtualizarr/tests/data/test_ds_netcdf4_group.nc", + "netcdf4_non_standard_time": "virtualizarr/tests/data/test_ds_non_datetime_time.nc", + } + + +def generate_small_xr_datasets(): + """This function can be used to re-generate the locally stored dataset for testing + It's 43kB instead of the full 31MB air_temperature tutorial dataset""" + import numpy as np + + # building our test dataset from the air_temp dataset, but saving a subset + ds = xr.tutorial.open_dataset("air_temperature").isel(time=slice(0, 4)) + + lats = np.arange(-90, 90, 1) + lons = np.arange(-180, 180, 1) + + data = np.random.randint(0, 2, size=(4, 180, 360), dtype=np.int16) + + # create a dataset with non-standard time + non_standard_date_ds = xr.Dataset( + data_vars=dict(air=(["time", "lat", "lon"], data)), + coords=dict(time=[0, 1, 2, 3], lat=lats, lon=lons), + ) + + # Add attributes to the time coordinate + non_standard_date_ds.time.attrs["units"] = "days since '2000-01-01'" + non_standard_date_ds.time.attrs["calendar"] = "standard" + + # write datasets + ds.to_netcdf("virtualizarr/tests/data/test_ds_netcdf4_full_time.nc") + ds.isel(time=slice(0, 2)).to_netcdf( + "virtualizarr/tests/data/test_ds_netcdf4_split_time1.nc" + ) + ds.isel(time=slice(2, 4)).to_netcdf( + "virtualizarr/tests/data/test_ds_netcdf4_split_time2.nc" + ) + + ds.to_netcdf("virtualizarr/tests/data/test_ds_netcdf3.nc", engine="scipy") + ds.to_netcdf("virtualizarr/tests/data/test_ds_netcdf4_group.nc", 
group="test/group") + + non_standard_date_ds.to_netcdf( + "virtualizarr/tests/data/test_ds_non_datetime_time.nc" + ) + + def pytest_runtest_setup(item): # based on https://stackoverflow.com/questions/47559524 if "network" in item.keywords and not item.config.getoption("--run-network-tests"): @@ -21,49 +72,19 @@ def pytest_runtest_setup(item): @pytest.fixture -def netcdf4_file(tmpdir): - # Set up example xarray dataset - ds = xr.tutorial.open_dataset("air_temperature") - - # Save it to disk as netCDF (in temporary directory) - filepath = f"{tmpdir}/air.nc" - ds.to_netcdf(filepath, format="NETCDF4") - ds.close() - - return filepath +def netcdf4_file() -> str: + return generate_test_filepath_dict()["full_time_netcdf4"] @pytest.fixture -def hdf5_groups_file(tmpdir): - # Set up example xarray dataset - ds = xr.tutorial.open_dataset("air_temperature") - - # Save it to disk as netCDF (in temporary directory) - filepath = f"{tmpdir}/air.nc" - ds.to_netcdf(filepath, format="NETCDF4", group="test/group") - ds.close() - - return filepath +def hdf5_groups_file() -> str: + return generate_test_filepath_dict()["netcdf4_group"] @pytest.fixture -def netcdf4_files(tmpdir): - # Set up example xarray dataset - ds = xr.tutorial.open_dataset("air_temperature") - - # split inrto equal chunks so we can concatenate them back together later - ds1 = ds.isel(time=slice(None, 1460)) - ds2 = ds.isel(time=slice(1460, None)) - - # Save it to disk as netCDF (in temporary directory) - filepath1 = f"{tmpdir}/air1.nc" - filepath2 = f"{tmpdir}/air2.nc" - ds1.to_netcdf(filepath1) - ds2.to_netcdf(filepath2) - ds1.close() - ds2.close() - - return filepath1, filepath2 +def netcdf4_files(): + test_filepath_dict = generate_test_filepath_dict() + return test_filepath_dict["time_1_netcdf4"], test_filepath_dict["time_2_netcdf4"] @pytest.fixture diff --git a/docs/releases.rst b/docs/releases.rst index ec057807..78c46624 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -8,6 +8,11 @@ v1.0.1 (unreleased) 
New Features ~~~~~~~~~~~~ + +- Stores test datasets in repo. + (:pull:`235`) By `Raphael Hagen <https://github.com/norlandrhagen>`_. + Closes (:issue:`226`) + - Adds defaults for `open_virtual_dataset_from_v3_store` in (:pull:`234`) By `Raphael Hagen <https://github.com/norlandrhagen>`_. diff --git a/virtualizarr/tests/data/test_ds_netcdf3.nc b/virtualizarr/tests/data/test_ds_netcdf3.nc new file mode 100644 index 00000000..f58689a3 Binary files /dev/null and b/virtualizarr/tests/data/test_ds_netcdf3.nc differ diff --git a/virtualizarr/tests/data/test_ds_netcdf4_full_time.nc b/virtualizarr/tests/data/test_ds_netcdf4_full_time.nc new file mode 100644 index 00000000..28dc3244 Binary files /dev/null and b/virtualizarr/tests/data/test_ds_netcdf4_full_time.nc differ diff --git a/virtualizarr/tests/data/test_ds_netcdf4_group.nc b/virtualizarr/tests/data/test_ds_netcdf4_group.nc new file mode 100644 index 00000000..03dc4cf5 Binary files /dev/null and b/virtualizarr/tests/data/test_ds_netcdf4_group.nc differ diff --git a/virtualizarr/tests/data/test_ds_netcdf4_split_time1.nc b/virtualizarr/tests/data/test_ds_netcdf4_split_time1.nc new file mode 100644 index 00000000..d3428488 Binary files /dev/null and b/virtualizarr/tests/data/test_ds_netcdf4_split_time1.nc differ diff --git a/virtualizarr/tests/data/test_ds_netcdf4_split_time2.nc b/virtualizarr/tests/data/test_ds_netcdf4_split_time2.nc new file mode 100644 index 00000000..0afa035e Binary files /dev/null and b/virtualizarr/tests/data/test_ds_netcdf4_split_time2.nc differ diff --git a/virtualizarr/tests/data/test_ds_non_datetime_time.nc b/virtualizarr/tests/data/test_ds_non_datetime_time.nc new file mode 100644 index 00000000..0d9213f2 Binary files /dev/null and b/virtualizarr/tests/data/test_ds_non_datetime_time.nc differ diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index 5894f643..70f01e9e 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -12,8 +12,8 @@ @pytest.mark.parametrize( 
"inline_threshold, vars_to_inline", [ - (5e2, ["lat", "lon"]), - (5e4, ["lat", "lon", "time"]), + (500, ["time", "lat", "lon"]), + (5e3, ["lat", "lon", "time"]), pytest.param( 5e7, ["lat", "lon", "time", "air"], @@ -36,7 +36,6 @@ def test_numpy_arrays_to_inlined_kerchunk_refs( netcdf4_file, loadable_variables=vars_to_inline, indexes={} ) refs = vds.virtualize.to_kerchunk(format="dict") - # TODO I would just compare the entire dicts but kerchunk returns inconsistent results - see https://github.com/TomNicholas/VirtualiZarr/pull/73#issuecomment-2040931202 # assert refs == expected assert refs["refs"]["air/0.0.0"] == expected["refs"]["air/0.0.0"] @@ -47,15 +46,11 @@ def test_numpy_arrays_to_inlined_kerchunk_refs( @pytest.mark.parametrize("format", ["dict", "json", "parquet"]) class TestKerchunkRoundtrip: - def test_kerchunk_roundtrip_no_concat(self, tmpdir, format): - # set up example xarray dataset - ds = xr.tutorial.open_dataset("air_temperature", decode_times=False) - - # save it to disk as netCDF (in temporary directory) - ds.to_netcdf(f"{tmpdir}/air.nc") + def test_kerchunk_roundtrip_no_concat(self, netcdf4_file, tmpdir, format): + ds = xr.open_dataset(netcdf4_file, decode_times=False) # use open_dataset_via_kerchunk to read it as references - vds = open_virtual_dataset(f"{tmpdir}/air.nc", indexes={}) + vds = open_virtual_dataset(netcdf4_file, indexes={}) if format == "dict": # write those references to an in-memory kerchunk-formatted references dictionary @@ -76,12 +71,17 @@ def test_kerchunk_roundtrip_no_concat(self, tmpdir, format): xrt.assert_identical(roundtrip, ds) @pytest.mark.parametrize("decode_times,time_vars", [(False, []), (True, ["time"])]) - def test_kerchunk_roundtrip_concat(self, tmpdir, format, decode_times, time_vars): + def test_kerchunk_roundtrip_concat( + self, netcdf4_file, netcdf4_files, tmpdir, format, decode_times, time_vars + ): + netcdf1, netcdf2 = netcdf4_files + # set up example xarray dataset - ds = 
xr.tutorial.open_dataset("air_temperature", decode_times=decode_times) + ds = xr.open_dataset(netcdf4_file, decode_times=decode_times) # split into two datasets - ds1, ds2 = ds.isel(time=slice(None, 1460)), ds.isel(time=slice(1460, None)) + ds1 = xr.open_dataset(netcdf1, decode_times=decode_times) + ds2 = xr.open_dataset(netcdf2, decode_times=decode_times) # save it to disk as netCDF (in temporary directory) ds1.to_netcdf(f"{tmpdir}/air1.nc") diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index 9db6e3a2..4d2f83a4 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -230,10 +230,7 @@ def test_combine_by_coords(self, netcdf4_files): vds1 = open_virtual_dataset(filepath1) with pytest.warns(UserWarning, match="will create in-memory pandas indexes"): vds2 = open_virtual_dataset(filepath2) - - combined_vds = xr.combine_by_coords( - [vds2, vds1], - ) + combined_vds = xr.combine_by_coords([vds2, vds1]) assert combined_vds.xindexes["time"].to_pandas_index().is_monotonic_increasing @@ -278,7 +275,7 @@ def local_to_s3_url(old_local_path: str) -> str: renamed_vds = vds.virtualize.rename_paths(local_to_s3_url) assert ( renamed_vds["air"].data.manifest.dict()["0.0.0"]["path"] - == "s3://bucket/air.nc" + == "s3://bucket/test_ds_netcdf4_full_time.nc" ) def test_invalid_type(self, netcdf4_file):