Store test datasets in repo #235

Open · wants to merge 3 commits into base: main
95 changes: 58 additions & 37 deletions conftest.py
@@ -12,6 +12,57 @@ def pytest_addoption(parser):
    )


+def generate_test_filepath_dict() -> dict:
+    return {
+        "full_time_netcdf4": "virtualizarr/tests/data/test_ds_netcdf4_full_time.nc",
+        "time_1_netcdf4": "virtualizarr/tests/data/test_ds_netcdf4_split_time1.nc",
+        "time_2_netcdf4": "virtualizarr/tests/data/test_ds_netcdf4_split_time2.nc",
+        "netcdf3": "virtualizarr/tests/data/test_ds_netcdf3.nc",
+        "netcdf4_group": "virtualizarr/tests/data/test_ds_netcdf4_group.nc",
+        "netcdf4_non_standard_time": "virtualizarr/tests/data/test_ds_non_datetime_time.nc",
+    }


+def generate_small_xr_datasets():
+    """Regenerate the locally stored datasets used for testing.
+
+    The saved subset is ~43 kB, instead of the full 31 MB air_temperature
+    tutorial dataset.
+    """
+    import numpy as np
+
+    # build our test dataset from the air_temperature dataset, but save only a subset
+    ds = xr.tutorial.open_dataset("air_temperature").isel(time=slice(0, 4))
+
+    lats = np.arange(-90, 90, 1)
+    lons = np.arange(-180, 180, 1)
+
+    data = np.random.randint(0, 2, size=(4, 180, 360), dtype=np.int16)
+
+    # create a dataset with a non-standard time coordinate
+    non_standard_date_ds = xr.Dataset(
+        data_vars=dict(air=(["time", "lat", "lon"], data)),
+        coords=dict(time=[0, 1, 2, 3], lat=lats, lon=lons),
+    )
+
+    # add attributes to the time coordinate
+    non_standard_date_ds.time.attrs["units"] = "days since '2000-01-01'"
+    non_standard_date_ds.time.attrs["calendar"] = "standard"
+
+    # write the datasets
+    ds.to_netcdf("virtualizarr/tests/data/test_ds_netcdf4_full_time.nc")
+    ds.isel(time=slice(0, 2)).to_netcdf(
+        "virtualizarr/tests/data/test_ds_netcdf4_split_time1.nc"
+    )
+    ds.isel(time=slice(2, 4)).to_netcdf(
+        "virtualizarr/tests/data/test_ds_netcdf4_split_time2.nc"
+    )
+
+    ds.to_netcdf("virtualizarr/tests/data/test_ds_netcdf3.nc", engine="scipy")
+    ds.to_netcdf("virtualizarr/tests/data/test_ds_netcdf4_group.nc", group="test/group")
+
+    non_standard_date_ds.to_netcdf(
+        "virtualizarr/tests/data/test_ds_non_datetime_time.nc"
+    )


def pytest_runtest_setup(item):
    # based on https://stackoverflow.com/questions/47559524
    if "network" in item.keywords and not item.config.getoption("--run-network-tests"):
@@ -21,49 +72,19 @@


@pytest.fixture
-def netcdf4_file(tmpdir):
-    # Set up example xarray dataset
-    ds = xr.tutorial.open_dataset("air_temperature")
-
-    # Save it to disk as netCDF (in temporary directory)
-    filepath = f"{tmpdir}/air.nc"
-    ds.to_netcdf(filepath, format="NETCDF4")
-    ds.close()
-
-    return filepath
+def netcdf4_file() -> str:
+    return generate_test_filepath_dict()["full_time_netcdf4"]


@pytest.fixture
-def hdf5_groups_file(tmpdir):
-    # Set up example xarray dataset
-    ds = xr.tutorial.open_dataset("air_temperature")
-
-    # Save it to disk as netCDF (in temporary directory)
-    filepath = f"{tmpdir}/air.nc"
-    ds.to_netcdf(filepath, format="NETCDF4", group="test/group")
-    ds.close()
-
-    return filepath
+def hdf5_groups_file() -> str:
+    return generate_test_filepath_dict()["netcdf4_group"]


@pytest.fixture
-def netcdf4_files(tmpdir):
-    # Set up example xarray dataset
-    ds = xr.tutorial.open_dataset("air_temperature")
-
-    # split into equal chunks so we can concatenate them back together later
-    ds1 = ds.isel(time=slice(None, 1460))
-    ds2 = ds.isel(time=slice(1460, None))
-
-    # Save them to disk as netCDF (in temporary directory)
-    filepath1 = f"{tmpdir}/air1.nc"
-    filepath2 = f"{tmpdir}/air2.nc"
-    ds1.to_netcdf(filepath1)
-    ds2.to_netcdf(filepath2)
-    ds1.close()
-    ds2.close()
-
-    return filepath1, filepath2
+def netcdf4_files():
+    test_filepath_dict = generate_test_filepath_dict()
+    return test_filepath_dict["time_1_netcdf4"], test_filepath_dict["time_2_netcdf4"]


@pytest.fixture
…
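For reference, a minimal sketch of how the stored files could be regenerated, assuming it is run from the repository root (so conftest.py is importable) and that the pooch dependency for xr.tutorial is available; the import path is illustrative:

    # regenerate the small test files under virtualizarr/tests/data/
    from conftest import generate_small_xr_datasets, generate_test_filepath_dict

    generate_small_xr_datasets()  # downloads air_temperature once, then writes the subsets
    print(generate_test_filepath_dict()["full_time_netcdf4"])
    # -> virtualizarr/tests/data/test_ds_netcdf4_full_time.nc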
Binary file added virtualizarr/tests/data/test_ds_netcdf3.nc (not shown)
Binary file added virtualizarr/tests/data/test_ds_netcdf4_group.nc (not shown)
Four more binary test data files added (not shown).
26 changes: 13 additions & 13 deletions virtualizarr/tests/test_integration.py
@@ -12,8 +12,8 @@
@pytest.mark.parametrize(
    "inline_threshold, vars_to_inline",
    [
-        (5e2, ["lat", "lon"]),
-        (5e4, ["lat", "lon", "time"]),
+        (500, ["time", "lat", "lon"]),
+        (5e3, ["lat", "lon", "time"]),
        pytest.param(
            5e7,
            ["lat", "lon", "time", "air"],
@@ -36,7 +36,6 @@ def test_numpy_arrays_to_inlined_kerchunk_refs(
        netcdf4_file, loadable_variables=vars_to_inline, indexes={}
    )
    refs = vds.virtualize.to_kerchunk(format="dict")
-
    # TODO I would just compare the entire dicts but kerchunk returns inconsistent results - see https://github.com/TomNicholas/VirtualiZarr/pull/73#issuecomment-2040931202
    # assert refs == expected
    assert refs["refs"]["air/0.0.0"] == expected["refs"]["air/0.0.0"]
@@ -47,15 +46,11 @@

@pytest.mark.parametrize("format", ["dict", "json", "parquet"])
class TestKerchunkRoundtrip:
def test_kerchunk_roundtrip_no_concat(self, tmpdir, format):
# set up example xarray dataset
ds = xr.tutorial.open_dataset("air_temperature", decode_times=False)

# save it to disk as netCDF (in temporary directory)
ds.to_netcdf(f"{tmpdir}/air.nc")
def test_kerchunk_roundtrip_no_concat(self, netcdf4_file, tmpdir, format):
ds = xr.open_dataset(netcdf4_file, decode_times=False)

# use open_dataset_via_kerchunk to read it as references
vds = open_virtual_dataset(f"{tmpdir}/air.nc", indexes={})
vds = open_virtual_dataset(netcdf4_file, indexes={})

if format == "dict":
# write those references to an in-memory kerchunk-formatted references dictionary
@@ -76,12 +71,17 @@ def test_kerchunk_roundtrip_no_concat(self, tmpdir, format):
        xrt.assert_identical(roundtrip, ds)

@pytest.mark.parametrize("decode_times,time_vars", [(False, []), (True, ["time"])])
def test_kerchunk_roundtrip_concat(self, tmpdir, format, decode_times, time_vars):
def test_kerchunk_roundtrip_concat(
self, netcdf4_file, netcdf4_files, tmpdir, format, decode_times, time_vars
):
netcdf1, netcdf2 = netcdf4_files

# set up example xarray dataset
ds = xr.tutorial.open_dataset("air_temperature", decode_times=decode_times)
ds = xr.open_dataset(netcdf4_file, decode_times=decode_times)

# split into two datasets
ds1, ds2 = ds.isel(time=slice(None, 1460)), ds.isel(time=slice(1460, None))
ds1 = xr.open_dataset(netcdf1, decode_times=decode_times)
ds2 = xr.open_dataset(netcdf2, decode_times=decode_times)

# save it to disk as netCDF (in temporary directory)
ds1.to_netcdf(f"{tmpdir}/air1.nc")
        …
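For context, these tests follow the usual kerchunk roundtrip pattern; a minimal sketch, assuming filepath points at a local netCDF file and that open_virtual_dataset is importable from virtualizarr:

    import xarray as xr
    import xarray.testing as xrt
    from virtualizarr import open_virtual_dataset

    ds = xr.open_dataset(filepath, decode_times=False)  # ground truth
    vds = open_virtual_dataset(filepath, indexes={})    # chunk references, no data loaded
    refs = vds.virtualize.to_kerchunk(format="dict")    # in-memory kerchunk references

    # read the references back through fsspec's reference:// filesystem
    roundtrip = xr.open_dataset(
        "reference://",
        engine="zarr",
        backend_kwargs={"consolidated": False, "storage_options": {"fo": refs}},
    )
    xrt.assert_identical(roundtrip, ds)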
7 changes: 2 additions & 5 deletions virtualizarr/tests/test_xarray.py
@@ -230,10 +230,7 @@ def test_combine_by_coords(self, netcdf4_files):
        vds1 = open_virtual_dataset(filepath1)
        with pytest.warns(UserWarning, match="will create in-memory pandas indexes"):
            vds2 = open_virtual_dataset(filepath2)
-
-        combined_vds = xr.combine_by_coords(
-            [vds2, vds1],
-        )
+        combined_vds = xr.combine_by_coords([vds2, vds1])

        assert combined_vds.xindexes["time"].to_pandas_index().is_monotonic_increasing

@@ -278,7 +275,7 @@ def local_to_s3_url(old_local_path: str) -> str:
        renamed_vds = vds.virtualize.rename_paths(local_to_s3_url)
        assert (
            renamed_vds["air"].data.manifest.dict()["0.0.0"]["path"]
-            == "s3://bucket/air.nc"
+            == "s3://bucket/test_ds_netcdf4_full_time.nc"
        )

    def test_invalid_type(self, netcdf4_file):
        …
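The rename_paths behaviour exercised above, sketched under the assumption that vds is any virtual dataset whose manifest points at local files (the bucket name is illustrative):

    from pathlib import Path

    def local_to_s3_url(old_local_path: str) -> str:
        # keep the filename, swap the local directory for an S3 prefix
        return f"s3://bucket/{Path(old_local_path).name}"

    renamed_vds = vds.virtualize.rename_paths(local_to_s3_url)
    # every chunk path in the manifest now reads s3://bucket/<filename>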