From 99b623d138adeb0929fb8f78ca8665e5d4a15b13 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 8 Feb 2024 22:23:16 +0000 Subject: [PATCH 01/14] Get missing values and dtype without re-opening the dataset --- activestorage/active.py | 152 ++++++++++++++------------- activestorage/netcdf_to_zarr.py | 34 ++++-- tests/test_bigger_data.py | 4 +- tests/test_data/daily_data_masked.nc | Bin 26632 -> 26737 bytes tests/unit/test_active.py | 4 +- 5 files changed, 108 insertions(+), 86 deletions(-) diff --git a/activestorage/active.py b/activestorage/active.py index fed673ca..e3947399 100644 --- a/activestorage/active.py +++ b/activestorage/active.py @@ -68,7 +68,7 @@ def __new__(cls, *args, **kwargs): } return instance - def __init__(self, uri, ncvar, storage_type=None, missing_value=None, _FillValue=None, valid_min=None, valid_max=None, max_threads=100): + def __init__(self, uri, ncvar, storage_type=None, max_threads=100): """ Instantiate with a NetCDF4 dataset and the variable of interest within that file. (We need the variable, because we need variable specific metadata from within that @@ -92,65 +92,6 @@ def __init__(self, uri, ncvar, storage_type=None, missing_value=None, _FillValue self._method = None self._lock = False self._max_threads = max_threads - - # obtain metadata, using netcdf4_python for now - # FIXME: There is an outstanding issue with ._FilLValue to be handled. - # If the user actually wrote the data with no fill value, or the - # default fill value is in play, then this might go wrong. - if storage_type is None: - ds = Dataset(uri) - elif storage_type == "s3": - with load_from_s3(uri) as _ds: - ds = _ds - try: - ds_var = ds[ncvar] - except IndexError as exc: - print(f"Dataset {ds} does not contain ncvar {ncvar!r}.") - raise exc - - # FIXME: We do not get the correct byte order on the Zarr Array's dtype - # when using S3, so capture it here. 
- self._dtype = ds_var.dtype - - if (missing_value, _FillValue, valid_min, valid_max) == (None, None, None, None): - if isinstance(ds, Dataset): - self._missing = getattr(ds_var, 'missing_value', None) - self._fillvalue = getattr(ds_var, '_FillValue', None) - # could be fill_value set as netCDF4 attr - if self._fillvalue is None: - self._fillvalue = getattr(ds_var, 'fill_value', None) - valid_min = getattr(ds_var, 'valid_min', None) - valid_max = getattr(ds_var, 'valid_max', None) - valid_range = getattr(ds_var, 'valid_range', None) - elif storage_type == "s3": - self._missing = ds_var.attrs.get('missing_value') - self._fillvalue = ds_var.attrs.get('_FillValue') - # could be fill_value set as netCDF4 attr - if self._fillvalue is None: - self._fillvalue = ds_var.attrs.get('fill_value') - valid_min = ds_var.attrs.get('valid_min') - valid_max = ds_var.attrs.get('valid_max') - valid_range = ds_var.attrs.get('valid_range') - - if valid_max is not None or valid_min is not None: - if valid_range is not None: - raise ValueError( - "Invalid combination in the file of valid_min, " - "valid_max, valid_range: " - f"{valid_min}, {valid_max}, {valid_range}" - ) - valid_range = (valid_min, valid_max) - elif valid_range is None: - valid_range = (None, None) - self._valid_min, self._valid_max = valid_range - - else: - self._missing = missing_value - self._fillvalue = _FillValue - self._valid_min = valid_min - self._valid_max = valid_max - - ds.close() def __getitem__(self, index): """ @@ -174,22 +115,16 @@ def __getitem__(self, index): elif self.storage_type == "s3": with load_from_s3(self.uri) as nc: data = nc[ncvar][index] - # h5netcdf doesn't return masked arrays. 
- if self._fillvalue: - data = np.ma.masked_equal(data, self._fillvalue) - if self._missing: - data = np.ma.masked_equal(data, self._missing) - if self._valid_max: - data = np.ma.masked_greater(data, self._valid_max) - if self._valid_min: - data = np.ma.masked_less(data, self._valid_min) + data = self._mask_data(data, nc[ncvar]) if lock: lock.release() - + return data + elif self._version == 1: return self._via_kerchunk(index) + elif self._version == 2: # No active operation either lock = self.lock @@ -202,6 +137,7 @@ def __getitem__(self, index): lock.release() return data + else: raise ValueError(f'Version {self._version} not supported') @@ -299,14 +235,27 @@ def _via_kerchunk(self, index): if self.zds is None: print(f"Kerchunking file {self.uri} with variable " f"{self.ncvar} for storage type {self.storage_type}") - ds = nz.load_netcdf_zarr_generic(self.uri, - self.ncvar, - self.storage_type) + ds, zarray, zattrs = nz.load_netcdf_zarr_generic( + self.uri, + self.ncvar, + self.storage_type + ) # The following is a hangove from exploration # and is needed if using the original doing it ourselves # self.zds = make_an_array_instance_active(ds) self.zds = ds + # Retain attributes and other information + if zarray.get('fill_value') is not None: + zattrs['_FillValue'] = zarray['fill_value'] + + self.zarray = zarray + self.zattrs = zattrs + + # FIXME: We do not get the correct byte order on the Zarr + # Array's dtype when using S3, so capture it here. 
+ self._dtype = np.dtype(zarray['dtype']) + return self._get_selection(index) def _get_selection(self, *args): @@ -319,7 +268,28 @@ def _get_selection(self, *args): compressor = self.zds._compressor filters = self.zds._filters - missing = self._fillvalue, self._missing, self._valid_min, self._valid_max + # Get missing values + _FillValue = self.zattrs.get('_FillValue') + missing_value = self.zattrs.get('missing_value') + valid_min = self.zattrs.get('valid_min') + valid_max = self.zattrs.get('valid_max') + valid_range = self.zattrs.get('valid_range') + if valid_max is not None or valid_min is not None: + if valid_range is not None: + raise ValueError( + "Invalid combination in the file of valid_min, " + "valid_max, valid_range: " + f"{valid_min}, {valid_max}, {valid_range}" + ) + elif valid_range: + valid_min, valid_max = valid_range + + missing = ( + _FillValue, + missing_value, + valid_min, + valid_max, + ) indexer = OrthogonalIndexer(*args, self.zds) out_shape = indexer.shape @@ -468,3 +438,37 @@ def _process_chunk(self, session, fsref, chunk_coords, chunk_selection, counts, if drop_axes: tmp = np.squeeze(tmp, axis=drop_axes) return tmp, out_selection + + def _mask_data(self, data, ds_var): + """ppp""" + # TODO: replace with cfdm.NetCDFIndexer, hopefully. 
+ attrs = ds_var.attrs + missing_value = attrs.get('missing_value') + _FillValue = attrs.get('_FillValue') + valid_min = attrs.get('valid_min') + valid_max = attrs.get('valid_max') + valid_range = attrs.get('valid_range') + + if valid_max is not None or valid_min is not None: + if valid_range is not None: + raise ValueError( + "Invalid combination in the file of valid_min, " + "valid_max, valid_range: " + f"{valid_min}, {valid_max}, {valid_range}" + ) + elif valid_range: + valid_min, valid_max = valid_range + + if _FillValue is not None: + data = np.ma.masked_equal(data, fillvalue) + + if missing_value is not None: + data = np.ma.masked_equal(data, missing) + + if valid_max is not None: + data = np.ma.masked_greater(data, valid_max) + + if valid_min is not None: + data = np.ma.masked_less(data, valid_min) + + return data diff --git a/activestorage/netcdf_to_zarr.py b/activestorage/netcdf_to_zarr.py index a8dcb034..782ff729 100644 --- a/activestorage/netcdf_to_zarr.py +++ b/activestorage/netcdf_to_zarr.py @@ -10,7 +10,7 @@ from kerchunk.hdf import SingleHdf5ToZarr -def gen_json(file_url, outf, storage_type): +def gen_json(file_url, varname, outf, storage_type): """Generate a json file that contains the kerchunk-ed data for Zarr.""" if storage_type == "s3": fs = s3fs.S3FileSystem(key=S3_ACCESS_KEY, @@ -24,7 +24,8 @@ def gen_json(file_url, outf, storage_type): h5chunks = SingleHdf5ToZarr(s3file, file_url, inline_threshold=0) with fs2.open(outf, 'wb') as f: - f.write(ujson.dumps(h5chunks.translate()).encode()) + content = h5chunks.translate() + f.write(ujson.dumps(content).encode()) else: fs = fsspec.filesystem('') with fs.open(file_url, 'rb') as local_file: @@ -43,9 +44,13 @@ def gen_json(file_url, outf, storage_type): # faster loading time # for active storage, we don't want anything inline with fs.open(outf, 'wb') as f: - f.write(ujson.dumps(h5chunks.translate()).encode()) + content = h5chunks.translate() + f.write(ujson.dumps(content).encode()) - return outf + 
zarray = ujson.loads(content['refs'][f"{varname}/.zarray"]) + zattrs = ujson.loads(content['refs'][f"{varname}/.zattrs"]) + + return outf, zarray, zattrs def open_zarr_group(out_json, varname): @@ -60,6 +65,7 @@ def open_zarr_group(out_json, varname): mapper = fs.get_mapper("") # local FS mapper #mapper.fs.reference has the kerchunk mapping, how does this propagate into the Zarr array? zarr_group = zarr.open_group(mapper) + try: zarr_array = getattr(zarr_group, varname) except AttributeError as attrerr: @@ -67,7 +73,7 @@ def open_zarr_group(out_json, varname): f"Zarr Group info: {zarr_group.info}") raise attrerr #print("Zarr array info:", zarr_array.info) - + return zarr_array @@ -77,10 +83,24 @@ def load_netcdf_zarr_generic(fileloc, varname, storage_type, build_dummy=True): # Write the Zarr group JSON to a temporary file. with tempfile.NamedTemporaryFile() as out_json: - gen_json(fileloc, out_json.name, storage_type) + _, zarray, zattrs = gen_json(fileloc, varname, out_json.name, storage_type) # open this monster print(f"Attempting to open and convert {fileloc}.") ref_ds = open_zarr_group(out_json.name, varname) - return ref_ds + return ref_ds, zarray, zattrs + + +#d = {'version': 1, +# 'refs': { +# '.zgroup': '{"zarr_format":2}', +# '.zattrs': '{"Conventions":"CF-1.6","access-list":"grenvillelister simonwilson jeffcole","awarning":"**** THIS SUITE WILL ARCHIVE NON-DUPLEXED DATA TO MOOSE. FOR CRITICAL MODEL RUNS SWITCH TO DUPLEXED IN: postproc --> Post Processing - common settings --> Moose Archiving --> non_duplexed_set. 
Follow guidance in http:\\/\\/www-twiki\\/Main\\/MassNonDuplexPolicy","branch-date":"1950-01-01","calendar":"360_day","code-version":"UM 11.6, NEMO vn3.6","creation_time":"2022-10-28 12:28","decription":"Initialised from EN4 climatology","description":"Copy of u-ar696\\/trunk@77470","email":"r.k.schieman@reading.ac.uk","end-date":"2015-01-01","experiment-id":"historical","forcing":"AA,BC,CO2","forcing-info":"blah, blah, blah","institution":"NCAS","macro-parent-experiment-id":"historical","macro-parent-experiment-mip":"CMIP","macro-parent-variant-id":"r1i1p1f3","model-id":"HadGEM3-CG31-MM","name":"\\/work\\/n02\\/n02\\/grenvill\\/cylc-run\\/u-cn134\\/share\\/cycle\\/19500101T0000Z\\/3h_","owner":"rosalynhatcher","project":"Coupled Climate","timeStamp":"2022-Oct-28 12:20:33 GMT","title":"[CANARI] GC3.1 N216 ORCA025 UM11.6","uuid":"51e5ef20-d376-4aa6-938e-4c242886b7b1"}', +# 'lat/.zarray': '{"chunks":[324],"compressor":{"id":"zlib","level":1},"dtype":"|T!&^WAB+CI+nh90P zBg4SJI@!@$WU>mQ4I2Z)kD5PEH>WU~vQDGU^kW-FMb(M<_NiL zl)(Ztr<8#~97t8wDj#R~3Un5OJp%(jkXrpd<~YM$kcvtM1_>bbGCk(>(!Gc%OwQWGc#}iA<*!Q z%;J*#qDlr~pal^?401q7MyY~ZYLbG2g@Tcxm5HU5v9W@Yfsu)oLSAxWNl9vog08=U zZlXeNW^r+5UV404VoqtQPD!FpvQC^%N@7W3d~RZKc4~@VUNQqS z&_b}KetFLRU>g&lHs+<4IJ>wh_!p!WB?6sOtWcI(RGgWgr(mjQpl7C_;gO%4T9BBY zs$i>-QBqQ1rLUitoUd1$rU%rhqu`vMf+UlkS&~tjq?eqZ3sMBanwvNBtY%uwk-{?Z zy{x`70|PtIDSQkfK*)gN2L?d}c~GDLfdYd#2ZMM$1HIe)huXmKoyU5!A^Ue_K7^RVs*Det&9uJ=^Uh#q$Y24f%;T3nJ-I(da&v(FCt;>W z?#V8}YK-EO8(GvRFJKZAGn0fE$H;KB^TlGfV4$rGYndnS3wC2!S#9nznIq(~z*nFv z80;As_<_{w_c6yO^M Date: Fri, 9 Feb 2024 15:32:23 +0000 Subject: [PATCH 02/14] remove unneeded test case --- tests/unit/test_active.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tests/unit/test_active.py b/tests/unit/test_active.py index 05102535..9b3541ea 100644 --- a/tests/unit/test_active.py +++ b/tests/unit/test_active.py @@ -35,13 +35,6 @@ def test_getitem(): active = Active(uri, ncvar=None) assert str(exc.value) == "Must set a netCDF 
variable name to slice" - # unopenable file - ncvar = "tas" - baseexc = "tas not found in /" - with pytest.raises(IndexError) as exc: - active = Active(uri, ncvar=ncvar) - assert baseexc in str(exc.value) - # openable file and correct variable uri = "tests/test_data/cesm2_native.nc" ncvar = "TREFHT" From dc0cde30dcfac335261ce615ef0fb603b9572024 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Fri, 9 Feb 2024 15:36:09 +0000 Subject: [PATCH 03/14] patch and retire bits in S3 test --- tests/unit/test_storage_types.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/unit/test_storage_types.py b/tests/unit/test_storage_types.py index 2f789261..6bd82e3d 100644 --- a/tests/unit/test_storage_types.py +++ b/tests/unit/test_storage_types.py @@ -89,7 +89,9 @@ def reduce_chunk( assert result == 999.0 - mock_load.assert_called_once_with(uri) + # S3 loading is not done from Active anymore + # mock_load.assert_called_once_with(uri) + mock_nz.assert_called_once_with(uri, "data", "s3") # NOTE: This gets called multiple times with various arguments. Match on # the common ones. 
@@ -134,6 +136,7 @@ def load_from_s3(uri): assert np.max(result) == 999.0 +@pytest.mark.skip(reason="No more valid file load in Active") @mock.patch.object(activestorage.active, "load_from_s3") def test_s3_load_failure(mock_load): """Test when an S3 object doesn't exist.""" From 0884af85bb8d107ad68dbced8b6852e2e11071cf Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 9 Feb 2024 17:39:54 +0000 Subject: [PATCH 04/14] valid_range boolean --- activestorage/active.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/activestorage/active.py b/activestorage/active.py index e3947399..e1148d05 100644 --- a/activestorage/active.py +++ b/activestorage/active.py @@ -281,7 +281,7 @@ def _get_selection(self, *args): "valid_max, valid_range: " f"{valid_min}, {valid_max}, {valid_range}" ) - elif valid_range: + elif valid_range is not None: valid_min, valid_max = valid_range missing = ( From a69fb2c0661b80bdd699d9a50a1262bf847a4ac4 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 9 Feb 2024 17:42:30 +0000 Subject: [PATCH 05/14] fill value typo --- activestorage/active.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/activestorage/active.py b/activestorage/active.py index e1148d05..98221349 100644 --- a/activestorage/active.py +++ b/activestorage/active.py @@ -460,7 +460,7 @@ def _mask_data(self, data, ds_var): valid_min, valid_max = valid_range if _FillValue is not None: - data = np.ma.masked_equal(data, fillvalue) + data = np.ma.masked_equal(data, _FillValue) if missing_value is not None: data = np.ma.masked_equal(data, missing) From 2f1082276ef93b32c44067e03d4c64a29f77627d Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 9 Feb 2024 17:43:10 +0000 Subject: [PATCH 06/14] missing value typo --- activestorage/active.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/activestorage/active.py b/activestorage/active.py index 98221349..2ca3a60a 100644 --- a/activestorage/active.py +++ b/activestorage/active.py @@ 
-463,7 +463,7 @@ def _mask_data(self, data, ds_var): data = np.ma.masked_equal(data, _FillValue) if missing_value is not None: - data = np.ma.masked_equal(data, missing) + data = np.ma.masked_equal(data, missing_value) if valid_max is not None: data = np.ma.masked_greater(data, valid_max) From a0fa0b744096ca4ffcf8f4377d4e6841739e391d Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 9 Feb 2024 18:00:57 +0000 Subject: [PATCH 07/14] dev --- tests/test_missing.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_missing.py b/tests/test_missing.py index 7f5f4a19..87501fd9 100644 --- a/tests/test_missing.py +++ b/tests/test_missing.py @@ -53,7 +53,6 @@ def active_two(testfile): return active_mean - def test_partially_missing_data(tmp_path): testfile = str(tmp_path / 'test_partially_missing_data.nc') r = dd.make_partially_missing_ncdata(testfile) From 7cd6f0954f2e30e9a4f669d98e15d6d7b881a54b Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 9 Feb 2024 18:05:16 +0000 Subject: [PATCH 08/14] dev --- activestorage/active.py | 2 +- tests/test_missing.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/activestorage/active.py b/activestorage/active.py index 2ca3a60a..82f27f92 100644 --- a/activestorage/active.py +++ b/activestorage/active.py @@ -456,7 +456,7 @@ def _mask_data(self, data, ds_var): "valid_max, valid_range: " f"{valid_min}, {valid_max}, {valid_range}" ) - elif valid_range: + elif valid_range is not None: valid_min, valid_max = valid_range if _FillValue is not None: diff --git a/tests/test_missing.py b/tests/test_missing.py index 87501fd9..7f5f4a19 100644 --- a/tests/test_missing.py +++ b/tests/test_missing.py @@ -53,6 +53,7 @@ def active_two(testfile): return active_mean + def test_partially_missing_data(tmp_path): testfile = str(tmp_path / 'test_partially_missing_data.nc') r = dd.make_partially_missing_ncdata(testfile) From d70746e07d2b606a3e0cafdaeb195e7db849ce7b Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 9 
Feb 2024 18:26:22 +0000 Subject: [PATCH 09/14] dev --- tests/test_bigger_data.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_bigger_data.py b/tests/test_bigger_data.py index c64c6c75..2f38bddb 100644 --- a/tests/test_bigger_data.py +++ b/tests/test_bigger_data.py @@ -149,7 +149,7 @@ def test_native_emac_model_fails(test_data_path): """ An example of netCDF file that doesn't work - The actual issue is with h5py - it can't read it (netCDF classic) + The actual issue is with h5py - it can't read it (netCDF3 classic) h5py/_objects.pyx:54: in h5py._objects.with_phil.wrapper ??? @@ -175,8 +175,9 @@ def test_native_emac_model_fails(test_data_path): pass if USE_S3: + active = Active(uri, "aps_ave", utils.get_storage_type()) with pytest.raises(OSError): - active = Active(uri, "aps_ave", utils.get_storage_type()) + active[...] else: active = Active(uri, "aps_ave") active._version = 2 From 92495f74808a320890f785c8fa9ca69c9d2dd5c6 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Mon, 12 Feb 2024 13:18:28 +0000 Subject: [PATCH 10/14] add to tests --- tests/unit/test_active.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/unit/test_active.py b/tests/unit/test_active.py index 9b3541ea..6570764c 100644 --- a/tests/unit/test_active.py +++ b/tests/unit/test_active.py @@ -4,6 +4,8 @@ import threading from activestorage.active import Active +from activestorage.active import load_from_s3 +from botocore.exceptions import EndpointConnectionError as botoExc def test_uri_none(): @@ -96,3 +98,13 @@ def test_lock(): active.lock = None assert active.lock is False + + +def test_load_from_s3(): + """Test basic load from S3 without loading from S3.""" + uri = "s3://bucket/file.nc" + expected_exc = "Could not connect to the endpoint URL" + with pytest.raises(botoExc) as exc: + with load_from_s3(uri) as nc: + data = nc["cow"][0] + assert expected_exc in str(exc.value) From efdce9f42dc6383c3961ae290beec334a111c887 Mon Sep 17 00:00:00 
2001 From: Valeriu Predoi Date: Mon, 12 Feb 2024 13:26:24 +0000 Subject: [PATCH 11/14] adjust mock load --- tests/unit/test_storage_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_storage_types.py b/tests/unit/test_storage_types.py index 6bd82e3d..82d6761a 100644 --- a/tests/unit/test_storage_types.py +++ b/tests/unit/test_storage_types.py @@ -90,7 +90,7 @@ def reduce_chunk( assert result == 999.0 # S3 loading is not done from Active anymore - # mock_load.assert_called_once_with(uri) + mock_load.assert_not_called() mock_nz.assert_called_once_with(uri, "data", "s3") # NOTE: This gets called multiple times with various arguments. Match on From d4ab4e759937e76dccce6d5b33cb2e37c7a18c3d Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Mon, 12 Feb 2024 14:37:37 +0000 Subject: [PATCH 12/14] improve testing --- tests/test_missing.py | 62 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/tests/test_missing.py b/tests/test_missing.py index 7f5f4a19..722358af 100644 --- a/tests/test_missing.py +++ b/tests/test_missing.py @@ -7,6 +7,8 @@ import tempfile import unittest +import h5netcdf + from netCDF4 import Dataset from activestorage.active import Active @@ -240,3 +242,63 @@ def test_validrange(tmp_path): np.testing.assert_array_equal(masked_numpy_mean, active_mean) np.testing.assert_array_equal(no_active_mean, active_mean) + + +def test_active_mask_data(tmp_path): + testfile = str(tmp_path / 'test_partially_missing_data.nc') + + # with valid min + r = dd.make_validmin_ncdata(testfile, valid_min=500) + + # retrieve the actual numpy-ed result + actual_data = load_dataset(testfile) + + # dataset variable + ds = h5netcdf.File(testfile, 'r', invalid_netcdf=True) + dsvar = ds["data"] + + # test the function + data = Active._mask_data(None, actual_data, dsvar) + ds.close() + + # with valid range + r = dd.make_validrange_ncdata(testfile, valid_range=[750., 850.]) + + # retrieve the actual numpy-ed 
result + actual_data = load_dataset(testfile) + + # dataset variable + ds = h5netcdf.File(testfile, 'r', invalid_netcdf=True) + dsvar = ds["data"] + + # test the function + data = Active._mask_data(None, actual_data, dsvar) + ds.close() + + # with missing + r = dd.make_missing_ncdata(testfile) + + # retrieve the actual numpy-ed result + actual_data = load_dataset(testfile) + + # dataset variable + ds = h5netcdf.File(testfile, 'r', invalid_netcdf=True) + dsvar = ds["data"] + + # test the function + data = Active._mask_data(None, actual_data, dsvar) + ds.close() + + # with _FillValue + r = dd.make_fillvalue_ncdata(testfile) + + # retrieve the actual numpy-ed result + actual_data = load_dataset(testfile) + + # dataset variable + ds = h5netcdf.File(testfile, 'r', invalid_netcdf=True) + dsvar = ds["data"] + + # test the function + data = Active._mask_data(None, actual_data, dsvar) + ds.close() From be23fb9d5b94e26c8eb5d7f1637cf5973195163e Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Mon, 12 Feb 2024 14:38:30 +0000 Subject: [PATCH 13/14] rm tab indented --- activestorage/active.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/activestorage/active.py b/activestorage/active.py index 82f27f92..841c3a60 100644 --- a/activestorage/active.py +++ b/activestorage/active.py @@ -448,7 +448,7 @@ def _mask_data(self, data, ds_var): valid_min = attrs.get('valid_min') valid_max = attrs.get('valid_max') valid_range = attrs.get('valid_range') - + if valid_max is not None or valid_min is not None: if valid_range is not None: raise ValueError( From 53f4ed4b96939c4ce41fb239557058a6a8cb1e3d Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Mon, 12 Feb 2024 15:02:16 +0000 Subject: [PATCH 14/14] skip silly S3 test --- tests/unit/test_active.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/unit/test_active.py b/tests/unit/test_active.py index 6570764c..232ffac9 100644 --- a/tests/unit/test_active.py +++ b/tests/unit/test_active.py @@ -5,6 +5,7 @@ from 
activestorage.active import Active from activestorage.active import load_from_s3 +from activestorage.config import * from botocore.exceptions import EndpointConnectionError as botoExc @@ -100,6 +101,7 @@ def test_lock(): assert active.lock is False +@pytest.mark.skipif(USE_S3, reason="it will look for silly bucket") def test_load_from_s3(): """Test basic load from S3 without loading from S3.""" uri = "s3://bucket/file.nc"