@@ -68,7 +68,7 @@ def __new__(cls, *args, **kwargs):
68
68
}
69
69
return instance
70
70
71
- def __init__ (self , uri , ncvar , storage_type = None , missing_value = None , _FillValue = None , valid_min = None , valid_max = None , max_threads = 100 ):
71
+ def __init__ (self , uri , ncvar , storage_type = None , max_threads = 100 ):
72
72
"""
73
73
Instantiate with a NetCDF4 dataset and the variable of interest within that file.
74
74
(We need the variable, because we need variable specific metadata from within that
@@ -92,65 +92,6 @@ def __init__(self, uri, ncvar, storage_type=None, missing_value=None, _FillValue
92
92
self ._method = None
93
93
self ._lock = False
94
94
self ._max_threads = max_threads
95
-
96
- # obtain metadata, using netcdf4_python for now
97
- # FIXME: There is an outstanding issue with ._FilLValue to be handled.
98
- # If the user actually wrote the data with no fill value, or the
99
- # default fill value is in play, then this might go wrong.
100
- if storage_type is None :
101
- ds = Dataset (uri )
102
- elif storage_type == "s3" :
103
- with load_from_s3 (uri ) as _ds :
104
- ds = _ds
105
- try :
106
- ds_var = ds [ncvar ]
107
- except IndexError as exc :
108
- print (f"Dataset { ds } does not contain ncvar { ncvar !r} ." )
109
- raise exc
110
-
111
- # FIXME: We do not get the correct byte order on the Zarr Array's dtype
112
- # when using S3, so capture it here.
113
- self ._dtype = ds_var .dtype
114
-
115
- if (missing_value , _FillValue , valid_min , valid_max ) == (None , None , None , None ):
116
- if isinstance (ds , Dataset ):
117
- self ._missing = getattr (ds_var , 'missing_value' , None )
118
- self ._fillvalue = getattr (ds_var , '_FillValue' , None )
119
- # could be fill_value set as netCDF4 attr
120
- if self ._fillvalue is None :
121
- self ._fillvalue = getattr (ds_var , 'fill_value' , None )
122
- valid_min = getattr (ds_var , 'valid_min' , None )
123
- valid_max = getattr (ds_var , 'valid_max' , None )
124
- valid_range = getattr (ds_var , 'valid_range' , None )
125
- elif storage_type == "s3" :
126
- self ._missing = ds_var .attrs .get ('missing_value' )
127
- self ._fillvalue = ds_var .attrs .get ('_FillValue' )
128
- # could be fill_value set as netCDF4 attr
129
- if self ._fillvalue is None :
130
- self ._fillvalue = ds_var .attrs .get ('fill_value' )
131
- valid_min = ds_var .attrs .get ('valid_min' )
132
- valid_max = ds_var .attrs .get ('valid_max' )
133
- valid_range = ds_var .attrs .get ('valid_range' )
134
-
135
- if valid_max is not None or valid_min is not None :
136
- if valid_range is not None :
137
- raise ValueError (
138
- "Invalid combination in the file of valid_min, "
139
- "valid_max, valid_range: "
140
- f"{ valid_min } , { valid_max } , { valid_range } "
141
- )
142
- valid_range = (valid_min , valid_max )
143
- elif valid_range is None :
144
- valid_range = (None , None )
145
- self ._valid_min , self ._valid_max = valid_range
146
-
147
- else :
148
- self ._missing = missing_value
149
- self ._fillvalue = _FillValue
150
- self ._valid_min = valid_min
151
- self ._valid_max = valid_max
152
-
153
- ds .close ()
154
95
155
96
def __getitem__ (self , index ):
156
97
"""
@@ -174,22 +115,16 @@ def __getitem__(self, index):
174
115
elif self .storage_type == "s3" :
175
116
with load_from_s3 (self .uri ) as nc :
176
117
data = nc [ncvar ][index ]
177
- # h5netcdf doesn't return masked arrays.
178
- if self ._fillvalue :
179
- data = np .ma .masked_equal (data , self ._fillvalue )
180
- if self ._missing :
181
- data = np .ma .masked_equal (data , self ._missing )
182
- if self ._valid_max :
183
- data = np .ma .masked_greater (data , self ._valid_max )
184
- if self ._valid_min :
185
- data = np .ma .masked_less (data , self ._valid_min )
118
+ data = self ._mask_data (data , nc [ncvar ])
186
119
187
120
if lock :
188
121
lock .release ()
189
-
122
+
190
123
return data
124
+
191
125
elif self ._version == 1 :
192
126
return self ._via_kerchunk (index )
127
+
193
128
elif self ._version == 2 :
194
129
# No active operation either
195
130
lock = self .lock
@@ -202,6 +137,7 @@ def __getitem__(self, index):
202
137
lock .release ()
203
138
204
139
return data
140
+
205
141
else :
206
142
raise ValueError (f'Version { self ._version } not supported' )
207
143
@@ -299,14 +235,27 @@ def _via_kerchunk(self, index):
299
235
if self .zds is None :
300
236
print (f"Kerchunking file { self .uri } with variable "
301
237
f"{ self .ncvar } for storage type { self .storage_type } " )
302
- ds = nz .load_netcdf_zarr_generic (self .uri ,
303
- self .ncvar ,
304
- self .storage_type )
238
+ ds , zarray , zattrs = nz .load_netcdf_zarr_generic (
239
+ self .uri ,
240
+ self .ncvar ,
241
+ self .storage_type
242
+ )
305
243
# The following is a hangover from exploration
306
244
# and is needed if using the original doing it ourselves
307
245
# self.zds = make_an_array_instance_active(ds)
308
246
self .zds = ds
309
247
248
+ # Retain attributes and other information
249
+ if zarray .get ('fill_value' ) is not None :
250
+ zattrs ['_FillValue' ] = zarray ['fill_value' ]
251
+
252
+ self .zarray = zarray
253
+ self .zattrs = zattrs
254
+
255
+ # FIXME: We do not get the correct byte order on the Zarr
256
+ # Array's dtype when using S3, so capture it here.
257
+ self ._dtype = np .dtype (zarray ['dtype' ])
258
+
310
259
return self ._get_selection (index )
311
260
312
261
def _get_selection (self , * args ):
@@ -319,7 +268,28 @@ def _get_selection(self, *args):
319
268
compressor = self .zds ._compressor
320
269
filters = self .zds ._filters
321
270
322
- missing = self ._fillvalue , self ._missing , self ._valid_min , self ._valid_max
271
+ # Get missing values
272
+ _FillValue = self .zattrs .get ('_FillValue' )
273
+ missing_value = self .zattrs .get ('missing_value' )
274
+ valid_min = self .zattrs .get ('valid_min' )
275
+ valid_max = self .zattrs .get ('valid_max' )
276
+ valid_range = self .zattrs .get ('valid_range' )
277
+ if valid_max is not None or valid_min is not None :
278
+ if valid_range is not None :
279
+ raise ValueError (
280
+ "Invalid combination in the file of valid_min, "
281
+ "valid_max, valid_range: "
282
+ f"{ valid_min } , { valid_max } , { valid_range } "
283
+ )
284
+ elif valid_range :
285
+ valid_min , valid_max = valid_range
286
+
287
+ missing = (
288
+ _FillValue ,
289
+ missing_value ,
290
+ valid_min ,
291
+ valid_max ,
292
+ )
323
293
324
294
indexer = OrthogonalIndexer (* args , self .zds )
325
295
out_shape = indexer .shape
@@ -468,3 +438,37 @@ def _process_chunk(self, session, fsref, chunk_coords, chunk_selection, counts,
468
438
if drop_axes :
469
439
tmp = np .squeeze (tmp , axis = drop_axes )
470
440
return tmp , out_selection
441
+
442
def _mask_data(self, data, ds_var):
    """Mask the elements of ``data`` that ``ds_var``'s attributes flag as missing.

    The attributes ``_FillValue``, ``missing_value``, ``valid_min``,
    ``valid_max`` and ``valid_range`` are looked up in ``ds_var.attrs``.
    Elements equal to ``_FillValue`` or ``missing_value``, greater than
    the valid maximum, or less than the valid minimum are masked.

    :param data: array of values to be masked.
    :param ds_var: object whose ``attrs`` supports ``.get(name)`` and
        returns ``None`` for attributes that are not set.
    :returns: ``data`` itself when none of the attributes is set,
        otherwise a ``numpy.ma`` masked array.
    :raises ValueError: if ``valid_range`` is set together with
        ``valid_min`` or ``valid_max``.
    """
    # TODO: replace with cfdm.NetCDFIndexer, hopefully.
    attrs = ds_var.attrs
    missing_value = attrs.get('missing_value')
    _FillValue = attrs.get('_FillValue')
    valid_min = attrs.get('valid_min')
    valid_max = attrs.get('valid_max')
    valid_range = attrs.get('valid_range')

    if valid_max is not None or valid_min is not None:
        if valid_range is not None:
            raise ValueError(
                "Invalid combination in the file of valid_min, "
                "valid_max, valid_range: "
                f"{valid_min}, {valid_max}, {valid_range}"
            )
    elif valid_range is not None:
        # Compare against None rather than relying on truthiness: a
        # two-element numpy array has no unambiguous truth value.
        valid_min, valid_max = valid_range

    # Every test below is "is not None" so that a legitimate value of
    # zero still triggers masking.
    if _FillValue is not None:
        data = np.ma.masked_equal(data, _FillValue)

    if missing_value is not None:
        data = np.ma.masked_equal(data, missing_value)

    if valid_max is not None:
        data = np.ma.masked_greater(data, valid_max)

    if valid_min is not None:
        data = np.ma.masked_less(data, valid_min)

    return data
0 commit comments