import pathlib
import urllib
import pyfive
+ import s3fs
import time
- from pyfive.h5d import StoreInfo

- import s3fs
+ from pathlib import Path
+ from pyfive.h5d import StoreInfo
+ from typing import Optional

from activestorage.config import *
from activestorage import reductionist
@@ -47,21 +49,6 @@ def load_from_s3(uri, storage_options=None):
    print(f"Dataset loaded from S3 with s3fs and Pyfive: {uri} ({t2-t1:.2},{t3-t2:.2})")
    return ds

- def _metricise(method):
-     """ Decorator for class methods loads into metric_data"""
-     def timed(self, *args, **kw):
-         ts = time.time()
-         metric_name = ''
-         if '__metric_name' in kw:
-             metric_name = kw['__metric_name']
-             del kw['__metric_name']
-         result = method(self, *args, **kw)
-         te = time.time()
-         if metric_name:
-             self.metric_data[metric_name] = te - ts
-         return result
-     return timed
- 

def get_missing_attributes(ds):
    """"
@@ -122,13 +109,13 @@ def __new__(cls, *args, **kwargs):

    def __init__(
        self,
-         uri,
-         ncvar,
-         storage_type=None,
-         max_threads=100,
-         storage_options=None,
-         active_storage_url=None
-     ):
+         dataset: Optional[str | Path | object],
+         ncvar: str = None,
+         storage_type: str = None,
+         max_threads: int = 100,
+         storage_options: dict = None,
+         active_storage_url: str = None
+     ) -> None:
        """
        Instantiate with a NetCDF4 dataset URI and the variable of interest within that file.
        (We need the variable, because we need variable specific metadata from within that
@@ -138,50 +125,69 @@ def __init__(
        :param storage_options: s3fs.S3FileSystem options
        :param active_storage_url: Reductionist server URL
        """
-         # Assume NetCDF4 for now
-         self.uri = uri
-         if self.uri is None:
-             raise ValueError(f"Must use a valid file for uri. Got {uri}")
+         self.ds = None
+         input_variable = False
+         if dataset is None:
+             raise ValueError(f"Must use a valid file name or variable object for dataset. Got {dataset!r}")
+         if isinstance(dataset, Path) and not dataset.exists():
+             raise ValueError(f"Path to input file {dataset!r} does not exist.")
+         if not isinstance(dataset, Path) and not isinstance(dataset, str):
+             print(f"Treating input {dataset} as variable object.")
+             if not type(dataset) is pyfive.high_level.Dataset:
+                 raise TypeError(f"Variable object dataset can only be pyfive.high_level.Dataset. Got {dataset!r}")
+             input_variable = True
+             self.ds = dataset
+             self.uri = dataset
+ 

        # still allow for a passable storage_type
        # for special cases eg "special-POSIX" ie DDN
        if not storage_type and storage_options is not None:
-             storage_type = urllib.parse.urlparse(uri).scheme
+             storage_type = urllib.parse.urlparse(dataset).scheme
        self.storage_type = storage_type

+         # set correct filename attr
+         if input_variable and not self.storage_type:
+             self.filename = self.ds
+         elif input_variable and self.storage_type == "s3":
+             self.filename = self.ds.id._filename
+ 
        # get storage_options
        self.storage_options = storage_options
        self.active_storage_url = active_storage_url

        # basic check on file
-         if not os.path.isfile(self.uri) and not self.storage_type:
-             raise ValueError(f"Must use existing file for uri. {self.uri} not found")
+         if not input_variable:
+             if not os.path.isfile(self.uri) and not self.storage_type:
+                 raise ValueError(f"Must use existing file for uri. {self.uri} not found")

        self.ncvar = ncvar
-         if self.ncvar is None:
+         if self.ncvar is None and not input_variable:
            raise ValueError("Must set a netCDF variable name to slice")

        self._version = 1
        self._components = False
        self._method = None
        self._max_threads = max_threads
        self.missing = None
-         self.ds = None
-         self.metric_data = {}
        self.data_read = 0

-     @_metricise
    def __load_nc_file(self):
-         """ Get the netcdf file and it's b-tree"""
+         """
+         Get the netcdf file and its b-tree.
+ 
+         This private method is used only if the input to Active
+         is not a pyfive.high_level.Dataset object. If a Dataset
+         object is passed in, file opening is skipped and ncvar is
+         not used, because the Dataset already carries the b-tree
+         and a `_filename` attribute.
+         """
        ncvar = self.ncvar
-         # in all cases we need an open netcdf file to get at attributes
-         # we keep it open because we need it's b-tree
        if self.storage_type is None:
            nc = pyfive.File(self.uri)
        elif self.storage_type == "s3":
            nc = load_from_s3(self.uri, self.storage_options)
            self.filename = self.uri
- 
        self.ds = nc[ncvar]

    def __get_missing_attributes(self):
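With this change, Active can be constructed either from a file path/URI plus a variable name, or directly from an already opened pyfive variable object. A minimal usage sketch follows; the file name "test.nc", the variable name "data", and the activestorage.active import path are assumptions for illustration, not taken from this diff:

import pyfive
from activestorage.active import Active

# Option 1: path or URI plus a variable name, as before
active = Active("test.nc", ncvar="data")

# Option 2: pass an already opened pyfive dataset object; ncvar is not needed
# because the Dataset already carries its b-tree and _filename attribute
var = pyfive.File("test.nc")["data"]
active_from_var = Active(var)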
@@ -194,10 +200,8 @@ def __getitem__(self, index):
        Provides support for a standard get item.
        #FIXME-BNL: Why is the argument index?
        """
-         self.metric_data = {}
        if self.ds is None:
-             self.__load_nc_file(__metric_name='load nc time')
-             #self.__metricise('Load','__load_nc_file')
+             self.__load_nc_file()

        self.missing = self.__get_missing_attributes()

@@ -206,21 +210,20 @@ def __getitem__(self, index):
        if self.method is None and self._version == 0:

            # No active operation
-             return self._get_vanilla(index, __metric_name='vanilla_time')
+             return self._get_vanilla(index)

        elif self._version == 1:

            #FIXME: is the difference between version 1 and 2 still honoured?
-             return self._get_selection(index, __metric_name='selection 1 time (s)')
+             return self._get_selection(index)

        elif self._version == 2:

-             return self._get_selection(index, __metric_name='selection 2 time (s)')
+             return self._get_selection(index)

        else:
            raise ValueError(f'Version {self._version} not supported')

-     @_metricise
    def _get_vanilla(self, index):
        """
        Get the data without any active operation
@@ -294,7 +297,7 @@ def _get_active(self, method, *args):
        """
        raise NotImplementedError

-     @_metricise
+ 
    def _get_selection(self, *args):
        """
        At this point we have a Dataset object, but all the important information about
@@ -307,13 +310,8 @@ def _get_selection(self, *args):

        name = self.ds.name
        dtype = np.dtype(self.ds.dtype)
-         # hopefully fix pyfive to get a dtype directly
        array = pyfive.indexing.ZarrArrayStub(self.ds.shape, self.ds.chunks)
        ds = self.ds.id
- 
-         self.metric_data['args'] = args
-         self.metric_data['dataset shape'] = self.ds.shape
-         self.metric_data['dataset chunks'] = self.ds.chunks
        if ds.filter_pipeline is None:
            compressor, filters = None, None
        else:
@@ -359,16 +357,6 @@ def _from_storage(self, ds, indexer, chunks, out_shape, out_dtype, compressor, f
        session = None

        # Process storage chunks using a thread pool.
-         # Because we do this, we need to read the dataset b-tree now, not as we go, so
-         # it is already in cache. If we remove the thread pool from here, we probably
-         # wouldn't need to do it before the first one.
- 
-         if ds.chunks is not None:
-             t1 = time.time()
-             # ds._get_chunk_addresses()
-             t2 = time.time() - t1
-             self.metric_data['indexing time (s)'] = t2
-             # self.metric_data['chunk number'] = len(ds._zchunk_index)
        chunk_count = 0
        t1 = time.time()
        with concurrent.futures.ThreadPoolExecutor(max_workers=self._max_threads) as executor:
@@ -433,10 +421,6 @@ def _from_storage(self, ds, indexer, chunks, out_shape, out_dtype, compressor, f
            # size.
            out = out / np.sum(counts).reshape(shape1)

-         t2 = time.time()
-         self.metric_data['reduction time (s)'] = t2 - t1
-         self.metric_data['chunks processed'] = chunk_count
-         self.metric_data['storage read (B)'] = self.data_read
        return out

    def _get_endpoint_url(self):
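Since the _metricise decorator and the metric_data bookkeeping are removed by this change, timing is left to the caller. A minimal sketch of how a user could still collect a coarse timing around a read; the file and variable names are hypothetical:

import time
from activestorage.active import Active

active = Active("test.nc", ncvar="data")

t0 = time.time()
result = active[:]   # opens the file on first access, then runs the selection
print(f"__getitem__ took {time.time() - t0:.2f} s")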