 @contextlib.contextmanager
-def load_from_s3(uri):
+def load_from_s3(uri, storage_options=None):
     """
     Load a netCDF4-like object from S3.
@@ -34,10 +34,15 @@ def load_from_s3(uri):
     '<File-like object S3FileSystem, pyactivestorage/s3_test_bizarre.nc>'
     instead, we use h5netcdf: https://github.com/h5netcdf/h5netcdf
     a Python binder straight to HDF5-netCDF4 interface, that doesn't need a "local" file
+
+    storage_options: kwarg dict containing S3 credentials passed straight to Active
     """
-    fs = s3fs.S3FileSystem(key=S3_ACCESS_KEY,  # eg "minioadmin" for Minio
-                           secret=S3_SECRET_KEY,  # eg "minioadmin" for Minio
-                           client_kwargs={'endpoint_url': S3_URL})  # eg "http://localhost:9000" for Minio
+    if storage_options is None:  # use pre-configured S3 credentials
+        fs = s3fs.S3FileSystem(key=S3_ACCESS_KEY,  # eg "minioadmin" for Minio
+                               secret=S3_SECRET_KEY,  # eg "minioadmin" for Minio
+                               client_kwargs={'endpoint_url': S3_URL})  # eg "http://localhost:9000" for Minio
+    else:
+        fs = s3fs.S3FileSystem(**storage_options)  # use passed-in dictionary
     with fs.open(uri, 'rb') as s3file:
         ds = h5netcdf.File(s3file, 'r', invalid_netcdf=True)
         print(f"Dataset loaded from S3 via h5netcdf: {ds}")
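For context, a minimal caller-side sketch of how the new `storage_options` keyword could be exercised; the credentials, endpoint and object path below are placeholders in the Minio style of the comments above, not values taken from this change:

```python
# Hypothetical usage of load_from_s3 with an explicit s3fs options dict,
# bypassing the module-level S3_ACCESS_KEY / S3_SECRET_KEY / S3_URL defaults.
storage_options = {
    "key": "minioadmin",                                          # placeholder credential
    "secret": "minioadmin",                                       # placeholder credential
    "client_kwargs": {"endpoint_url": "http://localhost:9000"},   # placeholder endpoint
}

with load_from_s3("pyactivestorage/s3_test_bizarre.nc", storage_options) as ds:
    # ds is an h5netcdf.File; list its variables
    print(list(ds.variables))
```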
@@ -68,20 +73,43 @@ def __new__(cls, *args, **kwargs):
         }
         return instance

-    def __init__(self, uri, ncvar, storage_type=None, max_threads=100):
+    def __init__(
+        self,
+        uri,
+        ncvar,
+        storage_type=None,
+        max_threads=100,
+        storage_options=None,
+        active_storage_url=None
+    ):
         """
         Instantiate with a NetCDF4 dataset and the variable of interest within that file.
         (We need the variable, because we need variable specific metadata from within that
         file, however, if that information is available at instantiation, it can be provided
         using keywords and avoid a metadata read.)
+
+        :param storage_options: s3fs.S3FileSystem options
+        :param active_storage_url: Reductionist server URL
         """
         # Assume NetCDF4 for now
         self.uri = uri
         if self.uri is None:
             raise ValueError(f"Must use a valid file for uri. Got {self.uri}")
+
+        # still allow for a passable storage_type
+        # for special cases eg "special-POSIX" ie DDN
+        if not storage_type and storage_options is not None:
+            storage_type = urllib.parse.urlparse(uri).scheme
         self.storage_type = storage_type
+
+        # get storage_options
+        self.storage_options = storage_options
+        self.active_storage_url = active_storage_url
+
+        # basic check on file
         if not os.path.isfile(self.uri) and not self.storage_type:
             raise ValueError(f"Must use existing file for uri. {self.uri} not found")
+
         self.ncvar = ncvar
         if self.ncvar is None:
             raise ValueError("Must set a netCDF variable name to slice")
@@ -107,13 +135,13 @@ def __getitem__(self, index):
         lock = self.lock
         if lock:
             lock.acquire()
-
+
         if self.storage_type is None:
             nc = Dataset(self.uri)
             data = nc[ncvar][index]
             nc.close()
         elif self.storage_type == "s3":
-            with load_from_s3(self.uri) as nc:
+            with load_from_s3(self.uri, self.storage_options) as nc:
                 data = nc[ncvar][index]
                 data = self._mask_data(data, nc[ncvar])
@@ -238,7 +266,8 @@ def _via_kerchunk(self, index):
         ds, zarray, zattrs = nz.load_netcdf_zarr_generic(
             self.uri,
             self.ncvar,
-            self.storage_type
+            self.storage_type,
+            self.storage_options,
         )
         # The following is a hangove from exploration
         # and is needed if using the original doing it ourselves
@@ -390,6 +419,20 @@ def _from_storage(self, stripped_indexer, drop_axes, out_shape, out_dtype,

         return out

+    def _get_endpoint_url(self):
+        """Return the endpoint_url of an S3 object store, or `None`"""
+        endpoint_url = self.storage_options.get('endpoint_url')
+        if endpoint_url is not None:
+            return endpoint_url
+
+        client_kwargs = self.storage_options.get('client_kwargs')
+        if client_kwargs:
+            endpoint_url = client_kwargs.get('endpoint_url')
+            if endpoint_url is not None:
+                return endpoint_url
+
+        return f"http://{urllib.parse.urlparse(self.filename).netloc}"
+
     def _process_chunk(self, session, fsref, chunk_coords, chunk_selection, counts,
                        out_selection, compressor, filters, missing,
                        drop_axes=None):
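The lookup order in `_get_endpoint_url` is: a top-level `endpoint_url` key, then `client_kwargs['endpoint_url']`, then the netloc of the file's own URL. A self-contained sketch of that same resolution logic, using made-up option dicts purely for illustration:

```python
import urllib.parse

def resolve_endpoint(storage_options, filename):
    """Mirror the endpoint lookup order used by Active._get_endpoint_url."""
    endpoint_url = storage_options.get("endpoint_url")
    if endpoint_url is not None:
        return endpoint_url
    client_kwargs = storage_options.get("client_kwargs") or {}
    endpoint_url = client_kwargs.get("endpoint_url")
    if endpoint_url is not None:
        return endpoint_url
    # last resort: derive the endpoint from the host part of the file URL
    return f"http://{urllib.parse.urlparse(filename).netloc}"

print(resolve_endpoint({"endpoint_url": "http://minio:9000"}, "s3://host/b/f.nc"))
# http://minio:9000
print(resolve_endpoint({"client_kwargs": {"endpoint_url": "http://minio:9000"}}, "s3://host/b/f.nc"))
# http://minio:9000
print(resolve_endpoint({"anon": True}, "https://object.store.example/bucketX/fileY.nc"))
# http://object.store.example
```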
@@ -406,22 +449,44 @@ def _process_chunk(self, session, fsref, chunk_coords, chunk_selection, counts,
         key = f"{self.ncvar}/{coord}"
         rfile, offset, size = tuple(fsref[key])

+        # S3: pass in pre-configured storage options (credentials)
         if self.storage_type == "s3":
             parsed_url = urllib.parse.urlparse(rfile)
             bucket = parsed_url.netloc
             object = parsed_url.path
             # FIXME: We do not get the correct byte order on the Zarr Array's dtype
             # when using S3, so use the value captured earlier.
             dtype = self._dtype
-            tmp, count = reductionist.reduce_chunk(session, S3_ACTIVE_STORAGE_URL,
-                                                   S3_URL,
-                                                   bucket, object, offset,
-                                                   size, compressor, filters,
-                                                   missing, dtype,
-                                                   self.zds._chunks,
-                                                   self.zds._order,
-                                                   chunk_selection,
-                                                   operation=self._method)
+            if self.storage_options is None:
+                tmp, count = reductionist.reduce_chunk(session,
+                                                       S3_ACTIVE_STORAGE_URL,
+                                                       S3_URL,
+                                                       bucket, object, offset,
+                                                       size, compressor, filters,
+                                                       missing, dtype,
+                                                       self.zds._chunks,
+                                                       self.zds._order,
+                                                       chunk_selection,
+                                                       operation=self._method)
+            else:
+                # special case for "anon=True" buckets that work only with e.g.
+                # fs = s3fs.S3FileSystem(anon=True, client_kwargs={'endpoint_url': S3_URL})
+                # where file uri = bucketX/fileY.mc
+                print("S3 Storage options to Reductionist:", self.storage_options)
+                if self.storage_options.get("anon", None) == True:
+                    bucket = os.path.dirname(parsed_url.path)  # bucketX
+                    object = os.path.basename(parsed_url.path)  # fileY
+                    print("S3 anon=True Bucket and File:", bucket, object)
+                tmp, count = reductionist.reduce_chunk(session,
+                                                       self.active_storage_url,
+                                                       self._get_endpoint_url(),
+                                                       bucket, object, offset,
+                                                       size, compressor, filters,
+                                                       missing, dtype,
+                                                       self.zds._chunks,
+                                                       self.zds._order,
+                                                       chunk_selection,
+                                                       operation=self._method)
         else:
             # note there is an ongoing discussion about this interface, and what it returns
             # see https://github.com/valeriupredoi/PyActiveStorage/issues/33
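To make the `anon=True` branch above concrete: for a chunk reference like `bucketX/fileY.nc` the default netloc/path split yields an empty bucket, which is why the code re-derives bucket and object from the path. A quick illustration (the reference string is a placeholder):

```python
import os
import urllib.parse

# Placeholder chunk reference as it might appear for a public (anon=True) bucket
rfile = "bucketX/fileY.nc"
parsed_url = urllib.parse.urlparse(rfile)

# Default split used for credentialed buckets: netloc / path
print(repr(parsed_url.netloc), repr(parsed_url.path))   # '' 'bucketX/fileY.nc'

# anon=True split used above: bucket from dirname, object from basename
bucket = os.path.dirname(parsed_url.path)    # 'bucketX'
obj = os.path.basename(parsed_url.path)      # 'fileY.nc'
print(bucket, obj)
```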