@@ -111,19 +111,20 @@ def __new__(cls, *args, **kwargs):
111
111
"""Store reduction methods."""
112
112
instance = super ().__new__ (cls )
113
113
instance ._methods = {
114
- "min" : np .min ,
115
- "max" : np .max ,
116
- "sum" : np .sum ,
114
+ "min" : np .ma . min ,
115
+ "max" : np .ma . max ,
116
+ "sum" : np .ma . sum ,
117
117
# For the unweighted mean we calculate the sum and divide
118
118
# by the number of non-missing elements
119
- "mean" : np .sum ,
119
+ "mean" : np .ma . sum ,
120
120
}
121
121
return instance
122
122
123
123
def __init__ (
124
124
self ,
125
125
uri ,
126
126
ncvar ,
127
+ axis = None ,
127
128
storage_type = None ,
128
129
max_threads = 100 ,
129
130
storage_options = None ,
@@ -161,6 +162,16 @@ def __init__(
161
162
if self .ncvar is None :
162
163
raise ValueError ("Must set a netCDF variable name to slice" )
163
164
165
+ # Parse axis (note, if axis is None then we'll work out how
166
+ # many dimensions there are at the time of an active
167
+ # __getitem__ call).
168
+ if axis is not None :
169
+ if isinstance (axis , int ):
170
+ axis = (axis ,)
171
+ else :
172
+ axis = tuple (axis )
173
+
174
+ self ._axis = axis
164
175
self ._version = 1
165
176
self ._components = False
166
177
self ._method = None
@@ -293,7 +304,7 @@ def _get_active(self, method, *args):
293
304
an array returned via getitem.
294
305
"""
295
306
raise NotImplementedError
296
-
307
+
297
308
@_metricise
298
309
def _get_selection (self , * args ):
299
310
"""
@@ -311,6 +322,9 @@ def _get_selection(self, *args):
311
322
array = pyfive .indexing .ZarrArrayStub (self .ds .shape , self .ds .chunks )
312
323
ds = self .ds .id
313
324
325
+ if self ._axis is None :
326
+ self ._axis = tuple (range (len (ds .shape )))
327
+
314
328
self .metric_data ['args' ] = args
315
329
self .metric_data ['dataset shape' ] = self .ds .shape
316
330
self .metric_data ['dataset chunks' ] = self .ds .chunks
@@ -324,20 +338,30 @@ def _get_selection(self, *args):
324
338
#stripped_indexer = [(a, b, c) for a,b,c in indexer]
325
339
drop_axes = indexer .drop_axes and keepdims
326
340
327
- # we use array._chunks rather than ds.chunks, as the latter is none in the case of
328
- # unchunked data, and we need to tell the storage the array dimensions in this case.
329
- return self ._from_storage (ds , indexer , array ._chunks , out_shape , dtype , compressor , filters , drop_axes )
341
+ # we use array._chunks rather than ds.chunks, as the latter is
342
+ # none in the case of unchunked data, and we need to tell the
343
+ # storage the array dimensions in this case.
344
+ return self ._from_storage (ds , indexer , array ._chunks , out_shape , dtype , compressor , filters , drop_axes , self ._axis )
330
345
331
- def _from_storage (self , ds , indexer , chunks , out_shape , out_dtype , compressor , filters , drop_axes ):
346
+ def _from_storage (self , ds , indexer , chunks , out_shape , out_dtype , compressor , filters , drop_axes , axis ):
332
347
method = self .method
333
-
348
+ need_counts = self .components or self ._method == "mean"
349
+
334
350
if method is not None :
335
- out = []
336
- counts = []
351
+ # Replace the size of each reduced axis with the number of
352
+ # chunks along that axis
353
+ out_shape = list (out_shape )
354
+ for i in axis :
355
+ out_shape [i ] = indexer .dim_indexers [i ].nchunks
356
+
357
+ out = np .ma .empty (out_shape , dtype = out_dtype , order = ds ._order )
358
+ if need_counts :
359
+ counts = np .ma .empty (
360
+ out_shape , dtype = out_dtype , order = ds ._order
361
+ )
337
362
else :
338
363
out = np .empty (out_shape , dtype = out_dtype , order = ds ._order )
339
- counts = None # should never get touched with no method!
340
-
364
+
341
365
# Create a shared session object.
342
366
if self .storage_type == "s3" and self ._version == 2 :
343
367
if self .storage_options is not None :
@@ -378,29 +402,32 @@ def _from_storage(self, ds, indexer, chunks, out_shape, out_dtype, compressor, f
378
402
future = executor .submit (
379
403
self ._process_chunk ,
380
404
session , ds , chunks , chunk_coords , chunk_selection ,
381
- counts , out_selection , compressor , filters , drop_axes = drop_axes )
405
+ out_selection , compressor , filters , drop_axes = drop_axes )
382
406
futures .append (future )
407
+
383
408
# Wait for completion.
384
409
for future in concurrent .futures .as_completed (futures ):
385
410
try :
386
- result = future .result ()
411
+ result , count , out_selection = future .result ()
387
412
except Exception as exc :
388
413
raise
389
- else :
390
- chunk_count += 1
391
- if method is not None :
392
- result , count = result
393
- out .append (result )
394
- counts .append (count )
395
- else :
396
- # store selected data in output
397
- result , selection = result
398
- out [selection ] = result
414
+
415
+ chunk_count += 1
416
+
417
+ # Store the selected data
418
+ out [out_selection ] = result
419
+
420
+ # Store the counts for the selected data
421
+ if need_counts :
422
+ counts [out_selection ] = count
399
423
400
424
if method is not None :
401
425
# Apply the method (again) to aggregate the result
402
- out = method (out )
403
- shape1 = (1 ,) * len (out_shape )
426
+ out = method (out , axis = axis , keepdims = True )
427
+
428
+ # Aggregate the counts
429
+ if need_counts :
430
+ n = np .ma .sum (counts , axis = axis , keepdims = True )
404
431
405
432
if self ._components :
406
433
# Return a dictionary of components containing the
@@ -415,9 +442,6 @@ def _from_storage(self, ds, indexer, chunks, out_shape, out_dtype, compressor, f
415
442
# reductions require the per-dask-chunk partial
416
443
# reductions to retain these dimensions so that
417
444
# partial results can be concatenated correctly.)
418
- out = out .reshape (shape1 )
419
-
420
- n = np .sum (counts ).reshape (shape1 )
421
445
if self ._method == "mean" :
422
446
# For the average, the returned component is
423
447
# "sum", not "mean"
@@ -431,7 +455,11 @@ def _from_storage(self, ds, indexer, chunks, out_shape, out_dtype, compressor, f
431
455
# For the average, it is actually the sum that has
432
456
# been created, so we need to divide by the sample
433
457
# size.
434
- out = out / np .sum (counts ).reshape (shape1 )
458
+ #
459
+ # Note: It's OK if an element of 'n' is zero,
460
+ # because it will necessarily correspond to
461
+ # a masked value in 'out'.
462
+ out = out / n
435
463
436
464
t2 = time .time ()
437
465
self .metric_data ['reduction time (s)' ] = t2 - t1
@@ -453,24 +481,23 @@ def _get_endpoint_url(self):
453
481
454
482
return f"http://{ urllib .parse .urlparse (self .filename ).netloc } "
455
483
456
- def _process_chunk (self , session , ds , chunks , chunk_coords , chunk_selection , counts ,
484
+ def _process_chunk (self , session , ds , chunks , chunk_coords , chunk_selection ,
457
485
out_selection , compressor , filters , drop_axes = None ):
458
486
"""
459
487
Obtain part or whole of a chunk.
460
488
461
489
This is done by taking binary data from storage and filling
462
490
the output array.
463
491
464
- Note the need to use counts for some methods
465
- #FIXME: Do, we, it's not actually used?
466
-
467
492
"""
468
493
469
494
# retrieve coordinates from chunk index
470
495
storeinfo = ds .get_chunk_info_from_chunk_coord (chunk_coords )
471
496
offset , size = storeinfo .byte_offset , storeinfo .size
472
497
self .data_read += size
473
498
499
+ axis = self ._axis
500
+
474
501
if self .storage_type == 's3' and self ._version == 1 :
475
502
476
503
tmp , count = reduce_opens3_chunk (ds ._fh , offset , size , compressor , filters ,
@@ -483,6 +510,7 @@ def _process_chunk(self, session, ds, chunks, chunk_coords, chunk_selection, cou
483
510
# S3: pass in pre-configured storage options (credentials)
484
511
# print("S3 rfile is:", self.filename)
485
512
parsed_url = urllib .parse .urlparse (self .filename )
513
+
486
514
bucket = parsed_url .netloc
487
515
object = parsed_url .path
488
516
@@ -504,6 +532,7 @@ def _process_chunk(self, session, ds, chunks, chunk_coords, chunk_selection, cou
504
532
chunks ,
505
533
ds ._order ,
506
534
chunk_selection ,
535
+ axis ,
507
536
operation = self ._method )
508
537
else :
509
538
# special case for "anon=True" buckets that work only with e.g.
@@ -523,6 +552,7 @@ def _process_chunk(self, session, ds, chunks, chunk_coords, chunk_selection, cou
523
552
chunks ,
524
553
ds ._order ,
525
554
chunk_selection ,
555
+ axis ,
526
556
operation = self ._method )
527
557
elif self .storage_type == 'ActivePosix' and self .version == 2 :
528
558
# This is where the DDN Fuse and Infinia wrappers go
@@ -532,17 +562,38 @@ def _process_chunk(self, session, ds, chunks, chunk_coords, chunk_selection, cou
532
562
# see https://github.com/valeriupredoi/PyActiveStorage/issues/33
533
563
# so neither the returned data or the interface should be considered stable
534
564
# although we will version changes.
565
+
535
566
tmp , count = reduce_chunk (self .filename , offset , size , compressor , filters ,
536
567
self .missing , ds .dtype ,
537
568
chunks , ds ._order ,
538
- chunk_selection , method = self .method )
539
-
569
+ chunk_selection , axis , method = self .method )
570
+
540
571
if self .method is not None :
541
- return tmp , count
572
+ # Replace the index corresponding to each reduced axis
573
+ # with its size-1 position in chunk-space.
574
+ #
575
+ # E.g. if 'out_selection' is (slice(0,12), slice(20,60)),
576
+ # 'chunk_coord' is (1, 3), and 'axis' is (1,); then
577
+ # 'out_selection' will become (slice(0,12),
578
+ # slice(3,4)). If 'axis' were instead (0, 1) then
579
+ # 'out_selection' would become (slice(1,2),
580
+ # slice(3,4)).
581
+ #
582
+ # This makes sure that 'out_selection' puts 'tmp' in the
583
+ # correct place of the numpy array defined by the method
584
+ # that collates the 'tmp's for each chunk (currently
585
+ # `_from_storage`).
586
+ out_selection = list (out_selection )
587
+ for i in axis :
588
+ n = chunk_coords [i ]
589
+ out_selection [i ] = slice (n , n + 1 )
590
+
591
+ return tmp , count , tuple (out_selection )
542
592
else :
543
593
if drop_axes :
544
594
tmp = np .squeeze (tmp , axis = drop_axes )
545
- return tmp , out_selection
595
+
596
+ return tmp , None , out_selection
546
597
547
598
def _mask_data (self , data ):
548
599
"""
0 commit comments