Skip to content

Commit 549d89a

Browse files
committed
axis subsets
1 parent bd47be8 commit 549d89a

File tree

3 files changed

+124
-56
lines changed

3 files changed

+124
-56
lines changed

activestorage/active.py

+91-40
Original file line numberDiff line numberDiff line change
@@ -111,19 +111,20 @@ def __new__(cls, *args, **kwargs):
111111
"""Store reduction methods."""
112112
instance = super().__new__(cls)
113113
instance._methods = {
114-
"min": np.min,
115-
"max": np.max,
116-
"sum": np.sum,
114+
"min": np.ma.min,
115+
"max": np.ma.max,
116+
"sum": np.ma.sum,
117117
# For the unweighted mean we calculate the sum and divide
118118
# by the number of non-missing elements
119-
"mean": np.sum,
119+
"mean": np.ma.sum,
120120
}
121121
return instance
122122

123123
def __init__(
124124
self,
125125
uri,
126126
ncvar,
127+
axis=None,
127128
storage_type=None,
128129
max_threads=100,
129130
storage_options=None,
@@ -161,6 +162,16 @@ def __init__(
161162
if self.ncvar is None:
162163
raise ValueError("Must set a netCDF variable name to slice")
163164

165+
# Parse axis (note, if axis is None then we'll work out how
166+
# many dimensions there are at the time of an active
167+
# __getitem__ call).
168+
if axis is not None:
169+
if isinstance(axis, int):
170+
axis = (axis,)
171+
else:
172+
axis = tuple(axis)
173+
174+
self._axis = axis
164175
self._version = 1
165176
self._components = False
166177
self._method = None
@@ -293,7 +304,7 @@ def _get_active(self, method, *args):
293304
an array returned via getitem.
294305
"""
295306
raise NotImplementedError
296-
307+
297308
@_metricise
298309
def _get_selection(self, *args):
299310
"""
@@ -311,6 +322,9 @@ def _get_selection(self, *args):
311322
array = pyfive.indexing.ZarrArrayStub(self.ds.shape, self.ds.chunks)
312323
ds = self.ds.id
313324

325+
if self._axis is None:
326+
self._axis = tuple(range(len(ds.shape)))
327+
314328
self.metric_data['args'] = args
315329
self.metric_data['dataset shape'] = self.ds.shape
316330
self.metric_data['dataset chunks'] = self.ds.chunks
@@ -324,20 +338,30 @@ def _get_selection(self, *args):
324338
#stripped_indexer = [(a, b, c) for a,b,c in indexer]
325339
drop_axes = indexer.drop_axes and keepdims
326340

327-
# we use array._chunks rather than ds.chunks, as the latter is none in the case of
328-
# unchunked data, and we need to tell the storage the array dimensions in this case.
329-
return self._from_storage(ds, indexer, array._chunks, out_shape, dtype, compressor, filters, drop_axes)
341+
# we use array._chunks rather than ds.chunks, as the latter is
342+
# none in the case of unchunked data, and we need to tell the
343+
# storage the array dimensions in this case.
344+
return self._from_storage(ds, indexer, array._chunks, out_shape, dtype, compressor, filters, drop_axes, self._axis)
330345

331-
def _from_storage(self, ds, indexer, chunks, out_shape, out_dtype, compressor, filters, drop_axes):
346+
def _from_storage(self, ds, indexer, chunks, out_shape, out_dtype, compressor, filters, drop_axes, axis):
332347
method = self.method
333-
348+
need_counts = self.components or self._method == "mean"
349+
334350
if method is not None:
335-
out = []
336-
counts = []
351+
# Replace the size of each reduced axis with the number of
352+
# chunks along that axis
353+
out_shape = list(out_shape)
354+
for i in axis:
355+
out_shape[i] = indexer.dim_indexers[i].nchunks
356+
357+
out = np.ma.empty(out_shape, dtype=out_dtype, order=ds._order)
358+
if need_counts:
359+
counts = np.ma.empty(
360+
out_shape, dtype=out_dtype, order=ds._order
361+
)
337362
else:
338363
out = np.empty(out_shape, dtype=out_dtype, order=ds._order)
339-
counts = None # should never get touched with no method!
340-
364+
341365
# Create a shared session object.
342366
if self.storage_type == "s3" and self._version==2:
343367
if self.storage_options is not None:
@@ -378,29 +402,32 @@ def _from_storage(self, ds, indexer, chunks, out_shape, out_dtype, compressor, f
378402
future = executor.submit(
379403
self._process_chunk,
380404
session, ds, chunks, chunk_coords, chunk_selection,
381-
counts, out_selection, compressor, filters, drop_axes=drop_axes)
405+
out_selection, compressor, filters, drop_axes=drop_axes)
382406
futures.append(future)
407+
383408
# Wait for completion.
384409
for future in concurrent.futures.as_completed(futures):
385410
try:
386-
result = future.result()
411+
result, count, out_selection = future.result()
387412
except Exception as exc:
388413
raise
389-
else:
390-
chunk_count +=1
391-
if method is not None:
392-
result, count = result
393-
out.append(result)
394-
counts.append(count)
395-
else:
396-
# store selected data in output
397-
result, selection = result
398-
out[selection] = result
414+
415+
chunk_count += 1
416+
417+
# Store the selected data
418+
out[out_selection] = result
419+
420+
# Store the counts for the selected data
421+
if need_counts:
422+
counts[out_selection] = count
399423

400424
if method is not None:
401425
# Apply the method (again) to aggregate the result
402-
out = method(out)
403-
shape1 = (1,) * len(out_shape)
426+
out = method(out, axis=axis, keepdims=True)
427+
428+
# Aggregate the counts
429+
if need_counts:
430+
n = np.ma.sum(counts, axis=axis, keepdims=True)
404431

405432
if self._components:
406433
# Return a dictionary of components containing the
@@ -415,9 +442,6 @@ def _from_storage(self, ds, indexer, chunks, out_shape, out_dtype, compressor, f
415442
# reductions require the per-dask-chunk partial
416443
# reductions to retain these dimensions so that
417444
# partial results can be concatenated correctly.)
418-
out = out.reshape(shape1)
419-
420-
n = np.sum(counts).reshape(shape1)
421445
if self._method == "mean":
422446
# For the average, the returned component is
423447
# "sum", not "mean"
@@ -431,7 +455,11 @@ def _from_storage(self, ds, indexer, chunks, out_shape, out_dtype, compressor, f
431455
# For the average, it is actually the sum that has
432456
# been created, so we need to divide by the sample
433457
# size.
434-
out = out / np.sum(counts).reshape(shape1)
458+
#
459+
# Note: It's OK if an element of 'n' is zero,
460+
# because it will necessarily correspond to
461+
# a masked value in 'out'.
462+
out = out / n
435463

436464
t2 = time.time()
437465
self.metric_data['reduction time (s)'] = t2-t1
@@ -453,24 +481,23 @@ def _get_endpoint_url(self):
453481

454482
return f"http://{urllib.parse.urlparse(self.filename).netloc}"
455483

456-
def _process_chunk(self, session, ds, chunks, chunk_coords, chunk_selection, counts,
484+
def _process_chunk(self, session, ds, chunks, chunk_coords, chunk_selection,
457485
out_selection, compressor, filters, drop_axes=None):
458486
"""
459487
Obtain part or whole of a chunk.
460488
461489
This is done by taking binary data from storage and filling
462490
the output array.
463491
464-
Note the need to use counts for some methods
465-
#FIXME: Do, we, it's not actually used?
466-
467492
"""
468493

469494
# retrieve coordinates from chunk index
470495
storeinfo = ds.get_chunk_info_from_chunk_coord(chunk_coords)
471496
offset, size = storeinfo.byte_offset, storeinfo.size
472497
self.data_read += size
473498

499+
axis = self._axis
500+
474501
if self.storage_type == 's3' and self._version == 1:
475502

476503
tmp, count = reduce_opens3_chunk(ds._fh, offset, size, compressor, filters,
@@ -483,6 +510,7 @@ def _process_chunk(self, session, ds, chunks, chunk_coords, chunk_selection, cou
483510
# S3: pass in pre-configured storage options (credentials)
484511
# print("S3 rfile is:", self.filename)
485512
parsed_url = urllib.parse.urlparse(self.filename)
513+
486514
bucket = parsed_url.netloc
487515
object = parsed_url.path
488516

@@ -504,6 +532,7 @@ def _process_chunk(self, session, ds, chunks, chunk_coords, chunk_selection, cou
504532
chunks,
505533
ds._order,
506534
chunk_selection,
535+
axis,
507536
operation=self._method)
508537
else:
509538
# special case for "anon=True" buckets that work only with e.g.
@@ -523,6 +552,7 @@ def _process_chunk(self, session, ds, chunks, chunk_coords, chunk_selection, cou
523552
chunks,
524553
ds._order,
525554
chunk_selection,
555+
axis,
526556
operation=self._method)
527557
elif self.storage_type=='ActivePosix' and self.version==2:
528558
# This is where the DDN Fuse and Infinia wrappers go
@@ -532,17 +562,38 @@ def _process_chunk(self, session, ds, chunks, chunk_coords, chunk_selection, cou
532562
# see https://github.com/valeriupredoi/PyActiveStorage/issues/33
533563
# so neither the returned data or the interface should be considered stable
534564
# although we will version changes.
565+
535566
tmp, count = reduce_chunk(self.filename, offset, size, compressor, filters,
536567
self.missing, ds.dtype,
537568
chunks, ds._order,
538-
chunk_selection, method=self.method)
539-
569+
chunk_selection, axis, method=self.method)
570+
540571
if self.method is not None:
541-
return tmp, count
572+
# Replace the index corresponding to each reduced axis
573+
# with its size-1 position in chunk-space.
574+
#
575+
# E.g. if 'out_selection' is (slice(0,12), slice(20,60)),
576+
# 'chunk_coord' is (1, 3), and 'axis' is (1,); then
577+
# 'out_selection' will become (slice(0,12),
578+
# slice(3,4)). If 'axis' were instead (0, 1) then
579+
# 'out_selection' would become (slice(1,2),
580+
# slice(3,4)).
581+
#
582+
# This makes sure that 'out_selection' puts 'tmp' in the
583+
# correct place of the numpy array defined by the method
584+
# that collates the 'tmp's for each chunk (currently
585+
# `_from_storage`).
586+
out_selection = list(out_selection)
587+
for i in axis:
588+
n = chunk_coords[i]
589+
out_selection[i] = slice(n, n+1)
590+
591+
return tmp, count, tuple(out_selection)
542592
else:
543593
if drop_axes:
544594
tmp = np.squeeze(tmp, axis=drop_axes)
545-
return tmp, out_selection
595+
596+
return tmp, None, out_selection
546597

547598
def _mask_data(self, data):
548599
"""

activestorage/reductionist.py

+7-4
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ def get_session(username: str, password: str, cacert: typing.Optional[str]) -> r
2727

2828
def reduce_chunk(session, server, source, bucket, object,
2929
offset, size, compression, filters, missing, dtype, shape,
30-
order, chunk_selection, operation):
30+
order, chunk_selection, axis, operation):
3131
"""Perform a reduction on a chunk using Reductionist.
3232
3333
:param server: Reductionist server URL
@@ -49,12 +49,13 @@ def reduce_chunk(session, server, source, bucket, object,
4949
1), slice(1, 3, 1), slice(0, 1, 1))
5050
this defines the part of the chunk which is to be
5151
obtained or operated upon.
52+
:param axis: tuple of the axes to reduce (non-negative integers)
5253
:param operation: name of operation to perform
5354
:returns: the reduced data as a numpy array or scalar
5455
:raises ReductionistError: if the request to Reductionist fails
5556
"""
5657

57-
request_data = build_request_data(source, bucket, object, offset, size, compression, filters, missing, dtype, shape, order, chunk_selection)
58+
request_data = build_request_data(source, bucket, object, offset, size, compression, filters, missing, dtype, shape, order, chunk_selection, axis)
5859
if DEBUG:
5960
print(f"Reductionist request data dictionary: {request_data}")
6061
api_operation = "sum" if operation == "mean" else operation or "select"
@@ -134,7 +135,7 @@ def encode_missing(missing):
134135

135136
def build_request_data(source: str, bucket: str, object: str, offset: int,
136137
size: int, compression, filters, missing, dtype, shape,
137-
order, selection) -> dict:
138+
order, selection, axis) -> dict:
138139
"""Build request data for Reductionist API."""
139140
request_data = {
140141
'source': source,
@@ -145,6 +146,7 @@ def build_request_data(source: str, bucket: str, object: str, offset: int,
145146
'offset': int(offset),
146147
'size': int(size),
147148
'order': order,
149+
'axis': axis,
148150
}
149151
if shape:
150152
request_data["shape"] = shape
@@ -178,7 +180,8 @@ def decode_result(response):
178180
shape = json.loads(response.headers['x-activestorage-shape'])
179181
result = np.frombuffer(response.content, dtype=dtype)
180182
result = result.reshape(shape)
181-
count = json.loads(response.headers['x-activestorage-count'])
183+
count = json.loads(response.headers['x-activestorage-count']) # TODO this is wrong for now!
184+
count = np.frombuffer(response.content, dtype=dtype) # TODO this is wrong for now!
182185
return result, count
183186

184187

activestorage/storage.py

+26-12
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,7 @@
33

44
from numcodecs.compat import ensure_ndarray
55

6-
def reduce_chunk(rfile,
7-
offset, size, compression, filters, missing, dtype, shape,
8-
order, chunk_selection, method=None):
6+
def reduce_chunk(rfile, offset, size, compression, filters, missing, dtype, shape, order, chunk_selection, axis, method=None):
97
""" We do our own read of chunks and decoding etc
108
119
rfile - the actual file with the data
@@ -20,6 +18,7 @@ def reduce_chunk(rfile,
2018
(slice(0, 2, 1), slice(1, 3, 1), slice(0, 1, 1))
2119
this defines the part of the chunk which is to be obtained
2220
or operated upon.
21+
axis - tuple of the axes to reduce (non-negative integers)
2322
method - computation desired
2423
(in this Python version it's an actual method, in
2524
storage implementations we'll change to controlled vocabulary)
@@ -41,18 +40,15 @@ def reduce_chunk(rfile,
4140
chunk = chunk.reshape(shape, order=order)
4241

4342
tmp = chunk[chunk_selection]
43+
tmp = mask_missing(tmp, missing)
44+
4445
if method:
45-
if missing != (None, None, None, None):
46-
tmp = remove_missing(tmp, missing)
47-
# Check on size of tmp; method(empty) fails or gives incorrect
48-
# results
49-
if tmp.size:
50-
return method(tmp), tmp.size
51-
else:
52-
return tmp, 0
46+
N = np.ma.count(tmp, axis=axis, keepdims=True)
47+
tmp = method(tmp, axis=axis, keepdims=True)
5348
else:
54-
return tmp, None
49+
N = None
5550

51+
return tmp, N
5652

5753
def filter_pipeline(chunk, compression, filters):
5854
"""
@@ -73,6 +69,24 @@ def filter_pipeline(chunk, compression, filters):
7369
return chunk
7470

7571

72+
def mask_missing(data, missing):
73+
"""
74+
As we are using numpy, we can use a masked array; storage implementations
75+
will have to do this by hand
76+
"""
77+
fill_value, missing_value, valid_min, valid_max = missing
78+
79+
if fill_value:
80+
data = np.ma.masked_equal(data, fill_value)
81+
if missing_value:
82+
data = np.ma.masked_equal(data, missing_value)
83+
if valid_max:
84+
data = np.ma.masked_greater(data, valid_max)
85+
if valid_min:
86+
data = np.ma.masked_less(data, valid_min)
87+
88+
return data
89+
7690
def remove_missing(data, missing):
7791
"""
7892
As we are using numpy, we can use a masked array; storage implementations

0 commit comments

Comments
 (0)