Enable https reduction (off NGINX server only) and auto-detect storage_type #245

Merged: 32 commits, merged on Mar 20, 2025

Commits (32)
42c5cd4
start implementing https functionality
valeriupredoi Mar 10, 2025
3c3325f
add dedicated https test module
valeriupredoi Mar 10, 2025
b0c9684
working test with local reduction
valeriupredoi Mar 10, 2025
61bf171
first working https prototype
valeriupredoi Mar 10, 2025
3f9d619
openhttps file
valeriupredoi Mar 10, 2025
2fd00f0
test for actual https file
valeriupredoi Mar 10, 2025
c1d7336
add checker s3 func for uri
valeriupredoi Mar 11, 2025
00c47e0
auto-detect storage type
valeriupredoi Mar 11, 2025
33dae93
add two auto-storage type tests
valeriupredoi Mar 11, 2025
16fdb0a
account for s3 gubbins
valeriupredoi Mar 11, 2025
fde60eb
correct test
valeriupredoi Mar 11, 2025
332b599
add test case for https and reductionist
valeriupredoi Mar 11, 2025
4116abc
add test case for https and reductionist
valeriupredoi Mar 11, 2025
70dc84e
shorten line
valeriupredoi Mar 11, 2025
c5f4d62
add extra kwarg to request_data
valeriupredoi Mar 12, 2025
bd8b617
correct test in light of this
valeriupredoi Mar 12, 2025
2acfd47
run GHA with -n 2
valeriupredoi Mar 12, 2025
3d2d1fe
add to real s3 test case
valeriupredoi Mar 12, 2025
6c251a1
add note docstring
valeriupredoi Mar 12, 2025
2b4eef1
skip a couple longer tests and add test
valeriupredoi Mar 12, 2025
05baa1b
add corner case
valeriupredoi Mar 12, 2025
e259ace
test corner case
valeriupredoi Mar 12, 2025
9ef3e4d
add yet another corner case
valeriupredoi Mar 12, 2025
45cf1a7
tests for that
valeriupredoi Mar 12, 2025
6a450c7
Merge branch 'pyfive' into enable_http_reduction
valeriupredoi Mar 13, 2025
280f70d
fix missing kwarg
valeriupredoi Mar 13, 2025
fe057f7
Merge branch 'main' into enable_http_reduction
valeriupredoi Mar 14, 2025
aaf5e9e
cleanup wkfls and add pyfive installation for conda lock install flow
valeriupredoi Mar 14, 2025
f43a430
unrun GHA
valeriupredoi Mar 14, 2025
b0285b4
Merge branch 'main' into enable_http_reduction
valeriupredoi Mar 14, 2025
7cd1567
GHA run all pythons for s3 only at merge
valeriupredoi Mar 14, 2025
7891742
GHA s3 test one Python only
valeriupredoi Mar 14, 2025
7 changes: 7 additions & 0 deletions .github/workflows/install-from-condalock-file.yml
@@ -37,6 +37,13 @@ jobs:
      - run: which python
      - run: python -V
      - run: conda create --name activestorage-fromlock --file conda-linux-64.lock
      - name: Install development version of NCAS-CMS/Pyfive:wacasoft
        run: |
          cd ..
          git clone https://github.com/NCAS-CMS/pyfive.git
          cd pyfive
          git checkout wacasoft
          pip install -e .
      - run: which python
      - run: pip --version
      - run: pip install -e .
85 changes: 85 additions & 0 deletions .github/workflows/run-s3-test-push.yml
@@ -0,0 +1,85 @@
# adapted GA workflow from https://github.com/stackhpc/reductionist-rs
---
name: S3/Minio Test Latest Python

on:
  push:

# Required shell entrypoint to have properly configured bash shell
defaults:
  run:
    shell: bash -l {0}

jobs:
  linux-test:
    runs-on: "ubuntu-latest"
    strategy:
      matrix:
        python-version: ["3.13"]
      fail-fast: false
    name: Linux Python ${{ matrix.python-version }}
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - uses: conda-incubator/setup-miniconda@v3
        with:
          python-version: ${{ matrix.python-version }}
          miniforge-version: "latest"
          use-mamba: true
          mamba-version: "2.0.5"  # https://github.com/conda-incubator/setup-miniconda/issues/392
      - name: Get conda and Python versions
        run: |
          conda --version
          python -V
      - name: Export proxy
        run: |
          echo 'USE_S3 = True' >> activestorage/config.py
      - name: Start minio object storage
        run: tests/s3_exploratory/minio_scripts/minio-start
      - name: Wait for minio object storage to start
        run: |
          until curl -if http://localhost:9001; do
            sleep 1;
          done
      - name: Run Reductionist container
        run: docker run -it --detach --rm --net=host --name reductionist ghcr.io/stackhpc/reductionist-rs:latest
      - uses: conda-incubator/setup-miniconda@v3
        with:
          activate-environment: activestorage-minio
          environment-file: environment.yml
          python-version: ${{ matrix.python-version }}
          miniforge-version: "latest"
          use-mamba: true
          mamba-version: "2.0.5"  # https://github.com/conda-incubator/setup-miniconda/issues/392
      - name: Install development version of NCAS-CMS/Pyfive:wacasoft
        run: |
          cd ..
          git clone https://github.com/NCAS-CMS/pyfive.git
          cd pyfive
          git checkout wacasoft
          pip install -e .
      - name: Install PyActiveStorage
        run: |
          conda --version
          python -V
          which python
          pip install -e .
      - name: Run tests
        run: |
          pytest -n 2
      - name: Run S3 exploratory tests
        run: |
          pytest tests/s3_exploratory/test_s3_reduction.py --html=test-reports/s3-exploratory-report.html
        if: always()
      - name: Install pytest-monitor
        run: pip install pytest-monitor
      - name: Run S3 performance tests
        run: |
          pytest tests/s3_exploratory/test_s3_arrange_files.py
          pytest tests/s3_exploratory/test_s3_performance.py --db ../.pymon
      - name: Analyze S3 and local test performance
        run: python tests/s3_exploratory/parse_pymon.py
      - name: Stop minio object storage
        run: tests/s3_exploratory/minio_scripts/minio-stop
        if: always()
3 changes: 1 addition & 2 deletions .github/workflows/run-tests.yml
@@ -4,7 +4,6 @@ on:
  push:
    branches:
      - main
      - pyfive
  schedule:
    - cron: '0 0 * * *' # nightly

@@ -78,4 +77,4 @@ jobs:
      - run: conda list
      - run: mamba install -c conda-forge git
      - run: pip install -e .
      - run: pytest
      - run: pytest -n 2
6 changes: 2 additions & 4 deletions .github/workflows/test_s3_minio.yml
@@ -5,9 +5,7 @@ name: S3/Minio Test
on:
  push:
    branches:
      - main # keep this at all times
      - pyfive
  pull_request:
      - main
  schedule:
    - cron: '0 0 * * *' # nightly

@@ -73,7 +71,7 @@ jobs:
          pip install -e .
      - name: Run tests
        run: |
          pytest
          pytest -n 2
      - name: Run S3 exploratory tests
        run: |
          pytest tests/s3_exploratory/test_s3_reduction.py --html=test-reports/s3-exploratory-report.html
91 changes: 90 additions & 1 deletion activestorage/active.py
@@ -1,9 +1,11 @@
import concurrent.futures
import os
import fsspec
import numpy as np
import pathlib
import urllib
import pyfive
import requests
import s3fs
import time

@@ -17,6 +19,33 @@
from activestorage.hdf2numcodec import decode_filters


def return_storage_type(uri):
    """
    Extract the gateway-protocol header to infer the type of storage.
    """
    try:
        resp = requests.head(uri)
    except requests.exceptions.MissingSchema:  # eg local file
        return
    except requests.exceptions.InvalidSchema:  # eg Minio file s3://pyactivestorage/common_cl_a.nc
        if not uri.startswith("s3:"):
            return
        else:
            return "s3"
    except requests.exceptions.ConnectionError as exc:  # eg invalid link or offline
        print(exc)
        return
    response = resp.headers

    # https files on NGINX don't have "gateway-protocol" key
    if "gateway-protocol" in response:
        if response["gateway-protocol"] == "s3":
            print("Gateway protocol indicates S3 storage.")
            return "s3"
    else:
        return "https"
valeriupredoi (Collaborator, Author) commented:
@bnlawrence this is the bit I was on about on the call
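
For orientation, a minimal usage sketch of the new helper; the URIs are illustrative, the https case assumes the server is reachable, and return_storage_type is assumed importable from activestorage.active as added in this diff.

from activestorage.active import return_storage_type

# s3:// URI: requests.head() raises InvalidSchema, so the "s3:" prefix decides
print(return_storage_type("s3://pyactivestorage/common_cl_a.nc"))      # -> "s3"

# plain https file served by NGINX: headers carry no "gateway-protocol" key
print(return_storage_type("https://example.org/data/common_cl_a.nc"))  # -> "https"

# local POSIX path: requests.head() raises MissingSchema
print(return_storage_type("/tmp/common_cl_a.nc"))                      # -> None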

A collaborator replied:
That's great. I think we'd need to put the response to that in the json going to reductionist, rather than in the URI itself.
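
A minimal sketch of the request body that build_request_data() would then assemble (see the activestorage/reductionist.py changes below); the values are illustrative only, and Reductionist currently rejects the extra storage_type key, as the error quoted in the https test branch of active.py shows.

# Illustrative only: field names mirror build_request_data() in
# activestorage/reductionist.py; the URL, offset and size are made up.
request_data = {
    "source": "https://example.org/data/common_cl_a.nc",  # the https file itself
    "bucket": "https",        # dummy bucket, unused for https reads
    "object": "https://example.org/data/common_cl_a.nc",
    "dtype": "float32",
    "offset": 29956,
    "size": 8064,
    "order": "C",
    "storage_type": "https",  # the proposed extra field
}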



def load_from_s3(uri, storage_options=None):
    """
    Load a netCDF4-like object from S3.
@@ -50,6 +79,19 @@ def load_from_s3(uri, storage_options=None):
    return ds


def load_from_https(uri):
    """
    Load a pyfive.high_level.Dataset from a
    netCDF4 file on an https server (NGINX).
    """
    # TODO need to test if NGINX server behind https://
    fs = fsspec.filesystem('http')
    http_file = fs.open(uri, 'rb')
    ds = pyfive.File(http_file)
    print(f"Dataset loaded from https with Pyfive: {uri}")
    return ds


def get_missing_attributes(ds):
    """"
    Load all the missing attributes we need from a netcdf file
@@ -140,6 +182,24 @@ def __init__(
        self.ds = dataset
        self.uri = dataset

        # determine the storage_type
        # based on what we have available
        if not storage_type:
            if not input_variable:
                check_uri = self.uri
            else:
                check_uri = self.ds.id._filename

            # "special" case when we have to deal
            # with storage_options['client_kwargs']["endpoint_url"]
            if storage_options is not None and 'client_kwargs' in storage_options:
                if "endpoint_url" in storage_options['client_kwargs']:
                    base_url = storage_options['client_kwargs']["endpoint_url"]
                    if not input_variable:
                        check_uri = os.path.join(base_url, self.uri)
                    else:
                        check_uri = os.path.join(base_url, self.ds.id._filename)
            storage_type = return_storage_type(check_uri)

        # still allow for a passable storage_type
        # for special cases eg "special-POSIX" ie DDN
@@ -152,6 +212,8 @@ def __init__(
            self.filename = self.ds
        elif input_variable and self.storage_type == "s3":
            self.filename = self.ds.id._filename
        elif input_variable and self.storage_type == "https":
            self.filename = self.ds

        # get storage_options
        self.storage_options = storage_options
@@ -198,6 +260,8 @@ def __load_nc_file(self):
            nc = pyfive.File(self.uri)
        elif self.storage_type == "s3":
            nc = load_from_s3(self.uri, self.storage_options)
        elif self.storage_type == "https":
            nc = load_from_https(self.uri)
        self.filename = self.uri
        self.ds = nc[ncvar]

@@ -518,7 +582,7 @@ def _process_chunk(self, session, ds, chunks, chunk_coords, chunk_selection,
                method=self.method
            )

        elif self.storage_type == "s3" and self._version==2:
        elif self.storage_type == "s3" and self._version == 2:
            # S3: pass in pre-configured storage options (credentials)
            # print("S3 rfile is:", self.filename)
            parsed_url = urllib.parse.urlparse(self.filename)
@@ -567,6 +631,31 @@ def _process_chunk(self, session, ds, chunks, chunk_coords, chunk_selection,
                chunk_selection,
                axis,
                operation=self._method)
        # this is for testing ONLY until Reductionist is able to handle https
        # located files; after that, we can pipe any regular https file through
        # to Reductionist, provided the https server is "closer" to Reductionist
        elif self.storage_type == "https" and self._version == 2:
            # build a simple session
            session = requests.Session()
            session.auth = (None, None)
            session.verify = False
            bucket = "https"  # really doesn't matter

            # note the extra "storage_type" kwarg
            # this currently makes Reductionist throw a wobbly
            # E activestorage.reductionist.ReductionistError: Reductionist error: HTTP 400:
            #   {"error": {"message": "request data is not valid", "caused_by":
            #   ["Failed to deserialize the JSON body into the target type",
            #   "storage_type: unknown field `storage_type`, expected one of `source`, `bucket`,
            #   `object`, `dtype`, `byte_order`, `offset`, `size`, `shape`, `order`, `selection`,
            #   `compression`, `filters`, `missing` at line 1 column 550"]}}
            tmp, count = reductionist.reduce_chunk(session,
                                                   "https://192.171.169.113:8080",
                                                   self.filename,
                                                   bucket, self.filename, offset,
                                                   size, compressor, filters,
                                                   self.missing, np.dtype(ds.dtype),
                                                   chunks,
                                                   ds._order,
                                                   chunk_selection,
                                                   axis,
                                                   operation=self._method,
                                                   storage_type="https")
        elif self.storage_type=='ActivePosix' and self.version==2:
            # This is where the DDN Fuse and Infinia wrappers go
            raise NotImplementedError
10 changes: 7 additions & 3 deletions activestorage/reductionist.py
@@ -29,7 +29,7 @@ def get_session(username: str, password: str, cacert: typing.Optional[str]) -> r

def reduce_chunk(session, server, source, bucket, object,
                 offset, size, compression, filters, missing, dtype, shape,
                 order, chunk_selection, axis, operation):
                 order, chunk_selection, axis, operation, storage_type=None):
    """Perform a reduction on a chunk using Reductionist.
    :param server: Reductionist server URL
@@ -53,11 +53,14 @@ def reduce_chunk(session, server, source, bucket, object,
        obtained or operated upon.
    :param axis: tuple of the axes to be reduced (non-negative integers)
    :param operation: name of operation to perform
    :param storage_type: optional testing flag to allow HTTPS reduction
    :returns: the reduced data as a numpy array or scalar
    :raises ReductionistError: if the request to Reductionist fails
    """

    request_data = build_request_data(source, bucket, object, offset, size, compression, filters, missing, dtype, shape, order, chunk_selection, axis)
    request_data = build_request_data(source, bucket, object, offset, size, compression,
                                      filters, missing, dtype, shape, order, chunk_selection,
                                      axis, storage_type=storage_type)
    if DEBUG:
        print(f"Reductionist request data dictionary: {request_data}")
    api_operation = "sum" if operation == "mean" else operation or "select"
@@ -137,7 +140,7 @@ def encode_missing(missing):

def build_request_data(source: str, bucket: str, object: str, offset: int,
                       size: int, compression, filters, missing, dtype, shape,
                       order, selection, axis) -> dict:
                       order, selection, axis, storage_type=None) -> dict:
    """Build request data for Reductionist API."""
    request_data = {
        'source': source,
@@ -148,6 +151,7 @@ def build_request_data(source: str, bucket: str, object: str, offset: int,
        'offset': int(offset),
        'size': int(size),
        'order': order,
        'storage_type': storage_type,
    }
    if shape:
        request_data["shape"] = shape
40 changes: 28 additions & 12 deletions activestorage/storage.py
@@ -1,4 +1,5 @@
"""Active storage module."""
import fsspec
import numpy as np
import pyfive

@@ -35,18 +36,33 @@ def reduce_chunk(rfile,
        #FIXME: for the moment, open the file every time ... we might want to do that, or not
        # we could just use an instance of pyfive.high_level.Dataset.id
        # passed directly from active.py, as below
        with open(rfile,'rb') as open_file:
            # get the data
            chunk = read_block(open_file, offset, size)
            # reverse any compression and filters
            chunk = filter_pipeline(chunk, compression, filters)
            # make it a numpy array of bytes
            chunk = ensure_ndarray(chunk)
            # convert to the appropriate data type
            chunk = chunk.view(dtype)
            # sort out ordering and convert to the parent hyperslab dimensions
            chunk = chunk.reshape(-1, order='A')
            chunk = chunk.reshape(shape, order=order)
        try:
            with open(rfile,'rb') as open_file:
                # get the data
                chunk = read_block(open_file, offset, size)
                # reverse any compression and filters
                chunk = filter_pipeline(chunk, compression, filters)
                # make it a numpy array of bytes
                chunk = ensure_ndarray(chunk)
                # convert to the appropriate data type
                chunk = chunk.view(dtype)
                # sort out ordering and convert to the parent hyperslab dimensions
                chunk = chunk.reshape(-1, order='A')
                chunk = chunk.reshape(shape, order=order)
        except FileNotFoundError:  # could be an https file
            fs = fsspec.filesystem('http')
            with fs.open(rfile, 'rb') as open_file:
                # get the data
                chunk = read_block(open_file, offset, size)
                # reverse any compression and filters
                chunk = filter_pipeline(chunk, compression, filters)
                # make it a numpy array of bytes
                chunk = ensure_ndarray(chunk)
                # convert to the appropriate data type
                chunk = chunk.view(dtype)
                # sort out ordering and convert to the parent hyperslab dimensions
                chunk = chunk.reshape(-1, order='A')
                chunk = chunk.reshape(shape, order=order)
    else:
        class storeinfo: pass
        storeinfo.byte_offset = offset