From aaebad27a83433baeff5f76218de677d7d2158ae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Zolt=C3=A1n=20V=C3=B6r=C3=B6s?= <zvoros@gmail.com>
Date: Tue, 16 Feb 2021 07:31:16 +0100
Subject: [PATCH 01/19] add option to extend dtype

---
 code/ndarray.c                   | 224 ++++++------
 code/ndarray.h                   |  12 +-
 code/ndarray_operators.c         | 494 +++++++++++++--------------
 code/ndarray_operators.h         |  42 +--
 code/numpy/approx/approx.c       |  26 +-
 code/numpy/compare/compare.c     |  66 ++--
 code/numpy/fft/fft_tools.c       |   4 +-
 code/numpy/filter/filter.c       |   4 +-
 code/numpy/linalg/linalg.c       |  10 +-
 code/numpy/numerical/numerical.c |  98 +++---
 code/numpy/poly/poly.c           |   2 +-
 code/numpy/stats/stats.c         |   4 +-
 code/numpy/transform/transform.c |   4 +-
 code/numpy/vector/vector.c       |  18 +-
 code/scipy/signal/signal.c       |   4 +-
 code/ulab.h                      |   6 +
 code/ulab_create.c               |  10 +-
 code/user/user.c                 |  10 +-
 run-tests                        | 570 -------------------------------
 19 files changed, 526 insertions(+), 1082 deletions(-)
 delete mode 100755 run-tests

diff --git a/code/ndarray.c b/code/ndarray.c
index ad911a73..8b2ee707 100644
--- a/code/ndarray.c
+++ b/code/ndarray.c
@@ -321,15 +321,15 @@ void ndarray_dtype_print(const mp_print_t *print, mp_obj_t self_in, mp_print_kin
     (void)kind;
     dtype_obj_t *self = MP_OBJ_TO_PTR(self_in);
     mp_print_str(print, "dtype('");
-    if(self->dtype == NDARRAY_BOOLEAN) {
+    if(self->dtype.type == NDARRAY_BOOLEAN) {
         mp_print_str(print, "bool')");
-    } else if(self->dtype == NDARRAY_UINT8) {
+    } else if(self->dtype.type == NDARRAY_UINT8) {
         mp_print_str(print, "uint8')");
-    } else if(self->dtype == NDARRAY_INT8) {
+    } else if(self->dtype.type == NDARRAY_INT8) {
         mp_print_str(print, "int8')");
-    } else if(self->dtype == NDARRAY_UINT16) {
+    } else if(self->dtype.type == NDARRAY_UINT16) {
         mp_print_str(print, "uint16')");
-    } else if(self->dtype == NDARRAY_INT16) {
+    } else if(self->dtype.type == NDARRAY_INT16) {
         mp_print_str(print, "int16')");
     } else {
         #if MICROPY_FLOAT_IMPL == MICROPY_FLOAT_IMPL_FLOAT
@@ -358,7 +358,7 @@ mp_obj_t ndarray_dtype_make_new(const mp_obj_type_t *type, size_t n_args, size_t
     if(MP_OBJ_IS_TYPE(args[0], &ulab_ndarray_type)) {
         // return the dtype of the array
         ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(args[0]);
-        dtype->dtype = ndarray->dtype;
+        dtype->dtype.type = ndarray->dtype.type;
     } else {
         uint8_t _dtype;
         if(MP_OBJ_IS_INT(_args[0].u_obj)) {
@@ -384,7 +384,7 @@ mp_obj_t ndarray_dtype_make_new(const mp_obj_type_t *type, size_t n_args, size_t
                 mp_raise_TypeError(translate("data type not understood"));
             }
         }
-        dtype->dtype = _dtype;
+        dtype->dtype.type = _dtype;
     }
     return dtype;
 }
@@ -393,7 +393,7 @@ mp_obj_t ndarray_dtype(mp_obj_t self_in) {
     ndarray_obj_t *self = MP_OBJ_TO_PTR(self_in);
     dtype_obj_t *dtype = m_new_obj(dtype_obj_t);
     dtype->base.type = &ulab_dtype_type;
-    dtype->dtype = self->dtype;
+    dtype->dtype.type = self->dtype.type;
     return dtype;
 }
 
@@ -403,7 +403,7 @@ mp_obj_t ndarray_dtype(mp_obj_t self_in) {
     uint8_t dtype;
     if(MP_OBJ_IS_TYPE(self_in, &ulab_ndarray_type)) {
         ndarray_obj_t *self = MP_OBJ_TO_PTR(self_in);
-        dtype = self->dtype;
+        dtype = self->dtype.type;
     } else { // we assume here that the input is a single character
         GET_STR_DATA_LEN(self_in, _dtype, len);
         if((len != 1) || ((*_dtype != NDARRAY_BOOL) && (*_dtype != NDARRAY_UINT8)
@@ -451,7 +451,7 @@ MP_DEFINE_CONST_FUN_OBJ_0(ndarray_get_printoptions_obj, ndarray_get_printoptions
 mp_obj_t ndarray_get_item(ndarray_obj_t *ndarray, void *array) {
     // returns a proper micropython object from an array
     if(!ndarray->boolean) {
-        return mp_binary_get_val_array(ndarray->dtype, array, 0);
+        return mp_binary_get_val_array(ndarray->dtype.type, array, 0);
     } else {
         if(*(uint8_t *)array) {
             return mp_const_true;
@@ -547,13 +547,13 @@ void ndarray_print(const mp_print_t *print, mp_obj_t self_in, mp_print_kind_t ki
     #endif
     if(self->boolean) {
         mp_print_str(print, ", dtype=bool)");
-    } else if(self->dtype == NDARRAY_UINT8) {
+    } else if(self->dtype.type == NDARRAY_UINT8) {
         mp_print_str(print, ", dtype=uint8)");
-    } else if(self->dtype == NDARRAY_INT8) {
+    } else if(self->dtype.type == NDARRAY_INT8) {
         mp_print_str(print, ", dtype=int8)");
-    } else if(self->dtype == NDARRAY_UINT16) {
+    } else if(self->dtype.type == NDARRAY_UINT16) {
         mp_print_str(print, ", dtype=uint16)");
-    } else if(self->dtype == NDARRAY_INT16) {
+    } else if(self->dtype.type == NDARRAY_INT16) {
         mp_print_str(print, ", dtype=int16)");
     } else {
         #if MICROPY_FLOAT_IMPL == MICROPY_FLOAT_IMPL_FLOAT
@@ -600,14 +600,14 @@ ndarray_obj_t *ndarray_new_ndarray(uint8_t ndim, size_t *shape, int32_t *strides
     // Creates the base ndarray with shape, and initialises the values to straight 0s
     ndarray_obj_t *ndarray = m_new_obj(ndarray_obj_t);
     ndarray->base.type = &ulab_ndarray_type;
-    ndarray->dtype = dtype == NDARRAY_BOOL ? NDARRAY_UINT8 : dtype;
+    ndarray->dtype.type = dtype == NDARRAY_BOOL ? NDARRAY_UINT8 : dtype;
     ndarray->boolean = dtype == NDARRAY_BOOL ? NDARRAY_BOOLEAN : NDARRAY_NUMERIC;
     ndarray->ndim = ndim;
     ndarray->len = ndim == 0 ? 0 : 1;
-    ndarray->itemsize = mp_binary_get_size('@', ndarray->dtype, NULL);
+    ndarray->itemsize = mp_binary_get_size('@', ndarray->dtype.type, NULL);
     int32_t *_strides;
     if(strides == NULL) {
-        _strides = strides_from_shape(shape, ndarray->dtype);
+        _strides = strides_from_shape(shape, ndarray->dtype.type);
     } else {
         _strides = strides;
     }
@@ -704,7 +704,7 @@ ndarray_obj_t *ndarray_new_view(ndarray_obj_t *source, uint8_t ndim, size_t *sha
     ndarray_obj_t *ndarray = m_new_obj(ndarray_obj_t);
     ndarray->base.type = &ulab_ndarray_type;
     ndarray->boolean = source->boolean;
-    ndarray->dtype = source->dtype;
+    ndarray->dtype.type = source->dtype.type;
     ndarray->ndim = ndim;
     ndarray->itemsize = source->itemsize;
     ndarray->len = ndim == 0 ? 0 : 1;
@@ -725,9 +725,9 @@ ndarray_obj_t *ndarray_copy_view(ndarray_obj_t *source) {
     // In order to make it dtype-agnostic, we copy the memory content
     // instead of reading out the values
 
-    int32_t *strides = strides_from_shape(source->shape, source->dtype);
+    int32_t *strides = strides_from_shape(source->shape, source->dtype.type);
 
-    uint8_t dtype = source->dtype;
+    uint8_t dtype = source->dtype.type;
     if(source->boolean) {
         dtype = NDARRAY_BOOLEAN;
     }
@@ -755,7 +755,7 @@ mp_obj_t ndarray_byteswap(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_
     } else {
         ndarray = ndarray_new_view(self, self->ndim, self->shape, self->strides, 0);
     }
-    if((self->dtype == NDARRAY_BOOL) || (self->dtype == NDARRAY_UINT8) || (self->dtype == NDARRAY_INT8)) {
+    if((self->dtype.type == NDARRAY_BOOL) || (self->dtype.type == NDARRAY_UINT8) || (self->dtype.type == NDARRAY_INT8)) {
         return MP_OBJ_FROM_PTR(ndarray);
     } else {
         uint8_t *array = (uint8_t *)ndarray->array;
@@ -773,7 +773,7 @@ mp_obj_t ndarray_byteswap(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_
                 #endif
                     size_t l = 0;
                     do {
-                        if(self->dtype == NDARRAY_FLOAT) {
+                        if(self->dtype.type == NDARRAY_FLOAT) {
                             #if MICROPY_FLOAT_IMPL == MICROPY_FLOAT_IMPL_FLOAT
                             SWAP(uint8_t, array[0], array[3]);
                             SWAP(uint8_t, array[1], array[2]);
@@ -845,7 +845,7 @@ STATIC uint8_t ndarray_init_helper(size_t n_args, const mp_obj_t *pos_args, mp_m
     #if ULAB_HAS_DTYPE_OBJECT
     if(MP_OBJ_IS_TYPE(args[1].u_obj, &ulab_dtype_type)) {
         dtype_obj_t *dtype = MP_OBJ_TO_PTR(args[1].u_obj);
-        _dtype = dtype->dtype;
+        _dtype = dtype->dtype.type;
     } else { // this must be an integer defined as a class constant (ulba.uint8 etc.)
         _dtype = mp_obj_get_int(args[1].u_obj);
     }
@@ -860,7 +860,7 @@ STATIC mp_obj_t ndarray_make_new_core(const mp_obj_type_t *type, size_t n_args,
 
     if(MP_OBJ_IS_TYPE(args[0], &ulab_ndarray_type)) {
         ndarray_obj_t *source = MP_OBJ_TO_PTR(args[0]);
-        if(dtype == source->dtype) {
+        if(dtype == source->dtype.type) {
             return ndarray_copy_view(source);
         }
         ndarray_obj_t *target = ndarray_new_dense_ndarray(source->ndim, source->shape, dtype);
@@ -882,12 +882,12 @@ STATIC mp_obj_t ndarray_make_new_core(const mp_obj_type_t *type, size_t n_args,
                     do {
                         mp_obj_t item;
                         // floats must be treated separately, because they can't directly be converted to integer types
-                        if((source->dtype == NDARRAY_FLOAT) && (dtype != NDARRAY_FLOAT)) {
+                        if((source->dtype.type == NDARRAY_FLOAT) && (dtype != NDARRAY_FLOAT)) {
                             // floats must be treated separately, because they can't directly be converted to integer types
-                            mp_float_t f = ndarray_get_float_value(sarray, source->dtype);
+                            mp_float_t f = ndarray_get_float_value(sarray, source->dtype.type);
                             item = mp_obj_new_int((int32_t)MICROPY_FLOAT_C_FUN(floor)(f));
                         } else {
-                            item = mp_binary_get_val_array(source->dtype, sarray, 0);
+                            item = mp_binary_get_val_array(source->dtype.type, sarray, 0);
                         }
                         mp_binary_set_val_array(dtype, tarray, 0, item);
                         tarray += target->itemsize;
@@ -915,7 +915,7 @@ STATIC mp_obj_t ndarray_make_new_core(const mp_obj_type_t *type, size_t n_args,
         return MP_OBJ_FROM_PTR(target);
     }
 
-    // We have to figure out, whether the elements of the iterable are iterables themself
+    // We have to figure out, whether the elements of the iterable are iterables themselves
     uint8_t ndim = 0;
     size_t shape[ULAB_MAX_DIMS];
     mp_obj_iter_buf_t iter_buf[ULAB_MAX_DIMS];
@@ -961,7 +961,7 @@ STATIC mp_obj_t ndarray_make_new_core(const mp_obj_type_t *type, size_t n_args,
             do {
             #endif
                 iterable[ULAB_MAX_DIMS - 1] = mp_getiter(item, &iter_buf[ULAB_MAX_DIMS - 1]);
-                ndarray_assign_elements(self, iterable[ULAB_MAX_DIMS - 1], self->dtype, &idx);
+                ndarray_assign_elements(self, iterable[ULAB_MAX_DIMS - 1], self->dtype.type, &idx);
             #if ULAB_MAX_DIMS > 1
                 item = ndim > 1 ? mp_iternext(iterable[ULAB_MAX_DIMS - 2]) : MP_OBJ_STOP_ITERATION;
             } while(item != MP_OBJ_STOP_ITERATION);
@@ -1167,64 +1167,64 @@ void ndarray_assign_view(ndarray_obj_t *view, ndarray_obj_t *values) {
         lstrides[i] /= view->itemsize;
     }
 
-    if(view->dtype == NDARRAY_UINT8) {
-        if(values->dtype == NDARRAY_UINT8) {
+    if(view->dtype.type == NDARRAY_UINT8) {
+        if(values->dtype.type == NDARRAY_UINT8) {
             ASSIGNMENT_LOOP(view, uint8_t, uint8_t, lstrides, rarray, rstrides);
-        } else if(values->dtype == NDARRAY_INT8) {
+        } else if(values->dtype.type == NDARRAY_INT8) {
             ASSIGNMENT_LOOP(view, uint8_t, int8_t, lstrides, rarray, rstrides);
-        } else if(values->dtype == NDARRAY_UINT16) {
+        } else if(values->dtype.type == NDARRAY_UINT16) {
             ASSIGNMENT_LOOP(view, uint8_t, uint16_t, lstrides, rarray, rstrides);
-        } else if(values->dtype == NDARRAY_INT16) {
+        } else if(values->dtype.type == NDARRAY_INT16) {
             ASSIGNMENT_LOOP(view, uint8_t, int16_t, lstrides, rarray, rstrides);
-        } else if(values->dtype == NDARRAY_FLOAT) {
+        } else if(values->dtype.type == NDARRAY_FLOAT) {
             ASSIGNMENT_LOOP(view, uint8_t, mp_float_t, lstrides, rarray, rstrides);
         }
-    } else if(view->dtype == NDARRAY_INT8) {
-        if(values->dtype == NDARRAY_UINT8) {
+    } else if(view->dtype.type == NDARRAY_INT8) {
+        if(values->dtype.type == NDARRAY_UINT8) {
             ASSIGNMENT_LOOP(view, int8_t, uint8_t, lstrides, rarray, rstrides);
-        } else if(values->dtype == NDARRAY_INT8) {
+        } else if(values->dtype.type == NDARRAY_INT8) {
             ASSIGNMENT_LOOP(view, int8_t, int8_t, lstrides, rarray, rstrides);
-        } else if(values->dtype == NDARRAY_UINT16) {
+        } else if(values->dtype.type == NDARRAY_UINT16) {
             ASSIGNMENT_LOOP(view, int8_t, uint16_t, lstrides, rarray, rstrides);
-        } else if(values->dtype == NDARRAY_INT16) {
+        } else if(values->dtype.type == NDARRAY_INT16) {
             ASSIGNMENT_LOOP(view, int8_t, int16_t, lstrides, rarray, rstrides);
-        } else if(values->dtype == NDARRAY_FLOAT) {
+        } else if(values->dtype.type == NDARRAY_FLOAT) {
             ASSIGNMENT_LOOP(view, int8_t, mp_float_t, lstrides, rarray, rstrides);
         }
-    } else if(view->dtype == NDARRAY_UINT16) {
-        if(values->dtype == NDARRAY_UINT8) {
+    } else if(view->dtype.type == NDARRAY_UINT16) {
+        if(values->dtype.type == NDARRAY_UINT8) {
             ASSIGNMENT_LOOP(view, uint16_t, uint8_t, lstrides, rarray, rstrides);
-        } else if(values->dtype == NDARRAY_INT8) {
+        } else if(values->dtype.type == NDARRAY_INT8) {
             ASSIGNMENT_LOOP(view, uint16_t, int8_t, lstrides, rarray, rstrides);
-        } else if(values->dtype == NDARRAY_UINT16) {
+        } else if(values->dtype.type == NDARRAY_UINT16) {
             ASSIGNMENT_LOOP(view, uint16_t, uint16_t, lstrides, rarray, rstrides);
-        } else if(values->dtype == NDARRAY_INT16) {
+        } else if(values->dtype.type == NDARRAY_INT16) {
             ASSIGNMENT_LOOP(view, uint16_t, int16_t, lstrides, rarray, rstrides);
-        } else if(values->dtype == NDARRAY_FLOAT) {
+        } else if(values->dtype.type == NDARRAY_FLOAT) {
             ASSIGNMENT_LOOP(view, uint16_t, mp_float_t, lstrides, rarray, rstrides);
         }
-    } else if(view->dtype == NDARRAY_INT16) {
-        if(values->dtype == NDARRAY_UINT8) {
+    } else if(view->dtype.type == NDARRAY_INT16) {
+        if(values->dtype.type == NDARRAY_UINT8) {
             ASSIGNMENT_LOOP(view, int16_t, uint8_t, lstrides, rarray, rstrides);
-        } else if(values->dtype == NDARRAY_INT8) {
+        } else if(values->dtype.type == NDARRAY_INT8) {
             ASSIGNMENT_LOOP(view, int16_t, int8_t, lstrides, rarray, rstrides);
-        } else if(values->dtype == NDARRAY_UINT16) {
+        } else if(values->dtype.type == NDARRAY_UINT16) {
             ASSIGNMENT_LOOP(view, int16_t, uint16_t, lstrides, rarray, rstrides);
-        } else if(values->dtype == NDARRAY_INT16) {
+        } else if(values->dtype.type == NDARRAY_INT16) {
             ASSIGNMENT_LOOP(view, int16_t, int16_t,  lstrides, rarray, rstrides);
-        } else if(values->dtype == NDARRAY_FLOAT) {
+        } else if(values->dtype.type == NDARRAY_FLOAT) {
             ASSIGNMENT_LOOP(view, int16_t, mp_float_t,  lstrides, rarray, rstrides);
         }
     } else { // the dtype must be an mp_float_t now
-        if(values->dtype == NDARRAY_UINT8) {
+        if(values->dtype.type == NDARRAY_UINT8) {
             ASSIGNMENT_LOOP(view, mp_float_t, uint8_t, lstrides, rarray, rstrides);
-        } else if(values->dtype == NDARRAY_INT8) {
+        } else if(values->dtype.type == NDARRAY_INT8) {
             ASSIGNMENT_LOOP(view, mp_float_t, int8_t,  lstrides, rarray, rstrides);
-        } else if(values->dtype == NDARRAY_UINT16) {
+        } else if(values->dtype.type == NDARRAY_UINT16) {
             ASSIGNMENT_LOOP(view, mp_float_t, uint16_t,  lstrides, rarray, rstrides);
-        } else if(values->dtype == NDARRAY_INT16) {
+        } else if(values->dtype.type == NDARRAY_INT16) {
             ASSIGNMENT_LOOP(view, mp_float_t, int16_t,  lstrides, rarray, rstrides);
-        } else if(values->dtype == NDARRAY_FLOAT) {
+        } else if(values->dtype.type == NDARRAY_FLOAT) {
             ASSIGNMENT_LOOP(view, mp_float_t, mp_float_t,  lstrides, rarray, rstrides);
         }
     }
@@ -1242,7 +1242,7 @@ static mp_obj_t ndarray_from_boolean_index(ndarray_obj_t *ndarray, ndarray_obj_t
         count += *iarray;
         iarray += index->strides[ULAB_MAX_DIMS - 1];
     }
-    ndarray_obj_t *results = ndarray_new_linear_array(count, ndarray->dtype);
+    ndarray_obj_t *results = ndarray_new_linear_array(count, ndarray->dtype.type);
     uint8_t *rarray = (uint8_t *)results->array;
     uint8_t *array = (uint8_t *)ndarray->array;
     // re-wind the index array
@@ -1281,64 +1281,64 @@ static mp_obj_t ndarray_assign_from_boolean_index(ndarray_obj_t *ndarray, ndarra
         // there is a single value
         vstride = 0;
     }
-    if(ndarray->dtype == NDARRAY_UINT8) {
-        if(values->dtype == NDARRAY_UINT8) {
+    if(ndarray->dtype.type == NDARRAY_UINT8) {
+        if(values->dtype.type == NDARRAY_UINT8) {
             BOOLEAN_ASSIGNMENT_LOOP(uint8_t, uint8_t, ndarray, iarray, istride, varray, vstride);
-        } else if(values->dtype == NDARRAY_INT8) {
+        } else if(values->dtype.type == NDARRAY_INT8) {
             BOOLEAN_ASSIGNMENT_LOOP(uint8_t, int8_t, ndarray, iarray, istride, varray, vstride);
-        } else if(values->dtype == NDARRAY_UINT16) {
+        } else if(values->dtype.type == NDARRAY_UINT16) {
             BOOLEAN_ASSIGNMENT_LOOP(uint8_t, uint16_t, ndarray, iarray, istride, varray, vstride);
-        } else if(values->dtype == NDARRAY_INT16) {
+        } else if(values->dtype.type == NDARRAY_INT16) {
             BOOLEAN_ASSIGNMENT_LOOP(uint8_t, int16_t, ndarray, iarray, istride, varray, vstride);
-        } else if(values->dtype == NDARRAY_FLOAT) {
+        } else if(values->dtype.type == NDARRAY_FLOAT) {
             BOOLEAN_ASSIGNMENT_LOOP(uint8_t, mp_float_t, ndarray, iarray, istride, varray, vstride);
         }
-    } else if(ndarray->dtype == NDARRAY_INT8) {
-        if(values->dtype == NDARRAY_UINT8) {
+    } else if(ndarray->dtype.type == NDARRAY_INT8) {
+        if(values->dtype.type == NDARRAY_UINT8) {
             BOOLEAN_ASSIGNMENT_LOOP(int8_t, uint8_t, ndarray, iarray, istride, varray, vstride);
-        } else if(values->dtype == NDARRAY_INT8) {
+        } else if(values->dtype.type == NDARRAY_INT8) {
             BOOLEAN_ASSIGNMENT_LOOP(int8_t, int8_t, ndarray, iarray, istride, varray, vstride);
-        } else if(values->dtype == NDARRAY_UINT16) {
+        } else if(values->dtype.type == NDARRAY_UINT16) {
             BOOLEAN_ASSIGNMENT_LOOP(int8_t, uint16_t, ndarray, iarray, istride, varray, vstride);
-        } else if(values->dtype == NDARRAY_INT16) {
+        } else if(values->dtype.type == NDARRAY_INT16) {
             BOOLEAN_ASSIGNMENT_LOOP(int8_t, int16_t, ndarray, iarray, istride, varray, vstride);
-        } else if(values->dtype == NDARRAY_FLOAT) {
+        } else if(values->dtype.type == NDARRAY_FLOAT) {
             BOOLEAN_ASSIGNMENT_LOOP(int8_t, mp_float_t, ndarray, iarray, istride, varray, vstride);
         }
-    } else if(ndarray->dtype == NDARRAY_UINT16) {
-        if(values->dtype == NDARRAY_UINT8) {
+    } else if(ndarray->dtype.type == NDARRAY_UINT16) {
+        if(values->dtype.type == NDARRAY_UINT8) {
             BOOLEAN_ASSIGNMENT_LOOP(uint16_t, uint8_t, ndarray, iarray, istride, varray, vstride);
-        } else if(values->dtype == NDARRAY_INT8) {
+        } else if(values->dtype.type == NDARRAY_INT8) {
             BOOLEAN_ASSIGNMENT_LOOP(uint16_t, int8_t, ndarray, iarray, istride, varray, vstride);
-        } else if(values->dtype == NDARRAY_UINT16) {
+        } else if(values->dtype.type == NDARRAY_UINT16) {
             BOOLEAN_ASSIGNMENT_LOOP(uint16_t, uint16_t, ndarray, iarray, istride, varray, vstride);
-        } else if(values->dtype == NDARRAY_INT16) {
+        } else if(values->dtype.type == NDARRAY_INT16) {
             BOOLEAN_ASSIGNMENT_LOOP(uint16_t, int16_t, ndarray, iarray, istride, varray, vstride);
-        } else if(values->dtype == NDARRAY_FLOAT) {
+        } else if(values->dtype.type == NDARRAY_FLOAT) {
             BOOLEAN_ASSIGNMENT_LOOP(uint16_t, mp_float_t, ndarray, iarray, istride, varray, vstride);
         }
-    } else if(ndarray->dtype == NDARRAY_INT16) {
-        if(values->dtype == NDARRAY_UINT8) {
+    } else if(ndarray->dtype.type == NDARRAY_INT16) {
+        if(values->dtype.type == NDARRAY_UINT8) {
             BOOLEAN_ASSIGNMENT_LOOP(int16_t, uint8_t, ndarray, iarray, istride, varray, vstride);
-        } else if(values->dtype == NDARRAY_INT8) {
+        } else if(values->dtype.type == NDARRAY_INT8) {
             BOOLEAN_ASSIGNMENT_LOOP(int16_t, int8_t, ndarray, iarray, istride, varray, vstride);
-        } else if(values->dtype == NDARRAY_UINT16) {
+        } else if(values->dtype.type == NDARRAY_UINT16) {
             BOOLEAN_ASSIGNMENT_LOOP(int16_t, uint16_t, ndarray, iarray, istride, varray, vstride);
-        } else if(values->dtype == NDARRAY_INT16) {
+        } else if(values->dtype.type == NDARRAY_INT16) {
             BOOLEAN_ASSIGNMENT_LOOP(int16_t, int16_t, ndarray, iarray, istride, varray, vstride);
-        } else if(values->dtype == NDARRAY_FLOAT) {
+        } else if(values->dtype.type == NDARRAY_FLOAT) {
             BOOLEAN_ASSIGNMENT_LOOP(int16_t, mp_float_t, ndarray, iarray, istride, varray, vstride);
         }
     } else {
-        if(values->dtype == NDARRAY_UINT8) {
+        if(values->dtype.type == NDARRAY_UINT8) {
             BOOLEAN_ASSIGNMENT_LOOP(mp_float_t, uint8_t, ndarray, iarray, istride, varray, vstride);
-        } else if(values->dtype == NDARRAY_INT8) {
+        } else if(values->dtype.type == NDARRAY_INT8) {
             BOOLEAN_ASSIGNMENT_LOOP(mp_float_t, int8_t, ndarray, iarray, istride, varray, vstride);
-        } else if(values->dtype == NDARRAY_UINT16) {
+        } else if(values->dtype.type == NDARRAY_UINT16) {
             BOOLEAN_ASSIGNMENT_LOOP(mp_float_t, uint16_t, ndarray, iarray, istride, varray, vstride);
-        } else if(values->dtype == NDARRAY_INT16) {
+        } else if(values->dtype.type == NDARRAY_INT16) {
             BOOLEAN_ASSIGNMENT_LOOP(mp_float_t, int16_t, ndarray, iarray, istride, varray, vstride);
-        } else if(values->dtype == NDARRAY_FLOAT) {
+        } else if(values->dtype.type == NDARRAY_FLOAT) {
             BOOLEAN_ASSIGNMENT_LOOP(mp_float_t, mp_float_t, ndarray, iarray, istride, varray, vstride);
         }
     }
@@ -1373,7 +1373,7 @@ static mp_obj_t ndarray_get_slice(ndarray_obj_t *ndarray, mp_obj_t index, ndarra
         if(values == NULL) { // return value(s)
             // if the view has been reduced to nothing, return a single value
             if(view->ndim == 0) {
-                return mp_binary_get_val_array(view->dtype, view->array, 0);
+                return mp_binary_get_val_array(view->dtype.type, view->array, 0);
             } else {
                 return MP_OBJ_FROM_PTR(view);
             }
@@ -1465,7 +1465,7 @@ mp_obj_t ndarray_flatten(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_a
     }
 
     uint8_t *sarray = (uint8_t *)self->array;
-    ndarray_obj_t *ndarray = ndarray_new_linear_array(self->len, self->dtype);
+    ndarray_obj_t *ndarray = ndarray_new_linear_array(self->len, self->dtype.type);
     uint8_t *array = (uint8_t *)ndarray->array;
 
     if(memcmp(order, "C", 1) == 0) { // C-type ordering
@@ -1704,14 +1704,14 @@ mp_obj_t ndarray_binary_op(mp_binary_op_t _op, mp_obj_t lobj, mp_obj_t robj) {
             case MP_BINARY_OP_SUBTRACT:
                 // here we don't have to list those cases that result in an int16,
                 // because dtype is initialised with that NDARRAY_INT16
-                if(lhs->dtype == rhs->dtype) {
-                    dtype = rhs->dtype;
-                } else if((lhs->dtype == NDARRAY_FLOAT) || (rhs->dtype == NDARRAY_FLOAT)) {
+                if(lhs->dtype.type == rhs->dtype.type) {
+                    dtype = rhs->dtype.type;
+                } else if((lhs->dtype.type == NDARRAY_FLOAT) || (rhs->dtype.type == NDARRAY_FLOAT)) {
                     dtype = NDARRAY_FLOAT;
-                } else if(((lhs->dtype == NDARRAY_UINT8) && (rhs->dtype == NDARRAY_UINT16)) ||
-                            ((lhs->dtype == NDARRAY_INT8) && (rhs->dtype == NDARRAY_UINT16)) ||
-                            ((rhs->dtype == NDARRAY_UINT8) && (lhs->dtype == NDARRAY_UINT16)) ||
-                            ((rhs->dtype == NDARRAY_INT8) && (lhs->dtype == NDARRAY_UINT16))) {
+                } else if(((lhs->dtype.type == NDARRAY_UINT8) && (rhs->dtype.type == NDARRAY_UINT16)) ||
+                            ((lhs->dtype.type == NDARRAY_INT8) && (rhs->dtype.type == NDARRAY_UINT16)) ||
+                            ((rhs->dtype.type == NDARRAY_UINT8) && (lhs->dtype.type == NDARRAY_UINT16)) ||
+                            ((rhs->dtype.type == NDARRAY_INT8) && (lhs->dtype.type == NDARRAY_UINT16))) {
                     dtype = NDARRAY_UINT16;
                 }
                 return MP_OBJ_FROM_PTR(ndarray_new_linear_array(0, dtype));
@@ -1844,12 +1844,12 @@ mp_obj_t ndarray_unary_op(mp_unary_op_t op, mp_obj_t self_in) {
         case MP_UNARY_OP_ABS:
             ndarray = ndarray_copy_view(self);
             // if Booleam, NDARRAY_UINT8, or NDARRAY_UINT16, there is nothing to do
-            if(self->dtype == NDARRAY_INT8) {
+            if(self->dtype.type == NDARRAY_INT8) {
                 int8_t *array = (int8_t *)ndarray->array;
                 for(size_t i=0; i < self->len; i++, array++) {
                     if(*array < 0) *array = -(*array);
                 }
-            } else if(self->dtype == NDARRAY_INT16) {
+            } else if(self->dtype.type == NDARRAY_INT16) {
                 int16_t *array = (int16_t *)ndarray->array;
                 for(size_t i=0; i < self->len; i++, array++) {
                     if(*array < 0) *array = -(*array);
@@ -1865,7 +1865,7 @@ mp_obj_t ndarray_unary_op(mp_unary_op_t op, mp_obj_t self_in) {
         #endif
         #if NDARRAY_HAS_UNARY_OP_INVERT
         case MP_UNARY_OP_INVERT:
-            if(self->dtype == NDARRAY_FLOAT) {
+            if(self->dtype.type == NDARRAY_FLOAT) {
                 mp_raise_ValueError(translate("operation is not supported for given type"));
             }
             // we can invert the content byte by byte, no need to distinguish between different dtypes
@@ -1874,7 +1874,7 @@ mp_obj_t ndarray_unary_op(mp_unary_op_t op, mp_obj_t self_in) {
             if(ndarray->boolean) {
                 for(size_t i=0; i < ndarray->len; i++, array++) *array = *array ^ 0x01;
             } else {
-                uint8_t itemsize = mp_binary_get_size('@', self->dtype, NULL);
+                uint8_t itemsize = mp_binary_get_size('@', self->dtype.type, NULL);
                 for(size_t i=0; i < ndarray->len*itemsize; i++, array++) *array ^= 0xFF;
             }
             return MP_OBJ_FROM_PTR(ndarray);
@@ -1888,16 +1888,16 @@ mp_obj_t ndarray_unary_op(mp_unary_op_t op, mp_obj_t self_in) {
         #if NDARRAY_HAS_UNARY_OP_NEGATIVE
         case MP_UNARY_OP_NEGATIVE:
             ndarray = ndarray_copy_view(self); // from this point, this is a dense copy
-            if(self->dtype == NDARRAY_UINT8) {
+            if(self->dtype.type == NDARRAY_UINT8) {
                 uint8_t *array = (uint8_t *)ndarray->array;
                 for(size_t i=0; i < self->len; i++, array++) *array = -(*array);
-            } else if(self->dtype == NDARRAY_INT8) {
+            } else if(self->dtype.type == NDARRAY_INT8) {
                 int8_t *array = (int8_t *)ndarray->array;
                 for(size_t i=0; i < self->len; i++, array++) *array = -(*array);
-            } else if(self->dtype == NDARRAY_UINT16) {
+            } else if(self->dtype.type == NDARRAY_UINT16) {
                 uint16_t *array = (uint16_t *)ndarray->array;
                 for(size_t i=0; i < self->len; i++, array++) *array = -(*array);
-            } else if(self->dtype == NDARRAY_INT16) {
+            } else if(self->dtype.type == NDARRAY_INT16) {
                 int16_t *array = (int16_t *)ndarray->array;
                 for(size_t i=0; i < self->len; i++, array++) *array = -(*array);
             } else {
@@ -1969,10 +1969,10 @@ mp_obj_t ndarray_reshape(mp_obj_t oin, mp_obj_t _shape) {
     ndarray_obj_t *ndarray;
     if(ndarray_is_dense(source)) {
         // TODO: check if this is what numpy does
-        int32_t *new_strides = strides_from_shape(new_shape, source->dtype);
+        int32_t *new_strides = strides_from_shape(new_shape, source->dtype.type);
         ndarray = ndarray_new_view(source, shape->len, new_shape, new_strides, 0);
     } else {
-        ndarray = ndarray_new_ndarray_from_tuple(shape, source->dtype);
+        ndarray = ndarray_new_ndarray_from_tuple(shape, source->dtype.type);
         ndarray_copy_array(source, ndarray);
     }
     return MP_OBJ_FROM_PTR(ndarray);
@@ -2010,15 +2010,15 @@ mp_obj_t ndarray_info(mp_obj_t obj_in) {
     mp_printf(MP_PYTHON_PRINTER, "type: ");
     if(ndarray->boolean) {
         mp_printf(MP_PYTHON_PRINTER, "bool\n");
-    } else if(ndarray->dtype == NDARRAY_UINT8) {
+    } else if(ndarray->dtype.type == NDARRAY_UINT8) {
         mp_printf(MP_PYTHON_PRINTER, "uint8\n");
-    } else if(ndarray->dtype == NDARRAY_INT8) {
+    } else if(ndarray->dtype.type == NDARRAY_INT8) {
         mp_printf(MP_PYTHON_PRINTER, "int8\n");
-    } else if(ndarray->dtype == NDARRAY_UINT16) {
+    } else if(ndarray->dtype.type == NDARRAY_UINT16) {
         mp_printf(MP_PYTHON_PRINTER, "uint16\n");
-    } else if(ndarray->dtype == NDARRAY_INT16) {
+    } else if(ndarray->dtype.type == NDARRAY_INT16) {
         mp_printf(MP_PYTHON_PRINTER, "int16\n");
-    } else if(ndarray->dtype == NDARRAY_FLOAT) {
+    } else if(ndarray->dtype.type == NDARRAY_FLOAT) {
         mp_printf(MP_PYTHON_PRINTER, "float\n");
     }
     return mp_const_none;
diff --git a/code/ndarray.h b/code/ndarray.h
index c26164ad..7a66fdc7 100644
--- a/code/ndarray.h
+++ b/code/ndarray.h
@@ -63,9 +63,17 @@ enum NDARRAY_TYPE {
     NDARRAY_FLOAT = FLOAT_TYPECODE,
 };
 
+typedef struct _dtype_dtype { // dtype descriptor embedded in ndarray_obj_t and dtype_obj_t
+    uint8_t type; // typecode; compared against the NDARRAY_* constants
+    #if ULAB_DTYPE_HAS_FUNC_POINTER
+    uint8_t flags; // C has no default member initialisers: the code creating the struct must set this
+    void *arrfunc; // NOTE(review): presumably a user-supplied function hook -- confirm intended use
+    #endif
+} dtype_dtype;
+
 typedef struct _ndarray_obj_t {
     mp_obj_base_t base;
-    uint8_t dtype;
+    dtype_dtype dtype;
     uint8_t itemsize;
     uint8_t boolean;
     uint8_t ndim;
@@ -80,7 +88,7 @@ extern const mp_obj_type_t ulab_dtype_type;
 
 typedef struct _dtype_obj_t {
     mp_obj_base_t base;
-    uint8_t dtype;
+    dtype_dtype dtype;
 } dtype_obj_t;
 
 void ndarray_dtype_print(const mp_print_t *, mp_obj_t , mp_print_kind_t );
diff --git a/code/ndarray_operators.c b/code/ndarray_operators.c
index 465140b6..011f821e 100644
--- a/code/ndarray_operators.c
+++ b/code/ndarray_operators.c
@@ -47,50 +47,50 @@ mp_obj_t ndarray_binary_equality(ndarray_obj_t *lhs, ndarray_obj_t *rhs,
 
     #if NDARRAY_HAS_BINARY_OP_EQUAL
     if(op == MP_BINARY_OP_EQUAL) {
-        if(lhs->dtype == NDARRAY_UINT8) {
-            if(rhs->dtype == NDARRAY_UINT8) {
+        if(lhs->dtype.type == NDARRAY_UINT8) {
+            if(rhs->dtype.type == NDARRAY_UINT8) {
                 EQUALITY_LOOP(results, array, uint8_t, uint8_t, larray, lstrides, rarray, rstrides, ==);
-            } else if(rhs->dtype == NDARRAY_INT8) {
+            } else if(rhs->dtype.type == NDARRAY_INT8) {
                 EQUALITY_LOOP(results, array, uint8_t, int8_t, larray, lstrides, rarray, rstrides, ==);
-            } else if(rhs->dtype == NDARRAY_UINT16) {
+            } else if(rhs->dtype.type == NDARRAY_UINT16) {
                 EQUALITY_LOOP(results, array, uint8_t, uint16_t, larray, lstrides, rarray, rstrides, ==);
-            } else if(rhs->dtype == NDARRAY_INT16) {
+            } else if(rhs->dtype.type == NDARRAY_INT16) {
                 EQUALITY_LOOP(results, array, uint8_t, int16_t, larray, lstrides, rarray, rstrides, ==);
-            } else if(rhs->dtype == NDARRAY_FLOAT) {
+            } else if(rhs->dtype.type == NDARRAY_FLOAT) {
                 EQUALITY_LOOP(results, array, uint8_t, mp_float_t, larray, lstrides, rarray, rstrides, ==);
             }
-        } else if(lhs->dtype == NDARRAY_INT8) {
-            if(rhs->dtype == NDARRAY_INT8) {
+        } else if(lhs->dtype.type == NDARRAY_INT8) {
+            if(rhs->dtype.type == NDARRAY_INT8) {
                 EQUALITY_LOOP(results, array, int8_t, int8_t, larray, lstrides, rarray, rstrides, ==);
-            } else if(rhs->dtype == NDARRAY_UINT16) {
+            } else if(rhs->dtype.type == NDARRAY_UINT16) {
                 EQUALITY_LOOP(results, array, int8_t, uint16_t, larray, lstrides, rarray, rstrides, ==);
-            } else if(rhs->dtype == NDARRAY_INT16) {
+            } else if(rhs->dtype.type == NDARRAY_INT16) {
                 EQUALITY_LOOP(results, array, int8_t, int16_t, larray, lstrides, rarray, rstrides, ==);
-            } else if(rhs->dtype == NDARRAY_FLOAT) {
+            } else if(rhs->dtype.type == NDARRAY_FLOAT) {
                 EQUALITY_LOOP(results, array, int8_t, mp_float_t, larray, lstrides, rarray, rstrides, ==);
             } else {
                 return ndarray_binary_op(op, rhs, lhs);
             }
-        } else if(lhs->dtype == NDARRAY_UINT16) {
-            if(rhs->dtype == NDARRAY_UINT16) {
+        } else if(lhs->dtype.type == NDARRAY_UINT16) {
+            if(rhs->dtype.type == NDARRAY_UINT16) {
                 EQUALITY_LOOP(results, array, uint16_t, uint16_t, larray, lstrides, rarray, rstrides, ==);
-            } else if(rhs->dtype == NDARRAY_INT16) {
+            } else if(rhs->dtype.type == NDARRAY_INT16) {
                 EQUALITY_LOOP(results, array, uint16_t, int16_t, larray, lstrides, rarray, rstrides, ==);
-            } else if(rhs->dtype == NDARRAY_FLOAT) {
+            } else if(rhs->dtype.type == NDARRAY_FLOAT) {
                 EQUALITY_LOOP(results, array, uint16_t, mp_float_t, larray, lstrides, rarray, rstrides, ==);
             } else {
                 return ndarray_binary_op(op, rhs, lhs);
             }
-        } else if(lhs->dtype == NDARRAY_INT16) {
-            if(rhs->dtype == NDARRAY_INT16) {
+        } else if(lhs->dtype.type == NDARRAY_INT16) {
+            if(rhs->dtype.type == NDARRAY_INT16) {
                 EQUALITY_LOOP(results, array, int16_t, int16_t, larray, lstrides, rarray, rstrides, ==);
-            } else if(rhs->dtype == NDARRAY_FLOAT) {
+            } else if(rhs->dtype.type == NDARRAY_FLOAT) {
                 EQUALITY_LOOP(results, array, int16_t, mp_float_t, larray, lstrides, rarray, rstrides, ==);
             } else {
                 return ndarray_binary_op(op, rhs, lhs);
             }
-        } else if(lhs->dtype == NDARRAY_FLOAT) {
-            if(rhs->dtype == NDARRAY_FLOAT) {
+        } else if(lhs->dtype.type == NDARRAY_FLOAT) {
+            if(rhs->dtype.type == NDARRAY_FLOAT) {
                 EQUALITY_LOOP(results, array, mp_float_t, mp_float_t, larray, lstrides, rarray, rstrides, ==);
             } else {
                 return ndarray_binary_op(op, rhs, lhs);
@@ -101,50 +101,50 @@ mp_obj_t ndarray_binary_equality(ndarray_obj_t *lhs, ndarray_obj_t *rhs,
 
     #if NDARRAY_HAS_BINARY_OP_NOT_EQUAL
     if(op == MP_BINARY_OP_NOT_EQUAL) {
-        if(lhs->dtype == NDARRAY_UINT8) {
-            if(rhs->dtype == NDARRAY_UINT8) {
+        if(lhs->dtype.type == NDARRAY_UINT8) {
+            if(rhs->dtype.type == NDARRAY_UINT8) {
                 EQUALITY_LOOP(results, array, uint8_t, uint8_t, larray, lstrides, rarray, rstrides, !=);
-            } else if(rhs->dtype == NDARRAY_INT8) {
+            } else if(rhs->dtype.type == NDARRAY_INT8) {
                 EQUALITY_LOOP(results, array, uint8_t, int8_t, larray, lstrides, rarray, rstrides, !=);
-            } else if(rhs->dtype == NDARRAY_UINT16) {
+            } else if(rhs->dtype.type == NDARRAY_UINT16) {
                 EQUALITY_LOOP(results, array, uint8_t, uint16_t, larray, lstrides, rarray, rstrides, !=);
-            } else if(rhs->dtype == NDARRAY_INT16) {
+            } else if(rhs->dtype.type == NDARRAY_INT16) {
                 EQUALITY_LOOP(results, array, uint8_t, int16_t, larray, lstrides, rarray, rstrides, !=);
-            } else if(rhs->dtype == NDARRAY_FLOAT) {
+            } else if(rhs->dtype.type == NDARRAY_FLOAT) {
                 EQUALITY_LOOP(results, array, uint8_t, mp_float_t, larray, lstrides, rarray, rstrides, !=);
             }
-        } else if(lhs->dtype == NDARRAY_INT8) {
-            if(rhs->dtype == NDARRAY_INT8) {
+        } else if(lhs->dtype.type == NDARRAY_INT8) {
+            if(rhs->dtype.type == NDARRAY_INT8) {
                 EQUALITY_LOOP(results, array, int8_t, int8_t, larray, lstrides, rarray, rstrides, !=);
-            } else if(rhs->dtype == NDARRAY_UINT16) {
+            } else if(rhs->dtype.type == NDARRAY_UINT16) {
                 EQUALITY_LOOP(results, array, int8_t, uint16_t, larray, lstrides, rarray, rstrides, !=);
-            } else if(rhs->dtype == NDARRAY_INT16) {
+            } else if(rhs->dtype.type == NDARRAY_INT16) {
                 EQUALITY_LOOP(results, array, int8_t, int16_t, larray, lstrides, rarray, rstrides, !=);
-            } else if(rhs->dtype == NDARRAY_FLOAT) {
+            } else if(rhs->dtype.type == NDARRAY_FLOAT) {
                 EQUALITY_LOOP(results, array, int8_t, mp_float_t, larray, lstrides, rarray, rstrides, !=);
             } else {
                 return ndarray_binary_op(op, rhs, lhs);
             }
-        } else if(lhs->dtype == NDARRAY_UINT16) {
-            if(rhs->dtype == NDARRAY_UINT16) {
+        } else if(lhs->dtype.type == NDARRAY_UINT16) {
+            if(rhs->dtype.type == NDARRAY_UINT16) {
                 EQUALITY_LOOP(results, array, uint16_t, uint16_t, larray, lstrides, rarray, rstrides, !=);
-            } else if(rhs->dtype == NDARRAY_INT16) {
+            } else if(rhs->dtype.type == NDARRAY_INT16) {
                 EQUALITY_LOOP(results, array, uint16_t, int16_t, larray, lstrides, rarray, rstrides, !=);
-            } else if(rhs->dtype == NDARRAY_FLOAT) {
+            } else if(rhs->dtype.type == NDARRAY_FLOAT) {
                 EQUALITY_LOOP(results, array, uint16_t, mp_float_t, larray, lstrides, rarray, rstrides, !=);
             } else {
                 return ndarray_binary_op(op, rhs, lhs);
             }
-        } else if(lhs->dtype == NDARRAY_INT16) {
-            if(rhs->dtype == NDARRAY_INT16) {
+        } else if(lhs->dtype.type == NDARRAY_INT16) {
+            if(rhs->dtype.type == NDARRAY_INT16) {
                 EQUALITY_LOOP(results, array, int16_t, int16_t, larray, lstrides, rarray, rstrides, !=);
-            } else if(rhs->dtype == NDARRAY_FLOAT) {
+            } else if(rhs->dtype.type == NDARRAY_FLOAT) {
                 EQUALITY_LOOP(results, array, int16_t, mp_float_t, larray, lstrides, rarray, rstrides, !=);
             } else {
                 return ndarray_binary_op(op, rhs, lhs);
             }
-        } else if(lhs->dtype == NDARRAY_FLOAT) {
-            if(rhs->dtype == NDARRAY_FLOAT) {
+        } else if(lhs->dtype.type == NDARRAY_FLOAT) {
+            if(rhs->dtype.type == NDARRAY_FLOAT) {
                 EQUALITY_LOOP(results, array, mp_float_t, mp_float_t, larray, lstrides, rarray, rstrides, !=);
             } else {
                 return ndarray_binary_op(op, rhs, lhs);
@@ -165,64 +165,64 @@ mp_obj_t ndarray_binary_add(ndarray_obj_t *lhs, ndarray_obj_t *rhs,
     uint8_t *larray = (uint8_t *)lhs->array;
     uint8_t *rarray = (uint8_t *)rhs->array;
 
-    if(lhs->dtype == NDARRAY_UINT8) {
-        if(rhs->dtype == NDARRAY_UINT8) {
+    if(lhs->dtype.type == NDARRAY_UINT8) {
+        if(rhs->dtype.type == NDARRAY_UINT8) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_UINT16);
             BINARY_LOOP(results, uint16_t, uint8_t, uint8_t, larray, lstrides, rarray, rstrides, +);
-        } else if(rhs->dtype == NDARRAY_INT8) {
+        } else if(rhs->dtype.type == NDARRAY_INT8) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_INT16);
             BINARY_LOOP(results, int16_t, uint8_t, int8_t, larray, lstrides, rarray, rstrides, +);
-        } else if(rhs->dtype == NDARRAY_UINT16) {
+        } else if(rhs->dtype.type == NDARRAY_UINT16) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_UINT16);
             BINARY_LOOP(results, uint16_t, uint8_t, uint16_t, larray, lstrides, rarray, rstrides, +);
-        } else if(rhs->dtype == NDARRAY_INT16) {
+        } else if(rhs->dtype.type == NDARRAY_INT16) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_INT16);
             BINARY_LOOP(results, int16_t, uint8_t, int16_t, larray, lstrides, rarray, rstrides, +);
-        } else if(rhs->dtype == NDARRAY_FLOAT) {
+        } else if(rhs->dtype.type == NDARRAY_FLOAT) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT);
             BINARY_LOOP(results, mp_float_t, uint8_t, mp_float_t, larray, lstrides, rarray, rstrides, +);
         }
-    } else if(lhs->dtype == NDARRAY_INT8) {
-        if(rhs->dtype == NDARRAY_INT8) {
+    } else if(lhs->dtype.type == NDARRAY_INT8) {
+        if(rhs->dtype.type == NDARRAY_INT8) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_INT8);
             BINARY_LOOP(results, int8_t, int8_t, int8_t, larray, lstrides, rarray, rstrides, +);
-        } else if(rhs->dtype == NDARRAY_UINT16) {
+        } else if(rhs->dtype.type == NDARRAY_UINT16) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_INT16);
             BINARY_LOOP(results, int16_t, int8_t, uint16_t, larray, lstrides, rarray, rstrides, +);
-        } else if(rhs->dtype == NDARRAY_INT16) {
+        } else if(rhs->dtype.type == NDARRAY_INT16) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_INT16);
             BINARY_LOOP(results, int16_t, int8_t, int16_t, larray, lstrides, rarray, rstrides, +);
-        } else if(rhs->dtype == NDARRAY_FLOAT) {
+        } else if(rhs->dtype.type == NDARRAY_FLOAT) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT);
             BINARY_LOOP(results, mp_float_t, int8_t, mp_float_t, larray, lstrides, rarray, rstrides, +);
         } else {
             return ndarray_binary_op(MP_BINARY_OP_ADD, rhs, lhs);
         }
-    } else if(lhs->dtype == NDARRAY_UINT16) {
-        if(rhs->dtype == NDARRAY_UINT16) {
+    } else if(lhs->dtype.type == NDARRAY_UINT16) {
+        if(rhs->dtype.type == NDARRAY_UINT16) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_UINT16);
             BINARY_LOOP(results, uint16_t, uint16_t, uint16_t, larray, lstrides, rarray, rstrides, +);
-        } else if(rhs->dtype == NDARRAY_INT16) {
+        } else if(rhs->dtype.type == NDARRAY_INT16) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT);
             BINARY_LOOP(results, mp_float_t, uint16_t, int16_t, larray, lstrides, rarray, rstrides, +);
-        } else if(rhs->dtype == NDARRAY_FLOAT) {
+        } else if(rhs->dtype.type == NDARRAY_FLOAT) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT);
             BINARY_LOOP(results, mp_float_t, uint16_t, mp_float_t, larray, lstrides, rarray, rstrides, +);
         } else {
             return ndarray_binary_op(MP_BINARY_OP_ADD, rhs, lhs);
         }
-    } else if(lhs->dtype == NDARRAY_INT16) {
-        if(rhs->dtype == NDARRAY_INT16) {
+    } else if(lhs->dtype.type == NDARRAY_INT16) {
+        if(rhs->dtype.type == NDARRAY_INT16) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_INT16);
             BINARY_LOOP(results, int16_t, int16_t, int16_t, larray, lstrides, rarray, rstrides, +);
-        } else if(rhs->dtype == NDARRAY_FLOAT) {
+        } else if(rhs->dtype.type == NDARRAY_FLOAT) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT);
             BINARY_LOOP(results, mp_float_t, int16_t, mp_float_t, larray, lstrides, rarray, rstrides, +);
         } else {
             return ndarray_binary_op(MP_BINARY_OP_ADD, rhs, lhs);
         }
-    } else if(lhs->dtype == NDARRAY_FLOAT) {
-        if(rhs->dtype == NDARRAY_FLOAT) {
+    } else if(lhs->dtype.type == NDARRAY_FLOAT) {
+        if(rhs->dtype.type == NDARRAY_FLOAT) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT);
             BINARY_LOOP(results, mp_float_t, mp_float_t, mp_float_t, larray, lstrides, rarray, rstrides, +);
         } else {
@@ -242,64 +242,64 @@ mp_obj_t ndarray_binary_multiply(ndarray_obj_t *lhs, ndarray_obj_t *rhs,
     uint8_t *larray = (uint8_t *)lhs->array;
     uint8_t *rarray = (uint8_t *)rhs->array;
 
-    if(lhs->dtype == NDARRAY_UINT8) {
-        if(rhs->dtype == NDARRAY_UINT8) {
+    if(lhs->dtype.type == NDARRAY_UINT8) {
+        if(rhs->dtype.type == NDARRAY_UINT8) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_UINT16);
             BINARY_LOOP(results, uint16_t, uint8_t, uint8_t, larray, lstrides, rarray, rstrides, *);
-        } else if(rhs->dtype == NDARRAY_INT8) {
+        } else if(rhs->dtype.type == NDARRAY_INT8) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_INT16);
             BINARY_LOOP(results, int16_t, uint8_t, int8_t, larray, lstrides, rarray, rstrides, *);
-        } else if(rhs->dtype == NDARRAY_UINT16) {
+        } else if(rhs->dtype.type == NDARRAY_UINT16) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_UINT16);
             BINARY_LOOP(results, uint16_t, uint8_t, uint16_t, larray, lstrides, rarray, rstrides, *);
-        } else if(rhs->dtype == NDARRAY_INT16) {
+        } else if(rhs->dtype.type == NDARRAY_INT16) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_INT16);
             BINARY_LOOP(results, int16_t, uint8_t, int16_t, larray, lstrides, rarray, rstrides, *);
-        } else if(rhs->dtype == NDARRAY_FLOAT) {
+        } else if(rhs->dtype.type == NDARRAY_FLOAT) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT);
             BINARY_LOOP(results, mp_float_t, uint8_t, mp_float_t, larray, lstrides, rarray, rstrides, *);
         }
-    } else if(lhs->dtype == NDARRAY_INT8) {
-        if(rhs->dtype == NDARRAY_INT8) {
+    } else if(lhs->dtype.type == NDARRAY_INT8) {
+        if(rhs->dtype.type == NDARRAY_INT8) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_INT8);
             BINARY_LOOP(results, int8_t, int8_t, int8_t, larray, lstrides, rarray, rstrides, *);
-        } else if(rhs->dtype == NDARRAY_UINT16) {
+        } else if(rhs->dtype.type == NDARRAY_UINT16) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_INT16);
             BINARY_LOOP(results, int16_t, int8_t, uint16_t, larray, lstrides, rarray, rstrides, *);
-        } else if(rhs->dtype == NDARRAY_INT16) {
+        } else if(rhs->dtype.type == NDARRAY_INT16) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_INT16);
             BINARY_LOOP(results, int16_t, int8_t, int16_t, larray, lstrides, rarray, rstrides, *);
-        } else if(rhs->dtype == NDARRAY_FLOAT) {
+        } else if(rhs->dtype.type == NDARRAY_FLOAT) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT);
             BINARY_LOOP(results, mp_float_t, int8_t, mp_float_t, larray, lstrides, rarray, rstrides, *);
         } else {
             return ndarray_binary_op(MP_BINARY_OP_MULTIPLY, rhs, lhs);
         }
-    } else if(lhs->dtype == NDARRAY_UINT16) {
-        if(rhs->dtype == NDARRAY_UINT16) {
+    } else if(lhs->dtype.type == NDARRAY_UINT16) {
+        if(rhs->dtype.type == NDARRAY_UINT16) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_UINT16);
             BINARY_LOOP(results, uint16_t, uint16_t, uint16_t, larray, lstrides, rarray, rstrides, *);
-        } else if(rhs->dtype == NDARRAY_INT16) {
+        } else if(rhs->dtype.type == NDARRAY_INT16) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT);
             BINARY_LOOP(results, mp_float_t, uint16_t, int16_t, larray, lstrides, rarray, rstrides, *);
-        } else if(rhs->dtype == NDARRAY_FLOAT) {
+        } else if(rhs->dtype.type == NDARRAY_FLOAT) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT);
             BINARY_LOOP(results, mp_float_t, uint16_t, mp_float_t, larray, lstrides, rarray, rstrides, *);
         } else {
             return ndarray_binary_op(MP_BINARY_OP_MULTIPLY, rhs, lhs);
         }
-    } else if(lhs->dtype == NDARRAY_INT16) {
-        if(rhs->dtype == NDARRAY_INT16) {
+    } else if(lhs->dtype.type == NDARRAY_INT16) {
+        if(rhs->dtype.type == NDARRAY_INT16) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_INT16);
             BINARY_LOOP(results, int16_t, int16_t, int16_t, larray, lstrides, rarray, rstrides, *);
-        } else if(rhs->dtype == NDARRAY_FLOAT) {
+        } else if(rhs->dtype.type == NDARRAY_FLOAT) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT);
             BINARY_LOOP(results, mp_float_t, int16_t, mp_float_t, larray, lstrides, rarray, rstrides, *);
         } else {
             return ndarray_binary_op(MP_BINARY_OP_MULTIPLY, rhs, lhs);
         }
-    } else if(lhs->dtype == NDARRAY_FLOAT) {
-        if(rhs->dtype == NDARRAY_FLOAT) {
+    } else if(lhs->dtype.type == NDARRAY_FLOAT) {
+        if(rhs->dtype.type == NDARRAY_FLOAT) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT);
             BINARY_LOOP(results, mp_float_t, mp_float_t, mp_float_t, larray, lstrides, rarray, rstrides, *);
         } else {
@@ -323,64 +323,64 @@ mp_obj_t ndarray_binary_more(ndarray_obj_t *lhs, ndarray_obj_t *rhs,
 
     #if NDARRAY_HAS_BINARY_OP_MORE | NDARRAY_HAS_BINARY_OP_LESS
     if(op == MP_BINARY_OP_MORE) {
-        if(lhs->dtype == NDARRAY_UINT8) {
-            if(rhs->dtype == NDARRAY_UINT8) {
+        if(lhs->dtype.type == NDARRAY_UINT8) {
+            if(rhs->dtype.type == NDARRAY_UINT8) {
                 EQUALITY_LOOP(results, array, uint8_t, uint8_t, larray, lstrides, rarray, rstrides, >);
-            } else if(rhs->dtype == NDARRAY_INT8) {
+            } else if(rhs->dtype.type == NDARRAY_INT8) {
                 EQUALITY_LOOP(results, array, uint8_t, int8_t, larray, lstrides, rarray, rstrides, >);
-            } else if(rhs->dtype == NDARRAY_UINT16) {
+            } else if(rhs->dtype.type == NDARRAY_UINT16) {
                 EQUALITY_LOOP(results, array, uint8_t, uint16_t, larray, lstrides, rarray, rstrides, >);
-            } else if(rhs->dtype == NDARRAY_INT16) {
+            } else if(rhs->dtype.type == NDARRAY_INT16) {
                 EQUALITY_LOOP(results, array, uint8_t, int16_t, larray, lstrides, rarray, rstrides, >);
-            } else if(rhs->dtype == NDARRAY_FLOAT) {
+            } else if(rhs->dtype.type == NDARRAY_FLOAT) {
                 EQUALITY_LOOP(results, array, uint8_t, mp_float_t, larray, lstrides, rarray, rstrides, >);
             }
-        } else if(lhs->dtype == NDARRAY_INT8) {
-            if(rhs->dtype == NDARRAY_UINT8) {
+        } else if(lhs->dtype.type == NDARRAY_INT8) {
+            if(rhs->dtype.type == NDARRAY_UINT8) {
                 EQUALITY_LOOP(results, array, int8_t, uint8_t, larray, lstrides, rarray, rstrides, >);
-            } else if(rhs->dtype == NDARRAY_INT8) {
+            } else if(rhs->dtype.type == NDARRAY_INT8) {
                 EQUALITY_LOOP(results, array, int8_t, int8_t, larray, lstrides, rarray, rstrides, >);
-            } else if(rhs->dtype == NDARRAY_UINT16) {
+            } else if(rhs->dtype.type == NDARRAY_UINT16) {
                 EQUALITY_LOOP(results, array, int8_t, uint16_t, larray, lstrides, rarray, rstrides, >);
-            } else if(rhs->dtype == NDARRAY_INT16) {
+            } else if(rhs->dtype.type == NDARRAY_INT16) {
                 EQUALITY_LOOP(results, array, int8_t, int16_t, larray, lstrides, rarray, rstrides, >);
-            } else if(rhs->dtype == NDARRAY_FLOAT) {
+            } else if(rhs->dtype.type == NDARRAY_FLOAT) {
                 EQUALITY_LOOP(results, array, int8_t, mp_float_t, larray, lstrides, rarray, rstrides, >);
             }
-        } else if(lhs->dtype == NDARRAY_UINT16) {
-            if(rhs->dtype == NDARRAY_UINT8) {
+        } else if(lhs->dtype.type == NDARRAY_UINT16) {
+            if(rhs->dtype.type == NDARRAY_UINT8) {
                 EQUALITY_LOOP(results, array, uint16_t, uint8_t, larray, lstrides, rarray, rstrides, >);
-            } else if(rhs->dtype == NDARRAY_INT8) {
+            } else if(rhs->dtype.type == NDARRAY_INT8) {
                 EQUALITY_LOOP(results, array, uint16_t, int8_t, larray, lstrides, rarray, rstrides, >);
-            } else if(rhs->dtype == NDARRAY_UINT16) {
+            } else if(rhs->dtype.type == NDARRAY_UINT16) {
                 EQUALITY_LOOP(results, array, uint16_t, uint16_t, larray, lstrides, rarray, rstrides, >);
-            } else if(rhs->dtype == NDARRAY_INT16) {
+            } else if(rhs->dtype.type == NDARRAY_INT16) {
                 EQUALITY_LOOP(results, array, uint16_t, int16_t, larray, lstrides, rarray, rstrides, >);
-            } else if(rhs->dtype == NDARRAY_FLOAT) {
+            } else if(rhs->dtype.type == NDARRAY_FLOAT) {
                 EQUALITY_LOOP(results, array, uint16_t, mp_float_t, larray, lstrides, rarray, rstrides, >);
             }
-        } else if(lhs->dtype == NDARRAY_INT16) {
-            if(rhs->dtype == NDARRAY_UINT8) {
+        } else if(lhs->dtype.type == NDARRAY_INT16) {
+            if(rhs->dtype.type == NDARRAY_UINT8) {
                 EQUALITY_LOOP(results, array, int16_t, uint8_t, larray, lstrides, rarray, rstrides, >);
-            } else if(rhs->dtype == NDARRAY_INT8) {
+            } else if(rhs->dtype.type == NDARRAY_INT8) {
                 EQUALITY_LOOP(results, array, int16_t, int8_t, larray, lstrides, rarray, rstrides, >);
-            } else if(rhs->dtype == NDARRAY_UINT16) {
+            } else if(rhs->dtype.type == NDARRAY_UINT16) {
                 EQUALITY_LOOP(results, array, int16_t, uint16_t, larray, lstrides, rarray, rstrides, >);
-            } else if(rhs->dtype == NDARRAY_INT16) {
+            } else if(rhs->dtype.type == NDARRAY_INT16) {
                 EQUALITY_LOOP(results, array, int16_t, int16_t, larray, lstrides, rarray, rstrides, >);
-            } else if(rhs->dtype == NDARRAY_FLOAT) {
-                EQUALITY_LOOP(results, array, uint16_t, mp_float_t, larray, lstrides, rarray, rstrides, >);
+            } else if(rhs->dtype.type == NDARRAY_FLOAT) {
+                EQUALITY_LOOP(results, array, int16_t, mp_float_t, larray, lstrides, rarray, rstrides, >);
             }
-        } else if(lhs->dtype == NDARRAY_FLOAT) {
-            if(rhs->dtype == NDARRAY_UINT8) {
+        } else if(lhs->dtype.type == NDARRAY_FLOAT) {
+            if(rhs->dtype.type == NDARRAY_UINT8) {
                 EQUALITY_LOOP(results, array, mp_float_t, uint8_t, larray, lstrides, rarray, rstrides, >);
-            } else if(rhs->dtype == NDARRAY_INT8) {
+            } else if(rhs->dtype.type == NDARRAY_INT8) {
                 EQUALITY_LOOP(results, array, mp_float_t, int8_t, larray, lstrides, rarray, rstrides, >);
-            } else if(rhs->dtype == NDARRAY_UINT16) {
+            } else if(rhs->dtype.type == NDARRAY_UINT16) {
                 EQUALITY_LOOP(results, array, mp_float_t, uint16_t, larray, lstrides, rarray, rstrides, >);
-            } else if(rhs->dtype == NDARRAY_INT16) {
+            } else if(rhs->dtype.type == NDARRAY_INT16) {
                 EQUALITY_LOOP(results, array, mp_float_t, int16_t, larray, lstrides, rarray, rstrides, >);
-            } else if(rhs->dtype == NDARRAY_FLOAT) {
+            } else if(rhs->dtype.type == NDARRAY_FLOAT) {
                 EQUALITY_LOOP(results, array, mp_float_t, mp_float_t, larray, lstrides, rarray, rstrides, >);
             }
         }
@@ -388,64 +388,64 @@ mp_obj_t ndarray_binary_more(ndarray_obj_t *lhs, ndarray_obj_t *rhs,
     #endif /* NDARRAY_HAS_BINARY_OP_MORE | NDARRAY_HAS_BINARY_OP_LESS*/
     #if NDARRAY_HAS_BINARY_OP_MORE_EQUAL | NDARRAY_HAS_BINARY_OP_LESS_EQUAL
     if(op == MP_BINARY_OP_MORE_EQUAL) {
-        if(lhs->dtype == NDARRAY_UINT8) {
-            if(rhs->dtype == NDARRAY_UINT8) {
+        if(lhs->dtype.type == NDARRAY_UINT8) {
+            if(rhs->dtype.type == NDARRAY_UINT8) {
                 EQUALITY_LOOP(results, array, uint8_t, uint8_t, larray, lstrides, rarray, rstrides, >=);
-            } else if(rhs->dtype == NDARRAY_INT8) {
+            } else if(rhs->dtype.type == NDARRAY_INT8) {
                 EQUALITY_LOOP(results, array, uint8_t, int8_t, larray, lstrides, rarray, rstrides, >=);
-            } else if(rhs->dtype == NDARRAY_UINT16) {
+            } else if(rhs->dtype.type == NDARRAY_UINT16) {
                 EQUALITY_LOOP(results, array, uint8_t, uint16_t, larray, lstrides, rarray, rstrides, >=);
-            } else if(rhs->dtype == NDARRAY_INT16) {
+            } else if(rhs->dtype.type == NDARRAY_INT16) {
                 EQUALITY_LOOP(results, array, uint8_t, int16_t, larray, lstrides, rarray, rstrides, >=);
-            } else if(rhs->dtype == NDARRAY_FLOAT) {
+            } else if(rhs->dtype.type == NDARRAY_FLOAT) {
                 EQUALITY_LOOP(results, array, uint8_t, mp_float_t, larray, lstrides, rarray, rstrides, >=);
             }
-        } else if(lhs->dtype == NDARRAY_INT8) {
-            if(rhs->dtype == NDARRAY_UINT8) {
+        } else if(lhs->dtype.type == NDARRAY_INT8) {
+            if(rhs->dtype.type == NDARRAY_UINT8) {
                 EQUALITY_LOOP(results, array, int8_t, uint8_t, larray, lstrides, rarray, rstrides, >=);
-            } else if(rhs->dtype == NDARRAY_INT8) {
+            } else if(rhs->dtype.type == NDARRAY_INT8) {
                 EQUALITY_LOOP(results, array, int8_t, int8_t, larray, lstrides, rarray, rstrides, >=);
-            } else if(rhs->dtype == NDARRAY_UINT16) {
+            } else if(rhs->dtype.type == NDARRAY_UINT16) {
                 EQUALITY_LOOP(results, array, int8_t, uint16_t, larray, lstrides, rarray, rstrides, >=);
-            } else if(rhs->dtype == NDARRAY_INT16) {
+            } else if(rhs->dtype.type == NDARRAY_INT16) {
                 EQUALITY_LOOP(results, array, int8_t, int16_t, larray, lstrides, rarray, rstrides, >=);
-            } else if(rhs->dtype == NDARRAY_FLOAT) {
+            } else if(rhs->dtype.type == NDARRAY_FLOAT) {
                 EQUALITY_LOOP(results, array, int8_t, mp_float_t, larray, lstrides, rarray, rstrides, >=);
             }
-        } else if(lhs->dtype == NDARRAY_UINT16) {
-            if(rhs->dtype == NDARRAY_UINT8) {
+        } else if(lhs->dtype.type == NDARRAY_UINT16) {
+            if(rhs->dtype.type == NDARRAY_UINT8) {
                 EQUALITY_LOOP(results, array, uint16_t, uint8_t, larray, lstrides, rarray, rstrides, >=);
-            } else if(rhs->dtype == NDARRAY_INT8) {
+            } else if(rhs->dtype.type == NDARRAY_INT8) {
                 EQUALITY_LOOP(results, array, uint16_t, int8_t, larray, lstrides, rarray, rstrides, >=);
-            } else if(rhs->dtype == NDARRAY_UINT16) {
+            } else if(rhs->dtype.type == NDARRAY_UINT16) {
                 EQUALITY_LOOP(results, array, uint16_t, uint16_t, larray, lstrides, rarray, rstrides, >=);
-            } else if(rhs->dtype == NDARRAY_INT16) {
+            } else if(rhs->dtype.type == NDARRAY_INT16) {
                 EQUALITY_LOOP(results, array, uint16_t, int16_t, larray, lstrides, rarray, rstrides, >=);
-            } else if(rhs->dtype == NDARRAY_FLOAT) {
+            } else if(rhs->dtype.type == NDARRAY_FLOAT) {
                 EQUALITY_LOOP(results, array, uint16_t, mp_float_t, larray, lstrides, rarray, rstrides, >=);
             }
-        } else if(lhs->dtype == NDARRAY_INT16) {
-            if(rhs->dtype == NDARRAY_UINT8) {
+        } else if(lhs->dtype.type == NDARRAY_INT16) {
+            if(rhs->dtype.type == NDARRAY_UINT8) {
                 EQUALITY_LOOP(results, array, int16_t, uint8_t, larray, lstrides, rarray, rstrides, >=);
-            } else if(rhs->dtype == NDARRAY_INT8) {
+            } else if(rhs->dtype.type == NDARRAY_INT8) {
                 EQUALITY_LOOP(results, array, int16_t, int8_t, larray, lstrides, rarray, rstrides, >=);
-            } else if(rhs->dtype == NDARRAY_UINT16) {
+            } else if(rhs->dtype.type == NDARRAY_UINT16) {
                 EQUALITY_LOOP(results, array, int16_t, uint16_t, larray, lstrides, rarray, rstrides, >=);
-            } else if(rhs->dtype == NDARRAY_INT16) {
+            } else if(rhs->dtype.type == NDARRAY_INT16) {
                 EQUALITY_LOOP(results, array, int16_t, int16_t, larray, lstrides, rarray, rstrides, >=);
-            } else if(rhs->dtype == NDARRAY_FLOAT) {
-                EQUALITY_LOOP(results, array, uint16_t, mp_float_t, larray, lstrides, rarray, rstrides, >=);
+            } else if(rhs->dtype.type == NDARRAY_FLOAT) {
+                EQUALITY_LOOP(results, array, int16_t, mp_float_t, larray, lstrides, rarray, rstrides, >=);
             }
-        } else if(lhs->dtype == NDARRAY_FLOAT) {
-            if(rhs->dtype == NDARRAY_UINT8) {
+        } else if(lhs->dtype.type == NDARRAY_FLOAT) {
+            if(rhs->dtype.type == NDARRAY_UINT8) {
                 EQUALITY_LOOP(results, array, mp_float_t, uint8_t, larray, lstrides, rarray, rstrides, >=);
-            } else if(rhs->dtype == NDARRAY_INT8) {
+            } else if(rhs->dtype.type == NDARRAY_INT8) {
                 EQUALITY_LOOP(results, array, mp_float_t, int8_t, larray, lstrides, rarray, rstrides, >=);
-            } else if(rhs->dtype == NDARRAY_UINT16) {
+            } else if(rhs->dtype.type == NDARRAY_UINT16) {
                 EQUALITY_LOOP(results, array, mp_float_t, uint16_t, larray, lstrides, rarray, rstrides, >=);
-            } else if(rhs->dtype == NDARRAY_INT16) {
+            } else if(rhs->dtype.type == NDARRAY_INT16) {
                 EQUALITY_LOOP(results, array, mp_float_t, int16_t, larray, lstrides, rarray, rstrides, >=);
-            } else if(rhs->dtype == NDARRAY_FLOAT) {
+            } else if(rhs->dtype.type == NDARRAY_FLOAT) {
                 EQUALITY_LOOP(results, array, mp_float_t, mp_float_t, larray, lstrides, rarray, rstrides, >=);
             }
         }
@@ -464,88 +464,88 @@ mp_obj_t ndarray_binary_subtract(ndarray_obj_t *lhs, ndarray_obj_t *rhs,
     uint8_t *larray = (uint8_t *)lhs->array;
     uint8_t *rarray = (uint8_t *)rhs->array;
 
-    if(lhs->dtype == NDARRAY_UINT8) {
-        if(rhs->dtype == NDARRAY_UINT8) {
+    if(lhs->dtype.type == NDARRAY_UINT8) {
+        if(rhs->dtype.type == NDARRAY_UINT8) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_UINT8);
             BINARY_LOOP(results, uint8_t, uint8_t, uint8_t, larray, lstrides, rarray, rstrides, -);
-        } else if(rhs->dtype == NDARRAY_INT8) {
+        } else if(rhs->dtype.type == NDARRAY_INT8) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_INT16);
             BINARY_LOOP(results, int16_t, uint8_t, int8_t, larray, lstrides, rarray, rstrides, -);
-        } else if(rhs->dtype == NDARRAY_UINT16) {
+        } else if(rhs->dtype.type == NDARRAY_UINT16) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_UINT16);
             BINARY_LOOP(results, uint16_t, uint8_t, uint16_t, larray, lstrides, rarray, rstrides, -);
-        } else if(rhs->dtype == NDARRAY_INT16) {
+        } else if(rhs->dtype.type == NDARRAY_INT16) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_INT16);
             BINARY_LOOP(results, int16_t, uint8_t, int16_t, larray, lstrides, rarray, rstrides, -);
-        } else if(rhs->dtype == NDARRAY_FLOAT) {
+        } else if(rhs->dtype.type == NDARRAY_FLOAT) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT);
             BINARY_LOOP(results, mp_float_t, uint8_t, mp_float_t, larray, lstrides, rarray, rstrides, -);
         }
-    } else if(lhs->dtype == NDARRAY_INT8) {
-        if(rhs->dtype == NDARRAY_UINT8) {
+    } else if(lhs->dtype.type == NDARRAY_INT8) {
+        if(rhs->dtype.type == NDARRAY_UINT8) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_INT16);
             BINARY_LOOP(results, int16_t, int8_t, uint8_t, larray, lstrides, rarray, rstrides, -);
-        } else if(rhs->dtype == NDARRAY_INT8) {
+        } else if(rhs->dtype.type == NDARRAY_INT8) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_INT8);
             BINARY_LOOP(results, int8_t, int8_t, int8_t, larray, lstrides, rarray, rstrides, -);
-        } else if(rhs->dtype == NDARRAY_UINT16) {
+        } else if(rhs->dtype.type == NDARRAY_UINT16) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_INT16);
             BINARY_LOOP(results, int16_t, int8_t, uint16_t, larray, lstrides, rarray, rstrides, -);
-        } else if(rhs->dtype == NDARRAY_INT16) {
+        } else if(rhs->dtype.type == NDARRAY_INT16) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_INT16);
             BINARY_LOOP(results, int16_t, int8_t, int16_t, larray, lstrides, rarray, rstrides, -);
-        } else if(rhs->dtype == NDARRAY_FLOAT) {
+        } else if(rhs->dtype.type == NDARRAY_FLOAT) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT);
             BINARY_LOOP(results, mp_float_t, int8_t, mp_float_t, larray, lstrides, rarray, rstrides, -);
         }
-    } else if(lhs->dtype == NDARRAY_UINT16) {
-        if(rhs->dtype == NDARRAY_UINT8) {
+    } else if(lhs->dtype.type == NDARRAY_UINT16) {
+        if(rhs->dtype.type == NDARRAY_UINT8) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_UINT16);
             BINARY_LOOP(results, uint16_t, uint16_t, uint8_t, larray, lstrides, rarray, rstrides, -);
-        } else if(rhs->dtype == NDARRAY_INT8) {
+        } else if(rhs->dtype.type == NDARRAY_INT8) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_UINT16);
             BINARY_LOOP(results, uint16_t, uint16_t, int8_t, larray, lstrides, rarray, rstrides, -);
-        } else if(rhs->dtype == NDARRAY_UINT16) {
+        } else if(rhs->dtype.type == NDARRAY_UINT16) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_UINT16);
             BINARY_LOOP(results, uint16_t, uint16_t, uint16_t, larray, lstrides, rarray, rstrides, -);
-        } else if(rhs->dtype == NDARRAY_INT16) {
+        } else if(rhs->dtype.type == NDARRAY_INT16) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT);
             BINARY_LOOP(results, mp_float_t, uint16_t, int16_t, larray, lstrides, rarray, rstrides, -);
-        } else if(rhs->dtype == NDARRAY_FLOAT) {
+        } else if(rhs->dtype.type == NDARRAY_FLOAT) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT);
             BINARY_LOOP(results, mp_float_t, uint16_t, mp_float_t, larray, lstrides, rarray, rstrides, -);
         }
-    } else if(lhs->dtype == NDARRAY_INT16) {
-        if(rhs->dtype == NDARRAY_UINT8) {
+    } else if(lhs->dtype.type == NDARRAY_INT16) {
+        if(rhs->dtype.type == NDARRAY_UINT8) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_INT16);
             BINARY_LOOP(results, int16_t, int16_t, uint8_t, larray, lstrides, rarray, rstrides, -);
-        } else if(rhs->dtype == NDARRAY_INT8) {
+        } else if(rhs->dtype.type == NDARRAY_INT8) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_INT16);
             BINARY_LOOP(results, int16_t, int16_t, int8_t, larray, lstrides, rarray, rstrides, -);
-        } else if(rhs->dtype == NDARRAY_UINT16) {
+        } else if(rhs->dtype.type == NDARRAY_UINT16) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT);
             BINARY_LOOP(results, mp_float_t, int16_t, uint16_t, larray, lstrides, rarray, rstrides, -);
-        } else if(rhs->dtype == NDARRAY_INT16) {
+        } else if(rhs->dtype.type == NDARRAY_INT16) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_INT16);
             BINARY_LOOP(results, int16_t, int16_t, int16_t, larray, lstrides, rarray, rstrides, -);
-        } else if(rhs->dtype == NDARRAY_FLOAT) {
+        } else if(rhs->dtype.type == NDARRAY_FLOAT) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT);
             BINARY_LOOP(results, mp_float_t, uint16_t, mp_float_t, larray, lstrides, rarray, rstrides, -);
         }
-    } else if(lhs->dtype == NDARRAY_FLOAT) {
-        if(rhs->dtype == NDARRAY_UINT8) {
+    } else if(lhs->dtype.type == NDARRAY_FLOAT) {
+        if(rhs->dtype.type == NDARRAY_UINT8) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT);
             BINARY_LOOP(results, mp_float_t, mp_float_t, uint8_t, larray, lstrides, rarray, rstrides, -);
-        } else if(rhs->dtype == NDARRAY_INT8) {
+        } else if(rhs->dtype.type == NDARRAY_INT8) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT);
             BINARY_LOOP(results, mp_float_t, mp_float_t, int8_t, larray, lstrides, rarray, rstrides, -);
-        } else if(rhs->dtype == NDARRAY_UINT16) {
+        } else if(rhs->dtype.type == NDARRAY_UINT16) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT);
             BINARY_LOOP(results, mp_float_t, mp_float_t, uint16_t, larray, lstrides, rarray, rstrides, -);
-        } else if(rhs->dtype == NDARRAY_INT16) {
+        } else if(rhs->dtype.type == NDARRAY_INT16) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT);
             BINARY_LOOP(results, mp_float_t, mp_float_t, int16_t, larray, lstrides, rarray, rstrides, -);
-        } else if(rhs->dtype == NDARRAY_FLOAT) {
+        } else if(rhs->dtype.type == NDARRAY_FLOAT) {
             results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT);
             BINARY_LOOP(results, mp_float_t, mp_float_t, mp_float_t, larray, lstrides, rarray, rstrides, -);
         }
@@ -564,8 +564,8 @@ mp_obj_t ndarray_binary_true_divide(ndarray_obj_t *lhs, ndarray_obj_t *rhs,
     uint8_t *rarray = (uint8_t *)rhs->array;
 
     #if NDARRAY_BINARY_USES_FUN_POINTER
-    mp_float_t (*get_lhs)(void *) = ndarray_get_float_function(lhs->dtype);
-    mp_float_t (*get_rhs)(void *) = ndarray_get_float_function(rhs->dtype);
+    mp_float_t (*get_lhs)(void *) = ndarray_get_float_function(lhs->dtype.type);
+    mp_float_t (*get_rhs)(void *) = ndarray_get_float_function(rhs->dtype.type);
 
     uint8_t *array = (uint8_t *)results->array;
     void (*set_result)(void *, mp_float_t ) = ndarray_set_float_function(NDARRAY_FLOAT);
@@ -574,64 +574,64 @@ mp_obj_t ndarray_binary_true_divide(ndarray_obj_t *lhs, ndarray_obj_t *rhs,
     FUNC_POINTER_LOOP(results, array, get_lhs, get_rhs, larray, lstrides, rarray, rstrides, lvalue/rvalue);
 
     #else
-    if(lhs->dtype == NDARRAY_UINT8) {
-        if(rhs->dtype == NDARRAY_UINT8) {
+    if(lhs->dtype.type == NDARRAY_UINT8) {
+        if(rhs->dtype.type == NDARRAY_UINT8) {
             BINARY_LOOP(results, mp_float_t, uint8_t, uint8_t, larray, lstrides, rarray, rstrides, /);
-        } else if(rhs->dtype == NDARRAY_INT8) {
+        } else if(rhs->dtype.type == NDARRAY_INT8) {
             BINARY_LOOP(results, mp_float_t, uint8_t, int8_t, larray, lstrides, rarray, rstrides, /);
-        } else if(rhs->dtype == NDARRAY_UINT16) {
+        } else if(rhs->dtype.type == NDARRAY_UINT16) {
             BINARY_LOOP(results, mp_float_t, uint8_t, uint16_t, larray, lstrides, rarray, rstrides, /);
-        } else if(rhs->dtype == NDARRAY_INT16) {
+        } else if(rhs->dtype.type == NDARRAY_INT16) {
             BINARY_LOOP(results, mp_float_t, uint8_t, int16_t, larray, lstrides, rarray, rstrides, /);
-        } else if(rhs->dtype == NDARRAY_FLOAT) {
+        } else if(rhs->dtype.type == NDARRAY_FLOAT) {
             BINARY_LOOP(results, mp_float_t, uint8_t, mp_float_t, larray, lstrides, rarray, rstrides, /);
         }
-    } else if(lhs->dtype == NDARRAY_INT8) {
-        if(rhs->dtype == NDARRAY_UINT8) {
+    } else if(lhs->dtype.type == NDARRAY_INT8) {
+        if(rhs->dtype.type == NDARRAY_UINT8) {
             BINARY_LOOP(results, mp_float_t, int8_t, uint8_t, larray, lstrides, rarray, rstrides, /);
-        } else if(rhs->dtype == NDARRAY_INT8) {
+        } else if(rhs->dtype.type == NDARRAY_INT8) {
             BINARY_LOOP(results, mp_float_t, int8_t, int8_t, larray, lstrides, rarray, rstrides, /);
-        } else if(rhs->dtype == NDARRAY_UINT16) {
+        } else if(rhs->dtype.type == NDARRAY_UINT16) {
             BINARY_LOOP(results, mp_float_t, int8_t, uint16_t, larray, lstrides, rarray, rstrides, /);
-        } else if(rhs->dtype == NDARRAY_INT16) {
+        } else if(rhs->dtype.type == NDARRAY_INT16) {
             BINARY_LOOP(results, mp_float_t, int8_t, int16_t, larray, lstrides, rarray, rstrides, /);
-        } else if(rhs->dtype == NDARRAY_FLOAT) {
+        } else if(rhs->dtype.type == NDARRAY_FLOAT) {
             BINARY_LOOP(results, mp_float_t, int8_t, mp_float_t, larray, lstrides, rarray, rstrides, /);
         }
-    } else if(lhs->dtype == NDARRAY_UINT16) {
-        if(rhs->dtype == NDARRAY_UINT8) {
+    } else if(lhs->dtype.type == NDARRAY_UINT16) {
+        if(rhs->dtype.type == NDARRAY_UINT8) {
             BINARY_LOOP(results, mp_float_t, uint16_t, uint8_t, larray, lstrides, rarray, rstrides, /);
-        } else if(rhs->dtype == NDARRAY_INT8) {
+        } else if(rhs->dtype.type == NDARRAY_INT8) {
             BINARY_LOOP(results, mp_float_t, uint16_t, int8_t, larray, lstrides, rarray, rstrides, /);
-        } else if(rhs->dtype == NDARRAY_UINT16) {
+        } else if(rhs->dtype.type == NDARRAY_UINT16) {
             BINARY_LOOP(results, mp_float_t, uint16_t, uint16_t, larray, lstrides, rarray, rstrides, /);
-        } else if(rhs->dtype == NDARRAY_INT16) {
+        } else if(rhs->dtype.type == NDARRAY_INT16) {
             BINARY_LOOP(results, mp_float_t, uint16_t, int16_t, larray, lstrides, rarray, rstrides, /);
-        } else if(rhs->dtype == NDARRAY_FLOAT) {
+        } else if(rhs->dtype.type == NDARRAY_FLOAT) {
             BINARY_LOOP(results, mp_float_t, uint16_t, mp_float_t, larray, lstrides, rarray, rstrides, /);
         }
-    } else if(lhs->dtype == NDARRAY_INT16) {
-        if(rhs->dtype == NDARRAY_UINT8) {
+    } else if(lhs->dtype.type == NDARRAY_INT16) {
+        if(rhs->dtype.type == NDARRAY_UINT8) {
             BINARY_LOOP(results, mp_float_t, int16_t, uint8_t, larray, lstrides, rarray, rstrides, /);
-        } else if(rhs->dtype == NDARRAY_INT8) {
+        } else if(rhs->dtype.type == NDARRAY_INT8) {
             BINARY_LOOP(results, mp_float_t, int16_t, int8_t, larray, lstrides, rarray, rstrides, /);
-        } else if(rhs->dtype == NDARRAY_UINT16) {
+        } else if(rhs->dtype.type == NDARRAY_UINT16) {
             BINARY_LOOP(results, mp_float_t, int16_t, uint16_t, larray, lstrides, rarray, rstrides, /);
-        } else if(rhs->dtype == NDARRAY_INT16) {
+        } else if(rhs->dtype.type == NDARRAY_INT16) {
             BINARY_LOOP(results, mp_float_t, int16_t, int16_t, larray, lstrides, rarray, rstrides, /);
-        } else if(rhs->dtype == NDARRAY_FLOAT) {
+        } else if(rhs->dtype.type == NDARRAY_FLOAT) {
             BINARY_LOOP(results, mp_float_t, uint16_t, mp_float_t, larray, lstrides, rarray, rstrides, /);
         }
-    } else if(lhs->dtype == NDARRAY_FLOAT) {
-        if(rhs->dtype == NDARRAY_UINT8) {
+    } else if(lhs->dtype.type == NDARRAY_FLOAT) {
+        if(rhs->dtype.type == NDARRAY_UINT8) {
             BINARY_LOOP(results, mp_float_t, mp_float_t, uint8_t, larray, lstrides, rarray, rstrides, /);
-        } else if(rhs->dtype == NDARRAY_INT8) {
+        } else if(rhs->dtype.type == NDARRAY_INT8) {
             BINARY_LOOP(results, mp_float_t, mp_float_t, int8_t, larray, lstrides, rarray, rstrides, /);
-        } else if(rhs->dtype == NDARRAY_UINT16) {
+        } else if(rhs->dtype.type == NDARRAY_UINT16) {
             BINARY_LOOP(results, mp_float_t, mp_float_t, uint16_t, larray, lstrides, rarray, rstrides, /);
-        } else if(rhs->dtype == NDARRAY_INT16) {
+        } else if(rhs->dtype.type == NDARRAY_INT16) {
             BINARY_LOOP(results, mp_float_t, mp_float_t, int16_t, larray, lstrides, rarray, rstrides, /);
-        } else if(rhs->dtype == NDARRAY_FLOAT) {
+        } else if(rhs->dtype.type == NDARRAY_FLOAT) {
             BINARY_LOOP(results, mp_float_t, mp_float_t, mp_float_t, larray, lstrides, rarray, rstrides, /);
         }
     }
@@ -652,8 +652,8 @@ mp_obj_t ndarray_binary_power(ndarray_obj_t *lhs, ndarray_obj_t *rhs,
     uint8_t *rarray = (uint8_t *)rhs->array;
 
     #if NDARRAY_BINARY_USES_FUN_POINTER
-    mp_float_t (*get_lhs)(void *) = ndarray_get_float_function(lhs->dtype);
-    mp_float_t (*get_rhs)(void *) = ndarray_get_float_function(rhs->dtype);
+    mp_float_t (*get_lhs)(void *) = ndarray_get_float_function(lhs->dtype.type);
+    mp_float_t (*get_rhs)(void *) = ndarray_get_float_function(rhs->dtype.type);
 
     uint8_t *array = (uint8_t *)results->array;
     void (*set_result)(void *, mp_float_t ) = ndarray_set_float_function(NDARRAY_FLOAT);
@@ -662,64 +662,64 @@ mp_obj_t ndarray_binary_power(ndarray_obj_t *lhs, ndarray_obj_t *rhs,
     FUNC_POINTER_LOOP(results, array, get_lhs, get_rhs, larray, lstrides, rarray, rstrides, MICROPY_FLOAT_C_FUN(pow)(lvalue, rvalue));
 
     #else
-    if(lhs->dtype == NDARRAY_UINT8) {
-        if(rhs->dtype == NDARRAY_UINT8) {
+    if(lhs->dtype.type == NDARRAY_UINT8) {
+        if(rhs->dtype.type == NDARRAY_UINT8) {
             POWER_LOOP(results, mp_float_t, uint8_t, uint8_t, larray, lstrides, rarray, rstrides);
-        } else if(rhs->dtype == NDARRAY_INT8) {
+        } else if(rhs->dtype.type == NDARRAY_INT8) {
             POWER_LOOP(results, mp_float_t, uint8_t, int8_t, larray, lstrides, rarray, rstrides);
-        } else if(rhs->dtype == NDARRAY_UINT16) {
+        } else if(rhs->dtype.type == NDARRAY_UINT16) {
             POWER_LOOP(results, mp_float_t, uint8_t, uint16_t, larray, lstrides, rarray, rstrides);
-        } else if(rhs->dtype == NDARRAY_INT16) {
+        } else if(rhs->dtype.type == NDARRAY_INT16) {
             POWER_LOOP(results, mp_float_t, uint8_t, int16_t, larray, lstrides, rarray, rstrides);
-        } else if(rhs->dtype == NDARRAY_FLOAT) {
+        } else if(rhs->dtype.type == NDARRAY_FLOAT) {
             POWER_LOOP(results, mp_float_t, uint8_t, mp_float_t, larray, lstrides, rarray, rstrides);
         }
-    } else if(lhs->dtype == NDARRAY_INT8) {
-        if(rhs->dtype == NDARRAY_UINT8) {
+    } else if(lhs->dtype.type == NDARRAY_INT8) {
+        if(rhs->dtype.type == NDARRAY_UINT8) {
             POWER_LOOP(results, mp_float_t, int8_t, uint8_t, larray, lstrides, rarray, rstrides);
-        } else if(rhs->dtype == NDARRAY_INT8) {
+        } else if(rhs->dtype.type == NDARRAY_INT8) {
             POWER_LOOP(results, mp_float_t, int8_t, int8_t, larray, lstrides, rarray, rstrides);
-        } else if(rhs->dtype == NDARRAY_UINT16) {
+        } else if(rhs->dtype.type == NDARRAY_UINT16) {
             POWER_LOOP(results, mp_float_t, int8_t, uint16_t, larray, lstrides, rarray, rstrides);
-        } else if(rhs->dtype == NDARRAY_INT16) {
+        } else if(rhs->dtype.type == NDARRAY_INT16) {
             POWER_LOOP(results, mp_float_t, int8_t, int16_t, larray, lstrides, rarray, rstrides);
-        } else if(rhs->dtype == NDARRAY_FLOAT) {
+        } else if(rhs->dtype.type == NDARRAY_FLOAT) {
             POWER_LOOP(results, mp_float_t, int8_t, mp_float_t, larray, lstrides, rarray, rstrides);
         }
-    } else if(lhs->dtype == NDARRAY_UINT16) {
-        if(rhs->dtype == NDARRAY_UINT8) {
+    } else if(lhs->dtype.type == NDARRAY_UINT16) {
+        if(rhs->dtype.type == NDARRAY_UINT8) {
             POWER_LOOP(results, mp_float_t, uint16_t, uint8_t, larray, lstrides, rarray, rstrides);
-        } else if(rhs->dtype == NDARRAY_INT8) {
+        } else if(rhs->dtype.type == NDARRAY_INT8) {
             POWER_LOOP(results, mp_float_t, uint16_t, int8_t, larray, lstrides, rarray, rstrides);
-        } else if(rhs->dtype == NDARRAY_UINT16) {
+        } else if(rhs->dtype.type == NDARRAY_UINT16) {
             POWER_LOOP(results, mp_float_t, uint16_t, uint16_t, larray, lstrides, rarray, rstrides);
-        } else if(rhs->dtype == NDARRAY_INT16) {
+        } else if(rhs->dtype.type == NDARRAY_INT16) {
             POWER_LOOP(results, mp_float_t, uint16_t, int16_t, larray, lstrides, rarray, rstrides);
-        } else if(rhs->dtype == NDARRAY_FLOAT) {
+        } else if(rhs->dtype.type == NDARRAY_FLOAT) {
             POWER_LOOP(results, mp_float_t, uint16_t, mp_float_t, larray, lstrides, rarray, rstrides);
         }
-    } else if(lhs->dtype == NDARRAY_INT16) {
-        if(rhs->dtype == NDARRAY_UINT8) {
+    } else if(lhs->dtype.type == NDARRAY_INT16) {
+        if(rhs->dtype.type == NDARRAY_UINT8) {
             POWER_LOOP(results, mp_float_t, int16_t, uint8_t, larray, lstrides, rarray, rstrides);
-        } else if(rhs->dtype == NDARRAY_INT8) {
+        } else if(rhs->dtype.type == NDARRAY_INT8) {
             POWER_LOOP(results, mp_float_t, int16_t, int8_t, larray, lstrides, rarray, rstrides);
-        } else if(rhs->dtype == NDARRAY_UINT16) {
+        } else if(rhs->dtype.type == NDARRAY_UINT16) {
             POWER_LOOP(results, mp_float_t, int16_t, uint16_t, larray, lstrides, rarray, rstrides);
-        } else if(rhs->dtype == NDARRAY_INT16) {
+        } else if(rhs->dtype.type == NDARRAY_INT16) {
             POWER_LOOP(results, mp_float_t, int16_t, int16_t, larray, lstrides, rarray, rstrides);
-        } else if(rhs->dtype == NDARRAY_FLOAT) {
+        } else if(rhs->dtype.type == NDARRAY_FLOAT) {
             POWER_LOOP(results, mp_float_t, uint16_t, mp_float_t, larray, lstrides, rarray, rstrides);
         }
-    } else if(lhs->dtype == NDARRAY_FLOAT) {
-        if(rhs->dtype == NDARRAY_UINT8) {
+    } else if(lhs->dtype.type == NDARRAY_FLOAT) {
+        if(rhs->dtype.type == NDARRAY_UINT8) {
             POWER_LOOP(results, mp_float_t, mp_float_t, uint8_t, larray, lstrides, rarray, rstrides);
-        } else if(rhs->dtype == NDARRAY_INT8) {
+        } else if(rhs->dtype.type == NDARRAY_INT8) {
             POWER_LOOP(results, mp_float_t, mp_float_t, int8_t, larray, lstrides, rarray, rstrides);
-        } else if(rhs->dtype == NDARRAY_UINT16) {
+        } else if(rhs->dtype.type == NDARRAY_UINT16) {
             POWER_LOOP(results, mp_float_t, mp_float_t, uint16_t, larray, lstrides, rarray, rstrides);
-        } else if(rhs->dtype == NDARRAY_INT16) {
+        } else if(rhs->dtype.type == NDARRAY_INT16) {
             POWER_LOOP(results, mp_float_t, mp_float_t, int16_t, larray, lstrides, rarray, rstrides);
-        } else if(rhs->dtype == NDARRAY_FLOAT) {
+        } else if(rhs->dtype.type == NDARRAY_FLOAT) {
             POWER_LOOP(results, mp_float_t, mp_float_t, mp_float_t, larray, lstrides, rarray, rstrides);
         }
     }
@@ -732,7 +732,7 @@ mp_obj_t ndarray_binary_power(ndarray_obj_t *lhs, ndarray_obj_t *rhs,
 #if NDARRAY_HAS_INPLACE_ADD || NDARRAY_HAS_INPLACE_MULTIPLY || NDARRAY_HAS_INPLACE_SUBTRACT
 mp_obj_t ndarray_inplace_ams(ndarray_obj_t *lhs, ndarray_obj_t *rhs, int32_t *rstrides, uint8_t optype) {
 
-    if((lhs->dtype != NDARRAY_FLOAT) && (rhs->dtype == NDARRAY_FLOAT)) {
+    if((lhs->dtype.type != NDARRAY_FLOAT) && (rhs->dtype.type == NDARRAY_FLOAT)) {
         mp_raise_TypeError(translate("cannot cast output with casting rule"));
     }
     uint8_t *larray = (uint8_t *)lhs->array;
@@ -761,21 +761,21 @@ mp_obj_t ndarray_inplace_ams(ndarray_obj_t *lhs, ndarray_obj_t *rhs, int32_t *rs
 #if NDARRAY_HAS_INPLACE_TRUE_DIVIDE
 mp_obj_t ndarray_inplace_divide(ndarray_obj_t *lhs, ndarray_obj_t *rhs, int32_t *rstrides) {
 
-    if((lhs->dtype != NDARRAY_FLOAT)) {
+    if((lhs->dtype.type != NDARRAY_FLOAT)) {
         mp_raise_TypeError(translate("results cannot be cast to specified type"));
     }
     uint8_t *larray = (uint8_t *)lhs->array;
     uint8_t *rarray = (uint8_t *)rhs->array;
 
-    if(rhs->dtype == NDARRAY_UINT8) {
+    if(rhs->dtype.type == NDARRAY_UINT8) {
         INPLACE_LOOP(lhs, mp_float_t, uint8_t, larray, rarray, rstrides, /=);
-    } else if(rhs->dtype == NDARRAY_INT8) {
+    } else if(rhs->dtype.type == NDARRAY_INT8) {
         INPLACE_LOOP(lhs, mp_float_t, int8_t, larray, rarray, rstrides, /=);
-    } else if(rhs->dtype == NDARRAY_UINT16) {
+    } else if(rhs->dtype.type == NDARRAY_UINT16) {
         INPLACE_LOOP(lhs, mp_float_t, uint16_t, larray, rarray, rstrides, /=);
-    } else if(rhs->dtype == NDARRAY_INT16) {
+    } else if(rhs->dtype.type == NDARRAY_INT16) {
         INPLACE_LOOP(lhs, mp_float_t, int16_t, larray, rarray, rstrides, /=);
-    } else if(lhs->dtype == NDARRAY_FLOAT) {
+    } else if(lhs->dtype.type == NDARRAY_FLOAT) {
         INPLACE_LOOP(lhs, mp_float_t, mp_float_t, larray, rarray, rstrides, /=);
     }
     return MP_OBJ_FROM_PTR(lhs);
@@ -785,21 +785,21 @@ mp_obj_t ndarray_inplace_divide(ndarray_obj_t *lhs, ndarray_obj_t *rhs, int32_t
 #if NDARRAY_HAS_INPLACE_POWER
 mp_obj_t ndarray_inplace_power(ndarray_obj_t *lhs, ndarray_obj_t *rhs, int32_t *rstrides) {
 
-    if((lhs->dtype != NDARRAY_FLOAT)) {
+    if((lhs->dtype.type != NDARRAY_FLOAT)) {
         mp_raise_TypeError(translate("results cannot be cast to specified type"));
     }
     uint8_t *larray = (uint8_t *)lhs->array;
     uint8_t *rarray = (uint8_t *)rhs->array;
 
-    if(rhs->dtype == NDARRAY_UINT8) {
+    if(rhs->dtype.type == NDARRAY_UINT8) {
         INPLACE_POWER(lhs, mp_float_t, uint8_t, larray, rarray, rstrides);
-    } else if(rhs->dtype == NDARRAY_INT8) {
+    } else if(rhs->dtype.type == NDARRAY_INT8) {
         INPLACE_POWER(lhs, mp_float_t, int8_t, larray, rarray, rstrides);
-    } else if(rhs->dtype == NDARRAY_UINT16) {
+    } else if(rhs->dtype.type == NDARRAY_UINT16) {
         INPLACE_POWER(lhs, mp_float_t, uint16_t, larray, rarray, rstrides);
-    } else if(rhs->dtype == NDARRAY_INT16) {
+    } else if(rhs->dtype.type == NDARRAY_INT16) {
         INPLACE_POWER(lhs, mp_float_t, int16_t, larray, rarray, rstrides);
-    } else if(lhs->dtype == NDARRAY_FLOAT) {
+    } else if(lhs->dtype.type == NDARRAY_FLOAT) {
         INPLACE_POWER(lhs, mp_float_t, mp_float_t, larray, rarray, rstrides);
     }
     return MP_OBJ_FROM_PTR(lhs);
diff --git a/code/ndarray_operators.h b/code/ndarray_operators.h
index 7849e030..3ca419b2 100644
--- a/code/ndarray_operators.h
+++ b/code/ndarray_operators.h
@@ -24,54 +24,54 @@ mp_obj_t ndarray_inplace_divide(ndarray_obj_t *, ndarray_obj_t *, int32_t *);
 
 #define UNWRAP_INPLACE_OPERATOR(lhs, larray, rarray, rstrides, OPERATOR)\
 ({\
-    if((lhs)->dtype == NDARRAY_UINT8) {\
-        if((rhs)->dtype == NDARRAY_UINT8) {\
+    if((lhs)->dtype.type == NDARRAY_UINT8) {\
+        if((rhs)->dtype.type == NDARRAY_UINT8) {\
             INPLACE_LOOP((lhs), uint8_t, uint8_t, (larray), (rarray), (rstrides), OPERATOR);\
-        } else if(rhs->dtype == NDARRAY_INT8) {\
+        } else if(rhs->dtype.type == NDARRAY_INT8) {\
             INPLACE_LOOP((lhs), uint8_t, int8_t, (larray), (rarray), (rstrides), OPERATOR);\
-        } else if(rhs->dtype == NDARRAY_UINT16) {\
+        } else if(rhs->dtype.type == NDARRAY_UINT16) {\
             INPLACE_LOOP((lhs), uint8_t, uint16_t, (larray), (rarray), (rstrides), OPERATOR);\
         } else {\
             INPLACE_LOOP((lhs), uint8_t, int16_t, (larray), (rarray), (rstrides), OPERATOR);\
         }\
-    } else if(lhs->dtype == NDARRAY_INT8) {\
-        if(rhs->dtype == NDARRAY_UINT8) {\
+    } else if(lhs->dtype.type == NDARRAY_INT8) {\
+        if(rhs->dtype.type == NDARRAY_UINT8) {\
             INPLACE_LOOP((lhs), int8_t, uint8_t, (larray), (rarray), (rstrides), OPERATOR);\
-        } else if(rhs->dtype == NDARRAY_INT8) {\
+        } else if(rhs->dtype.type == NDARRAY_INT8) {\
             INPLACE_LOOP((lhs), int8_t, int8_t, (larray), (rarray), (rstrides), OPERATOR);\
-        } else if(rhs->dtype == NDARRAY_UINT16) {\
+        } else if(rhs->dtype.type == NDARRAY_UINT16) {\
             INPLACE_LOOP((lhs), int8_t, uint16_t, (larray), (rarray), (rstrides), OPERATOR);\
         } else {\
             INPLACE_LOOP((lhs), int8_t, int16_t, (larray), (rarray), (rstrides), OPERATOR);\
         }\
-    } else if(lhs->dtype == NDARRAY_UINT16) {\
-        if(rhs->dtype == NDARRAY_UINT8) {\
+    } else if(lhs->dtype.type == NDARRAY_UINT16) {\
+        if(rhs->dtype.type == NDARRAY_UINT8) {\
             INPLACE_LOOP((lhs), uint16_t, uint8_t, (larray), (rarray), (rstrides), OPERATOR);\
-        } else if(rhs->dtype == NDARRAY_INT8) {\
+        } else if(rhs->dtype.type == NDARRAY_INT8) {\
             INPLACE_LOOP((lhs), uint16_t, int8_t, (larray), (rarray), (rstrides), OPERATOR);\
-        } else if(rhs->dtype == NDARRAY_UINT16) {\
+        } else if(rhs->dtype.type == NDARRAY_UINT16) {\
             INPLACE_LOOP((lhs), uint16_t, uint16_t, (larray), (rarray), (rstrides), OPERATOR);\
         } else {\
             INPLACE_LOOP((lhs), uint16_t, int16_t, (larray), (rarray), (rstrides), OPERATOR);\
         }\
-    } else if(lhs->dtype == NDARRAY_INT16) {\
-        if(rhs->dtype == NDARRAY_UINT8) {\
+    } else if(lhs->dtype.type == NDARRAY_INT16) {\
+        if(rhs->dtype.type == NDARRAY_UINT8) {\
             INPLACE_LOOP((lhs), int16_t, uint8_t, (larray), (rarray), (rstrides), OPERATOR);\
-        } else if(rhs->dtype == NDARRAY_INT8) {\
+        } else if(rhs->dtype.type == NDARRAY_INT8) {\
             INPLACE_LOOP((lhs), int16_t, int8_t, (larray), (rarray), (rstrides), OPERATOR);\
-        } else if(rhs->dtype == NDARRAY_UINT16) {\
+        } else if(rhs->dtype.type == NDARRAY_UINT16) {\
             INPLACE_LOOP((lhs), int16_t, uint16_t, (larray), (rarray), (rstrides), OPERATOR);\
         } else {\
             INPLACE_LOOP((lhs), int16_t, int16_t, (larray), (rarray), (rstrides), OPERATOR);\
         }\
-    } else if(lhs->dtype == NDARRAY_FLOAT) {\
-        if(rhs->dtype == NDARRAY_UINT8) {\
+    } else if(lhs->dtype.type == NDARRAY_FLOAT) {\
+        if(rhs->dtype.type == NDARRAY_UINT8) {\
             INPLACE_LOOP((lhs), mp_float_t, uint8_t, (larray), (rarray), (rstrides), OPERATOR);\
-        } else if(rhs->dtype == NDARRAY_INT8) {\
+        } else if(rhs->dtype.type == NDARRAY_INT8) {\
             INPLACE_LOOP((lhs), mp_float_t, int8_t, (larray), (rarray), (rstrides), OPERATOR);\
-        } else if(rhs->dtype == NDARRAY_UINT16) {\
+        } else if(rhs->dtype.type == NDARRAY_UINT16) {\
             INPLACE_LOOP((lhs), mp_float_t, uint16_t, (larray), (rarray), (rstrides), OPERATOR);\
-        } else if(rhs->dtype == NDARRAY_INT16) {\
+        } else if(rhs->dtype.type == NDARRAY_INT16) {\
             INPLACE_LOOP((lhs), mp_float_t, int16_t, (larray), (rarray), (rstrides), OPERATOR);\
         } else {\
             INPLACE_LOOP((lhs), mp_float_t, mp_float_t, (larray), (rarray), (rstrides), OPERATOR);\
diff --git a/code/numpy/approx/approx.c b/code/numpy/approx/approx.c
index 171e0f4f..c02e0a63 100644
--- a/code/numpy/approx/approx.c
+++ b/code/numpy/approx/approx.c
@@ -69,20 +69,20 @@ STATIC mp_obj_t approx_interp(size_t n_args, const mp_obj_t *pos_args, mp_map_t
     mp_float_t left_value, right_value;
     uint8_t *xparray = (uint8_t *)xp->array;
 
-    mp_float_t xp_left = ndarray_get_float_value(xparray, xp->dtype);
+    mp_float_t xp_left = ndarray_get_float_value(xparray, xp->dtype.type);
     xparray += (xp->len-1) * xp->strides[ULAB_MAX_DIMS - 1];
-    mp_float_t xp_right = ndarray_get_float_value(xparray, xp->dtype);
+    mp_float_t xp_right = ndarray_get_float_value(xparray, xp->dtype.type);
 
     uint8_t *fparray = (uint8_t *)fp->array;
 
     if(args[3].u_obj == mp_const_none) {
-        left_value = ndarray_get_float_value(fparray, fp->dtype);
+        left_value = ndarray_get_float_value(fparray, fp->dtype.type);
     } else {
         left_value = mp_obj_get_float(args[3].u_obj);
     }
     if(args[4].u_obj == mp_const_none) {
         fparray += (fp->len-1) * fp->strides[ULAB_MAX_DIMS - 1];
-        right_value = ndarray_get_float_value(fparray, fp->dtype);
+        right_value = ndarray_get_float_value(fparray, fp->dtype.type);
     } else {
         right_value = mp_obj_get_float(args[4].u_obj);
     }
@@ -95,7 +95,7 @@ STATIC mp_obj_t approx_interp(size_t n_args, const mp_obj_t *pos_args, mp_map_t
     uint8_t *temp;
 
     for(size_t i=0; i < x->len; i++, yarray++) {
-        mp_float_t x_value = ndarray_get_float_value(xarray, x->dtype);
+        mp_float_t x_value = ndarray_get_float_value(xarray, x->dtype.type);
         xarray += x->strides[ULAB_MAX_DIMS - 1];
         if(x_value < xp_left) {
             *yarray = left_value;
@@ -108,7 +108,7 @@ STATIC mp_obj_t approx_interp(size_t n_args, const mp_obj_t *pos_args, mp_map_t
             while(right_index - left_index > 1) {
                 middle_index = left_index + (right_index - left_index) / 2;
                 temp = xparray + middle_index * xp->strides[ULAB_MAX_DIMS - 1];
-                mp_float_t xp_middle = ndarray_get_float_value(temp, xp->dtype);
+                mp_float_t xp_middle = ndarray_get_float_value(temp, xp->dtype.type);
                 if(x_value <= xp_middle) {
                     right_index = middle_index;
                 } else {
@@ -116,16 +116,16 @@ STATIC mp_obj_t approx_interp(size_t n_args, const mp_obj_t *pos_args, mp_map_t
                 }
             }
             temp = xparray + left_index * xp->strides[ULAB_MAX_DIMS - 1];
-            xp_left_ = ndarray_get_float_value(temp, xp->dtype);
+            xp_left_ = ndarray_get_float_value(temp, xp->dtype.type);
 
             temp = xparray + right_index * xp->strides[ULAB_MAX_DIMS - 1];
-            xp_right_ = ndarray_get_float_value(temp, xp->dtype);
+            xp_right_ = ndarray_get_float_value(temp, xp->dtype.type);
 
             temp = fparray + left_index * fp->strides[ULAB_MAX_DIMS - 1];
-            fp_left = ndarray_get_float_value(temp, fp->dtype);
+            fp_left = ndarray_get_float_value(temp, fp->dtype.type);
 
             temp = fparray + right_index * fp->strides[ULAB_MAX_DIMS - 1];
-            fp_right = ndarray_get_float_value(temp, fp->dtype);
+            fp_right = ndarray_get_float_value(temp, fp->dtype.type);
 
             *yarray = fp_left + (x_value - xp_left_) * (fp_right - fp_left) / (xp_right_ - xp_left_);
         }
@@ -167,7 +167,7 @@ STATIC mp_obj_t approx_trapz(size_t n_args, const mp_obj_t *pos_args, mp_map_t *
         mp_raise_ValueError(translate("trapz is defined for 1D arrays"));
     }
 
-    mp_float_t (*funcy)(void *) = ndarray_get_float_function(y->dtype);
+    mp_float_t (*funcy)(void *) = ndarray_get_float_function(y->dtype.type);
     uint8_t *yarray = (uint8_t *)y->array;
 
     size_t count = 1;
@@ -179,7 +179,7 @@ STATIC mp_obj_t approx_trapz(size_t n_args, const mp_obj_t *pos_args, mp_map_t *
             mp_raise_ValueError(translate("trapz is defined for 1D arrays of equal length"));
         }
 
-        mp_float_t (*funcx)(void *) = ndarray_get_float_function(x->dtype);
+        mp_float_t (*funcx)(void *) = ndarray_get_float_function(x->dtype.type);
         uint8_t *xarray = (uint8_t *)x->array;
         mp_float_t x1, x2;
 
@@ -206,7 +206,7 @@ STATIC mp_obj_t approx_trapz(size_t n_args, const mp_obj_t *pos_args, mp_map_t *
         yarray += y->strides[ULAB_MAX_DIMS - 1];
 
         for(size_t i=1; i < y->len; i++) {
-            y2 = ndarray_get_float_index(y->array, y->dtype, i);
+            y2 = ndarray_get_float_index(y->array, y->dtype.type, i);
             mp_float_t value = (y2 + y1);
             m = mean + (value - mean) / (mp_float_t)count;
             mean = m;
diff --git a/code/numpy/compare/compare.c b/code/numpy/compare/compare.c
index dd22a9d6..e799e8c1 100644
--- a/code/numpy/compare/compare.c
+++ b/code/numpy/compare/compare.c
@@ -54,64 +54,64 @@ static mp_obj_t compare_function(mp_obj_t x1, mp_obj_t x2, uint8_t op) {
     // uint16 + int16 => float
     // The parameters of RUN_COMPARE_LOOP are
     // typecode of result, type_out, type_left, type_right, lhs operand, rhs operand, operator
-    if(lhs->dtype == NDARRAY_UINT8) {
-        if(rhs->dtype == NDARRAY_UINT8) {
+    if(lhs->dtype.type == NDARRAY_UINT8) {
+        if(rhs->dtype.type == NDARRAY_UINT8) {
             RUN_COMPARE_LOOP(NDARRAY_UINT8, uint8_t, uint8_t, uint8_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
-        } else if(rhs->dtype == NDARRAY_INT8) {
+        } else if(rhs->dtype.type == NDARRAY_INT8) {
             RUN_COMPARE_LOOP(NDARRAY_INT16, int16_t, uint8_t, int8_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
-        } else if(rhs->dtype == NDARRAY_UINT16) {
+        } else if(rhs->dtype.type == NDARRAY_UINT16) {
             RUN_COMPARE_LOOP(NDARRAY_UINT16, uint16_t, uint8_t, uint16_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
-        } else if(rhs->dtype == NDARRAY_INT16) {
+        } else if(rhs->dtype.type == NDARRAY_INT16) {
             RUN_COMPARE_LOOP(NDARRAY_INT16, int16_t, uint8_t, int16_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
-        } else if(rhs->dtype == NDARRAY_FLOAT) {
+        } else if(rhs->dtype.type == NDARRAY_FLOAT) {
             RUN_COMPARE_LOOP(NDARRAY_FLOAT, mp_float_t, uint8_t, mp_float_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
         }
-    } else if(lhs->dtype == NDARRAY_INT8) {
-        if(rhs->dtype == NDARRAY_UINT8) {
+    } else if(lhs->dtype.type == NDARRAY_INT8) {
+        if(rhs->dtype.type == NDARRAY_UINT8) {
             RUN_COMPARE_LOOP(NDARRAY_INT16, int16_t, int8_t, uint8_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
-        } else if(rhs->dtype == NDARRAY_INT8) {
+        } else if(rhs->dtype.type == NDARRAY_INT8) {
             RUN_COMPARE_LOOP(NDARRAY_INT8, int8_t, int8_t, int8_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
-        } else if(rhs->dtype == NDARRAY_UINT16) {
+        } else if(rhs->dtype.type == NDARRAY_UINT16) {
             RUN_COMPARE_LOOP(NDARRAY_INT16, int16_t, int8_t, uint16_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
-        } else if(rhs->dtype == NDARRAY_INT16) {
+        } else if(rhs->dtype.type == NDARRAY_INT16) {
             RUN_COMPARE_LOOP(NDARRAY_INT16, int16_t, int8_t, int16_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
-        } else if(rhs->dtype == NDARRAY_FLOAT) {
+        } else if(rhs->dtype.type == NDARRAY_FLOAT) {
             RUN_COMPARE_LOOP(NDARRAY_FLOAT, mp_float_t, int8_t, mp_float_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
         }
-    } else if(lhs->dtype == NDARRAY_UINT16) {
-        if(rhs->dtype == NDARRAY_UINT8) {
+    } else if(lhs->dtype.type == NDARRAY_UINT16) {
+        if(rhs->dtype.type == NDARRAY_UINT8) {
             RUN_COMPARE_LOOP(NDARRAY_UINT16, uint16_t, uint16_t, uint8_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
-        } else if(rhs->dtype == NDARRAY_INT8) {
+        } else if(rhs->dtype.type == NDARRAY_INT8) {
             RUN_COMPARE_LOOP(NDARRAY_UINT16, uint16_t, uint16_t, int8_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
-        } else if(rhs->dtype == NDARRAY_UINT16) {
+        } else if(rhs->dtype.type == NDARRAY_UINT16) {
             RUN_COMPARE_LOOP(NDARRAY_UINT16, uint16_t, uint16_t, uint16_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
-        } else if(rhs->dtype == NDARRAY_INT16) {
+        } else if(rhs->dtype.type == NDARRAY_INT16) {
             RUN_COMPARE_LOOP(NDARRAY_FLOAT, mp_float_t, uint16_t, int16_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
-        } else if(rhs->dtype == NDARRAY_FLOAT) {
+        } else if(rhs->dtype.type == NDARRAY_FLOAT) {
             RUN_COMPARE_LOOP(NDARRAY_FLOAT, mp_float_t, uint8_t, mp_float_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
         }
-    } else if(lhs->dtype == NDARRAY_INT16) {
-        if(rhs->dtype == NDARRAY_UINT8) {
+    } else if(lhs->dtype.type == NDARRAY_INT16) {
+        if(rhs->dtype.type == NDARRAY_UINT8) {
             RUN_COMPARE_LOOP(NDARRAY_INT16, int16_t, int16_t, uint8_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
-        } else if(rhs->dtype == NDARRAY_INT8) {
+        } else if(rhs->dtype.type == NDARRAY_INT8) {
             RUN_COMPARE_LOOP(NDARRAY_INT16, int16_t, int16_t, int8_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
-        } else if(rhs->dtype == NDARRAY_UINT16) {
+        } else if(rhs->dtype.type == NDARRAY_UINT16) {
             RUN_COMPARE_LOOP(NDARRAY_FLOAT, mp_float_t, int16_t, uint16_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
-        } else if(rhs->dtype == NDARRAY_INT16) {
+        } else if(rhs->dtype.type == NDARRAY_INT16) {
             RUN_COMPARE_LOOP(NDARRAY_INT16, int16_t, int16_t, int16_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
-        } else if(rhs->dtype == NDARRAY_FLOAT) {
+        } else if(rhs->dtype.type == NDARRAY_FLOAT) {
             RUN_COMPARE_LOOP(NDARRAY_FLOAT, mp_float_t, uint16_t, mp_float_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
         }
-    } else if(lhs->dtype == NDARRAY_FLOAT) {
-        if(rhs->dtype == NDARRAY_UINT8) {
+    } else if(lhs->dtype.type == NDARRAY_FLOAT) {
+        if(rhs->dtype.type == NDARRAY_UINT8) {
             RUN_COMPARE_LOOP(NDARRAY_FLOAT, mp_float_t, mp_float_t, uint8_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
-        } else if(rhs->dtype == NDARRAY_INT8) {
+        } else if(rhs->dtype.type == NDARRAY_INT8) {
             RUN_COMPARE_LOOP(NDARRAY_FLOAT, mp_float_t, mp_float_t, int8_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
-        } else if(rhs->dtype == NDARRAY_UINT16) {
+        } else if(rhs->dtype.type == NDARRAY_UINT16) {
             RUN_COMPARE_LOOP(NDARRAY_FLOAT, mp_float_t, mp_float_t, uint16_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
-        } else if(rhs->dtype == NDARRAY_INT16) {
+        } else if(rhs->dtype.type == NDARRAY_INT16) {
             RUN_COMPARE_LOOP(NDARRAY_FLOAT, mp_float_t, mp_float_t, int16_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
-        } else if(rhs->dtype == NDARRAY_FLOAT) {
+        } else if(rhs->dtype.type == NDARRAY_FLOAT) {
             RUN_COMPARE_LOOP(NDARRAY_FLOAT, mp_float_t, mp_float_t, mp_float_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
         }
     }
@@ -199,7 +199,7 @@ static mp_obj_t compare_isinf_isfinite(mp_obj_t _x, uint8_t mask) {
         ndarray_obj_t *results = ndarray_new_dense_ndarray(x->ndim, x->shape, NDARRAY_BOOL);
         // At this point, results is all False
         uint8_t *rarray = (uint8_t *)results->array;
-        if(x->dtype != NDARRAY_FLOAT) {
+        if(x->dtype.type != NDARRAY_FLOAT) {
             // int types can never be infinite...
             if(!mask) {
                 // ...so flip all values in the array, if the function was called from isfinite
@@ -281,7 +281,7 @@ mp_obj_t compare_maximum(mp_obj_t x1, mp_obj_t x2) {
     mp_obj_t result = compare_function(x1, x2, COMPARE_MAXIMUM);
     if((MP_OBJ_IS_INT(x1) || mp_obj_is_float(x1)) && (MP_OBJ_IS_INT(x2) || mp_obj_is_float(x2))) {
         ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(result);
-        return mp_binary_get_val_array(ndarray->dtype, ndarray->array, 0);
+        return mp_binary_get_val_array(ndarray->dtype.type, ndarray->array, 0);
     }
     return result;
 }
@@ -296,7 +296,7 @@ mp_obj_t compare_minimum(mp_obj_t x1, mp_obj_t x2) {
     mp_obj_t result = compare_function(x1, x2, COMPARE_MINIMUM);
     if((MP_OBJ_IS_INT(x1) || mp_obj_is_float(x1)) && (MP_OBJ_IS_INT(x2) || mp_obj_is_float(x2))) {
         ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(result);
-        return mp_binary_get_val_array(ndarray->dtype, ndarray->array, 0);
+        return mp_binary_get_val_array(ndarray->dtype.type, ndarray->array, 0);
     }
     return result;
 }
diff --git a/code/numpy/fft/fft_tools.c b/code/numpy/fft/fft_tools.c
index e527f22d..39d43112 100644
--- a/code/numpy/fft/fft_tools.c
+++ b/code/numpy/fft/fft_tools.c
@@ -108,7 +108,7 @@ mp_obj_t fft_fft_ifft_spectrogram(size_t n_args, mp_obj_t arg_re, mp_obj_t arg_i
     mp_float_t *data_re = (mp_float_t *)out_re->array;
 
     uint8_t *array = (uint8_t *)re->array;
-    mp_float_t (*func)(void *) = ndarray_get_float_function(re->dtype);
+    mp_float_t (*func)(void *) = ndarray_get_float_function(re->dtype.type);
 
     for(size_t i=0; i < len; i++) {
         *data_re++ = func(array);
@@ -129,7 +129,7 @@ mp_obj_t fft_fft_ifft_spectrogram(size_t n_args, mp_obj_t arg_re, mp_obj_t arg_i
             mp_raise_ValueError(translate("real and imaginary parts must be of equal length"));
         }
         array = (uint8_t *)im->array;
-        func = ndarray_get_float_function(im->dtype);
+        func = ndarray_get_float_function(im->dtype.type);
         for(size_t i=0; i < len; i++) {
            *data_im++ = func(array);
            array += im->strides[ULAB_MAX_DIMS - 1];
diff --git a/code/numpy/filter/filter.c b/code/numpy/filter/filter.c
index 280efd0e..e90d3e38 100644
--- a/code/numpy/filter/filter.c
+++ b/code/numpy/filter/filter.c
@@ -69,8 +69,8 @@ mp_obj_t filter_convolve(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_a
         for(int32_t n=bot_n; n < top_n; n++) {
             int32_t idx_c = (len_c - n - 1) * cs;
             int32_t idx_a = (n + k) * as;
-            mp_float_t ai = ndarray_get_float_index(aarray, a->dtype, idx_a);
-            mp_float_t ci = ndarray_get_float_index(carray, c->dtype, idx_c);
+            mp_float_t ai = ndarray_get_float_index(aarray, a->dtype.type, idx_a);
+            mp_float_t ci = ndarray_get_float_index(carray, c->dtype.type, idx_c);
             accum += ai * ci;
         }
         *outptr++ = accum;
diff --git a/code/numpy/linalg/linalg.c b/code/numpy/linalg/linalg.c
index e62a0922..c7ccccda 100644
--- a/code/numpy/linalg/linalg.c
+++ b/code/numpy/linalg/linalg.c
@@ -46,7 +46,7 @@ static mp_obj_t linalg_cholesky(mp_obj_t oin) {
 
     size_t N = ndarray->shape[ULAB_MAX_DIMS - 1];
     uint8_t *array = (uint8_t *)ndarray->array;
-    mp_float_t (*func)(void *) = ndarray_get_float_function(ndarray->dtype);
+    mp_float_t (*func)(void *) = ndarray_get_float_function(ndarray->dtype.type);
 
     for(size_t m=0; m < N; m++) { // rows
         for(size_t n=0; n < N; n++) { // columns
@@ -112,7 +112,7 @@ static mp_obj_t linalg_det(mp_obj_t oin) {
     mp_float_t *tmp = m_new(mp_float_t, N * N);
     for(size_t m=0; m < N; m++) { // rows
         for(size_t n=0; n < N; n++) { // columns
-            *tmp++ = ndarray_get_float_value(array, ndarray->dtype);
+            *tmp++ = ndarray_get_float_value(array, ndarray->dtype.type);
             array += ndarray->strides[ULAB_MAX_DIMS - 1];
         }
         array -= ndarray->strides[ULAB_MAX_DIMS - 1] * N;
@@ -184,7 +184,7 @@ static mp_obj_t linalg_eig(mp_obj_t oin) {
     mp_float_t *array = m_new(mp_float_t, S*S);
     for(size_t i=0; i < S; i++) { // rows
         for(size_t j=0; j < S; j++) { // columns
-            *array++ = ndarray_get_float_value(iarray, in->dtype);
+            *array++ = ndarray_get_float_value(iarray, in->dtype.type);
             iarray += in->strides[ULAB_MAX_DIMS - 1];
         }
         iarray -= in->strides[ULAB_MAX_DIMS - 1] * S;
@@ -245,7 +245,7 @@ static mp_obj_t linalg_inv(mp_obj_t o_in) {
     ndarray_obj_t *inverted = ndarray_new_dense_ndarray(2, ndarray_shape_vector(0, 0, N, N), NDARRAY_FLOAT);
     mp_float_t *iarray = (mp_float_t *)inverted->array;
 
-    mp_float_t (*func)(void *) = ndarray_get_float_function(ndarray->dtype);
+    mp_float_t (*func)(void *) = ndarray_get_float_function(ndarray->dtype.type);
 
     for(size_t i=0; i < N; i++) { // rows
         for(size_t j=0; j < N; j++) { // columns
@@ -304,7 +304,7 @@ static mp_obj_t linalg_norm(size_t n_args, const mp_obj_t *pos_args, mp_map_t *k
         ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(x);
         uint8_t *array = (uint8_t *)ndarray->array;
         // always get a float, so that we don't have to resolve the dtype later
-        mp_float_t (*func)(void *) = ndarray_get_float_function(ndarray->dtype);
+        mp_float_t (*func)(void *) = ndarray_get_float_function(ndarray->dtype.type);
         shape_strides _shape_strides = tools_reduce_axes(ndarray, axis);
         ndarray_obj_t *results = ndarray_new_dense_ndarray(_shape_strides.ndim, _shape_strides.shape, NDARRAY_FLOAT);
         mp_float_t *rarray = (mp_float_t *)results->array;
diff --git a/code/numpy/numerical/numerical.c b/code/numpy/numerical/numerical.c
index a8227d87..8679f954 100644
--- a/code/numpy/numerical/numerical.c
+++ b/code/numpy/numerical/numerical.c
@@ -70,7 +70,7 @@ static mp_obj_t numerical_all_any(mp_obj_t oin, mp_obj_t axis, uint8_t optype) {
         ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(oin);
         uint8_t *array = (uint8_t *)ndarray->array;
         // always get a float, so that we don't have to resolve the dtype later
-        mp_float_t (*func)(void *) = ndarray_get_float_function(ndarray->dtype);
+        mp_float_t (*func)(void *) = ndarray_get_float_function(ndarray->dtype.type);
         ndarray_obj_t *results = NULL;
         uint8_t *rarray = NULL;
         shape_strides _shape_strides = tools_reduce_axes(ndarray, axis);
@@ -198,7 +198,7 @@ static mp_obj_t numerical_sum_mean_std_ndarray(ndarray_obj_t *ndarray, mp_obj_t
             // if there are too many degrees of freedom, there is no point in calculating anything
             return mp_obj_new_float(MICROPY_FLOAT_CONST(0.0));
         }
-        mp_float_t (*func)(void *) = ndarray_get_float_function(ndarray->dtype);
+        mp_float_t (*func)(void *) = ndarray_get_float_function(ndarray->dtype.type);
         mp_float_t M = 0.0, m = 0.0, S = 0.0, s = 0.0;
         size_t count = 0;
 
@@ -247,7 +247,7 @@ static mp_obj_t numerical_sum_mean_std_ndarray(ndarray_obj_t *ndarray, mp_obj_t
         #endif
         if(optype == NUMERICAL_SUM) {
             // numpy returns an integer for integer input types
-            if(ndarray->dtype == NDARRAY_FLOAT) {
+            if(ndarray->dtype.type == NDARRAY_FLOAT) {
                 return mp_obj_new_float(M * ndarray->len);
             } else {
                 return mp_obj_new_int((int32_t)(M * ndarray->len));
@@ -263,16 +263,16 @@ static mp_obj_t numerical_sum_mean_std_ndarray(ndarray_obj_t *ndarray, mp_obj_t
         uint8_t *rarray = NULL;
         mp_float_t *farray = NULL;
         if(optype == NUMERICAL_SUM) {
-            results = ndarray_new_dense_ndarray(_shape_strides.ndim, _shape_strides.shape, ndarray->dtype);
+            results = ndarray_new_dense_ndarray(_shape_strides.ndim, _shape_strides.shape, ndarray->dtype.type);
             rarray = (uint8_t *)results->array;
             // TODO: numpy promotes the output to the highest integer type
-            if(ndarray->dtype == NDARRAY_UINT8) {
+            if(ndarray->dtype.type == NDARRAY_UINT8) {
                 RUN_SUM(uint8_t, array, results, rarray, _shape_strides);
-            } else if(ndarray->dtype == NDARRAY_INT8) {
+            } else if(ndarray->dtype.type == NDARRAY_INT8) {
                 RUN_SUM(int8_t, array, results, rarray, _shape_strides);
-            } else if(ndarray->dtype == NDARRAY_UINT16) {
+            } else if(ndarray->dtype.type == NDARRAY_UINT16) {
                 RUN_SUM(uint16_t, array, results, rarray, _shape_strides);
-            } else if(ndarray->dtype == NDARRAY_INT16) {
+            } else if(ndarray->dtype.type == NDARRAY_INT16) {
                 RUN_SUM(int16_t, array, results, rarray, _shape_strides);
             } else {
                 // for floats, the sum might be inaccurate with the naive summation
@@ -295,20 +295,20 @@ static mp_obj_t numerical_sum_mean_std_ndarray(ndarray_obj_t *ndarray, mp_obj_t
                 return MP_OBJ_FROM_PTR(results);
             }
             mp_float_t div = optype == NUMERICAL_STD ? (mp_float_t)(_shape_strides.shape[0] - ddof) : 0.0;
-            if(ndarray->dtype == NDARRAY_UINT8) {
+            if(ndarray->dtype.type == NDARRAY_UINT8) {
                 RUN_MEAN_STD(uint8_t, array, farray, _shape_strides, div, isStd);
-            } else if(ndarray->dtype == NDARRAY_INT8) {
+            } else if(ndarray->dtype.type == NDARRAY_INT8) {
                 RUN_MEAN_STD(int8_t, array, farray, _shape_strides, div, isStd);
-            } else if(ndarray->dtype == NDARRAY_UINT16) {
+            } else if(ndarray->dtype.type == NDARRAY_UINT16) {
                 RUN_MEAN_STD(uint16_t, array, farray, _shape_strides, div, isStd);
-            } else if(ndarray->dtype == NDARRAY_INT16) {
+            } else if(ndarray->dtype.type == NDARRAY_INT16) {
                 RUN_MEAN_STD(int16_t, array, farray, _shape_strides, div, isStd);
             } else {
                 RUN_MEAN_STD(mp_float_t, array, farray, _shape_strides, div, isStd);
             }
         }
         if(results->ndim == 0) { // return a scalar here
-            return mp_binary_get_val_array(results->dtype, results->array, 0);
+            return mp_binary_get_val_array(results->dtype.type, results->array, 0);
         }
         return MP_OBJ_FROM_PTR(results);
     }
@@ -359,7 +359,7 @@ static mp_obj_t numerical_argmin_argmax_ndarray(ndarray_obj_t *ndarray, mp_obj_t
 
     if(axis == mp_const_none) {
         // work with the flattened array
-        mp_float_t (*func)(void *) = ndarray_get_float_function(ndarray->dtype);
+        mp_float_t (*func)(void *) = ndarray_get_float_function(ndarray->dtype.type);
         uint8_t *array = (uint8_t *)ndarray->array;
         mp_float_t best_value = func(array);
         mp_float_t value;
@@ -417,7 +417,7 @@ static mp_obj_t numerical_argmin_argmax_ndarray(ndarray_obj_t *ndarray, mp_obj_t
         if((optype == NUMERICAL_ARGMIN) || (optype == NUMERICAL_ARGMAX)) {
             return mp_obj_new_int(best_index);
         } else {
-            if(ndarray->dtype == NDARRAY_FLOAT) {
+            if(ndarray->dtype.type == NDARRAY_FLOAT) {
                 return mp_obj_new_float(best_value);
             } else {
                 return MP_OBJ_NEW_SMALL_INT((int32_t)best_value);
@@ -443,24 +443,24 @@ static mp_obj_t numerical_argmin_argmax_ndarray(ndarray_obj_t *ndarray, mp_obj_t
         if((optype == NUMERICAL_ARGMIN) || (optype == NUMERICAL_ARGMAX)) {
             results = ndarray_new_dense_ndarray(MAX(1, ndarray->ndim-1), shape, NDARRAY_INT16);
         } else {
-            results = ndarray_new_dense_ndarray(MAX(1, ndarray->ndim-1), shape, ndarray->dtype);
+            results = ndarray_new_dense_ndarray(MAX(1, ndarray->ndim-1), shape, ndarray->dtype.type);
         }
 
         uint8_t *rarray = (uint8_t *)results->array;
 
-        if(ndarray->dtype == NDARRAY_UINT8) {
+        if(ndarray->dtype.type == NDARRAY_UINT8) {
             RUN_ARGMIN(ndarray, uint8_t, array, results, rarray, shape, strides, index, optype);
-        } else if(ndarray->dtype == NDARRAY_INT8) {
+        } else if(ndarray->dtype.type == NDARRAY_INT8) {
             RUN_ARGMIN(ndarray, int8_t, array, results, rarray, shape, strides, index, optype);
-        } else if(ndarray->dtype == NDARRAY_UINT16) {
+        } else if(ndarray->dtype.type == NDARRAY_UINT16) {
             RUN_ARGMIN(ndarray, uint16_t, array, results, rarray, shape, strides, index, optype);
-        } else if(ndarray->dtype == NDARRAY_INT16) {
+        } else if(ndarray->dtype.type == NDARRAY_INT16) {
             RUN_ARGMIN(ndarray, int16_t, array, results, rarray, shape, strides, index, optype);
         } else {
             RUN_ARGMIN(ndarray, mp_float_t, array, results, rarray, shape, strides, index, optype);
         }
         if(results->len == 1) {
-            return mp_binary_get_val_array(results->dtype, results->array, 0);
+            return mp_binary_get_val_array(results->dtype.type, results->array, 0);
         }
         return MP_OBJ_FROM_PTR(results);
     }
@@ -561,9 +561,9 @@ static mp_obj_t numerical_sort_helper(mp_obj_t oin, mp_obj_t axis, uint8_t inpla
     int32_t increment = ndarray->strides[ax] / ndarray->itemsize;
 
     uint8_t *array = (uint8_t *)ndarray->array;
-    if((ndarray->dtype == NDARRAY_UINT8) || (ndarray->dtype == NDARRAY_INT8)) {
+    if((ndarray->dtype.type == NDARRAY_UINT8) || (ndarray->dtype.type == NDARRAY_INT8)) {
         HEAPSORT(ndarray, uint8_t, array, shape, strides, ax, increment, ndarray->shape[ax]);
-    } else if((ndarray->dtype == NDARRAY_INT16) || (ndarray->dtype == NDARRAY_INT16)) {
+    } else if((ndarray->dtype.type == NDARRAY_UINT16) || (ndarray->dtype.type == NDARRAY_INT16)) {
         HEAPSORT(ndarray, uint16_t, array, shape, strides, ax, increment, ndarray->shape[ax]);
     } else {
         HEAPSORT(ndarray, mp_float_t, array, shape, strides, ax, increment, ndarray->shape[ax]);
@@ -710,9 +710,9 @@ mp_obj_t numerical_argsort(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw
     // reset the array
     iarray = indices->array;
 
-    if((ndarray->dtype == NDARRAY_UINT8) || (ndarray->dtype == NDARRAY_INT8)) {
+    if((ndarray->dtype.type == NDARRAY_UINT8) || (ndarray->dtype.type == NDARRAY_INT8)) {
         HEAP_ARGSORT(ndarray, uint8_t, array, shape, strides, ax, increment, ndarray->shape[ax], iarray, istrides, iincrement);
-    } else if((ndarray->dtype == NDARRAY_UINT16) || (ndarray->dtype == NDARRAY_INT16)) {
+    } else if((ndarray->dtype.type == NDARRAY_UINT16) || (ndarray->dtype.type == NDARRAY_INT16)) {
         HEAP_ARGSORT(ndarray, uint16_t, array, shape, strides, ax, increment, ndarray->shape[ax], iarray, istrides, iincrement);
     } else {
         HEAP_ARGSORT(ndarray, mp_float_t, array, shape, strides, ax, increment, ndarray->shape[ax], iarray, istrides, iincrement);
@@ -740,12 +740,12 @@ static mp_obj_t numerical_cross(mp_obj_t _a, mp_obj_t _b) {
     }
 
     mp_float_t *results = m_new(mp_float_t, 3);
-    results[0] = ndarray_get_float_index(a->array, a->dtype, 1) * ndarray_get_float_index(b->array, b->dtype, 2);
-    results[0] -= ndarray_get_float_index(a->array, a->dtype, 2) * ndarray_get_float_index(b->array, b->dtype, 1);
-    results[1] = -ndarray_get_float_index(a->array, a->dtype, 0) * ndarray_get_float_index(b->array, b->dtype, 2);
-    results[1] += ndarray_get_float_index(a->array, a->dtype, 2) * ndarray_get_float_index(b->array, b->dtype, 0);
-    results[2] = ndarray_get_float_index(a->array, a->dtype, 0) * ndarray_get_float_index(b->array, b->dtype, 1);
-    results[2] -= ndarray_get_float_index(a->array, a->dtype, 1) * ndarray_get_float_index(b->array, b->dtype, 0);
+    results[0] = ndarray_get_float_index(a->array, a->dtype.type, 1) * ndarray_get_float_index(b->array, b->dtype.type, 2);
+    results[0] -= ndarray_get_float_index(a->array, a->dtype.type, 2) * ndarray_get_float_index(b->array, b->dtype.type, 1);
+    results[1] = -ndarray_get_float_index(a->array, a->dtype.type, 0) * ndarray_get_float_index(b->array, b->dtype.type, 2);
+    results[1] += ndarray_get_float_index(a->array, a->dtype.type, 2) * ndarray_get_float_index(b->array, b->dtype.type, 0);
+    results[2] = ndarray_get_float_index(a->array, a->dtype.type, 0) * ndarray_get_float_index(b->array, b->dtype.type, 1);
+    results[2] -= ndarray_get_float_index(a->array, a->dtype.type, 1) * ndarray_get_float_index(b->array, b->dtype.type, 0);
 
     /* The upcasting happens here with the rules
 
@@ -762,17 +762,17 @@ static mp_obj_t numerical_cross(mp_obj_t _a, mp_obj_t _b) {
     */
 
     uint8_t dtype = NDARRAY_FLOAT;
-    if(a->dtype == b->dtype) {
-        dtype = a->dtype;
-    } else if(((a->dtype == NDARRAY_UINT8) && (b->dtype == NDARRAY_INT8)) || ((a->dtype == NDARRAY_INT8) && (b->dtype == NDARRAY_UINT8))) {
+    if(a->dtype.type == b->dtype.type) {
+        dtype = a->dtype.type;
+    } else if(((a->dtype.type == NDARRAY_UINT8) && (b->dtype.type == NDARRAY_INT8)) || ((a->dtype.type == NDARRAY_INT8) && (b->dtype.type == NDARRAY_UINT8))) {
         dtype = NDARRAY_INT16;
-    } else if(((a->dtype == NDARRAY_UINT8) && (b->dtype == NDARRAY_INT16)) || ((a->dtype == NDARRAY_INT16) && (b->dtype == NDARRAY_UINT8))) {
+    } else if(((a->dtype.type == NDARRAY_UINT8) && (b->dtype.type == NDARRAY_INT16)) || ((a->dtype.type == NDARRAY_INT16) && (b->dtype.type == NDARRAY_UINT8))) {
         dtype = NDARRAY_INT16;
-    } else if(((a->dtype == NDARRAY_UINT8) && (b->dtype == NDARRAY_UINT16)) || ((a->dtype == NDARRAY_UINT16) && (b->dtype == NDARRAY_UINT8))) {
+    } else if(((a->dtype.type == NDARRAY_UINT8) && (b->dtype.type == NDARRAY_UINT16)) || ((a->dtype.type == NDARRAY_UINT16) && (b->dtype.type == NDARRAY_UINT8))) {
         dtype = NDARRAY_UINT16;
-    } else if(((a->dtype == NDARRAY_INT8) && (b->dtype == NDARRAY_INT16)) || ((a->dtype == NDARRAY_INT16) && (b->dtype == NDARRAY_INT8))) {
+    } else if(((a->dtype.type == NDARRAY_INT8) && (b->dtype.type == NDARRAY_INT16)) || ((a->dtype.type == NDARRAY_INT16) && (b->dtype.type == NDARRAY_INT8))) {
         dtype = NDARRAY_INT16;
-    } else if(((a->dtype == NDARRAY_INT8) && (b->dtype == NDARRAY_UINT16)) || ((a->dtype == NDARRAY_UINT16) && (b->dtype == NDARRAY_INT8))) {
+    } else if(((a->dtype.type == NDARRAY_INT8) && (b->dtype.type == NDARRAY_UINT16)) || ((a->dtype.type == NDARRAY_UINT16) && (b->dtype.type == NDARRAY_INT8))) {
         dtype = NDARRAY_UINT16;
     }
 
@@ -854,7 +854,7 @@ mp_obj_t numerical_diff(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_ar
         }
     }
     uint8_t *array = (uint8_t *)ndarray->array;
-    ndarray_obj_t *results = ndarray_new_dense_ndarray(ndarray->ndim, shape, ndarray->dtype);
+    ndarray_obj_t *results = ndarray_new_dense_ndarray(ndarray->ndim, shape, ndarray->dtype.type);
     uint8_t *rarray = (uint8_t *)results->array;
 
     memset(shape, 0, sizeof(size_t)*ULAB_MAX_DIMS);
@@ -862,13 +862,13 @@ mp_obj_t numerical_diff(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_ar
     memset(strides, 0, sizeof(int32_t)*ULAB_MAX_DIMS);
     numerical_reduce_axes(ndarray, ax, shape, strides);
 
-    if(ndarray->dtype == NDARRAY_UINT8) {
+    if(ndarray->dtype.type == NDARRAY_UINT8) {
         RUN_DIFF(ndarray, uint8_t, array, results, rarray, shape, strides, index, stencil, N);
-    } else if(ndarray->dtype == NDARRAY_INT8) {
+    } else if(ndarray->dtype.type == NDARRAY_INT8) {
         RUN_DIFF(ndarray, int8_t, array, results, rarray, shape, strides, index, stencil, N);
-    }  else if(ndarray->dtype == NDARRAY_UINT16) {
+    }  else if(ndarray->dtype.type == NDARRAY_UINT16) {
         RUN_DIFF(ndarray, uint16_t, array, results, rarray, shape, strides, index, stencil, N);
-    } else if(ndarray->dtype == NDARRAY_INT16) {
+    } else if(ndarray->dtype.type == NDARRAY_INT16) {
         RUN_DIFF(ndarray, int16_t, array, results, rarray, shape, strides, index, stencil, N);
     } else {
         RUN_DIFF(ndarray, mp_float_t, array, results, rarray, shape, strides, index, stencil, N);
@@ -905,7 +905,7 @@ mp_obj_t numerical_flip(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_ar
     ndarray_obj_t *results = NULL;
     ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(args[0].u_obj);
     if(args[1].u_obj == mp_const_none) { // flip the flattened array
-        results = ndarray_new_linear_array(ndarray->len, ndarray->dtype);
+        results = ndarray_new_linear_array(ndarray->len, ndarray->dtype.type);
         ndarray_copy_array(ndarray, results);
         uint8_t *rarray = (uint8_t *)results->array;
         rarray += (results->len - 1) * results->itemsize;
@@ -981,10 +981,10 @@ mp_obj_t numerical_median(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_
         uint8_t *array = (uint8_t *)ndarray->array;
         size_t len = ndarray->len;
         array += (len >> 1) * ndarray->itemsize;
-        mp_float_t median = ndarray_get_float_value(array, ndarray->dtype);
+        mp_float_t median = ndarray_get_float_value(array, ndarray->dtype.type);
         if(!(len & 0x01)) { // len is an even number
             array -= ndarray->itemsize;
-            median += ndarray_get_float_value(array, ndarray->dtype);
+            median += ndarray_get_float_value(array, ndarray->dtype.type);
             median *= MICROPY_FLOAT_CONST(0.5);
         }
         return mp_obj_new_float(median);
@@ -1017,10 +1017,10 @@ mp_obj_t numerical_median(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_
                 size_t k = 0;
                 do {
                     array += ndarray->strides[ax] * (len >> 1);
-                    mp_float_t median = ndarray_get_float_value(array, ndarray->dtype);
+                    mp_float_t median = ndarray_get_float_value(array, ndarray->dtype.type);
                     if(!(len & 0x01)) { // len is an even number
                         array -= ndarray->strides[ax];
-                        median += ndarray_get_float_value(array, ndarray->dtype);
+                        median += ndarray_get_float_value(array, ndarray->dtype.type);
                         median *= MICROPY_FLOAT_CONST(0.5);
                         array += ndarray->strides[ax];
                     }
@@ -1087,7 +1087,7 @@ mp_obj_t numerical_roll(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_ar
     }
     ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(args[0].u_obj);
     uint8_t *array = ndarray->array;
-    ndarray_obj_t *results = ndarray_new_dense_ndarray(ndarray->ndim, ndarray->shape, ndarray->dtype);
+    ndarray_obj_t *results = ndarray_new_dense_ndarray(ndarray->ndim, ndarray->shape, ndarray->dtype.type);
 
     int32_t shift = mp_obj_get_int(args[1].u_obj);
     int32_t _shift = shift < 0 ? -shift : shift;
diff --git a/code/numpy/poly/poly.c b/code/numpy/poly/poly.c
index 6c1ed816..cb7a1989 100644
--- a/code/numpy/poly/poly.c
+++ b/code/numpy/poly/poly.c
@@ -162,7 +162,7 @@ mp_obj_t poly_polyval(mp_obj_t o_p, mp_obj_t o_x) {
         ndarray = ndarray_new_dense_ndarray(source->ndim, source->shape, NDARRAY_FLOAT);
         mp_float_t *array = (mp_float_t *)ndarray->array;
         
-        mp_float_t (*func)(void *) = ndarray_get_float_function(source->dtype);
+        mp_float_t (*func)(void *) = ndarray_get_float_function(source->dtype.type);
 
         // TODO: these loops are really nothing, but the re-impplementation of 
         // ITERATE_VECTOR from vectorise.c. We could pass a function pointer here
diff --git a/code/numpy/stats/stats.c b/code/numpy/stats/stats.c
index 8022ebe3..1d136562 100644
--- a/code/numpy/stats/stats.c
+++ b/code/numpy/stats/stats.c
@@ -39,9 +39,9 @@ static mp_obj_t stats_trace(mp_obj_t oin) {
     mp_float_t trace = 0.0;
     for(size_t i=0; i < ndarray->shape[ULAB_MAX_DIMS - 1]; i++) {
         int32_t pos = i * (ndarray->strides[ULAB_MAX_DIMS - 1] + ndarray->strides[ULAB_MAX_DIMS - 2]);
-        trace += ndarray_get_float_index(ndarray->array, ndarray->dtype, pos/ndarray->itemsize);
+        trace += ndarray_get_float_index(ndarray->array, ndarray->dtype.type, pos/ndarray->itemsize);
     }
-    if(ndarray->dtype == NDARRAY_FLOAT) {
+    if(ndarray->dtype.type == NDARRAY_FLOAT) {
         return mp_obj_new_float(trace);
     }
     return mp_obj_new_int_from_float(trace);
diff --git a/code/numpy/transform/transform.c b/code/numpy/transform/transform.c
index 610b6173..1d68bfa3 100644
--- a/code/numpy/transform/transform.c
+++ b/code/numpy/transform/transform.c
@@ -42,8 +42,8 @@ mp_obj_t transform_dot(mp_obj_t _m1, mp_obj_t _m2) {
     uint8_t *array1 = (uint8_t *)m1->array;
     uint8_t *array2 = (uint8_t *)m2->array;
 
-    mp_float_t (*func1)(void *) = ndarray_get_float_function(m1->dtype);
-    mp_float_t (*func2)(void *) = ndarray_get_float_function(m2->dtype);
+    mp_float_t (*func1)(void *) = ndarray_get_float_function(m1->dtype.type);
+    mp_float_t (*func2)(void *) = ndarray_get_float_function(m2->dtype.type);
 
     #if ULAB_MAX_DIMS > 1
     if ((m1->ndim == 1) && (m2->ndim == 1)) {
diff --git a/code/numpy/vector/vector.c b/code/numpy/vector/vector.c
index a92edcca..48ddadde 100644
--- a/code/numpy/vector/vector.c
+++ b/code/numpy/vector/vector.c
@@ -46,7 +46,7 @@ static mp_obj_t vectorise_generic_vector(mp_obj_t o_in, mp_float_t (*f)(mp_float
         
         #if ULAB_VECTORISE_USES_FUN_POINTER
         
-            mp_float_t (*func)(void *) = ndarray_get_float_function(source->dtype);
+            mp_float_t (*func)(void *) = ndarray_get_float_function(source->dtype.type);
             
             #if ULAB_MAX_DIMS > 3
             size_t i = 0;
@@ -86,13 +86,13 @@ static mp_obj_t vectorise_generic_vector(mp_obj_t o_in, mp_float_t (*f)(mp_float
             } while(i < source->shape[ULAB_MAX_DIMS - 4]);
             #endif /* ULAB_MAX_DIMS > 3 */
         #else
-        if(source->dtype == NDARRAY_UINT8) {
+        if(source->dtype.type == NDARRAY_UINT8) {
             ITERATE_VECTOR(uint8_t, array, source, sarray);
-        } else if(source->dtype == NDARRAY_INT8) {
+        } else if(source->dtype.type == NDARRAY_INT8) {
             ITERATE_VECTOR(int8_t, array, source, sarray);
-        } else if(source->dtype == NDARRAY_UINT16) {
+        } else if(source->dtype.type == NDARRAY_UINT16) {
             ITERATE_VECTOR(uint16_t, array, source, sarray);
-        } else if(source->dtype == NDARRAY_INT16) {
+        } else if(source->dtype.type == NDARRAY_INT16) {
             ITERATE_VECTOR(int16_t, array, source, sarray);
         } else {
             ITERATE_VECTOR(mp_float_t, array, source, sarray);
@@ -183,7 +183,7 @@ mp_obj_t vectorise_around(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_
     mp_float_t *narray = (mp_float_t *)ndarray->array;
     uint8_t *sarray = (uint8_t *)source->array;
 
-    mp_float_t (*func)(void *) = ndarray_get_float_function(source->dtype);
+    mp_float_t (*func)(void *) = ndarray_get_float_function(source->dtype.type);
 
     #if ULAB_MAX_DIMS > 3
     size_t i = 0;
@@ -267,8 +267,8 @@ mp_obj_t vectorise_arctan2(mp_obj_t y, mp_obj_t x) {
     ndarray_obj_t *results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT);
     mp_float_t *rarray = (mp_float_t *)results->array;
 
-    mp_float_t (*funcx)(void *) = ndarray_get_float_function(ndarray_x->dtype);
-    mp_float_t (*funcy)(void *) = ndarray_get_float_function(ndarray_y->dtype);
+    mp_float_t (*funcx)(void *) = ndarray_get_float_function(ndarray_x->dtype.type);
+    mp_float_t (*funcy)(void *) = ndarray_get_float_function(ndarray_y->dtype.type);
 
     #if ULAB_MAX_DIMS > 3
     size_t i = 0;
@@ -556,7 +556,7 @@ static mp_obj_t vectorise_vectorized_function_call(mp_obj_t self_in, size_t n_ar
         ndarray_obj_t *source = MP_OBJ_TO_PTR(args[0]);
         ndarray_obj_t *ndarray = ndarray_new_dense_ndarray(source->ndim, source->shape, self->otypes);
         for(size_t i=0; i < source->len; i++) {
-            avalue[0] = mp_binary_get_val_array(source->dtype, source->array, i);
+            avalue[0] = mp_binary_get_val_array(source->dtype.type, source->array, i);
             fvalue = self->type->call(self->fun, 1, 0, avalue);
             mp_binary_set_val_array(self->otypes, ndarray->array, i, fvalue);
         }
diff --git a/code/scipy/signal/signal.c b/code/scipy/signal/signal.c
index 09e92d79..eda26351 100644
--- a/code/scipy/signal/signal.c
+++ b/code/scipy/signal/signal.c
@@ -79,7 +79,7 @@ mp_obj_t signal_sosfilt(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_ar
         #endif
         uint8_t *iarray = (uint8_t *)inarray->array;
         for(size_t i=0; i < lenx; i++) {
-            *yarray++ = ndarray_get_float_value(iarray, inarray->dtype);
+            *yarray++ = ndarray_get_float_value(iarray, inarray->dtype.type);
             iarray += inarray->strides[ULAB_MAX_DIMS - 1];
         }
         yarray -= lenx;
@@ -103,7 +103,7 @@ mp_obj_t signal_sosfilt(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_ar
             if((zi->shape[ULAB_MAX_DIMS - 1] != lensos) || (zi->shape[ULAB_MAX_DIMS - 1] != 2)) {
                 mp_raise_ValueError(translate("zi must be of shape (n_section, 2)"));
             }
-            if(zi->dtype != NDARRAY_FLOAT) {
+            if(zi->dtype.type != NDARRAY_FLOAT) {
                 mp_raise_ValueError(translate("zi must be of float type"));
             }
             // TODO: this won't work with sparse arrays
diff --git a/code/ulab.h b/code/ulab.h
index 92dcca78..d9beddc5 100644
--- a/code/ulab.h
+++ b/code/ulab.h
@@ -72,6 +72,12 @@
 #define ULAB_HAS_DTYPE_OBJECT               (0)
 #endif
 
+// This constant determines whether a function pointer can be attached to the dtype object
+// Such function pointers are useful for custom data types
+#ifndef ULAB_DTYPE_IS_EXTENDABLE
+#define ULAB_DTYPE_IS_EXTENDABLE            (0)
+#endif
+
 // the ndarray binary operators
 #ifndef NDARRAY_HAS_BINARY_OPS
 #define NDARRAY_HAS_BINARY_OPS              (1)
diff --git a/code/ulab_create.c b/code/ulab_create.c
index 737de6b9..a6e18eb5 100644
--- a/code/ulab_create.c
+++ b/code/ulab_create.c
@@ -179,7 +179,7 @@ mp_obj_t create_concatenate(size_t n_args, const mp_obj_t *pos_args, mp_map_t *k
 
     // first check, whether the arrays are compatible
     ndarray_obj_t *_ndarray = MP_OBJ_TO_PTR(ndarrays->items[0]);
-    uint8_t dtype = _ndarray->dtype;
+    uint8_t dtype = _ndarray->dtype.type;
     uint8_t ndim = _ndarray->ndim;
     if(axis < 0) {
         axis += ndim;
@@ -196,7 +196,7 @@ mp_obj_t create_concatenate(size_t n_args, const mp_obj_t *pos_args, mp_map_t *k
     for(uint8_t i=1; i < ndarrays->len; i++) {
         _ndarray = MP_OBJ_TO_PTR(ndarrays->items[i]);
         // check, whether the arrays are compatible
-        if((dtype != _ndarray->dtype) || (ndim != _ndarray->ndim)) {
+        if((dtype != _ndarray->dtype.type) || (ndim != _ndarray->ndim)) {
             mp_raise_ValueError(translate("input arrays are not compatible"));
         }
         for(uint8_t j=0; j < ULAB_MAX_DIMS; j++) {
@@ -298,7 +298,7 @@ mp_obj_t create_diag(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args)
     }
     ndarray_obj_t *source = MP_OBJ_TO_PTR(args[0].u_obj);
     if(source->ndim == 1) { // return a rank-2 tensor with the prescribed diagonal
-        ndarray_obj_t *target = ndarray_new_dense_ndarray(2, ndarray_shape_vector(0, 0, source->len, source->len), source->dtype);
+        ndarray_obj_t *target = ndarray_new_dense_ndarray(2, ndarray_shape_vector(0, 0, source->len, source->len), source->dtype.type);
         uint8_t *sarray = (uint8_t *)source->array;
         uint8_t *tarray = (uint8_t *)target->array;
         for(size_t i=0; i < source->len; i++) {
@@ -330,7 +330,7 @@ mp_obj_t create_diag(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args)
         mp_raise_ValueError(translate("offset is too large"));
     }
 
-    ndarray_obj_t *target = ndarray_new_linear_array(len, source->dtype);
+    ndarray_obj_t *target = ndarray_new_linear_array(len, source->dtype.type);
     uint8_t *tarray = (uint8_t *)target->array;
 
     for(size_t i=0; i < len; i++) {
@@ -671,7 +671,7 @@ mp_obj_t create_frombuffer(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw
         }
         ndarray_obj_t *ndarray = m_new_obj(ndarray_obj_t);
         ndarray->base.type = &ulab_ndarray_type;
-        ndarray->dtype = dtype == NDARRAY_BOOL ? NDARRAY_UINT8 : dtype;
+        ndarray->dtype.type = dtype == NDARRAY_BOOL ? NDARRAY_UINT8 : dtype;
         ndarray->boolean = dtype == NDARRAY_BOOL ? NDARRAY_BOOLEAN : NDARRAY_NUMERIC;
         ndarray->ndim = 1;
         ndarray->len = len;
diff --git a/code/user/user.c b/code/user/user.c
index fa5e660e..17829b43 100644
--- a/code/user/user.c
+++ b/code/user/user.c
@@ -39,29 +39,29 @@ static mp_obj_t user_square(mp_obj_t arg) {
 
     // if the input is a dense array, create `results` with the same number of
     // dimensions, shape, and dtype
-    ndarray_obj_t *results = ndarray_new_dense_ndarray(ndarray->ndim, ndarray->shape, ndarray->dtype);
+    ndarray_obj_t *results = ndarray_new_dense_ndarray(ndarray->ndim, ndarray->shape, ndarray->dtype.type);
 
     // since in a dense array the iteration over the elements is trivial, we
     // can cast the data arrays ndarray->array and results->array to the actual type
-    if(ndarray->dtype == NDARRAY_UINT8) {
+    if(ndarray->dtype.type == NDARRAY_UINT8) {
         uint8_t *array = (uint8_t *)ndarray->array;
         uint8_t *rarray = (uint8_t *)results->array;
         for(size_t i=0; i < ndarray->len; i++, array++) {
             *rarray++ = (*array) * (*array);
         }
-    } else if(ndarray->dtype == NDARRAY_INT8) {
+    } else if(ndarray->dtype.type == NDARRAY_INT8) {
         int8_t *array = (int8_t *)ndarray->array;
         int8_t *rarray = (int8_t *)results->array;
         for(size_t i=0; i < ndarray->len; i++, array++) {
             *rarray++ = (*array) * (*array);
         }
-    } else if(ndarray->dtype == NDARRAY_UINT16) {
+    } else if(ndarray->dtype.type == NDARRAY_UINT16) {
         uint16_t *array = (uint16_t *)ndarray->array;
         uint16_t *rarray = (uint16_t *)results->array;
         for(size_t i=0; i < ndarray->len; i++, array++) {
             *rarray++ = (*array) * (*array);
         }
-    } else if(ndarray->dtype == NDARRAY_INT16) {
+    } else if(ndarray->dtype.type == NDARRAY_INT16) {
         int16_t *array = (int16_t *)ndarray->array;
         int16_t *rarray = (int16_t *)results->array;
         for(size_t i=0; i < ndarray->len; i++, array++) {
diff --git a/run-tests b/run-tests
deleted file mode 100755
index 880b13f0..00000000
--- a/run-tests
+++ /dev/null
@@ -1,570 +0,0 @@
-#! /usr/bin/env python3
-
-import os
-import subprocess
-import sys
-import platform
-import argparse
-import re
-import threading
-import multiprocessing
-from multiprocessing.pool import ThreadPool
-from glob import glob
-
-if os.name == 'nt':
-    MICROPYTHON = os.getenv('MICROPY_MICROPYTHON', 'micropython/ports/windows/micropython.exe')
-else:
-    MICROPYTHON = os.getenv('MICROPY_MICROPYTHON', 'micropython/ports/unix/micropython')
-
-# mpy-cross is only needed if --via-mpy command-line arg is passed
-MPYCROSS = os.getenv('MICROPY_MPYCROSS', '../mpy-cross/mpy-cross')
-
-# Set PYTHONIOENCODING so that CPython will use utf-8 on systems which set another encoding in the locale
-os.environ['PYTHONIOENCODING'] = 'utf-8'
-
-def rm_f(fname):
-    if os.path.exists(fname):
-        os.remove(fname)
-
-
-# unescape wanted regex chars and escape unwanted ones
-def convert_regex_escapes(line):
-    cs = []
-    escape = False
-    for c in str(line, 'utf8'):
-        if escape:
-            escape = False
-            cs.append(c)
-        elif c == '\\':
-            escape = True
-        elif c in ('(', ')', '[', ']', '{', '}', '.', '*', '+', '^', '$'):
-            cs.append('\\' + c)
-        else:
-            cs.append(c)
-    # accept carriage-return(s) before final newline
-    if cs[-1] == '\n':
-        cs[-1] = '\r*\n'
-    return bytes(''.join(cs), 'utf8')
-
-
-def run_micropython(pyb, args, test_file, is_special=False):
-    special_tests = (
-        'micropython/meminfo.py', 'basics/bytes_compare3.py',
-        'basics/builtin_help.py', 'thread/thread_exc2.py',
-    )
-    had_crash = False
-    if pyb is None:
-        # run on PC
-        if test_file.startswith(('cmdline/', 'feature_check/')) or test_file in special_tests:
-            # special handling for tests of the unix cmdline program
-            is_special = True
-
-        if is_special:
-            # check for any cmdline options needed for this test
-            args = [MICROPYTHON]
-            with open(test_file, 'rb') as f:
-                line = f.readline()
-                if line.startswith(b'# cmdline:'):
-                    # subprocess.check_output on Windows only accepts strings, not bytes
-                    args += [str(c, 'utf-8') for c in line[10:].strip().split()]
-
-            # run the test, possibly with redirected input
-            try:
-                if 'repl_' in test_file:
-                    # Need to use a PTY to test command line editing
-                    try:
-                        import pty
-                    except ImportError:
-                        # in case pty module is not available, like on Windows
-                        return b'SKIP\n'
-                    import select
-
-                    def get(required=False):
-                        rv = b''
-                        while True:
-                            ready = select.select([emulator], [], [], 0.02)
-                            if ready[0] == [emulator]:
-                                rv += os.read(emulator, 1024)
-                            else:
-                                if not required or rv:
-                                    return rv
-
-                    def send_get(what):
-                        os.write(emulator, what)
-                        return get()
-
-                    with open(test_file, 'rb') as f:
-                        # instead of: output_mupy = subprocess.check_output(args, stdin=f)
-                        # openpty returns two read/write file descriptors.  The first one is
-                        # used by the program which provides the virtual
-                        # terminal service, and the second one is used by the
-                        # subprogram which requires a tty to work.
-                        emulator, subterminal = pty.openpty()
-                        p = subprocess.Popen(args, stdin=subterminal, stdout=subterminal,
-                                             stderr=subprocess.STDOUT, bufsize=0)
-                        banner = get(True)
-                        output_mupy = banner + b''.join(send_get(line) for line in f)
-                        send_get(b'\x04') # exit the REPL, so coverage info is saved
-                        p.kill()
-                        os.close(emulator)
-                        os.close(subterminal)
-                else:
-                    output_mupy = subprocess.check_output(args + [test_file], stderr=subprocess.STDOUT)
-            except subprocess.CalledProcessError:
-                return b'CRASH'
-
-        else:
-            # a standard test run on PC
-
-            # create system command
-            cmdlist = [MICROPYTHON, '-X', 'emit=' + args.emit]
-            if args.heapsize is not None:
-                cmdlist.extend(['-X', 'heapsize=' + args.heapsize])
-
-            # if running via .mpy, first compile the .py file
-            if args.via_mpy:
-                subprocess.check_output([MPYCROSS, '-mcache-lookup-bc', '-o', 'mpytest.mpy', test_file])
-                cmdlist.extend(['-m', 'mpytest'])
-            else:
-                cmdlist.append(test_file)
-
-            # run the actual test
-            e = {"MICROPYPATH": os.getcwd() + ":", "LANG": "en_US.UTF-8"}
-            p = subprocess.Popen(cmdlist, env=e, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
-            output_mupy = b''
-            while p.poll() is None:
-                output_mupy += p.stdout.read()
-            output_mupy += p.stdout.read()
-            if p.returncode != 0:
-                output_mupy = b'CRASH'
-
-            # clean up if we had an intermediate .mpy file
-            if args.via_mpy:
-                rm_f('mpytest.mpy')
-
-    else:
-        # run on pyboard
-        import pyboard
-        pyb.enter_raw_repl()
-        try:
-            output_mupy = pyb.execfile(test_file)
-        except pyboard.PyboardError:
-            had_crash = True
-            output_mupy = b'CRASH'
-
-    # canonical form for all ports/platforms is to use \n for end-of-line
-    output_mupy = output_mupy.replace(b'\r\n', b'\n')
-
-    # don't try to convert the output if we should skip this test
-    if had_crash or output_mupy in (b'SKIP\n', b'CRASH'):
-        return output_mupy
-
-    if is_special or test_file in special_tests:
-        # convert parts of the output that are not stable across runs
-        with open(test_file + '.exp', 'rb') as f:
-            lines_exp = []
-            for line in f.readlines():
-                if line == b'########\n':
-                    line = (line,)
-                else:
-                    line = (line, re.compile(convert_regex_escapes(line)))
-                lines_exp.append(line)
-        lines_mupy = [line + b'\n' for line in output_mupy.split(b'\n')]
-        if output_mupy.endswith(b'\n'):
-            lines_mupy = lines_mupy[:-1] # remove erroneous last empty line
-        i_mupy = 0
-        for i in range(len(lines_exp)):
-            if lines_exp[i][0] == b'########\n':
-                # 8x #'s means match 0 or more whole lines
-                line_exp = lines_exp[i + 1]
-                skip = 0
-                while i_mupy + skip < len(lines_mupy) and not line_exp[1].match(lines_mupy[i_mupy + skip]):
-                    skip += 1
-                if i_mupy + skip >= len(lines_mupy):
-                    lines_mupy[i_mupy] = b'######## FAIL\n'
-                    break
-                del lines_mupy[i_mupy:i_mupy + skip]
-                lines_mupy.insert(i_mupy, b'########\n')
-                i_mupy += 1
-            else:
-                # a regex
-                if lines_exp[i][1].match(lines_mupy[i_mupy]):
-                    lines_mupy[i_mupy] = lines_exp[i][0]
-                else:
-                    #print("don't match: %r %s" % (lines_exp[i][1], lines_mupy[i_mupy])) # DEBUG
-                    pass
-                i_mupy += 1
-            if i_mupy >= len(lines_mupy):
-                break
-        output_mupy = b''.join(lines_mupy)
-
-    return output_mupy
-
-
-def run_feature_check(pyb, args, base_path, test_file):
-    return run_micropython(pyb, args, base_path + "/feature_check/" + test_file, is_special=True)
-
-class ThreadSafeCounter:
-    def __init__(self, start=0):
-        self._value = start
-        self._lock = threading.Lock()
-
-    def add(self, to_add):
-        with self._lock: self._value += to_add
-
-    def append(self, arg):
-        self.add([arg])
-
-    @property
-    def value(self):
-        return self._value
-
-def run_tests(pyb, tests, args, base_path=".", num_threads=1):
-    test_count = ThreadSafeCounter()
-    testcase_count = ThreadSafeCounter()
-    passed_count = ThreadSafeCounter()
-    failed_tests = ThreadSafeCounter([])
-    skipped_tests = ThreadSafeCounter([])
-
-    skip_tests = set()
-    skip_native = False
-    skip_int_big = False
-    skip_set_type = False
-    skip_async = False
-    skip_const = False
-    skip_revops = False
-    skip_endian = False
-    has_complex = True
-    has_coverage = False
-
-    upy_float_precision = 32
-
-    # Some tests shouldn't be run under Travis CI
-    if os.getenv('TRAVIS') == 'true':
-        skip_tests.add('basics/memoryerror.py')
-        skip_tests.add('thread/thread_gc1.py') # has reliability issues
-        skip_tests.add('thread/thread_lock4.py') # has reliability issues
-        skip_tests.add('thread/stress_heap.py') # has reliability issues
-        skip_tests.add('thread/stress_recurse.py') # has reliability issues
-
-    if upy_float_precision == 0:
-        skip_tests.add('extmod/ujson_dumps_float.py')
-        skip_tests.add('extmod/ujson_loads_float.py')
-        skip_tests.add('misc/rge_sm.py')
-    if upy_float_precision < 32:
-        skip_tests.add('float/float2int_intbig.py') # requires fp32, there's float2int_fp30_intbig.py instead
-        skip_tests.add('float/string_format.py') # requires fp32, there's string_format_fp30.py instead
-        skip_tests.add('float/bytes_construct.py') # requires fp32
-        skip_tests.add('float/bytearray_construct.py') # requires fp32
-    if upy_float_precision < 64:
-        skip_tests.add('float/float_divmod.py') # tested by float/float_divmod_relaxed.py instead
-        skip_tests.add('float/float2int_doubleprec_intbig.py')
-        skip_tests.add('float/float_parse_doubleprec.py')
-
-    if not has_complex:
-        skip_tests.add('float/complex1.py')
-        skip_tests.add('float/complex1_intbig.py')
-        skip_tests.add('float/int_big_float.py')
-        skip_tests.add('float/true_value.py')
-        skip_tests.add('float/types.py')
-
-    if not has_coverage:
-        skip_tests.add('cmdline/cmd_parsetree.py')
-
-    # Some tests shouldn't be run on a PC
-    if args.target == 'unix':
-        # unix build does not have the GIL so can't run thread mutation tests
-        for t in tests:
-            if t.startswith('thread/mutate_'):
-                skip_tests.add(t)
-
-    # Some tests shouldn't be run on pyboard
-    if args.target != 'unix':
-        skip_tests.add('basics/exception_chain.py') # warning is not printed
-        skip_tests.add('micropython/meminfo.py') # output is very different to PC output
-        skip_tests.add('extmod/machine_mem.py') # raw memory access not supported
-
-        if args.target == 'wipy':
-            skip_tests.add('misc/print_exception.py')       # requires error reporting full
-            skip_tests.update({'extmod/uctypes_%s.py' % t for t in 'bytearray le native_le ptr_le ptr_native_le sizeof sizeof_native array_assign_le array_assign_native_le'.split()}) # requires uctypes
-            skip_tests.add('extmod/zlibd_decompress.py')    # requires zlib
-            skip_tests.add('extmod/uheapq1.py')             # uheapq not supported by WiPy
-            skip_tests.add('extmod/urandom_basic.py')       # requires urandom
-            skip_tests.add('extmod/urandom_extra.py')       # requires urandom
-        elif args.target == 'esp8266':
-            skip_tests.add('misc/rge_sm.py')                # too large
-        elif args.target == 'minimal':
-            skip_tests.add('basics/class_inplace_op.py')    # all special methods not supported
-            skip_tests.add('basics/subclass_native_init.py')# native subclassing corner cases not support
-            skip_tests.add('misc/rge_sm.py')                # too large
-            skip_tests.add('micropython/opt_level.py')      # don't assume line numbers are stored
-
-    # Some tests are known to fail on 64-bit machines
-    if pyb is None and platform.architecture()[0] == '64bit':
-        pass
-
-    # Some tests use unsupported features on Windows
-    if os.name == 'nt':
-        skip_tests.add('import/import_file.py') # works but CPython prints forward slashes
-
-    # Some tests are known to fail with native emitter
-    # Remove them from the below when they work
-    if args.emit == 'native':
-        skip_tests.update({'basics/%s.py' % t for t in 'gen_yield_from gen_yield_from_close gen_yield_from_ducktype gen_yield_from_exc gen_yield_from_executing gen_yield_from_iter gen_yield_from_send gen_yield_from_stopped gen_yield_from_throw gen_yield_from_throw2 gen_yield_from_throw3 generator1 generator2 generator_args generator_close generator_closure generator_exc generator_pend_throw generator_return generator_send'.split()}) # require yield
-        skip_tests.update({'basics/%s.py' % t for t in 'bytes_gen class_store_class globals_del string_join gen_stack_overflow'.split()}) # require yield
-        skip_tests.update({'basics/async_%s.py' % t for t in 'def await await2 for for2 with with2 coroutine'.split()}) # require yield
-        skip_tests.update({'basics/%s.py' % t for t in 'try_reraise try_reraise2'.split()}) # require raise_varargs
-        skip_tests.update({'basics/%s.py' % t for t in 'with_break with_continue with_return'.split()}) # require complete with support
-        skip_tests.add('basics/array_construct2.py') # requires generators
-        skip_tests.add('basics/bool1.py') # seems to randomly fail
-        skip_tests.add('basics/builtin_hash_gen.py') # requires yield
-        skip_tests.add('basics/class_bind_self.py') # requires yield
-        skip_tests.add('basics/del_deref.py') # requires checking for unbound local
-        skip_tests.add('basics/del_local.py') # requires checking for unbound local
-        skip_tests.add('basics/exception_chain.py') # raise from is not supported
-        skip_tests.add('basics/for_range.py') # requires yield_value
-        skip_tests.add('basics/try_finally_loops.py') # requires proper try finally code
-        skip_tests.add('basics/try_finally_return.py') # requires proper try finally code
-        skip_tests.add('basics/try_finally_return2.py') # requires proper try finally code
-        skip_tests.add('basics/unboundlocal.py') # requires checking for unbound local
-        skip_tests.add('import/gen_context.py') # requires yield_value
-        skip_tests.add('misc/features.py') # requires raise_varargs
-        skip_tests.add('misc/rge_sm.py') # requires yield
-        skip_tests.add('misc/print_exception.py') # because native doesn't have proper traceback info
-        skip_tests.add('misc/sys_exc_info.py') # sys.exc_info() is not supported for native
-        skip_tests.add('micropython/emg_exc.py') # because native doesn't have proper traceback info
-        skip_tests.add('micropython/heapalloc_traceback.py') # because native doesn't have proper traceback info
-        skip_tests.add('micropython/heapalloc_iter.py') # requires generators
-        skip_tests.add('micropython/schedule.py') # native code doesn't check pending events
-        skip_tests.add('stress/gc_trace.py') # requires yield
-        skip_tests.add('stress/recursive_gen.py') # requires yield
-        skip_tests.add('extmod/vfs_userfs.py') # because native doesn't properly handle globals across different modules
-        skip_tests.add('../extmod/ulab/tests/argminmax.py') # requires yield
-
-    def run_one_test(test_file):
-        test_file = test_file.replace('\\', '/')
-
-        if args.filters:
-            # Default verdict is the opposit of the first action
-            verdict = "include" if args.filters[0][0] == "exclude" else "exclude"
-            for action, pat in args.filters:
-                if pat.search(test_file):
-                    verdict = action
-            if verdict == "exclude":
-                return
-
-        test_basename = os.path.basename(test_file)
-        test_name = os.path.splitext(test_basename)[0]
-        is_native = test_name.startswith("native_") or test_name.startswith("viper_")
-        is_endian = test_name.endswith("_endian")
-        is_int_big = test_name.startswith("int_big") or test_name.endswith("_intbig")
-        is_set_type = test_name.startswith("set_") or test_name.startswith("frozenset")
-        is_async = test_name.startswith("async_")
-        is_const = test_name.startswith("const")
-
-        skip_it = test_file in skip_tests
-        skip_it |= skip_native and is_native
-        skip_it |= skip_endian and is_endian
-        skip_it |= skip_int_big and is_int_big
-        skip_it |= skip_set_type and is_set_type
-        skip_it |= skip_async and is_async
-        skip_it |= skip_const and is_const
-        skip_it |= skip_revops and test_name.startswith("class_reverse_op")
-
-        if args.list_tests:
-            if not skip_it:
-                print(test_file)
-            return
-
-        if skip_it:
-            print("skip ", test_file)
-            skipped_tests.append(test_name)
-            return
-
-        # get expected output
-        test_file_expected = test_file + '.exp'
-        if os.path.isfile(test_file_expected):
-            # expected output given by a file, so read that in
-            with open(test_file_expected, 'rb') as f:
-                output_expected = f.read()
-        else:
-            if not args.write_exp:
-                output_expected = b"NOEXP\n"
-            else:
-                # run CPython to work out expected output
-                e = {"PYTHONPATH": os.getcwd(),
-                     "PATH": os.environ["PATH"],
-                     "LANG": "en_US.UTF-8"}
-                p = subprocess.Popen([MICROPYTHON, test_file], env=e, stdout=subprocess.PIPE)
-                output_expected = b''
-                while p.poll() is None:
-                    output_expected += p.stdout.read()
-                output_expected += p.stdout.read()
-                with open(test_file_expected, 'wb') as f:
-                    f.write(output_expected)
-
-        # canonical form for all host platforms is to use \n for end-of-line
-        output_expected = output_expected.replace(b'\r\n', b'\n')
-
-        if args.write_exp:
-            return
-
-        # run MicroPython
-        output_mupy = run_micropython(pyb, args, test_file)
-
-        if output_mupy == b'SKIP\n':
-            print("skip ", test_file)
-            skipped_tests.append(test_name)
-            return
-
-        if output_expected == b'NOEXP\n':
-            print("noexp", test_file)
-            failed_tests.append(test_name)
-            return
-
-        testcase_count.add(len(output_expected.splitlines()))
-
-        filename_expected = test_basename + ".exp"
-        filename_mupy = test_basename + ".out"
-
-        if output_expected == output_mupy:
-            print("pass ", test_file)
-            passed_count.add(1)
-            rm_f(filename_expected)
-            rm_f(filename_mupy)
-        else:
-            with open(filename_expected, "wb") as f:
-                f.write(output_expected)
-            with open(filename_mupy, "wb") as f:
-                f.write(output_mupy)
-            print("### Expected")
-            print(output_expected)
-            print("### Actual")
-            print(output_mupy)
-            print("FAIL ", test_file)
-            failed_tests.append(test_name)
-
-        test_count.add(1)
-
-    if args.list_tests:
-        return True
-
-    if num_threads > 1:
-        pool = ThreadPool(num_threads)
-        pool.map(run_one_test, tests)
-    else:
-        for test in tests:
-            run_one_test(test)
-
-    print("{} tests performed ({} individual testcases)".format(test_count.value, testcase_count.value))
-    print("{} tests passed".format(passed_count.value))
-
-    if len(skipped_tests.value) > 0:
-        print("{} tests skipped: {}".format(len(skipped_tests.value), ' '.join(sorted(skipped_tests.value))))
-    if len(failed_tests.value) > 0:
-        print("{} tests failed: {}".format(len(failed_tests.value), ' '.join(sorted(failed_tests.value))))
-        return False
-
-    # all tests succeeded
-    return True
-
-
-class append_filter(argparse.Action):
-
-    def __init__(self, option_strings, dest, **kwargs):
-        super().__init__(option_strings, dest, default=[], **kwargs)
-
-    def __call__(self, parser, args, value, option):
-        if not hasattr(args, self.dest):
-            args.filters = []
-        if option.startswith(("-e", "--e")):
-            option = "exclude"
-        else:
-            option = "include"
-        args.filters.append((option, re.compile(value)))
-
-
-def main():
-    cmd_parser = argparse.ArgumentParser(
-        formatter_class=argparse.RawDescriptionHelpFormatter,
-        description='Run and manage tests for MicroPython.',
-        epilog='''\
-Options -i and -e can be multiple and processed in the order given. Regex
-"search" (vs "match") operation is used. An action (include/exclude) of
-the last matching regex is used:
-  run-tests -i async - exclude all, then include tests containg "async" anywhere
-  run-tests -e '/big.+int' - include all, then exclude by regex
-  run-tests -e async -i async_foo - include all, exclude async, yet still include async_foo
-''')
-    cmd_parser.add_argument('--target', default='unix', help='the target platform')
-    cmd_parser.add_argument('--device', default='/dev/ttyACM0', help='the serial device or the IP address of the pyboard')
-    cmd_parser.add_argument('-b', '--baudrate', default=115200, help='the baud rate of the serial device')
-    cmd_parser.add_argument('-u', '--user', default='micro', help='the telnet login username')
-    cmd_parser.add_argument('-p', '--password', default='python', help='the telnet login password')
-    cmd_parser.add_argument('-d', '--test-dirs', nargs='*', help='input test directories (if no files given)')
-    cmd_parser.add_argument('-e', '--exclude', action=append_filter, metavar='REGEX', dest='filters', help='exclude test by regex on path/name.py')
-    cmd_parser.add_argument('-i', '--include', action=append_filter, metavar='REGEX', dest='filters', help='include test by regex on path/name.py')
-    cmd_parser.add_argument('--write-exp', action='store_true', help='save .exp files to run tests w/o CPython')
-    cmd_parser.add_argument('--list-tests', action='store_true', help='list tests instead of running them')
-    cmd_parser.add_argument('--emit', default='bytecode', help='MicroPython emitter to use (bytecode or native)')
-    cmd_parser.add_argument('--heapsize', help='heapsize to use (use default if not specified)')
-    cmd_parser.add_argument('--via-mpy', action='store_true', help='compile .py files to .mpy first')
-    cmd_parser.add_argument('--keep-path', action='store_true', help='do not clear MICROPYPATH when running tests')
-    cmd_parser.add_argument('-j', '--jobs', default=1, metavar='N', type=int, help='Number of tests to run simultaneously')
-    cmd_parser.add_argument('--auto-jobs', action='store_const', dest='jobs', const=multiprocessing.cpu_count(), help='Set the -j values to the CPU (thread) count')
-    cmd_parser.add_argument('files', nargs='*', help='input test files')
-    args = cmd_parser.parse_args()
-
-    EXTERNAL_TARGETS = ('pyboard', 'wipy', 'esp8266', 'esp32', 'minimal')
-    if args.target == 'unix' or args.list_tests:
-        pyb = None
-    elif args.target in EXTERNAL_TARGETS:
-        import pyboard
-        pyb = pyboard.Pyboard(args.device, args.baudrate, args.user, args.password)
-        pyb.enter_raw_repl()
-    else:
-        raise ValueError('target must be either %s or unix' % ", ".join(EXTERNAL_TARGETS))
-
-    if len(args.files) == 0:
-        if args.test_dirs is None:
-            if args.target == 'pyboard':
-                # run pyboard tests
-                test_dirs = ('basics', 'micropython', 'float', 'misc', 'stress', 'extmod', 'pyb', 'pybnative', 'inlineasm')
-            elif args.target in ('esp8266', 'esp32', 'minimal'):
-                test_dirs = ('basics', 'micropython', 'float', 'misc', 'extmod')
-            elif args.target == 'wipy':
-                # run WiPy tests
-                test_dirs = ('basics', 'micropython', 'misc', 'extmod', 'wipy')
-            else:
-                # run PC tests
-                test_dirs = (
-                    'basics', 'micropython', 'float', 'import', 'io', 'misc',
-                    'stress', 'unicode', 'extmod', '../extmod/ulab/tests', 'unix', 'cmdline',
-                )
-        else:
-            # run tests from these directories
-            test_dirs = args.test_dirs
-        tests = sorted(test_file for test_files in (glob('{}/*.py'.format(dir)) for dir in test_dirs) for test_file in test_files)
-    else:
-        # tests explicitly given
-        tests = args.files
-
-    if not args.keep_path:
-        # clear search path to make sure tests use only builtin modules
-        os.environ['MICROPYPATH'] = ''
-
-    # Even if we run completely different tests in a different directory,
-    # we need to access feature_check's from the same directory as the
-    # run-tests script itself.
-    base_path = os.path.dirname(sys.argv[0]) or "."
-    try:
-        res = run_tests(pyb, tests, args, base_path, args.jobs)
-    finally:
-        if pyb:
-            pyb.close()
-
-    if not res:
-        sys.exit(1)
-
-if __name__ == "__main__":
-    main()

From cb94dceff00603caa51abfd00514bb55730349b0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Zolt=C3=A1n=20V=C3=B6r=C3=B6s?= <zvoros@gmail.com>
Date: Tue, 16 Feb 2021 18:20:52 +0100
Subject: [PATCH 02/19] add "recursive" macros in numerical.h

---
 code/numpy/numerical/numerical.c |  20 +-
 code/numpy/numerical/numerical.h | 312 ++++++++-----------------------
 2 files changed, 91 insertions(+), 241 deletions(-)

diff --git a/code/numpy/numerical/numerical.c b/code/numpy/numerical/numerical.c
index 8679f954..7f0ef71f 100644
--- a/code/numpy/numerical/numerical.c
+++ b/code/numpy/numerical/numerical.c
@@ -267,18 +267,18 @@ static mp_obj_t numerical_sum_mean_std_ndarray(ndarray_obj_t *ndarray, mp_obj_t
             rarray = (uint8_t *)results->array;
             // TODO: numpy promotes the output to the highest integer type
             if(ndarray->dtype.type == NDARRAY_UINT8) {
-                RUN_SUM(uint8_t, array, results, rarray, _shape_strides);
+                RUN_SUM(uint8_t, ndarray, array, results, rarray, _shape_strides);
             } else if(ndarray->dtype.type == NDARRAY_INT8) {
-                RUN_SUM(int8_t, array, results, rarray, _shape_strides);
+                RUN_SUM(int8_t, ndarray, array, results, rarray, _shape_strides);
             } else if(ndarray->dtype.type == NDARRAY_UINT16) {
-                RUN_SUM(uint16_t, array, results, rarray, _shape_strides);
+                RUN_SUM(uint16_t, ndarray, array, results, rarray, _shape_strides);
             } else if(ndarray->dtype.type == NDARRAY_INT16) {
-                RUN_SUM(int16_t, array, results, rarray, _shape_strides);
+                RUN_SUM(int16_t, ndarray, array, results, rarray, _shape_strides);
             } else {
                 // for floats, the sum might be inaccurate with the naive summation
                 // call mean, and multiply with the number of samples
                 farray = (mp_float_t *)results->array;
-                RUN_MEAN_STD(mp_float_t, array, farray, _shape_strides, 0.0, 0);
+                RUN_MEAN_STD(mp_float_t, ndarray, array, farray, _shape_strides, 0.0, 0);
                 mp_float_t norm = (mp_float_t)_shape_strides.shape[0];
                 // re-wind the array here
                 farray = (mp_float_t *)results->array;
@@ -296,15 +296,15 @@ static mp_obj_t numerical_sum_mean_std_ndarray(ndarray_obj_t *ndarray, mp_obj_t
             }
             mp_float_t div = optype == NUMERICAL_STD ? (mp_float_t)(_shape_strides.shape[0] - ddof) : 0.0;
             if(ndarray->dtype.type == NDARRAY_UINT8) {
-                RUN_MEAN_STD(uint8_t, array, farray, _shape_strides, div, isStd);
+                RUN_MEAN_STD(uint8_t, ndarray, array, farray, _shape_strides, div, isStd);
             } else if(ndarray->dtype.type == NDARRAY_INT8) {
-                RUN_MEAN_STD(int8_t, array, farray, _shape_strides, div, isStd);
+                RUN_MEAN_STD(int8_t, ndarray, array, farray, _shape_strides, div, isStd);
             } else if(ndarray->dtype.type == NDARRAY_UINT16) {
-                RUN_MEAN_STD(uint16_t, array, farray, _shape_strides, div, isStd);
+                RUN_MEAN_STD(uint16_t, ndarray, array, farray, _shape_strides, div, isStd);
             } else if(ndarray->dtype.type == NDARRAY_INT16) {
-                RUN_MEAN_STD(int16_t, array, farray, _shape_strides, div, isStd);
+                RUN_MEAN_STD(int16_t, ndarray, array, farray, _shape_strides, div, isStd);
             } else {
-                RUN_MEAN_STD(mp_float_t, array, farray, _shape_strides, div, isStd);
+                RUN_MEAN_STD(mp_float_t, ndarray, array, farray, _shape_strides, div, isStd);
             }
         }
         if(results->ndim == 0) { // return a scalar here
diff --git a/code/numpy/numerical/numerical.h b/code/numpy/numerical/numerical.h
index ef7b95d7..c5c2652b 100644
--- a/code/numpy/numerical/numerical.h
+++ b/code/numpy/numerical/numerical.h
@@ -46,7 +46,7 @@
     (rarray) += (results)->itemsize;\
 })
 
-#define RUN_SUM1(type, array, results, rarray, ss)\
+#define RUN_SUM1(type, ndarray, array, results, rarray, ss)\
 ({\
     type sum = 0;\
     for(size_t i=0; i < (ss).shape[0]; i++) {\
@@ -57,36 +57,41 @@
     (rarray) += (results)->itemsize;\
 })
 
-// The mean could be calculated by simply dividing the sum by
-// the number of elements, but that method is numerically unstable
-#define RUN_MEAN1(type, array, rarray, ss)\
-({\
-    mp_float_t M = 0.0;\
-    for(size_t i=0; i < (ss).shape[0]; i++) {\
-        mp_float_t value = (mp_float_t)(*(type *)(array));\
-        M = M + (value - M) / (mp_float_t)(i+1);\
-        (array) += (ss).strides[0];\
-    }\
-    *(rarray)++ = M;\
-})
+#define RUN_SUM2(type, ndarray, array, results, rarray, ss) do {\
+    size_t l = 0;\
+    do {\
+        RUN_SUM1(type, (ndarray), (array), (results), (rarray), (ss));\
+        (array) -= (ss).strides[0] * (ss).shape[0];\
+        (array) += (ss).strides[ULAB_MAX_DIMS - 1];\
+        l++;\
+    } while(l < (ss).shape[ULAB_MAX_DIMS - 1]);\
+} while(0)
+
+#define RUN_SUM3(type, ndarray, array, results, rarray, ss) do {\
+    size_t k = 0;\
+    do {\
+        RUN_SUM2(type, (ndarray), (array), (results), (rarray), (ss));\
+        (array) -= (ss).strides[ULAB_MAX_DIMS - 1] * (ss).shape[ULAB_MAX_DIMS - 1];\
+        (array) += (ss).strides[ULAB_MAX_DIMS - 2];\
+        k++;\
+    } while(k < (ss).shape[ULAB_MAX_DIMS - 2]);\
+} while(0)
+
+#define RUN_SUM4(type, ndarray, array, results, rarray, ss) do {\
+    size_t j = 0;\
+    do {\
+        RUN_SUM3(type, (ndarray), (array), (results), (rarray), (ss));\
+        (array) -= (ss).strides[ULAB_MAX_DIMS - 2] * (ss).shape[ULAB_MAX_DIMS - 2];\
+        (array) += (ss).strides[ULAB_MAX_DIMS - 3];\
+        j++;\
+    } while(j < (ss).shape[ULAB_MAX_DIMS - 3]);\
+} while(0)
 
 // Instead of the straightforward implementation of the definition,
 // we take the numerically stable Welford algorithm here
 // https://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/
-#define RUN_STD1(type, array, rarray, ss, div)\
-({\
-    mp_float_t M = 0.0, m = 0.0, S = 0.0;\
-    for(size_t i=0; i < (ss).shape[0]; i++) {\
-        mp_float_t value = (mp_float_t)(*(type *)(array));\
-        m = M + (value - M) / (mp_float_t)(i+1);\
-        S = S + (value - M) * (value - m);\
-        M = m;\
-        (array) += (ss).strides[0];\
-    }\
-    *(rarray)++ = MICROPY_FLOAT_C_FUN(sqrt)(S / (div));\
-})
 
-#define RUN_MEAN_STD1(type, array, rarray, ss, div, isStd)\
+#define RUN_MEAN_STD1(type, ndarray, array, rarray, ss, div, isStd)\
 ({\
     mp_float_t M = 0.0, m = 0.0, S = 0.0;\
     for(size_t i=0; i < (ss).shape[0]; i++) {\
@@ -101,6 +106,56 @@
     *(rarray)++ = isStd ? MICROPY_FLOAT_C_FUN(sqrt)(S / (div)) : M;\
 })
 
+#define RUN_MEAN_STD2(type, ndarray, array, rarray, ss, div, isStd) do {\
+    size_t l = 0;\
+    do {\
+        RUN_MEAN_STD1(type, (ndarray), (array), (rarray), (ss), (div), (isStd));\
+        (array) -= (ss).strides[0] * (ss).shape[0];\
+        (array) += (ss).strides[ULAB_MAX_DIMS - 1];\
+        l++;\
+    } while(l < (ss).shape[ULAB_MAX_DIMS - 1]);\
+} while(0)
+
+#define RUN_MEAN_STD3(type, ndarray, array, rarray, ss, div, isStd) do {\
+    size_t k = 0;\
+    do {\
+        RUN_MEAN_STD2(type, (ndarray), (array), (rarray), (ss), (div), (isStd));\
+        (array) -= (ss).strides[ULAB_MAX_DIMS - 1] * (ss).shape[ULAB_MAX_DIMS - 1];\
+        (array) += (ss).strides[ULAB_MAX_DIMS - 2];\
+        k++;\
+    } while(k < (ss).shape[ULAB_MAX_DIMS - 2]);\
+} while(0)
+
+#define RUN_MEAN_STD4(type, ndarray, array, rarray, ss, div, isStd) do {\
+    size_t j = 0;\
+    do {\
+        RUN_MEAN_STD3(type, (ndarray), (array), (rarray), (ss), (div), (isStd));\
+        (array) -= (ss).strides[ULAB_MAX_DIMS - 2] * (ss).shape[ULAB_MAX_DIMS - 2];\
+        (array) += (ss).strides[ULAB_MAX_DIMS - 3];\
+        j++;\
+    } while(j < (ss).shape[ULAB_MAX_DIMS - 3]);\
+} while(0)
+
+#if ULAB_MAX_DIMS == 1
+#define RUN_SUM RUN_SUM1
+#define RUN_MEAN_STD RUN_MEAN_STD1
+#endif
+
+#if ULAB_MAX_DIMS == 2
+#define RUN_SUM RUN_SUM2
+#define RUN_MEAN_STD RUN_MEAN_STD2
+#endif
+
+#if ULAB_MAX_DIMS == 3
+#define RUN_SUM RUN_SUM3
+#define RUN_MEAN_STD RUN_MEAN_STD3
+#endif
+
+#if ULAB_MAX_DIMS == 4
+#define RUN_SUM RUN_SUM4
+#define RUN_MEAN_STD RUN_MEAN_STD4
+#endif
+
 #define RUN_DIFF1(ndarray, type, array, results, rarray, index, stencil, N)\
 ({\
     for(size_t i=0; i < (results)->shape[ULAB_MAX_DIMS - 1]; i++) {\
@@ -188,22 +243,6 @@
 })
 
 #if ULAB_MAX_DIMS == 1
-#define RUN_SUM(type, array, results, rarray, ss) do {\
-    RUN_SUM1(type, (array), (results), (rarray), (ss));\
-} while(0)
-
-#define RUN_MEAN(type, array, rarray, ss) do {\
-    RUN_MEAN1(type, (array), (rarray), (ss));\
-} while(0)
-
-#define RUN_STD(type, array, rarray, ss, div) do {\
-    RUN_STD1(type, (array), (results), (rarray), (ss), (div));\
-} while(0)
-
-#define RUN_MEAN_STD(type, array, rarray, ss, div, isStd) do {\
-    RUN_MEAN_STD1(type, (array), (results), (rarray), (ss), (div), (isStd));\
-} while(0)
-
 #define RUN_ARGMIN(ndarray, type, array, results, rarray, shape, strides, index, op) do {\
     RUN_ARGMIN1((ndarray), type, (array), (results), (rarray), (index), (op));\
 } while(0)
@@ -223,46 +262,6 @@
 #endif
 
 #if ULAB_MAX_DIMS == 2
-#define RUN_SUM(type, array, results, rarray, ss) do {\
-    size_t l = 0;\
-    do {\
-        RUN_SUM1(type, (array), (results), (rarray), (ss));\
-        (array) -= (ss).strides[0] * (ss).shape[0];\
-        (array) += (ss).strides[ULAB_MAX_DIMS - 1];\
-        l++;\
-    } while(l < (ss).shape[ULAB_MAX_DIMS - 1]);\
-} while(0)
-
-#define RUN_MEAN(type, array, rarray, ss) do {\
-    size_t l = 0;\
-    do {\
-        RUN_MEAN1(type, (array), (rarray), (ss));\
-        (array) -= (ss).strides[0] * (ss).shape[0];\
-        (array) += (ss).strides[ULAB_MAX_DIMS - 1];\
-        l++;\
-    } while(l < (ss).shape[ULAB_MAX_DIMS - 1]);\
-} while(0)
-
-#define RUN_STD(type, array, rarray, ss, div) do {\
-    size_t l = 0;\
-    do {\
-        RUN_STD1(type, (array), (rarray), (ss), (div));\
-        (array) -= (ss).strides[0] * (ss).shape[0];\
-        (array) += (ss).strides[ULAB_MAX_DIMS - 1];\
-        l++;\
-    } while(l < (ss).shape[ULAB_MAX_DIMS - 1]);\
-} while(0)
-
-#define RUN_MEAN_STD(type, array, rarray, ss, div, isStd) do {\
-    size_t l = 0;\
-    do {\
-        RUN_MEAN_STD1(type, (array), (rarray), (ss), (div), (isStd));\
-        (array) -= (ss).strides[0] * (ss).shape[0];\
-        (array) += (ss).strides[ULAB_MAX_DIMS - 1];\
-        l++;\
-    } while(l < (ss).shape[ULAB_MAX_DIMS - 1]);\
-} while(0)
-
 
 #define RUN_ARGMIN(ndarray, type, array, results, rarray, shape, strides, index, op) do {\
     size_t l = 0;\
@@ -308,69 +307,7 @@
 #endif
 
 #if ULAB_MAX_DIMS == 3
-#define RUN_SUM(type, array, results, rarray, ss) do {\
-    size_t k = 0;\
-    do {\
-        size_t l = 0;\
-        do {\
-            RUN_SUM1(type, (array), (results), (rarray), (ss));\
-            (array) -= (ss).strides[0] * (ss).shape[0];\
-            (array) += (ss).strides[ULAB_MAX_DIMS - 1];\
-            l++;\
-        } while(l < (ss).shape[ULAB_MAX_DIMS - 1]);\
-        (array) -= (ss).strides[ULAB_MAX_DIMS - 1] * (ss).shape[ULAB_MAX_DIMS - 1];\
-        (array) += (ss).strides[ULAB_MAX_DIMS - 2];\
-        k++;\
-    } while(k < (ss).shape[ULAB_MAX_DIMS - 2]);\
-} while(0)
 
-#define RUN_MEAN(type, array, rarray, ss) do {\
-    size_t k = 0;\
-    do {\
-        size_t l = 0;\
-        do {\
-            RUN_MEAN1(type, (array), (rarray), (ss));\
-            (array) -= (ss).strides[0] * (ss).shape[0];\
-            (array) += (ss).strides[ULAB_MAX_DIMS - 1];\
-            l++;\
-        } while(l < (ss).shape[ULAB_MAX_DIMS - 1]);\
-        (array) -= (ss).strides[ULAB_MAX_DIMS - 1] * (ss).shape[ULAB_MAX_DIMS - 1];\
-        (array) += (ss).strides[ULAB_MAX_DIMS - 2];\
-        k++;\
-    } while(k < (ss).shape[ULAB_MAX_DIMS - 2]);\
-} while(0)
-
-#define RUN_STD(type, array, rarray, ss, div) do {\
-    size_t k = 0;\
-    do {\
-        size_t l = 0;\
-        do {\
-            RUN_STD1(type, (array), (rarray), (ss), (div));\
-            (array) -= (ss).strides[0] * (ss).shape[0];\
-            (array) += (ss).strides[ULAB_MAX_DIMS - 1];\
-            l++;\
-        } while(l < (ss).shape[ULAB_MAX_DIMS - 1]);\
-        (array) -= (ss).strides[ULAB_MAX_DIMS - 1] * (ss).shape[ULAB_MAX_DIMS - 1];\
-        (array) += (ss).strides[ULAB_MAX_DIMS - 2];\
-        k++;\
-    } while(k < (ss).shape[ULAB_MAX_DIMS - 2]);\
-} while(0)
-
-#define RUN_MEAN_STD(type, array, rarray, ss, div, isStd) do {\
-    size_t k = 0;\
-    do {\
-        size_t l = 0;\
-        do {\
-            RUN_MEAN_STD1(type, (array), (rarray), (ss), (div), (isStd));\
-            (array) -= (ss).strides[0] * (ss).shape[0];\
-            (array) += (ss).strides[ULAB_MAX_DIMS - 1];\
-            l++;\
-        } while(l < (ss).shape[ULAB_MAX_DIMS - 1]);\
-        (array) -= (ss).strides[ULAB_MAX_DIMS - 1] * (ss).shape[ULAB_MAX_DIMS - 1];\
-        (array) += (ss).strides[ULAB_MAX_DIMS - 2];\
-        k++;\
-    } while(k < (ss).shape[ULAB_MAX_DIMS - 2]);\
-} while(0)
 
 #define RUN_ARGMIN(ndarray, type, array, results, rarray, shape, strides, index, op) do {\
     size_t k = 0;\
@@ -444,93 +381,6 @@
 #endif
 
 #if ULAB_MAX_DIMS == 4
-#define RUN_SUM(type, array, results, rarray, shape, strides, index) do {\
-    size_t j = 0;\
-    do {\
-        size_t k = 0;\
-        do {\
-            size_t l = 0;\
-            do {\
-                RUN_SUM1(type, (array), (results), (rarray), (ss));\
-                (array) -= (ss).strides[0] * (ss).shape[0];\
-                (array) += (ss).strides[ULAB_MAX_DIMS - 1];\
-                l++;\
-            } while(l < (ss).shape[ULAB_MAX_DIMS - 1]);\
-            (array) -= (ss).strides[ULAB_MAX_DIMS - 1] * (ss).shape[ULAB_MAX_DIMS - 1];\
-            (array) += (ss).strides[ULAB_MAX_DIMS - 2];\
-            k++;\
-        } while(k < (ss).shape[ULAB_MAX_DIMS - 2]);\
-        (array) -= (ss).strides[ULAB_MAX_DIMS - 2] * (ss).shape[ULAB_MAX_DIMS - 2];\
-        (array) += (ss).strides[ULAB_MAX_DIMS - 3];\
-        j++;\
-    } while(j < (ss).shape[ULAB_MAX_DIMS - 3]);\
-} while(0)
-
-#define RUN_MEAN(type, array, rarray, ss) do {\
-    size_t j = 0;\
-    do {\
-        size_t k = 0;\
-        do {\
-            size_t l = 0;\
-            do {\
-                RUN_MEAN1(type, (array), (rarray), (ss));\
-                (array) -= (ss).strides[0] * (ss).shape[0];\
-                (array) += (ss).strides[ULAB_MAX_DIMS - 1];\
-                l++;\
-            } while(l < (ss).shape[ULAB_MAX_DIMS - 1]);\
-            (array) -= (ss).strides[ULAB_MAX_DIMS - 1] * (ss).shape[ULAB_MAX_DIMS - 1];\
-            (array) += (ss).strides[ULAB_MAX_DIMS - 2];\
-            k++;\
-        } while(k < (ss).shape[ULAB_MAX_DIMS - 2]);\
-        (array) -= (ss).strides[ULAB_MAX_DIMS - 2] * (ss).shape[ULAB_MAX_DIMS - 2];\
-        (array) += (ss).strides[ULAB_MAX_DIMS - 3];\
-        j++;\
-    } while(j < (ss).shape[ULAB_MAX_DIMS - 3]);\
-} while(0)
-
-#define RUN_STD(type, array, rarray, ss, div) do {\
-    size_t j = 0;\
-    do {\
-        size_t k = 0;\
-        do {\
-            size_t l = 0;\
-            do {\
-                RUN_STD1(type, (array), (rarray), (ss), (div));\
-                (array) -= (ss).strides[0] * (ss).shape[0];\
-                (array) += (ss).strides[ULAB_MAX_DIMS - 1];\
-                l++;\
-            } while(l < (ss).shape[ULAB_MAX_DIMS - 1]);\
-            (array) -= (ss).strides[ULAB_MAX_DIMS - 1] * (ss).shape[ULAB_MAX_DIMS - 1];\
-            (array) += (ss).strides[ULAB_MAX_DIMS - 2];\
-            k++;\
-        } while(k < (ss).shape[ULAB_MAX_DIMS - 2]);\
-        (array) -= (ss).strides[ULAB_MAX_DIMS - 2] * (ss).shape[ULAB_MAX_DIMS - 2];\
-        (array) += (ss).strides[ULAB_MAX_DIMS - 3];\
-        j++;\
-    } while(j < (ss).shape[ULAB_MAX_DIMS - 3]);\
-} while(0)
-
-#define RUN_MEAN_STD(type, array, rarray, ss, div, isStd) do {\
-    size_t j = 0;\
-    do {\
-        size_t k = 0;\
-        do {\
-            size_t l = 0;\
-            do {\
-                RUN_MEAN_STD1(type, (array), (rarray), (ss), (div), (isStd));\
-                (array) -= (ss).strides[0] * (ss).shape[0];\
-                (array) += (ss).strides[ULAB_MAX_DIMS - 1];\
-                l++;\
-            } while(l < (ss).shape[ULAB_MAX_DIMS - 1]);\
-            (array) -= (ss).strides[ULAB_MAX_DIMS - 1] * (ss).shape[ULAB_MAX_DIMS - 1];\
-            (array) += (ss).strides[ULAB_MAX_DIMS - 2];\
-            k++;\
-        } while(k < (ss).shape[ULAB_MAX_DIMS - 2]);\
-        (array) -= (ss).strides[ULAB_MAX_DIMS - 2] * (ss).shape[ULAB_MAX_DIMS - 2];\
-        (array) += (ss).strides[ULAB_MAX_DIMS - 3];\
-        j++;\
-    } while(j < (ss).shape[ULAB_MAX_DIMS - 3]);\
-} while(0)
 
 #define RUN_ARGMIN(ndarray, type, array, results, rarray, shape, strides, index, op) do {\
     size_t j = 0;\

From 0e6eaf3b002840b536699d5db14c65343dcd440f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Zolt=C3=A1n=20V=C3=B6r=C3=B6s?= <zvoros@gmail.com>
Date: Tue, 16 Feb 2021 19:27:26 +0100
Subject: [PATCH 03/19] recovered deleted run-tests file

---
 run-tests | 570 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 570 insertions(+)
 create mode 100755 run-tests

diff --git a/run-tests b/run-tests
new file mode 100755
index 00000000..880b13f0
--- /dev/null
+++ b/run-tests
@@ -0,0 +1,570 @@
+#! /usr/bin/env python3
+
+import os
+import subprocess
+import sys
+import platform
+import argparse
+import re
+import threading
+import multiprocessing
+from multiprocessing.pool import ThreadPool
+from glob import glob
+
+if os.name == 'nt':
+    MICROPYTHON = os.getenv('MICROPY_MICROPYTHON', 'micropython/ports/windows/micropython.exe')
+else:
+    MICROPYTHON = os.getenv('MICROPY_MICROPYTHON', 'micropython/ports/unix/micropython')
+
+# mpy-cross is only needed if --via-mpy command-line arg is passed
+MPYCROSS = os.getenv('MICROPY_MPYCROSS', '../mpy-cross/mpy-cross')
+
+# Set PYTHONIOENCODING so that CPython will use utf-8 on systems which set another encoding in the locale
+os.environ['PYTHONIOENCODING'] = 'utf-8'
+
+def rm_f(fname):
+    if os.path.exists(fname):
+        os.remove(fname)
+
+
+# unescape wanted regex chars and escape unwanted ones
+def convert_regex_escapes(line):
+    cs = []
+    escape = False
+    for c in str(line, 'utf8'):
+        if escape:
+            escape = False
+            cs.append(c)
+        elif c == '\\':
+            escape = True
+        elif c in ('(', ')', '[', ']', '{', '}', '.', '*', '+', '^', '$'):
+            cs.append('\\' + c)
+        else:
+            cs.append(c)
+    # accept carriage-return(s) before final newline
+    if cs[-1] == '\n':
+        cs[-1] = '\r*\n'
+    return bytes(''.join(cs), 'utf8')
+
+
+def run_micropython(pyb, args, test_file, is_special=False):
+    special_tests = (
+        'micropython/meminfo.py', 'basics/bytes_compare3.py',
+        'basics/builtin_help.py', 'thread/thread_exc2.py',
+    )
+    had_crash = False
+    if pyb is None:
+        # run on PC
+        if test_file.startswith(('cmdline/', 'feature_check/')) or test_file in special_tests:
+            # special handling for tests of the unix cmdline program
+            is_special = True
+
+        if is_special:
+            # check for any cmdline options needed for this test
+            args = [MICROPYTHON]
+            with open(test_file, 'rb') as f:
+                line = f.readline()
+                if line.startswith(b'# cmdline:'):
+                    # subprocess.check_output on Windows only accepts strings, not bytes
+                    args += [str(c, 'utf-8') for c in line[10:].strip().split()]
+
+            # run the test, possibly with redirected input
+            try:
+                if 'repl_' in test_file:
+                    # Need to use a PTY to test command line editing
+                    try:
+                        import pty
+                    except ImportError:
+                        # in case pty module is not available, like on Windows
+                        return b'SKIP\n'
+                    import select
+
+                    def get(required=False):
+                        rv = b''
+                        while True:
+                            ready = select.select([emulator], [], [], 0.02)
+                            if ready[0] == [emulator]:
+                                rv += os.read(emulator, 1024)
+                            else:
+                                if not required or rv:
+                                    return rv
+
+                    def send_get(what):
+                        os.write(emulator, what)
+                        return get()
+
+                    with open(test_file, 'rb') as f:
+                        # instead of: output_mupy = subprocess.check_output(args, stdin=f)
+                        # openpty returns two read/write file descriptors.  The first one is
+                        # used by the program which provides the virtual
+                        # terminal service, and the second one is used by the
+                        # subprogram which requires a tty to work.
+                        emulator, subterminal = pty.openpty()
+                        p = subprocess.Popen(args, stdin=subterminal, stdout=subterminal,
+                                             stderr=subprocess.STDOUT, bufsize=0)
+                        banner = get(True)
+                        output_mupy = banner + b''.join(send_get(line) for line in f)
+                        send_get(b'\x04') # exit the REPL, so coverage info is saved
+                        p.kill()
+                        os.close(emulator)
+                        os.close(subterminal)
+                else:
+                    output_mupy = subprocess.check_output(args + [test_file], stderr=subprocess.STDOUT)
+            except subprocess.CalledProcessError:
+                return b'CRASH'
+
+        else:
+            # a standard test run on PC
+
+            # create system command
+            cmdlist = [MICROPYTHON, '-X', 'emit=' + args.emit]
+            if args.heapsize is not None:
+                cmdlist.extend(['-X', 'heapsize=' + args.heapsize])
+
+            # if running via .mpy, first compile the .py file
+            if args.via_mpy:
+                subprocess.check_output([MPYCROSS, '-mcache-lookup-bc', '-o', 'mpytest.mpy', test_file])
+                cmdlist.extend(['-m', 'mpytest'])
+            else:
+                cmdlist.append(test_file)
+
+            # run the actual test
+            e = {"MICROPYPATH": os.getcwd() + ":", "LANG": "en_US.UTF-8"}
+            p = subprocess.Popen(cmdlist, env=e, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+            output_mupy = b''
+            while p.poll() is None:
+                output_mupy += p.stdout.read()
+            output_mupy += p.stdout.read()
+            if p.returncode != 0:
+                output_mupy = b'CRASH'
+
+            # clean up if we had an intermediate .mpy file
+            if args.via_mpy:
+                rm_f('mpytest.mpy')
+
+    else:
+        # run on pyboard
+        import pyboard
+        pyb.enter_raw_repl()
+        try:
+            output_mupy = pyb.execfile(test_file)
+        except pyboard.PyboardError:
+            had_crash = True
+            output_mupy = b'CRASH'
+
+    # canonical form for all ports/platforms is to use \n for end-of-line
+    output_mupy = output_mupy.replace(b'\r\n', b'\n')
+
+    # don't try to convert the output if we should skip this test
+    if had_crash or output_mupy in (b'SKIP\n', b'CRASH'):
+        return output_mupy
+
+    if is_special or test_file in special_tests:
+        # convert parts of the output that are not stable across runs
+        with open(test_file + '.exp', 'rb') as f:
+            lines_exp = []
+            for line in f.readlines():
+                if line == b'########\n':
+                    line = (line,)
+                else:
+                    line = (line, re.compile(convert_regex_escapes(line)))
+                lines_exp.append(line)
+        lines_mupy = [line + b'\n' for line in output_mupy.split(b'\n')]
+        if output_mupy.endswith(b'\n'):
+            lines_mupy = lines_mupy[:-1] # remove erroneous last empty line
+        i_mupy = 0
+        for i in range(len(lines_exp)):
+            if lines_exp[i][0] == b'########\n':
+                # 8x #'s means match 0 or more whole lines
+                line_exp = lines_exp[i + 1]
+                skip = 0
+                while i_mupy + skip < len(lines_mupy) and not line_exp[1].match(lines_mupy[i_mupy + skip]):
+                    skip += 1
+                if i_mupy + skip >= len(lines_mupy):
+                    lines_mupy[i_mupy] = b'######## FAIL\n'
+                    break
+                del lines_mupy[i_mupy:i_mupy + skip]
+                lines_mupy.insert(i_mupy, b'########\n')
+                i_mupy += 1
+            else:
+                # a regex
+                if lines_exp[i][1].match(lines_mupy[i_mupy]):
+                    lines_mupy[i_mupy] = lines_exp[i][0]
+                else:
+                    #print("don't match: %r %s" % (lines_exp[i][1], lines_mupy[i_mupy])) # DEBUG
+                    pass
+                i_mupy += 1
+            if i_mupy >= len(lines_mupy):
+                break
+        output_mupy = b''.join(lines_mupy)
+
+    return output_mupy
+
+
+def run_feature_check(pyb, args, base_path, test_file):
+    return run_micropython(pyb, args, base_path + "/feature_check/" + test_file, is_special=True)
+
+class ThreadSafeCounter:
+    def __init__(self, start=0):
+        self._value = start
+        self._lock = threading.Lock()
+
+    def add(self, to_add):
+        with self._lock: self._value += to_add
+
+    def append(self, arg):
+        self.add([arg])
+
+    @property
+    def value(self):
+        return self._value
+
+def run_tests(pyb, tests, args, base_path=".", num_threads=1):  # returns True when no test failed
+    test_count = ThreadSafeCounter()
+    testcase_count = ThreadSafeCounter()
+    passed_count = ThreadSafeCounter()
+    failed_tests = ThreadSafeCounter([])
+    skipped_tests = ThreadSafeCounter([])
+
+    skip_tests = set()
+    skip_native = False
+    skip_int_big = False
+    skip_set_type = False
+    skip_async = False
+    skip_const = False
+    skip_revops = False
+    skip_endian = False
+    has_complex = True
+    has_coverage = False
+
+    upy_float_precision = 32
+
+    # Some tests shouldn't be run under Travis CI
+    if os.getenv('TRAVIS') == 'true':
+        skip_tests.add('basics/memoryerror.py')
+        skip_tests.add('thread/thread_gc1.py') # has reliability issues
+        skip_tests.add('thread/thread_lock4.py') # has reliability issues
+        skip_tests.add('thread/stress_heap.py') # has reliability issues
+        skip_tests.add('thread/stress_recurse.py') # has reliability issues
+
+    if upy_float_precision == 0:
+        skip_tests.add('extmod/ujson_dumps_float.py')
+        skip_tests.add('extmod/ujson_loads_float.py')
+        skip_tests.add('misc/rge_sm.py')
+    if upy_float_precision < 32:
+        skip_tests.add('float/float2int_intbig.py') # requires fp32, there's float2int_fp30_intbig.py instead
+        skip_tests.add('float/string_format.py') # requires fp32, there's string_format_fp30.py instead
+        skip_tests.add('float/bytes_construct.py') # requires fp32
+        skip_tests.add('float/bytearray_construct.py') # requires fp32
+    if upy_float_precision < 64:
+        skip_tests.add('float/float_divmod.py') # tested by float/float_divmod_relaxed.py instead
+        skip_tests.add('float/float2int_doubleprec_intbig.py')
+        skip_tests.add('float/float_parse_doubleprec.py')
+
+    if not has_complex:
+        skip_tests.add('float/complex1.py')
+        skip_tests.add('float/complex1_intbig.py')
+        skip_tests.add('float/int_big_float.py')
+        skip_tests.add('float/true_value.py')
+        skip_tests.add('float/types.py')
+
+    if not has_coverage:
+        skip_tests.add('cmdline/cmd_parsetree.py')
+
+    # Some tests shouldn't be run on a PC
+    if args.target == 'unix':
+        # unix build does not have the GIL so can't run thread mutation tests
+        for t in tests:
+            if t.startswith('thread/mutate_'):
+                skip_tests.add(t)
+
+    # Some tests shouldn't be run on pyboard
+    if args.target != 'unix':
+        skip_tests.add('basics/exception_chain.py') # warning is not printed
+        skip_tests.add('micropython/meminfo.py') # output is very different to PC output
+        skip_tests.add('extmod/machine_mem.py') # raw memory access not supported
+
+        if args.target == 'wipy':
+            skip_tests.add('misc/print_exception.py')       # requires error reporting full
+            skip_tests.update({'extmod/uctypes_%s.py' % t for t in 'bytearray le native_le ptr_le ptr_native_le sizeof sizeof_native array_assign_le array_assign_native_le'.split()}) # requires uctypes
+            skip_tests.add('extmod/zlibd_decompress.py')    # requires zlib
+            skip_tests.add('extmod/uheapq1.py')             # uheapq not supported by WiPy
+            skip_tests.add('extmod/urandom_basic.py')       # requires urandom
+            skip_tests.add('extmod/urandom_extra.py')       # requires urandom
+        elif args.target == 'esp8266':
+            skip_tests.add('misc/rge_sm.py')                # too large
+        elif args.target == 'minimal':
+            skip_tests.add('basics/class_inplace_op.py')    # all special methods not supported
+            skip_tests.add('basics/subclass_native_init.py')# native subclassing corner cases not support
+            skip_tests.add('misc/rge_sm.py')                # too large
+            skip_tests.add('micropython/opt_level.py')      # don't assume line numbers are stored
+
+    # Some tests are known to fail on 64-bit machines
+    if pyb is None and platform.architecture()[0] == '64bit':
+        pass
+
+    # Some tests use unsupported features on Windows
+    if os.name == 'nt':
+        skip_tests.add('import/import_file.py') # works but CPython prints forward slashes
+
+    # Some tests are known to fail with native emitter
+    # Remove them from the below when they work
+    if args.emit == 'native':
+        skip_tests.update({'basics/%s.py' % t for t in 'gen_yield_from gen_yield_from_close gen_yield_from_ducktype gen_yield_from_exc gen_yield_from_executing gen_yield_from_iter gen_yield_from_send gen_yield_from_stopped gen_yield_from_throw gen_yield_from_throw2 gen_yield_from_throw3 generator1 generator2 generator_args generator_close generator_closure generator_exc generator_pend_throw generator_return generator_send'.split()}) # require yield
+        skip_tests.update({'basics/%s.py' % t for t in 'bytes_gen class_store_class globals_del string_join gen_stack_overflow'.split()}) # require yield
+        skip_tests.update({'basics/async_%s.py' % t for t in 'def await await2 for for2 with with2 coroutine'.split()}) # require yield
+        skip_tests.update({'basics/%s.py' % t for t in 'try_reraise try_reraise2'.split()}) # require raise_varargs
+        skip_tests.update({'basics/%s.py' % t for t in 'with_break with_continue with_return'.split()}) # require complete with support
+        skip_tests.add('basics/array_construct2.py') # requires generators
+        skip_tests.add('basics/bool1.py') # seems to randomly fail
+        skip_tests.add('basics/builtin_hash_gen.py') # requires yield
+        skip_tests.add('basics/class_bind_self.py') # requires yield
+        skip_tests.add('basics/del_deref.py') # requires checking for unbound local
+        skip_tests.add('basics/del_local.py') # requires checking for unbound local
+        skip_tests.add('basics/exception_chain.py') # raise from is not supported
+        skip_tests.add('basics/for_range.py') # requires yield_value
+        skip_tests.add('basics/try_finally_loops.py') # requires proper try finally code
+        skip_tests.add('basics/try_finally_return.py') # requires proper try finally code
+        skip_tests.add('basics/try_finally_return2.py') # requires proper try finally code
+        skip_tests.add('basics/unboundlocal.py') # requires checking for unbound local
+        skip_tests.add('import/gen_context.py') # requires yield_value
+        skip_tests.add('misc/features.py') # requires raise_varargs
+        skip_tests.add('misc/rge_sm.py') # requires yield
+        skip_tests.add('misc/print_exception.py') # because native doesn't have proper traceback info
+        skip_tests.add('misc/sys_exc_info.py') # sys.exc_info() is not supported for native
+        skip_tests.add('micropython/emg_exc.py') # because native doesn't have proper traceback info
+        skip_tests.add('micropython/heapalloc_traceback.py') # because native doesn't have proper traceback info
+        skip_tests.add('micropython/heapalloc_iter.py') # requires generators
+        skip_tests.add('micropython/schedule.py') # native code doesn't check pending events
+        skip_tests.add('stress/gc_trace.py') # requires yield
+        skip_tests.add('stress/recursive_gen.py') # requires yield
+        skip_tests.add('extmod/vfs_userfs.py') # because native doesn't properly handle globals across different modules
+        skip_tests.add('../extmod/ulab/tests/argminmax.py') # requires yield
+
+    def run_one_test(test_file):
+        test_file = test_file.replace('\\', '/')
+
+        if args.filters:
+            # Default verdict is the opposite of the first action
+            verdict = "include" if args.filters[0][0] == "exclude" else "exclude"
+            for action, pat in args.filters:
+                if pat.search(test_file):
+                    verdict = action
+            if verdict == "exclude":
+                return
+
+        test_basename = os.path.basename(test_file)
+        test_name = os.path.splitext(test_basename)[0]
+        is_native = test_name.startswith("native_") or test_name.startswith("viper_")
+        is_endian = test_name.endswith("_endian")
+        is_int_big = test_name.startswith("int_big") or test_name.endswith("_intbig")
+        is_set_type = test_name.startswith("set_") or test_name.startswith("frozenset")
+        is_async = test_name.startswith("async_")
+        is_const = test_name.startswith("const")
+
+        skip_it = test_file in skip_tests
+        skip_it |= skip_native and is_native
+        skip_it |= skip_endian and is_endian
+        skip_it |= skip_int_big and is_int_big
+        skip_it |= skip_set_type and is_set_type
+        skip_it |= skip_async and is_async
+        skip_it |= skip_const and is_const
+        skip_it |= skip_revops and test_name.startswith("class_reverse_op")
+
+        if args.list_tests:
+            if not skip_it:
+                print(test_file)
+            return
+
+        if skip_it:
+            print("skip ", test_file)
+            skipped_tests.append(test_name)
+            return
+
+        # get expected output
+        test_file_expected = test_file + '.exp'
+        if os.path.isfile(test_file_expected):
+            # expected output given by a file, so read that in
+            with open(test_file_expected, 'rb') as f:
+                output_expected = f.read()
+        else:
+            if not args.write_exp:
+                output_expected = b"NOEXP\n"
+            else:
+                # no .exp file yet: run MICROPYTHON itself and record its output as the expectation
+                e = {"PYTHONPATH": os.getcwd(),
+                     "PATH": os.environ["PATH"],
+                     "LANG": "en_US.UTF-8"}
+                p = subprocess.Popen([MICROPYTHON, test_file], env=e, stdout=subprocess.PIPE)
+                output_expected = b''
+                while p.poll() is None:
+                    output_expected += p.stdout.read()
+                output_expected += p.stdout.read()
+                with open(test_file_expected, 'wb') as f:
+                    f.write(output_expected)
+
+        # canonical form for all host platforms is to use \n for end-of-line
+        output_expected = output_expected.replace(b'\r\n', b'\n')
+
+        if args.write_exp:
+            return
+
+        # run MicroPython
+        output_mupy = run_micropython(pyb, args, test_file)
+
+        if output_mupy == b'SKIP\n':
+            print("skip ", test_file)
+            skipped_tests.append(test_name)
+            return
+
+        if output_expected == b'NOEXP\n':
+            print("noexp", test_file)
+            failed_tests.append(test_name)
+            return
+
+        testcase_count.add(len(output_expected.splitlines()))
+
+        filename_expected = test_basename + ".exp"
+        filename_mupy = test_basename + ".out"
+
+        if output_expected == output_mupy:
+            print("pass ", test_file)
+            passed_count.add(1)
+            rm_f(filename_expected)
+            rm_f(filename_mupy)
+        else:
+            with open(filename_expected, "wb") as f:
+                f.write(output_expected)
+            with open(filename_mupy, "wb") as f:
+                f.write(output_mupy)
+            print("### Expected")
+            print(output_expected)
+            print("### Actual")
+            print(output_mupy)
+            print("FAIL ", test_file)
+            failed_tests.append(test_name)
+
+        test_count.add(1)
+
+    if num_threads > 1:
+        pool = ThreadPool(num_threads)
+        pool.map(run_one_test, tests)
+    else:
+        for test in tests:
+            run_one_test(test)
+
+    if args.list_tests:  # run_one_test() has already printed the names; there is no summary to report
+        return True
+
+    print("{} tests performed ({} individual testcases)".format(test_count.value, testcase_count.value))
+    print("{} tests passed".format(passed_count.value))
+
+    if len(skipped_tests.value) > 0:
+        print("{} tests skipped: {}".format(len(skipped_tests.value), ' '.join(sorted(skipped_tests.value))))
+    if len(failed_tests.value) > 0:
+        print("{} tests failed: {}".format(len(failed_tests.value), ' '.join(sorted(failed_tests.value))))
+        return False
+
+    # all tests succeeded
+    return True
+
+
+class append_filter(argparse.Action):
+
+    def __init__(self, option_strings, dest, **kwargs):
+        super().__init__(option_strings, dest, default=[], **kwargs)
+
+    def __call__(self, parser, args, value, option):
+        if not hasattr(args, self.dest):
+            args.filters = []
+        if option.startswith(("-e", "--e")):
+            option = "exclude"
+        else:
+            option = "include"
+        args.filters.append((option, re.compile(value)))
+
+
+def main():
+    cmd_parser = argparse.ArgumentParser(
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description='Run and manage tests for MicroPython.',
+        epilog='''\
+Options -i and -e can be multiple and processed in the order given. Regex
+"search" (vs "match") operation is used. An action (include/exclude) of
+the last matching regex is used:
+  run-tests -i async - exclude all, then include tests containg "async" anywhere
+  run-tests -e '/big.+int' - include all, then exclude by regex
+  run-tests -e async -i async_foo - include all, exclude async, yet still include async_foo
+''')
+    cmd_parser.add_argument('--target', default='unix', help='the target platform')
+    cmd_parser.add_argument('--device', default='/dev/ttyACM0', help='the serial device or the IP address of the pyboard')
+    cmd_parser.add_argument('-b', '--baudrate', default=115200, help='the baud rate of the serial device')
+    cmd_parser.add_argument('-u', '--user', default='micro', help='the telnet login username')
+    cmd_parser.add_argument('-p', '--password', default='python', help='the telnet login password')
+    cmd_parser.add_argument('-d', '--test-dirs', nargs='*', help='input test directories (if no files given)')
+    cmd_parser.add_argument('-e', '--exclude', action=append_filter, metavar='REGEX', dest='filters', help='exclude test by regex on path/name.py')
+    cmd_parser.add_argument('-i', '--include', action=append_filter, metavar='REGEX', dest='filters', help='include test by regex on path/name.py')
+    cmd_parser.add_argument('--write-exp', action='store_true', help='save .exp files to run tests w/o CPython')
+    cmd_parser.add_argument('--list-tests', action='store_true', help='list tests instead of running them')
+    cmd_parser.add_argument('--emit', default='bytecode', help='MicroPython emitter to use (bytecode or native)')
+    cmd_parser.add_argument('--heapsize', help='heapsize to use (use default if not specified)')
+    cmd_parser.add_argument('--via-mpy', action='store_true', help='compile .py files to .mpy first')
+    cmd_parser.add_argument('--keep-path', action='store_true', help='do not clear MICROPYPATH when running tests')
+    cmd_parser.add_argument('-j', '--jobs', default=1, metavar='N', type=int, help='Number of tests to run simultaneously')
+    cmd_parser.add_argument('--auto-jobs', action='store_const', dest='jobs', const=multiprocessing.cpu_count(), help='Set the -j values to the CPU (thread) count')
+    cmd_parser.add_argument('files', nargs='*', help='input test files')
+    args = cmd_parser.parse_args()
+
+    EXTERNAL_TARGETS = ('pyboard', 'wipy', 'esp8266', 'esp32', 'minimal')
+    if args.target == 'unix' or args.list_tests:
+        pyb = None
+    elif args.target in EXTERNAL_TARGETS:
+        import pyboard
+        pyb = pyboard.Pyboard(args.device, args.baudrate, args.user, args.password)
+        pyb.enter_raw_repl()
+    else:
+        raise ValueError('target must be either %s or unix' % ", ".join(EXTERNAL_TARGETS))
+
+    if len(args.files) == 0:
+        if args.test_dirs is None:
+            if args.target == 'pyboard':
+                # run pyboard tests
+                test_dirs = ('basics', 'micropython', 'float', 'misc', 'stress', 'extmod', 'pyb', 'pybnative', 'inlineasm')
+            elif args.target in ('esp8266', 'esp32', 'minimal'):
+                test_dirs = ('basics', 'micropython', 'float', 'misc', 'extmod')
+            elif args.target == 'wipy':
+                # run WiPy tests
+                test_dirs = ('basics', 'micropython', 'misc', 'extmod', 'wipy')
+            else:
+                # run PC tests
+                test_dirs = (
+                    'basics', 'micropython', 'float', 'import', 'io', 'misc',
+                    'stress', 'unicode', 'extmod', '../extmod/ulab/tests', 'unix', 'cmdline',
+                )
+        else:
+            # run tests from these directories
+            test_dirs = args.test_dirs
+        tests = sorted(test_file for test_files in (glob('{}/*.py'.format(dir)) for dir in test_dirs) for test_file in test_files)
+    else:
+        # tests explicitly given
+        tests = args.files
+
+    if not args.keep_path:
+        # clear search path to make sure tests use only builtin modules
+        os.environ['MICROPYPATH'] = ''
+
+    # Even if we run completely different tests in a different directory,
+    # we need to access feature_check's from the same directory as the
+    # run-tests script itself.
+    base_path = os.path.dirname(sys.argv[0]) or "."
+    try:
+        res = run_tests(pyb, tests, args, base_path, args.jobs)
+    finally:
+        if pyb:
+            pyb.close()
+
+    if not res:
+        sys.exit(1)
+
+if __name__ == "__main__":
+    main()

From 390da2249f2b273a59c3127a64491627ebb46083 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Zolt=C3=A1n=20V=C3=B6r=C3=B6s?= <zvoros@gmail.com>
Date: Tue, 16 Feb 2021 19:41:13 +0100
Subject: [PATCH 04/19] all pre-processor constants can now be passed from
 command line

---
 code/ulab.h | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/code/ulab.h b/code/ulab.h
index d9beddc5..9eedd978 100644
--- a/code/ulab.h
+++ b/code/ulab.h
@@ -40,12 +40,16 @@
 
 // The maximum number of dimensions the firmware should be able to support
 // Possible values lie between 1, and 4, inclusive
-#define ULAB_MAX_DIMS                       2
+#ifndef ULAB_MAX_DIMS
+#define ULAB_MAX_DIMS                       (2)
+#endif
 
 // By setting this constant to 1, iteration over array dimensions will be implemented
 // as a function (ndarray_rewind_array), instead of writing out the loops in macros
 // This reduces firmware size at the expense of speed
+#ifndef ULAB_HAS_FUNCTION_ITERATOR
 #define ULAB_HAS_FUNCTION_ITERATOR          (0)
+#endif
 
 // If NDARRAY_IS_ITERABLE is 1, the ndarray object defines its own iterator function
 // This option saves approx. 250 bytes of flash space
@@ -63,8 +67,8 @@
 #ifndef ULAB_HAS_PRINTOPTIONS
 #define ULAB_HAS_PRINTOPTIONS               (1)
 #endif
-#define NDARRAY_PRINT_THRESHOLD             10
-#define NDARRAY_PRINT_EDGEITEMS             3
+#define NDARRAY_PRINT_THRESHOLD             (10)
+#define NDARRAY_PRINT_EDGEITEMS             (3)
 
 // determines, whether the dtype is an object, or simply a character
 // the object implementation is numpythonic, but requires more space

From a6e515bf144fe9e596874e8e3bee5c4f5b035780 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Zolt=C3=A1n=20V=C3=B6r=C3=B6s?= <zvoros@gmail.com>
Date: Tue, 16 Feb 2021 21:16:27 +0100
Subject: [PATCH 05/19] created mock-up image reader object in user module

---
 code/ndarray.c                   |  4 ++++
 code/ndarray.h                   |  4 ++--
 code/numpy/numerical/numerical.h | 21 +++++++++++++++++++++
 code/ulab.h                      |  4 ++--
 code/user/user.c                 | 22 ++++++++++++++++++++++
 5 files changed, 51 insertions(+), 4 deletions(-)

diff --git a/code/ndarray.c b/code/ndarray.c
index 8b2ee707..af6151a2 100644
--- a/code/ndarray.c
+++ b/code/ndarray.c
@@ -601,6 +601,10 @@ ndarray_obj_t *ndarray_new_ndarray(uint8_t ndim, size_t *shape, int32_t *strides
     ndarray_obj_t *ndarray = m_new_obj(ndarray_obj_t);
     ndarray->base.type = &ulab_ndarray_type;
     ndarray->dtype.type = dtype == NDARRAY_BOOL ? NDARRAY_UINT8 : dtype;
+    #if ULAB_DTYPE_IS_EXTENDABLE
+    ndarray->dtype.flags = 0;
+    ndarray->dtype.arrfunc = NULL;
+    #endif
     ndarray->boolean = dtype == NDARRAY_BOOL ? NDARRAY_BOOLEAN : NDARRAY_NUMERIC;
     ndarray->ndim = ndim;
     ndarray->len = ndim == 0 ? 0 : 1;
diff --git a/code/ndarray.h b/code/ndarray.h
index 7a66fdc7..8044774d 100644
--- a/code/ndarray.h
+++ b/code/ndarray.h
@@ -65,8 +65,8 @@ enum NDARRAY_TYPE {
 
 typedef struct _dtype_dtype {
     uint8_t type;
-    #if ULAB_DTYPE_HAS_FUNC_POINTER
-    uint8_t flags = 0;
+    #if ULAB_DTYPE_IS_EXTENDABLE
+    uint8_t flags;
     void *arrfunc;
     #endif
 } dtype_dtype;
diff --git a/code/numpy/numerical/numerical.h b/code/numpy/numerical/numerical.h
index c5c2652b..e29cf81c 100644
--- a/code/numpy/numerical/numerical.h
+++ b/code/numpy/numerical/numerical.h
@@ -46,6 +46,7 @@
     (rarray) += (results)->itemsize;\
 })
 
+#if !(ULAB_DTYPE_IS_EXTENDABLE)
 #define RUN_SUM1(type, ndarray, array, results, rarray, ss)\
 ({\
     type sum = 0;\
@@ -56,6 +57,26 @@
     memcpy((rarray), &sum, (results)->itemsize);\
     (rarray) += (results)->itemsize;\
 })
+#else
+#define RUN_SUM1(type, ndarray, array, results, rarray, ss)\
+({\
+    type sum = 0;\
+    if((ndarray)->dtype.flags == 0) {\
+        for(size_t i=0; i < (ss).shape[0]; i++) {\
+            sum += *((type *)(array));\
+            (array) += (ss).strides[0];\
+        }\
+    } else {\
+        type (*arrfunc)(ndarray_obj_t *, void *, int32_t , size_t ) = (ndarray)->dtype.arrfunc;\
+        for(size_t i=0; i < (ss).shape[0]; i++) {\
+            sum += arrfunc((ndarray), (array), (ss).strides[0], i);\
+            (array) += (ss).strides[0];\
+        }\
+    }\
+    memcpy((rarray), &sum, (results)->itemsize);\
+    (rarray) += (results)->itemsize;\
+})
+#endif
 
 #define RUN_SUM2(type, ndarray, array, results, rarray, ss) do {\
     size_t l = 0;\
diff --git a/code/ulab.h b/code/ulab.h
index 9eedd978..8a111371 100644
--- a/code/ulab.h
+++ b/code/ulab.h
@@ -79,7 +79,7 @@
 // This constant determines, whether a function pointer can be attached to the dtype object
 // Such function pointers are useful for custom data types
 #ifndef ULAB_DTYPE_IS_EXTENDABLE
-#define ULAB_DTYPE_IS_EXTENDABLE            (0)
+#define ULAB_DTYPE_IS_EXTENDABLE            (1)
 #endif
 
 // the ndarray binary operators
@@ -622,7 +622,7 @@
 // user-defined module; source of the module and
 // its sub-modules should be placed in code/user/
 #ifndef ULAB_HAS_USER_MODULE
-#define ULAB_HAS_USER_MODULE                (0)
+#define ULAB_HAS_USER_MODULE                (1)
 #endif
 
 #endif
diff --git a/code/user/user.c b/code/user/user.c
index 17829b43..ec4b21d3 100644
--- a/code/user/user.c
+++ b/code/user/user.c
@@ -80,9 +80,31 @@ static mp_obj_t user_square(mp_obj_t arg) {
 
 MP_DEFINE_CONST_FUN_OBJ_1(user_square_obj, user_square);
 
+static uint8_t user_imreader(ndarray_obj_t *ndarray, void *array, int32_t strides, size_t i) {
+    return (uint8_t)i*i;
+}
+
+static mp_obj_t user_imread(mp_obj_t shape) {
+
+    size_t len = mp_obj_get_int(shape);
+
+    ndarray_obj_t *ndarray = ndarray_new_linear_array(len, NDARRAY_UINT8);
+    uint8_t *array = (uint8_t *)ndarray->array;
+    for(size_t i=0; i < len; i++) {
+        array[i] = i;
+    }
+    ndarray->dtype.flags = 1;
+    ndarray->dtype.arrfunc = user_imreader;
+    return MP_OBJ_FROM_PTR(ndarray);
+}
+
+
+MP_DEFINE_CONST_FUN_OBJ_1(user_imread_obj, user_imread);
+
 static const mp_rom_map_elem_t ulab_user_globals_table[] = {
     { MP_OBJ_NEW_QSTR(MP_QSTR___name__), MP_OBJ_NEW_QSTR(MP_QSTR_user) },
     { MP_OBJ_NEW_QSTR(MP_QSTR_square), (mp_obj_t)&user_square_obj },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_imread), (mp_obj_t)&user_imread_obj },
 };
 
 static MP_DEFINE_CONST_DICT(mp_module_ulab_user_globals, ulab_user_globals_table);

From 42860ae7a554a09dba6cdcd623ebf5ab32813f03 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Zolt=C3=A1n=20V=C3=B6r=C3=B6s?= <zvoros@gmail.com>
Date: Thu, 18 Feb 2021 22:39:53 +0100
Subject: [PATCH 06/19] mock image can now be iterated over in sum/mean/std

---
 code/ndarray.c                   | 11 +++--
 code/ndarray.h                   |  4 ++
 code/numpy/numerical/numerical.c | 30 ++++++++-----
 code/numpy/numerical/numerical.h | 77 ++++++++++++++++++++------------
 code/ulab.h                      |  4 +-
 code/ulab_tools.c                | 16 +++++++
 code/ulab_tools.h                |  1 +
 code/user/user.c                 | 30 ++++++++++---
 8 files changed, 123 insertions(+), 50 deletions(-)

diff --git a/code/ndarray.c b/code/ndarray.c
index af6151a2..67642d8b 100644
--- a/code/ndarray.c
+++ b/code/ndarray.c
@@ -601,10 +601,6 @@ ndarray_obj_t *ndarray_new_ndarray(uint8_t ndim, size_t *shape, int32_t *strides
     ndarray_obj_t *ndarray = m_new_obj(ndarray_obj_t);
     ndarray->base.type = &ulab_ndarray_type;
     ndarray->dtype.type = dtype == NDARRAY_BOOL ? NDARRAY_UINT8 : dtype;
-    #if ULAB_DTYPE_IS_EXTENDABLE
-    ndarray->dtype.flags = 0;
-    ndarray->dtype.arrfunc = NULL;
-    #endif
     ndarray->boolean = dtype == NDARRAY_BOOL ? NDARRAY_BOOLEAN : NDARRAY_NUMERIC;
     ndarray->ndim = ndim;
     ndarray->len = ndim == 0 ? 0 : 1;
@@ -628,6 +624,13 @@ ndarray_obj_t *ndarray_new_ndarray(uint8_t ndim, size_t *shape, int32_t *strides
     // we could, perhaps, leave this step out, and initialise the array only, when needed
     memset(array, 0, len);
     ndarray->array = array;
+
+    #if ULAB_DTYPE_IS_EXTENDABLE
+    // indicate that the array doesn't need special treatment in the readout function
+    ndarray->dtype.flags = 0;
+    ndarray->dtype.origin = array;
+    #endif
+
     return ndarray;
 }
 
diff --git a/code/ndarray.h b/code/ndarray.h
index 8044774d..f636fa50 100644
--- a/code/ndarray.h
+++ b/code/ndarray.h
@@ -68,6 +68,10 @@ typedef struct _dtype_dtype {
     #if ULAB_DTYPE_IS_EXTENDABLE
     uint8_t flags;
     void *arrfunc;
+    uint8_t *subarray;
+    size_t shape[ULAB_MAX_DIMS];
+    void *origin;
+    uint8_t *name;
     #endif
 } dtype_dtype;
 
diff --git a/code/numpy/numerical/numerical.c b/code/numpy/numerical/numerical.c
index 7f0ef71f..12880c19 100644
--- a/code/numpy/numerical/numerical.c
+++ b/code/numpy/numerical/numerical.c
@@ -262,23 +262,33 @@ static mp_obj_t numerical_sum_mean_std_ndarray(ndarray_obj_t *ndarray, mp_obj_t
         ndarray_obj_t *results = NULL;
         uint8_t *rarray = NULL;
         mp_float_t *farray = NULL;
+
+        #if ULAB_DTYPE_IS_EXTENDABLE
+        size_t (*arrfunc)(ndarray_obj_t *, void *, int32_t *, size_t) = NULL;
+        if(ndarray->dtype.flags) {
+            arrfunc = ndarray->dtype.arrfunc;
+        }
+        #else
+        uint8_t arrfunc = 0;
+        #endif
+
         if(optype == NUMERICAL_SUM) {
             results = ndarray_new_dense_ndarray(_shape_strides.ndim, _shape_strides.shape, ndarray->dtype.type);
             rarray = (uint8_t *)results->array;
             // TODO: numpy promotes the output to the highest integer type
             if(ndarray->dtype.type == NDARRAY_UINT8) {
-                RUN_SUM(uint8_t, ndarray, array, results, rarray, _shape_strides);
+                RUN_SUM(uint8_t, ndarray, array, results, rarray, _shape_strides, arrfunc);
             } else if(ndarray->dtype.type == NDARRAY_INT8) {
-                RUN_SUM(int8_t, ndarray, array, results, rarray, _shape_strides);
+                RUN_SUM(int8_t, ndarray, array, results, rarray, _shape_strides, arrfunc);
             } else if(ndarray->dtype.type == NDARRAY_UINT16) {
-                RUN_SUM(uint16_t, ndarray, array, results, rarray, _shape_strides);
+                RUN_SUM(uint16_t, ndarray, array, results, rarray, _shape_strides, arrfunc);
             } else if(ndarray->dtype.type == NDARRAY_INT16) {
-                RUN_SUM(int16_t, ndarray, array, results, rarray, _shape_strides);
+                RUN_SUM(int16_t, ndarray, array, results, rarray, _shape_strides, arrfunc);
             } else {
                 // for floats, the sum might be inaccurate with the naive summation
                 // call mean, and multiply with the number of samples
                 farray = (mp_float_t *)results->array;
-                RUN_MEAN_STD(mp_float_t, ndarray, array, farray, _shape_strides, 0.0, 0);
+                RUN_MEAN_STD(mp_float_t, ndarray, array, farray, _shape_strides, 0.0, 0, arrfunc);
                 mp_float_t norm = (mp_float_t)_shape_strides.shape[0];
                 // re-wind the array here
                 farray = (mp_float_t *)results->array;
@@ -296,15 +306,15 @@ static mp_obj_t numerical_sum_mean_std_ndarray(ndarray_obj_t *ndarray, mp_obj_t
             }
             mp_float_t div = optype == NUMERICAL_STD ? (mp_float_t)(_shape_strides.shape[0] - ddof) : 0.0;
             if(ndarray->dtype.type == NDARRAY_UINT8) {
-                RUN_MEAN_STD(uint8_t, ndarray, array, farray, _shape_strides, div, isStd);
+                RUN_MEAN_STD(uint8_t, ndarray, array, farray, _shape_strides, div, isStd, arrfunc);
             } else if(ndarray->dtype.type == NDARRAY_INT8) {
-                RUN_MEAN_STD(int8_t, ndarray, array, farray, _shape_strides, div, isStd);
+                RUN_MEAN_STD(int8_t, ndarray, array, farray, _shape_strides, div, isStd, arrfunc);
             } else if(ndarray->dtype.type == NDARRAY_UINT16) {
-                RUN_MEAN_STD(uint16_t, ndarray, array, farray, _shape_strides, div, isStd);
+                RUN_MEAN_STD(uint16_t, ndarray, array, farray, _shape_strides, div, isStd, arrfunc);
             } else if(ndarray->dtype.type == NDARRAY_INT16) {
-                RUN_MEAN_STD(int16_t, ndarray, array, farray, _shape_strides, div, isStd);
+                RUN_MEAN_STD(int16_t, ndarray, array, farray, _shape_strides, div, isStd, arrfunc);
             } else {
-                RUN_MEAN_STD(mp_float_t, ndarray, array, farray, _shape_strides, div, isStd);
+                RUN_MEAN_STD(mp_float_t, ndarray, array, farray, _shape_strides, div, isStd, arrfunc);
             }
         }
         if(results->ndim == 0) { // return a scalar here
diff --git a/code/numpy/numerical/numerical.h b/code/numpy/numerical/numerical.h
index e29cf81c..58e05414 100644
--- a/code/numpy/numerical/numerical.h
+++ b/code/numpy/numerical/numerical.h
@@ -47,9 +47,10 @@
 })
 
 #if !(ULAB_DTYPE_IS_EXTENDABLE)
-#define RUN_SUM1(type, ndarray, array, results, rarray, ss)\
+#define RUN_SUM1(type, ndarray, array, results, rarray, ss, arrfunc)\
 ({\
     type sum = 0;\
+    (void)arrfunc;\
     for(size_t i=0; i < (ss).shape[0]; i++) {\
         sum += *((type *)(array));\
         (array) += (ss).strides[0];\
@@ -58,50 +59,48 @@
     (rarray) += (results)->itemsize;\
 })
 #else
-#define RUN_SUM1(type, ndarray, array, results, rarray, ss)\
+#define RUN_SUM1(type, ndarray, array, results, rarray, ss, arrfunc)\
 ({\
     type sum = 0;\
-    if((ndarray)->dtype.flags == 0) {\
-        for(size_t i=0; i < (ss).shape[0]; i++) {\
-            sum += *((type *)(array));\
-            (array) += (ss).strides[0];\
-        }\
-    } else {\
-        type (*arrfunc)(ndarray_obj_t *, void *, int32_t , size_t ) = (ndarray)->dtype.arrfunc;\
-        for(size_t i=0; i < (ss).shape[0]; i++) {\
-            sum += arrfunc((ndarray), (array), (ss).strides[0], i);\
-            (array) += (ss).strides[0];\
-        }\
+    int32_t increment = (ss).strides[0];\
+    (ndarray)->dtype.subarray = (array);\
+    if((ndarray)->dtype.flags) {\
+        (arrfunc)((ndarray), (array), &increment, (ss).shape[0]);\
     }\
+    for(size_t i=0; i < (ss).shape[0]; i++) {\
+        sum += *((type *)((ndarray)->dtype.subarray));\
+        (ndarray)->dtype.subarray += increment; /* step the read cursor; (array) is advanced once, below */\
+    }\
+    (array) += (ss).shape[0] * (ss).strides[0];\
     memcpy((rarray), &sum, (results)->itemsize);\
     (rarray) += (results)->itemsize;\
 })
 #endif
 
-#define RUN_SUM2(type, ndarray, array, results, rarray, ss) do {\
+#define RUN_SUM2(type, ndarray, array, results, rarray, ss, arrfunc) do {\
     size_t l = 0;\
     do {\
-        RUN_SUM1(type, (ndarray), (array), (results), (rarray), (ss));\
+        RUN_SUM1(type, (ndarray), (array), (results), (rarray), (ss), (arrfunc));\
         (array) -= (ss).strides[0] * (ss).shape[0];\
         (array) += (ss).strides[ULAB_MAX_DIMS - 1];\
         l++;\
     } while(l < (ss).shape[ULAB_MAX_DIMS - 1]);\
 } while(0)
 
-#define RUN_SUM3(type, ndarray, array, results, rarray, ss) do {\
+#define RUN_SUM3(type, ndarray, array, results, rarray, ss, arrfunc) do {\
     size_t k = 0;\
     do {\
-        RUN_SUM2(type, (ndarray), (array), (results), (rarray), (ss));\
+        RUN_SUM2(type, (ndarray), (array), (results), (rarray), (ss), (arrfunc));\
         (array) -= (ss).strides[ULAB_MAX_DIMS - 1] * (ss).shape[ULAB_MAX_DIMS - 1];\
         (array) += (ss).strides[ULAB_MAX_DIMS - 2];\
         k++;\
     } while(k < (ss).shape[ULAB_MAX_DIMS - 2]);\
 } while(0)
 
-#define RUN_SUM4(type, ndarray, array, results, rarray, ss) do {\
+#define RUN_SUM4(type, ndarray, array, results, rarray, ss, arrfunc) do {\
     size_t j = 0;\
     do {\
-        RUN_SUM3(type, (ndarray), (array), (results), (rarray), (ss));\
+        RUN_SUM3(type, (ndarray), (array), (results), (rarray), (ss), (arrfunc));\
         (array) -= (ss).strides[ULAB_MAX_DIMS - 2] * (ss).shape[ULAB_MAX_DIMS - 2];\
         (array) += (ss).strides[ULAB_MAX_DIMS - 3];\
         j++;\
@@ -111,12 +110,12 @@
 // Instead of the straightforward implementation of the definition,
 // we take the numerically stable Welford algorithm here
 // https://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/
-
-#define RUN_MEAN_STD1(type, ndarray, array, rarray, ss, div, isStd)\
+#if !(ULAB_DTYPE_IS_EXTENDABLE)
+#define RUN_MEAN_STD1(type, ndarray, array, rarray, ss, div, isStd, arrfunc)\
 ({\
     mp_float_t M = 0.0, m = 0.0, S = 0.0;\
     for(size_t i=0; i < (ss).shape[0]; i++) {\
-        mp_float_t value = (mp_float_t)(*(type *)(array));\
+        mp_float_t value = (mp_float_t)(*(type *)((array)));\
         m = M + (value - M) / (mp_float_t)(i+1);\
         if(isStd) {\
             S += (value - M) * (value - m);\
@@ -126,31 +125,53 @@
     }\
     *(rarray)++ = isStd ? MICROPY_FLOAT_C_FUN(sqrt)(S / (div)) : M;\
 })
+#else
+#define RUN_MEAN_STD1(type, ndarray, array, rarray, ss, div, isStd, arrfunc)\
+({\
+    mp_float_t M = 0.0, m = 0.0, S = 0.0;\
+    int32_t increment = (ss).strides[0];\
+    (ndarray)->dtype.subarray = (array);\
+    if((ndarray)->dtype.flags) {\
+        (arrfunc)((ndarray), (array), &increment, (ss).shape[0]);\
+    }\
+    for(size_t i=0; i < (ss).shape[0]; i++) {\
+        mp_float_t value = (mp_float_t)(*(type *)((ndarray)->dtype.subarray));\
+        m = M + (value - M) / (mp_float_t)(i+1);\
+        if(isStd) {\
+            S += (value - M) * (value - m);\
+        }\
+        M = m;\
+        (ndarray)->dtype.subarray += increment;\
+    }\
+    (array) += (ss).shape[0] * (ss).strides[0];\
+    *(rarray)++ = isStd ? MICROPY_FLOAT_C_FUN(sqrt)(S / (div)) : M;\
+})
+#endif
 
-#define RUN_MEAN_STD2(type, ndarray, array, rarray, ss, div, isStd) do {\
+#define RUN_MEAN_STD2(type, ndarray, array, rarray, ss, div, isStd, arrfunc) do {\
     size_t l = 0;\
     do {\
-        RUN_MEAN_STD1(type, (ndarray), (array), (rarray), (ss), (div), (isStd));\
+        RUN_MEAN_STD1(type, (ndarray), (array), (rarray), (ss), (div), (isStd), (arrfunc));\
         (array) -= (ss).strides[0] * (ss).shape[0];\
         (array) += (ss).strides[ULAB_MAX_DIMS - 1];\
         l++;\
     } while(l < (ss).shape[ULAB_MAX_DIMS - 1]);\
 } while(0)
 
-#define RUN_MEAN_STD3(type, ndarray, array, rarray, ss, div, isStd) do {\
+#define RUN_MEAN_STD3(type, ndarray, array, rarray, ss, div, isStd, arrfunc) do {\
     size_t k = 0;\
     do {\
-        RUN_MEAN_STD2(type, (ndarray), (array), (rarray), (ss), (div), (isStd));\
+        RUN_MEAN_STD2(type, (ndarray), (array), (rarray), (ss), (div), (isStd), (arrfunc));\
         (array) -= (ss).strides[ULAB_MAX_DIMS - 1] * (ss).shape[ULAB_MAX_DIMS - 1];\
         (array) += (ss).strides[ULAB_MAX_DIMS - 2];\
         k++;\
     } while(k < (ss).shape[ULAB_MAX_DIMS - 2]);\
 } while(0)
 
-#define RUN_MEAN_STD4(type, ndarray, array, rarray, ss, div, isStd) do {\
+#define RUN_MEAN_STD4(type, ndarray, array, rarray, ss, div, isStd, arrfunc) do {\
     size_t j = 0;\
     do {\
-        RUN_MEAN_STD3(type, (ndarray), (array), (rarray), (ss), (div), (isStd));\
+        RUN_MEAN_STD3(type, (ndarray), (array), (rarray), (ss), (div), (isStd), (arrfunc));\
         (array) -= (ss).strides[ULAB_MAX_DIMS - 2] * (ss).shape[ULAB_MAX_DIMS - 2];\
         (array) += (ss).strides[ULAB_MAX_DIMS - 3];\
         j++;\
diff --git a/code/ulab.h b/code/ulab.h
index 8a111371..9eedd978 100644
--- a/code/ulab.h
+++ b/code/ulab.h
@@ -79,7 +79,7 @@
 // This constant determines, whether a function pointer can be attached to the dtype object
 // Such function pointers are useful for custom data types
 #ifndef ULAB_DTYPE_IS_EXTENDABLE
-#define ULAB_DTYPE_IS_EXTENDABLE            (1)
+#define ULAB_DTYPE_IS_EXTENDABLE            (0)
 #endif
 
 // the ndarray binary operators
@@ -622,7 +622,7 @@
 // user-defined module; source of the module and
 // its sub-modules should be placed in code/user/
 #ifndef ULAB_HAS_USER_MODULE
-#define ULAB_HAS_USER_MODULE                (1)
+#define ULAB_HAS_USER_MODULE                (0)
 #endif
 
 #endif
diff --git a/code/ulab_tools.c b/code/ulab_tools.c
index 9663d3d5..02b919a2 100644
--- a/code/ulab_tools.c
+++ b/code/ulab_tools.c
@@ -10,6 +10,7 @@
 
 
 #include <string.h>
+#include <math.h>
 #include "py/runtime.h"
 
 #include "ulab.h"
@@ -228,3 +229,18 @@ ndarray_obj_t *tools_object_is_square(mp_obj_t obj) {
     return ndarray;
 }
 #endif
+
+#if ULAB_DTYPE_IS_EXTENDABLE
+size_t *tools_coords_from_pointer(void *p1, ndarray_obj_t *ndarray) {
+    // calculates the coordinates in the original tensor from the position of the pointer
+    // The original view is assumed to be dense, i.e., the strides can be computed from the shape
+    size_t diff = (size_t)((uint8_t *)p1 - (uint8_t *)ndarray->dtype.origin) / ndarray->itemsize; // offset in items, not bytes
+    size_t *coords = m_new(size_t, ULAB_MAX_DIMS); // newly allocated, returned to the caller
+    memset(coords, 0, sizeof(size_t) * ULAB_MAX_DIMS); // axes beyond ndim get the coordinate 0
+    for(uint8_t i = 1; i < ndarray->ndim + 1; i++) {
+        coords[ULAB_MAX_DIMS - i] = diff % ndarray->dtype.shape[ULAB_MAX_DIMS - i]; // position along this axis
+        diff /= ndarray->dtype.shape[ULAB_MAX_DIMS - i]; // what remains belongs to the preceding axes
+    }
+    return coords;
+}
+#endif
diff --git a/code/ulab_tools.h b/code/ulab_tools.h
index 378e4f0c..3804a462 100644
--- a/code/ulab_tools.h
+++ b/code/ulab_tools.h
@@ -34,4 +34,5 @@ void *ndarray_set_float_function(uint8_t );
 
 shape_strides tools_reduce_axes(ndarray_obj_t *, mp_obj_t );
 ndarray_obj_t *tools_object_is_square(mp_obj_t );
+size_t *tools_coords_from_pointer(void *, ndarray_obj_t *);
 #endif
diff --git a/code/user/user.c b/code/user/user.c
index ec4b21d3..2c795330 100644
--- a/code/user/user.c
+++ b/code/user/user.c
@@ -15,6 +15,8 @@
 #include "py/obj.h"
 #include "py/runtime.h"
 #include "py/misc.h"
+
+#include "../ulab_tools.h"
 #include "user.h"
 
 #if ULAB_HAS_USER_MODULE
@@ -80,21 +82,37 @@ static mp_obj_t user_square(mp_obj_t arg) {
 
 MP_DEFINE_CONST_FUN_OBJ_1(user_square_obj, user_square);
 
-static uint8_t user_imreader(ndarray_obj_t *ndarray, void *array, int32_t strides, size_t i) {
-    return (uint8_t)i*i;
+static size_t user_imreader(ndarray_obj_t *ndarray, void *array, int32_t *strides, size_t count) {
+    uint16_t *subarray = (uint16_t *)ndarray->dtype.subarray;
+    // if necessary, get the coordinates in the original reference frame, i.e.,
+    // in the coordinates used at the time of the creation of the object
+    // size_t *coords = tools_coords_from_pointer(array, ndarray);
+    for(size_t i = 0; i < count; i += *strides/ndarray->itemsize) {
+        // fill up the array with dummy data
+        *subarray++ = i*i;
+    }
+    // since strides is going to be used in computation loops, and subarray is
+    // meant to be a dense array, simply overwrite strides with the itemsize
+    *strides = ndarray->itemsize;
+    // uint8_t *_array = (uint8_t *)subarray;
+    return 0; //ndarray->itemsize;
 }
 
 static mp_obj_t user_imread(mp_obj_t shape) {
 
     size_t len = mp_obj_get_int(shape);
 
-    ndarray_obj_t *ndarray = ndarray_new_linear_array(len, NDARRAY_UINT8);
-    uint8_t *array = (uint8_t *)ndarray->array;
-    for(size_t i=0; i < len; i++) {
-        array[i] = i;
+    ndarray_obj_t *ndarray = ndarray_new_linear_array(len, NDARRAY_UINT16);
+    uint16_t *array = (uint16_t *)ndarray->array;
+    for(size_t i = 0; i < len; i++) {
+        *array = (uint16_t)i;
+        array += 1;
     }
     ndarray->dtype.flags = 1;
     ndarray->dtype.arrfunc = user_imreader;
+    uint8_t *subarray = m_new(uint8_t, ndarray->itemsize * ndarray->shape[ULAB_MAX_DIMS - 1]);
+    ndarray->dtype.subarray = subarray;
+    memcpy(&(ndarray->dtype.shape), &(ndarray->shape), sizeof(size_t) *ULAB_MAX_DIMS);
     return MP_OBJ_FROM_PTR(ndarray);
 }
 

From 637e932ef69d1df9271431f0d542c7179b4c581e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Zolt=C3=A1n=20V=C3=B6r=C3=B6s?= <zvoros@gmail.com>
Date: Thu, 18 Feb 2021 22:54:36 +0100
Subject: [PATCH 07/19] changed return value of pointer function

---
 code/numpy/numerical/numerical.c | 2 +-
 code/user/user.c                 | 4 +---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/code/numpy/numerical/numerical.c b/code/numpy/numerical/numerical.c
index 12880c19..3a1f9047 100644
--- a/code/numpy/numerical/numerical.c
+++ b/code/numpy/numerical/numerical.c
@@ -264,7 +264,7 @@ static mp_obj_t numerical_sum_mean_std_ndarray(ndarray_obj_t *ndarray, mp_obj_t
         mp_float_t *farray = NULL;
 
         #if ULAB_DTYPE_IS_EXTENDABLE
-        size_t (*arrfunc)(ndarray_obj_t *, void *, int32_t *, size_t) = NULL;
+        void (*arrfunc)(ndarray_obj_t *, void *, int32_t *, size_t) = NULL;
         if(ndarray->dtype.flags) {
             arrfunc = ndarray->dtype.arrfunc;
         }
diff --git a/code/user/user.c b/code/user/user.c
index 2c795330..6e12e44f 100644
--- a/code/user/user.c
+++ b/code/user/user.c
@@ -82,7 +82,7 @@ static mp_obj_t user_square(mp_obj_t arg) {
 
 MP_DEFINE_CONST_FUN_OBJ_1(user_square_obj, user_square);
 
-static size_t user_imreader(ndarray_obj_t *ndarray, void *array, int32_t *strides, size_t count) {
+static void user_imreader(ndarray_obj_t *ndarray, void *array, int32_t *strides, size_t count) {
     uint16_t *subarray = (uint16_t *)ndarray->dtype.subarray;
     // if necessary, get the coordinates in the original reference frame, i.e.,
     // in the coordinates used at the time of the creation of the object
@@ -94,8 +94,6 @@ static size_t user_imreader(ndarray_obj_t *ndarray, void *array, int32_t *stride
     // since strides is going to be used in computation loops, and subarray is
     // meant to be a dense array, simply overwrite strides with the itemsize
     *strides = ndarray->itemsize;
-    // uint8_t *_array = (uint8_t *)subarray;
-    return 0; //ndarray->itemsize;
 }
 
 static mp_obj_t user_imread(mp_obj_t shape) {

From 5b14d7c6ee675263454bfbe3cbf3ea77209d1b10 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Zolt=C3=A1n=20V=C3=B6r=C3=B6s?= <zvoros@gmail.com>
Date: Sat, 20 Feb 2021 07:50:06 +0100
Subject: [PATCH 08/19] created blocks module, moved some code from user there

---
 code/blocks/blocks.c             | 73 ++++++++++++++++++++++++++++++++
 code/blocks/blocks.h             | 20 +++++++++
 code/micropython.mk              |  2 +-
 code/ndarray.c                   |  2 +-
 code/ndarray.h                   |  2 +-
 code/numpy/numerical/numerical.h |  2 +-
 code/ulab.c                      |  6 ++-
 code/ulab.h                      | 29 +++++++------
 code/user/user.c                 | 36 ----------------
 9 files changed, 118 insertions(+), 54 deletions(-)
 create mode 100644 code/blocks/blocks.c
 create mode 100644 code/blocks/blocks.h

diff --git a/code/blocks/blocks.c b/code/blocks/blocks.c
new file mode 100644
index 00000000..7076f0c1
--- /dev/null
+++ b/code/blocks/blocks.c
@@ -0,0 +1,73 @@
+
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2021 Zoltán Vörös
+*/
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include "py/obj.h"
+#include "py/runtime.h"
+#include "py/misc.h"
+
+#include "../ulab_tools.h"
+#include "blocks.h"
+
+#if ULAB_HAS_BLOCKS
+
+static void blocks_imreader(ndarray_obj_t *ndarray, void *array, int32_t *strides, size_t count) {
+    uint16_t *subarray = (uint16_t *)ndarray->dtype.subarray;
+    // if necessary, get the coordinates in the original reference frame, i.e.,
+    // in the coordinates used at the time of the creation of the object
+    // size_t *coords = tools_coords_from_pointer(array, ndarray);
+    for(size_t i = 0; i < count; i += *strides/ndarray->itemsize) {
+        // fill up the array with dummy data
+        *subarray++ = i*i;
+    }
+    // since strides is going to be used in computation loops, and subarray is
+    // meant to be a dense array, simply overwrite strides with the itemsize
+    *strides = ndarray->itemsize;
+}
+
+ndarray_obj_t *ndarray_ndarray_header(size_t len, void *arrfunc, uint8_t dtype) {
+    ndarray_obj_t *ndarray = ndarray_new_linear_array(len, dtype);
+    ndarray->dtype.flags = 1;
+    ndarray->dtype.arrfunc = arrfunc;
+    uint8_t *subarray = m_new(uint8_t, ndarray->itemsize * ndarray->shape[ULAB_MAX_DIMS - 1]);
+    ndarray->dtype.subarray = subarray;
+    memcpy(&(ndarray->dtype.shape), &(ndarray->shape), sizeof(size_t) *ULAB_MAX_DIMS);
+    return ndarray;
+}
+
+static mp_obj_t blocks_imread(mp_obj_t shape, mp_obj_t dtype) {
+
+    size_t len = mp_obj_get_int(shape);
+    uint8_t _dtype = MP_OBJ_SMALL_INT_VALUE(dtype);
+
+    void (*imreader)(ndarray_obj_t *, void *, int32_t *, size_t) = blocks_imreader;
+
+    ndarray_obj_t *ndarray = ndarray_ndarray_header(len, imreader, _dtype);
+    return MP_OBJ_FROM_PTR(ndarray);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_2(blocks_imread_obj, blocks_imread);
+
+static const mp_rom_map_elem_t ulab_blocks_globals_table[] = {
+    { MP_OBJ_NEW_QSTR(MP_QSTR___name__), MP_OBJ_NEW_QSTR(MP_QSTR_blocks) },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_imread), (mp_obj_t)&blocks_imread_obj },
+};
+
+static MP_DEFINE_CONST_DICT(mp_module_ulab_blocks_globals, ulab_blocks_globals_table);
+
+mp_obj_module_t ulab_blocks_module = {
+    .base = { &mp_type_module },
+    .globals = (mp_obj_dict_t*)&mp_module_ulab_blocks_globals,
+};
+
+#endif
diff --git a/code/blocks/blocks.h b/code/blocks/blocks.h
new file mode 100644
index 00000000..08bfbba4
--- /dev/null
+++ b/code/blocks/blocks.h
@@ -0,0 +1,20 @@
+
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2020-2021 Zoltán Vörös
+*/
+
+#ifndef _BLOCKS_
+#define _BLOCKS_
+
+#include "ulab.h"
+#include "ndarray.h"
+
+extern mp_obj_module_t ulab_blocks_module;
+
+#endif
diff --git a/code/micropython.mk b/code/micropython.mk
index f376ae49..e7ed7bdc 100644
--- a/code/micropython.mk
+++ b/code/micropython.mk
@@ -1,7 +1,6 @@
 
 USERMODULES_DIR := $(USERMOD_DIR)
 
-# Add all C files to SRC_USERMOD.
 SRC_USERMOD += $(USERMODULES_DIR)/scipy/optimize/optimize.c
 SRC_USERMOD += $(USERMODULES_DIR)/scipy/signal/signal.c
 SRC_USERMOD += $(USERMODULES_DIR)/scipy/special/special.c
@@ -22,6 +21,7 @@ SRC_USERMOD += $(USERMODULES_DIR)/numpy/stats/stats.c
 SRC_USERMOD += $(USERMODULES_DIR)/numpy/transform/transform.c
 SRC_USERMOD += $(USERMODULES_DIR)/numpy/vector/vector.c
 SRC_USERMOD += $(USERMODULES_DIR)/user/user.c
+SRC_USERMOD += $(USERMODULES_DIR)/blocks/blocks.c
 
 SRC_USERMOD += $(USERMODULES_DIR)/numpy/numpy.c
 SRC_USERMOD += $(USERMODULES_DIR)/scipy/scipy.c
diff --git a/code/ndarray.c b/code/ndarray.c
index 67642d8b..257069ee 100644
--- a/code/ndarray.c
+++ b/code/ndarray.c
@@ -625,7 +625,7 @@ ndarray_obj_t *ndarray_new_ndarray(uint8_t ndim, size_t *shape, int32_t *strides
     memset(array, 0, len);
     ndarray->array = array;
 
-    #if ULAB_DTYPE_IS_EXTENDABLE
+    #if ULAB_HAS_BLOCKS
     // indicate that the array doesn't need special treatment in the readout function
     ndarray->dtype.flags = 0;
     ndarray->dtype.origin = array;
diff --git a/code/ndarray.h b/code/ndarray.h
index f636fa50..b222a9d9 100644
--- a/code/ndarray.h
+++ b/code/ndarray.h
@@ -65,7 +65,7 @@ enum NDARRAY_TYPE {
 
 typedef struct _dtype_dtype {
     uint8_t type;
-    #if ULAB_DTYPE_IS_EXTENDABLE
+    #if ULAB_HAS_BLOCKS
     uint8_t flags;
     void *arrfunc;
     uint8_t *subarray;
diff --git a/code/numpy/numerical/numerical.h b/code/numpy/numerical/numerical.h
index 58e05414..1507dfb3 100644
--- a/code/numpy/numerical/numerical.h
+++ b/code/numpy/numerical/numerical.h
@@ -69,7 +69,7 @@
     }\
     for(size_t i=0; i < (ss).shape[0]; i++) {\
         sum += *((type *)((ndarray)->dtype.subarray));\
-        (array) += increment;\
+        (ndarray)->dtype.subarray += increment;\
     }\
     (array) += (ss).shape[0] * (ss).strides[0];\
     memcpy((rarray), &sum, (results)->itemsize);\
diff --git a/code/ulab.c b/code/ulab.c
index 1be865db..858aa071 100644
--- a/code/ulab.c
+++ b/code/ulab.c
@@ -26,6 +26,7 @@
 
 #include "numpy/numpy.h"
 #include "scipy/scipy.h"
+#include "blocks/blocks.h"
 #include "numpy/fft/fft.h"
 #include "numpy/linalg/linalg.h"
 // TODO: we should get rid of this; array.sort depends on it
@@ -127,10 +128,13 @@ STATIC const mp_map_elem_t ulab_globals_table[] = {
         { MP_OBJ_NEW_QSTR(MP_QSTR_dtype), (mp_obj_t)&ndarray_dtype_obj },
         #endif /* NDARRAY_HAS_DTYPE */
     #endif /* ULAB_HAS_DTYPE_OBJECT */
-        { MP_ROM_QSTR(MP_QSTR_numpy), MP_ROM_PTR(&ulab_numpy_module) },
+    { MP_ROM_QSTR(MP_QSTR_numpy), MP_ROM_PTR(&ulab_numpy_module) },
     #if ULAB_HAS_SCIPY
         { MP_ROM_QSTR(MP_QSTR_scipy), MP_ROM_PTR(&ulab_scipy_module) },
     #endif
+    #if ULAB_HAS_BLOCKS
+        { MP_ROM_QSTR(MP_QSTR_blocks), MP_ROM_PTR(&ulab_blocks_module) },
+    #endif
     #if ULAB_HAS_USER_MODULE
         { MP_ROM_QSTR(MP_QSTR_user), MP_ROM_PTR(&ulab_user_module) },
     #endif
diff --git a/code/ulab.h b/code/ulab.h
index 9eedd978..cf2c05e5 100644
--- a/code/ulab.h
+++ b/code/ulab.h
@@ -38,6 +38,13 @@
 #define ULAB_HAS_SCIPY                      (1)
 #endif
 
+// Determines, whether the blocks module is defined in ulab. When ULAB_HAS_BLOCKS is
+// defined 1, a function pointer can be attached to the dtype object
+// Such function pointers are useful for custom data types
+#ifndef ULAB_HAS_BLOCKS
+#define ULAB_HAS_BLOCKS                      (1)
+#endif
+
 // The maximum number of dimensions the firmware should be able to support
 // Possible values lie between 1, and 4, inclusive
 #ifndef ULAB_MAX_DIMS
@@ -52,7 +59,7 @@
 #endif
 
 // If NDARRAY_IS_ITERABLE is 1, the ndarray object defines its own iterator function
-// This option saves approx. 250 bytes of flash space
+// Setting this option to 0 saves approx. 250 bytes of flash space
 #ifndef NDARRAY_IS_ITERABLE
 #define NDARRAY_IS_ITERABLE                 (1)
 #endif
@@ -63,12 +70,12 @@
 #endif
 
 // The default threshold for pretty printing. These variables can be overwritten
-// at run-time via the set_printoptions() function
+// at run-time via the set_printoptions() function, if ULAB_HAS_PRINTOPTIONS is 1
+#define NDARRAY_PRINT_THRESHOLD             (10)
+#define NDARRAY_PRINT_EDGEITEMS             (3)
 #ifndef ULAB_HAS_PRINTOPTIONS
 #define ULAB_HAS_PRINTOPTIONS               (1)
 #endif
-#define NDARRAY_PRINT_THRESHOLD             (10)
-#define NDARRAY_PRINT_EDGEITEMS             (3)
 
 // determines, whether the dtype is an object, or simply a character
 // the object implementation is numpythonic, but requires more space
@@ -76,19 +83,15 @@
 #define ULAB_HAS_DTYPE_OBJECT               (0)
 #endif
 
-// This constant determines, whether a function pointer can be attached to the dtype object
-// Such function pointers are useful for custom data types
-#ifndef ULAB_DTYPE_IS_EXTENDABLE
-#define ULAB_DTYPE_IS_EXTENDABLE            (0)
-#endif
-
-// the ndarray binary operators
+// The ndarray binary operators
+// If, e.g., only FFTs are required, massive savings of
+// flash space can be had by disabling the binary operators
 #ifndef NDARRAY_HAS_BINARY_OPS
 #define NDARRAY_HAS_BINARY_OPS              (1)
 #endif
 
 // Firmware size can be reduced at the expense of speed by using function
-// pointers in iterations. For each operator, he function pointer saves around
+// pointers in iterations. For each operator, the function pointer saves around
 // 2 kB in the two-dimensional case, and around 4 kB in the four-dimensional case.
 
 #ifndef NDARRAY_BINARY_USES_FUN_POINTER
@@ -189,7 +192,7 @@
 #endif
 
 
-// determines, which ndarray methods are available
+// Constants in the following section determine, which ndarray methods are available
 #ifndef NDARRAY_HAS_BYTESWAP
 #define NDARRAY_HAS_BYTESWAP            (1)
 #endif
diff --git a/code/user/user.c b/code/user/user.c
index 6e12e44f..6b72d4b4 100644
--- a/code/user/user.c
+++ b/code/user/user.c
@@ -82,45 +82,9 @@ static mp_obj_t user_square(mp_obj_t arg) {
 
 MP_DEFINE_CONST_FUN_OBJ_1(user_square_obj, user_square);
 
-static void user_imreader(ndarray_obj_t *ndarray, void *array, int32_t *strides, size_t count) {
-    uint16_t *subarray = (uint16_t *)ndarray->dtype.subarray;
-    // if necessary, get the coordinates in the original reference frame, i.e.,
-    // in the coordinates used at the time of the creation of the object
-    // size_t *coords = tools_coords_from_pointer(array, ndarray);
-    for(size_t i = 0; i < count; i += *strides/ndarray->itemsize) {
-        // fill up the array with dummy data
-        *subarray++ = i*i;
-    }
-    // since strides is going to be used in computation loops, and subarray is
-    // meant to be a dense array, simply overwrite strides with the itemsize
-    *strides = ndarray->itemsize;
-}
-
-static mp_obj_t user_imread(mp_obj_t shape) {
-
-    size_t len = mp_obj_get_int(shape);
-
-    ndarray_obj_t *ndarray = ndarray_new_linear_array(len, NDARRAY_UINT16);
-    uint16_t *array = (uint16_t *)ndarray->array;
-    for(size_t i = 0; i < len; i++) {
-        *array = (uint16_t)i;
-        array += 1;
-    }
-    ndarray->dtype.flags = 1;
-    ndarray->dtype.arrfunc = user_imreader;
-    uint8_t *subarray = m_new(uint8_t, ndarray->itemsize * ndarray->shape[ULAB_MAX_DIMS - 1]);
-    ndarray->dtype.subarray = subarray;
-    memcpy(&(ndarray->dtype.shape), &(ndarray->shape), sizeof(size_t) *ULAB_MAX_DIMS);
-    return MP_OBJ_FROM_PTR(ndarray);
-}
-
-
-MP_DEFINE_CONST_FUN_OBJ_1(user_imread_obj, user_imread);
-
 static const mp_rom_map_elem_t ulab_user_globals_table[] = {
     { MP_OBJ_NEW_QSTR(MP_QSTR___name__), MP_OBJ_NEW_QSTR(MP_QSTR_user) },
     { MP_OBJ_NEW_QSTR(MP_QSTR_square), (mp_obj_t)&user_square_obj },
-    { MP_OBJ_NEW_QSTR(MP_QSTR_imread), (mp_obj_t)&user_imread_obj },
 };
 
 static MP_DEFINE_CONST_DICT(mp_module_ulab_user_globals, ulab_user_globals_table);

From 8af7a93741fa8d3db69e5323929fd3ac50173923 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Zolt=C3=A1n=20V=C3=B6r=C3=B6s?= <zvoros@gmail.com>
Date: Sun, 21 Feb 2021 18:18:13 +0100
Subject: [PATCH 09/19] copy a couple of bug fixes from the legacy and master
 branches

---
 code/ndarray.c     | 11 +++++------
 code/ulab_create.c | 11 +++++++++++
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/code/ndarray.c b/code/ndarray.c
index 257069ee..f955ffab 100644
--- a/code/ndarray.c
+++ b/code/ndarray.c
@@ -588,13 +588,12 @@ void ndarray_assign_elements(ndarray_obj_t *ndarray, mp_obj_t iterable, uint8_t
 bool ndarray_is_dense(ndarray_obj_t *ndarray) {
     // returns true, if the array is dense, false otherwise
     // the array should be dense, if the very first stride can be calculated from shape
-    // TODO: this function could probably be removed
     int32_t stride = ndarray->itemsize;
-    for(uint8_t i=ULAB_MAX_DIMS; i > ULAB_MAX_DIMS-ndarray->ndim; i--) {
-        stride *= ndarray->shape[i];
+    for(uint8_t i = ULAB_MAX_DIMS - 1; i > ULAB_MAX_DIMS-ndarray->ndim; i--) {
+         stride *= ndarray->shape[i];
     }
-    return stride == ndarray->strides[ULAB_MAX_DIMS-ndarray->ndim-1] ? true : false;
-}
+    return stride == ndarray->strides[ULAB_MAX_DIMS-ndarray->ndim] ? true : false;
+ }
 
 ndarray_obj_t *ndarray_new_ndarray(uint8_t ndim, size_t *shape, int32_t *strides, uint8_t dtype) {
     // Creates the base ndarray with shape, and initialises the values to straight 0s
@@ -1850,7 +1849,7 @@ mp_obj_t ndarray_unary_op(mp_unary_op_t op, mp_obj_t self_in) {
         #if NDARRAY_HAS_UNARY_OP_ABS
         case MP_UNARY_OP_ABS:
             ndarray = ndarray_copy_view(self);
-            // if Booleam, NDARRAY_UINT8, or NDARRAY_UINT16, there is nothing to do
+            // if Boolean, NDARRAY_UINT8, or NDARRAY_UINT16, there is nothing to do
             if(self->dtype.type == NDARRAY_INT8) {
                 int8_t *array = (int8_t *)ndarray->array;
                 for(size_t i=0; i < self->len; i++, array++) {
diff --git a/code/ulab_create.c b/code/ulab_create.c
index a6e18eb5..71161024 100644
--- a/code/ulab_create.c
+++ b/code/ulab_create.c
@@ -46,6 +46,14 @@ static mp_obj_t create_zeros_ones_full(mp_obj_t oshape, uint8_t dtype, mp_obj_t
         ndarray = ndarray_new_dense_ndarray(len, shape, dtype);
     }
     if(value != mp_const_none) {
+        if(dtype == NDARRAY_BOOL) {
+            dtype = NDARRAY_UINT8;
+            if(mp_obj_is_true(value)) {
+                value = mp_obj_new_int(1);
+            } else {
+                value = mp_obj_new_int(0);
+            }
+        }
         for(size_t i=0; i < ndarray->len; i++) {
             mp_binary_set_val_array(dtype, ndarray->array, i, value);
         }
@@ -373,6 +381,9 @@ mp_obj_t create_eye(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args)
         m = mp_obj_get_int(args[1].u_rom_obj);
     }
     ndarray_obj_t *ndarray = ndarray_new_dense_ndarray(2, ndarray_shape_vector(0, 0, n, m), dtype);
+    if(dtype == NDARRAY_BOOL) {
+        dtype = NDARRAY_UINT8;
+    }
     mp_obj_t one = mp_obj_new_int(1);
     size_t i = 0;
     if((args[2].u_int >= 0)) {

From e35178fa30e9b852c11462cc40f7ebb73e690d1a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Zolt=C3=A1n=20V=C3=B6r=C3=B6s?= <zvoros@gmail.com>
Date: Sun, 21 Feb 2021 21:58:12 +0100
Subject: [PATCH 10/19] added buffer protocol

---
 code/ndarray.c             | 18 ++++++++++++++++++
 code/ndarray.h             |  1 +
 code/ulab.c                |  1 +
 tests/common/buffer.py     | 16 ++++++++++++++++
 tests/common/buffer.py.exp |  9 +++++++++
 5 files changed, 45 insertions(+)
 create mode 100644 tests/common/buffer.py
 create mode 100644 tests/common/buffer.py.exp

diff --git a/code/ndarray.c b/code/ndarray.c
index f955ffab..a48912f6 100644
--- a/code/ndarray.c
+++ b/code/ndarray.c
@@ -2032,3 +2032,21 @@ mp_obj_t ndarray_info(mp_obj_t obj_in) {
 
 MP_DEFINE_CONST_FUN_OBJ_1(ndarray_info_obj, ndarray_info);
 #endif
+
+// Buffer protocol hook; returns 0 on success, 1 if the data cannot be exported
+mp_int_t ndarray_get_buffer(mp_obj_t self_in, mp_buffer_info_t *bufinfo, mp_uint_t flags) {
+    ndarray_obj_t *self = MP_OBJ_TO_PTR(self_in);
+    // only dense arrays are handed out as a buffer
+    bool exportable = ndarray_is_dense(self);
+    #if ULAB_HAS_BLOCKS
+    // arrays with a non-zero dtype.flags are refused as well
+    exportable = exportable && !self->dtype.flags;
+    #endif
+    if(!exportable) {
+        return 1;
+    }
+    bufinfo->len = self->itemsize * self->len;
+    bufinfo->buf = self->array;
+    bufinfo->typecode = self->dtype.type;
+    return 0;
+}
diff --git a/code/ndarray.h b/code/ndarray.h
index b222a9d9..9cbd59eb 100644
--- a/code/ndarray.h
+++ b/code/ndarray.h
@@ -193,6 +193,7 @@ mp_obj_t ndarray_info(mp_obj_t );
 MP_DECLARE_CONST_FUN_OBJ_1(ndarray_info_obj);
 #endif
 
+mp_int_t ndarray_get_buffer(mp_obj_t , mp_buffer_info_t *, mp_uint_t );
 //void ndarray_attributes(mp_obj_t , qstr , mp_obj_t *);
 
 ndarray_obj_t *ndarray_from_mp_obj(mp_obj_t );
diff --git a/code/ulab.c b/code/ulab.c
index 858aa071..539604da 100644
--- a/code/ulab.c
+++ b/code/ulab.c
@@ -106,6 +106,7 @@ const mp_obj_type_t ulab_ndarray_type = {
     #if NDARRAY_HAS_BINARY_OPS
     .binary_op = ndarray_binary_op,
     #endif
+    .buffer_p = { .get_buffer = ndarray_get_buffer, },
     .locals_dict = (mp_obj_dict_t*)&ulab_ndarray_locals_dict,
 };
 
diff --git a/tests/common/buffer.py b/tests/common/buffer.py
new file mode 100644
index 00000000..6d2f678b
--- /dev/null
+++ b/tests/common/buffer.py
@@ -0,0 +1,16 @@
+try:
+    from ulab import numpy as np
+except:
+    import numpy as np
+
+def print_as_buffer(a):
+    print(len(memoryview(a)), list(memoryview(a)))
+print_as_buffer(np.ones(3))
+print_as_buffer(np.zeros(3))
+print_as_buffer(np.eye(4))
+print_as_buffer(np.ones(1, dtype=np.int8))
+print_as_buffer(np.ones(2, dtype=np.uint8))
+print_as_buffer(np.ones(3, dtype=np.int16))
+print_as_buffer(np.ones(4, dtype=np.uint16))
+print_as_buffer(np.ones(5, dtype=np.float))
+print_as_buffer(np.linspace(0, 1, 9))
diff --git a/tests/common/buffer.py.exp b/tests/common/buffer.py.exp
new file mode 100644
index 00000000..f5fb3d41
--- /dev/null
+++ b/tests/common/buffer.py.exp
@@ -0,0 +1,9 @@
+3 [1.0, 1.0, 1.0]
+3 [0.0, 0.0, 0.0]
+16 [1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0]
+1 [1]
+2 [1, 1]
+3 [1, 1, 1]
+4 [1, 1, 1, 1]
+5 [1.0, 1.0, 1.0, 1.0, 1.0]
+9 [0.0, 0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1.0]

From b1cd38201b85f67a2837c9537e1251b086b63884 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Zolt=C3=A1n=20V=C3=B6r=C3=B6s?= <zvoros@gmail.com>
Date: Mon, 22 Feb 2021 20:22:22 +0100
Subject: [PATCH 11/19] incremental change, not necessarily functional

---
 code/blocks/blocks.c        | 100 +++++++++++++++++++++++++++---------
 code/ndarray.c              |  19 ++++---
 code/ndarray.h              |   5 +-
 docs/ulab-programming.ipynb |  45 +++++++++++++++-
 4 files changed, 136 insertions(+), 33 deletions(-)

diff --git a/code/blocks/blocks.c b/code/blocks/blocks.c
index 7076f0c1..c340fbe8 100644
--- a/code/blocks/blocks.c
+++ b/code/blocks/blocks.c
@@ -21,46 +21,100 @@
 
 #if ULAB_HAS_BLOCKS
 
-static void blocks_imreader(ndarray_obj_t *ndarray, void *array, int32_t *strides, size_t count) {
-    uint16_t *subarray = (uint16_t *)ndarray->dtype.subarray;
-    // if necessary, get the coordinates in the original reference frame, i.e.,
-    // in the coordinates used at the time of the creation of the object
-    // size_t *coords = tools_coords_from_pointer(array, ndarray);
-    for(size_t i = 0; i < count; i += *strides/ndarray->itemsize) {
-        // fill up the array with dummy data
-        *subarray++ = i*i;
+//
+// const mp_obj_type_t block_function_type = {
+//     { &mp_type_type },
+//     .name = MP_QSTR_,
+//     .arrfunc = NULL,
+// };
+
+// static void blocks_imreader(ndarray_obj_t *ndarray, void *array, int32_t *strides, size_t count) {
+//     uint16_t *subarray = (uint16_t *)ndarray->dtype.subarray;
+//     // if necessary, get the coordinates in the original reference frame, i.e.,
+//     // in the coordinates used at the time of the creation of the object
+//     // size_t *coords = tools_coords_from_pointer(array, ndarray);
+//     for(size_t i = 0; i < count; i += *strides/ndarray->itemsize) {
+//         // fill up the array with dummy data
+//         *subarray++ = i*i;
+//     }
+//     // since strides is going to be used in computation loops, and subarray is
+//     // meant to be a dense array, simply overwrite strides with the itemsize
+//     *strides = ndarray->itemsize;
+// }
+
+ndarray_obj_t *blocks_new_ndarray(mp_obj_t _shape, uint8_t dtype, void *arrfunc) {
+    if(!MP_OBJ_IS_TYPE(_shape, &mp_type_tuple)) {
+        mp_raise_TypeError(translate("shape must be a tuple"));
+    }
+    mp_obj_tuple_t *shape_tuple = MP_OBJ_TO_PTR(_shape);
+    if(shape_tuple->len > ULAB_MAX_DIMS) {
+        mp_raise_ValueError(translate("too many dimensions"));
+    }
+
+    size_t *shape = m_new(size_t, ULAB_MAX_DIMS);
+    memset(shape, 0, sizeof(size_t)*ULAB_MAX_DIMS);
+    for(uint8_t i=0; i < shape_tuple->len; i++) {
+        shape[ULAB_MAX_DIMS - i - 1] = mp_obj_get_int(shape_tuple->items[shape_tuple->len - i - 1]);
     }
-    // since strides is going to be used in computation loops, and subarray is
-    // meant to be a dense array, simply overwrite strides with the itemsize
-    *strides = ndarray->itemsize;
-}
 
-ndarray_obj_t *ndarray_ndarray_header(size_t len, void *arrfunc, uint8_t dtype) {
-    ndarray_obj_t *ndarray = ndarray_new_linear_array(len, dtype);
+    ndarray_obj_t *ndarray = ndarray_new_ndarray_header(shape_tuple->len, shape, NULL, dtype);
     ndarray->dtype.flags = 1;
     ndarray->dtype.arrfunc = arrfunc;
-    uint8_t *subarray = m_new(uint8_t, ndarray->itemsize * ndarray->shape[ULAB_MAX_DIMS - 1]);
+    // reserve so much space that data for the longest array can still be accommodated
+    size_t len = 0;
+    for(uint8_t i = 0; i < ULAB_MAX_DIMS; i++) {
+        if(ndarray->shape[i] > len) {
+            len = ndarray->shape[i];
+        }
+    }
+    uint8_t *subarray = m_new(uint8_t, ndarray->itemsize * len);
     ndarray->dtype.subarray = subarray;
-    memcpy(&(ndarray->dtype.shape), &(ndarray->shape), sizeof(size_t) *ULAB_MAX_DIMS);
+    // store the original array dimensions; dtype.shape is immutable
+    memcpy(&(ndarray->dtype.shape), &(ndarray->shape), sizeof(size_t) * ULAB_MAX_DIMS);
     return ndarray;
 }
 
-static mp_obj_t blocks_imread(mp_obj_t shape, mp_obj_t dtype) {
+// blocks.block(shape, transformer=..., dtype=float)
+// Creates an ndarray header without data, and attaches the transformer to its dtype.
+// shape must be a tuple of at most ULAB_MAX_DIMS entries (checked in blocks_new_ndarray);
+// transformer is a required keyword argument, dtype defaults to NDARRAY_FLOAT.
+mp_obj_t blocks_block(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_shape, MP_ARG_OBJ | MP_ARG_REQUIRED, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_transformer, MP_ARG_KW_ONLY | MP_ARG_OBJ | MP_ARG_REQUIRED, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_dtype, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_obj = MP_ROM_INT(NDARRAY_FLOAT) } },
+    };
+
+    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
+
+    // the indices follow allowed_args: 0 is shape, 1 is transformer, 2 is dtype
+    uint8_t _dtype;
+    #if ULAB_HAS_DTYPE_OBJECT
+    if(MP_OBJ_IS_TYPE(args[2].u_obj, &ulab_dtype_type)) {
+        dtype_obj_t *dtype = MP_OBJ_TO_PTR(args[2].u_obj);
+        _dtype = dtype->dtype.type;
+    } else {
+        _dtype = mp_obj_get_int(args[2].u_obj);
+    }
+    #else
+    _dtype = mp_obj_get_int(args[2].u_obj);
+    #endif
+    mp_obj_t transformer = args[1].u_obj;
 
-    size_t len = mp_obj_get_int(shape);
-    uint8_t _dtype = MP_OBJ_SMALL_INT_VALUE(dtype);
+    ndarray_obj_t *ndarray = blocks_new_ndarray(args[0].u_obj, _dtype, transformer);
 
-    void (*imreader)(ndarray_obj_t *, void *, int32_t *, size_t) = blocks_imreader;
+    // NOTE(review): transformer is a Python object here, while dtype.arrfunc is called
+    // as a C function pointer in the numerical macros -- confirm how it is to be invoked
 
-    ndarray_obj_t *ndarray = ndarray_ndarray_header(len, imreader, _dtype);
     return MP_OBJ_FROM_PTR(ndarray);
 }
 
-MP_DEFINE_CONST_FUN_OBJ_2(blocks_imread_obj, blocks_imread);
+MP_DEFINE_CONST_FUN_OBJ_KW(blocks_block_obj, 1, blocks_block);
 
 static const mp_rom_map_elem_t ulab_blocks_globals_table[] = {
     { MP_OBJ_NEW_QSTR(MP_QSTR___name__), MP_OBJ_NEW_QSTR(MP_QSTR_blocks) },
-    { MP_OBJ_NEW_QSTR(MP_QSTR_imread), (mp_obj_t)&blocks_imread_obj },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_block), (mp_obj_t)&blocks_block_obj },
 };
 
 static MP_DEFINE_CONST_DICT(mp_module_ulab_blocks_globals, ulab_blocks_globals_table);
diff --git a/code/ndarray.c b/code/ndarray.c
index 7945613e..92f74d88 100644
--- a/code/ndarray.c
+++ b/code/ndarray.c
@@ -595,9 +595,8 @@ bool ndarray_is_dense(ndarray_obj_t *ndarray) {
     return stride == ndarray->strides[ULAB_MAX_DIMS-ndarray->ndim] ? true : false;
 }
 
-
-ndarray_obj_t *ndarray_new_ndarray(uint8_t ndim, size_t *shape, int32_t *strides, uint8_t dtype) {
-    // Creates the base ndarray with shape, and initialises the values to straight 0s
+ndarray_obj_t *ndarray_new_ndarray_header(uint8_t ndim, size_t *shape, int32_t *strides, uint8_t dtype) {
+    // creates an empty ndarray, i.e., one with header, but without data
     ndarray_obj_t *ndarray = m_new_obj(ndarray_obj_t);
     ndarray->base.type = &ulab_ndarray_type;
     ndarray->dtype.type = dtype == NDARRAY_BOOL ? NDARRAY_UINT8 : dtype;
@@ -617,6 +616,16 @@ ndarray_obj_t *ndarray_new_ndarray(uint8_t ndim, size_t *shape, int32_t *strides
         ndarray->len *= shape[i-1];
     }
 
+    #if ULAB_HAS_BLOCKS
+    // indicate that the array doesn't need special treatment in the readout function
+    ndarray->dtype.flags = 0;
+    #endif
+    return ndarray;
+}
+
+ndarray_obj_t *ndarray_new_ndarray(uint8_t ndim, size_t *shape, int32_t *strides, uint8_t dtype) {
+    // Creates the base ndarray with shape, and initialises the values to straight 0s
+    ndarray_obj_t *ndarray = ndarray_new_ndarray_header(ndim, shape, strides, dtype);
     // if the length is 0, still allocate a single item, so that contractions can be handled
     size_t len = ndarray->itemsize * MAX(1, ndarray->len);
     uint8_t *array = m_new(byte, len);
@@ -626,8 +635,6 @@ ndarray_obj_t *ndarray_new_ndarray(uint8_t ndim, size_t *shape, int32_t *strides
     ndarray->array = array;
 
     #if ULAB_HAS_BLOCKS
-    // indicate that the array doesn't need special treatment in the readout function
-    ndarray->dtype.flags = 0;
     ndarray->dtype.origin = array;
     #endif
 
@@ -1961,7 +1968,7 @@ mp_obj_t ndarray_reshape(mp_obj_t oin, mp_obj_t _shape) {
 
     mp_obj_tuple_t *shape = MP_OBJ_TO_PTR(_shape);
     if(shape->len > ULAB_MAX_DIMS) {
-        mp_raise_ValueError(translate("maximum number of dimensions is 4"));
+        mp_raise_ValueError(translate("too many dimensions"));
     }
     size_t *new_shape = m_new(size_t, ULAB_MAX_DIMS);
     memset(new_shape, 0, sizeof(size_t)*ULAB_MAX_DIMS);
diff --git a/code/ndarray.h b/code/ndarray.h
index 9cbd59eb..633a1c1e 100644
--- a/code/ndarray.h
+++ b/code/ndarray.h
@@ -69,8 +69,8 @@ typedef struct _dtype_dtype {
     uint8_t flags;
     void *arrfunc;
     uint8_t *subarray;
-    size_t shape[ULAB_MAX_DIMS];
-    void *origin;
+    size_t shape[ULAB_MAX_DIMS]; // original shape of array; this member should never be overwritten
+    void *origin; // origin stores the address of ndarray->array at the time of creation, and should never be changed
     uint8_t *name;
     #endif
 } dtype_dtype;
@@ -126,6 +126,7 @@ void ndarray_assign_elements(ndarray_obj_t *, mp_obj_t , uint8_t , size_t *);
 size_t *ndarray_contract_shape(ndarray_obj_t *, uint8_t );
 int32_t *ndarray_contract_strides(ndarray_obj_t *, uint8_t );
 
+ndarray_obj_t *ndarray_new_ndarray_header(uint8_t , size_t *, int32_t *, uint8_t );
 ndarray_obj_t *ndarray_new_dense_ndarray(uint8_t , size_t *, uint8_t );
 ndarray_obj_t *ndarray_new_ndarray_from_tuple(mp_obj_tuple_t *, uint8_t );
 ndarray_obj_t *ndarray_new_ndarray(uint8_t , size_t *, int32_t *, uint8_t );
diff --git a/docs/ulab-programming.ipynb b/docs/ulab-programming.ipynb
index 0776981f..38be5fec 100644
--- a/docs/ulab-programming.ipynb
+++ b/docs/ulab-programming.ipynb
@@ -441,6 +441,12 @@
    "source": [
     "## Extending ulab\n",
     "\n",
+    "`ulab` offers two ways of extending it: one is via the `user` module, where you can implement your own functions and methods. Since the `user` module is not part of `ulab` _per se_, these functions do not have to conform to `numpy` or `scipy` conventions. \n",
+    "\n",
+    "The other possibility is extending `ulab` in the sense that you define your own data container and supply a single read-out function that will be called whenever `ulab` needs access to the data. This method does not add new functions to `ulab`: you use the available functions, but you can specify how the data are piped into them. The rationale for this will be spelt out later, when we discuss the `blocks` module. \n",
+    "\n",
+    "### Including the user module\n",
+    "\n",
     "The `user` module is disabled by default, as can be seen from the last couple of lines of [ulab.h](https://github.com/v923z/micropython-ulab/blob/master/code/ulab.h)\n",
     "\n",
     "```c\n",
@@ -461,7 +467,8 @@
     "```\n",
     "which should just return 5.0. Even if `numpy`-compatibility is required (i.e., if most functions are bound at the top level to `ulab` directly), having to `import` the module has a great advantage. Namely, only the [user.h](https://github.com/v923z/micropython-ulab/blob/master/code/user/user.h) and [user.c](https://github.com/v923z/micropython-ulab/blob/master/code/user/user.c) files have to be modified, thus it should be relatively straightforward to update your local copy from [github](https://github.com/v923z/micropython-ulab/blob/master/). \n",
     "\n",
-    "Now, let us see, how we can add a more meaningful function. "
+    "\n",
+    "Now, let us see how we can add a more meaningful function."
    ]
   },
   {
@@ -606,7 +613,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Boilerplate\n",
+    "## user module boilerplate\n",
     "\n",
     "In the next section, we will construct a function that generates the element-wise square of a dense array, otherwise, raises a `TypeError` exception. Dense arrays can easily be iterated over, since we do not have to care about the `shape` and the `strides`. If the array is sparse, the section [Iterating over elements of a tensor](#Iterating-over-elements-of-a-tensor) should contain hints as to how the iteration can be implemented.\n",
     "\n",
@@ -730,6 +737,40 @@
     "2. The definition of a function object by calling MP_DEFINE_CONST_FUN_OBJ_N()\n",
     "3. Binding this function object to the namespace in the `ulab_user_globals_table[]`"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Working with the blocks module\n",
+    "\n",
+    "Version 3 of `ulab` introduced the `blocks` sub-module for extensions. You can enable it by setting the `ULAB_HAS_BLOCKS` constant in [ulab.h](https://github.com/v923z/micropython-ulab/blob/master/code/ulab.h)\n",
+    "\n",
+    "```c\n",
+    "#ifndef ULAB_HAS_BLOCKS\n",
+    "#define ULAB_HAS_BLOCKS                      (1)\n",
+    "#endif\n",
+    "```\n",
+    "\n",
+    "and in `python`, you would use it as in \n",
+    "\n",
+    "```python\n",
+    "from ulab import numpy as np\n",
+    "from ulab import blocks\n",
+    "\n",
+    "from yourmodule import func\n",
+    "\n",
+    "b = blocks.block(shape=(10, 10), transformer=func, dtype=np.uint8)\n",
+    "print(np.std(b, axis=0))\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {

From 087f0052324f1d0279265045c58e6c15fc47ee6b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Zolt=C3=A1n=20V=C3=B6r=C3=B6s?= <zvoros@gmail.com>
Date: Tue, 23 Feb 2021 19:16:01 +0100
Subject: [PATCH 12/19] implemented block initialisation function

---
 code/blocks/blocks.c             | 77 ++++++++++++++++++--------------
 code/numpy/numerical/numerical.c |  2 +-
 code/numpy/numerical/numerical.h |  7 +--
 code/ulab_tools.c                |  2 +-
 4 files changed, 50 insertions(+), 38 deletions(-)

diff --git a/code/blocks/blocks.c b/code/blocks/blocks.c
index c340fbe8..d49b2c51 100644
--- a/code/blocks/blocks.c
+++ b/code/blocks/blocks.c
@@ -21,26 +21,34 @@
 
 #if ULAB_HAS_BLOCKS
 
-//
-// const mp_obj_type_t block_function_type = {
-//     { &mp_type_type },
-//     .name = MP_QSTR_,
-//     .arrfunc = NULL,
-// };
-
-// static void blocks_imreader(ndarray_obj_t *ndarray, void *array, int32_t *strides, size_t count) {
-//     uint16_t *subarray = (uint16_t *)ndarray->dtype.subarray;
-//     // if necessary, get the coordinates in the original reference frame, i.e.,
-//     // in the coordinates used at the time of the creation of the object
-//     // size_t *coords = tools_coords_from_pointer(array, ndarray);
-//     for(size_t i = 0; i < count; i += *strides/ndarray->itemsize) {
-//         // fill up the array with dummy data
-//         *subarray++ = i*i;
-//     }
-//     // since strides is going to be used in computation loops, and subarray is
-//     // meant to be a dense array, simply overwrite strides with the itemsize
-//     *strides = ndarray->itemsize;
-// }
+typedef struct _blocks_function_obj_t {
+    void *arrfunc;
+} blocks_function_obj_t;
+
+static void blocks_imreader(ndarray_obj_t *ndarray, void *array, int32_t *strides, size_t count) {
+    printf("imreader: %p\n", ndarray->dtype.subarray);
+    uint8_t *subarray = (uint8_t *)ndarray->dtype.subarray;
+    // if necessary, get the coordinates in the original reference frame, i.e.,
+    // in the coordinates used at the time of the creation of the object
+    // size_t *coords = tools_coords_from_pointer(array, ndarray);
+    for(size_t i = 0; i < count; i++) {
+        // fill up the array with dummy data
+        *subarray++ = (uint8_t)i*i;
+         // array += *strides/ndarray->itemsize
+    }
+    // since strides is going to be used in computation loops, and subarray is
+    // meant to be a dense array, simply overwrite strides with the itemsize
+    *strides = ndarray->itemsize;
+}
+
+static mp_obj_t blocks_imreader_function(void) {
+    blocks_function_obj_t *func = m_new(blocks_function_obj_t, 1);
+    void (*imreader)(ndarray_obj_t *, void *, int32_t *, size_t) = blocks_imreader;
+    func->arrfunc = imreader;
+    return MP_OBJ_FROM_PTR(func);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_0(blocks_imreader_function_obj, blocks_imreader_function);
 
 ndarray_obj_t *blocks_new_ndarray(mp_obj_t _shape, uint8_t dtype, void *arrfunc) {
     if(!MP_OBJ_IS_TYPE(_shape, &mp_type_tuple)) {
@@ -52,14 +60,16 @@ ndarray_obj_t *blocks_new_ndarray(mp_obj_t _shape, uint8_t dtype, void *arrfunc)
     }
 
     size_t *shape = m_new(size_t, ULAB_MAX_DIMS);
-    memset(shape, 0, sizeof(size_t)*ULAB_MAX_DIMS);
+    memset(shape, 0, sizeof(size_t) * ULAB_MAX_DIMS);
     for(uint8_t i=0; i < shape_tuple->len; i++) {
         shape[ULAB_MAX_DIMS - i - 1] = mp_obj_get_int(shape_tuple->items[shape_tuple->len - i - 1]);
     }
 
     ndarray_obj_t *ndarray = ndarray_new_ndarray_header(shape_tuple->len, shape, NULL, dtype);
     ndarray->dtype.flags = 1;
-    ndarray->dtype.arrfunc = arrfunc;
+//    ndarray->dtype.arrfunc = arrfunc;
+//    void (*imreader)(ndarray_obj_t *, void *, int32_t *, size_t) = blocks_imreader;
+    ndarray->dtype.arrfunc = blocks_imreader;
     // reserve so much space that data for the longest array can still be accommodated
     size_t len = 0;
     for(uint8_t i = 0; i < ULAB_MAX_DIMS; i++) {
@@ -71,6 +81,7 @@ ndarray_obj_t *blocks_new_ndarray(mp_obj_t _shape, uint8_t dtype, void *arrfunc)
     ndarray->dtype.subarray = subarray;
     // store the original array dimensions; dtype.shape is immuateble
     memcpy(&(ndarray->dtype.shape), &(ndarray->shape), sizeof(size_t) * ULAB_MAX_DIMS);
+    printf("%p\n", ndarray->dtype.subarray);
     return ndarray;
 }
 
@@ -88,33 +99,33 @@ mp_obj_t blocks_block(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args
     if(shape_tuple->len > ULAB_MAX_DIMS) {
         mp_raise_ValueError(translate("too many dimensions"));
     }
-
     uint8_t _dtype;
     #if ULAB_HAS_DTYPE_OBJECT
     if(MP_OBJ_IS_TYPE(args[1].u_obj, &ulab_dtype_type)) {
-        dtype_obj_t *dtype = MP_OBJ_TO_PTR(args[1].u_obj);
+        dtype_obj_t *dtype = MP_OBJ_TO_PTR(args[2].u_obj);
         _dtype = dtype->dtype.type;
     } else {
-        _dtype = mp_obj_get_int(args[1].u_obj);
+        _dtype = mp_obj_get_int(args[2].u_obj);
     }
     #else
-    _dtype = mp_obj_get_int(args[1].u_obj);
+    _dtype = mp_obj_get_int(args[2].u_obj);
     #endif
-    mp_obj_t transformer = args[1].u_obj;
-
-    ndarray_obj_t *ndarray = blocks_new_ndarray(args[0].u_obj, _dtype, transformer);
-
-    //void (*imreader)(ndarray_obj_t *, void *, int32_t *, size_t) = blocks_imreader;
-    //ndarray->dtype.arrfunc = transformer;
 
+    blocks_function_obj_t *transformer = MP_OBJ_TO_PTR(args[1].u_obj);
+    (void)transformer;
+    void (*arrfunc)(ndarray_obj_t *, void *, int32_t *, size_t) = blocks_imreader;
+    ndarray_obj_t *ndarray = blocks_new_ndarray(args[0].u_obj, _dtype, arrfunc);
+    ndarray->dtype.arrfunc = arrfunc;
+    printf("%p\n", ndarray->dtype.subarray);
     return MP_OBJ_FROM_PTR(ndarray);
 }
 
-MP_DEFINE_CONST_FUN_OBJ_KW(blocks_block_obj, 2, blocks_block);
+MP_DEFINE_CONST_FUN_OBJ_KW(blocks_block_obj, 0, blocks_block);
 
 static const mp_rom_map_elem_t ulab_blocks_globals_table[] = {
     { MP_OBJ_NEW_QSTR(MP_QSTR___name__), MP_OBJ_NEW_QSTR(MP_QSTR_blocks) },
     { MP_OBJ_NEW_QSTR(MP_QSTR_block), (mp_obj_t)&blocks_block_obj },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_imreader), (mp_obj_t)&blocks_imreader_function_obj },
 };
 
 static MP_DEFINE_CONST_DICT(mp_module_ulab_blocks_globals, ulab_blocks_globals_table);
diff --git a/code/numpy/numerical/numerical.c b/code/numpy/numerical/numerical.c
index 3a1f9047..5084f671 100644
--- a/code/numpy/numerical/numerical.c
+++ b/code/numpy/numerical/numerical.c
@@ -263,7 +263,7 @@ static mp_obj_t numerical_sum_mean_std_ndarray(ndarray_obj_t *ndarray, mp_obj_t
         uint8_t *rarray = NULL;
         mp_float_t *farray = NULL;
 
-        #if ULAB_DTYPE_IS_EXTENDABLE
+        #if ULAB_HAS_BLOCKS
         void (*arrfunc)(ndarray_obj_t *, void *, int32_t *, size_t) = NULL;
         if(ndarray->dtype.flags) {
             arrfunc = ndarray->dtype.arrfunc;
diff --git a/code/numpy/numerical/numerical.h b/code/numpy/numerical/numerical.h
index 1507dfb3..511d65e8 100644
--- a/code/numpy/numerical/numerical.h
+++ b/code/numpy/numerical/numerical.h
@@ -46,7 +46,7 @@
     (rarray) += (results)->itemsize;\
 })
 
-#if !(ULAB_DTYPE_IS_EXTENDABLE)
+#if !(ULAB_HAS_BLOCKS)
 #define RUN_SUM1(type, ndarray, array, results, rarray, ss, arrfunc)\
 ({\
     type sum = 0;\
@@ -63,9 +63,10 @@
 ({\
     type sum = 0;\
     int32_t increment = (ss).strides[0];\
-    (ndarray)->dtype.subarray = (array);\
     if((ndarray)->dtype.flags) {\
         (arrfunc)((ndarray), (array), &increment, (ss).shape[0]);\
+    } else {\
+        (ndarray)->dtype.subarray = array;\
     }\
     for(size_t i=0; i < (ss).shape[0]; i++) {\
         sum += *((type *)((ndarray)->dtype.subarray));\
@@ -110,7 +111,7 @@
 // Instead of the straightforward implementation of the definition,
 // we take the numerically stable Welford algorithm here
 // https://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/
-#if !(ULAB_DTYPE_IS_EXTENDABLE)
+#if !(ULAB_HAS_BLOCKS)
 #define RUN_MEAN_STD1(type, ndarray, array, rarray, ss, div, isStd, arrfunc)\
 ({\
     mp_float_t M = 0.0, m = 0.0, S = 0.0;\
diff --git a/code/ulab_tools.c b/code/ulab_tools.c
index 02b919a2..b84f4a1c 100644
--- a/code/ulab_tools.c
+++ b/code/ulab_tools.c
@@ -230,7 +230,7 @@ ndarray_obj_t *tools_object_is_square(mp_obj_t obj) {
 }
 #endif
 
-#if ULAB_DTYPE_IS_EXTENDABLE
+#if ULAB_HAS_BLOCKS
 size_t *tools_coords_from_pointer(void *p1, ndarray_obj_t *ndarray) {
     // calculates the coordinates in the original tensor from the position of the pointer
     // The original view is assumed to be dense, i.e., the strides can be computed from the shape

From 8cd66793998aaac55bd35d541b14ed84c5eb78e5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Zolt=C3=A1n=20V=C3=B6r=C3=B6s?= <zvoros@gmail.com>
Date: Thu, 25 Feb 2021 23:23:29 +0100
Subject: [PATCH 13/19] implemented proper block class

---
 code/blocks/blocks.c             | 172 +++++++++++++++++++++----------
 code/blocks/blocks.h             |  12 +++
 code/ndarray.c                   |  17 +--
 code/ndarray.h                   |  20 ++--
 code/numpy/numerical/numerical.c |  38 +++----
 code/numpy/numerical/numerical.h |  56 +++++-----
 code/ulab_tools.c                |  15 ---
 code/ulab_tools.h                |   1 -
 8 files changed, 200 insertions(+), 131 deletions(-)

diff --git a/code/blocks/blocks.c b/code/blocks/blocks.c
index d49b2c51..49f1bcc6 100644
--- a/code/blocks/blocks.c
+++ b/code/blocks/blocks.c
@@ -21,71 +21,105 @@
 
 #if ULAB_HAS_BLOCKS
 
-typedef struct _blocks_function_obj_t {
-    void *arrfunc;
-} blocks_function_obj_t;
+extern const mp_obj_type_t block_function_type;
 
-static void blocks_imreader(ndarray_obj_t *ndarray, void *array, int32_t *strides, size_t count) {
-    printf("imreader: %p\n", ndarray->dtype.subarray);
-    uint8_t *subarray = (uint8_t *)ndarray->dtype.subarray;
+const mp_obj_type_t block_function_type = {
+    { &mp_type_type },
+    .name = MP_QSTR_block_function,
+};
+
+extern const mp_obj_type_t imreader_type;
+
+void imreader_imreader(ndarray_obj_t *ndarray, void *array, int32_t *strides, size_t count) {
+    blocks_block_obj_t *block = (blocks_block_obj_t *)ndarray->block;
+    uint8_t *barray = (uint8_t *)block->subarray;
     // if necessary, get the coordinates in the original reference frame, i.e.,
     // in the coordinates used at the time of the creation of the object
-    // size_t *coords = tools_coords_from_pointer(array, ndarray);
+    // size_t *coords = blocks_coords_from_pointer(array, ndarray);
     for(size_t i = 0; i < count; i++) {
         // fill up the array with dummy data
-        *subarray++ = (uint8_t)i*i;
-         // array += *strides/ndarray->itemsize
+        *barray++ = (uint8_t)i*i;
     }
-    // since strides is going to be used in computation loops, and subarray is
-    // meant to be a dense array, simply overwrite strides with the itemsize
+    // The subarray is a forward propagating dense array, so set the strides to the itemsize
     *strides = ndarray->itemsize;
 }
 
-static mp_obj_t blocks_imreader_function(void) {
-    blocks_function_obj_t *func = m_new(blocks_function_obj_t, 1);
-    void (*imreader)(ndarray_obj_t *, void *, int32_t *, size_t) = blocks_imreader;
-    func->arrfunc = imreader;
-    return MP_OBJ_FROM_PTR(func);
+mp_obj_t imreader_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_kw, const mp_obj_t *args) {
+    (void) type;
+    mp_arg_check_num(n_args, n_kw, 0, 1, true);
+    mp_map_t kw_args;
+    mp_map_init_fixed_table(&kw_args, n_kw, args + n_args);
+
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_OBJ, { .u_obj = mp_const_none } },
+    };
+    mp_arg_val_t _args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, args, &kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, _args);
+
+    blocks_function_obj_t *block_function = m_new_obj(blocks_function_obj_t);
+    block_function->base.type = &block_function_type;
+    void (*arrfunc)(ndarray_obj_t *, void *, int32_t *, size_t) = imreader_imreader;
+    block_function->arrfunc = arrfunc;
+    return MP_OBJ_FROM_PTR(block_function);
 }
 
-MP_DEFINE_CONST_FUN_OBJ_0(blocks_imreader_function_obj, blocks_imreader_function);
+const mp_obj_type_t imreader_type = {
+    { &mp_type_type },
+    .name = MP_QSTR_imreader,
+    .make_new = imreader_make_new,
+};
 
-ndarray_obj_t *blocks_new_ndarray(mp_obj_t _shape, uint8_t dtype, void *arrfunc) {
-    if(!MP_OBJ_IS_TYPE(_shape, &mp_type_tuple)) {
-        mp_raise_TypeError(translate("shape must be a tuple"));
-    }
-    mp_obj_tuple_t *shape_tuple = MP_OBJ_TO_PTR(_shape);
-    if(shape_tuple->len > ULAB_MAX_DIMS) {
-        mp_raise_ValueError(translate("too many dimensions"));
+size_t *blocks_coords_from_pointer(void *p1, ndarray_obj_t *ndarray) {
+    // calculates the coordinates in the original tensor from the position of the pointer
+    // The original view is assumed to be dense, i.e., the strides can be computed from the shape
+    blocks_block_obj_t *block = ndarray->block;
+    size_t diff = (uint8_t *)p1 - (uint8_t *)block->origin;
+    size_t accumulator = 1;
+    size_t *coords = m_new(size_t, ULAB_MAX_DIMS);
+    for(uint8_t i = 1; i < ndarray->ndim + 1; i++) {
+        accumulator *= block->shape[ULAB_MAX_DIMS - i];
+        coords[ULAB_MAX_DIMS - i] = diff % accumulator;
     }
+    return coords;
+}
 
-    size_t *shape = m_new(size_t, ULAB_MAX_DIMS);
-    memset(shape, 0, sizeof(size_t) * ULAB_MAX_DIMS);
-    for(uint8_t i=0; i < shape_tuple->len; i++) {
-        shape[ULAB_MAX_DIMS - i - 1] = mp_obj_get_int(shape_tuple->items[shape_tuple->len - i - 1]);
+void blocks_block_print(const mp_print_t *print, mp_obj_t self_in, mp_print_kind_t kind) {
+    (void)kind;
+    blocks_block_obj_t *self = MP_OBJ_TO_PTR(self_in);
+    ndarray_obj_t *ndarray = (ndarray_obj_t *)self->ndarray;
+    mp_print_str(print, "block(shape=(");
+    for(uint8_t i = 0; i < ndarray->ndim - 1; i++) {
+        mp_printf(print, "%ld, ", ndarray->shape[ULAB_MAX_DIMS - ndarray->ndim + i]);
     }
-
-    ndarray_obj_t *ndarray = ndarray_new_ndarray_header(shape_tuple->len, shape, NULL, dtype);
-    ndarray->dtype.flags = 1;
-//    ndarray->dtype.arrfunc = arrfunc;
-//    void (*imreader)(ndarray_obj_t *, void *, int32_t *, size_t) = blocks_imreader;
-    ndarray->dtype.arrfunc = blocks_imreader;
-    // reserve so much space that data for the longest array can still be accommodated
-    size_t len = 0;
-    for(uint8_t i = 0; i < ULAB_MAX_DIMS; i++) {
-        if(ndarray->shape[i] > len) {
-            len = ndarray->shape[i];
-        }
+    mp_printf(print, "%ld), ", ndarray->shape[ULAB_MAX_DIMS - 1]);
+    // mp_printf(print, "transformer=%s, ", self->);
+    // this is duplicate from ndarray.c:ndarray_print, but allows complete decoupling
+    if(ndarray->boolean == NDARRAY_BOOL) {
+        mp_print_str(print, "dtype=bool)");
+    } else if(ndarray->dtype.type == NDARRAY_UINT8) {
+        mp_print_str(print, "dtype=uint8)");
+    } else if(ndarray->dtype.type == NDARRAY_INT8) {
+        mp_print_str(print, "dtype=int8)");
+    } else if(ndarray->dtype.type == NDARRAY_UINT16) {
+        mp_print_str(print, "dtype=uint16)");
+    } else if(ndarray->dtype.type == NDARRAY_INT16) {
+        mp_print_str(print, "dtype=int16)");
+    } else {
+        #if MICROPY_FLOAT_IMPL == MICROPY_FLOAT_IMPL_FLOAT
+        mp_print_str(print, "dtype=float32)");
+        #else
+        mp_print_str(print, "dtype=float64)");
+        #endif
     }
-    uint8_t *subarray = m_new(uint8_t, ndarray->itemsize * len);
-    ndarray->dtype.subarray = subarray;
-    // store the original array dimensions; dtype.shape is immuateble
-    memcpy(&(ndarray->dtype.shape), &(ndarray->shape), sizeof(size_t) * ULAB_MAX_DIMS);
-    printf("%p\n", ndarray->dtype.subarray);
-    return ndarray;
 }
 
-mp_obj_t blocks_block(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+const mp_obj_type_t blocks_block_type = {
+    { &mp_type_type },
+    .name = MP_QSTR_block,
+    .print = blocks_block_print,
+};
+
+mp_obj_t blocks_new_ndarray(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
     static const mp_arg_t allowed_args[] = {
         { MP_QSTR_shape, MP_ARG_OBJ | MP_ARG_REQUIRED, { .u_rom_obj = mp_const_none } },
         { MP_QSTR_transformer, MP_ARG_KW_ONLY | MP_ARG_OBJ | MP_ARG_REQUIRED, { .u_rom_obj = mp_const_none } },
@@ -95,10 +129,14 @@ mp_obj_t blocks_block(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args
     mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
     mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
 
+    if(!MP_OBJ_IS_TYPE(args[0].u_obj, &mp_type_tuple)) {
+        mp_raise_TypeError(translate("shape must be a tuple"));
+    }
     mp_obj_tuple_t *shape_tuple = MP_OBJ_TO_PTR(args[0].u_obj);
     if(shape_tuple->len > ULAB_MAX_DIMS) {
         mp_raise_ValueError(translate("too many dimensions"));
     }
+
     uint8_t _dtype;
     #if ULAB_HAS_DTYPE_OBJECT
     if(MP_OBJ_IS_TYPE(args[1].u_obj, &ulab_dtype_type)) {
@@ -111,21 +149,41 @@ mp_obj_t blocks_block(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args
     _dtype = mp_obj_get_int(args[2].u_obj);
     #endif
 
-    blocks_function_obj_t *transformer = MP_OBJ_TO_PTR(args[1].u_obj);
-    (void)transformer;
-    void (*arrfunc)(ndarray_obj_t *, void *, int32_t *, size_t) = blocks_imreader;
-    ndarray_obj_t *ndarray = blocks_new_ndarray(args[0].u_obj, _dtype, arrfunc);
-    ndarray->dtype.arrfunc = arrfunc;
-    printf("%p\n", ndarray->dtype.subarray);
-    return MP_OBJ_FROM_PTR(ndarray);
+    size_t *shape = m_new(size_t, ULAB_MAX_DIMS);
+    memset(shape, 0, sizeof(size_t) * ULAB_MAX_DIMS);
+    size_t len = 0;
+    for(uint8_t i=0; i < shape_tuple->len; i++) {
+        shape[ULAB_MAX_DIMS - i - 1] = mp_obj_get_int(shape_tuple->items[shape_tuple->len - i - 1]);
+        // reserve as much space that data for the longest array can still be accommodated
+        if(shape[ULAB_MAX_DIMS - i - 1] > len) {
+            len = shape[ULAB_MAX_DIMS - i - 1];
+        }
+    }
+    ndarray_obj_t *ndarray = ndarray_new_ndarray_header(shape_tuple->len, shape, NULL, _dtype);
+    ndarray->flags = BLOCK_IS_READ_ONLY;
+    blocks_block_obj_t *block = m_new_obj(blocks_block_obj_t);
+    block->base.type = &blocks_block_type;
+    // store a pointer to the ndarray
+    block->ndarray = ndarray;
+
+    uint8_t *barray = m_new(uint8_t, ndarray->itemsize * len);
+    block->subarray = barray;
+    // store the original array dimensions; block->shape should never be touched
+    memcpy(&(block->shape), &(ndarray->shape), sizeof(size_t) * ULAB_MAX_DIMS);
+    // store the original address of the array; block->origin should never be touched
+    block->origin = ndarray->array;
+    block->arrfunc = imreader_imreader;
+    ndarray->block = block;
+    return ndarray;
 }
 
-MP_DEFINE_CONST_FUN_OBJ_KW(blocks_block_obj, 0, blocks_block);
+MP_DEFINE_CONST_FUN_OBJ_KW(blocks_new_ndarray_obj, 0, blocks_new_ndarray);
 
 static const mp_rom_map_elem_t ulab_blocks_globals_table[] = {
     { MP_OBJ_NEW_QSTR(MP_QSTR___name__), MP_OBJ_NEW_QSTR(MP_QSTR_blocks) },
-    { MP_OBJ_NEW_QSTR(MP_QSTR_block), (mp_obj_t)&blocks_block_obj },
-    { MP_OBJ_NEW_QSTR(MP_QSTR_imreader), (mp_obj_t)&blocks_imreader_function_obj },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_ndarray), (mp_obj_t)&blocks_new_ndarray_obj },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_imreader), (mp_obj_t)&imreader_type },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_block), (mp_obj_t)&blocks_block_type },
 };
 
 static MP_DEFINE_CONST_DICT(mp_module_ulab_blocks_globals, ulab_blocks_globals_table);
diff --git a/code/blocks/blocks.h b/code/blocks/blocks.h
index 08bfbba4..d227666a 100644
--- a/code/blocks/blocks.h
+++ b/code/blocks/blocks.h
@@ -15,6 +15,18 @@
 #include "ulab.h"
 #include "ndarray.h"
 
+#define BLOCK_NO_FLAG               0x00
+#define BLOCK_IS_READ_ONLY          0x01
+#define BLOCK_IS_READ_WRITE         0x02
+
+typedef struct _blocks_function_obj_t {
+    mp_obj_base_t base;
+    void *arrfunc;
+} blocks_function_obj_t;
+
+extern const mp_obj_type_t blocks_block_type;
 extern mp_obj_module_t ulab_blocks_module;
 
+size_t *blocks_coords_from_pointer(void *, ndarray_obj_t *);
+
 #endif
diff --git a/code/ndarray.c b/code/ndarray.c
index 92f74d88..4cdb14a3 100644
--- a/code/ndarray.c
+++ b/code/ndarray.c
@@ -24,6 +24,7 @@
 #include "ulab_tools.h"
 #include "ndarray.h"
 #include "ndarray_operators.h"
+#include "blocks/blocks.h"
 
 mp_uint_t ndarray_print_threshold = NDARRAY_PRINT_THRESHOLD;
 mp_uint_t ndarray_print_edgeitems = NDARRAY_PRINT_EDGEITEMS;
@@ -501,6 +502,13 @@ static void ndarray_print_bracket(const mp_print_t *print, const size_t conditio
 void ndarray_print(const mp_print_t *print, mp_obj_t self_in, mp_print_kind_t kind) {
     (void)kind;
     ndarray_obj_t *self = MP_OBJ_TO_PTR(self_in);
+    #if ULAB_HAS_BLOCKS
+    if(self->flags) {
+        const mp_obj_type_t *type = mp_obj_get_type(self->block);
+        type->print((mp_print_t *)print, self->block, kind);
+        return;
+    }
+    #endif
     uint8_t *array = (uint8_t *)self->array;
     mp_print_str(print, "array(");
     if(self->len == 0) {
@@ -618,7 +626,7 @@ ndarray_obj_t *ndarray_new_ndarray_header(uint8_t ndim, size_t *shape, int32_t *
 
     #if ULAB_HAS_BLOCKS
     // indicate that the array doesn't need special treatment in the readout function
-    ndarray->dtype.flags = 0;
+    ndarray->flags = BLOCK_NO_FLAG;
     #endif
     return ndarray;
 }
@@ -633,11 +641,6 @@ ndarray_obj_t *ndarray_new_ndarray(uint8_t ndim, size_t *shape, int32_t *strides
     // we could, perhaps, leave this step out, and initialise the array only, when needed
     memset(array, 0, len);
     ndarray->array = array;
-
-    #if ULAB_HAS_BLOCKS
-    ndarray->dtype.origin = array;
-    #endif
-
     return ndarray;
 }
 
@@ -2045,7 +2048,7 @@ MP_DEFINE_CONST_FUN_OBJ_1(ndarray_info_obj, ndarray_info);
 mp_int_t ndarray_get_buffer(mp_obj_t self_in, mp_buffer_info_t *bufinfo, mp_uint_t flags) {
     ndarray_obj_t *self = MP_OBJ_TO_PTR(self_in);
     #if ULAB_HAS_BLOCKS
-    if(!ndarray_is_dense(self) || self->dtype.flags) {
+    if(!ndarray_is_dense(self) || self->flags) {
         return 1;
     }
     #else
diff --git a/code/ndarray.h b/code/ndarray.h
index 633a1c1e..4d3e5cda 100644
--- a/code/ndarray.h
+++ b/code/ndarray.h
@@ -65,15 +65,17 @@ enum NDARRAY_TYPE {
 
 typedef struct _dtype_dtype {
     uint8_t type;
-    #if ULAB_HAS_BLOCKS
-    uint8_t flags;
+} dtype_dtype;
+
+typedef struct _blocks_block_obj_t {
+    mp_obj_base_t base;
+    // TODO: can the garbage collector deal with circular references?
+    void *ndarray;
     void *arrfunc;
     uint8_t *subarray;
-    size_t shape[ULAB_MAX_DIMS]; // original shape of array; this member should never be overwritten
-    void *origin; // origin stores the address of ndarray->array at the time of creation, and should never be changed
-    uint8_t *name;
-    #endif
-} dtype_dtype;
+    size_t shape[ULAB_MAX_DIMS];
+    void *origin;
+} blocks_block_obj_t;
 
 typedef struct _ndarray_obj_t {
     mp_obj_base_t base;
@@ -85,6 +87,10 @@ typedef struct _ndarray_obj_t {
     size_t shape[ULAB_MAX_DIMS];
     int32_t strides[ULAB_MAX_DIMS];
     void *array;
+    #if ULAB_HAS_BLOCKS
+    uint8_t flags;
+    blocks_block_obj_t *block;
+    #endif
 } ndarray_obj_t;
 
 #if ULAB_HAS_DTYPE_OBJECT
diff --git a/code/numpy/numerical/numerical.c b/code/numpy/numerical/numerical.c
index 5084f671..f73182be 100644
--- a/code/numpy/numerical/numerical.c
+++ b/code/numpy/numerical/numerical.c
@@ -23,6 +23,7 @@
 #include "../../ulab.h"
 #include "../../ulab_tools.h"
 #include "numerical.h"
+#include "blocks/blocks.h"
 
 enum NUMERICAL_FUNCTION_TYPE {
     NUMERICAL_ALL,
@@ -263,32 +264,33 @@ static mp_obj_t numerical_sum_mean_std_ndarray(ndarray_obj_t *ndarray, mp_obj_t
         uint8_t *rarray = NULL;
         mp_float_t *farray = NULL;
 
-        #if ULAB_HAS_BLOCKS
-        void (*arrfunc)(ndarray_obj_t *, void *, int32_t *, size_t) = NULL;
-        if(ndarray->dtype.flags) {
-            arrfunc = ndarray->dtype.arrfunc;
-        }
-        #else
-        uint8_t arrfunc = 0;
-        #endif
+        // #if ULAB_HAS_BLOCKS
+        // blocks_block_obj_t *block = NULL;
+        // if(ndarray->flags) {
+        //     block = ndarray->block;
+        //     // return mp_const_none;
+        // }
+        // #else
+        // uint8_t block = 0;
+        // #endif
 
         if(optype == NUMERICAL_SUM) {
             results = ndarray_new_dense_ndarray(_shape_strides.ndim, _shape_strides.shape, ndarray->dtype.type);
             rarray = (uint8_t *)results->array;
             // TODO: numpy promotes the output to the highest integer type
             if(ndarray->dtype.type == NDARRAY_UINT8) {
-                RUN_SUM(uint8_t, ndarray, array, results, rarray, _shape_strides, arrfunc);
+                RUN_SUM(uint8_t, ndarray, array, results, rarray, _shape_strides);
             } else if(ndarray->dtype.type == NDARRAY_INT8) {
-                RUN_SUM(int8_t, ndarray, array, results, rarray, _shape_strides, arrfunc);
+                RUN_SUM(int8_t, ndarray, array, results, rarray, _shape_strides);
             } else if(ndarray->dtype.type == NDARRAY_UINT16) {
-                RUN_SUM(uint16_t, ndarray, array, results, rarray, _shape_strides, arrfunc);
+                RUN_SUM(uint16_t, ndarray, array, results, rarray, _shape_strides);
             } else if(ndarray->dtype.type == NDARRAY_INT16) {
-                RUN_SUM(int16_t, ndarray, array, results, rarray, _shape_strides, arrfunc);
+                RUN_SUM(int16_t, ndarray, array, results, rarray, _shape_strides);
             } else {
                 // for floats, the sum might be inaccurate with the naive summation
                 // call mean, and multiply with the number of samples
                 farray = (mp_float_t *)results->array;
-                RUN_MEAN_STD(mp_float_t, ndarray, array, farray, _shape_strides, 0.0, 0, arrfunc);
+                RUN_MEAN_STD(mp_float_t, ndarray, array, farray, _shape_strides, 0.0, 0);
                 mp_float_t norm = (mp_float_t)_shape_strides.shape[0];
                 // re-wind the array here
                 farray = (mp_float_t *)results->array;
@@ -306,15 +308,15 @@ static mp_obj_t numerical_sum_mean_std_ndarray(ndarray_obj_t *ndarray, mp_obj_t
             }
             mp_float_t div = optype == NUMERICAL_STD ? (mp_float_t)(_shape_strides.shape[0] - ddof) : 0.0;
             if(ndarray->dtype.type == NDARRAY_UINT8) {
-                RUN_MEAN_STD(uint8_t, ndarray, array, farray, _shape_strides, div, isStd, arrfunc);
+                RUN_MEAN_STD(uint8_t, ndarray, array, farray, _shape_strides, div, isStd);
             } else if(ndarray->dtype.type == NDARRAY_INT8) {
-                RUN_MEAN_STD(int8_t, ndarray, array, farray, _shape_strides, div, isStd, arrfunc);
+                RUN_MEAN_STD(int8_t, ndarray, array, farray, _shape_strides, div, isStd);
             } else if(ndarray->dtype.type == NDARRAY_UINT16) {
-                RUN_MEAN_STD(uint16_t, ndarray, array, farray, _shape_strides, div, isStd, arrfunc);
+                RUN_MEAN_STD(uint16_t, ndarray, array, farray, _shape_strides, div, isStd);
             } else if(ndarray->dtype.type == NDARRAY_INT16) {
-                RUN_MEAN_STD(int16_t, ndarray, array, farray, _shape_strides, div, isStd, arrfunc);
+                RUN_MEAN_STD(int16_t, ndarray, array, farray, _shape_strides, div, isStd);
             } else {
-                RUN_MEAN_STD(mp_float_t, ndarray, array, farray, _shape_strides, div, isStd, arrfunc);
+                RUN_MEAN_STD(mp_float_t, ndarray, array, farray, _shape_strides, div, isStd);
             }
         }
         if(results->ndim == 0) { // return a scalar here
diff --git a/code/numpy/numerical/numerical.h b/code/numpy/numerical/numerical.h
index 511d65e8..9e2d593e 100644
--- a/code/numpy/numerical/numerical.h
+++ b/code/numpy/numerical/numerical.h
@@ -59,18 +59,19 @@
     (rarray) += (results)->itemsize;\
 })
 #else
-#define RUN_SUM1(type, ndarray, array, results, rarray, ss, arrfunc)\
+#define RUN_SUM1(type, ndarray, array, results, rarray, ss)\
 ({\
     type sum = 0;\
+    uint8_t *barray = (array);\
     int32_t increment = (ss).strides[0];\
-    if((ndarray)->dtype.flags) {\
-        (arrfunc)((ndarray), (array), &increment, (ss).shape[0]);\
-    } else {\
-        (ndarray)->dtype.subarray = array;\
+    if((ndarray)->flags) {\
+        void (*arrfunc)(ndarray_obj_t *, void *, int32_t *, size_t) = (ndarray)->block->arrfunc;\
+        arrfunc((ndarray), (array), &increment, (ss).shape[0]);\
+        barray = (ndarray)->block->subarray;\
     }\
     for(size_t i=0; i < (ss).shape[0]; i++) {\
-        sum += *((type *)((ndarray)->dtype.subarray));\
-        (ndarray)->dtype.subarray += increment;\
+        sum += *((type *)(barray));\
+        barray += increment;\
     }\
     (array) += (ss).shape[0] * (ss).strides[0];\
     memcpy((rarray), &sum, (results)->itemsize);\
@@ -78,30 +79,30 @@
 })
 #endif
 
-#define RUN_SUM2(type, ndarray, array, results, rarray, ss, arrfunc) do {\
+#define RUN_SUM2(type, ndarray, array, results, rarray, ss) do {\
     size_t l = 0;\
     do {\
-        RUN_SUM1(type, (ndarray), (array), (results), (rarray), (ss), (arrfunc));\
+        RUN_SUM1(type, (ndarray), (array), (results), (rarray), (ss));\
         (array) -= (ss).strides[0] * (ss).shape[0];\
         (array) += (ss).strides[ULAB_MAX_DIMS - 1];\
         l++;\
     } while(l < (ss).shape[ULAB_MAX_DIMS - 1]);\
 } while(0)
 
-#define RUN_SUM3(type, ndarray, array, results, rarray, ss, arrfunc) do {\
+#define RUN_SUM3(type, ndarray, array, results, rarray, ss) do {\
     size_t k = 0;\
     do {\
-        RUN_SUM2(type, (ndarray), (array), (results), (rarray), (ss), (arrfunc));\
+        RUN_SUM2(type, (ndarray), (array), (results), (rarray), (ss));\
         (array) -= (ss).strides[ULAB_MAX_DIMS - 1] * (ss).shape[ULAB_MAX_DIMS - 1];\
         (array) += (ss).strides[ULAB_MAX_DIMS - 2];\
         k++;\
     } while(k < (ss).shape[ULAB_MAX_DIMS - 2]);\
 } while(0)
 
-#define RUN_SUM4(type, ndarray, array, results, rarray, ss, arrfunc) do {\
+#define RUN_SUM4(type, ndarray, array, results, rarray, ss) do {\
     size_t j = 0;\
     do {\
-        RUN_SUM3(type, (ndarray), (array), (results), (rarray), (ss), (arrfunc));\
+        RUN_SUM3(type, (ndarray), (array), (results), (rarray), (ss));\
         (array) -= (ss).strides[ULAB_MAX_DIMS - 2] * (ss).shape[ULAB_MAX_DIMS - 2];\
         (array) += (ss).strides[ULAB_MAX_DIMS - 3];\
         j++;\
@@ -111,8 +112,9 @@
 // Instead of the straightforward implementation of the definition,
 // we take the numerically stable Welford algorithm here
 // https://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/
+
 #if !(ULAB_HAS_BLOCKS)
-#define RUN_MEAN_STD1(type, ndarray, array, rarray, ss, div, isStd, arrfunc)\
+#define RUN_MEAN_STD1(type, ndarray, array, rarray, ss, div, isStd)\
 ({\
     mp_float_t M = 0.0, m = 0.0, S = 0.0;\
     for(size_t i=0; i < (ss).shape[0]; i++) {\
@@ -127,52 +129,54 @@
     *(rarray)++ = isStd ? MICROPY_FLOAT_C_FUN(sqrt)(S / (div)) : M;\
 })
 #else
-#define RUN_MEAN_STD1(type, ndarray, array, rarray, ss, div, isStd, arrfunc)\
+#define RUN_MEAN_STD1(type, ndarray, array, rarray, ss, div, isStd)\
 ({\
     mp_float_t M = 0.0, m = 0.0, S = 0.0;\
+    uint8_t *barray = (array);\
     int32_t increment = (ss).strides[0];\
-    (ndarray)->dtype.subarray = (array);\
-    if((ndarray)->dtype.flags) {\
-        (arrfunc)((ndarray), (array), &increment, (ss).shape[0]);\
+    if((ndarray)->flags) {\
+        void (*arrfunc)(ndarray_obj_t *, void *, int32_t *, size_t) = (ndarray)->block->arrfunc;\
+        arrfunc((ndarray), (array), &increment, (ss).shape[0]);\
+        barray = (ndarray)->block->subarray;\
     }\
     for(size_t i=0; i < (ss).shape[0]; i++) {\
-        mp_float_t value = (mp_float_t)(*(type *)((ndarray)->dtype.subarray));\
+        mp_float_t value = (mp_float_t)(*(type *)(barray));\
         m = M + (value - M) / (mp_float_t)(i+1);\
         if(isStd) {\
             S += (value - M) * (value - m);\
         }\
         M = m;\
-        (ndarray)->dtype.subarray += increment;\
+        barray += increment;\
     }\
     (array) += (ss).shape[0] * (ss).strides[0];\
     *(rarray)++ = isStd ? MICROPY_FLOAT_C_FUN(sqrt)(S / (div)) : M;\
 })
 #endif
 
-#define RUN_MEAN_STD2(type, ndarray, array, rarray, ss, div, isStd, arrfunc) do {\
+#define RUN_MEAN_STD2(type, ndarray, array, rarray, ss, div, isStd) do {\
     size_t l = 0;\
     do {\
-        RUN_MEAN_STD1(type, (ndarray), (array), (rarray), (ss), (div), (isStd), (arrfunc));\
+        RUN_MEAN_STD1(type, (ndarray), (array), (rarray), (ss), (div), (isStd));\
         (array) -= (ss).strides[0] * (ss).shape[0];\
         (array) += (ss).strides[ULAB_MAX_DIMS - 1];\
         l++;\
     } while(l < (ss).shape[ULAB_MAX_DIMS - 1]);\
 } while(0)
 
-#define RUN_MEAN_STD3(type, ndarray, array, rarray, ss, div, isStd, arrfunc) do {\
+#define RUN_MEAN_STD3(type, ndarray, array, rarray, ss, div, isStd) do {\
     size_t k = 0;\
     do {\
-        RUN_MEAN_STD2(type, (ndarray), (array), (rarray), (ss), (div), (isStd), (arrfunc));\
+        RUN_MEAN_STD2(type, (ndarray), (array), (rarray), (ss), (div), (isStd));\
         (array) -= (ss).strides[ULAB_MAX_DIMS - 1] * (ss).shape[ULAB_MAX_DIMS - 1];\
         (array) += (ss).strides[ULAB_MAX_DIMS - 2];\
         k++;\
     } while(k < (ss).shape[ULAB_MAX_DIMS - 2]);\
 } while(0)
 
-#define RUN_MEAN_STD4(type, ndarray, array, rarray, ss, div, isStd, arrfunc) do {\
+#define RUN_MEAN_STD4(type, ndarray, array, rarray, ss, div, isStd) do {\
     size_t j = 0;\
     do {\
-        RUN_MEAN_STD3(type, (ndarray), (array), (rarray), (ss), (div), (isStd), (arrfunc));\
+        RUN_MEAN_STD3(type, (ndarray), (array), (rarray), (ss), (div), (isStd));\
         (array) -= (ss).strides[ULAB_MAX_DIMS - 2] * (ss).shape[ULAB_MAX_DIMS - 2];\
         (array) += (ss).strides[ULAB_MAX_DIMS - 3];\
         j++;\
diff --git a/code/ulab_tools.c b/code/ulab_tools.c
index b84f4a1c..b3a1499f 100644
--- a/code/ulab_tools.c
+++ b/code/ulab_tools.c
@@ -229,18 +229,3 @@ ndarray_obj_t *tools_object_is_square(mp_obj_t obj) {
     return ndarray;
 }
 #endif
-
-#if ULAB_HAS_BLOCKS
-size_t *tools_coords_from_pointer(void *p1, ndarray_obj_t *ndarray) {
-    // calculates the coordinates in the original tensor from the position of the pointer
-    // The original view is assumed to be dense, i.e., the strides can be computed from the shape
-    size_t diff = (uint8_t *)p1 - (uint8_t *)ndarray->dtype.origin;
-    size_t accumulator = 1;
-    size_t *coords = m_new(size_t, ULAB_MAX_DIMS);
-    for(uint8_t i = 1; i < ndarray->ndim + 1; i++) {
-        accumulator *= ndarray->dtype.shape[ULAB_MAX_DIMS - i];
-        coords[ULAB_MAX_DIMS - i] = diff % accumulator;
-    }
-    return coords;
-}
-#endif
diff --git a/code/ulab_tools.h b/code/ulab_tools.h
index 3804a462..378e4f0c 100644
--- a/code/ulab_tools.h
+++ b/code/ulab_tools.h
@@ -34,5 +34,4 @@ void *ndarray_set_float_function(uint8_t );
 
 shape_strides tools_reduce_axes(ndarray_obj_t *, mp_obj_t );
 ndarray_obj_t *tools_object_is_square(mp_obj_t );
-size_t *tools_coords_from_pointer(void *, ndarray_obj_t *);
 #endif

From 702eb0b9ed8c765704547275d7b821eb51c10a27 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Zolt=C3=A1n=20V=C3=B6r=C3=B6s?= <zvoros@gmail.com>
Date: Sat, 27 Feb 2021 16:26:18 +0100
Subject: [PATCH 14/19] simplified dtype printing

---
 code/blocks/blocks.c | 157 ++++++++++++++++++++++++++++++++++++-------
 code/blocks/blocks.h |   5 +-
 code/ndarray.c       |  82 ++++++++++------------
 code/ndarray.h       |   4 +-
 4 files changed, 171 insertions(+), 77 deletions(-)

diff --git a/code/blocks/blocks.c b/code/blocks/blocks.c
index 49f1bcc6..c1f98bd9 100644
--- a/code/blocks/blocks.c
+++ b/code/blocks/blocks.c
@@ -16,16 +16,17 @@
 #include "py/runtime.h"
 #include "py/misc.h"
 
+#include "ndarray.h"
 #include "../ulab_tools.h"
 #include "blocks.h"
 
 #if ULAB_HAS_BLOCKS
 
-extern const mp_obj_type_t block_function_type;
+extern const mp_obj_type_t blocks_transformer_type;
 
-const mp_obj_type_t block_function_type = {
+const mp_obj_type_t blocks_transformer_type = {
     { &mp_type_type },
-    .name = MP_QSTR_block_function,
+    .name = MP_QSTR_transformer,
 };
 
 extern const mp_obj_type_t imreader_type;
@@ -56,11 +57,11 @@ mp_obj_t imreader_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_kw
     mp_arg_val_t _args[MP_ARRAY_SIZE(allowed_args)];
     mp_arg_parse_all(n_args, args, &kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, _args);
 
-    blocks_function_obj_t *block_function = m_new_obj(blocks_function_obj_t);
-    block_function->base.type = &block_function_type;
-    void (*arrfunc)(ndarray_obj_t *, void *, int32_t *, size_t) = imreader_imreader;
-    block_function->arrfunc = arrfunc;
-    return MP_OBJ_FROM_PTR(block_function);
+    blocks_transformer_obj_t *transformer = m_new_obj(blocks_transformer_obj_t);
+    transformer->base.type = &blocks_transformer_type;
+    transformer->arrfunc = imreader_imreader;
+    transformer->array = NULL;
+    return MP_OBJ_FROM_PTR(transformer);
 }
 
 const mp_obj_type_t imreader_type = {
@@ -94,23 +95,9 @@ void blocks_block_print(const mp_print_t *print, mp_obj_t self_in, mp_print_kind
     mp_printf(print, "%ld), ", ndarray->shape[ULAB_MAX_DIMS - 1]);
     // mp_printf(print, "transformer=%s, ", self->);
     // this is duplicate from ndarray.c:ndarray_print, but allows complete decoupling
-    if(ndarray->boolean == NDARRAY_BOOL) {
-        mp_print_str(print, "dtype=bool)");
-    } else if(ndarray->dtype.type == NDARRAY_UINT8) {
-        mp_print_str(print, "dtype=uint8)");
-    } else if(ndarray->dtype.type == NDARRAY_INT8) {
-        mp_print_str(print, "dtype=int8)");
-    } else if(ndarray->dtype.type == NDARRAY_UINT16) {
-        mp_print_str(print, "dtype=uint16)");
-    } else if(ndarray->dtype.type == NDARRAY_INT16) {
-        mp_print_str(print, "dtype=int16)");
-    } else {
-        #if MICROPY_FLOAT_IMPL == MICROPY_FLOAT_IMPL_FLOAT
-        mp_print_str(print, "dtype=float32)");
-        #else
-        mp_print_str(print, "dtype=float64)");
-        #endif
-    }
+    mp_print_str(print, "dtype=");
+    ndarray_print_dtype(print, ndarray);
+    mp_print_str(print, ")");
 }
 
 const mp_obj_type_t blocks_block_type = {
@@ -172,7 +159,11 @@ mp_obj_t blocks_new_ndarray(size_t n_args, const mp_obj_t *pos_args, mp_map_t *k
     memcpy(&(block->shape), &(ndarray->shape), sizeof(size_t) * ULAB_MAX_DIMS);
     // store the original address of the array; block->origin should never be touched
     block->origin = ndarray->array;
-    block->arrfunc = imreader_imreader;
+
+    // get the pointer to the reader function
+    blocks_transformer_obj_t *transformer = MP_OBJ_TO_PTR(args[1].u_obj);
+    block->arrfunc = transformer->arrfunc;
+    ndarray->array = transformer->array;
     ndarray->block = block;
     return ndarray;
 }
@@ -194,3 +185,117 @@ mp_obj_module_t ulab_blocks_module = {
 };
 
 #endif
+
+// typedef struct _imreader_obj_t {
+//     mp_obj_base_t base;
+//     void *arrfunc;
+// } imreader_obj_t;
+
+/*
+mp_obj_t blocks_block_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_kw, const mp_obj_t *args) {
+    (void) type;
+    mp_arg_check_num(n_args, n_kw, 0, 1, true);
+    mp_map_t kw_args;
+    mp_map_init_fixed_table(&kw_args, n_kw, args + n_args);
+
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_OBJ, { .u_obj = mp_const_none } },
+    };
+    mp_arg_val_t _args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, args, &kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, _args);
+
+    blocks_function_obj_t *function = m_new_obj(blocks_obj_t);
+    function->base.type = &blocks_block_type;
+
+    return MP_OBJ_FROM_PTR(function);
+}
+*/
+
+/*
+mp_obj_t blocks_block_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_kw, const mp_obj_t *args) {
+    (void) type;
+    mp_arg_check_num(n_args, n_kw, 0, 1, true);
+    mp_map_t kw_args;
+    mp_map_init_fixed_table(&kw_args, n_kw, args + n_args);
+
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_OBJ, { .u_obj = mp_const_none } },
+    };
+    mp_arg_val_t _args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, args, &kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, _args);
+
+    blocks_function_obj_t *function = m_new_obj(blocks_obj_t);
+    function->base.type = &blocks_block_type;
+
+    return MP_OBJ_FROM_PTR(function);
+}
+*/
+
+#if 0
+static void blocks_imreader(ndarray_obj_t *ndarray, void *array, int32_t *strides, size_t count) {
+    printf("imreader: %p\n", ndarray->dtype.subarray);
+    uint8_t *subarray = (uint8_t *)ndarray->dtype.subarray;
+    // if necessary, get the coordinates in the original reference frame, i.e.,
+    // in the coordinates used at the time of the creation of the object
+    // size_t *coords = tools_coords_from_pointer(array, ndarray);
+    for(size_t i = 0; i < count; i++) {
+        // fill up the array with dummy data
+        *subarray++ = (uint8_t)i*i;
+         // array += *strides/ndarray->itemsize
+    }
+    // since strides is going to be used in computation loops, and subarray is
+    // meant to be a dense array, simply overwrite strides with the itemsize
+    *strides = ndarray->itemsize;
+}
+
+static mp_obj_t blocks_imreader_function(void) {
+    blocks_function_obj_t *func = m_new(blocks_function_obj_t, 1);
+    void (*imreader)(ndarray_obj_t *, void *, int32_t *, size_t) = blocks_imreader;
+    func->arrfunc = imreader;
+    return MP_OBJ_FROM_PTR(func);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_0(blocks_imreader_function_obj, blocks_imreader_function);
+#endif
+
+#if 0
+mp_obj_t blocks_ndarray(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_shape, MP_ARG_OBJ | MP_ARG_REQUIRED, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_transformer, MP_ARG_KW_ONLY | MP_ARG_OBJ | MP_ARG_REQUIRED, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_dtype, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_obj = MP_ROM_INT(NDARRAY_FLOAT) } },
+    };
+
+    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
+
+    mp_obj_tuple_t *shape_tuple = MP_OBJ_TO_PTR(args[0].u_obj);
+    if(shape_tuple->len > ULAB_MAX_DIMS) {
+        mp_raise_ValueError(translate("too many dimensions"));
+    }
+    uint8_t _dtype;
+    #if ULAB_HAS_DTYPE_OBJECT
+    if(MP_OBJ_IS_TYPE(args[1].u_obj, &ulab_dtype_type)) {
+        dtype_obj_t *dtype = MP_OBJ_TO_PTR(args[2].u_obj);
+        _dtype = dtype->dtype.type;
+    } else {
+        _dtype = mp_obj_get_int(args[2].u_obj);
+    }
+    #else
+    _dtype = mp_obj_get_int(args[2].u_obj);
+    #endif
+
+    if(!MP_OBJ_IS_TYPE(args[1].u_obj, &blocks_block_function_type)) {
+        mp_raise_TypeError(translate("transformer must be a block function object"));
+    }
+    blocks_block_function_obj_t *transformer = MP_OBJ_TO_PTR(args[1].u_obj);
+    (void)transformer;
+    void (*arrfunc)(ndarray_obj_t *, void *, int32_t *, size_t) = blocks_imreader;
+    ndarray_obj_t *ndarray = blocks_new_ndarray(args[0].u_obj, _dtype, arrfunc);
+    ndarray->dtype.arrfunc = arrfunc;
+    printf("%p\n", ndarray->dtype.subarray);
+    return MP_OBJ_FROM_PTR(ndarray);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(blocks_ndarray_obj, 0, blocks_ndarray);
+#endif
diff --git a/code/blocks/blocks.h b/code/blocks/blocks.h
index d227666a..ae71fa8a 100644
--- a/code/blocks/blocks.h
+++ b/code/blocks/blocks.h
@@ -19,10 +19,11 @@
 #define BLOCK_IS_READ_ONLY          0x01
 #define BLOCK_IS_READ_WRITE         0x02
 
-typedef struct _blocks_function_obj_t {
+typedef struct _blocks_transformer_obj_t {
     mp_obj_base_t base;
     void *arrfunc;
-} blocks_function_obj_t;
+    void *array;
+} blocks_transformer_obj_t;
 
 extern const mp_obj_type_t blocks_block_type;
 extern mp_obj_module_t ulab_blocks_module;
diff --git a/code/ndarray.c b/code/ndarray.c
index 4cdb14a3..73a70624 100644
--- a/code/ndarray.c
+++ b/code/ndarray.c
@@ -316,29 +316,42 @@ void fill_array_iterable(mp_float_t *array, mp_obj_t iterable) {
     }
 }
 
+// Prints the bare name of a dtype. The type code and the boolean marker are
+// passed separately, so that both ndarray and dtype objects can share the helper.
+static void ndarray_print_dtype_name(const mp_print_t *print, uint8_t type, uint8_t boolean) {
+    if(boolean) {
+        mp_print_str(print, "bool");
+    } else if(type == NDARRAY_UINT8) {
+        mp_print_str(print, "uint8");
+    } else if(type == NDARRAY_INT8) {
+        mp_print_str(print, "int8");
+    } else if(type == NDARRAY_UINT16) {
+        mp_print_str(print, "uint16");
+    } else if(type == NDARRAY_INT16) {
+        mp_print_str(print, "int16");
+    } else if(type == NDARRAY_FLOAT) {
+        #if MICROPY_FLOAT_IMPL == MICROPY_FLOAT_IMPL_FLOAT
+        mp_print_str(print, "float32");
+        #else
+        mp_print_str(print, "float64");
+        #endif
+    }
+}
+
+// Prints the dtype name of an ndarray, without quotes or a trailing newline
+void ndarray_print_dtype(const mp_print_t *print, ndarray_obj_t *ndarray) {
+    ndarray_print_dtype_name(print, ndarray->dtype.type, ndarray->boolean);
+}
+
 #if NDARRAY_HAS_DTYPE
 #if ULAB_HAS_DTYPE_OBJECT
 void ndarray_dtype_print(const mp_print_t *print, mp_obj_t self_in, mp_print_kind_t kind) {
     (void)kind;
     dtype_obj_t *self = MP_OBJ_TO_PTR(self_in);
     mp_print_str(print, "dtype('");
-    if(self->dtype.type == NDARRAY_BOOLEAN) {
-        mp_print_str(print, "bool')");
-    } else if(self->dtype.type == NDARRAY_UINT8) {
-        mp_print_str(print, "uint8')");
-    } else if(self->dtype.type == NDARRAY_INT8) {
-        mp_print_str(print, "int8')");
-    } else if(self->dtype.type == NDARRAY_UINT16) {
-        mp_print_str(print, "uint16')");
-    } else if(self->dtype.type == NDARRAY_INT16) {
-        mp_print_str(print, "int16')");
-    } else {
-        #if MICROPY_FLOAT_IMPL == MICROPY_FLOAT_IMPL_FLOAT
-        mp_print_str(print, "float32')");
-        #else
-        mp_print_str(print, "float64')");
-        #endif
-    }
+    // self is a dtype_obj_t, not an ndarray_obj_t: hand over the fields explicitly
+    ndarray_print_dtype_name(print, self->dtype.type, self->dtype.type == NDARRAY_BOOLEAN);
+    mp_print_str(print, "')");
 }
 
 mp_obj_t ndarray_dtype_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_kw, const mp_obj_t *args) {
@@ -553,23 +558,9 @@ void ndarray_print(const mp_print_t *print, mp_obj_t self_in, mp_print_kind_t ki
     } while(i < self->shape[ULAB_MAX_DIMS-4]);
     ndarray_print_bracket(print, 0, self->shape[ULAB_MAX_DIMS-4], "]");
     #endif
-    if(self->boolean) {
-        mp_print_str(print, ", dtype=bool)");
-    } else if(self->dtype.type == NDARRAY_UINT8) {
-        mp_print_str(print, ", dtype=uint8)");
-    } else if(self->dtype.type == NDARRAY_INT8) {
-        mp_print_str(print, ", dtype=int8)");
-    } else if(self->dtype.type == NDARRAY_UINT16) {
-        mp_print_str(print, ", dtype=uint16)");
-    } else if(self->dtype.type == NDARRAY_INT16) {
-        mp_print_str(print, ", dtype=int16)");
-    } else {
-        #if MICROPY_FLOAT_IMPL == MICROPY_FLOAT_IMPL_FLOAT
-        mp_print_str(print, ", dtype=float32)");
-        #else
-        mp_print_str(print, ", dtype=float64)");
-        #endif
-    }
+    mp_print_str(print, ", dtype=");
+    ndarray_print_dtype(print, self);
+    mp_print_str(print, ")");
 }
 
 void ndarray_assign_elements(ndarray_obj_t *ndarray, mp_obj_t iterable, uint8_t dtype, size_t *idx) {
@@ -2005,7 +1996,13 @@ mp_obj_t ndarray_info(mp_obj_t obj_in) {
     if(!MP_OBJ_IS_TYPE(ndarray, &ulab_ndarray_type)) {
         mp_raise_TypeError(translate("function is defined for ndarrays only"));
     }
+    #if ULAB_HAS_BLOCKS
+    // non-zero flags select the "block" label; plain ndarrays must still be reported
+    const char *class_name = ndarray->flags ? "block" : "ndarray";
+    mp_printf(MP_PYTHON_PRINTER, "class: %s\n", class_name);
+    #else
     mp_printf(MP_PYTHON_PRINTER, "class: ndarray\n");
+    #endif
     mp_printf(MP_PYTHON_PRINTER, "shape: (");
     if(ndarray->ndim == 1) {
         mp_printf(MP_PYTHON_PRINTER, "%d,", ndarray->shape[ULAB_MAX_DIMS-1]);
@@ -2025,19 +2022,10 @@ mp_obj_t ndarray_info(mp_obj_t obj_in) {
     mp_printf(MP_PYTHON_PRINTER, "itemsize: %d\n", ndarray->itemsize);
     mp_printf(MP_PYTHON_PRINTER, "data pointer: 0x%p\n", ndarray->array);
     mp_printf(MP_PYTHON_PRINTER, "type: ");
-    if(ndarray->boolean) {
-        mp_printf(MP_PYTHON_PRINTER, "bool\n");
-    } else if(ndarray->dtype.type == NDARRAY_UINT8) {
-        mp_printf(MP_PYTHON_PRINTER, "uint8\n");
-    } else if(ndarray->dtype.type == NDARRAY_INT8) {
-        mp_printf(MP_PYTHON_PRINTER, "int8\n");
-    } else if(ndarray->dtype.type == NDARRAY_UINT16) {
-        mp_printf(MP_PYTHON_PRINTER, "uint16\n");
-    } else if(ndarray->dtype.type == NDARRAY_INT16) {
-        mp_printf(MP_PYTHON_PRINTER, "int16\n");
-    } else if(ndarray->dtype.type == NDARRAY_FLOAT) {
-        mp_printf(MP_PYTHON_PRINTER, "float\n");
-    }
+    ndarray_print_dtype(MP_PYTHON_PRINTER, ndarray);
+    #if ULAB_HAS_BLOCKS
+    mp_printf(MP_PYTHON_PRINTER, "\nflags: %d\n", ndarray->flags);
+    #endif
     return mp_const_none;
 }
 
diff --git a/code/ndarray.h b/code/ndarray.h
index 4d3e5cda..1483e879 100644
--- a/code/ndarray.h
+++ b/code/ndarray.h
@@ -101,8 +101,6 @@ typedef struct _dtype_obj_t {
     dtype_dtype dtype;
 } dtype_obj_t;
 
-void ndarray_dtype_print(const mp_print_t *, mp_obj_t , mp_print_kind_t );
-
 #ifdef CIRCUITPY
 mp_obj_t ndarray_dtype_make_new(const mp_obj_type_t *type, size_t n_args, const mp_obj_t *args, mp_map_t *kw_args);
 #else
@@ -110,6 +108,8 @@ mp_obj_t ndarray_dtype_make_new(const mp_obj_type_t *, size_t , size_t , const m
 #endif /* CIRCUITPY */
 #endif /* ULAB_HAS_DTYPE_OBJECT */
 
+void ndarray_print_dtype(const mp_print_t *, ndarray_obj_t *);
+
 mp_obj_t ndarray_new_ndarray_iterator(mp_obj_t , mp_obj_iter_buf_t *);
 
 mp_float_t ndarray_get_float_value(void *, uint8_t );

From 183d9aabdb489ef53d913af13ca6a2d246f07cc4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Zolt=C3=A1n=20V=C3=B6r=C3=B6s?= <zvoros@gmail.com>
Date: Mon, 1 Mar 2021 18:23:45 +0100
Subject: [PATCH 15/19] updated programming section in manual, moved imread to
 the user module

---
 code/blocks/blocks.c             |  45 --------------
 code/blocks/blocks.h             |   2 +
 code/ndarray.c                   |   7 ++-
 code/numpy/numerical/numerical.c |  21 ++++++-
 code/numpy/numerical/numerical.h |   3 +-
 code/ulab.h                      |   2 +-
 code/user/user.c                 |  43 +++++++++++++
 docs/ulab-programming.ipynb      | 102 +++++++++++++++++++++++++++++--
 8 files changed, 168 insertions(+), 57 deletions(-)

diff --git a/code/blocks/blocks.c b/code/blocks/blocks.c
index c1f98bd9..11df891f 100644
--- a/code/blocks/blocks.c
+++ b/code/blocks/blocks.c
@@ -22,54 +22,11 @@
 
 #if ULAB_HAS_BLOCKS
 
-extern const mp_obj_type_t blocks_transformer_type;
-
 const mp_obj_type_t blocks_transformer_type = {
     { &mp_type_type },
     .name = MP_QSTR_transformer,
 };
 
-extern const mp_obj_type_t imreader_type;
-
-void imreader_imreader(ndarray_obj_t *ndarray, void *array, int32_t *strides, size_t count) {
-    blocks_block_obj_t *block = (blocks_block_obj_t *)ndarray->block;
-    uint8_t *barray = (uint8_t *)block->subarray;
-    // if necessary, get the coordinates in the original reference frame, i.e.,
-    // in the coordinates used at the time of the creation of the object
-    // size_t *coords = blocks_coords_from_pointer(array, ndarray);
-    for(size_t i = 0; i < count; i++) {
-        // fill up the array with dummy data
-        *barray++ = (uint8_t)i*i;
-    }
-    // The subarray is a forward propagating dense array, so set the strides to the itemsize
-    *strides = ndarray->itemsize;
-}
-
-mp_obj_t imreader_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_kw, const mp_obj_t *args) {
-    (void) type;
-    mp_arg_check_num(n_args, n_kw, 0, 1, true);
-    mp_map_t kw_args;
-    mp_map_init_fixed_table(&kw_args, n_kw, args + n_args);
-
-    static const mp_arg_t allowed_args[] = {
-        { MP_QSTR_, MP_ARG_OBJ, { .u_obj = mp_const_none } },
-    };
-    mp_arg_val_t _args[MP_ARRAY_SIZE(allowed_args)];
-    mp_arg_parse_all(n_args, args, &kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, _args);
-
-    blocks_transformer_obj_t *transformer = m_new_obj(blocks_transformer_obj_t);
-    transformer->base.type = &blocks_transformer_type;
-    transformer->arrfunc = imreader_imreader;
-    transformer->array = NULL;
-    return MP_OBJ_FROM_PTR(transformer);
-}
-
-const mp_obj_type_t imreader_type = {
-    { &mp_type_type },
-    .name = MP_QSTR_imreader,
-    .make_new = imreader_make_new,
-};
-
 size_t *blocks_coords_from_pointer(void *p1, ndarray_obj_t *ndarray) {
     // calculates the coordinates in the original tensor from the position of the pointer
     // The original view is assumed to be dense, i.e., the strides can be computed from the shape
@@ -173,7 +130,6 @@ MP_DEFINE_CONST_FUN_OBJ_KW(blocks_new_ndarray_obj, 0, blocks_new_ndarray);
 static const mp_rom_map_elem_t ulab_blocks_globals_table[] = {
     { MP_OBJ_NEW_QSTR(MP_QSTR___name__), MP_OBJ_NEW_QSTR(MP_QSTR_blocks) },
     { MP_OBJ_NEW_QSTR(MP_QSTR_ndarray), (mp_obj_t)&blocks_new_ndarray_obj },
-    { MP_OBJ_NEW_QSTR(MP_QSTR_imreader), (mp_obj_t)&imreader_type },
     { MP_OBJ_NEW_QSTR(MP_QSTR_block), (mp_obj_t)&blocks_block_type },
 };
 
@@ -293,7 +249,6 @@ mp_obj_t blocks_ndarray(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_ar
     void (*arrfunc)(ndarray_obj_t *, void *, int32_t *, size_t) = blocks_imreader;
     ndarray_obj_t *ndarray = blocks_new_ndarray(args[0].u_obj, _dtype, arrfunc);
     ndarray->dtype.arrfunc = arrfunc;
-    printf("%p\n", ndarray->dtype.subarray);
     return MP_OBJ_FROM_PTR(ndarray);
 }
 
diff --git a/code/blocks/blocks.h b/code/blocks/blocks.h
index ae71fa8a..0f25276a 100644
--- a/code/blocks/blocks.h
+++ b/code/blocks/blocks.h
@@ -25,6 +25,8 @@ typedef struct _blocks_transformer_obj_t {
     void *array;
 } blocks_transformer_obj_t;
 
+extern const mp_obj_type_t blocks_transformer_type;
+
 extern const mp_obj_type_t blocks_block_type;
 extern mp_obj_module_t ulab_blocks_module;
 
diff --git a/code/ndarray.c b/code/ndarray.c
index 73a70624..43d5c316 100644
--- a/code/ndarray.c
+++ b/code/ndarray.c
@@ -242,13 +242,13 @@ void ndarray_fill_array_iterable(mp_float_t *array, mp_obj_t iterable) {
     }
 }
 
-#if ULAB_HAS_FUNCTION_ITERATOR
 size_t *ndarray_new_coords(uint8_t ndim) {
     size_t *coords = m_new(size_t, ndim);
     memset(coords, 0, ndim*sizeof(size_t));
     return coords;
 }
 
+#if ULAB_HAS_FUNCTION_ITERATOR
 void ndarray_rewind_array(uint8_t ndim, uint8_t *array, size_t *shape, int32_t *strides, size_t *coords) {
     // resets the data pointer of a single array, whenever an axis is full
     // since we always iterate over the very last axis, we have to keep track of
@@ -1398,6 +1398,11 @@ mp_obj_t ndarray_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) {
     if (value == MP_OBJ_SENTINEL) { // return value(s)
         return ndarray_get_slice(self, index, NULL);
     } else { // assignment to slices; the value must be an ndarray, or a scalar
+        #if ULAB_HAS_BLOCKS
+        if(self->flags) {
+            mp_raise_ValueError(translate("blocks cannot be assigned to"));
+        }
+        #endif
         ndarray_obj_t *values = ndarray_from_mp_obj(value);
         return ndarray_get_slice(self, index, values);
     }
diff --git a/code/numpy/numerical/numerical.c b/code/numpy/numerical/numerical.c
index f73182be..e2db345d 100644
--- a/code/numpy/numerical/numerical.c
+++ b/code/numpy/numerical/numerical.c
@@ -200,6 +200,15 @@ static mp_obj_t numerical_sum_mean_std_ndarray(ndarray_obj_t *ndarray, mp_obj_t
             return mp_obj_new_float(MICROPY_FLOAT_CONST(0.0));
         }
         mp_float_t (*func)(void *) = ndarray_get_float_function(ndarray->dtype.type);
+        #if ULAB_HAS_BLOCKS
+        void (*arrfunc)(ndarray_obj_t *, void *, int32_t *, size_t) = NULL;
+        if(ndarray->flags) {
+            arrfunc = ndarray->block->arrfunc;
+        }
+        #endif
+        int32_t increment = _shape_strides.strides[ULAB_MAX_DIMS - 1];
+        uint8_t *barray = array;
+
         mp_float_t M = 0.0, m = 0.0, S = 0.0, s = 0.0;
         size_t count = 0;
 
@@ -216,20 +225,29 @@ static mp_obj_t numerical_sum_mean_std_ndarray(ndarray_obj_t *ndarray, mp_obj_t
                 do {
                 #endif
                     size_t l = 0;
+                    // every row starts from the current position in the source array
+                    barray = array;
+                    #if ULAB_HAS_BLOCKS
+                    if(ndarray->flags) {
+                        // arrfunc receives increment by pointer and may overwrite it: restore the stride first
+                        increment = _shape_strides.strides[ULAB_MAX_DIMS - 1];
+                        arrfunc(ndarray, array, &increment, _shape_strides.shape[ULAB_MAX_DIMS - 1]);
+                        barray = ndarray->block->subarray;
+                    }
+                    #endif
                     do {
                         count++;
-                        mp_float_t value = func(array);
+                        mp_float_t value = func(barray);
                         m = M + (value - M) / (mp_float_t)count;
                         if(optype == NUMERICAL_STD) {
                             s = S + (value - M) * (value - m);
                             S = s;
                         }
                         M = m;
-                        array += _shape_strides.strides[ULAB_MAX_DIMS - 1];
+                        barray += increment;
                         l++;
                     } while(l < _shape_strides.shape[ULAB_MAX_DIMS - 1]);
                 #if ULAB_MAX_DIMS > 1
-                    array -= _shape_strides.strides[ULAB_MAX_DIMS - 1] * _shape_strides.shape[ULAB_MAX_DIMS - 1];
                     array += _shape_strides.strides[ULAB_MAX_DIMS - 2];
                     k++;
                 } while(k < _shape_strides.shape[ULAB_MAX_DIMS - 2]);
diff --git a/code/numpy/numerical/numerical.h b/code/numpy/numerical/numerical.h
index 9e2d593e..295e68f7 100644
--- a/code/numpy/numerical/numerical.h
+++ b/code/numpy/numerical/numerical.h
@@ -47,10 +47,9 @@
 })
 
 #if !(ULAB_HAS_BLOCKS)
-#define RUN_SUM1(type, ndarray, array, results, rarray, ss, arrfunc)\
+#define RUN_SUM1(type, ndarray, array, results, rarray, ss)\
 ({\
     type sum = 0;\
-    (void)arrfunc;\
     for(size_t i=0; i < (ss).shape[0]; i++) {\
         sum += *((type *)(array));\
         (array) += (ss).strides[0];\
diff --git a/code/ulab.h b/code/ulab.h
index cf2c05e5..ae2c1b0a 100644
--- a/code/ulab.h
+++ b/code/ulab.h
@@ -625,7 +625,7 @@
 // user-defined module; source of the module and
 // its sub-modules should be placed in code/user/
 #ifndef ULAB_HAS_USER_MODULE
-#define ULAB_HAS_USER_MODULE                (0)
+#define ULAB_HAS_USER_MODULE                (1)
 #endif
 
 #endif
diff --git a/code/user/user.c b/code/user/user.c
index 6b72d4b4..a74db4f3 100644
--- a/code/user/user.c
+++ b/code/user/user.c
@@ -17,6 +17,7 @@
 #include "py/misc.h"
 
 #include "../ulab_tools.h"
+#include "../blocks/blocks.h"
 #include "user.h"
 
 #if ULAB_HAS_USER_MODULE
@@ -82,9 +83,51 @@ static mp_obj_t user_square(mp_obj_t arg) {
 
 MP_DEFINE_CONST_FUN_OBJ_1(user_square_obj, user_square);
 
+extern const mp_obj_type_t imreader_type;
+
+void imreader_imreader(ndarray_obj_t *ndarray, void *array, int32_t *strides, size_t count) {
+    blocks_block_obj_t *block = (blocks_block_obj_t *)ndarray->block;
+    uint8_t *barray = (uint8_t *)block->subarray;
+    // if necessary, get the coordinates in the original reference frame, i.e.,
+    // in the coordinates used at the time of the creation of the object
+    // size_t *coords = blocks_coords_from_pointer(array, ndarray);
+    for(size_t i = 0; i < count; i++) {
+        // fill up the array with dummy data
+        *barray++ = (uint8_t)i * i;
+    }
+    // The subarray is a forward propagating dense array, so set the strides to the itemsize
+    *strides = ndarray->itemsize;
+}
+
+mp_obj_t imreader_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_kw, const mp_obj_t *args) {
+    (void)type;
+    mp_arg_check_num(n_args, n_kw, 0, 1, true);
+    mp_map_t kw_args;
+    mp_map_init_fixed_table(&kw_args, n_kw, args + n_args);
+
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_OBJ, { .u_obj = mp_const_none } },
+    };
+    mp_arg_val_t _args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, args, &kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, _args);
+
+    blocks_transformer_obj_t *transformer = m_new_obj(blocks_transformer_obj_t);
+    transformer->base.type = &blocks_transformer_type;
+    transformer->arrfunc = imreader_imreader;
+    transformer->array = NULL;
+    return MP_OBJ_FROM_PTR(transformer);
+}
+
+const mp_obj_type_t imreader_type = {
+    { &mp_type_type },
+    .name = MP_QSTR_imreader,
+    .make_new = imreader_make_new,
+};
+
 static const mp_rom_map_elem_t ulab_user_globals_table[] = {
     { MP_OBJ_NEW_QSTR(MP_QSTR___name__), MP_OBJ_NEW_QSTR(MP_QSTR_user) },
     { MP_OBJ_NEW_QSTR(MP_QSTR_square), (mp_obj_t)&user_square_obj },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_imreader), (mp_obj_t)&imreader_type },
 };
 
 static MP_DEFINE_CONST_DICT(mp_module_ulab_user_globals, ulab_user_globals_table);
diff --git a/docs/ulab-programming.ipynb b/docs/ulab-programming.ipynb
index 38be5fec..5cba55dc 100644
--- a/docs/ulab-programming.ipynb
+++ b/docs/ulab-programming.ipynb
@@ -40,7 +40,7 @@
     "\n",
     "## Code organisation\n",
     "\n",
-    "As mentioned earlier, the `python` functions are organised into sub-modules at the C level. The C sub-modules can be found in `./ulab/code/`."
+    "As mentioned earlier, functions are organised into sub-modules at the C level, as well as at the `python` level. At the moment, `ulab` has four such sub-modules, namely, `numpy`, `scipy`, `blocks`, and `user`, and the source of the corresponding C sub-modules can be found in `./ulab/code/numpy`, `./ulab/code/scipy`, `./ulab/code/blocks`, and `./ulab/code/user`, respectively."
    ]
   },
   {
@@ -742,9 +742,9 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Working with the blocks module\n",
+    "## Working with the `blocks` module\n",
     "\n",
-    "Version 3 of `ulab` introduced the `blocks` sub-module for extensions. You can enable it by setting the `ULAB_HAS_BLOCKS` constant in [ulab.h](https://github.com/v923z/micropython-ulab/blob/master/code/ulab.h)\n",
+    "Version 3.0 of `ulab` introduced the `blocks` sub-module for extensions. You can enable it by setting the `ULAB_HAS_BLOCKS` constant in [ulab.h](https://github.com/v923z/micropython-ulab/blob/master/code/ulab.h)\n",
     "\n",
     "```c\n",
     "#ifndef ULAB_HAS_BLOCKS\n",
@@ -758,13 +758,105 @@
     "from ulab import numpy as np\n",
     "from ulab import blocks\n",
     "\n",
-    "from yourmodule import func\n",
+    "from mymodule import func\n",
     "\n",
-    "b = blocks.block(shape=(10, 10), transformer=func, dtype=np.uint8)\n",
+    "b = blocks.block(shape=(10, 10), transformer=func(), dtype=np.uint8)\n",
     "print(np.std(b, axis=0))\n",
     "```"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We have seen how you can write your own `numpy`-compatible functions in the `user` sub-module. What `blocks` enables you to do is the opposite in a sense: you can define your own data containers, write a simple _transformer_ function, and via the transformer, have access to all `ulab` functions.\n",
+    "\n",
+    "But first, what is the point of such an exercise?\n",
+    "\n",
+    "The first use of the `blocks` extension is what we have already stated: standard `numpy`-compatible numerical computations can be done on arbitrary data containers without having to change the core, and without having to compromise `numpy`-compatibility. In other words, if the data can be converted in some way to one of the five native `dtype`s, `ulab` will be able to deal with it. A trivial example is an image, which is encoded in jpeg, PNG or some other format. \n",
+    "\n",
+    "But `blocks` is capable of much more. It allows lazy loading: data are fetched, when they are needed in the computation loops, but otherwise, they do not even have to reside in RAM. This means that you can work with data sets that do not even fit into the RAM of your microcontroller."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### The inner workings of the `blocks` module"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A standard `ndarray` has already been discussed at length. The gist is that it consists of a pointer to the data, and a small header telling the interpreter, how the pointer has to move in computation loops, and how the bytes at the pointer position are to be interpreted. Most of the magic happens in the header, and many operations, e.g., slicing, iterations, reshaping etc. do not even change the underlying data, they simply re-write the header.\n",
+    "\n",
+    "With the help of the `blocks` module, instead of attaching a pointer to actual data, one can attach a pointer to a function that supplies the data, when needed. So, the header of the `ndarray` now looks like \n",
+    "\n",
+    "```c\n",
+    "typedef struct _blocks_block_obj_t {\n",
+    "    mp_obj_base_t base;\n",
+    "    void *ndarray;\n",
+    "    void *arrfunc;\n",
+    "    uint8_t *subarray;\n",
+    "    size_t shape[ULAB_MAX_DIMS];\n",
+    "    void *origin;\n",
+    "} blocks_block_obj_t;\n",
+    "\n",
+    "typedef struct _ndarray_obj_t {\n",
+    "    mp_obj_base_t base;\n",
+    "    dtype_dtype dtype;\n",
+    "    uint8_t itemsize;\n",
+    "    uint8_t boolean;\n",
+    "    uint8_t ndim;\n",
+    "    size_t len;\n",
+    "    size_t shape[ULAB_MAX_DIMS];\n",
+    "    int32_t strides[ULAB_MAX_DIMS];\n",
+    "    void *array;\n",
+    "    #if ULAB_HAS_BLOCKS\n",
+    "    uint8_t flags;\n",
+    "    blocks_block_obj_t *block;\n",
+    "    #endif\n",
+    "} ndarray_obj_t;\n",
+    "\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "and the function will be used as, e.g., in the `RUN_SUM` macro that calculates the sum along a particular axis:\n",
+    "\n",
+    "```c\n",
+    "#define RUN_SUM1(type, ndarray, array, results, rarray, ss)\\\n",
+    "({\\\n",
+    "    type sum = 0;\\\n",
+    "    uint8_t *barray = (array);\\\n",
+    "    int32_t increment = (ss).strides[0];\\\n",
+    "    if((ndarray)->flags) {\\\n",
+    "        void (*arrfunc)(ndarray_obj_t *, void *, int32_t *, size_t) = (ndarray)->block->arrfunc;\\\n",
+    "        arrfunc((ndarray), (array), &increment, (ss).shape[0]);\\\n",
+    "        barray = (ndarray)->block->subarray;\\\n",
+    "    }\\\n",
+    "    for(size_t i=0; i < (ss).shape[0]; i++) {\\\n",
+    "        sum += *((type *)(barray));\\\n",
+    "        barray += increment;\\\n",
+    "    }\\\n",
+    "    (array) += (ss).shape[0] * (ss).strides[0];\\\n",
+    "    memcpy((rarray), &sum, (results)->itemsize);\\\n",
+    "    (rarray) += (results)->itemsize;\\\n",
+    "})\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The `arrfunc` function fills the values in its own void pointer, `subarray`, and this is the array that will be used in the actual iteration. "
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,

From 00ab5bcc1443927d5c954d55ab05880ad29fd196 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Zolt=C3=A1n=20V=C3=B6r=C3=B6s?= <zvoros@gmail.com>
Date: Mon, 1 Mar 2021 21:40:27 +0100
Subject: [PATCH 16/19] block objects can now be iterated over

---
 code/blocks/blocks.c | 113 -------------------------------------------
 code/ndarray.c       |  15 ++++++
 2 files changed, 15 insertions(+), 113 deletions(-)

diff --git a/code/blocks/blocks.c b/code/blocks/blocks.c
index 11df891f..0e496626 100644
--- a/code/blocks/blocks.c
+++ b/code/blocks/blocks.c
@@ -141,116 +141,3 @@ mp_obj_module_t ulab_blocks_module = {
 };
 
 #endif
-
-// typedef struct _imreader_obj_t {
-//     mp_obj_base_t base;
-//     void *arrfunc;
-// } imreader_obj_t;
-
-/*
-mp_obj_t blocks_block_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_kw, const mp_obj_t *args) {
-    (void) type;
-    mp_arg_check_num(n_args, n_kw, 0, 1, true);
-    mp_map_t kw_args;
-    mp_map_init_fixed_table(&kw_args, n_kw, args + n_args);
-
-    static const mp_arg_t allowed_args[] = {
-        { MP_QSTR_, MP_ARG_OBJ, { .u_obj = mp_const_none } },
-    };
-    mp_arg_val_t _args[MP_ARRAY_SIZE(allowed_args)];
-    mp_arg_parse_all(n_args, args, &kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, _args);
-
-    blocks_function_obj_t *function = m_new_obj(blocks_obj_t);
-    function->base.type = &blocks_block_type;
-
-    return MP_OBJ_FROM_PTR(function);
-}
-*/
-
-/*
-mp_obj_t blocks_block_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_kw, const mp_obj_t *args) {
-    (void) type;
-    mp_arg_check_num(n_args, n_kw, 0, 1, true);
-    mp_map_t kw_args;
-    mp_map_init_fixed_table(&kw_args, n_kw, args + n_args);
-
-    static const mp_arg_t allowed_args[] = {
-        { MP_QSTR_, MP_ARG_OBJ, { .u_obj = mp_const_none } },
-    };
-    mp_arg_val_t _args[MP_ARRAY_SIZE(allowed_args)];
-    mp_arg_parse_all(n_args, args, &kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, _args);
-
-    blocks_function_obj_t *function = m_new_obj(blocks_obj_t);
-    function->base.type = &blocks_block_type;
-
-    return MP_OBJ_FROM_PTR(function);
-}
-*/
-
-#if 0
-static void blocks_imreader(ndarray_obj_t *ndarray, void *array, int32_t *strides, size_t count) {
-    printf("imreader: %p\n", ndarray->dtype.subarray);
-    uint8_t *subarray = (uint8_t *)ndarray->dtype.subarray;
-    // if necessary, get the coordinates in the original reference frame, i.e.,
-    // in the coordinates used at the time of the creation of the object
-    // size_t *coords = tools_coords_from_pointer(array, ndarray);
-    for(size_t i = 0; i < count; i++) {
-        // fill up the array with dummy data
-        *subarray++ = (uint8_t)i*i;
-         // array += *strides/ndarray->itemsize
-    }
-    // since strides is going to be used in computation loops, and subarray is
-    // meant to be a dense array, simply overwrite strides with the itemsize
-    *strides = ndarray->itemsize;
-}
-
-static mp_obj_t blocks_imreader_function(void) {
-    blocks_function_obj_t *func = m_new(blocks_function_obj_t, 1);
-    void (*imreader)(ndarray_obj_t *, void *, int32_t *, size_t) = blocks_imreader;
-    func->arrfunc = imreader;
-    return MP_OBJ_FROM_PTR(func);
-}
-
-MP_DEFINE_CONST_FUN_OBJ_0(blocks_imreader_function_obj, blocks_imreader_function);
-#endif
-
-#if 0
-mp_obj_t blocks_ndarray(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
-    static const mp_arg_t allowed_args[] = {
-        { MP_QSTR_shape, MP_ARG_OBJ | MP_ARG_REQUIRED, { .u_rom_obj = mp_const_none } },
-        { MP_QSTR_transformer, MP_ARG_KW_ONLY | MP_ARG_OBJ | MP_ARG_REQUIRED, { .u_rom_obj = mp_const_none } },
-        { MP_QSTR_dtype, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_obj = MP_ROM_INT(NDARRAY_FLOAT) } },
-    };
-
-    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
-    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
-
-    mp_obj_tuple_t *shape_tuple = MP_OBJ_TO_PTR(args[0].u_obj);
-    if(shape_tuple->len > ULAB_MAX_DIMS) {
-        mp_raise_ValueError(translate("too many dimensions"));
-    }
-    uint8_t _dtype;
-    #if ULAB_HAS_DTYPE_OBJECT
-    if(MP_OBJ_IS_TYPE(args[1].u_obj, &ulab_dtype_type)) {
-        dtype_obj_t *dtype = MP_OBJ_TO_PTR(args[2].u_obj);
-        _dtype = dtype->dtype.type;
-    } else {
-        _dtype = mp_obj_get_int(args[2].u_obj);
-    }
-    #else
-    _dtype = mp_obj_get_int(args[2].u_obj);
-    #endif
-
-    if(!MP_OBJ_IS_TYPE(args[1].u_obj, &blocks_block_function_type)) {
-        mp_raise_TypeError(translate("transformer must be a block function object"));
-    }
-    blocks_block_function_obj_t *transformer = MP_OBJ_TO_PTR(args[1].u_obj);
-    (void)transformer;
-    void (*arrfunc)(ndarray_obj_t *, void *, int32_t *, size_t) = blocks_imreader;
-    ndarray_obj_t *ndarray = blocks_new_ndarray(args[0].u_obj, _dtype, arrfunc);
-    ndarray->dtype.arrfunc = arrfunc;
-    return MP_OBJ_FROM_PTR(ndarray);
-}
-
-MP_DEFINE_CONST_FUN_OBJ_KW(blocks_ndarray_obj, 0, blocks_ndarray);
-#endif
diff --git a/code/ndarray.c b/code/ndarray.c
index 43d5c316..a5929872 100644
--- a/code/ndarray.c
+++ b/code/ndarray.c
@@ -724,6 +724,10 @@ ndarray_obj_t *ndarray_new_view(ndarray_obj_t *source, uint8_t ndim, size_t *sha
     uint8_t *pointer = (uint8_t *)source->array;
     pointer += offset;
     ndarray->array = pointer;
+    #if ULAB_HAS_BLOCKS
+    ndarray->flags = source->flags;
+    ndarray->block = source->block;
+    #endif
     return ndarray;
 }
 
@@ -1440,7 +1444,18 @@ mp_obj_t ndarray_iternext(mp_obj_t self_in) {
         if(ndarray->ndim == 1) { // we have a linear array
             array += self->cur * ndarray->strides[ULAB_MAX_DIMS - 1];
             self->cur++;
+            #if ULAB_HAS_BLOCKS
+            if(ndarray->flags) {
+                void (*arrfunc)(ndarray_obj_t *, void *, int32_t *, size_t) = ndarray->block->arrfunc;
+                int32_t increment;
+                arrfunc(ndarray, array, &increment, 1);
+                return ndarray_get_item(ndarray, ndarray->block->subarray);
+            } else {
+                return ndarray_get_item(ndarray, array);
+            }
+            #else
             return ndarray_get_item(ndarray, array);
+            #endif
         } else { // we have a tensor, return the reduced view
             size_t offset = self->cur * ndarray->strides[ULAB_MAX_DIMS - ndarray->ndim];
             self->cur++;

From 4aedf35e57ad1fd6f583b889c8e87ce128fac8a3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Zolt=C3=A1n=20V=C3=B6r=C3=B6s?= <zvoros@gmail.com>
Date: Wed, 3 Mar 2021 20:00:45 +0100
Subject: [PATCH 17/19] no functional change

---
 code/blocks/blocks.c        | 28 +++++++++-------
 code/ndarray.c              |  1 +
 code/ndarray.h              |  1 -
 code/user/user.c            |  6 ++--
 docs/ulab-programming.ipynb | 65 +++++++++++++------------------------
 5 files changed, 44 insertions(+), 57 deletions(-)

diff --git a/code/blocks/blocks.c b/code/blocks/blocks.c
index 0e496626..18c1283f 100644
--- a/code/blocks/blocks.c
+++ b/code/blocks/blocks.c
@@ -28,15 +28,21 @@ const mp_obj_type_t blocks_transformer_type = {
 };
 
 size_t *blocks_coords_from_pointer(void *p1, ndarray_obj_t *ndarray) {
-    // calculates the coordinates in the original tensor from the position of the pointer
+        // Calculates the coordinates in the original tensor from the position of the pointer
     // The original view is assumed to be dense, i.e., the strides can be computed from the shape
+    // This is a utility function, and is not exposed to the python interpreter
     blocks_block_obj_t *block = ndarray->block;
     size_t diff = (uint8_t *)p1 - (uint8_t *)block->origin;
-    size_t accumulator = 1;
+    printf("pointer: 0x%p, 0x%p, 0x%p, %ld\n", p1, ndarray->array, block->origin, diff);
+    size_t accumulator = ndarray->itemsize;
     size_t *coords = m_new(size_t, ULAB_MAX_DIMS);
-    for(uint8_t i = 1; i < ndarray->ndim + 1; i++) {
-        accumulator *= block->shape[ULAB_MAX_DIMS - i];
+
+    for(uint8_t i = 0; i < ndarray->ndim; i++) {
+        accumulator *= block->shape[ULAB_MAX_DIMS - i - 1];
         coords[ULAB_MAX_DIMS - i] = diff % accumulator;
+        // diff -= coords[ULAB_MAX_DIMS - i] * block->shape[ULAB_MAX_DIMS - i];
+        diff -= coords[ULAB_MAX_DIMS - i - 1] * block->shape[ULAB_MAX_DIMS - i - 1];
+        printf("accumulator: %ld\n", diff);
     }
     return coords;
 }
@@ -45,14 +51,14 @@ void blocks_block_print(const mp_print_t *print, mp_obj_t self_in, mp_print_kind
     (void)kind;
     blocks_block_obj_t *self = MP_OBJ_TO_PTR(self_in);
     ndarray_obj_t *ndarray = (ndarray_obj_t *)self->ndarray;
-    mp_print_str(print, "block(shape=(");
-    for(uint8_t i = 0; i < ndarray->ndim - 1; i++) {
-        mp_printf(print, "%ld, ", ndarray->shape[ULAB_MAX_DIMS - ndarray->ndim + i]);
+    mp_printf(print, "block(shape=(%ld,", ndarray->shape[ULAB_MAX_DIMS - ndarray->ndim]);
+    for(uint8_t i = 1; i < ndarray->ndim - 1; i++) {
+        mp_printf(print, " %ld,", ndarray->shape[ULAB_MAX_DIMS - ndarray->ndim + i]);
+    }
+    if(ndarray->ndim > 1) {
+        mp_printf(print, " %ld", ndarray->shape[ULAB_MAX_DIMS - 1]);
     }
-    mp_printf(print, "%ld), ", ndarray->shape[ULAB_MAX_DIMS - 1]);
-    // mp_printf(print, "transformer=%s, ", self->);
-    // this is duplicate from ndarray.c:ndarray_print, but allows complete decoupling
-    mp_print_str(print, "dtype=");
+    mp_print_str(print, "), dtype=");
     ndarray_print_dtype(print, ndarray);
     mp_print_str(print, ")");
 }
diff --git a/code/ndarray.c b/code/ndarray.c
index a5929872..9442dd43 100644
--- a/code/ndarray.c
+++ b/code/ndarray.c
@@ -727,6 +727,7 @@ ndarray_obj_t *ndarray_new_view(ndarray_obj_t *source, uint8_t ndim, size_t *sha
     #if ULAB_HAS_BLOCKS
     ndarray->flags = source->flags;
     ndarray->block = source->block;
+    ndarray->block->ndarray = ndarray;
     #endif
     return ndarray;
 }
diff --git a/code/ndarray.h b/code/ndarray.h
index 1483e879..a029d4a9 100644
--- a/code/ndarray.h
+++ b/code/ndarray.h
@@ -69,7 +69,6 @@ typedef struct _dtype_dtype {
 
 typedef struct _blocks_block_obj_t {
     mp_obj_base_t base;
-    // TODO: can the garbage collector deal with circular references?
     void *ndarray;
     void *arrfunc;
     uint8_t *subarray;
diff --git a/code/user/user.c b/code/user/user.c
index a74db4f3..d195b9e2 100644
--- a/code/user/user.c
+++ b/code/user/user.c
@@ -90,10 +90,12 @@ void imreader_imreader(ndarray_obj_t *ndarray, void *array, int32_t *strides, si
     uint8_t *barray = (uint8_t *)block->subarray;
     // if necessary, get the coordinates in the original reference frame, i.e.,
     // in the coordinates used at the time of the creation of the object
-    // size_t *coords = blocks_coords_from_pointer(array, ndarray);
+    size_t *coords = blocks_coords_from_pointer(array, ndarray);
+    uint8_t x = (uint8_t)coords[ULAB_MAX_DIMS - 2] * (uint8_t)coords[ULAB_MAX_DIMS - 1];
+    printf("coords: %ld, %ld\n", coords[ULAB_MAX_DIMS - 2], coords[ULAB_MAX_DIMS - 1]);
     for(size_t i = 0; i < count; i++) {
         // fill up the array with dummy data
-        *barray++ = (uint8_t)i * i;
+        *barray++ = (uint8_t)(x + i) * (x + i);
     }
     // The subarray is a forward propagating dense array, so set the strides to the itemsize
     *strides = ndarray->itemsize;
diff --git a/docs/ulab-programming.ipynb b/docs/ulab-programming.ipynb
index 5cba55dc..99cdc41f 100644
--- a/docs/ulab-programming.ipynb
+++ b/docs/ulab-programming.ipynb
@@ -81,7 +81,7 @@
    "source": [
     "### Memory layout\n",
     "\n",
-    "The values of an `ndarray` are stored in a contiguous segment in the RAM. The `ndarray` can be dense, meaning that all numbers in the linear memory segment belong to a linar combination of coordinates, and it can also be sparse, i.e., some elements of the linear storage space will be skipped, when the elements of the tensor are traversed. \n",
+    "The values of an `ndarray` are stored in a contiguous segment in the RAM. The `ndarray` can be dense, meaning that all numbers in the linear memory segment belong to a linear combination of coordinates, and it can also be sparse, i.e., some elements of the linear storage space will be skipped, when the elements of the tensor are traversed. \n",
     "\n",
     "In the RAM, the position of the item $M(n_1, n_2, ..., n_{k-1}, n_k)$ in a dense tensor of rank $k$ is given by the linear combination \n",
     "\n",
@@ -98,11 +98,11 @@
     "\n",
     "When creating a *view*, we simply re-calculate the `strides`, and re-set the `*array` pointer.\n",
     "\n",
-    "## Iterating over elements of a tensor\n",
+    "## Iterating over the elements of a tensor\n",
     "\n",
     "The `shape` and `strides` members of the array tell us how we have to move our pointer, when we want to read out the numbers. For technical reasons that will become clear later, the numbers in `shape` and in `strides` are aligned to the right, and begin on the right hand side, i.e., if the number of possible dimensions is `ULAB_MAX_DIMS`, then `shape[ULAB_MAX_DIMS-1]` is the length of the last axis, `shape[ULAB_MAX_DIMS-2]` is the length of the last but one axis, and so on. If the number of actual dimensions, `ndim < ULAB_MAX_DIMS`, the first `ULAB_MAX_DIMS - ndim` entries in `shape` and `strides` will be equal to zero, but they could, in fact, be assigned any value, because these will never be accessed in an operation.\n",
     "\n",
-    "With this definition of the strides, the linear combination in $P(n_1, n_2, ..., n_{k-1}, n_k)$ is a one-to-one mapping from the space of tensor coordinates, $(n_1, n_2, ..., n_{k-1}, n_k)$, and the coordinate in the linear array, $n_1s_1 + n_2s_2 + ... + n_{k-1}s_{k-1} + n_ks_k$, i.e., no two distinct sets of coordinates will result in the same position in the linear array. \n",
+    "With this definition of the strides, the linear combination in $P(n_1, n_2, ..., n_{k-1}, n_k)$ is a one-to-one mapping from the space of tensor coordinates, $(n_1, n_2, ..., n_{k-1}, n_k)$ to the coordinate in the linear array, $n_1s_1 + n_2s_2 + ... + n_{k-1}s_{k-1} + n_ks_k$, i.e., no two distinct sets of coordinates will result in the same position in the linear array. \n",
     "\n",
     "Since the `strides` are given in terms of bytes, when we iterate over an array, the void data pointer is usually cast to `uint8_t`, and the values are converted using the proper data type stored in `ndarray->dtype`. However, there might be cases, when it makes perfect sense to cast `*array` to a different type, in which case the `strides` have to be re-scaled by the value of `ndarray->itemsize`.\n",
     "\n",
@@ -123,39 +123,39 @@
     "                    *(array)++ = f(*((type *)(sarray)));\n",
     "                    (sarray) += (source)->strides[ULAB_MAX_DIMS - 1];\n",
     "                    l++;\n",
-    "                } while(l < (source)->shape[ULAB_MAX_DIMS-1]);\n",
-    "                (sarray) -= (source)->strides[ULAB_MAX_DIMS - 1] * (source)->shape[ULAB_MAX_DIMS-1];\n",
+    "                } while(l < (source)->shape[ULAB_MAX_DIMS - 1]);\n",
+    "                (sarray) -= (source)->strides[ULAB_MAX_DIMS - 1] * (source)->shape[ULAB_MAX_DIMS - 1];\n",
     "                (sarray) += (source)->strides[ULAB_MAX_DIMS - 2];\n",
     "                k++;\n",
-    "            } while(k < (source)->shape[ULAB_MAX_DIMS-2]);\n",
-    "            (sarray) -= (source)->strides[ULAB_MAX_DIMS - 2] * (source)->shape[ULAB_MAX_DIMS-2];\n",
+    "            } while(k < (source)->shape[ULAB_MAX_DIMS - 2]);\n",
+    "            (sarray) -= (source)->strides[ULAB_MAX_DIMS - 2] * (source)->shape[ULAB_MAX_DIMS - 2];\n",
     "            (sarray) += (source)->strides[ULAB_MAX_DIMS - 3];\n",
     "            j++;\n",
-    "        } while(j < (source)->shape[ULAB_MAX_DIMS-3]);\n",
-    "        (sarray) -= (source)->strides[ULAB_MAX_DIMS - 3] * (source)->shape[ULAB_MAX_DIMS-3];\n",
+    "        } while(j < (source)->shape[ULAB_MAX_DIMS - 3]);\n",
+    "        (sarray) -= (source)->strides[ULAB_MAX_DIMS - 3] * (source)->shape[ULAB_MAX_DIMS - 3];\n",
     "        (sarray) += (source)->strides[ULAB_MAX_DIMS - 4];\n",
     "        i++;\n",
-    "    } while(i < (source)->shape[ULAB_MAX_DIMS-4]);\n",
+    "    } while(i < (source)->shape[ULAB_MAX_DIMS - 4]);\n",
     "} while(0)\n",
     "```\n",
     "\n",
     "We start with the innermost loop, the one recursing `l`. `array` is already of type `mp_float_t`, while the source array, `sarray`, has been cast to `uint8_t` in the calling function. The numbers contained in `sarray` have to be read out in the proper type dictated by `ndarray->dtype`. This is what happens in the statement `*((type *)(sarray))`, and this number is then fed into the function `f`. Vectorised mathematical functions produce *dense* arrays, and for this reason, we can simply advance the `array` pointer. \n",
     "\n",
-    "The advancing of the `sarray` pointer is a bit more involving: first, in the innermost loop, we simply move forward by the amount given by the last stride, which is `(source)->strides[ULAB_MAX_DIMS - 1]`, because the `shape` and the `strides` are aligned to the right. We move the pointer as many times as given by `(source)->shape[ULAB_MAX_DIMS-1]`, which is the length of the very last axis. Hence the the structure of the loop\n",
+    "The advancing of the `sarray` pointer is a bit more involved: first, in the innermost loop, we simply move forward by the amount given by the last stride, which is `(source)->strides[ULAB_MAX_DIMS - 1]`, because the `shape` and the `strides` are aligned to the right. We move the pointer as many times as given by `(source)->shape[ULAB_MAX_DIMS - 1]`, which is the length of the very last axis. Hence the structure of the loop\n",
     "\n",
     "```c\n",
     "    size_t l = 0;\n",
     "    do {\n",
     "        ...\n",
     "        l++;\n",
-    "    } while(l < (source)->shape[ULAB_MAX_DIMS-1]);\n",
+    "    } while(l < (source)->shape[ULAB_MAX_DIMS - 1]);\n",
     "\n",
     "```\n",
-    "Once we have exhausted the last axis, we have to re-wind the pointer, and advance it by an amount given by the last but one stride. Keep in mind that in the the innermost loop we moved our pointer `(source)->shape[ULAB_MAX_DIMS-1]` times by `(source)->strides[ULAB_MAX_DIMS - 1]`, i.e., we re-wind it by moving it backwards by `(source)->strides[ULAB_MAX_DIMS - 1] * (source)->shape[ULAB_MAX_DIMS-1]`. In the next step, we move forward by `(source)->strides[ULAB_MAX_DIMS - 2]`, which is the last but one stride. \n",
+    "Once we have exhausted the last axis, we have to re-wind the pointer, and advance it by an amount given by the last but one stride. Keep in mind that in the innermost loop we moved our pointer `(source)->shape[ULAB_MAX_DIMS - 1]` times by `(source)->strides[ULAB_MAX_DIMS - 1]`, i.e., we re-wind it by moving it backwards by `(source)->strides[ULAB_MAX_DIMS - 1] * (source)->shape[ULAB_MAX_DIMS - 1]`. In the next step, we move forward by `(source)->strides[ULAB_MAX_DIMS - 2]`, which is the last but one stride. \n",
     "\n",
     "\n",
     "```c\n",
-    "    (sarray) -= (source)->strides[ULAB_MAX_DIMS - 1] * (source)->shape[ULAB_MAX_DIMS-1];\n",
+    "    (sarray) -= (source)->strides[ULAB_MAX_DIMS - 1] * (source)->shape[ULAB_MAX_DIMS - 1];\n",
     "    (sarray) += (source)->strides[ULAB_MAX_DIMS - 2];\n",
     "\n",
     "```\n",
@@ -199,7 +199,7 @@
     "\n",
     "```c\n",
     "    size_t *coords = ndarray_new_coords(results->ndim);\n",
-    "    for(size_t i=0; i < results->len/results->shape[ULAB_MAX_DIMS -1]; i++) {\n",
+    "    for(size_t i=0; i < results->len/results->shape[ULAB_MAX_DIMS - 1]; i++) {\n",
     "        size_t l = 0;\n",
     "        do {\n",
     "            ...\n",
@@ -773,9 +773,12 @@
     "\n",
     "But first, what is the point of such an exercise?\n",
     "\n",
-    "The first use of the `blocks` extension is what we have already stated: standard `numpy`-compatible numerical computations can be done on arbitrary data containers without having to change the core, and without having to compromise `numpy`-compatibility. In other words, if the data can be converted in some way to one of the five native `dtype`s, `ulab` will be able to deal with it. A trivial example is an image, which is encoded in jpeg, PNG or some other format. \n",
+    "The first use of the `blocks` extension is what we have already stated: standard `numpy`-compatible numerical computations can be done on arbitrary data containers without having to change the core, and without having to compromise `numpy`-compatibility. In other words, if the data can be converted in some way to one of the five native `dtype`s, `ulab` will be able to deal with it. \n",
     "\n",
-    "But `blocks` is capable of much more. It allows lazy loading: data are fetched, when they are needed in the computation loops, but otherwise, they do not even have to reside in RAM. This means that you can work with data sets that do not even fit into the RAM of your microcontroller."
+    "A trivial example is an image, which is encoded in jpeg, PNG or some other format. The image still has a tensorial structure, i.e., it has _x_, _y_, and possibly colour and _alpha_ axes, but the image as such cannot just be fed into an `ndarray`. This is where the transformer function plays its role: in `ulab`, all calculations are carried out along axes, so, if there is a function that can convert the image data axis-wise, and pass it on to the computation routine, then the calculation is, in effect, done on the image itself. \n",
+    "\n",
+    "\n",
+    "However, `blocks` is capable of much more. It allows lazy loading: data are fetched when they are needed in the computation loops, but otherwise, they do not even have to reside in the main RAM. This means that you can work with data sets that are so big that they do not even fit into the RAM of your microcontroller, and you can off-load everything to an SPI RAM that cannot be mapped into the address space of the microcontroller. Data can also come directly from a peripheral device, or can be pixels of a display. "
    ]
   },
   {
@@ -826,37 +829,13 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "and the function will be used as, e.g., in the `RUN_SUM` macro that calculates the sum along a particular axis:\n",
+    "The `arrfunc` function fills the values in its own void pointer, `subarray`, and this is the array that will be used in the actual iteration: \n",
     "\n",
     "```c\n",
-    "#define RUN_SUM1(type, ndarray, array, results, rarray, ss)\\\n",
-    "({\\\n",
-    "    type sum = 0;\\\n",
-    "    uint8_t *barray = (array);\\\n",
-    "    int32_t increment = (ss).strides[0];\\\n",
-    "    if((ndarray)->flags) {\\\n",
-    "        void (*arrfunc)(ndarray_obj_t *, void *, int32_t *, size_t) = (ndarray)->block->arrfunc;\\\n",
-    "        arrfunc((ndarray), (array), &increment, (ss).shape[0]);\\\n",
-    "        barray = (ndarray)->block->subarray;\\\n",
-    "    }\\\n",
-    "    for(size_t i=0; i < (ss).shape[0]; i++) {\\\n",
-    "        sum += *((type *)(barray));\\\n",
-    "        barray += increment;\\\n",
-    "    }\\\n",
-    "    (array) += (ss).shape[0] * (ss).strides[0];\\\n",
-    "    memcpy((rarray), &sum, (results)->itemsize);\\\n",
-    "    (rarray) += (results)->itemsize;\\\n",
-    "})\n",
+    "barray = (ndarray)->block->subarray;\n",
     "```"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "The `arrfunc` function fills the values in its own void pointer, `subarray`, and this is the array that will be used in the actual iteration. "
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,

From d66c10a23bd6316866cab60aef684c2b6253dc5c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Zolt=C3=A1n=20V=C3=B6r=C3=B6s?= <zvoros@gmail.com>
Date: Thu, 4 Mar 2021 07:34:47 +0100
Subject: [PATCH 18/19] fixed blocks_coords_from_pointer

---
 code/blocks/blocks.c             | 18 ++++++++++--------
 code/ndarray.h                   |  1 +
 code/numpy/numerical/numerical.c | 10 ----------
 code/user/user.c                 |  5 ++---
 4 files changed, 13 insertions(+), 21 deletions(-)

diff --git a/code/blocks/blocks.c b/code/blocks/blocks.c
index 18c1283f..9bb9d388 100644
--- a/code/blocks/blocks.c
+++ b/code/blocks/blocks.c
@@ -33,16 +33,17 @@ size_t *blocks_coords_from_pointer(void *p1, ndarray_obj_t *ndarray) {
     // This is a utility function, and is not exposed to the python interpreter
     blocks_block_obj_t *block = ndarray->block;
     size_t diff = (uint8_t *)p1 - (uint8_t *)block->origin;
-    printf("pointer: 0x%p, 0x%p, 0x%p, %ld\n", p1, ndarray->array, block->origin, diff);
-    size_t accumulator = ndarray->itemsize;
+    size_t stride = ndarray->itemsize;
     size_t *coords = m_new(size_t, ULAB_MAX_DIMS);
 
-    for(uint8_t i = 0; i < ndarray->ndim; i++) {
-        accumulator *= block->shape[ULAB_MAX_DIMS - i - 1];
-        coords[ULAB_MAX_DIMS - i] = diff % accumulator;
-        // diff -= coords[ULAB_MAX_DIMS - i] * block->shape[ULAB_MAX_DIMS - i];
-        diff -= coords[ULAB_MAX_DIMS - i - 1] * block->shape[ULAB_MAX_DIMS - i - 1];
-        printf("accumulator: %ld\n", diff);
+    // first, calculate the stride (in bytes) of the leading axis of the dense block
+    for(uint8_t i = 0; i < block->ndim - 1; i++) {
+        stride *= block->shape[ULAB_MAX_DIMS - i - 1];
+    }
+    for(uint8_t i = block->ndim; i > 0; i--) { // walk from the leading axis down to the innermost one
+        coords[ULAB_MAX_DIMS - i] = diff / stride;
+        diff -= coords[ULAB_MAX_DIMS - i] * stride; // remove the bytes already accounted for by this axis
+        stride /= (i > 1) ? block->shape[ULAB_MAX_DIMS - i + 1] : 1; // the innermost axis has no successor
     }
     return coords;
 }
@@ -113,6 +114,7 @@ mp_obj_t blocks_new_ndarray(size_t n_args, const mp_obj_t *pos_args, mp_map_t *k
     ndarray->flags = BLOCK_IS_READ_ONLY;
     blocks_block_obj_t *block = m_new_obj(blocks_block_obj_t);
     block->base.type = &blocks_block_type;
+    block->ndim = ndarray->ndim;
     // store a pointer to the ndarray
     block->ndarray = ndarray;
 
diff --git a/code/ndarray.h b/code/ndarray.h
index a029d4a9..0d4e067d 100644
--- a/code/ndarray.h
+++ b/code/ndarray.h
@@ -69,6 +69,7 @@ typedef struct _dtype_dtype {
 
 typedef struct _blocks_block_obj_t {
     mp_obj_base_t base;
+    uint8_t ndim;
     void *ndarray;
     void *arrfunc;
     uint8_t *subarray;
diff --git a/code/numpy/numerical/numerical.c b/code/numpy/numerical/numerical.c
index e2db345d..c44aea56 100644
--- a/code/numpy/numerical/numerical.c
+++ b/code/numpy/numerical/numerical.c
@@ -279,16 +279,6 @@ static mp_obj_t numerical_sum_mean_std_ndarray(ndarray_obj_t *ndarray, mp_obj_t
         uint8_t *rarray = NULL;
         mp_float_t *farray = NULL;
 
-        // #if ULAB_HAS_BLOCKS
-        // blocks_block_obj_t *block = NULL;
-        // if(ndarray->flags) {
-        //     block = ndarray->block;
-        //     // return mp_const_none;
-        // }
-        // #else
-        // uint8_t block = 0;
-        // #endif
-
         if(optype == NUMERICAL_SUM) {
             results = ndarray_new_dense_ndarray(_shape_strides.ndim, _shape_strides.shape, ndarray->dtype.type);
             rarray = (uint8_t *)results->array;
diff --git a/code/user/user.c b/code/user/user.c
index d195b9e2..6b6136ee 100644
--- a/code/user/user.c
+++ b/code/user/user.c
@@ -91,11 +91,10 @@ void imreader_imreader(ndarray_obj_t *ndarray, void *array, int32_t *strides, si
     // if necessary, get the coordinates in the original reference frame, i.e.,
     // in the coordinates used at the time of the creation of the object
     size_t *coords = blocks_coords_from_pointer(array, ndarray);
-    uint8_t x = (uint8_t)coords[ULAB_MAX_DIMS - 2] * (uint8_t)coords[ULAB_MAX_DIMS - 1];
-    printf("coords: %ld, %ld\n", coords[ULAB_MAX_DIMS - 2], coords[ULAB_MAX_DIMS - 1]);
+    uint8_t x = (uint8_t)coords[ULAB_MAX_DIMS - 2] * (uint8_t)block->shape[ULAB_MAX_DIMS - 2];
     for(size_t i = 0; i < count; i++) {
         // fill up the array with dummy data
-        *barray++ = (uint8_t)(x + i) * (x + i);
+        *barray++ = (uint8_t)((x + i) * (x + i));
     }
     // The subarray is a forward propagating dense array, so set the strides to the itemsize
     *strides = ndarray->itemsize;

From 3227831a0adfc70c090664d8c8f9ae212a9a220a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Zolt=C3=A1n=20V=C3=B6r=C3=B6s?= <zvoros@gmail.com>
Date: Thu, 4 Mar 2021 17:20:26 +0100
Subject: [PATCH 19/19] fixed failing tests

---
 code/ndarray.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/code/ndarray.c b/code/ndarray.c
index 9442dd43..15353b84 100644
--- a/code/ndarray.c
+++ b/code/ndarray.c
@@ -726,8 +726,11 @@ ndarray_obj_t *ndarray_new_view(ndarray_obj_t *source, uint8_t ndim, size_t *sha
     ndarray->array = pointer;
     #if ULAB_HAS_BLOCKS
     ndarray->flags = source->flags;
-    ndarray->block = source->block;
-    ndarray->block->ndarray = ndarray;
+    if(source->flags) {
+        // copy the block, only if ndarray has a block object
+        ndarray->block = source->block;
+        ndarray->block->ndarray = ndarray;
+    }
     #endif
     return ndarray;
 }