From 728154646ee264b16ab1e4db2d6875843d99ddec Mon Sep 17 00:00:00 2001
From: William Silversmith <william.silversmith@gmail.com>
Date: Thu, 6 Feb 2025 22:00:38 -0500
Subject: [PATCH 01/11] feat: support int8, int16 for 2x2x1 for single mip

---
 tinybrain/accelerated.pyx | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/tinybrain/accelerated.pyx b/tinybrain/accelerated.pyx
index 9c1a9b8..a6bb91d 100644
--- a/tinybrain/accelerated.pyx
+++ b/tinybrain/accelerated.pyx
@@ -109,17 +109,29 @@ def average_pooling_2x2(channel, size_t num_mips=1, sparse=False):
   return results
 
 def _average_pooling_2x2_single_mip_py(np.ndarray[NUMBER, ndim=5] channel, sparse):
+  cdef int8_t[:,:,:,:,:] arr_memview8i
+  cdef int16_t[:,:,:,:,:] arr_memview16i
+  cdef int32_t[:,:,:,:,:] arr_memview32i
+  cdef int64_t[:,:,:,:,:] arr_memview64i
+
   cdef uint8_t[:,:,:,:,:] arr_memview8u
   cdef uint16_t[:,:,:,:,:] arr_memview16u
   cdef uint32_t[:,:,:,:,:] arr_memview32u
   cdef uint64_t[:,:,:,:,:] arr_memview64u
+  
   cdef float[:,:,:,:,:] arr_memviewf
   cdef double[:,:,:,:,:] arr_memviewd
 
+  cdef int8_t[:,:,:,:,:] out_memview8i
+  cdef int16_t[:,:,:,:,:] out_memview16i
+  cdef int32_t[:,:,:,:,:] out_memview32i
+  cdef int64_t[:,:,:,:,:] out_memview64i
+
   cdef uint8_t[:,:,:,:,:] out_memview8u
   cdef uint16_t[:,:,:,:,:] out_memview16u
   cdef uint32_t[:,:,:,:,:] out_memview32u
   cdef uint64_t[:,:,:,:,:] out_memview64u
+  
   cdef float[:,:,:,:,:] out_memviewf
   cdef double[:,:,:,:,:] out_memviewd
 
@@ -149,6 +161,22 @@ def _average_pooling_2x2_single_mip_py(np.ndarray[NUMBER, ndim=5] channel, spars
     arr_memview64u = channel
     out_memview64u = out
     _average_pooling_2x2_single_mip[uint64_t](&arr_memview64u[0,0,0,0,0], sx, sy, sz, sw, sv, &out_memview64u[0,0,0,0,0], bool(sparse))
+  elif channel.dtype == np.int8:
+    arr_memview8i = channel
+    out_memview8i = out
+    _average_pooling_2x2_single_mip[int8_t](&arr_memview8i[0,0,0,0,0], sx, sy, sz, sw, sv, &out_memview8i[0,0,0,0,0], bool(sparse))
+  elif channel.dtype == np.int16:
+    arr_memview16i = channel
+    out_memview16i = out
+    _average_pooling_2x2_single_mip[int16_t](&arr_memview16i[0,0,0,0,0], sx, sy, sz, sw, sv, &out_memview16i[0,0,0,0,0], bool(sparse))
+  elif channel.dtype == np.int32:
+    arr_memview32i = channel
+    out_memview32i = out
+    _average_pooling_2x2_single_mip[int32_t](&arr_memview32i[0,0,0,0,0], sx, sy, sz, sw, sv, &out_memview32i[0,0,0,0,0], bool(sparse))
+  elif channel.dtype == np.int64:
+    arr_memview64i = channel
+    out_memview64i = out
+    _average_pooling_2x2_single_mip[int64_t](&arr_memview64i[0,0,0,0,0], sx, sy, sz, sw, sv, &out_memview64i[0,0,0,0,0], bool(sparse))
   elif channel.dtype == np.float32:
     arr_memviewf = channel
     out_memviewf = out

From 14ffe77d3a6190a08b9c2beb4a9b1a7328833e0e Mon Sep 17 00:00:00 2001
From: William Silversmith <william.silversmith@gmail.com>
Date: Thu, 6 Feb 2025 22:24:51 -0500
Subject: [PATCH 02/11] feat: basic implementation of 2x2x1 but sparse isn't
 working

---
 tinybrain/accelerated.pyx | 140 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 140 insertions(+)

diff --git a/tinybrain/accelerated.pyx b/tinybrain/accelerated.pyx
index a6bb91d..854f656 100644
--- a/tinybrain/accelerated.pyx
+++ b/tinybrain/accelerated.pyx
@@ -94,8 +94,12 @@ def average_pooling_2x2(channel, size_t num_mips=1, sparse=False):
     results = _average_pooling_2x2_single_mip_py(channel, sparse)
   elif channel.dtype == np.uint8:
     results = _average_pooling_2x2_uint8(channel, num_mips, sparse)
+  elif channel.dtype == np.int8:
+    results = _average_pooling_2x2_int8(channel, num_mips, sparse)
   elif channel.dtype == np.uint16:
     results = _average_pooling_2x2_uint16(channel, num_mips, sparse)
+  elif channel.dtype == np.int16:
+    results = _average_pooling_2x2_int16(channel, num_mips, sparse)
   elif channel.dtype == np.float32:
     results = _average_pooling_2x2_float(channel, num_mips, sparse)
   elif channel.dtype == np.float64:
@@ -258,6 +262,142 @@ def _average_pooling_2x2_uint8(np.ndarray[uint8_t, ndim=5] channel, uint32_t num
 
   return results
 
+def _average_pooling_2x2_int8(np.ndarray[int8_t, ndim=5] channel, uint32_t num_mips, sparse):
+  """Return a list of num_mips int8 arrays; each level halves x and y (rounding up) of the previous one."""
+  cdef size_t sx = channel.shape[0]
+  cdef size_t sy = channel.shape[1]
+  cdef size_t sz = channel.shape[2]
+  cdef size_t sw = channel.shape[3]
+  cdef size_t sv = channel.shape[4]
+
+  cdef size_t osx = (sx + 1) // 2
+  cdef size_t osy = (sy + 1) // 2
+  cdef size_t osxy = osx * osy
+  cdef size_t ovoxels = osxy * sz * sw * sv
+
+  cdef int8_t[:,:,:,:,:] channelview = channel
+  cdef int16_t* accum = accumulate_2x2[int8_t, int16_t](&channelview[0,0,0,0,0], sx, sy, sz, sw, sv)
+  cdef int16_t[:] accumview = <int16_t[:ovoxels]>accum
+  cdef int16_t* tmp
+  cdef uint32_t mip, bitshift
+
+  cdef int16_t* denominator # only allocated (and only touched) when sparse is truthy
+  if sparse:
+    denominator = denominator_2x2[int8_t, int16_t](&channelview[0,0,0,0,0], sx, sy, sz, sw, sv)
+
+  cdef int8_t[:] oimgview
+
+  results = []
+  for mip in range(num_mips):
+    bitshift = 2 * ((mip % 4) + 1) # integer truncation every 4 mip levels
+    oimg = np.zeros( (ovoxels,), dtype=np.uint8, order='F')
+    oimgview = oimg
+
+    if sparse:
+      render_image_sparse[int16_t, int8_t](&accumview[0], denominator, &oimgview[0], ovoxels)
+    else:
+      render_image[int16_t, int8_t](&accumview[0], &oimgview[0], bitshift, ovoxels)
+
+    results.append(
+      oimg.reshape( (osx, osy, sz, sw, sv), order='F' )
+    )
+
+    if mip == num_mips - 1:
+      break
+
+    if bitshift == 8: # NOTE(review): the sparse denominator is not rescaled with accum here - confirm intended
+      shift_right[int16_t](accum, ovoxels, bitshift)
+
+    sx = osx
+    sy = osy
+    osx = (sx + 1) // 2
+    osy = (sy + 1) // 2
+    osxy = osx * osy
+    ovoxels = osxy * sz * sw * sv
+
+    tmp = accum
+    accum = accumulate_2x2[int16_t, int16_t](accum, sx, sy, sz, sw, sv)
+    accumview = <int16_t[:ovoxels]>accum
+    PyMem_Free(tmp)
+
+    if sparse:
+      tmp = denominator
+      denominator = accumulate_2x2[int16_t, int16_t](denominator, sx, sy, sz, sw, sv)
+      PyMem_Free(tmp)
+  PyMem_Free(accum)
+  if sparse:
+    PyMem_Free(denominator) # the last denominator buffer was previously leaked
+
+  return results
+
+def _average_pooling_2x2_int16(np.ndarray[int16_t, ndim=5] channel, uint32_t num_mips, sparse):
+  """Return a list of num_mips int16 arrays; each level halves x and y (rounding up) of the previous one."""
+  cdef size_t sx = channel.shape[0]
+  cdef size_t sy = channel.shape[1]
+  cdef size_t sz = channel.shape[2]
+  cdef size_t sw = channel.shape[3]
+  cdef size_t sv = channel.shape[4]
+
+  cdef size_t osx = (sx + 1) // 2
+  cdef size_t osy = (sy + 1) // 2
+  cdef size_t osxy = osx * osy
+  cdef size_t ovoxels = osxy * sz * sw * sv
+
+  cdef int16_t[:,:,:,:,:] channelview = channel
+  cdef int32_t* accum = accumulate_2x2[int16_t, int32_t](&channelview[0,0,0,0,0], sx, sy, sz, sw, sv)
+  cdef int32_t[:] accumview = <int32_t[:ovoxels]>accum
+  cdef int32_t* tmp
+  cdef int32_t mip, bitshift
+
+  cdef int32_t* denominator # only allocated (and only touched) when sparse is truthy
+  if sparse:
+    denominator = denominator_2x2[int16_t, int32_t](&channelview[0,0,0,0,0], sx, sy, sz, sw, sv)
+
+  cdef int16_t[:] oimgview
+
+  results = []
+  for mip in range(num_mips):
+    bitshift = 2 * ((mip % 4) + 1) # integer truncation every 4 mip levels
+    oimg = np.zeros( (ovoxels,), dtype=np.uint16, order='F')
+    oimgview = oimg
+
+    if sparse:
+      render_image_sparse[int32_t, int16_t](&accumview[0], denominator, &oimgview[0], ovoxels)
+    else:
+      render_image[int32_t, int16_t](&accumview[0], &oimgview[0], bitshift, ovoxels)
+
+    results.append(
+      oimg.reshape( (osx, osy, sz, sw, sv), order='F' )
+    )
+
+    if mip == num_mips - 1:
+      break
+
+    if bitshift == 8: # NOTE(review): the sparse denominator is not rescaled with accum here - confirm intended
+      shift_right[int32_t](accum, ovoxels, bitshift)
+
+    sx = osx
+    sy = osy
+    osx = (sx + 1) // 2
+    osy = (sy + 1) // 2
+    osxy = osx * osy
+    ovoxels = osxy * sz * sw * sv
+
+    tmp = accum
+    accum = accumulate_2x2[int32_t, int32_t](accum, sx, sy, sz, sw, sv)
+    accumview = <int32_t[:ovoxels]>accum
+    PyMem_Free(tmp)
+
+    if sparse:
+      tmp = denominator
+      denominator = accumulate_2x2[int32_t, int32_t](denominator, sx, sy, sz, sw, sv)
+      PyMem_Free(tmp)
+  PyMem_Free(accum)
+  if sparse:
+    PyMem_Free(denominator) # the last denominator buffer was previously leaked
+
+  return results
+
 def _average_pooling_2x2_uint16(np.ndarray[uint16_t, ndim=5] channel, uint32_t num_mips, sparse):
   cdef size_t sx = channel.shape[0]
   cdef size_t sy = channel.shape[1]

From 700452d1f5ece7541f1bea7a359be088bb273816 Mon Sep 17 00:00:00 2001
From: William Silversmith <william.silversmith@gmail.com>
Date: Thu, 6 Feb 2025 22:30:22 -0500
Subject: [PATCH 03/11] fix: ensure outputs have right data type

---
 tinybrain/accelerated.pyx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tinybrain/accelerated.pyx b/tinybrain/accelerated.pyx
index 854f656..2bbfff5 100644
--- a/tinybrain/accelerated.pyx
+++ b/tinybrain/accelerated.pyx
@@ -290,7 +290,7 @@ def _average_pooling_2x2_int8(np.ndarray[int8_t, ndim=5] channel, uint32_t num_m
   results = []
   for mip in range(num_mips):
     bitshift = 2 * ((mip % 4) + 1) # integer truncation every 4 mip levels
-    oimg = np.zeros( (ovoxels,), dtype=np.uint8, order='F')
+    oimg = np.zeros( (ovoxels,), dtype=np.int8, order='F')
     oimgview = oimg
 
     if sparse:
@@ -358,7 +358,7 @@ def _average_pooling_2x2_int16(np.ndarray[int16_t, ndim=5] channel, uint32_t num
   results = []
   for mip in range(num_mips):
     bitshift = 2 * ((mip % 4) + 1) # integer truncation every 4 mip levels
-    oimg = np.zeros( (ovoxels,), dtype=np.uint16, order='F')
+    oimg = np.zeros( (ovoxels,), dtype=np.int16, order='F')
     oimgview = oimg
 
     if sparse:

From 0365cf76c76af1aa664d1fd25caf5582b3b04349 Mon Sep 17 00:00:00 2001
From: William Silversmith <william.silversmith@gmail.com>
Date: Thu, 6 Feb 2025 22:54:27 -0500
Subject: [PATCH 04/11] fix: don't gate accelerated functions for integer types

---
 tinybrain/downsample.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tinybrain/downsample.py b/tinybrain/downsample.py
index e7933e0..8d09779 100644
--- a/tinybrain/downsample.py
+++ b/tinybrain/downsample.py
@@ -45,7 +45,7 @@ def downsample_with_averaging(img, factor, num_mips=1, sparse=False):
   Returns: [ mip0, mip1, mip2, ..., num_mip ]
   """
   if (
-    img.dtype in (np.uint8, np.uint16, np.float32, np.float64)
+    img.dtype in (np.int8, np.int16, np.uint8, np.uint16, np.float32, np.float64)
     or num_mips == 1 # _average_pooling_2x2_single_mip_py supports all primative types
   ):
     img = np.asfortranarray(img)

From 696c6d5423b73ef4982c9541caa506c8e8cab029 Mon Sep 17 00:00:00 2001
From: William Silversmith <william.silversmith@gmail.com>
Date: Thu, 6 Feb 2025 23:02:46 -0500
Subject: [PATCH 05/11] docs: update year

---
 tinybrain/accelerated.hpp | 2 +-
 tinybrain/accelerated.pyx | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tinybrain/accelerated.hpp b/tinybrain/accelerated.hpp
index 148d2cd..29b9e62 100644
--- a/tinybrain/accelerated.hpp
+++ b/tinybrain/accelerated.hpp
@@ -1,5 +1,5 @@
 /*
-Copyright (C) 2019, William Silversmith
+Copyright (C) 2019,2025 William Silversmith
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
diff --git a/tinybrain/accelerated.pyx b/tinybrain/accelerated.pyx
index 2bbfff5..ddb4ef5 100644
--- a/tinybrain/accelerated.pyx
+++ b/tinybrain/accelerated.pyx
@@ -4,7 +4,7 @@ Cython accelerated routines for common downsampling operations.
 
 Author: William Silversmith
 Affiliation: Seung Lab, Princeton Neuroscience Institute
-Date: March 2019
+Date: March 2019, February 2025
 """
 cimport cython
 from cython cimport floating

From dd5ba8a04a2aa4e96aa293860834f243b89ef214 Mon Sep 17 00:00:00 2001
From: William Silversmith <william.silversmith@gmail.com>
Date: Thu, 6 Feb 2025 23:02:57 -0500
Subject: [PATCH 06/11] feat: add signed ints for 2x2x2

---
 tinybrain/accelerated.pyx | 183 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 183 insertions(+)

diff --git a/tinybrain/accelerated.pyx b/tinybrain/accelerated.pyx
index ddb4ef5..e646b97 100644
--- a/tinybrain/accelerated.pyx
+++ b/tinybrain/accelerated.pyx
@@ -619,8 +619,12 @@ def average_pooling_2x2x2(channel, size_t num_mips=1, sparse=False):
     results = _average_pooling_2x2x2_single_mip_py(channel)
   elif channel.dtype == np.uint8:
     results = _average_pooling_2x2x2_uint8(channel, num_mips, sparse)
+  elif channel.dtype == np.int8:
+    results = _average_pooling_2x2x2_int8(channel, num_mips, sparse)
   elif channel.dtype == np.uint16:
     results = _average_pooling_2x2x2_uint16(channel, num_mips, sparse)
+  elif channel.dtype == np.int16:
+    results = _average_pooling_2x2x2_int16(channel, num_mips, sparse)
   elif channel.dtype == np.float32:
     results = _average_pooling_2x2x2_float(channel, num_mips)
   elif channel.dtype == np.float64:
@@ -634,6 +638,8 @@ def average_pooling_2x2x2(channel, size_t num_mips=1, sparse=False):
   return results
 
 def _average_pooling_2x2x2_single_mip_py(np.ndarray[NUMBER, ndim=5] channel):
+  cdef int8_t[:,:,:,:,:] arr_memview8i
+  cdef int16_t[:,:,:,:,:] arr_memview16i
   cdef uint8_t[:,:,:,:,:] arr_memview8u
   cdef uint16_t[:,:,:,:,:] arr_memview16u
   cdef uint32_t[:,:,:,:,:] arr_memview32u
@@ -641,6 +647,8 @@ def _average_pooling_2x2x2_single_mip_py(np.ndarray[NUMBER, ndim=5] channel):
   cdef float[:,:,:,:,:] arr_memviewf
   cdef double[:,:,:,:,:] arr_memviewd
 
+  cdef int8_t[:,:,:,:,:] out_memview8i
+  cdef int16_t[:,:,:,:,:] out_memview16i
   cdef uint8_t[:,:,:,:,:] out_memview8u
   cdef uint16_t[:,:,:,:,:] out_memview16u
   cdef uint32_t[:,:,:,:,:] out_memview32u
@@ -662,10 +670,18 @@ def _average_pooling_2x2x2_single_mip_py(np.ndarray[NUMBER, ndim=5] channel):
     arr_memview8u = channel
     out_memview8u = out
     _average_pooling_2x2x2_single_mip[uint8_t](&arr_memview8u[0,0,0,0,0], sx, sy, sz, sw, sv, &out_memview8u[0,0,0,0,0])
+  elif channel.dtype == np.int8:
+    arr_memview8i = channel
+    out_memview8i = out
+    _average_pooling_2x2x2_single_mip[int8_t](&arr_memview8i[0,0,0,0,0], sx, sy, sz, sw, sv, &out_memview8i[0,0,0,0,0])
   elif channel.dtype == np.uint16:
     arr_memview16u = channel
     out_memview16u = out
     _average_pooling_2x2x2_single_mip[uint16_t](&arr_memview16u[0,0,0,0,0], sx, sy, sz, sw, sv, &out_memview16u[0,0,0,0,0])
+  elif channel.dtype == np.int16:
+    arr_memview16i = channel
+    out_memview16i = out
+    _average_pooling_2x2x2_single_mip[int16_t](&arr_memview16i[0,0,0,0,0], sx, sy, sz, sw, sv, &out_memview16i[0,0,0,0,0])
   elif channel.dtype == np.uint32:
     arr_memview32u = channel
     out_memview32u = out
@@ -771,6 +787,90 @@ def _average_pooling_2x2x2_uint8(np.ndarray[uint8_t, ndim=5] channel, uint32_t n
 
   return results
 
+def _average_pooling_2x2x2_int8(np.ndarray[int8_t, ndim=5] channel, uint32_t num_mips, sparse=False):
+  """Return a list of num_mips int8 arrays; each level halves x, y and z (rounding up) of the previous one."""
+  cdef size_t sx = channel.shape[0]
+  cdef size_t sy = channel.shape[1]
+  cdef size_t sz = channel.shape[2]
+  cdef size_t sw = channel.shape[3]
+  cdef size_t sv = channel.shape[4]
+
+  cdef size_t osx = (sx + 1) // 2
+  cdef size_t osy = (sy + 1) // 2
+  cdef size_t osz = (sz + 1) // 2
+  cdef size_t osxy = osx * osy
+  cdef size_t ovoxels = osxy * osz * sw * sv
+
+  cdef int8_t[:,:,:,:,:] channelview = channel
+  cdef int32_t* accum = accumulate_2x2x2[int8_t, int32_t](
+    &channelview[0,0,0,0,0], sx, sy, sz, sw, sv
+  )
+  cdef int32_t[:] accumview = <int32_t[:ovoxels]>accum
+
+  # "denominator"
+  cdef int32_t* denom
+  cdef int32_t[:] denomview
+  if sparse:
+    denom = denominator_2x2x2[int8_t, int32_t](
+      &channelview[0,0,0,0,0], sx, sy, sz, sw, sv
+    )
+    denomview = <int32_t[:ovoxels]>denom
+
+  cdef int32_t* tmp
+  cdef int32_t mip, bitshift
+
+  cdef int8_t[:] oimgview
+
+  results = []
+  for mip in range(num_mips):
+    bitshift = 3 * ((mip % 8) + 1) # integer truncation every 8 mip levels
+    oimg = np.zeros( (ovoxels,), dtype=np.int8, order='F')
+    oimgview = oimg
+
+    if sparse:
+      render_image_sparse[int32_t, int8_t](&accumview[0], &denomview[0], &oimgview[0], ovoxels)
+    else:
+      render_image[int32_t, int8_t](&accumview[0], &oimgview[0], bitshift, ovoxels)
+
+    # ovoxels includes sv, so the output must keep all five axes.
+    results.append(
+      oimg.reshape( (osx, osy, osz, sw, sv), order='F' )
+    )
+
+    if mip == num_mips - 1:
+      break
+
+    if bitshift == 24:
+      shift_right[int32_t](accum, ovoxels, bitshift)
+      if sparse:
+        shift_right[int32_t](denom, ovoxels, bitshift)
+
+    sx = osx
+    sy = osy
+    sz = osz
+    osx = (sx + 1) // 2
+    osy = (sy + 1) // 2
+    osz = (sz + 1) // 2
+    osxy = osx * osy
+    ovoxels = osxy * osz * sw * sv
+
+    tmp = accum
+    accum = accumulate_2x2x2[int32_t, int32_t](accum, sx, sy, sz, sw, sv)
+    accumview = <int32_t[:ovoxels]>accum
+    PyMem_Free(tmp)
+
+    if sparse:
+      tmp = denom
+      denom = accumulate_2x2x2[int32_t, int32_t](denom, sx, sy, sz, sw, sv)
+      denomview = <int32_t[:ovoxels]>denom
+      PyMem_Free(tmp)
+
+  PyMem_Free(accum)
+  if sparse:
+    PyMem_Free(denom)
+
+  return results
+
 def _average_pooling_2x2x2_uint16(np.ndarray[uint16_t, ndim=5] channel, uint32_t num_mips, sparse):
   cdef size_t sx = channel.shape[0]
   cdef size_t sy = channel.shape[1]
@@ -854,6 +954,89 @@ def _average_pooling_2x2x2_uint16(np.ndarray[uint16_t, ndim=5] channel, uint32_t
 
   return results
 
+def _average_pooling_2x2x2_int16(np.ndarray[int16_t, ndim=5] channel, uint32_t num_mips, sparse):
+  """Return a list of num_mips int16 arrays; each level halves x, y and z (rounding up) of the previous one."""
+  cdef size_t sx = channel.shape[0]
+  cdef size_t sy = channel.shape[1]
+  cdef size_t sz = channel.shape[2]
+  cdef size_t sw = channel.shape[3]
+  cdef size_t sv = channel.shape[4]
+
+  cdef size_t osx = (sx + 1) // 2
+  cdef size_t osy = (sy + 1) // 2
+  cdef size_t osz = (sz + 1) // 2
+  cdef size_t osxy = osx * osy
+  cdef size_t ovoxels = osxy * osz * sw * sv
+
+  cdef int16_t[:,:,:,:,:] channelview = channel
+  cdef int32_t* accum = accumulate_2x2x2[int16_t, int32_t](
+    &channelview[0,0,0,0,0], sx, sy, sz, sw, sv
+  )
+  cdef int32_t[:] accumview = <int32_t[:ovoxels]>accum
+
+  # "denominator"
+  cdef int32_t* denom
+  cdef int32_t[:] denomview
+  if sparse:
+    denom = denominator_2x2x2[int16_t, int32_t](
+      &channelview[0,0,0,0,0], sx, sy, sz, sw, sv
+    )
+    denomview = <int32_t[:ovoxels]>denom
+
+  cdef int32_t* tmp
+  cdef int32_t mip, bitshift
+
+  cdef int16_t[:] oimgview
+
+  results = []
+  for mip in range(num_mips):
+    bitshift = 3 * ((mip % 5) + 1) # integer truncation every 5 mip levels
+    # The buffer dtype must match the int16_t memoryview it is assigned to below.
+    oimg = np.zeros( (ovoxels,), dtype=np.int16, order='F')
+    oimgview = oimg
+    if sparse:
+      render_image_sparse[int32_t, int16_t](&accumview[0], &denomview[0], &oimgview[0], ovoxels)
+    else:
+      render_image[int32_t, int16_t](&accumview[0], &oimgview[0], bitshift, ovoxels)
+
+    results.append(
+      oimg.reshape( (osx, osy, osz, sw, sv), order='F' )
+    )
+
+    if mip == num_mips - 1:
+      break
+
+    if bitshift == 15:
+      shift_right[int32_t](accum, ovoxels, bitshift)
+      if sparse:
+        shift_right[int32_t](denom, ovoxels, bitshift)
+
+    sx = osx
+    sy = osy
+    sz = osz
+    osx = (sx + 1) // 2
+    osy = (sy + 1) // 2
+    osz = (sz + 1) // 2
+    osxy = osx * osy
+    ovoxels = osxy * osz * sw * sv
+
+    tmp = accum
+    accum = accumulate_2x2x2[int32_t, int32_t](accum, sx, sy, sz, sw, sv)
+    accumview = <int32_t[:ovoxels]>accum
+    PyMem_Free(tmp)
+
+    if sparse:
+      tmp = denom
+      denom = accumulate_2x2x2[int32_t, int32_t](denom, sx, sy, sz, sw, sv)
+      denomview = <int32_t[:ovoxels]>denom
+      PyMem_Free(tmp)
+
+  PyMem_Free(accum)
+  if sparse:
+    PyMem_Free(denom)
+
+  return results
+
 def _average_pooling_2x2x2_float(np.ndarray[float, ndim=5] channel, uint32_t num_mips):
   cdef size_t sx = channel.shape[0]
   cdef size_t sy = channel.shape[1]

From 27ea89c24f536ee7cbae25e7157868afb8ed3d03 Mon Sep 17 00:00:00 2001
From: William Silversmith <william.silversmith@gmail.com>
Date: Thu, 6 Feb 2025 23:04:49 -0500
Subject: [PATCH 07/11] test: check signed average pooling

---
 automated_test.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/automated_test.py b/automated_test.py
index 68b7eba..d15b1f5 100644
--- a/automated_test.py
+++ b/automated_test.py
@@ -125,7 +125,7 @@ def test_even_odd2d():
   assert np.array_equal(oddimg, ans3x3x3)
   assert np.array_equal(oddimgf, ans3x3x3)
 
-@pytest.mark.parametrize("dtype", (np.uint8, np.uint16, np.float32, np.float64))
+@pytest.mark.parametrize("dtype", (np.int8, np.int16, np.uint8, np.uint16, np.float32, np.float64))
 @pytest.mark.parametrize("sparse", [False, True])
 def test_accelerated_vs_numpy_avg_pooling_2x2x1(dtype, sparse):
   image = np.random.randint(0,255, size=(512, 512, 6), dtype=np.uint8).astype(dtype, copy=False)
@@ -147,7 +147,7 @@ def test_accelerated_vs_numpy_avg_pooling_2x2x1(dtype, sparse):
 
   assert np.all(mips[-1] == npimg)
 
-@pytest.mark.parametrize("dtype", (np.uint8, np.uint16, np.float32, np.float64))
+@pytest.mark.parametrize("dtype", (np.int8, np.int16, np.uint8, np.uint16, np.float32, np.float64))
 def test_accelerated_vs_numpy_avg_pooling_2x2x1_simple_sparse(dtype):
   for x in [0,1]:
     for y in [0,1]:
@@ -189,7 +189,7 @@ def test_accelerated_vs_numpy_avg_pooling_2x2x1_simple_sparse(dtype):
   assert np.all(res[1] == ans)
 
 
-@pytest.mark.parametrize("dtype", (np.uint8, np.uint16, np.float32, np.float64))
+@pytest.mark.parametrize("dtype", (np.int8, np.int16, np.uint8, np.uint16, np.float32, np.float64))
 @pytest.mark.parametrize("sx", (6,7,1024,1025))
 @pytest.mark.parametrize("sy", (6,7,1024,1025))
 @pytest.mark.parametrize("sz", (4,5,32,33))

From 0ea1f127d93b6b68154bd2c450eca1bfbb72f503 Mon Sep 17 00:00:00 2001
From: William Silversmith <william.silversmith@gmail.com>
Date: Thu, 6 Feb 2025 23:14:21 -0500
Subject: [PATCH 08/11] docs: show other downsample types

---
 README.md | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 33ad0a1..34d9263 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,12 @@ img = load_3d_em_stack()
 img_pyramid = tinybrain.downsample_with_averaging(img, factor=(2,2,1), num_mips=5, sparse=False)
 
 labels = load_3d_labels()
-label_pyramid = tinybrain.downsample_segmentation(labels, factor=(2,2,1), num_mips=5, sparse=False))
+label_pyramid = tinybrain.downsample_segmentation(labels, factor=(2,2,1), num_mips=5, sparse=False)
+
+# We also have a few other types
+img_pyramid = tinybrain.downsample_with_min_pooling(img, factor=(2,2,1), num_mips=5)
+img_pyramid = tinybrain.downsample_with_max_pooling(img, factor=(2,2,1), num_mips=5)
+img_pyramid = tinybrain.downsample_with_striding(img, factor=(2,2,1), num_mips=5)
 ```
 
 ## Installation 

From c4e891dd798cbf5c2fe9b6787f348ca60e1998aa Mon Sep 17 00:00:00 2001
From: William Silversmith <william.silversmith@gmail.com>
Date: Thu, 6 Feb 2025 23:20:00 -0500
Subject: [PATCH 09/11] fix: spelling error

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 34d9263..0a42440 100644
--- a/README.md
+++ b/README.md
@@ -32,7 +32,7 @@ pip install tinybrain
 ## Motivation
 
 Image hierarchy generation in connectomics uses a few different techniques for
-visualizing data, but predominantly we create image pyramids of uint8 grayscale images using 2x2 average pooling and of uint8 to uint64 segmentation labels using 2x2 mode pooling. When images become very large and people wish to visualze upper mip levels using three axes at once, it becomes desirable to perform 2x2x2 downsamples to maintain isotropy.
+visualizing data, but predominantly we create image pyramids of uint8 grayscale images using 2x2 average pooling and of uint8 to uint64 segmentation labels using 2x2 mode pooling. When images become very large and people wish to visualize upper mip levels using three axes at once, it becomes desirable to perform 2x2x2 downsamples to maintain isotropy.
 
 It's possible to compute both of these using numpy, however as multiple packages found it useful to copy the downsample functions, it makes sense to formalize these functions into a seperate library located on PyPI.
 

From 0fdb5073495d67eb975b0e2e15fc29e1fb81f2f8 Mon Sep 17 00:00:00 2001
From: William Silversmith <william.silversmith@gmail.com>
Date: Thu, 6 Feb 2025 23:23:54 -0500
Subject: [PATCH 10/11] ci: update build system

---
 .github/workflows/build_wheel.yml | 12 ++++++++----
 .github/workflows/run_tests.yml   |  2 +-
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/build_wheel.yml b/.github/workflows/build_wheel.yml
index 461c6fe..2c5260f 100644
--- a/.github/workflows/build_wheel.yml
+++ b/.github/workflows/build_wheel.yml
@@ -28,12 +28,16 @@ jobs:
         uses: docker/setup-qemu-action@v1
 
       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.19.1
+        uses: joerick/cibuildwheel@v2.22.0
         # to supply options, put them in 'env', like:
         env:
-          CIBW_ARCHS_LINUX: ${{matrix.arch}}
-          CIBW_BEFORE_BUILD: pip install numpy setuptools wheel cython
+          CIBW_BEFORE_BUILD: pip install numpy pybind11 setuptools wheel pkginfo twine
+          CIBW_ARCHS_MACOS: "x86_64 arm64"
+          CIBW_ARCHS: auto64
 
-      - uses: actions/upload-artifact@v2
+      - name: Upload built wheels
+        uses: actions/upload-artifact@v4
         with:
+          name: built-wheels-${{ matrix.os }}-${{ matrix.arch }}
           path: ./wheelhouse/*.whl
+          if-no-files-found: warn
\ No newline at end of file
diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml
index de08566..28e9b5c 100644
--- a/.github/workflows/run_tests.yml
+++ b/.github/workflows/run_tests.yml
@@ -14,7 +14,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-20.04, macos-latest, windows-2019]
+        os: [ubuntu-latest, macos-latest, windows-latest]
         python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
 
     steps:

From 72e66a16d95876e365b8a8c5a787ee56341985ad Mon Sep 17 00:00:00 2001
From: William Silversmith <william.silversmith@gmail.com>
Date: Fri, 7 Feb 2025 12:06:58 -0500
Subject: [PATCH 11/11] fix: redefine sparse to mean != 0, not > 0 in all cases

---
 tinybrain/accelerated.hpp | 17 +++++++++++++----
 tinybrain/downsample.py   |  2 +-
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/tinybrain/accelerated.hpp b/tinybrain/accelerated.hpp
index 29b9e62..036ce20 100644
--- a/tinybrain/accelerated.hpp
+++ b/tinybrain/accelerated.hpp
@@ -359,7 +359,7 @@ T* _average_pooling_2x2_single_mip(
                   + static_cast<T2>(d)
                 ) / static_cast<T2>(
                     std::max(
-                      static_cast<T>((a > 0) + (b > 0) + (c > 0) + (d > 0)),
+                      static_cast<T>((a != 0) + (b != 0) + (c != 0) + (d != 0)),
                       static_cast<T>(1)
                     )
                   )
@@ -375,7 +375,7 @@ T* _average_pooling_2x2_single_mip(
                   + static_cast<T2>(b)
                 ) / static_cast<T2>(
                   std::max(
-                    static_cast<T>((a > 0) + (b > 0)),
+                    static_cast<T>((a != 0) + (b != 0)),
                     static_cast<T>(1)
                 )
               ));
@@ -931,8 +931,17 @@ U* denominator_2x2x2(
 
 template <typename T, typename U>
 inline void render_image(T* accum, U* oimg, const uint32_t bitshift, const size_t ovoxels) {
-  for (size_t i = 0; i < ovoxels; i++) {
-    oimg[i] = static_cast<U>(accum[i] >> bitshift);
+  if constexpr (std::is_signed<T>::value) {
+    // Shift the magnitude (negated as unsigned, so the lowest T is safe) to truncate toward zero.
+    using UT = typename std::make_unsigned<T>::type;
+    for (size_t i = 0; i < ovoxels; i++) {
+      const UT mag = (accum[i] < 0) ? static_cast<UT>(UT(0) - static_cast<UT>(accum[i])) : static_cast<UT>(accum[i]);
+      oimg[i] = (accum[i] < 0) ? -static_cast<U>(mag >> bitshift) : static_cast<U>(mag >> bitshift);
+    }
+  } else {
+    for (size_t i = 0; i < ovoxels; i++) {
+      oimg[i] = static_cast<U>(accum[i] >> bitshift);
+    }
   }
 }
 
diff --git a/tinybrain/downsample.py b/tinybrain/downsample.py
index 8d09779..2802d92 100644
--- a/tinybrain/downsample.py
+++ b/tinybrain/downsample.py
@@ -136,7 +136,7 @@ def downsample_with_averaging_numpy(array, factor, sparse=False):
     indexing_expr = tuple(np.s_[:s] for s in part.shape)
     temp[indexing_expr] += part
     if sparse:
-      counts[indexing_expr] += part > 0
+      counts[indexing_expr] += part != 0
     else:
       counts[indexing_expr] += 1