diff --git a/dask_grblas/_automethods.py b/dask_grblas/_automethods.py
new file mode 100644
index 0000000..a781ad0
--- /dev/null
+++ b/dask_grblas/_automethods.py
@@ -0,0 +1,21 @@
+from grblas import config
+
+
+def _get_value(self, attr=None, default=None):
+    if config.get("autocompute"):
+        if self._value is None:
+            self._value = self.new()
+            if getattr(self, "is_dOnion", False):
+                self._value = self._value.strip()
+        if attr is None:
+            return self._value
+        else:
+            return getattr(self._value, attr)
+    if default is not None:
+        return default.__get__(self)
+    raise TypeError(
+        f"{attr} not enabled for objects of type {type(self)}. "
+        f"Use `.new()` to create a new {self.output_type.__name__}.\n\n"
+        "Hint: use `grblas.config.set(autocompute=True)` to enable "
+        "automatic computation of expressions."
+    )
diff --git a/dask_grblas/_ss/matrix.py b/dask_grblas/_ss/matrix.py
index 8aad87a..ed64b4e 100644
--- a/dask_grblas/_ss/matrix.py
+++ b/dask_grblas/_ss/matrix.py
@@ -30,3 +30,27 @@ def diag(self, vector, k=0, chunks="auto", dtype=None):
         vector = self._parent._expect_type(vector, dgb.Vector, within="ss.diag", argname="vector")
         rv = vector._diag(k, chunks=chunks, dtype=dtype)
         self._parent.__init__(rv._delayed, nvals=rv._nvals)
+
+    def build_scalar(
+        self,
+        rows,
+        columns,
+        values,
+        *,
+        dup_op=None,
+        clear=False,
+        nrows=None,
+        ncols=None,
+        chunks=None,
+        in_dOnion=False,  # not part of the API
+    ):
+        self._parent.build(
+            rows,
+            columns,
+            values,
+            dup_op=dup_op,
+            clear=clear,
+            nrows=nrows,
+            ncols=ncols,
+            chunks=chunks,
+        )
diff --git a/dask_grblas/base.py b/dask_grblas/base.py
index 6febd0b..c1e80fb 100644
--- a/dask_grblas/base.py
+++ b/dask_grblas/base.py
@@ -1,4 +1,7 @@
 from numbers import Number
+from collections.abc import Iterable
+from tlz import compose
+from functools import partial
 import dask.array as da
 import grblas as gb
 import numpy as np
@@ -6,16 +9,23 @@
 from . import replace as replace_singleton
 from .mask import Mask
+from .functools import flexible_partial, skip
 from .utils import get_grblas_type, get_meta, np_dtype, wrap_inner
+from dask.base import is_dask_collection
 
 _expect_type = gb.base._expect_type
 
 
-def _check_mask(mask, output=None):
+def is_type(arg_type, a):
+    return type(a) is arg_type
+
+
+def _check_mask(mask, output=None, ignore_None=False):
     if not isinstance(mask, Mask):
         if isinstance(mask, BaseType):
            raise TypeError("Mask must indicate values (M.V) or structure (M.S)")
-        raise TypeError(f"Invalid mask: {type(mask)}")
+        # any non-Mask is invalid, except None when ignore_None is set
+        elif mask is not None or not ignore_None:
+            raise TypeError(f"Invalid mask: {type(mask)}")
     if output is not None:
         from .vector import Vector
 
@@ -32,9 +42,27 @@ class BaseType:
     _expect_type = _expect_type
     _is_scalar = False
 
+    @property
+    def is_dOnion(self):
+        return is_DOnion(self._delayed)
+
+    @property
+    def dOnion_if(self):
+        return self._delayed if self.is_dOnion else self
+
+    def strip(self, *args, **kwargs):
+        return self._delayed.strip(*args, **kwargs) if self.is_dOnion else self
+
     def isequal(self, other, *, check_dtype=False):
         from .scalar import PythonScalar
 
+        if any_dOnions(self, other):
+            meta = gb.Scalar.new(bool)
+            delayed = DOnion.multi_access(
+                meta, self.__class__.isequal, self, other, check_dtype=check_dtype
+            )
+            return PythonScalar(delayed, meta=meta)
+
         # if type(other) is not type(self):
         #     raise TypeError(f'Argument of isequal must be of type {type(self).__name__}')
         if not self._meta.isequal(other._meta):
@@ -52,46 +80,125 @@ def isequal(self, other, *, check_dtype=False):
             adjust_chunks={i: 1 for i in range(self._delayed.ndim)},
         )
         """
-        delayed = da.core.elemwise(
-            _isequal,
-            self._delayed,
-            other._delayed,
-            check_dtype,
-            dtype=bool,
+        ndim = (
+            self._matrix._delayed.ndim
+            if getattr(self, "_is_transposed", False)
+            else self._delayed.ndim
         )
-        if self._delayed.ndim > 0:
+        if ndim < 2:
             delayed = da.core.elemwise(
-                _to_scalar,
-                delayed.all(),
-                bool,
+                partial(_isequal, False, False),
+                self._delayed,
+                other._delayed,
+                check_dtype,
+                dtype=bool,
+            )
+        else:
+            xt = getattr(self, "_is_transposed", False)
+            yt = getattr(other, "_is_transposed", False)
+            self_ = (self._matrix._delayed, "ji") if xt else (self._delayed, "ij")
+            other_ = (other._matrix._delayed, "ji") if yt else (other._delayed, "ij")
+            delayed = da.core.blockwise(
+                *(partial(_isequal, xt, yt), "ij"),
+                *self_,
+                *other_,
+                *(check_dtype, None),
+                dtype=bool,
+            )
+        if ndim > 0:
+            delayed = da.core.blockwise(
+                *(_to_scalar, ()),
+                *(delayed.all(), None),
+                *(bool, None),
+                dtype=np.bool_,
+                meta=wrap_inner(gb.Scalar.new(bool)),
             )
         return PythonScalar(delayed)
 
     def isclose(self, other, *, rel_tol=1e-7, abs_tol=0.0, check_dtype=False):
         from .scalar import PythonScalar
 
+        if any_dOnions(self, other):
+            meta = gb.Scalar.new(bool)
+            delayed = DOnion.multi_access(
+                meta,
+                self.__class__.isclose,
+                self,
+                other,
+                rel_tol=rel_tol,
+                abs_tol=abs_tol,
+                check_dtype=check_dtype,
+            )
+            return PythonScalar(delayed, meta=meta)
+
         # if type(other) is not type(self):
         #     raise TypeError(f'Argument of isclose must be of type {type(self).__name__}')
         if not self._meta.isequal(other._meta):
             return PythonScalar.from_value(False)
-        delayed = da.core.elemwise(
-            _isclose,
-            self._delayed,
-            other._delayed,
-            rel_tol,
-            abs_tol,
-            check_dtype,
-            dtype=bool,
+
+        ndim = (
+            self._matrix._delayed.ndim
+            if getattr(self, "_is_transposed", False)
+            else self._delayed.ndim
         )
-        if self._delayed.ndim > 0:
+        if ndim < 2:
             delayed = da.core.elemwise(
_to_scalar, - delayed.all(), - bool, + partial(_isclose, False, False), + self._delayed, + other._delayed, + rel_tol, + abs_tol, + check_dtype, + dtype=bool, + ) + else: + xt = getattr(self, "_is_transposed", False) + yt = getattr(other, "_is_transposed", False) + self_ = (self._matrix._delayed, "ji") if xt else (self._delayed, "ij") + other_ = (other._matrix._delayed, "ji") if yt else (other._delayed, "ij") + delayed = da.core.blockwise( + *(partial(_isclose, xt, yt), "ij"), + *self_, + *other_, + *(rel_tol, None), + *(abs_tol, None), + *(check_dtype, None), + dtype=bool, + ) + if ndim > 0: + delayed = da.core.blockwise( + *(_to_scalar, ()), + *(delayed.all(), None), + *(bool, None), + dtype=np.bool_, + meta=wrap_inner(gb.Scalar.new(bool)), ) return PythonScalar(delayed) + def _clear(self): + delayed = self._optional_dup() + # for a function like this, what's the difference between `map_blocks` and `elemwise`? + if self.ndim == 0: + return self.__class__( + delayed.map_blocks( + _clear, + dtype=np_dtype(self.dtype), + ) + ) + else: + return self.__class__( + delayed.map_blocks( + _clear, + dtype=np_dtype(self.dtype), + ), + nvals=0, + ) + def clear(self): + if is_DOnion(self._delayed): + self.__init__(self._delayed.getattr(self._meta, "_clear"), meta=self._meta, nvals=0) + return + # Should we copy and mutate or simply create new chunks? delayed = self._optional_dup() # for a function like this, what's the difference between `map_blocks` and `elemwise`? @@ -110,6 +217,13 @@ def clear(self): ) def dup(self, dtype=None, *, mask=None, name=None): + if any_dOnions(self, mask): + meta = self._meta.dup(dtype=dtype) + donion = DOnion.multi_access( + meta, self.__class__.dup, self, dtype=dtype, mask=mask, name=name + ) + return self.__class__(donion, meta=meta) + if mask is not None: if not isinstance(mask, Mask): self._meta.dup(dtype=dtype, mask=mask, name=name) # should raise @@ -188,6 +302,9 @@ def __call__( __imatmul__ = gb.base.BaseType.__imatmul__ def _optional_dup(self): + if self.is_dOnion: + return DOnion.multi_access(self._meta, _dOnion_dup, self) + # TODO: maybe try to create an optimization pass that remove these if they are unnecessary return da.core.elemwise( _optional_dup, @@ -210,16 +327,22 @@ def compute_and_store_nvals(self): def nvals(self): from .scalar import PythonScalar + if self.is_dOnion: + donion = DOnion.multi_access(self._meta.nvals, getattr, self, "nvals") + return PythonScalar(donion) + delayed = da.core.elemwise( _nvals, self._delayed, dtype=int, ) if self._delayed.ndim > 0: - delayed = da.core.elemwise( - _to_scalar, - delayed.sum(), - int, + delayed = da.core.blockwise( + *(_to_scalar, ()), + *(delayed.sum(), None), + *(int, None), + dtype=np.int_, + meta=wrap_inner(gb.Scalar.new(int)), ) return PythonScalar(delayed) @@ -239,7 +362,7 @@ def _name_html(self): return self.name return f"{split[0]}{split[1]}" - def update(self, expr): + def update(self, expr, in_dOnion=False): if isinstance(expr, Number): if self.ndim == 2: raise TypeError( @@ -250,12 +373,75 @@ def update(self, expr): "If you do wish to make a dense matrix, then please be explicit:" "\n\n M[:, :] = s" ) + typ = type(expr) + if any_dOnions(self, expr): + self_copy = self.__class__(self._optional_dup(), meta=self._meta) + expr_ = expr + if isinstance(expr, AmbiguousAssignOrExtract) and expr.has_dOnion: + + def update_by_aae(c, p, k_0, k_1): + keys = k_0 if k_1 is None else (k_0, k_1) + return c.update(p[keys], in_dOnion=True) + + if _is_pair(expr_.index): + keys_0, keys_1 = expr_.index[0], 
expr_.index[1] + else: + keys_0, keys_1 = expr_.index, None + + donion = DOnion.multi_access( + self._meta, + update_by_aae, + self_copy, + expr_.parent, + *(keys_0, keys_1), + ) + self.__init__(donion, self._meta) + return + + if isinstance(expr, GbDelayed) and expr.has_dOnion: + + def update_by_gbd(c, *args, **kwargs): + gbd = getattr(args[0], args[1])(*args[2:], **kwargs) + return c.update(gbd, in_dOnion=True) + + donion = DOnion.multi_access( + self._meta, + update_by_gbd, + self_copy, + expr_.parent, + expr_.method_name, + *expr_.args, + **expr_.kwargs, + ) + self.__init__(donion, self._meta) + return + + elif typ is TransposedMatrix and expr.is_dOnion: + + donion = DOnion.multi_access( + self._meta, BaseType.update, self_copy, expr_, in_dOnion=True + ) + self.__init__(donion, self._meta) + return + + donion = DOnion.multi_access( + self._meta, BaseType.update, self_copy, expr_, in_dOnion=True + ) + self.__init__(donion, self._meta) + return + + if typ is Box: + expr = expr.content + typ = type(expr) + + if isinstance(expr, Number): Updater(self)[...] << expr + if in_dOnion: + return self.__class__(self._delayed, meta=self._meta) return - self._meta.update(expr._meta) + self._meta.clear() - typ = type(expr) - if typ is AmbiguousAssignOrExtract: + if isinstance(expr, AmbiguousAssignOrExtract): # Extract (w << v[index]) # Is it safe/reasonable to simply replace `_delayed`? # Should we try to preserve e.g. format or partitions? @@ -266,25 +452,100 @@ def update(self, expr): self.__init__(expr._optional_dup()) else: self.__init__(expr.dup(dtype=self.dtype)._delayed) - elif typ is GbDelayed: + elif isinstance(expr, GbDelayed): expr._update(self) elif typ is TransposedMatrix: # "C << A.T" - C = expr.new() + C = expr.new(dtype=self.dtype) self.__init__(C._delayed) + elif typ is type(None): # noqa + raise TypeError("Assignment value must be a valid expression") else: # Anything else we need to handle? 
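+            # e.g. a raw grblas object or a plain list/ndarray lands here;
+            # failing loudly seems safer than guessing an interpretation.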
- raise TypeError() + raise NotImplementedError() + if in_dOnion: + return self.__class__(self._delayed, meta=self._meta) + + def _update(self, expr, *, mask=None, accum=None, replace=None, in_dOnion=False): + typ = type(expr) + if any_dOnions(self, expr, mask): + self_copy = self.__class__(self._optional_dup(), meta=self._meta) + mask_ = mask.dOnion_if if mask is not None else None + expr_ = expr + if isinstance(expr, AmbiguousAssignOrExtract) and expr.has_dOnion: + + def _update_by_aae(c, p, k_0, k_1, mask=None, accum=None, replace=None): + keys = k_0 if k_1 is None else (k_0, k_1) + return c.update( + p[keys], mask=mask, accum=accum, replace=replace, in_dOnion=True + ) + + if _is_pair(expr_.index): + keys_0, keys_1 = expr_.index[0], expr_.index[1] + else: + keys_0, keys_1 = expr_.index, None + + donion = DOnion.multi_access( + self._meta, + _update_by_aae, + self_copy, + expr_.parent, + *(keys_0, keys_1), + mask=mask_, + accum=accum, + replace=replace, + ) + self.__init__(donion, self._meta) + return + + if isinstance(expr, GbDelayed) and expr.has_dOnion: + + def _update_by_gbd(c, *args, mask=None, accum=None, replace=None, **kwargs): + gbd = getattr(args[0], args[1])(*args[2:], **kwargs) + return c._update(gbd, mask=mask, accum=accum, replace=replace, in_dOnion=True) + + donion = DOnion.multi_access( + self._meta, + _update_by_gbd, + self_copy, + expr_.parent, + expr_.method_name, + *expr_.args, + mask=mask_, + accum=accum, + replace=replace, + **expr_.kwargs, + ) + self.__init__(donion, self._meta) + return + + donion = DOnion.multi_access( + self._meta, + BaseType._update, + self_copy, + expr_, + mask=mask_, + accum=accum, + replace=replace, + in_dOnion=True, + ) + self.__init__(donion, meta=self._meta) + return + + if typ is Box: + expr = expr.content + typ = type(expr) - def _update(self, expr, *, mask=None, accum=None, replace=None): if mask is None and accum is None: self.update(expr) + if in_dOnion: + return self return - typ = type(expr) - if typ is AmbiguousAssignOrExtract: + if isinstance(expr, AmbiguousAssignOrExtract): # Extract (w(mask=mask, accum=accum) << v[index]) + expr_new = expr.new(dtype=self.dtype) + expr_delayed = expr_new._delayed delayed = self._optional_dup() - expr_delayed = expr.new(dtype=self.dtype)._delayed self._meta(mask=get_meta(mask), accum=accum, replace=replace) if mask is not None: delayed_mask = mask.mask._delayed @@ -304,7 +565,7 @@ def _update(self, expr, *, mask=None, accum=None, replace=None): dtype=np_dtype(self._meta.dtype), ) ) - elif typ is GbDelayed: + elif isinstance(expr, GbDelayed): # v(mask=mask) << left.ewise_mult(right) # Meta check handled in Updater expr._update(self, mask=mask, accum=accum, replace=replace) @@ -333,6 +594,9 @@ def _update(self, expr, *, mask=None, accum=None, replace=None): else: raise NotImplementedError(f"{typ}") + if in_dOnion: + return self.__class__(self._delayed, meta=self._meta) + def wait(self): # TODO: What should this do? 
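+        # grblas' wait() blocks until pending work on an object has finished;
+        # the dask analogue would presumably be persist()/compute(), so for
+        # now only the meta object is waited on.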
         self._meta.wait()
 
@@ -340,6 +604,8 @@ def wait(self):
     def compute(self, *args, **kwargs):
         # kwargs['scheduler'] = 'synchronous'
         val = self._delayed.compute(*args, **kwargs)
+        if self.is_dOnion:
+            return val
         return val.value
 
     def persist(self, *args, **kwargs):
@@ -349,6 +615,276 @@ def visualize(self, *args, **kwargs):
         return self._delayed.visualize(*args, **kwargs)
 
 
+class Box:
+    """
+    A wrapper around the inner values of an Array object that
+    prevents dask from post-processing the Array at the end of
+    compute()
+    """
+
+    def __init__(self, content):
+        self.content = content
+
+    def __getattr__(self, item):
+        return getattr(self.content, item)
+
+
+const_obj = object()
+_const0_DOnion = {"dtype": np.object_, "meta": np.array(const_obj, dtype=np.object_)}
+
+
+class DOnion:
+    """
+    Dask (or Delayed) Onion (DOnion):
+
+    Encapsulates a dask array whose inner value is also a dask array.
+    Intended to be used in cases where the size of the inner dask
+    array (the seed) depends on the inner value of another dask array
+    (the shroud).
+    """
+
+    is_dOnion = True
+
+    @classmethod
+    def sprout(cls, shroud, seed_meta, seed_func, *args, **kwargs):
+        """
+        Develop a DOnion from the dask arrays listed in `shroud`, using the
+        function `seed_func`.
+
+        Return dask.array.map_blocks(seed_func, shroud) as a DOnion.
+
+        :shroud: a dask array; or an iterable of such dask arrays; or a tuple (x, y)
+            where x and y are respectively a list of dask arrays and a dict of named
+            dask arrays.  The inner values of these arrays determine the seed dask
+            array (in particular, its size).
+        :seed_meta: an empty instance of the inner-value type of the seed
+        :seed_func: the function that takes as input the inner value(s) of `shroud`
+            and returns another dask array (the seed)
+        :args: tuple of arguments to `seed_func`.  May contain one or more `skip`
+            sentinels denoting vacant positions to be taken up by the inner values
+            of the dask arrays in `shroud`.
+        :kwargs: dict of keyword arguments to `seed_func`
+        """
+        named_shrouds = {}
+        if is_dask_collection(shroud):
+            shroud = [shroud]
+        else:
+            if isinstance(shroud, Iterable):
+                if len(shroud) > 0:
+                    if (
+                        len(shroud) == 2
+                        and isinstance(shroud[0], Iterable)
+                        and isinstance(shroud[1], dict)
+                    ):
+                        # unpack the (x, y) form: grab the dict before rebinding `shroud`
+                        named_shrouds = shroud[1]
+                        shroud = shroud[0]
+                else:
+                    raise ValueError("`shroud` must contain at least one dask array!")
+            else:
+                raise ValueError(
+                    "`shroud` must be a dask array; a list x of dask arrays or "
+                    "a dict y of named dask arrays; or a tuple of both: (x, y)"
+                )
+
+        seed_func = flexible_partial(seed_func, *args, **kwargs)
+        kernel = da.map_blocks(seed_func, *shroud, **named_shrouds, **_const0_DOnion)
+        return DOnion(kernel, meta=seed_meta)
+
+    def __init__(self, kernel, meta=None):
+        self.kernel = kernel
+        # Why have ._meta and .dtype attributes?  Because Scalar, Vector & Matrix need them.
+        self._meta = meta
+        self.dtype = getattr(meta, "dtype", type(meta))
+
+    def __eq__(self, other):
+        if like_dOnion(other):
+            other = other.compute()
+        return self.compute() == other
+
+    def compute(self, *args, **kwargs):
+        value = self.kernel.compute(*args, **kwargs)
+        while hasattr(value, "compute"):
+            value = value.compute(*args, **kwargs)
+        if type(value) is Box:
+            value = value.content
+        return value
+
+    def compute_once(self, *args, **kwargs):
+        value = self.kernel.compute(*args, **kwargs)
+        if type(value) is Box:
+            value = value.content
+        return value
+
+    def strip(self, *args, **kwargs):
+        value = self.compute_once(*args, **kwargs)
+        while like_dOnion(value):
+            if type(value) is DOnion:
+                value = value.compute_once(*args, **kwargs)
+            else:
+                value = value._delayed.compute_once(*args, **kwargs)
+        return value
+
+    def persist(self, *args, **kwargs):
+        value = self.strip(*args, **kwargs)
+        if hasattr(value, "persist"):
+            return value.persist(*args, **kwargs)
+        else:
+            raise AttributeError(
+                f"Something went wrong: stripped dOnion {self} value {value} has"
+                " no `persist()` attribute."
+            )
+
+    def _persist(self, *args, **kwargs):
+        value = self.strip(*args, **kwargs)
+        if hasattr(value, "_persist"):
+            value._persist(*args, **kwargs)
+            return value._delayed
+        else:
+            raise AttributeError(
+                f"Something went wrong: stripped dOnion {self} value {value} has"
+                " no `_persist()` attribute."
+            )
+
+    @classmethod
+    def multi_access(cls, out_meta, func, *args, **kwargs):
+        def adaptor(func, ts, cs, ss, vs, kwargs_desc, *args, **kwargs):
+            args_ = ()
+            for arg, t, c, s, v in zip(args, ts, cs, ss, vs):
+                if type(arg) is Box:
+                    arg = arg.content
+                if t:
+                    arg = arg.T
+                if s:
+                    arg = arg.S
+                if v:
+                    arg = arg.V
+                if c:
+                    arg = arg.__invert__()
+                args_ += (arg,)
+
+            kwargs_ = kwargs.copy()
+            for k in kwargs:
+                t, c, s, v = kwargs_desc[k]
+                if t:
+                    kwargs_[k] = kwargs_[k].T
+                if s:
+                    kwargs_[k] = kwargs_[k].S
+                if v:
+                    kwargs_[k] = kwargs_[k].V
+                if c:
+                    kwargs_[k] = kwargs_[k].__invert__()
+
+            return func(*args_, **kwargs_)
+
+        _args = [getattr(arg, "dOnion_if", arg) for arg in args]
+        ts = [
+            getattr(arg, "is_dOnion", False) and getattr(arg, "_is_transposed", False)
+            for arg in args
+        ]
+        cs = [
+            getattr(arg, "is_dOnion", False)
+            and isinstance(arg, Mask)
+            and getattr(arg, "complement", False)
+            for arg in args
+        ]
+        ss = [
+            getattr(arg, "is_dOnion", False)
+            and isinstance(arg, Mask)
+            and getattr(arg, "structure", False)
+            for arg in args
+        ]
+        vs = [
+            getattr(arg, "is_dOnion", False)
+            and isinstance(arg, Mask)
+            and getattr(arg, "value", False)
+            for arg in args
+        ]
+
+        _kwargs = {k: getattr(arg, "dOnion_if", arg) for k, arg in kwargs.items()}
+
+        kwargs_desc = {
+            k: (
+                getattr(arg, "is_dOnion", False) and getattr(arg, "_is_transposed", False),
+                getattr(arg, "is_dOnion", False)
+                and isinstance(arg, Mask)
+                and getattr(arg, "complement", False),
+                getattr(arg, "is_dOnion", False)
+                and isinstance(arg, Mask)
+                and getattr(arg, "structure", False),
+                getattr(arg, "is_dOnion", False)
+                and isinstance(arg, Mask)
+                and getattr(arg, "value", False),
+            )
+            for k, arg in kwargs.items()
+        }
+        return DOnion.multiple_access(
+            out_meta, adaptor, func, ts, cs, ss, vs, kwargs_desc, *_args, **_kwargs
+        )
+
+    @classmethod
+    def multiple_access(cls, out_meta, func, *args, **kwargs):
+        """
+        Pass inner values of any DOnions in `args` and/or `kwargs` into `func`.
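+
+        Non-DOnion arguments are bound to `func` up front (via `flexible_partial`);
+        the kernels of any DOnion arguments are handed to `da.map_blocks`, so that
+        `func` is applied only once their inner values exist.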
+
+        :func: Callable that can accept the contents of `args` and/or `kwargs`
+            as parameters
+        :args: a list of positional arguments to `func`
+        :kwargs: a dict of named arguments to `func`
+        """
+        # First, pass non-DOnion args and kwargs to func:
+        skip_Donions = [arg if not is_DOnion(arg) else skip for arg in args]
+        non_DOnion_kwargs = {k: v for (k, v) in kwargs.items() if not is_DOnion(v)}
+        func = flexible_partial(func, *skip_Donions, **non_DOnion_kwargs)
+
+        # Next, pass func and DOnion args and kwargs to map_blocks:
+        donion_args = tuple(arg.kernel for arg in args if is_DOnion(arg))
+        donion_kwargs = {k: v.kernel for (k, v) in kwargs.items() if is_DOnion(v)}
+        kernel = da.map_blocks(func, *donion_args, **donion_kwargs, **_const0_DOnion)
+        return DOnion(kernel, meta=out_meta)
+
+    def deep_extract(self, out_meta, func, *args, **kwargs):
+        func = flexible_partial(func, *args, **kwargs)
+        if not isinstance(
+            out_meta, (np.ndarray, gb.base.BaseType, gb.mask.Mask, gb.matrix.TransposedMatrix)
+        ):
+            func = compose(Box, func)
+        kernel = self.kernel.map_blocks(func, **_const0_DOnion)
+        return DOnion(kernel, meta=out_meta)
+
+    def __call__(self, *args, **kwargs):
+        meta = self._meta(*args, **kwargs)
+        return self.getattr(meta, "__call__", *args, **kwargs)
+
+    def __getattr__(self, item):
+        try:
+            meta = getattr(self._meta, item, getattr(self.kernel, item))
+        except AttributeError:
+            raise AttributeError(f"Unable to compute meta corresponding to attribute {item}.")
+        _getattr = flexible_partial(getattr, skip, item)
+        return self.deep_extract(meta, _getattr)
+
+    def getattr(self, meta, attr_name, *args, **kwargs):
+        _getattr = flexible_partial(DOnion._getattr, skip, attr_name, *args, **kwargs)
+        return self.deep_extract(meta, _getattr)
+
+    @classmethod
+    def _getattr(cls, x, attr_name, *args, **kwargs):
+        return getattr(x, attr_name)(*args, **kwargs)
+
+
+is_DOnion = partial(is_type, DOnion)
+
+
+def like_dOnion(arg):
+    return arg is not None and (
+        is_DOnion(arg) or getattr(arg, "is_dOnion", False) or getattr(arg, "has_dOnion", False)
+    )
+
+
+def any_dOnions(*args, **kwargs):
+    return np.any([like_dOnion(arg) for arg in args]) or np.any(
+        [like_dOnion(v) for _, v in kwargs.items()]
+    )
+
+
 # Dask task functions
 def _clear(x):
     x.value.clear()
@@ -361,13 +897,17 @@ def _dup(x, mask, dtype, mask_type):
     return wrap_inner(x.value.dup(dtype=dtype, mask=mask))
 
 
-def _isclose(x, y, rel_tol, abs_tol, check_dtype):
-    val = x.value.isclose(y.value, rel_tol=rel_tol, abs_tol=abs_tol, check_dtype=check_dtype)
+def _isclose(xt, yt, x, y, rel_tol, abs_tol, check_dtype):
+    x_ = x.value.T if xt else x.value
+    y_ = y.value.T if yt else y.value
+    val = x_.isclose(y_, rel_tol=rel_tol, abs_tol=abs_tol, check_dtype=check_dtype)
     return _reduction_value(x, val)
 
 
-def _isequal(x, y, check_dtype):
-    val = x.value.isequal(y.value, check_dtype=check_dtype)
+def _isequal(xt, yt, x, y, check_dtype):
+    x_ = x.value.T if xt else x.value
+    y_ = y.value.T if yt else y.value
+    val = x_.isequal(y_, check_dtype=check_dtype)
     return _reduction_value(x, val)
 
 
@@ -380,6 +920,10 @@ def _optional_dup(x):
     return wrap_inner(x.value.dup())
 
 
+def _dOnion_dup(x):
+    return x.dup()
+
+
def _reduction_value(x, val):
    """Helper function used when reducing objects to scalars such as for `isclose`"""
    if x.ndim == 0:
@@ -402,5 +946,5 @@ def _update_assign(updating, accum, mask, mask_type, replace, x):
     return updating
 
 
-from .expr import AmbiguousAssignOrExtract, GbDelayed, Updater  # noqa isort: skip
+from .expr import AmbiguousAssignOrExtract, 
GbDelayed, Updater, _is_pair # noqa isort: skip from .matrix import TransposedMatrix # noqa isort: skip diff --git a/dask_grblas/expr.py b/dask_grblas/expr.py index e226ef0..7d59c70 100644 --- a/dask_grblas/expr.py +++ b/dask_grblas/expr.py @@ -8,15 +8,18 @@ from grblas.exceptions import DimensionMismatch from dask.base import tokenize +from dask.highlevelgraph import HighLevelGraph -from .base import BaseType, InnerBaseType, _check_mask +from .base import BaseType, InnerBaseType, _check_mask, DOnion, is_DOnion, any_dOnions from .mask import Mask from .utils import ( get_grblas_type, + get_inner_type, get_meta, get_return_type, np_dtype, wrap_inner, + flatten, build_chunk_offsets_dask_array, build_chunk_ranges_dask_array, build_slice_dask_array_from_chunks, @@ -24,7 +27,8 @@ class GbDelayed: - def __init__(self, parent, method_name, *args, meta, **kwargs): + def __init__(self, parent, method_name, *args, meta=None, **kwargs): + self.has_dOnion = any_dOnions(parent, *args) self.parent = parent self.method_name = method_name self.args = args @@ -32,13 +36,8 @@ def __init__(self, parent, method_name, *args, meta, **kwargs): self._meta = meta # InfixExpression and Aggregator requirements: self.dtype = meta.dtype - self.output_type = meta.output_type - self.ndim = len(meta.shape) - if self.ndim == 1: - self._size = meta.size - elif self.ndim == 2: - self._nrows = meta.nrows - self._ncols = meta.ncols + # autocompute requirements: + self._value = None def _matmul(self, meta, mask=None): left_operand = self.parent @@ -125,39 +124,94 @@ def _matmul2(self, meta, mask=None): op = self.args[1] sum_meta = wrap_inner(meta) - if mask is None: - out = da.core.blockwise( - partial(_matmul2, op, meta.dtype, at, bt), - out_ind, - a, - lhs_ind, - b, - rhs_ind, - adjust_chunks={compress_axis: 1}, - dtype=np.result_type(a, b), - concatenate=False, - meta=FakeInnerTensor(meta, compress_axis), - ) + if op.is_positional: + _, (a, b) = da.core.unify_chunks(a, lhs_ind, b, rhs_ind) + x = build_chunk_ranges_dask_array(a, 0, "row-ranges-" + tokenize(a, 0)) + a_ranges = (x, (lhs_ind[0],)) + if a.ndim == 2: + x = build_chunk_ranges_dask_array(a, 1, "col-ranges-" + tokenize(a, 1)) + a_ranges += (x, (lhs_ind[1],)) + + x = build_chunk_ranges_dask_array(b, 0, "row-ranges-" + tokenize(b, 0)) + b_ranges = (x, (rhs_ind[0],)) + if b.ndim == 2: + x = build_chunk_ranges_dask_array(b, 1, "col-ranges-" + tokenize(b, 1)) + b_ranges += (x, (rhs_ind[1],)) + + if mask is None: + matmul_pos = partial( + _matmul2_positional, + op, + meta.dtype, + at, + bt, + a.shape, + b.shape, + ) + out = da.core.blockwise( + *(matmul_pos, out_ind), + *(a, lhs_ind), + *(b, rhs_ind), + *(a_ranges + b_ranges), + adjust_chunks={compress_axis: 1}, + dtype=np.result_type(a, b), + concatenate=False, + meta=FakeInnerTensor(meta, compress_axis), + ) + else: + m = mask.mask._delayed + grblas_mask_type = get_grblas_type(mask) + mask_ind = list(out_ind) + mask_ind.remove(compress_axis) + mask_ind = tuple(mask_ind) + out = da.core.blockwise( + partial(_matmul2_masked, op, meta.dtype, at, bt, grblas_mask_type), + out_ind, + m, + mask_ind, + a, + lhs_ind, + b, + rhs_ind, + adjust_chunks={compress_axis: 1}, + dtype=np.result_type(a, b), + concatenate=False, + meta=FakeInnerTensor(meta, compress_axis), + ) else: - m = mask.mask._delayed - grblas_mask_type = get_grblas_type(mask) - mask_ind = list(out_ind) - mask_ind.remove(compress_axis) - mask_ind = tuple(mask_ind) - out = da.core.blockwise( - partial(_matmul2_masked, op, meta.dtype, at, bt, grblas_mask_type), - 
out_ind, - m, - mask_ind, - a, - lhs_ind, - b, - rhs_ind, - adjust_chunks={compress_axis: 1}, - dtype=np.result_type(a, b), - concatenate=False, - meta=FakeInnerTensor(meta, compress_axis), - ) + if mask is None: + out = da.core.blockwise( + partial(_matmul2, op, meta.dtype, at, bt), + out_ind, + a, + lhs_ind, + b, + rhs_ind, + adjust_chunks={compress_axis: 1}, + dtype=np.result_type(a, b), + concatenate=False, + meta=FakeInnerTensor(meta, compress_axis), + ) + else: + m = mask.mask._delayed + grblas_mask_type = get_grblas_type(mask) + mask_ind = list(out_ind) + mask_ind.remove(compress_axis) + mask_ind = tuple(mask_ind) + out = da.core.blockwise( + partial(_matmul2_masked, op, meta.dtype, at, bt, grblas_mask_type), + out_ind, + m, + mask_ind, + a, + lhs_ind, + b, + rhs_ind, + adjust_chunks={compress_axis: 1}, + dtype=np.result_type(a, b), + concatenate=False, + meta=FakeInnerTensor(meta, compress_axis), + ) # out has an extra dimension (a slab or a bar), and now reduce along it out = sum_by_monoid(op.monoid, out, axis=compress_axis, meta=sum_meta) @@ -180,26 +234,24 @@ def _reduce_along_axis(self, axis, dtype): return delayed def _reduce_scalar(self, dtype): - assert not self.kwargs op = self.args[0] at = self.parent._is_transposed delayed = self.parent._matrix._delayed if at else self.parent._delayed delayed = da.reduction( delayed, - partial(_reduce_scalar, op, dtype), - partial(_reduce_combine, op), + partial(_reduce_scalar, op, dtype, **self.kwargs), + partial(_reduce_combine, op, **self.kwargs), concatenate=False, dtype=np_dtype(dtype), ) return delayed def _reduce(self, dtype): - assert not self.kwargs op = self.args[0] delayed = da.reduction( self.parent._delayed, - partial(_reduce, op, dtype), - partial(_reduce_combine, op), + partial(_reduce, op, dtype, **self.kwargs), + partial(_reduce_combine, op, **self.kwargs), concatenate=False, dtype=np_dtype(dtype), ) @@ -228,17 +280,128 @@ def _aggregate( op._new(updater, self) return output + def _kronecker(self, a, b, op, meta): + a = a.rechunk(chunks=1) + frag = da.core.blockwise( + *(partial(_kronecker, a._is_transposed, b._is_transposed), "ijMN"), + *((a._matrix._delayed, "ji") if a._is_transposed else (a._delayed, "ij")), + *((b._matrix._delayed, "NM") if b._is_transposed else (b._delayed, "MN")), + *(op, None), + dtype=np_dtype(meta.dtype), + meta=wrap_inner(meta), + ) + + name = "kronecker-" + tokenize(a, b) + b_ = b._matrix._delayed if b._is_transposed else b._delayed + + out_chunks = () + for axis in range(2): + out_chunks += (b_.chunks[axis] * a.shape[axis],) + + dsk = dict() + for i in range(a.shape[0]): + for j in range(a.shape[1]): + for M in range(b_.numblocks[0]): + for N in range(b_.numblocks[1]): + + dsk[(name, i * b_.numblocks[0] + M, j * b_.numblocks[1] + N)] = ( + lambda x: x, + (frag.name, i, j, M, N), + ) + + graph = HighLevelGraph.from_collections(name, dsk, dependencies=[frag]) + out = da.core.Array(graph, name, out_chunks, meta=wrap_inner(meta)) + return out + def new(self, dtype=None, *, mask=None, name=None): + _check_mask(mask, ignore_None=True) + + if any_dOnions(self, mask): + + def recall_GbDelayed_new(p, m, *args, dtype=None, mask=None, **kwargs): + gbd = getattr(p, m)(*args, **kwargs) + return gbd.new(dtype=dtype, mask=mask) + + meta_args = list(getattr(v, "_meta", v) for v in self.args) + meta_kwargs = {k: getattr(v, "_meta", v) for k, v in self.kwargs.items()} + if self.method_name.startswith(("reduce", "apply")): + # unary operations + a = self.parent + op = self.args[0] + if self.method_name == 
"apply": + # grblas `apply()` does not like empty Scalars! + if len(meta_args) > 1 and type(meta_args[1]) is gb.Scalar: + meta_args[1] = gb.Scalar.from_value(1, dtype=meta_args[1].dtype) + if len(meta_args) > 2 and type(meta_args[2]) is gb.Scalar: + meta_args[2] = gb.Scalar.from_value(1, dtype=meta_args[2].dtype) + if "left" in meta_kwargs and type(meta_kwargs["left"]) is gb.Scalar: + meta_kwargs["left"] = gb.Scalar.from_value( + 1, dtype=meta_kwargs["left"].dtype + ) + if "right" in meta_kwargs and type(meta_kwargs["right"]) is gb.Scalar: + meta_kwargs["right"] = gb.Scalar.from_value( + 1, dtype=meta_kwargs["right"].dtype + ) + elif self.method_name.startswith("reduce"): + # grblas bug occurs when shape is (0, 0) + if a._meta.shape == (0,) * a.ndim: + a._meta.resize(*((1,) * a.ndim)) + meta = getattr(a._meta, self.method_name)(*meta_args, **meta_kwargs).new( + dtype=dtype + ) + meta.clear() + else: + # binary operations + a = self.parent + b = self.args[0] + op = self.args[1] + + try: + meta = getattr(a._meta, self.method_name)(b._meta, op=op, **meta_kwargs).new( + dtype=dtype + ) + except DimensionMismatch: + if self.method_name == "mxm": + b_meta = gb.Matrix.new( + dtype=b._meta.dtype, nrows=a._meta.ncols, ncols=b._meta.ncols + ) + elif self.method_name == "vxm": + b_meta = gb.Matrix.new( + dtype=b._meta.dtype, nrows=a._meta.size, ncols=b._meta.ncols + ) + elif self.method_name == "mxv": + b_meta = gb.Vector.new(dtype=b._meta.dtype, size=a._meta.ncols) + + elif self.method_name in ("ewise_add", "ewise_mult"): + b_meta = a._meta.dup(dtype=b._meta.dtype) + + meta = getattr(a._meta, self.method_name)(b_meta, op=op, **meta_kwargs).new( + dtype=dtype + ) + + donion = DOnion.multi_access( + meta, + recall_GbDelayed_new, + self.parent, + self.method_name, + *self.args, + dtype=dtype, + mask=mask, + **self.kwargs, + ) + return get_return_type(meta)(donion, meta=meta) + + # no dOnions + meta = self._meta.new(dtype=dtype) if mask is not None: - _check_mask(mask) - meta = self._meta.new(dtype=dtype, mask=mask._meta) delayed_mask = mask.mask._delayed grblas_mask_type = get_grblas_type(mask) else: - meta = self._meta.new(dtype=dtype) delayed_mask = None grblas_mask_type = None + meta.clear() + if self.method_name.startswith("reduce"): op = self._meta.op if op is not None and op.opclass == "Aggregator": @@ -261,20 +424,37 @@ def new(self, dtype=None, *, mask=None, name=None): ) for key in self.kwargs } - delayed = da.core.elemwise( - _expr_new, - self.method_name, - dtype, - grblas_mask_type, - self_kwargs, - self.parent._delayed, - delayed_mask, - *[x._delayed if isinstance(x, BaseType) else x for x in self.args], + pt = getattr(self.parent, "_is_transposed", False) + xts = [getattr(arg, "_is_transposed", False) for arg in self.args] + axes = "ij" if self.parent.ndim == 2 else "i" + delayed = da.core.blockwise( + *(partial(_expr_new, pt, xts), axes), + *(self.method_name, None), + *(dtype, None), + *(grblas_mask_type, None), + *( + (self.parent._matrix._delayed, axes[::-1]) + if pt + else (self.parent._delayed, axes) + ), + *(delayed_mask, (None if mask is None else axes)), + *flatten( + ( + (x._matrix._delayed, axes[::-1]) + if xt + else (x._delayed, (None if x._is_scalar else axes)) + ) + if isinstance(x, BaseType) or getattr(x, "_is_transposed", False) + else (x, None) + for x, xt in zip(self.args, xts) + ), + **self_kwargs, dtype=np_dtype(meta.dtype), ) elif self.method_name in {"vxm", "mxv", "mxm"}: - # TODO: handle dtype and mask delayed = self._matmul2(meta, mask=mask) + elif self.method_name 
== "kronecker": + delayed = self._kronecker(self.parent, self.args[0], self.args[1], meta) else: raise ValueError(self.method_name) return get_return_type(meta)(delayed) @@ -381,6 +561,9 @@ def _update(self, updating, *, mask=None, accum=None, replace=None): delayed = self._matmul2(meta, mask=mask) updating(mask=mask, accum=accum, replace=replace) << get_return_type(meta)(delayed) return + elif self.method_name == "kronecker": + updating(mask=mask, accum=accum, replace=replace) << self.new() + return else: raise ValueError(self.method_name) updating.__init__(delayed) @@ -420,14 +603,24 @@ def _new_matrix(self, dtype, nrows=0, ncols=0, *, name=None): class IndexerResolver: - def __init__(self, obj, indices): + __slots__ = "obj", "indices", "is_dOnion", "shape" + + def __init__(self, obj, indices, check_shape=True): + index_is_dOnion = obj.ndim == 1 and is_DOnion(indices) + index_is_dOnion = index_is_dOnion or ( + obj.ndim == 2 and _is_pair(indices) and (is_DOnion(indices[0]) or is_DOnion(indices[1])) + ) + self.is_dOnion = index_is_dOnion + check_shape = not (index_is_dOnion or obj.is_dOnion) + self.obj = obj if indices is Ellipsis: from .vector import Vector - if type(obj) is Vector: + if type(obj) in {Vector, gb.Vector}: normalized = slice(None).indices(obj._size) self.indices = [AxisIndex(obj._size, slice(*normalized))] + self.shape = (obj._size,) else: normalized0 = slice(None).indices(obj._nrows) normalized1 = slice(None).indices(obj._ncols) @@ -435,8 +628,14 @@ def __init__(self, obj, indices): AxisIndex(obj._nrows, slice(*normalized0)), AxisIndex(obj._ncols, slice(*normalized1)), ] + self.shape = (obj._nrows, obj._ncols) else: - self.indices = self.parse_indices(indices, obj.shape) + if not check_shape and hasattr(obj, "_meta"): + shape = obj._meta.shape + else: + shape = obj.shape + self.indices = self.parse_indices(indices, shape, check_shape) + self.shape = tuple(index.size for index in self.indices if index.size is not None) @property def is_single_element(self): @@ -445,7 +644,7 @@ def is_single_element(self): return False return True - def parse_indices(self, indices, shape): + def parse_indices(self, indices, shape, check_shape=True): """ Returns [(rows, rowsize), (cols, colsize)] for Matrix @@ -469,24 +668,41 @@ def parse_indices(self, indices, shape): raise TypeError( f"Index in position {i} cannot be a tuple; must use slice or list or int" ) - out.append(self.parse_index(idx, typ, shape[i])) + out.append(self.parse_index(idx, typ, shape[i], check_shape)) return out - def parse_index(self, index, typ, size): + def parse_index(self, index, typ, size, check_shape=True): if np.issubdtype(typ, np.integer): if index >= size: - raise IndexError(f"Index out of range: index={index}, size={size}") + if check_shape: + raise IndexError(f"Index out of range: index={index}, size={size}") if index < 0: index += size if index < 0: - raise IndexError(f"Index out of range: index={index - size}, size={size}") - return AxisIndex(None, IndexerResolver.normalize_index(index, size)) + if check_shape: + raise IndexError(f"Index out of range: index={index - size}, size={size}") + return AxisIndex(None, IndexerResolver.normalize_index(index, size, check_shape)) + + def compute_scalar(index): + from .scalar import Scalar, PythonScalar + + if type(index) is Scalar: + return index.value.compute() + if type(index) is PythonScalar: + return index.compute() + return index + if typ is list: - index = [IndexerResolver.normalize_index(i, size) for i in index] + index = [ + 
IndexerResolver.normalize_index(compute_scalar(i), size, check_shape) for i in index + ] return AxisIndex(len(index), index) elif typ is slice: - normalized = index.indices(size) - return AxisIndex(len(range(*normalized)), slice(*normalized)) + if check_shape: + normalized = index.indices(size) + return AxisIndex(len(range(*normalized)), slice(*normalized)) + else: + return AxisIndex(0, index) elif typ in {np.ndarray, da.Array}: if len(index.shape) != 1: @@ -494,13 +710,17 @@ def parse_index(self, index, typ, size): if not np.issubdtype(index.dtype, np.integer): raise TypeError(f"Invalid dtype for index: {index.dtype}") return AxisIndex(index.shape[0], index) + + elif is_DOnion(index): + return AxisIndex(0, index) + else: - from .scalar import Scalar + from .scalar import Scalar, PythonScalar - if typ is Scalar: + if typ in {Scalar, PythonScalar}: if index.dtype.name.startswith("F"): raise TypeError(f"An integer is required for indexing. Got: {index.dtype}") - index = index.value.compute() + index = index.value.compute() if typ is Scalar else index.compute() return AxisIndex(None, IndexerResolver.normalize_index(index, size)) from .matrix import Matrix, TransposedMatrix @@ -530,7 +750,7 @@ def parse_index(self, index, typ, size): f"`x(mask={index.name}) << value`." ) raise TypeError(f"Invalid type for index: {typ}; unable to convert to list") - index = [IndexerResolver.normalize_index(i, size) for i in index] + index = [IndexerResolver.normalize_index(i, size, check_shape) for i in index] return AxisIndex(len(index), index) def get_index(self, dim): @@ -548,39 +768,52 @@ def validate_types(cls, indices): return @classmethod - def normalize_index(cls, index, size): + def normalize_index(cls, index, size, check_size=True): if type(index) is get_return_type(gb.Scalar.new(int)): # This branch needs a second look: How to work with the lazy index? 
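+            # grblas needs a concrete Python int here, so a dask-backed Scalar
+            # index is computed eagerly; staying lazy would mean routing the
+            # whole expression through a DOnion instead.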
index = index.value.compute() if not isinstance(index, Integral): raise TypeError("An integer is required for indexing") if index >= size: - raise IndexError(f"Index out of range: index={index}, size={size}") + if check_size: + raise IndexError(f"Index out of range: index={index}, size={size}") if index < 0: index += size if index < 0: - raise IndexError(f"Index out of range: index={index - size}, size={size}") + if check_size: + raise IndexError(f"Index out of range: index={index - size}, size={size}") return int(index) class Updater: + __bool__ = gb.expr.Updater.__bool__ + __eq__ = gb.expr.Updater.__eq__ + def __init__(self, parent, *, mask=None, accum=None, replace=False, input_mask=None): - if input_mask is not None and mask is not None: + if mask is not None and input_mask is not None: raise TypeError("mask and input_mask arguments cannot both be given") - if input_mask is not None and not isinstance(input_mask, Mask): - raise TypeError(r"Mask must indicate values (M.V) or structure (M.S)") + _check_mask(mask, ignore_None=True) + _check_mask(input_mask, ignore_None=True) + + self.has_dOnion = any_dOnions(parent, mask, input_mask) self.parent = parent self.mask = mask self.input_mask = input_mask self.accum = accum - if mask is None: - self.replace = None - else: - self.replace = replace + self.replace = replace if mask is not None else None self._meta = parent._meta(mask=get_meta(mask), accum=accum, replace=replace) + + # copy `mask` if `parent` is the source of `mask` + if parent is getattr(mask, "mask", None): + self.mask = type(mask)(mask.mask.dup()) + + # copy `input_mask` if `parent` is the source of `input_mask` + if parent is getattr(input_mask, "mask", None): + self.input_mask = type(input_mask)(input_mask.mask.dup()) + # Aggregator specific attribute requirements: - self.kwargs = {"mask": mask} + self.kwargs = {"mask": self.mask} def __delitem__(self, keys): # Occurs when user calls `del C(params)[index]` @@ -611,8 +844,13 @@ def __lshift__(self, delayed): def update(self, delayed): # Occurs when user calls C(params) << delayed if self.input_mask is not None: - if type(delayed) is AmbiguousAssignOrExtract: + if isinstance(delayed, AmbiguousAssignOrExtract): # w(input_mask) << v[index] + if self.parent is delayed.parent: + # replace `v` with a copy of itself if `w` is `v` + delayed.parent = delayed.parent.__class__( + delayed.parent._optional_dup(), delayed.parent._meta + ) self.parent._update( delayed.new(mask=self.mask, input_mask=self.input_mask), accum=self.accum, @@ -626,7 +864,8 @@ def update(self, delayed): if isinstance(delayed, Number) or ( isinstance(delayed, BaseType) and get_meta(delayed)._is_scalar ): - ndim = len(self.parent.shape) + # w(mask, accum, replace) << s + ndim = self.parent.ndim if ndim > 0: self.__setitem__(_squeeze((slice(None),) * ndim), delayed) elif self.accum is not None: @@ -637,12 +876,15 @@ def update(self, delayed): if self.mask is None and self.accum is None: return self.parent.update(delayed) - self.parent._meta._update( - get_meta(delayed), - mask=get_meta(self.mask), - accum=self.accum, - replace=self.replace, - ) + + if not any_dOnions(self.parent, delayed): + self.parent._meta._update( + get_meta(delayed), + mask=get_meta(self.mask), + accum=self.accum, + replace=self.replace, + ) + if self.parent._meta._is_scalar: self.parent._update(delayed, accum=self.accum) else: @@ -650,6 +892,11 @@ def update(self, delayed): def _csc_chunk(row_range, col_range, indices, red_columns, track_indices=False): + """ + create chunk of Reduce_Assign 
Matrix in Compressed Sparse Column (CSC) format + + (Used in `reduce_assign()`) + """ row_range = row_range[0] nrows = row_range.stop - row_range.start if type(indices[0]) is slice: @@ -689,13 +936,14 @@ def _csc_chunk(row_range, col_range, indices, red_columns, track_indices=False): def _fill(inner_vector, rhs): + # used in reduce_assign() rhs = rhs.value if isinstance(rhs, InnerBaseType) else rhs inner_vector.value[:] << rhs return inner_vector def reduce_assign(lhs, indices, rhs, dup_op="last", mask=None, accum=None, replace=False): - # lhs(mask, accum, replace)[i] << rhs + # lhs(mask, accum, replace, dup_op)[i] << rhs rhs_is_scalar = not (isinstance(rhs, BaseType) and type(rhs._meta) is gb.Vector) if type(indices) is slice: chunksz = "auto" if rhs_is_scalar else rhs._delayed.chunks @@ -808,6 +1056,15 @@ def _get_type_with_ndims(n): return get_return_type(gb.Matrix.new(int)) +def _get_inner_type_with_ndims(n): + if n == 0: + return get_inner_type(gb.Scalar.new(int)) + elif n == 1: + return get_inner_type(gb.Vector.new(int)) + else: + return get_inner_type(gb.Matrix.new(int)) + + def _get_grblas_type_with_ndims(n): if n == 0: return gb.Scalar @@ -1245,24 +1502,111 @@ def _defrag_to_index_chunk(*args, x_chunks, dtype=None): return wrap_inner(fused_fragments[index_tuple].new()) +def _adjust_meta_to_index(meta, index): + from .scalar import Scalar, PythonScalar + + # Since grblas does not support indices that are dask arrays + # this complicates meta deduction. We therefore substitute + # any non-Integral type indices with `slice(None)` + index = index if type(index) is tuple else (index,) + # Next, we resize `meta` to accept any Integral-type indices: + numbers = [x for x in index if isinstance(x, (Integral, Scalar, PythonScalar))] + max_index = np.max(numbers) if numbers else None + meta = meta.dup() + if max_index is not None: + if len(index) == 1: + meta.resize(max_index + 1) + else: + meta.resize(max_index + 1, max_index + 1) + + meta_index = tuple( + x if isinstance(x, (Integral, Scalar, PythonScalar)) else slice(None) for x in index + ) + return meta[_squeeze(meta_index)] + + class AmbiguousAssignOrExtract: - def __init__(self, parent, index): - self.resolved_indices = IndexerResolver(parent, index) + __slots__ = ( + "has_dOnion", + "index", + "parent", + "resolved_indexes", + "_meta", + "_value", + "__weakref__", + ) + _is_scalar = False + + def __init__(self, parent, index, meta=None): self.parent = parent - self.index = index - # IndexerResolver.validate_types(self.index) - self._meta = parent._meta[index] - # infix expression requirements: - shape = tuple(i.size for i in self.resolved_indices.indices if i.size) - self.ndim = len(shape) - self.output_type = _get_grblas_type_with_ndims(self.ndim) - if self.ndim == 1: - self._size = shape[0] - elif self.ndim == 2: - self._nrows = shape[0] - self._ncols = shape[1] + self.resolved_indexes = index + self.index = _squeeze(tuple(i.index for i in index.indices)) + self._value = None + if parent.is_dOnion or index.is_dOnion: + self.has_dOnion = True + self._meta = _adjust_meta_to_index(parent._meta, self.index) + else: + self.has_dOnion = False + self._meta = parent._meta[self.index] if meta is None else meta + + @staticmethod + def _extract_single_element(x, xt, T, dxn, indices, meta, dtype): + def getitem(inner, key, dtype): + return wrap_inner(inner.value[key].new(dtype=dtype)) + + name = "extract_single_element-" + tokenize(x, xt, indices) + + block = () + element = () + for axis, i in enumerate(indices): + stops_ = 
np.cumsum(x.chunks[T[axis]]) + starts = np.roll(stops_, 1) + starts[0] = 0 + + blockid = np.arange(x.numblocks[T[axis]]) + + # locate chunk containing element: + filter = (starts <= i) & (i < stops_) + (R,) = blockid[filter] + + block += (R,) + element += (i - starts[R],) + + dsk = dict() + dsk[(name,)] = (getitem, (x.name, *block[::dxn]), _squeeze(element[::dxn]), dtype) + graph = HighLevelGraph.from_collections(name, dsk, dependencies=[x]) + out = da.core.Array(graph, name, (), meta=wrap_inner(meta)) + return out def new(self, *, dtype=None, mask=None, input_mask=None, name=None): + _check_mask(mask, ignore_None=True) + _check_mask(input_mask, ignore_None=True) + + if any_dOnions(self, mask, input_mask): + + def _recall_getitem(parent, keys_0, keys_1, dtype, mask, input_mask): + keys = keys_0 if keys_1 is None else (keys_0, keys_1) + return parent[keys].new(dtype=dtype, mask=mask, input_mask=input_mask) + + meta = self._meta.new(dtype=dtype) + + if _is_pair(self.index): + keys_0, keys_1 = self.index[0], self.index[1] + else: + keys_0, keys_1 = self.index, None + + donion = DOnion.multi_access( + meta, + _recall_getitem, + self.parent, + *(keys_0, keys_1), + dtype=dtype, + mask=mask, + input_mask=input_mask, + ) + return get_return_type(meta)(donion) + + # no dOnions parent = self.parent xt = False # xt = parent._is_transposed dxn = 1 # dxn = -1 if xt else 1 @@ -1278,8 +1622,8 @@ def new(self, *, dtype=None, mask=None, input_mask=None, name=None): input_ndim = len(input_shape) axes = tuple(range(input_ndim)) x_axes = axes[::dxn] - indices = tuple(i.index for i in self.resolved_indices.indices) - out_shape = tuple(i.size for i in self.resolved_indices.indices if i.size is not None) + indices = tuple(i.index for i in self.resolved_indexes.indices) + out_shape = tuple(i.size for i in self.resolved_indexes.indices if i.size is not None) out_ndim = len(out_shape) if mask is not None: @@ -1332,7 +1676,7 @@ def new(self, *, dtype=None, mask=None, input_mask=None, name=None): elif out_ndim < input_ndim: (rem_axis,) = [ axis - for axis, index in enumerate(self.resolved_indices.indices) + for axis, index in enumerate(self.resolved_indexes.indices) if index.size is not None ] if out_ndim == input_mask_ndim: @@ -1356,6 +1700,12 @@ def new(self, *, dtype=None, mask=None, input_mask=None, name=None): dtype = np_dtype(meta.dtype) if input_ndim in [1, 2]: + if out_ndim == 0: + delayed = self.__class__._extract_single_element( + x, xt, T, dxn, indices, meta, meta.dtype + ) + return get_return_type(meta)(delayed) + # prepare arguments for blockwise: indices_args = [] offset_args = [] @@ -1449,15 +1799,27 @@ def __call__(self, *args, **kwargs): def update(self, obj): if getattr(self.parent, "_is_transposed", False): raise TypeError("'TransposedMatrix' object does not support item assignment") + + if self.parent.is_dOnion: + self.parent.__setitem__(self.index, obj) + return + Assigner(Updater(self.parent), self.index).update(obj) def __lshift__(self, rhs): self.update(rhs) + @property + def dtype(self): + return self.parent.dtype + @property def value(self): - self._meta.value - return self.new().value + self._meta.new().value + scalar = self.new() + return scalar.value + + dup = new def _uniquify(ndim, index, obj, mask=None, ot=False): @@ -1504,20 +1866,81 @@ def _identity_func(x, axis, keepdims): class Assigner: + __bool__ = gb.expr.Assigner.__bool__ + __eq__ = gb.expr.Assigner.__eq__ + def __init__(self, updater, index, subassign=False): self.updater = updater self.parent = updater.parent - 
self.resolved_indices = IndexerResolver(self.parent, index).indices - self.index = tuple(i.index for i in self.resolved_indices) self._meta = updater.parent._meta self.subassign = subassign + input_ndim = self.parent.ndim + index_is_dOnion = input_ndim == 1 and is_DOnion(index) + index_is_dOnion = index_is_dOnion or ( + input_ndim == 2 and _is_pair(index) and (is_DOnion(index[0]) or is_DOnion(index[1])) + ) + if self.updater.has_dOnion or index_is_dOnion: + self.has_dOnion = True + IndexerResolver(self.parent, index, check_shape=False) + self.index = index + else: + self.has_dOnion = False + self.resolved_indexes = IndexerResolver(self.parent, index).indices + self.index = tuple(i.index for i in self.resolved_indexes) + def update(self, obj): - if not (isinstance(obj, BaseType) or isinstance(obj, Number)): - try: - obj_transposed = obj._is_transposed - except AttributeError: - raise TypeError("Bad type for argument `obj`") + if not ( + isinstance(obj, Number) + or isinstance(obj, BaseType) + or getattr(obj, "_is_transposed", False) + ): + obj = self.parent._expect_type( + obj, + ( + gb.Scalar, + gb.Vector, + gb.Matrix, + gb.matrix.TransposedMatrix, + ), + within="Assign.update", + ) + if any_dOnions(self, obj): + + def _recall_update(lhs, mask, accum, replace, keys_0, keys_1, obj, subassign): + keys = (keys_0,) if keys_1 is None else (keys_0, keys_1) + updater = Updater(lhs, mask=mask, accum=accum, replace=replace) + Assigner(updater, keys, subassign=subassign).update(obj) + return lhs + + lhs = self.parent + lhs_copy = lhs.__class__(lhs._optional_dup(), meta=lhs._meta) + + updater = self.updater + + if _is_pair(self.index): + keys_0, keys_1 = self.index[0], self.index[1] + else: + keys_0, keys_1 = self.index, None + + donion = DOnion.multi_access( + lhs._meta, + _recall_update, + lhs_copy, + updater.mask, + updater.accum, + updater.replace, + keys_0, + keys_1, + obj, + self.subassign, + ) + lhs.__init__(donion, meta=lhs._meta) + return + + # no dOnions + if getattr(obj, "_is_transposed", False): + obj_transposed = obj._is_transposed obj = obj._matrix else: obj_transposed = False @@ -1570,7 +1993,7 @@ def update(self, obj): else: (rem_axis,) = [ axis - for axis, index in enumerate(self.resolved_indices) + for axis, index in enumerate(self.resolved_indexes) if index.size is not None ] if parent.shape[rem_axis] != out_shape[0]: @@ -1579,7 +2002,7 @@ def update(self, obj): if ndim == 2 and out_dim == 1: (int_axis,) = [ axis - for axis, index in enumerate(self.resolved_indices) + for axis, index in enumerate(self.resolved_indexes) if index.size is None ] indices = list(indices) @@ -1800,14 +2223,14 @@ def __init__(self, value, compress_axis): self.compress_axis = compress_axis -def _expr_new(method_name, dtype, grblas_mask_type, kwargs, x, mask, *args): +def _expr_new(xt, ats, method_name, dtype, grblas_mask_type, x, mask, *args, **kwargs): # expr.new(...) 
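+    # `xt` and `ats` flag whether `x` and each entry of `args` were passed as
+    # the underlying matrix of a TransposedMatrix (the lazy transpose itself
+    # cannot be shipped to chunks), so each chunk re-applies the transpose
+    # via _transpose_if before calling the method.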
- args = [x.value if isinstance(x, InnerBaseType) else x for x in args] + args = [_transpose_if(y, yt) if isinstance(y, InnerBaseType) else y for y, yt in zip(args, ats)] kwargs = { key: (kwargs[key].value if isinstance(kwargs[key], InnerBaseType) else kwargs[key]) for key in kwargs } - expr = getattr(x.value, method_name)(*args, **kwargs) + expr = getattr(_transpose_if(x, xt), method_name)(*args, **kwargs) if mask is not None: mask = grblas_mask_type(mask.value) return wrap_inner(expr.new(dtype=dtype, mask=mask)) @@ -1869,32 +2292,34 @@ def _add_blocks(monoid_, x, y): return x -def _reduce_scalar(op, gb_dtype, x, axis=None, keepdims=None, computing_meta=None, dtype=None): +def _reduce_scalar( + op, gb_dtype, x, axis=None, keepdims=None, computing_meta=None, dtype=None, **kwargs +): """Call reduce_scalar on each chunk""" if computing_meta: return np.empty(0, dtype=dtype) - return wrap_inner(x.value.reduce_scalar(op).new(dtype=gb_dtype)) + return wrap_inner(x.value.reduce_scalar(op, **kwargs).new(dtype=gb_dtype)) -def _reduce(op, gb_dtype, x, axis=None, keepdims=None, computing_meta=None, dtype=None): +def _reduce(op, gb_dtype, x, axis=None, keepdims=None, computing_meta=None, dtype=None, **kwargs): """Call reduce on each chunk""" if computing_meta: return np.empty(0, dtype=dtype) - return wrap_inner(x.value.reduce(op).new(dtype=gb_dtype)) + return wrap_inner(x.value.reduce(op, **kwargs).new(dtype=gb_dtype)) -def _reduce_combine(op, x, axis=None, keepdims=None, computing_meta=None, dtype=None): +def _reduce_combine(op, x, axis=None, keepdims=None, computing_meta=None, dtype=None, **kwargs): """Combine results from reduce or reduce_scalar on each chunk""" if computing_meta: return np.empty(0, dtype=dtype) if type(x) is list: # do we need `gb_dtype` instead of `np_dtype` below? 
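+        # Chunks whose reduction produced an empty Scalar carry a .value of None;
+        # those entries are filtered out below so that Vector.from_values only
+        # sees filled values.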
         if type(x[0]) is list:
-            vals = [val.value.value for sublist in x for val in sublist]
+            vals = [val.value.value for sublist in x for val in sublist if val.value.value is not None]
         else:
-            vals = [val.value.value for val in x]
+            vals = [val.value.value for val in x if val.value.value is not None]
         values = gb.Vector.from_values(list(range(len(vals))), vals, size=len(vals), dtype=dtype)
-        return wrap_inner(values.reduce(op).new())
+        return wrap_inner(values.reduce(op, **kwargs).new())
     return x
 
 
@@ -1938,6 +2363,12 @@ def _transpose_if(inner_x, xt):
     return inner_x.value
 
 
+def _kronecker(at, bt, a, b, op):
+    a = _transpose_if(a, at)
+    b = _transpose_if(b, bt)
+    return wrap_inner(a.kronecker(b, op=op).new())
+
+
 def _matmul(op, at, bt, dtype, no_mask, mask_type, *args, computing_meta=None):
     if computing_meta:
         return np.empty(0, dtype=dtype)
@@ -1958,6 +2389,57 @@ def _matmul(op, at, bt, dtype, no_mask, mask_type, *args, computing_meta=None):
     return wrap_inner(gb_obj)
 
 
+def _expand(inner, fullshape, *index_ranges):
+    a = inner
+    if a.ndim == 1:
+        (a_index_range,) = index_ranges
+        balloon = gb.Vector.new(a.value.dtype, *fullshape)
+        balloon[a_index_range.start : a_index_range.stop] << a.value
+    else:
+        (a_row_range, a_col_range) = index_ranges
+        balloon = gb.Matrix.new(a.value.dtype, *fullshape)
+        (
+            balloon[
+                a_row_range.start : a_row_range.stop,
+                a_col_range.start : a_col_range.stop,
+            ]
+            << a.value
+        )
+
+    return wrap_inner(balloon)
+
+
+def _matmul2_positional(
+    op, dtype, at, bt, a_fullshape, b_fullshape, a, b, *args, computing_meta=None
+):
+    a_ranges = (args[0][0],) if a.ndim == 1 else (args[0][0], args[1][0])
+    b_ranges = (args[a.ndim][0],) if b.ndim == 1 else (args[a.ndim][0], args[a.ndim + 1][0])
+
+    a_expanded = _expand(a, a_fullshape, *a_ranges)
+    b_expanded = _expand(b, b_fullshape, *b_ranges)
+
+    res = _matmul2(op, dtype, at, bt, a_expanded, b_expanded, computing_meta=computing_meta)
+
+    if a.ndim == 1 and b.ndim == 1:
+        return res
+
+    # shrink expanded result to original size:
+    indices = (
+        slice(a_ranges[1].start, a_ranges[1].stop)
+        if at
+        else slice(a_ranges[0].start, a_ranges[0].stop)
+    )
+    if b.ndim == 2:
+        cols = (
+            slice(b_ranges[0].start, b_ranges[0].stop)
+            if bt
+            else slice(b_ranges[1].start, b_ranges[1].stop)
+        )
+        indices = cols if a.ndim == 1 else (indices, cols)
+
+    return res[indices].new()
+
+
 def _matmul2(op, dtype, at, bt, a, b, computing_meta=None):
     left = _transpose_if(a, at)
     right = _transpose_if(b, bt)
@@ -2092,3 +2574,7 @@ def concatenate_fragments(frag1, frag2, axis=0, base_axis=0):
         return reduce(partial(concatenate_fragments, axis=axis, base_axis=base_axis), seq_)
     else:
         return seq[0]
+
+
+def _is_pair(arg):
+    return type(arg) is tuple and len(arg) == 2
diff --git a/dask_grblas/functools.py b/dask_grblas/functools.py
new file mode 100644
index 0000000..257d5ec
--- /dev/null
+++ b/dask_grblas/functools.py
@@ -0,0 +1,112 @@
+from reprlib import recursive_repr
+
+
+class skip:
+    def __repr__(self):
+        return "skip"
+
+    __str__ = __repr__
+    # __reduce__ returning the bare name makes pickle restore the
+    # module-level `skip` singleton instead of building a new instance.
+    __reduce__ = __repr__  # This makes it pickle well!
+
+
+skip = skip()
+
+
+class flexible_partial:
+    """New function with flexible partial application of the given
+    arguments and keywords.  Any argument slot of the given function
+    may be occupied (not just the leading slots).  Use the sentinel
+    `skip` to denote vacant argument slots.
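+
+    For example (an illustrative sketch; `sub` is a toy function, not part
+    of the library):
+
+        >>> def sub(a, b):
+        ...     return a - b
+        >>> from_right = flexible_partial(sub, skip, 10)  # slot 0 stays vacant
+        >>> from_right(3)  # the call fills the vacant slot: sub(3, 10)
+        -7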
+ """ + + __slots__ = "base_func", "args", "keywords", "__dict__", "__weakref__" + + def __new__(cls, func, /, *args, **keywords): + if not callable(func): + raise TypeError("the first argument must be callable") + + if hasattr(func, "base_func"): + func_ = func.base_func + func_is_partial = True + elif hasattr(func, "func"): + func_ = func.func + func_is_partial = True + else: + func_is_partial = False + + if func_is_partial: + old_arg, new_arg = iter(func.args), iter(args) + exhausted = False + args = () + for arg in func.args: + if arg is skip: + try: + args += (next(new_arg),) + except StopIteration: + exhausted = True + break + else: + args += arg + next(old_arg) + + args += tuple(old_arg if exhausted else new_arg) + keywords = {**func.keywords, **keywords} + func = func_ + + self = super(flexible_partial, cls).__new__(cls) + + self.base_func = func + self.args = args + self.keywords = keywords + return self + + def __call__(self, /, *args, **keywords): + new_arg = iter(args) + args = (next(new_arg) if arg is skip else arg for arg in self.args) + + keywords = {**self.keywords, **keywords} + return self.base_func(*args, *new_arg, **keywords) + + @recursive_repr() + def __repr__(self): + qualname = type(self).__qualname__ + args = [repr(self.base_func)] + args.extend(repr(x) for x in self.args) + args.extend(f"{k}={v!r}" for (k, v) in self.keywords.items()) + if type(self).__module__ == "functools": + return f"functools.{qualname}({', '.join(args)})" + return f"{qualname}({', '.join(args)})" + + def __reduce__(self): + return ( + type(self), + (self.base_func,), + (self.base_func, self.args, self.keywords or None, self.__dict__ or None), + ) + + def __setstate__(self, state): + if not isinstance(state, tuple): + raise TypeError("argument to __setstate__ must be a tuple") + if len(state) != 4: + raise TypeError(f"expected 4 items in state, got {len(state)}") + func, args, kwds, namespace = state + if ( + not callable(func) + or not isinstance(args, tuple) + or (kwds is not None and not isinstance(kwds, dict)) + or (namespace is not None and not isinstance(namespace, dict)) + ): + raise TypeError("invalid partial state") + + args = tuple(args) # just in case it's a subclass + if kwds is None: + kwds = {} + elif type(kwds) is not dict: # XXX does it need to be *exactly* dict? + kwds = dict(kwds) + if namespace is None: + namespace = {} + + self.__dict__ = namespace + self.base_func = func + self.args = args + self.keywords = kwds diff --git a/dask_grblas/io.py b/dask_grblas/io.py index d3c462e..6f183cb 100644 --- a/dask_grblas/io.py +++ b/dask_grblas/io.py @@ -2,7 +2,6 @@ from math import floor, sqrt from numpy import asarray, conj, zeros, concatenate, ones, empty -from scipy.io import mmio # noqa def symm_I_J(pos, n): @@ -97,40 +96,238 @@ def home(stream, search_window_size=8): # ----------------------------------------------------------------------------- +def asstr(s): + if isinstance(s, bytes): + return s.decode("latin1") + return str(s) -def mmread(source, *, dup_op=None, name=None, row_begin=0, row_end=None, col_begin=0, col_end=None): - """ - Read the contents of a Matrix Market filename or file into a new Matrix. +# ----------------------------------------------------------------------------- - This uses `scipy.io.mmread`: - https://docs.scipy.org/doc/scipy/reference/generated/scipy.io.mmread.html - For more information on the Matrix Market format, see: - https://math.nist.gov/MatrixMarket/formats.html - """ - from . 
import Matrix - - try: - from scipy.sparse import coo_matrix # noqa - except ImportError: # pragma: no cover - raise ImportError("scipy is required to read Matrix Market files") - array = MMFile().read( - source, row_begin=row_begin, row_end=row_end, col_begin=col_begin, col_end=col_end +class MMFile: + __slots__ = ("_rows", "_cols", "_entries", "_format", "_field", "_symmetry") + + @property + def rows(self): + return self._rows + + @property + def cols(self): + return self._cols + + @property + def entries(self): + return self._entries + + @property + def format(self): + return self._format + + @property + def field(self): + return self._field + + @property + def symmetry(self): + return self._symmetry + + @property + def has_symmetry(self): + return self._symmetry in ( + self.SYMMETRY_SYMMETRIC, + self.SYMMETRY_SKEW_SYMMETRIC, + self.SYMMETRY_HERMITIAN, + ) + + # format values + FORMAT_COORDINATE = "coordinate" + FORMAT_ARRAY = "array" + + # field values + FIELD_INTEGER = "integer" + FIELD_UNSIGNED = "unsigned-integer" + FIELD_REAL = "real" + FIELD_COMPLEX = "complex" + FIELD_PATTERN = "pattern" + FIELD_VALUES = (FIELD_INTEGER, FIELD_UNSIGNED, FIELD_REAL, FIELD_COMPLEX, FIELD_PATTERN) + + # symmetry values + SYMMETRY_GENERAL = "general" + SYMMETRY_SYMMETRIC = "symmetric" + SYMMETRY_SKEW_SYMMETRIC = "skew-symmetric" + SYMMETRY_HERMITIAN = "hermitian" + SYMMETRY_VALUES = ( + SYMMETRY_GENERAL, + SYMMETRY_SYMMETRIC, + SYMMETRY_SKEW_SYMMETRIC, + SYMMETRY_HERMITIAN, ) - if isinstance(array, coo_matrix): - nrows, ncols = array.shape - return Matrix.from_values( - array.row, array.col, array.data, nrows=nrows, ncols=ncols, dup_op=dup_op, name=name + + @classmethod + def info(self, source): + """ + Return size, storage parameters from Matrix Market file-like 'source'. + + Parameters + ---------- + source : str or file-like + Matrix Market filename (extension .mtx) or open file-like object + + Returns + ------- + rows : int + Number of matrix rows. + cols : int + Number of matrix columns. + entries : int + Number of non-zero entries of a sparse matrix + or rows*cols for a dense matrix. + format : str + Either 'coordinate' or 'array'. + field : str + Either 'real', 'complex', 'pattern', or 'integer'. + symmetry : str + Either 'general', 'symmetric', 'skew-symmetric', or 'hermitian'. 
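+        Illustrative example (an editorial addition for clarity): for a
+        source whose header and size lines read
+
+            %%MatrixMarket matrix coordinate real general
+            3 3 5
+
+        this returns ``(3, 3, 5, 'coordinate', 'real', 'general')``.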
+ """ + + stream, close_it = self._open(source) + + try: + + # read and validate header line + line = stream.readline() + mmid, matrix, format, field, symmetry = [asstr(part.strip()) for part in line.split()] + if not mmid.startswith("%%MatrixMarket"): + raise ValueError("source is not in Matrix Market format") + if not matrix.lower() == "matrix": + raise ValueError("Problem reading file header: " + line) + + # http://math.nist.gov/MatrixMarket/formats.html + if format.lower() == "array": + format = self.FORMAT_ARRAY + elif format.lower() == "coordinate": + format = self.FORMAT_COORDINATE + + # skip comments + # line.startswith('%') + while line and line[0] in ["%", 37]: + line = stream.readline() + + # skip empty lines + while not line.strip(): + line = stream.readline() + + split_line = line.split() + if format == self.FORMAT_ARRAY: + if not len(split_line) == 2: + raise ValueError("Header line not of length 2: " + line.decode("ascii")) + rows, cols = map(int, split_line) + entries = rows * cols + else: + if not len(split_line) == 3: + raise ValueError("Header line not of length 3: " + line.decode("ascii")) + rows, cols, entries = map(int, split_line) + + return (rows, cols, entries, format, field.lower(), symmetry.lower()) + + finally: + if close_it: + stream.close() + + @staticmethod + def _open(filespec, mode="rb"): + """Return an open file stream for reading based on source. + + If source is a file name, open it (after trying to find it with mtx and + gzipped mtx extensions). Otherwise, just return source. + + Parameters + ---------- + filespec : str or file-like + String giving file name or file-like object + mode : str, optional + Mode with which to open file, if `filespec` is a file name. + + Returns + ------- + fobj : file-like + Open file-like object. + close_it : bool + True if the calling function should close this file when done, + false otherwise. + """ + # If 'filespec' is path-like (str, pathlib.Path, os.DirEntry, other class + # implementing a '__fspath__' method), try to convert it to str. If this + # fails by throwing a 'TypeError', assume it's an open file handle and + # return it as-is. 
+ try: + filespec = os.fspath(filespec) + except TypeError: + return filespec, False + + # 'filespec' is definitely a str now + + # open for reading + if mode[0] == "r": + + # determine filename plus extension + if not os.path.isfile(filespec): + if os.path.isfile(filespec + ".mtx"): + filespec = filespec + ".mtx" + elif os.path.isfile(filespec + ".mtx.gz"): + filespec = filespec + ".mtx.gz" + elif os.path.isfile(filespec + ".mtx.bz2"): + filespec = filespec + ".mtx.bz2" + # open filename + if filespec.endswith(".gz"): + import gzip + + stream = gzip.open(filespec, mode) + elif filespec.endswith(".bz2"): + import bz2 + + stream = bz2.BZ2File(filespec, "rb") + else: + stream = open(filespec, mode) + + # open for writing + else: + if filespec[-4:] != ".mtx": + filespec = filespec + ".mtx" + stream = open(filespec, mode) + + return stream, True + + # ------------------------------------------------------------------------- + def _parse_header(self, stream): + rows, cols, entries, format, field, symmetry = self.__class__.info(stream) + self._init_attrs( + rows=rows, cols=cols, entries=entries, format=format, field=field, symmetry=symmetry ) - # SS, SuiteSparse-specific: import_full - return Matrix.ss.import_fullr(values=array, take_ownership=True, name=name) + # ------------------------------------------------------------------------- + def _init_attrs(self, **kwargs): + """ + Initialize each attributes with the corresponding keyword arg value + or a default of None + """ -# ----------------------------------------------------------------------------- + attrs = self.__class__.__slots__ + public_attrs = [attr[1:] for attr in attrs] + invalid_keys = set(kwargs.keys()) - set(public_attrs) + + if invalid_keys: + raise ValueError( + """found %s invalid keyword arguments, please only + use %s""" + % (tuple(invalid_keys), public_attrs) + ) + for attr in attrs: + setattr(self, attr, kwargs.get(attr[1:], None)) -class MMFile(mmio.MMFile): + # ------------------------------------------------------------------------- def get_data_begin(self, source): """ Reads the contents of a Matrix Market file-like 'source' into a matrix. @@ -157,11 +354,13 @@ def get_data_begin(self, source): stream.close() # ------------------------------------------------------------------------- + def _get_data_begin(self, stream): _ = self.__class__.info(stream) return stream.tell() # ----------------------------------------------------------------------------- + def read_part(self, source, line_start=None, line_stop=None, read_begin=None, read_end=None): """ Reads the contents of a Matrix Market file-like 'source' into a matrix. diff --git a/dask_grblas/mask.py b/dask_grblas/mask.py index 314781a..21a760a 100644 --- a/dask_grblas/mask.py +++ b/dask_grblas/mask.py @@ -1,3 +1,4 @@ +from grblas.mask import Mask as gb_Mask from .utils import get_grblas_type @@ -6,6 +7,9 @@ class Mask: structure = False value = False + __bool__ = gb_Mask.__bool__ + __eq__ = gb_Mask.__eq__ + def __init__(self, mask): from . 
import matrix, vector @@ -13,6 +17,14 @@ def __init__(self, mask): self.mask = mask self._meta = get_grblas_type(self)(mask._meta) + @property + def is_dOnion(self): + return getattr(self.mask, "is_dOnion", False) + + @property + def dOnion_if(self): + return self.mask._delayed if self.is_dOnion else self + class StructuralMask(Mask): complement = False diff --git a/dask_grblas/matrix.py b/dask_grblas/matrix.py index 922965e..52a1296 100644 --- a/dask_grblas/matrix.py +++ b/dask_grblas/matrix.py @@ -1,18 +1,27 @@ import dask.array as da import numpy as np import grblas as gb -from dask.base import tokenize + +from numbers import Integral, Number +from tlz import compose + +from dask.base import tokenize, is_dask_collection from dask.delayed import Delayed, delayed from dask.highlevelgraph import HighLevelGraph from grblas import binary, monoid, semiring from grblas.dtypes import lookup_dtype +from grblas.exceptions import IndexOutOfBound, EmptyObject, DimensionMismatch -from .base import BaseType, InnerBaseType +from . import _automethods +from .base import BaseType, InnerBaseType, DOnion, is_DOnion, any_dOnions, Box, skip from .base import _nvals as _nvals_in_chunk -from .expr import AmbiguousAssignOrExtract, GbDelayed, Updater +from .base import _dup as chunk_dup +from .expr import AmbiguousAssignOrExtract, IndexerResolver, GbDelayed, Updater from .mask import StructuralMask, ValueMask from ._ss.matrix import ss from .utils import ( + pack_args, + pack_kwargs, np_dtype, get_return_type, get_grblas_type, @@ -45,6 +54,44 @@ class Matrix(BaseType): ndim = 2 _is_transposed = False + __abs__ = gb.Matrix.__abs__ + __add__ = gb.Matrix.__add__ + __divmod__ = gb.Matrix.__divmod__ + __eq__ = gb.Matrix.__eq__ + __floordiv__ = gb.Matrix.__floordiv__ + __ge__ = gb.Matrix.__ge__ + __gt__ = gb.Matrix.__gt__ + __iadd__ = gb.Matrix.__iadd__ + __iand__ = gb.Matrix.__iand__ + __ifloordiv__ = gb.Matrix.__ifloordiv__ + __imod__ = gb.Matrix.__imod__ + __imul__ = gb.Matrix.__imul__ + __invert__ = gb.Matrix.__invert__ + __ior__ = gb.Matrix.__ior__ + __ipow__ = gb.Matrix.__ipow__ + __isub__ = gb.Matrix.__isub__ + __itruediv__ = gb.Matrix.__itruediv__ + __ixor__ = gb.Matrix.__ixor__ + __le__ = gb.Matrix.__le__ + __lt__ = gb.Matrix.__lt__ + __mod__ = gb.Matrix.__mod__ + __mul__ = gb.Matrix.__mul__ + __ne__ = gb.Matrix.__ne__ + __neg__ = gb.Matrix.__neg__ + __pow__ = gb.Matrix.__pow__ + __radd__ = gb.Matrix.__radd__ + __rdivmod__ = gb.Matrix.__rdivmod__ + __rfloordiv__ = gb.Matrix.__rfloordiv__ + __rmod__ = gb.Matrix.__rmod__ + __rmul__ = gb.Matrix.__rmul__ + __rpow__ = gb.Matrix.__rpow__ + __rsub__ = gb.Matrix.__rsub__ + __rtruediv__ = gb.Matrix.__rtruediv__ + __rxor__ = gb.Matrix.__rxor__ + __sub__ = gb.Matrix.__sub__ + __truediv__ = gb.Matrix.__truediv__ + __xor__ = gb.Matrix.__xor__ + @classmethod def from_delayed(cls, matrix, dtype, nrows, ncols, *, nvals=None, name=None): if not isinstance(matrix, Delayed): @@ -121,40 +168,92 @@ def from_values( nrows=None, ncols=None, *, - trust_shape=False, dup_op=None, dtype=None, chunks="auto", name=None, ): - # Note: `trust_shape` is a bool parameter that, when True, - # can be used to avoid expensive computation of max(rows) - # and max(columns) which are used to verify that `nrows` - # and `ncols` are indeed large enough to hold all the given - # tuples. 
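# --- illustrative aside (not part of the patch) ------------------------------
# The rewritten `from_values` below accepts plain sequences, numpy arrays or
# dask arrays for the index/value inputs, and defers to a DOnion (a doubly
# lazy collection) when `nrows`/`ncols` must be inferred from dask-array
# indices. A sketch of the fully specified, eager path:
#
#     import dask.array as da
#     import numpy as np
#     import dask_grblas as dgb
#
#     rows = da.from_array(np.array([0, 1, 1]), chunks=2)
#     cols = da.from_array(np.array([1, 0, 1]), chunks=2)
#     vals = da.from_array(np.array([10, 20, 30]), chunks=2)
#     A = dgb.Matrix.from_values(rows, cols, vals, nrows=2, ncols=2)
# ------------------------------------------------------------------------------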
- if ( - dup_op is None - and type(rows) is da.Array - and type(columns) is da.Array - and type(values) is da.Array - ): - if not trust_shape or nrows is None or ncols is None: - # this branch is an expensive operation: - implied_nrows = 1 + da.max(rows).compute() - implied_ncols = 1 + da.max(columns).compute() - if nrows is not None and implied_nrows > nrows: - raise Exception() - if ncols is not None and implied_ncols > ncols: - raise Exception() - nrows = implied_nrows if nrows is None else nrows - ncols = implied_ncols if ncols is None else ncols - - idtype = gb.Matrix.new(rows.dtype).dtype - np_idtype_ = np_dtype(idtype) - vdtype = gb.Matrix.new(values.dtype).dtype - np_vdtype_ = np_dtype(vdtype) + if isinstance(values, Number): + dtype = lookup_dtype(type(values) if dtype is None else dtype) + elif hasattr(values, "dtype"): + dtype = lookup_dtype(values.dtype if dtype is None else dtype) + + meta = gb.Matrix.new( + dtype, + nrows=nrows if isinstance(nrows, Number) else 0, + ncols=ncols if isinstance(ncols, Number) else 0, + ) - chunks = da.core.normalize_chunks(chunks, (nrows, ncols), dtype=np_idtype_) + # check for any dOnions: + args = pack_args(rows, columns, values, nrows, ncols) + kwargs = pack_kwargs(dup_op=dup_op, dtype=dtype, chunks=chunks, name=name) + if any_dOnions(*args, **kwargs): + # dive into dOnion(s): + out_donion = DOnion.multi_access(meta, Matrix.from_values, *args, **kwargs) + return Matrix(out_donion, meta=meta) + + # no dOnions + if type(rows) is da.Array or type(columns) is da.Array or type(values) is da.Array: + nrows_, ncols_ = nrows, ncols + if type(rows) in {tuple, list, np.ndarray}: + nrows_ = nrows or (np.max(rows) + 1) + rows = da.asarray(rows) + if type(columns) in {tuple, list, np.ndarray}: + ncols_ = ncols or (np.max(columns) + 1) + columns = da.asarray(columns) + if type(values) in {tuple, list, np.ndarray}: + values = da.asarray(values) + + np_idtype_ = np_dtype(lookup_dtype(rows.dtype)) + if isinstance(nrows_, Integral) and isinstance(ncols_, Integral): + nrows, ncols = nrows_, ncols_ + chunks = da.core.normalize_chunks(chunks, (nrows, ncols), dtype=np_idtype_) + else: + if nrows is None and rows.size == 0: + raise ValueError("No row indices provided. Unable to infer nrows.") + + if ncols is None and columns.size == 0: + raise ValueError("No column indices provided. 
Unable to infer ncols.") + + if type(values) is da.Array and ( + rows.size != columns.size or columns.size != values.size + ): + raise ValueError( + "`rows` and `columns` and `values` lengths must match: " + f"{rows.size}, {columns.size}, {values.size}" + ) + elif rows.size != columns.size: + raise ValueError( + f"`rows` and `columns` lengths must match: {rows.size}, {columns.size}" + ) + + if rows.dtype.kind not in "ui": + raise ValueError(f"rows must be integers, not {rows.dtype}") + + if columns.dtype.kind not in "ui": + raise ValueError(f"columns must be integers, not {columns.dtype}") + + nrows = nrows_ + if nrows is None: + nrows = da.max(rows) + np.asarray(1, dtype=rows.dtype) + + ncols = ncols_ + if ncols is None: + ncols = da.max(columns) + np.asarray(1, dtype=columns.dtype) + + # Create dOnion from `nrows` and/or `ncols`, that is, + # use the inner value of `nrows` and/or `ncols` to create the new Matrix: + shape = (nrows, ncols) + _shape = [skip if is_dask_collection(x) else x for x in shape] + dasks = [x for x in shape if is_dask_collection(x)] + args = pack_args(rows, columns, values, *_shape) + kwargs = pack_kwargs(dup_op=dup_op, dtype=dtype, chunks=chunks, name=name) + donion = DOnion.sprout(dasks, meta, Matrix.from_values, *args, **kwargs) + return Matrix(donion, meta=meta) + + # output shape `(nrows, ncols)` is completely determined + vdtype = dtype + np_vdtype_ = np_dtype(vdtype) name_ = name name = str(name) if name else "" @@ -166,19 +265,22 @@ def from_values( *(_pick2D, "ijk"), *(rows, "k"), *(columns, "k"), - *(values, "k"), + *(values, "k" if type(values) is da.Array else None), *(row_ranges, "i"), *(col_ranges, "j"), + shape=(nrows, ncols), dtype=np_idtype_, meta=np.array([]), ) - meta = InnerMatrix(gb.Matrix.new(vdtype)) + meta = InnerMatrix(gb.Matrix.new(vdtype, nrows=nrows, ncols=ncols)) delayed = da.core.blockwise( *(_from_values2D, "ij"), + *(values if isinstance(values, Number) else None, None), *(fragments, "ijk"), *(row_ranges, "i"), *(col_ranges, "j"), concatenate=False, + dup_op=dup_op, gb_dtype=vdtype, dtype=np_vdtype_, meta=meta, @@ -192,8 +294,130 @@ def from_values( ) return cls.from_matrix(matrix, chunks=chunks, name=name) + def build( + self, + rows, + columns, + values, + *, + dup_op=None, + clear=False, + nrows=None, + ncols=None, + chunks=None, + in_dOnion=False, # not part of the API + ): + if not clear and self._nvals != 0: + raise gb.exceptions.OutputNotEmpty() + + # TODO: delayed nrows/ncols + nrows = nrows or self._nrows + ncols = ncols or self._ncols + meta = self._meta + + # check for any DOnions: + args = pack_args(self, rows, columns, values) + kwargs = pack_kwargs( + dup_op=dup_op, clear=clear, nrows=nrows, ncols=ncols, chunks=chunks, in_dOnion=True + ) + if any_dOnions(*args, **kwargs): + # dive into DOnion(s): + out_donion = DOnion.multi_access(meta, Matrix.build, *args, **kwargs) + self.__init__(out_donion, meta=meta) + return + + # no DOnions + if clear: + self.clear() + + self.resize(nrows, ncols) + + if chunks is not None: + self.rechunk(inplace=True, chunks=chunks) + + x = self._optional_dup() + if type(rows) in {tuple, list, np.ndarray}: + if np.max(rows) >= self._nrows: + raise gb.exceptions.IndexOutOfBound + rows = da.core.from_array(np.array(rows), name="rows-" + tokenize(rows)) + + if type(columns) in {tuple, list, np.ndarray}: + if np.max(columns) >= self._ncols: + raise gb.exceptions.IndexOutOfBound + columns = da.core.from_array(np.array(columns), name="columns-" + tokenize(columns)) + + if type(values) in {tuple, list, 
np.ndarray}: + values = da.core.from_array(np.array(values), name="values-" + tokenize(values)) + + if type(values) is da.Array and (rows.size != columns.size or columns.size != values.size): + raise ValueError( + "`rows` and `columns` and `values` lengths must match: " + f"{rows.size}, {columns.size}, {values.size}" + ) + elif rows.size != columns.size: + raise ValueError( + f"`rows` and `columns` lengths must match: {rows.size}, {columns.size}" + ) + elif values is None: + raise EmptyObject() + + idtype = gb.Matrix.new(rows.dtype).dtype + np_idtype_ = np_dtype(idtype) + vdtype = ( + lookup_dtype(type(values)) + if isinstance(values, Number) + else gb.Matrix.new(values.dtype).dtype + ) + np_vdtype_ = np_dtype(vdtype) + + rname = "-row-ranges" + tokenize(x, x.chunks[0]) + cname = "-col-ranges" + tokenize(x, x.chunks[1]) + row_ranges = build_chunk_ranges_dask_array(x, 0, rname) + col_ranges = build_chunk_ranges_dask_array(x, 1, cname) + fragments = da.core.blockwise( + *(_pick2D, "ijk"), + *(rows, "k"), + *(columns, "k"), + *(values, None if isinstance(values, Number) else "k"), + *(row_ranges, "i"), + *(col_ranges, "j"), + shape=(nrows, ncols), + dtype=np_idtype_, + meta=np.array([]), + ) + meta = InnerMatrix(gb.Matrix.new(vdtype)) + delayed = da.core.blockwise( + *(_build_2D_chunk, "ij"), + *(x, "ij"), + *(row_ranges, "i"), + *(col_ranges, "j"), + *(fragments, "ijk"), + values=values if isinstance(values, Number) else None, + dup_op=dup_op, + clear=False, + concatenate=False, + dtype=np_vdtype_, + meta=meta, + ) + if in_dOnion: + return Matrix(delayed) + self.__init__(delayed) + @classmethod def new(cls, dtype, nrows=0, ncols=0, *, chunks="auto", name=None): + if any_dOnions(nrows, ncols): + meta = gb.Matrix.new(dtype) + donion = DOnion.multi_access( + meta, cls.new, dtype, nrows=nrows, ncols=ncols, chunks=chunks, name=name + ) + return Matrix(donion, meta=meta) + + if type(nrows) is Box: + nrows = nrows.content + + if type(ncols) is Box: + ncols = ncols.content + dtype = dtype.lower() if isinstance(dtype, str) else dtype if nrows == 0 and ncols == 0: matrix = gb.Matrix.new(dtype, nrows, ncols) @@ -234,15 +458,19 @@ def __init__(self, delayed, meta=None, nvals=None): # if it is already known at the time of initialization of # this Matrix, otherwise its value should be left as None # (the default) - assert type(delayed) is da.Array - assert delayed.ndim == 2 + assert type(delayed) in {da.Array, DOnion} self._delayed = delayed - if meta is None: - meta = gb.Matrix.new(delayed.dtype, *delayed.shape) + if type(delayed) is da.Array: + assert delayed.ndim == 2 + if meta is None: + meta = gb.Matrix.new(delayed.dtype, *delayed.shape) + else: + if meta is None: + meta = gb.Matrix.new(delayed.dtype) self._meta = meta - self._nrows = meta.nrows - self._ncols = meta.ncols self.dtype = meta.dtype + self._nrows = self.nrows + self._ncols = self.ncols self._nvals = nvals # Add ss extension methods self.ss = ss(self) @@ -261,17 +489,165 @@ def T(self): @property def nrows(self): + if self.is_dOnion: + return DOnion.multi_access(self._meta.nrows, getattr, self, "nrows") return self._meta.nrows @property def ncols(self): + if self.is_dOnion: + return DOnion.multi_access(self._meta.ncols, getattr, self, "ncols") return self._meta.ncols @property def shape(self): - return (self._meta.nrows, self._meta.ncols) + if self.is_dOnion: + return (self.nrows, self.ncols) + # return DOnion.multi_access(self._meta.shape, getattr, self, "shape") + return self._meta.shape + + def _head(self, delayed, shape): + """ + Take 
the leading portion of shape `shape` from `delayed` + """ + def _slice(inner, slc_x, slc_y): + return InnerMatrix(inner.value[slc_x, slc_y].new()) + + x = delayed + numblocks = () + heads = () + new_chunks = () + for axis in range(2): + stops_ = np.cumsum(x.chunks[axis]) + starts = np.roll(stops_, 1) + starts[0] = 0 + + M = x.numblocks[axis] + blockid = np.arange(M) + + # locate chunk containing last element on axis: + i = min(self.shape[axis], shape[axis]) - 1 + filter = (starts <= i) & (i < stops_) + (last_block,) = blockid[filter] + tail_sz = i - starts[last_block] + 1 + + numblocks += (last_block + 1,) + heads += (tail_sz,) + new_chunks += (x.chunks[axis][:last_block] + (tail_sz,), ) + + name = "Matrix.resize-" + tokenize(x) + dtype = self.dtype + dsk = dict() + for i in range(numblocks[0]): + x_cut = (i == numblocks[0] - 1) + for j in range(numblocks[1]): + y_cut = (j == numblocks[1] - 1) + if x_cut or y_cut: + dsk[(name, i, j)] = ( + _slice, + (x.name, i, j), + slice(heads[0]) if x_cut else slice(None), + slice(heads[1]) if y_cut else slice(None), + ) + else: + dsk[(name, i, j)] = (chunk_dup, (x.name, i, j), None, dtype, None) + + return name, dsk, new_chunks, numblocks + + def _add_tail(self, axis, size, name, dsk, chunks, numblocks): + """ + Append dask graph `dsk` with empty chunks on axis `axis` up to size `size` + """ + rem = size - self.shape[axis] + if rem > 0: + j = numblocks[axis] + other = 0 if axis else 1 + new_chunks = chunks[axis] + (rem,) + new_chunks = (chunks[0], new_chunks) if axis else (new_chunks, chunks[1]) + + for i, sz_i in enumerate(chunks[other]): + if axis: + dsk[(name, i, j)] = (compose(InnerMatrix, gb.Matrix.new), self.dtype, sz_i, rem) + else: + dsk[(name, j, i)] = (compose(InnerMatrix, gb.Matrix.new), self.dtype, rem, sz_i) + + return name, dsk, new_chunks, (len(new_chunks[0]), len(new_chunks[1])) + + else: + return name, dsk, chunks, numblocks def resize(self, nrows, ncols, inplace=True, chunks="auto"): + if any_dOnions(self, nrows, ncols): + donion = DOnion.multi_access( + self._meta, Matrix.resize, self, nrows, ncols, inplace=False, chunks=chunks + ) + if inplace: + self.__init__(donion, meta=self._meta) + return + else: + return Matrix(donion, meta=self._meta) + + name, dsk, new_chunks, num_blocks = self._head(self._delayed, (nrows, ncols)) + name, dsk, new_chunks, num_blocks = self._add_tail(0, nrows, name, dsk, new_chunks, num_blocks) + name, dsk, new_chunks, num_blocks = self._add_tail(1, ncols, name, dsk, new_chunks, num_blocks) + + graph = HighLevelGraph.from_collections(name, dsk, dependencies=[self._delayed]) + x = da.core.Array(graph, name, new_chunks, meta=wrap_inner(self._meta)) + x = x.rechunk(chunks=chunks) + + if nrows >= self.nrows and ncols >= self.ncols: + nvals = self.nvals + else: + nvals = None + + if inplace: + self.__init__(x, nvals=nvals) + else: + return Matrix(x, nvals=nvals) + + def _resize_old(self, nrows, ncols, inplace=True, chunks="auto"): + if self.is_dOnion: + donion = self._delayed.getattr( + self._meta, "resize", nrows, ncols, inplace=False, chunks=chunks + ) + if inplace: + self.__init__(donion, meta=self._meta) + return + else: + return Matrix(donion, meta=self._meta) + + if nrows >= self.nrows and ncols >= self.ncols: + new_matrix = Matrix.new(self.dtype, nrows, ncols, chunks=chunks) + rows, cols = slice(0, self.nrows), slice(0, self.ncols) + new_matrix[rows, cols] << self + nvals = self._nvals + elif nrows < self.nrows and ncols < self.ncols: + rows, cols = slice(0, nrows), slice(0, ncols) + new_matrix = self[rows, 
cols].new() + new_matrix.rechunk(chunks=chunks) + nvals = None + else: + new_matrix = Matrix.new(self.dtype, nrows, ncols, chunks=chunks) + rows, cols = slice(0, min(nrows, self.nrows)), slice(0, min(ncols, self.ncols)) + new_matrix[rows, cols] << self[rows, cols].new() + nvals = None + + if inplace: + self.__init__(new_matrix._delayed, nvals=nvals) + else: + return new_matrix + + def _resize_old2(self, nrows, ncols, inplace=True, chunks="auto"): + if self.is_dOnion: + donion = self._delayed.getattr( + self._meta, "resize", nrows, ncols, inplace=False, chunks=chunks + ) + if inplace: + self.__init__(donion, meta=self._meta) + return + else: + return Matrix(donion, meta=self._meta) + chunks = da.core.normalize_chunks(chunks, (nrows, ncols), dtype=np.int64) output_row_ranges = build_ranges_dask_array_from_chunks(chunks[0], "output_row_ranges-") output_col_ranges = build_ranges_dask_array_from_chunks(chunks[1], "output_col_ranges-") @@ -311,7 +687,40 @@ def resize(self, nrows, ncols, inplace=True, chunks="auto"): else: return Matrix(x, nvals=nvals) + def rechunk(self, inplace=False, chunks="auto"): + if self.is_dOnion: + meta = self._meta + donion = self._delayed.getattr(meta, "rechunk", inplace=False, chunks=chunks) + if inplace: + self.__init__(donion, meta=meta) + return + else: + return Matrix(donion, meta=meta) + + delayed = self._delayed.rechunk(chunks=chunks) + if inplace: + self._delayed = delayed + return + else: + return Matrix(delayed, meta=self._meta, nvals=self._nvals) + # chunks = da.core.normalize_chunks(chunks, self.shape, dtype=np.int64) + # if inplace: + # self.resize(*self.shape, chunks=chunks) + # return + # else: + # return self.resize(*self.shape, chunks=chunks, inplace=False) + + def diag(self, k=0, dtype=None, chunks="auto"): + return self._diag(k=k, dtype=dtype, chunks=chunks) + def _diag(self, k=0, dtype=None, chunks="auto"): + if self.is_dOnion: + meta = self._meta.diag(k=k, dtype=dtype) + donion = DOnion.multi_access( + meta, self.__class__._diag, self, k=k, dtype=dtype, chunks=chunks + ) + return get_return_type(meta)(donion, meta=meta) + kdiag_row_start = max(0, -k) kdiag_col_start = max(0, k) kdiag_row_stop = min(self.nrows, self.ncols - k) @@ -336,7 +745,7 @@ def _diag(self, k=0, dtype=None, chunks="auto"): row_blockid = np.arange(A.numblocks[0]) col_blockid = np.arange(A.numblocks[1]) - # locate first chunk containing diaagonal: + # locate first chunk containing diagonal: row_filter = (row_starts <= kdiag_row_start) & (kdiag_row_start < row_stops_) col_filter = (col_starts <= kdiag_col_start) & (kdiag_col_start < col_stops_) (R,) = row_blockid[row_filter] @@ -413,13 +822,35 @@ def _diag_old(self, k=0, dtype=None, chunks="auto"): nvals = 0 if self._nvals == 0 else None return get_return_type(meta)(delayed, nvals=nvals) - def __getitem__(self, index): - return AmbiguousAssignOrExtract(self, index) + def __getitem__(self, keys): + resolved_indexes = IndexerResolver(self, keys) + shape = resolved_indexes.shape + if not shape: + from .scalar import ScalarIndexExpr + + return ScalarIndexExpr(self, resolved_indexes) + elif len(shape) == 1: + from .vector import VectorIndexExpr + + return VectorIndexExpr(self, resolved_indexes, *shape) + else: + return MatrixIndexExpr(self, resolved_indexes, *shape) + + def __delitem__(self, keys, in_dOnion=False): + if is_DOnion(self._delayed): + good_keys = [x for x in keys if isinstance(x, Integral)] + if len(good_keys) != 2: + raise TypeError("Remove Element only supports scalars.") + + donion = self._delayed.getattr(self._meta, 
"__delitem__", keys, in_dOnion=True) + self.__init__(donion, meta=self._meta) + return - def __delitem__(self, keys): del Updater(self)[keys] + if in_dOnion: + return self - def __setitem__(self, index, delayed): + def __setitem__(self, index, delayed, in_dOnion=False): Updater(self)[index] = delayed def __contains__(self, index): @@ -437,219 +868,214 @@ def __iter__(self): return zip(rows.flat, columns.flat) def ewise_add(self, other, op=monoid.plus, *, require_monoid=True): - assert type(other) is Matrix # TODO: or TransposedMatrix - meta = self._meta.ewise_add(other._meta, op=op, require_monoid=require_monoid) - return GbDelayed(self, "ewise_add", other, op, require_monoid=require_monoid, meta=meta) + gb_types = (gb.Matrix, gb.matrix.TransposedMatrix) + other = self._expect_type( + other, (Matrix, TransposedMatrix) + gb_types, within="ewise_add", argname="other" + ) + + try: + meta = self._meta.ewise_add(other._meta, op=op, require_monoid=require_monoid) + except DimensionMismatch: + if any_dOnions(self, other): + meta = self._meta.ewise_add(self._meta, op=op, require_monoid=require_monoid) + else: + raise + + return MatrixExpression( + self, "ewise_add", other, op, require_monoid=require_monoid, meta=meta + ) def ewise_mult(self, other, op=binary.times): - assert type(other) is Matrix # TODO: or TransposedMatrix - meta = self._meta.ewise_mult(other._meta, op=op) - return GbDelayed(self, "ewise_mult", other, op, meta=meta) + gb_types = (gb.Matrix, gb.matrix.TransposedMatrix) + other = self._expect_type( + other, (Matrix, TransposedMatrix) + gb_types, within="ewise_mult", argname="other" + ) + + try: + meta = self._meta.ewise_mult(other._meta, op=op) + except DimensionMismatch: + if any_dOnions(self, other): + meta = self._meta.ewise_mult(self._meta, op=op) + else: + raise + + return MatrixExpression(self, "ewise_mult", other, op, meta=meta) def mxv(self, other, op=semiring.plus_times): - from .vector import Vector + from .vector import Vector, VectorExpression - assert type(other) is Vector - meta = self._meta.mxv(other._meta, op=op) - return GbDelayed(self, "mxv", other, op, meta=meta) + other = self._expect_type(other, (Vector, gb.Vector), within="mxv", argname="other") + + try: + meta = self._meta.mxv(other._meta, op=op) + except DimensionMismatch: + if any_dOnions(self, other): + other_meta = gb.Vector.new(dtype=other._meta.dtype, size=self._meta.ncols) + meta = self._meta.mxv(other_meta, op=op) + else: + raise + + return VectorExpression(self, "mxv", other, op, meta=meta, size=self.nrows) def mxm(self, other, op=semiring.plus_times): - assert type(other) in (Matrix, TransposedMatrix) - meta = self._meta.mxm(other._meta, op=op) - return GbDelayed(self, "mxm", other, op, meta=meta) + gb_types = (gb.Matrix, gb.matrix.TransposedMatrix) + other = self._expect_type( + other, (Matrix, TransposedMatrix) + gb_types, within="mxm", argname="other" + ) + + try: + meta = self._meta.mxm(other._meta, op=op) + except DimensionMismatch: + if any_dOnions(self, other): + other_meta = gb.Matrix.new( + dtype=other._meta.dtype, nrows=self._meta.ncols, ncols=other._meta.ncols + ) + meta = self._meta.mxm(other_meta, op=op) + else: + raise + + return MatrixExpression( + self, "mxm", other, op, meta=meta, nrows=self.nrows, ncols=other.ncols + ) def kronecker(self, other, op=binary.times): - assert type(other) is Matrix # TODO: or TransposedMatrix + gb_types = (gb.Matrix, gb.matrix.TransposedMatrix) + other = self._expect_type( + other, (Matrix, TransposedMatrix) + gb_types, within="kronecker", 
argname="other" + ) meta = self._meta.kronecker(other._meta, op=op) - return GbDelayed(self, "kronecker", other, op, meta=meta) + return MatrixExpression(self, "kronecker", other, op, meta=meta) def apply(self, op, right=None, *, left=None): - from .scalar import Scalar - left_meta = left right_meta = right - if type(left) is Scalar: - left_meta = left.dtype.np_type(0) - if type(right) is Scalar: - right_meta = right.dtype.np_type(0) - + if isinstance(left, BaseType): + left_meta = left._meta + if left_meta._is_scalar and left_meta.is_empty: + left_meta = gb.Scalar.from_value(0, dtype=left_meta.dtype) + if isinstance(right, BaseType): + right_meta = right._meta + if right_meta._is_scalar and right_meta.is_empty: + right_meta = gb.Scalar.from_value(0, dtype=right_meta.dtype) + + if self._meta.shape == (0,) * self.ndim: + self._meta.resize(*((1,) * self.ndim)) meta = self._meta.apply(op=op, left=left_meta, right=right_meta) - return GbDelayed(self, "apply", op, right, meta=meta, left=left) + return MatrixExpression(self, "apply", op, right, meta=meta, left=left) def reduce_rowwise(self, op=monoid.plus): + from .vector import VectorExpression + meta = self._meta.reduce_rowwise(op) - return GbDelayed(self, "reduce_rowwise", op, meta=meta) + return VectorExpression(self, "reduce_rowwise", op, meta=meta, size=self.nrows) def reduce_columnwise(self, op=monoid.plus): - meta = self._meta.reduce_columnwise(op) - return GbDelayed(self, "reduce_columnwise", op, meta=meta) - - def reduce_scalar(self, op=monoid.plus): - meta = self._meta.reduce_scalar(op) - return GbDelayed(self, "reduce_scalar", op, meta=meta) - - def build( - self, - rows, - columns, - values, - *, - dup_op=None, - clear=False, - nrows=None, - ncols=None, - chunks=None, - ): - if clear: - self.clear() - elif self.nvals.compute() > 0: - raise gb.exceptions.OutputNotEmpty + from .vector import VectorExpression - if nrows is not None or ncols is not None: - if nrows is None: - nrows = self._nrows - if ncols is None: - ncols = self._ncols - self.resize(nrows, ncols) + meta = self._meta.reduce_columnwise(op) + return VectorExpression(self, "reduce_columnwise", op, meta=meta, size=self.ncols) - if chunks is not None: - self.rechunk(inplace=True, chunks=chunks) + def reduce_scalar(self, op=monoid.plus, *, allow_empty=True): + from .scalar import ScalarExpression - x = self._optional_dup() - if type(rows) is list: - if np.max(rows) >= self._nrows: - raise gb.exceptions.IndexOutOfBound - rows = da.core.from_array(np.array(rows), name="rows-" + tokenize(rows)) - else: - if da.max(rows).compute() >= self._nrows: - raise gb.exceptions.IndexOutOfBound - if type(columns) is list: - if np.max(columns) >= self._ncols: - raise gb.exceptions.IndexOutOfBound - columns = da.core.from_array(np.array(columns), name="columns-" + tokenize(columns)) - else: - if da.max(columns).compute() >= self._ncols: - raise gb.exceptions.IndexOutOfBound - if type(values) is list: - values = da.core.from_array(np.array(values), name="values-" + tokenize(values)) + meta = self._meta.reduce_scalar(op) + return ScalarExpression(self, "reduce_scalar", op, meta=meta, allow_empty=allow_empty) - idtype = gb.Matrix.new(rows.dtype).dtype - np_idtype_ = np_dtype(idtype) - vdtype = gb.Matrix.new(values.dtype).dtype - np_vdtype_ = np_dtype(vdtype) + def to_values(self, dtype=None, chunks="auto"): + dtype = lookup_dtype(self.dtype if dtype is None else dtype) + meta_i, _, meta_v = self._meta.to_values(dtype) - rname = "-row-ranges" + tokenize(x, x.chunks[0]) - cname = "-col-ranges" + 
tokenize(x, x.chunks[1]) - row_ranges = build_chunk_ranges_dask_array(x, 0, rname) - col_ranges = build_chunk_ranges_dask_array(x, 1, cname) - fragments = da.core.blockwise( - *(_pick2D, "ijk"), - *(rows, "k"), - *(columns, "k"), - *(values, "k"), - *(row_ranges, "i"), - *(col_ranges, "j"), - dtype=np_idtype_, - meta=np.array([]), - ) - meta = InnerMatrix(gb.Matrix.new(vdtype)) - delayed = da.core.blockwise( - *(_build_2D_chunk, "ij"), - *(x, "ij"), - *(row_ranges, "i"), - *(col_ranges, "j"), - *(fragments, "ijk"), - dup_op=dup_op, - concatenate=False, - dtype=np_vdtype_, - meta=meta, - ) - self.__init__(delayed) + if self.is_dOnion: + meta = np.array([]) + result = DOnion.multi_access( + meta, self.__class__.to_values, self, dtype=dtype, chunks=chunks + ) + rows = DOnion.multi_access(meta_i, tuple.__getitem__, result, 0) + columns = DOnion.multi_access(meta_i, tuple.__getitem__, result, 1) + values = DOnion.multi_access(meta_v, tuple.__getitem__, result, 2) + return rows, columns, values - def to_values(self, dtype=None, chunks="auto"): - x = self._delayed # first find the number of values in each chunk and return # them as a 2D numpy array whose shape is equal to x.numblocks + x = self._delayed nvals_2D = da.core.blockwise( *(_nvals_in_chunk, "ij"), *(x, "ij"), adjust_chunks={"i": 1, "j": 1}, dtype=np.int64, meta=np.array([[]]), - ).compute() + ) # use the above array to determine the output tuples' array - # bounds (`starts` and `stops`) for each chunk of this + # bounds (`starts` and `stops_`) for each chunk of this # Matrix (self) - nvals_1D = nvals_2D.flatten() - - stops = np.cumsum(nvals_1D) - starts = np.roll(stops, 1) + stops_ = da.cumsum(nvals_2D) # BEWARE: this function rechunks! + starts = da.roll(stops_, 1) + starts = starts.copy() if starts.size == 1 else starts # bug!! starts[0] = 0 - nnz = stops[-1] - - # convert numpy 2D-arrays (`starts` and `stops`) to 2D dask Arrays - # of ranges. Don't forget to fix their `chunks` in oder to enable - # them to align with x - starts = starts.reshape(nvals_2D.shape) - starts = da.from_array(starts, chunks=1, name="starts" + tokenize(starts)) - starts = da.core.Array(starts.dask, starts.name, x.chunks, starts.dtype, meta=x._meta) - - stops = stops.reshape(nvals_2D.shape) - stops = da.from_array(stops, chunks=1, name="stops" + tokenize(stops)) - stops = da.core.Array(stops.dask, stops.name, x.chunks, stops.dtype, meta=x._meta) - - chunks = da.core.normalize_chunks(chunks, (nnz,), dtype=np.int64) - output_ranges = build_ranges_dask_array_from_chunks(chunks[0], "output_ranges-") + nnz = stops_[-1] + starts = starts.reshape(nvals_2D.shape).rechunk(1) + stops_ = stops_.reshape(nvals_2D.shape).rechunk(1) + + def _to_values(x, starts, stops_, dtype, chunks, nnz): + # the following changes the `.chunks` attribute of `starts` and `stops_` so that + # `blockwise()` can align them with `x` + starts = da.core.Array(starts.dask, starts.name, x.chunks, starts.dtype, meta=x._meta) + stops_ = da.core.Array(stops_.dask, stops_.name, x.chunks, stops_.dtype, meta=x._meta) + + chunks = da.core.normalize_chunks(chunks, (nnz,), dtype=np.int64) + output_ranges = build_ranges_dask_array_from_chunks(chunks[0], "output_ranges-") + + gb_dtype = lookup_dtype(dtype) + dtype_ = np_dtype(gb_dtype) + # Compute row/col offsets as dask arrays that can align with this + # Matrix's (self's) chunks to convert chunk row/col indices to + # full dask-grblas Matrix indices. 
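+            # Illustrative aside (editorial): chunk-local -> global index
+            # translation. For rows chunked as (2, 3) the offsets are (0, 2),
+            # so an entry at local row r in block i sits at global row
+            # r + offsets[i]:
+            #
+            #     chunks0 = (2, 3)
+            #     offsets = np.roll(np.cumsum(chunks0), 1)
+            #     offsets[0] = 0
+            #     assert offsets.tolist() == [0, 2]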
+ row_offsets = build_chunk_offsets_dask_array(x, 0, "row_offset-") + col_offsets = build_chunk_offsets_dask_array(x, 1, "col_offset-") + x = da.core.blockwise( + *(MatrixTupleExtractor, "ijk"), + *(output_ranges, "k"), + *(x, "ij"), + *(row_offsets, "i"), + *(col_offsets, "j"), + *(starts, "ij"), + *(stops_, "ij"), + gb_dtype=dtype, + dtype=dtype_, + meta=np.array([[[]]]), + ) + x = da.reduction( + x, _identity, _flatten, axis=1, concatenate=False, dtype=dtype_, meta=np.array([[]]) + ) + return da.reduction( + x, _identity, _flatten, axis=0, concatenate=False, dtype=dtype_, meta=np.array([]) + ) - dtype_ = np_dtype(self.dtype) - # Compute row/col offsets as dask arrays that can align with this - # Matrix's (self's) chunks to convert chunk row/col indices to - # full dask-grblas Matrix indices. - row_offsets = build_chunk_offsets_dask_array(x, 0, "row_offset-") - col_offsets = build_chunk_offsets_dask_array(x, 1, "col_offset-") - x = da.core.blockwise( - *(MatrixTupleExtractor, "ijk"), - *(output_ranges, "k"), - *(x, "ij"), - *(row_offsets, "i"), - *(col_offsets, "j"), - *(starts, "ij"), - *(stops, "ij"), - gb_dtype=dtype, - dtype=dtype_, - meta=np.array([[[]]]), - ) - x = da.reduction( - x, _identity, _flatten, axis=1, concatenate=False, dtype=dtype_, meta=np.array([[]]) - ) - x = da.reduction( - x, _identity, _flatten, axis=0, concatenate=False, dtype=dtype_, meta=np.array([]) - ) + # since the size of the output (rows, columns, values) depends on nnz, a delayed quantity, + # we need to return the output as DOnions (twice-delayed dask-arrays) + meta = np.array([]) + rcv_donion = DOnion.sprout(nnz, meta, _to_values, x, starts, stops_, dtype, chunks) - meta_i, meta_j, meta_v = self._meta.to_values(dtype) - rows = da.map_blocks(_get_rows, x, dtype=meta_i.dtype, meta=meta_i) - cols = da.map_blocks(_get_cols, x, dtype=meta_j.dtype, meta=meta_j) - vals = da.map_blocks(_get_vals, x, dtype=meta_v.dtype, meta=meta_v) + dtype_i = np_dtype(lookup_dtype(meta_i.dtype)) + rows = rcv_donion.deep_extract(meta_i, da.map_blocks, _get_rows, dtype=dtype_i, meta=meta_i) + cols = rcv_donion.deep_extract(meta_i, da.map_blocks, _get_cols, dtype=dtype_i, meta=meta_i) + dtype_v = np_dtype(lookup_dtype(meta_v.dtype)) + vals = rcv_donion.deep_extract(meta_v, da.map_blocks, _get_vals, dtype=dtype_v, meta=meta_v) return rows, cols, vals - def rechunk(self, inplace=False, chunks="auto"): - chunks = da.core.normalize_chunks(chunks, self.shape, dtype=np.int64) - if inplace: - self.resize(*self.shape, chunks=chunks) - else: - return self.resize(*self.shape, chunks=chunks, inplace=False) - def isequal(self, other, *, check_dtype=False): + gb_types = (gb.Matrix, gb.matrix.TransposedMatrix) other = self._expect_type( - other, (Matrix, TransposedMatrix), within="isequal", argname="other" + other, (Matrix, TransposedMatrix) + gb_types, within="isequal", argname="other" ) return super().isequal(other, check_dtype=check_dtype) def isclose(self, other, *, rel_tol=1e-7, abs_tol=0.0, check_dtype=False): + gb_types = (gb.Matrix, gb.matrix.TransposedMatrix) other = self._expect_type( - other, (Matrix, TransposedMatrix), within="isclose", argname="other" + other, (Matrix, TransposedMatrix) + gb_types, within="isclose", argname="other" ) return super().isclose(other, rel_tol=rel_tol, abs_tol=abs_tol, check_dtype=check_dtype) @@ -681,29 +1107,79 @@ def _delete_element(self, resolved_indexes): class TransposedMatrix: ndim = 2 + _is_scalar = False _is_transposed = True - def __init__(self, matrix): + __and__ = 
gb.matrix.TransposedMatrix.__and__ + __bool__ = gb.matrix.TransposedMatrix.__bool__ + __or__ = gb.matrix.TransposedMatrix.__or__ + + __abs__ = gb.matrix.TransposedMatrix.__abs__ + __add__ = gb.matrix.TransposedMatrix.__add__ + __divmod__ = gb.matrix.TransposedMatrix.__divmod__ + __eq__ = gb.matrix.TransposedMatrix.__eq__ + __floordiv__ = gb.matrix.TransposedMatrix.__floordiv__ + __ge__ = gb.matrix.TransposedMatrix.__ge__ + __gt__ = gb.matrix.TransposedMatrix.__gt__ + __invert__ = gb.matrix.TransposedMatrix.__invert__ + __le__ = gb.matrix.TransposedMatrix.__le__ + __lt__ = gb.matrix.TransposedMatrix.__lt__ + __mod__ = gb.matrix.TransposedMatrix.__mod__ + __mul__ = gb.matrix.TransposedMatrix.__mul__ + __ne__ = gb.matrix.TransposedMatrix.__ne__ + __neg__ = gb.matrix.TransposedMatrix.__neg__ + __pow__ = gb.matrix.TransposedMatrix.__pow__ + __radd__ = gb.matrix.TransposedMatrix.__radd__ + __rdivmod__ = gb.matrix.TransposedMatrix.__rdivmod__ + __rfloordiv__ = gb.matrix.TransposedMatrix.__rfloordiv__ + __rmod__ = gb.matrix.TransposedMatrix.__rmod__ + __rmul__ = gb.matrix.TransposedMatrix.__rmul__ + __rpow__ = gb.matrix.TransposedMatrix.__rpow__ + __rsub__ = gb.matrix.TransposedMatrix.__rsub__ + __rtruediv__ = gb.matrix.TransposedMatrix.__rtruediv__ + __rxor__ = gb.matrix.TransposedMatrix.__rxor__ + __sub__ = gb.matrix.TransposedMatrix.__sub__ + __truediv__ = gb.matrix.TransposedMatrix.__truediv__ + __xor__ = gb.matrix.TransposedMatrix.__xor__ + + def __init__(self, matrix, meta=None): assert type(matrix) is Matrix self._matrix = matrix - self._meta = matrix._meta.T + self._meta = matrix._meta.T if meta is None else meta # Aggregator-specific requirements: - self._nrows = self.nrows - self._ncols = self.ncols + self._nrows = self._meta.nrows + self._ncols = self._meta.ncols + + @property + def is_dOnion(self): + return is_DOnion(self._matrix._delayed) + + @property + def dOnion_if(self): + return self._matrix._delayed if self.is_dOnion else self + + def dup(self, dtype=None, *, mask=None, name=None): + return self.new(dtype=dtype, mask=mask) def new(self, *, dtype=None, mask=None): + if any_dOnions(self, mask): + donion = DOnion.multi_access( + self._meta.new(dtype), self.__class__.new, self, dtype=dtype, mask=mask + ) + return Matrix(donion) + gb_dtype = self._matrix.dtype if dtype is None else lookup_dtype(dtype) dtype = np_dtype(gb_dtype) delayed = self._matrix._delayed if mask is None: - mask_ind = None mask_type = None + mask_ind = None else: - mask = mask.mask - mask_ind = "ji" mask_type = get_grblas_type(mask) + mask = mask.mask._delayed + mask_ind = "ji" delayed = da.core.blockwise( *(_transpose, "ji"), *(delayed, "ij"), @@ -724,17 +1200,27 @@ def dtype(self): return self._meta.dtype def to_values(self, dtype=None, chunks="auto"): - # TODO: make this lazy; can we do something smart with this? 
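# --- illustrative aside (not part of the patch) ------------------------------
# In coordinate form a transpose is only a relabeling, which is why the new
# `to_values` below can stay lazy: it swaps the row and column index arrays
# instead of moving any data, e.g.
#
#     import numpy as np
#
#     rows, cols = np.array([0, 1]), np.array([2, 3])
#     t_rows, t_cols = cols, rows  # COO transpose
# ------------------------------------------------------------------------------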
rows, cols, vals = self._matrix.to_values(dtype=dtype, chunks=chunks) return cols, rows, vals # Properties - nrows = Matrix.nrows - ncols = Matrix.ncols - shape = Matrix.shape - nvals = Matrix.nvals + def isequal(self, other, *, check_dtype=False): + other = self._expect_type( + other, (Matrix, TransposedMatrix), within="isequal", argname="other" + ) + return BaseType.isequal(self, other, check_dtype=check_dtype) + + def isclose(self, other, *, rel_tol=1e-7, abs_tol=0.0, check_dtype=False): + other = self._expect_type( + other, (Matrix, TransposedMatrix), within="isclose", argname="other" + ) + return BaseType.isclose( + self, other, rel_tol=rel_tol, abs_tol=abs_tol, check_dtype=check_dtype + ) # Delayed methods + __contains__ = Matrix.__contains__ + __getitem__ = Matrix.__getitem__ ewise_add = Matrix.ewise_add ewise_mult = Matrix.ewise_mult mxv = Matrix.mxv @@ -746,13 +1232,211 @@ def to_values(self, dtype=None, chunks="auto"): reduce_scalar = Matrix.reduce_scalar # Misc. - isequal = Matrix.isequal - isclose = Matrix.isclose - __getitem__ = Matrix.__getitem__ + nrows = Matrix.nrows + ncols = Matrix.ncols + shape = Matrix.shape + nvals = Matrix.nvals + _expect_type = Matrix._expect_type __array__ = Matrix.__array__ name = Matrix.name +class MatrixExpression(GbDelayed): + __slots__ = () + output_type = gb.Matrix + ndim = 2 + _is_scalar = False + + # automethods: + __and__ = gb.matrix.MatrixExpression.__and__ + __bool__ = gb.matrix.MatrixExpression.__bool__ + __or__ = gb.matrix.MatrixExpression.__or__ + _get_value = _automethods._get_value + S = gb.matrix.MatrixExpression.S + T = gb.matrix.MatrixExpression.T + V = gb.matrix.MatrixExpression.V + apply = gb.matrix.MatrixExpression.apply + ewise_add = gb.matrix.MatrixExpression.ewise_add + ewise_mult = gb.matrix.MatrixExpression.ewise_mult + isclose = gb.matrix.MatrixExpression.isclose + isequal = gb.matrix.MatrixExpression.isequal + kronecker = gb.matrix.MatrixExpression.kronecker + mxm = gb.matrix.MatrixExpression.mxm + mxv = gb.matrix.MatrixExpression.mxv + ncols = gb.matrix.MatrixExpression.ncols + nrows = gb.matrix.MatrixExpression.nrows + nvals = gb.matrix.MatrixExpression.nvals + reduce_rowwise = gb.matrix.MatrixExpression.reduce_rowwise + reduce_columnwise = gb.matrix.MatrixExpression.reduce_columnwise + reduce_scalar = gb.matrix.MatrixExpression.reduce_scalar + shape = gb.matrix.MatrixExpression.shape + nvals = gb.matrix.MatrixExpression.nvals + + # infix sugar: + __abs__ = gb.matrix.MatrixExpression.__abs__ + __add__ = gb.matrix.MatrixExpression.__add__ + __divmod__ = gb.matrix.MatrixExpression.__divmod__ + __eq__ = gb.matrix.MatrixExpression.__eq__ + __floordiv__ = gb.matrix.MatrixExpression.__floordiv__ + __ge__ = gb.matrix.MatrixExpression.__ge__ + __gt__ = gb.matrix.MatrixExpression.__gt__ + __invert__ = gb.matrix.MatrixExpression.__invert__ + __le__ = gb.matrix.MatrixExpression.__le__ + __lt__ = gb.matrix.MatrixExpression.__lt__ + __mod__ = gb.matrix.MatrixExpression.__mod__ + __mul__ = gb.matrix.MatrixExpression.__mul__ + __ne__ = gb.matrix.MatrixExpression.__ne__ + __neg__ = gb.matrix.MatrixExpression.__neg__ + __pow__ = gb.matrix.MatrixExpression.__pow__ + __radd__ = gb.matrix.MatrixExpression.__radd__ + __rdivmod__ = gb.matrix.MatrixExpression.__rdivmod__ + __rfloordiv__ = gb.matrix.MatrixExpression.__rfloordiv__ + __rmod__ = gb.matrix.MatrixExpression.__rmod__ + __rmul__ = gb.matrix.MatrixExpression.__rmul__ + __rpow__ = gb.matrix.MatrixExpression.__rpow__ + __rsub__ = gb.matrix.MatrixExpression.__rsub__ + __rtruediv__ = 
gb.matrix.MatrixExpression.__rtruediv__ + __rxor__ = gb.matrix.MatrixExpression.__rxor__ + __sub__ = gb.matrix.MatrixExpression.__sub__ + __truediv__ = gb.matrix.MatrixExpression.__truediv__ + __xor__ = gb.matrix.MatrixExpression.__xor__ + + # bad sugar: + __itruediv__ = gb.matrix.MatrixExpression.__itruediv__ + __imul__ = gb.matrix.MatrixExpression.__imul__ + __imatmul__ = gb.matrix.MatrixExpression.__imatmul__ + __iadd__ = gb.matrix.MatrixExpression.__iadd__ + __iand__ = gb.matrix.MatrixExpression.__iand__ + __ipow__ = gb.matrix.MatrixExpression.__ipow__ + __imod__ = gb.matrix.MatrixExpression.__imod__ + __isub__ = gb.matrix.MatrixExpression.__isub__ + __ixor__ = gb.matrix.MatrixExpression.__ixor__ + __ifloordiv__ = gb.matrix.MatrixExpression.__ifloordiv__ + __ior__ = gb.matrix.MatrixExpression.__ior__ + + def __init__( + self, + parent, + method_name, + *args, + meta=None, + ncols=None, + nrows=None, + **kwargs, + ): + super().__init__( + parent, + method_name, + *args, + meta=meta, + **kwargs, + ) + if ncols is None: + ncols = self.parent._ncols + if nrows is None: + nrows = self.parent._nrows + self._ncols = ncols + self._nrows = nrows + + # def __getattr__(self, item): + # return getattr(gb.matrix.MatrixExpression, item) + + # def construct_output(self, dtype=None, *, name=None): + # if dtype is None: + # dtype = self.dtype + # nrows = 0 if self._nrows.is_dOnion else self._nrows + # ncols = 0 if self._ncols.is_dOnion else self._ncols + # return Matrix.new(dtype, nrows, ncols, name=name) + + +class MatrixIndexExpr(AmbiguousAssignOrExtract): + __slots__ = "_ncols", "_nrows" + ndim = 2 + output_type = gb.Matrix + _is_transposed = False + + def __init__(self, parent, resolved_indexes, nrows, ncols): + super().__init__(parent, resolved_indexes) + self._nrows = nrows + self._ncols = ncols + + @property + def ncols(self): + return self._ncols + + @property + def nrows(self): + return self._nrows + + @property + def shape(self): + return (self._nrows, self._ncols) + + # Begin auto-generated code: Matrix + __and__ = gb.matrix.MatrixIndexExpr.__and__ + __bool__ = gb.matrix.MatrixIndexExpr.__bool__ + __or__ = gb.matrix.MatrixIndexExpr.__or__ + _get_value = _automethods._get_value + S = gb.matrix.MatrixIndexExpr.S + T = gb.matrix.MatrixIndexExpr.T + V = gb.matrix.MatrixIndexExpr.V + apply = gb.matrix.MatrixIndexExpr.apply + ewise_add = gb.matrix.MatrixIndexExpr.ewise_add + ewise_mult = gb.matrix.MatrixIndexExpr.ewise_mult + isclose = gb.matrix.MatrixIndexExpr.isclose + isequal = gb.matrix.MatrixIndexExpr.isequal + kronecker = gb.matrix.MatrixIndexExpr.kronecker + mxm = gb.matrix.MatrixIndexExpr.mxm + mxv = gb.matrix.MatrixIndexExpr.mxv + nvals = gb.matrix.MatrixIndexExpr.nvals + reduce_rowwise = gb.matrix.MatrixIndexExpr.reduce_rowwise + reduce_columnwise = gb.matrix.MatrixIndexExpr.reduce_columnwise + reduce_scalar = gb.matrix.MatrixIndexExpr.reduce_scalar + nvals = gb.matrix.MatrixIndexExpr.nvals + + # infix sugar: + __abs__ = gb.matrix.MatrixIndexExpr.__abs__ + __add__ = gb.matrix.MatrixIndexExpr.__add__ + __divmod__ = gb.matrix.MatrixIndexExpr.__divmod__ + __eq__ = gb.matrix.MatrixIndexExpr.__eq__ + __floordiv__ = gb.matrix.MatrixIndexExpr.__floordiv__ + __ge__ = gb.matrix.MatrixIndexExpr.__ge__ + __gt__ = gb.matrix.MatrixIndexExpr.__gt__ + __invert__ = gb.matrix.MatrixIndexExpr.__invert__ + __le__ = gb.matrix.MatrixIndexExpr.__le__ + __lt__ = gb.matrix.MatrixIndexExpr.__lt__ + __mod__ = gb.matrix.MatrixIndexExpr.__mod__ + __mul__ = gb.matrix.MatrixIndexExpr.__mul__ + __ne__ = 
gb.matrix.MatrixIndexExpr.__ne__ + __neg__ = gb.matrix.MatrixIndexExpr.__neg__ + __pow__ = gb.matrix.MatrixIndexExpr.__pow__ + __radd__ = gb.matrix.MatrixIndexExpr.__radd__ + __rdivmod__ = gb.matrix.MatrixIndexExpr.__rdivmod__ + __rfloordiv__ = gb.matrix.MatrixIndexExpr.__rfloordiv__ + __rmod__ = gb.matrix.MatrixIndexExpr.__rmod__ + __rmul__ = gb.matrix.MatrixIndexExpr.__rmul__ + __rpow__ = gb.matrix.MatrixIndexExpr.__rpow__ + __rsub__ = gb.matrix.MatrixIndexExpr.__rsub__ + __rtruediv__ = gb.matrix.MatrixIndexExpr.__rtruediv__ + __rxor__ = gb.matrix.MatrixIndexExpr.__rxor__ + __sub__ = gb.matrix.MatrixIndexExpr.__sub__ + __truediv__ = gb.matrix.MatrixIndexExpr.__truediv__ + __xor__ = gb.matrix.MatrixIndexExpr.__xor__ + + # bad sugar: + __itruediv__ = gb.matrix.MatrixIndexExpr.__itruediv__ + __imul__ = gb.matrix.MatrixIndexExpr.__imul__ + __imatmul__ = gb.matrix.MatrixIndexExpr.__imatmul__ + __iadd__ = gb.matrix.MatrixIndexExpr.__iadd__ + __iand__ = gb.matrix.MatrixIndexExpr.__iand__ + __ipow__ = gb.matrix.MatrixIndexExpr.__ipow__ + __imod__ = gb.matrix.MatrixIndexExpr.__imod__ + __isub__ = gb.matrix.MatrixIndexExpr.__isub__ + __ixor__ = gb.matrix.MatrixIndexExpr.__ixor__ + __ifloordiv__ = gb.matrix.MatrixIndexExpr.__ifloordiv__ + __ior__ = gb.matrix.MatrixIndexExpr.__ior__ + + def _chunk_diag_v2(inner_matrix, k): return wrap_inner(gb.ss.diag(inner_matrix.value, k)) @@ -911,7 +1595,9 @@ def _build_2D_chunk( out_row_range, out_col_range, fragments, + values, dup_op=None, + clear=False, ): """ Reassembles filtered tuples (row, col, val) in the list `fragments` @@ -921,17 +1607,25 @@ def _build_2D_chunk( """ rows = np.concatenate([rows for (rows, _, _) in fragments]) cols = np.concatenate([cols for (_, cols, _) in fragments]) - vals = np.concatenate([vals for (_, _, vals) in fragments]) nrows = out_row_range[0].stop - out_row_range[0].start ncols = out_col_range[0].stop - out_col_range[0].start - inner_matrix.value.build( - rows, - cols, - vals, - nrows=nrows, - ncols=ncols, - dup_op=dup_op, - ) + if not clear and inner_matrix.value.nvals > 0: + raise gb.exceptions.OutputNotEmpty() + + if values is None: + vals = np.concatenate([vals for (_, _, vals) in fragments]) + inner_matrix.value.build( + rows, + cols, + vals, + nrows=nrows, + ncols=ncols, + dup_op=dup_op, + clear=clear, + ) + else: + vals = values + inner_matrix.value.ss.build_scalar(rows, cols, vals) return InnerMatrix(inner_matrix.value) @@ -944,7 +1638,7 @@ def _new_Matrix_chunk(out_row_range, out_col_range, gb_dtype=None): return InnerMatrix(gb.Matrix.new(gb_dtype, nrows=nrows, ncols=ncols)) -def _from_values2D(fragments, out_row_range, out_col_range, gb_dtype=None): +def _from_values2D(values, fragments, out_row_range, out_col_range, dup_op=None, gb_dtype=None): """ Reassembles filtered tuples (row, col, val) in the list `fragments` obtained from _pick2D() for the chunk within the given row and column @@ -953,26 +1647,46 @@ def _from_values2D(fragments, out_row_range, out_col_range, gb_dtype=None): """ rows = np.concatenate([rows for (rows, _, _) in fragments]) cols = np.concatenate([cols for (_, cols, _) in fragments]) - vals = np.concatenate([vals for (_, _, vals) in fragments]) + if values is None: + vals = np.concatenate([vals for (_, _, vals) in fragments]) + else: + vals = values nrows = out_row_range[0].stop - out_row_range[0].start ncols = out_col_range[0].stop - out_col_range[0].start + if rows.size == 0 or cols.size == 0: + return InnerMatrix(gb.Matrix.new(gb_dtype, nrows=nrows, ncols=ncols)) return InnerMatrix( - 
gb.Matrix.from_values(rows, cols, vals, nrows=nrows, ncols=ncols, dtype=gb_dtype) + gb.Matrix.from_values( + rows, cols, vals, nrows=nrows, ncols=ncols, dup_op=dup_op, dtype=gb_dtype + ) ) -def _pick2D(rows, cols, values, row_range, col_range): +def _pick2D(rows, cols, values, row_range, col_range, shape): """ Filters out only those tuples (row, col, val) that lie within the given row and column ranges. Indices are also offset appropriately. """ + # validate indices: + rows = np.where(rows < 0, rows + shape[0], rows) + bad_indices = (rows < 0) | (shape[0] <= rows) + if np.any(bad_indices): + raise IndexOutOfBound + + cols = np.where(cols < 0, cols + shape[1], cols) + bad_indices = (cols < 0) | (shape[1] <= cols) + if np.any(bad_indices): + raise IndexOutOfBound + + # filter into chunk: row_range, col_range = row_range[0], col_range[0] rows_in = (row_range.start <= rows) & (rows < row_range.stop) cols_in = (col_range.start <= cols) & (cols < col_range.stop) rows = rows[rows_in & cols_in] - row_range.start cols = cols[rows_in & cols_in] - col_range.start - values = values[rows_in & cols_in] + if isinstance(values, np.ndarray): + values = values[rows_in & cols_in] return (rows, cols, values) @@ -1005,7 +1719,8 @@ def _identity(chunk, keepdims=None, axis=None): def _concatenate_files(chunk_files, keepdims=None, axis=None): import os import shutil - from scipy.io.mmio import MMFile, mminfo + from .io import MMFile + from scipy.io import mminfo chunk_files = chunk_files if type(chunk_files) is list else [chunk_files] first_chunk_file, _, row_range_first, col_range_first = chunk_files[0] @@ -1162,3 +1877,5 @@ def _concat_matrix(seq, axis=0): gb.utils._output_types[Matrix] = gb.Matrix gb.utils._output_types[TransposedMatrix] = gb.matrix.TransposedMatrix +gb.utils._output_types[MatrixExpression] = gb.Matrix +gb.utils._output_types[MatrixIndexExpr] = gb.Matrix diff --git a/dask_grblas/scalar.py b/dask_grblas/scalar.py index 89c74e7..2266504 100644 --- a/dask_grblas/scalar.py +++ b/dask_grblas/scalar.py @@ -3,8 +3,9 @@ import numpy as np from dask.delayed import Delayed, delayed -from .base import BaseType, InnerBaseType -from .expr import AmbiguousAssignOrExtract, GbDelayed +from . 
import _automethods
+from .base import BaseType, InnerBaseType, DOnion, Box, any_dOnions
+from .expr import AmbiguousAssignOrExtract, GbDelayed, _is_pair
 from .utils import get_meta, np_dtype
@@ -67,35 +68,128 @@ def new(cls, dtype, *, name=None):
         return new(cls, dtype, name=name)

     def __init__(self, delayed, meta=None):
-        assert type(delayed) is da.Array, type(delayed)
-        assert delayed.ndim == 0
+        assert type(delayed) in {da.Array, DOnion}, type(delayed)
         self._delayed = delayed
+        if type(delayed) is da.Array:
+            assert delayed.ndim == 0
         if meta is None:
             meta = gb.Scalar.new(delayed.dtype)
+            # meta = gb.Scalar.from_value(1, dtype=delayed.dtype)
         self._meta = meta
         self.dtype = meta.dtype

-    def update(self, expr):
+    def update(self, expr, in_dOnion=False):
+        typ = type(expr)
+        if any_dOnions(self, expr):
+            self_copy = self.__class__(self._optional_dup(), meta=self._meta)
+            expr_ = expr
+            if isinstance(expr, AmbiguousAssignOrExtract) and expr.has_dOnion:
+
+                def update_by_aae(c, p, k_0, k_1):
+                    keys = k_0 if k_1 is None else (k_0, k_1)
+                    return c.update(p[keys], in_dOnion=True)
+
+                if _is_pair(expr_.index):
+                    keys_0, keys_1 = expr_.index[0], expr_.index[1]
+                else:
+                    keys_0, keys_1 = expr_.index, None
+
+                donion = DOnion.multi_access(
+                    self._meta,
+                    update_by_aae,
+                    self_copy,
+                    expr_.parent,
+                    *(keys_0, keys_1),
+                )
+                self.__init__(donion, self._meta)
+                return
+
+            if isinstance(expr, GbDelayed) and expr.has_dOnion:
+
+                def update_by_gbd(c, *args, **kwargs):
+                    gbd = getattr(args[0], args[1])(*args[2:], **kwargs)
+                    return c.update(gbd, in_dOnion=True)
+
+                donion = DOnion.multi_access(
+                    self._meta,
+                    update_by_gbd,
+                    self_copy,
+                    expr_.parent,
+                    expr_.method_name,
+                    *expr_.args,
+                    **expr_.kwargs,
+                )
+                self.__init__(donion, self._meta)
+                return
+
+            donion = DOnion.multi_access(
+                self._meta, Scalar.update, self_copy, expr_, in_dOnion=True
+            )
+            self.__init__(donion, self._meta)
+            return
+
+        if typ is Box:
+            expr = expr.content
+            typ = type(expr)
+
         self._meta.update(get_meta(expr))
         self._meta.clear()
-        typ = type(expr)
-        if typ is AmbiguousAssignOrExtract:
+        if isinstance(expr, AmbiguousAssignOrExtract):
             # Extract (s << v[index])
-            self.value = expr.new(dtype=self.dtype).value
+            expr_new = expr.new(dtype=self.dtype)
+            self.value = expr_new.value
         elif typ is Scalar:
             # Simple assignment (s << t)
             self.value = expr.value
-        elif typ is GbDelayed:
+        elif isinstance(expr, GbDelayed):
             # s << v.reduce()
             expr._update(self)
         else:
             # Try simple assignment (s << 1)
             self.value = expr
+        if in_dOnion:
+            return self.__class__(self._delayed, meta=self._meta)

-    def _update(self, delayed, *, accum):
+    def _update(self, expr, *, accum, in_dOnion=False):
         # s(accum=accum) << v.reduce()
-        assert type(delayed) is GbDelayed
-        delayed._update(self, accum=accum)
+        typ = type(expr)
+        if typ is Box:
+            expr = expr.content
+
+        assert isinstance(expr, GbDelayed)
+
+        if any_dOnions(self, expr):
+            self_copy = self.__class__(self._optional_dup(), meta=self._meta)
+            expr_ = expr
+            if isinstance(expr, GbDelayed) and expr.has_dOnion:
+
+                def _update_by_gbd(c, *args, accum=None, **kwargs):
+                    gbd = getattr(args[0], args[1])(*args[2:], **kwargs)
+                    return c._update(gbd, accum=accum, in_dOnion=True)
+
+                donion = DOnion.multi_access(
+                    self._meta,
+                    _update_by_gbd,
+                    self_copy,
+                    expr_.parent,
+                    expr_.method_name,
+                    *expr_.args,
+                    accum=accum,
+                    **expr_.kwargs,
+                )
+                self.__init__(donion, self._meta)
+                return
+
+            expr_ = expr.parent.dOnion_if
+            donion = DOnion.multi_access(
+                self._meta, Scalar._update, self_copy, expr_, accum=accum, 
in_dOnion=True + ) + self.__init__(donion, self._meta) + return + + expr._update(self, accum=accum) + if in_dOnion: + return self.__class__(self._delayed, meta=self._meta) def dup(self, dtype=None, *, name=None): if dtype is None: @@ -112,7 +206,10 @@ def dup(self, dtype=None, *, name=None): def _persist(self, *args, **kwargs): """Since scalars are small, persist them if they need to be computed""" - self._delayed = self._delayed.persist(*args, **kwargs) + if self.is_dOnion: + self._delayed = self._delayed._persist(*args, **kwargs) + else: + self._delayed = self._delayed.persist(*args, **kwargs) def __eq__(self, other): return self.isequal(other).compute() @@ -154,11 +251,28 @@ def __array__(self, dtype=None): def isequal(self, other, *, check_dtype=False): if other is None: return self.is_empty + if type(other) is Box: + other = other.content if type(other) is not Scalar: + if other is None: + return self.is_empty self._meta.isequal(get_meta(other)) - other = Scalar.from_value(other) + try: + other = Scalar.from_value(other) + except TypeError: + other = self._expect_type( + other, + (Scalar, gb.Scalar), + within="isequal", + argname="other", + extra_message="Literal scalars also accepted.", + ) + # Don't check dtype if we had to infer dtype of `other` check_dtype = False - return super().isequal(other, check_dtype=check_dtype) + if check_dtype and self.dtype != other.dtype: + return False + else: + return super().isequal(other, check_dtype=check_dtype) def isclose(self, other, *, rel_tol=1e-7, abs_tol=0.0, check_dtype=False): if other is None: @@ -171,6 +285,10 @@ def isclose(self, other, *, rel_tol=1e-7, abs_tol=0.0, check_dtype=False): @property def is_empty(self): + if self.is_dOnion: + donion = DOnion.multi_access(gb.Scalar.new(bool), getattr, self, "is_empty") + return PythonScalar(donion) + delayed = da.core.elemwise( _is_empty, self._delayed, @@ -198,6 +316,11 @@ def value(self): @value.setter def value(self, val): + if any_dOnions(self, val): + donion = DOnion.multi_access(self._meta, Scalar.from_value, val) + self.__init__(donion, meta=self._meta) + return + scalar = Scalar.from_value(val, dtype=self.dtype) self._delayed = scalar._delayed @@ -210,6 +333,7 @@ class PythonScalar: __complex__ = Scalar.__complex__ __index__ = Scalar.__index__ _persist = Scalar._persist + is_dOnion = Scalar.is_dOnion @classmethod def from_delayed(cls, scalar, dtype, *, name=None): @@ -228,9 +352,67 @@ def __eq__(self, other): def compute(self, *args, **kwargs): innerval = self._delayed.compute(*args, **kwargs) + if self.is_dOnion: + return innerval.value if hasattr(innerval, "value") else innerval + return innerval.value.value +class ScalarExpression(GbDelayed): + __slots__ = () + output_type = gb.Scalar + ndim = 0 + shape = () + _is_scalar = True + _is_cscalar = False + __and__ = gb.scalar.ScalarExpression.__and__ + __bool__ = gb.scalar.ScalarExpression.__bool__ + __eq__ = gb.scalar.ScalarExpression.__eq__ + __float__ = gb.scalar.ScalarExpression.__float__ + __index__ = gb.scalar.ScalarExpression.__index__ + __int__ = gb.scalar.ScalarExpression.__int__ + __or__ = gb.scalar.ScalarExpression.__or__ + _get_value = _automethods._get_value + isclose = gb.scalar.ScalarExpression.isclose + isequal = gb.scalar.ScalarExpression.isequal + value = gb.scalar.ScalarExpression.value + + # def __getattr__(self, item): + # return getattr(gb.scalar.ScalarExpression, item) + + +class ScalarIndexExpr(AmbiguousAssignOrExtract): + output_type = gb.Scalar + ndim = 0 + shape = () + _is_scalar = True + _is_cscalar = False 
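+    # Note: like the other *IndexExpr classes, this defers extraction until
+    # `.new()` is called or `autocompute` resolves it through
+    # `_automethods._get_value`.  A hypothetical session (sketch only):
+    #
+    #     s = v[6]              # ScalarIndexExpr; nothing is computed yet
+    #     s.new()               # extracts and materializes the scalar
+    #     with gb.config.set(autocompute=True):
+    #         s == 0            # _get_value() computes on demand
+    #
+    # Without autocompute, touching `.value` raises a TypeError pointing
+    # users at `.new()`.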
+
+    dup = AmbiguousAssignOrExtract.new  # bind the inherited ``new`` explicitly; a bare ``new`` is undefined in this class body
+
+    @property
+    def is_cscalar(self):
+        return self._is_cscalar
+
+    @property
+    def is_grbscalar(self):
+        return not self._is_cscalar
+
+    # Begin auto-generated code: Scalar
+    __and__ = gb.scalar.ScalarIndexExpr.__and__
+    __bool__ = gb.scalar.ScalarIndexExpr.__bool__
+    __eq__ = gb.scalar.ScalarIndexExpr.__eq__
+    __float__ = gb.scalar.ScalarIndexExpr.__float__
+    __index__ = gb.scalar.ScalarIndexExpr.__index__
+    __int__ = gb.scalar.ScalarIndexExpr.__int__
+    __or__ = gb.scalar.ScalarIndexExpr.__or__
+    _get_value = _automethods._get_value
+    isclose = gb.scalar.ScalarIndexExpr.isclose
+    isequal = gb.scalar.ScalarIndexExpr.isequal
+    value = gb.scalar.ScalarIndexExpr.value
+    # End auto-generated code: Scalar
+
+
 # Dask task functions
 def _scalar_dup(x, dtype):
     return InnerScalar(x.value.dup(dtype=dtype))
@@ -250,3 +432,5 @@ def _invert(x):

 gb.utils._output_types[Scalar] = gb.Scalar
 gb.utils._output_types[PythonScalar] = gb.Scalar
+gb.utils._output_types[ScalarExpression] = gb.Scalar
+gb.utils._output_types[ScalarIndexExpr] = gb.Scalar
diff --git a/dask_grblas/utils.py b/dask_grblas/utils.py
index b07e64d..1141f07 100644
--- a/dask_grblas/utils.py
+++ b/dask_grblas/utils.py
@@ -2,11 +2,20 @@ import pandas as pd
 import dask.array as da
 import dask.dataframe as dd
+from functools import reduce
 from dask.base import tokenize
 from dask.delayed import delayed
 from .io import MMFile


+def pack_args(*args):
+    return args
+
+
+def pack_kwargs(**kwargs):
+    return kwargs
+
+
 def np_dtype(dtype):
     return np.dtype(dtype.numba_type.name)
@@ -31,6 +40,10 @@ def wrap_inner(val):
     return _inner_types[type(val)](val)


+def flatten(lol):
+    return reduce(lambda x, y: x + y, lol)
+
+
 def build_block_index_dask_array(x, axis, name):
     """
     Calculate block-index for each chunk of x along axis `axis`
diff --git a/dask_grblas/vector.py b/dask_grblas/vector.py
index 282dcea..16caed1 100644
--- a/dask_grblas/vector.py
+++ b/dask_grblas/vector.py
@@ -1,16 +1,26 @@ import dask.array as da
 import numpy as np
 import grblas as gb
+
+from numbers import Integral
+from tlz import compose
+
 from dask.base import tokenize
+from dask.highlevelgraph import HighLevelGraph
 from dask.delayed import Delayed, delayed
 from grblas import binary, monoid, semiring
 from grblas.dtypes import lookup_dtype
+from grblas.exceptions import IndexOutOfBound, DimensionMismatch
-from .base import BaseType, InnerBaseType, _nvals
-from .expr import AmbiguousAssignOrExtract, GbDelayed, Updater, Assigner
+from . 
import _automethods +from .base import BaseType, InnerBaseType, _nvals, DOnion, Box, any_dOnions +from .base import _dup as chunk_dup +from .expr import AmbiguousAssignOrExtract, IndexerResolver, GbDelayed, Updater, Assigner from .mask import StructuralMask, ValueMask from ._ss.vector import ss from .utils import ( + pack_args, + pack_kwargs, np_dtype, get_return_type, wrap_inner, @@ -77,6 +87,43 @@ def __getitem__(self, index): class Vector(BaseType): __slots__ = ("ss",) ndim = 1 + __abs__ = gb.Vector.__abs__ + __add__ = gb.Vector.__add__ + __divmod__ = gb.Vector.__divmod__ + __eq__ = gb.Vector.__eq__ + __floordiv__ = gb.Vector.__floordiv__ + __ge__ = gb.Vector.__ge__ + __gt__ = gb.Vector.__gt__ + __iadd__ = gb.Vector.__iadd__ + __iand__ = gb.Vector.__iand__ + __ifloordiv__ = gb.Vector.__ifloordiv__ + __imod__ = gb.Vector.__imod__ + __imul__ = gb.Vector.__imul__ + __invert__ = gb.Vector.__invert__ + __ior__ = gb.Vector.__ior__ + __ipow__ = gb.Vector.__ipow__ + __isub__ = gb.Vector.__isub__ + __itruediv__ = gb.Vector.__itruediv__ + __ixor__ = gb.Vector.__ixor__ + __le__ = gb.Vector.__le__ + __lt__ = gb.Vector.__lt__ + __mod__ = gb.Vector.__mod__ + __mul__ = gb.Vector.__mul__ + __ne__ = gb.Vector.__ne__ + __neg__ = gb.Vector.__neg__ + __pow__ = gb.Vector.__pow__ + __radd__ = gb.Vector.__radd__ + __rdivmod__ = gb.Vector.__rdivmod__ + __rfloordiv__ = gb.Vector.__rfloordiv__ + __rmod__ = gb.Vector.__rmod__ + __rmul__ = gb.Vector.__rmul__ + __rpow__ = gb.Vector.__rpow__ + __rsub__ = gb.Vector.__rsub__ + __rtruediv__ = gb.Vector.__rtruediv__ + __rxor__ = gb.Vector.__rxor__ + __sub__ = gb.Vector.__sub__ + __truediv__ = gb.Vector.__truediv__ + __xor__ = gb.Vector.__xor__ @classmethod def from_delayed(cls, vector, dtype, size, *, nvals=None, name=None): @@ -105,31 +152,68 @@ def from_values( /, size=None, *, - trust_size=False, dup_op=None, dtype=None, chunks="auto", name=None, ): - # Note: `trust_size` is a bool parameter that, when True, - # can be used to avoid expensive computation of max(indices) - # which is used to verify that `size` is indeed large enough - # to hold all the given tuples. - # TODO: - # dup_op support for dask_array indices/values (use reduce_assign?) 
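+        # Sketch (hypothetical values): with the rewrite below, an unknown
+        # `size` no longer forces an eager `da.max(indices).compute()`; the
+        # lazy maximum is wrapped in a DOnion instead:
+        #
+        #     idx = da.from_array(np.array([0, 3, 5]))
+        #     vals = da.from_array(np.array([1, 2, 3]))
+        #     v = Vector.from_values(idx, vals)  # size unknown -> DOnion
+        #     assert v.size == 6                 # resolved only on demand
+        #
+        # Passing `size=` up front keeps the result an ordinary dask-backed
+        # Vector and skips the DOnion wrapper.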
- if dup_op is None and type(indices) is da.Array and type(values) is da.Array: - if not trust_size or size is None: - # this branch is an expensive operation: - implied_size = 1 + da.max(indices).compute() - if size is not None and implied_size > size: - raise Exception() - size = implied_size if size is None else size - - idtype = gb.Vector.new(indices.dtype).dtype - np_idtype_ = np_dtype(idtype) - vdtype = gb.Vector.new(values.dtype).dtype + if hasattr(values, "dtype"): + dtype = lookup_dtype(values.dtype if dtype is None else dtype) + + meta = gb.Vector.new(dtype, size=size if isinstance(size, Integral) else 0) + + # check for any DOnions: + args = pack_args(indices, values, size) + kwargs = pack_kwargs(dup_op=dup_op, dtype=dtype, chunks=chunks, name=name) + if any_dOnions(*args, **kwargs): + # dive into DOnion(s): + out_donion = DOnion.multi_access(meta, Vector.from_values, *args, **kwargs) + return Vector(out_donion, meta=meta) + + # no DOnions + if type(indices) is da.Array or type(values) is da.Array: + size_ = size + if type(indices) in {tuple, list, np.ndarray}: + size_ = size or (np.max(indices) + 1) + indices = da.asarray(indices) + if type(values) in {tuple, list, np.ndarray}: + values = da.asarray(values) + + np_idtype_ = np_dtype(lookup_dtype(indices.dtype)) + if isinstance(size_, Integral): + size = size_ + chunks = da.core.normalize_chunks(chunks, (size,), dtype=np_idtype_) + else: + if indices.size == 0: + raise ValueError("No indices provided. Unable to infer size.") + + if indices.dtype.kind not in "ui": + raise ValueError(f"indices must be integers, not {indices.dtype}") + + # Note: uint + int = float which numpy cannot cast to uint. So we + # ensure the same dtype for each summand here: + size = size_ + if size is None: + size = da.max(indices) + np.asarray(1, dtype=indices.dtype) + # Here `size` is a dask 0d-array whose computed value is + # used to determine the size of the Vector to be returned. + # But since we do not want to compute anything just now, + # we instead create a "dOnion" (dask Onion) object. 
This + # effectively means that we will use the inner value of + # `size` to create the new Vector: + args = pack_args(indices, values) + kwargs = pack_kwargs(dup_op=dup_op, dtype=dtype, chunks=chunks, name=name) + donion = DOnion.sprout(size, meta, Vector.from_values, *args, **kwargs) + return Vector(donion, meta=meta) + + # output shape `(size,)` is completely determined + if indices.size > 0: + if indices.size != values.size: + raise ValueError("`indices` and `values` lengths must match") + + vdtype = dtype np_vdtype_ = np_dtype(vdtype) - chunks = da.core.normalize_chunks(chunks, (size,), dtype=np_idtype_) + name_ = name name = str(name) if name else "" name = name + "-index-ranges" + tokenize(cls, chunks[0]) @@ -139,15 +223,17 @@ def from_values( *(indices, "j"), *(values, "j"), *(index_ranges, "i"), + size=size, dtype=np_vdtype_, meta=np.array([]), ) - meta = InnerVector(gb.Vector.new(vdtype)) + meta = InnerVector(gb.Vector.new(vdtype, size=size)) delayed = da.core.blockwise( *(_from_values1D, "i"), *(fragments, "ij"), *(index_ranges, "i"), concatenate=False, + dup_op=dup_op, gb_dtype=dtype, dtype=np_vdtype_, meta=meta, @@ -161,6 +247,14 @@ def from_values( @classmethod def new(cls, dtype, size=0, *, chunks="auto", name=None): + if any_dOnions(size): + meta = gb.Vector.new(dtype) + donion = DOnion.multi_access(meta, cls.new, dtype, size=size, chunks=chunks, name=name) + return Vector(donion, meta=meta) + + if type(size) is Box: + size = size.content + if size > 0: chunks = da.core.normalize_chunks(chunks, (size,), dtype=int) meta = gb.Vector.new(dtype) @@ -187,14 +281,18 @@ def __init__(self, delayed, meta=None, nvals=None): # if it is already known at the time of initialization of # this Vector, otherwise its value should be left as None # (the default) - assert type(delayed) is da.Array - assert delayed.ndim == 1 + assert type(delayed) in {da.Array, DOnion} self._delayed = delayed - if meta is None: - meta = gb.Vector.new(delayed.dtype, delayed.shape[0]) + if type(delayed) is da.Array: + assert delayed.ndim == 1 + if meta is None: + meta = gb.Vector.new(delayed.dtype, delayed.shape[0]) + else: + if meta is None: + meta = gb.Vector.new(delayed.dtype) self._meta = meta - self._size = meta.size self.dtype = meta.dtype + self._size = self.size self._nvals = nvals # Add ss extension methods self.ss = ss(self) @@ -227,13 +325,104 @@ def V(self): @property def size(self): + if self.is_dOnion: + return DOnion.multi_access(self._meta.size, getattr, self, "size") return self._meta.size @property def shape(self): + if self.is_dOnion: + return (self.size,) return self._meta.shape + def _head(self, delayed, shape): + """ + Take the leading portion of shape `shape` from `delayed` + """ + def _slice(inner, slc_x): + return InnerVector(inner.value[slc_x].new()) + + x = delayed + + stops_ = np.cumsum(x.chunks[0]) + starts = np.roll(stops_, 1) + starts[0] = 0 + + M = x.numblocks[0] + blockid = np.arange(M) + + # locate chunk containing last element: + i = min(self.shape[0], shape[0]) - 1 + filter = (starts <= i) & (i < stops_) + (last_block,) = blockid[filter] + tail_sz = i - starts[last_block] + 1 + + numblocks = (last_block + 1,) + heads = (tail_sz,) + new_chunks = (x.chunks[0][:last_block] + (tail_sz,),) + + name = "Vector.resize-" + tokenize(x) + dtype = self.dtype + dsk = dict() + for i in range(numblocks[0]): + x_cut = (i == numblocks[0] - 1) + if x_cut: + dsk[(name, i)] = ( + _slice, + (x.name, i), + slice(heads[0]) if x_cut else slice(None), + ) + else: + dsk[(name, i)] = (chunk_dup, (x.name, 
i), None, dtype, None) + + return name, dsk, new_chunks, numblocks + + def _add_tail(self, axis, size, name, dsk, chunks, numblocks): + """ + Append dask graph `dsk` with empty chunks on axis `axis` up to size `size` + """ + rem = size - self.shape[axis] + if rem > 0: + j = numblocks[axis] + new_chunks = chunks[axis] + (rem,) + new_chunks = (new_chunks,) + + dsk[(name, j)] = (compose(InnerVector, gb.Vector.new), self.dtype, rem) + + return name, dsk, new_chunks, (len(new_chunks[0]),) + + else: + return name, dsk, chunks, numblocks + def resize(self, size, inplace=True, chunks="auto"): + if any_dOnions(self, size): + donion = DOnion.multi_access( + self._meta, Vector.resize, self, size, inplace=False, chunks=chunks + ) + if inplace: + self.__init__(donion, meta=self._meta) + return + else: + return Vector(donion, meta=self._meta) + + name, dsk, new_chunks, num_blocks = self._head(self._delayed, (size,)) + name, dsk, new_chunks, num_blocks = self._add_tail(0, size, name, dsk, new_chunks, num_blocks) + + graph = HighLevelGraph.from_collections(name, dsk, dependencies=[self._delayed]) + x = da.core.Array(graph, name, new_chunks, meta=wrap_inner(self._meta)) + x = x.rechunk(chunks=chunks) + + if size >= self.size: + nvals = self.nvals + else: + nvals = None + + if inplace: + self.__init__(x, nvals=nvals) + else: + return Vector(x, nvals=nvals) + + def _resize_old(self, size, inplace=True, chunks="auto"): chunks = da.core.normalize_chunks(chunks, (size,), dtype=np.int64) output_ranges = build_ranges_dask_array_from_chunks(chunks[0], "output_ranges-") @@ -269,6 +458,9 @@ def resize(self, size, inplace=True, chunks="auto"): else: return Vector(x, nvals=nvals) + def diag(self, k=0, dtype=None, chunks="auto"): + return self._diag(k=k, dtype=dtype, chunks=chunks) + def _diag(self, k=0, dtype=None, chunks="auto"): nrows = self.size + abs(k) kdiag_col_start = max(0, k) @@ -316,30 +508,20 @@ def rechunk(self, inplace=False, chunks="auto"): self.resize(*self.shape, chunks=chunks) else: return self.resize(*self.shape, chunks=chunks, inplace=False) - # chunks = da.core.normalize_chunks(chunks, self.shape, dtype=np.int64) - # id = self.to_values() - # new = Vector.from_values(*id, *self.shape, trust_size=True, chunks=chunks) - # if inplace: - # self.__init__(new._delayed) - # else: - # return new - def __getitem__(self, index): - return AmbiguousAssignOrExtract(self, index) + def __getitem__(self, keys): + resolved_indexes = IndexerResolver(self, keys) + shape = resolved_indexes.shape + if not shape: + from .scalar import ScalarIndexExpr + + return ScalarIndexExpr(self, resolved_indexes) + else: + return VectorIndexExpr(self, resolved_indexes, *shape) def __delitem__(self, keys): del Updater(self)[keys] - # del self._meta[index] - # delayed = self._optional_dup() - # TODO: normalize index - # delayed = delayed.map_blocks( - # _delitem, - # index, - # dtype=np_dtype(self.dtype), - # ) - # raise NotImplementedError() - def __setitem__(self, index, delayed): Assigner(Updater(self), index).update(delayed) @@ -358,14 +540,35 @@ def __iter__(self): return indices.flat def ewise_add(self, other, op=monoid.plus, *, require_monoid=True): - assert type(other) is Vector - meta = self._meta.ewise_add(other._meta, op=op, require_monoid=require_monoid) - return GbDelayed(self, "ewise_add", other, op, require_monoid=require_monoid, meta=meta) + gb_types = (gb.Vector,) + other = self._expect_type(other, (Vector,) + gb_types, within="ewise_add", argname="other") + + try: + meta = self._meta.ewise_add(other._meta, op=op, 
require_monoid=require_monoid)
+        except DimensionMismatch:
+            if any_dOnions(self, other):
+                meta = self._meta.ewise_add(self._meta, op=op, require_monoid=require_monoid)
+            else:
+                raise
+
+        return VectorExpression(
+            self, "ewise_add", other, op, require_monoid=require_monoid, meta=meta
+        )

     def ewise_mult(self, other, op=binary.times):
-        assert type(other) is Vector
+        gb_types = (gb.Vector,)
+        other = self._expect_type(other, (Vector,) + gb_types, within="ewise_mult", argname="other")
+
+        try:
+            meta = self._meta.ewise_mult(other._meta, op=op)
+        except DimensionMismatch:
+            if any_dOnions(self, other):
+                meta = self._meta.ewise_mult(self._meta, op=op)
+            else:
+                raise
-        meta = self._meta.ewise_mult(other._meta, op=op)
-        return GbDelayed(self, "ewise_mult", other, op, meta=meta)
+        return VectorExpression(self, "ewise_mult", other, op, meta=meta)

     # Unofficial methods
     def inner(self, other, op=semiring.plus_times):
@@ -423,9 +626,21 @@ def outer(self, other, op=binary.times):
     def vxm(self, other, op=semiring.plus_times):
         from .matrix import Matrix, TransposedMatrix

-        assert type(other) in (Matrix, TransposedMatrix)
-        meta = self._meta.vxm(other._meta, op=op)
-        return GbDelayed(self, "vxm", other, op, meta=meta)
+        gb_types = (gb.Matrix, gb.matrix.TransposedMatrix)
+        other = self._expect_type(
+            other, (Matrix, TransposedMatrix) + gb_types, within="vxm", argname="other"
+        )
+        try:
+            meta = self._meta.vxm(other._meta, op=op)
+        except DimensionMismatch:
+            if any_dOnions(self, other):
+                other_meta = gb.Matrix.new(
+                    dtype=other._meta.dtype, nrows=self._meta.size, ncols=other._meta.ncols
+                )
+                meta = self._meta.vxm(other_meta, op=op)
+            else:
+                raise
+        return VectorExpression(self, "vxm", other, op, meta=meta, size=other.ncols)

     def apply(self, op, right=None, *, left=None):
         from .scalar import Scalar
@@ -439,11 +654,13 @@ def apply(self, op, right=None, *, left=None):
             right_meta = right.dtype.np_type(0)
         meta = self._meta.apply(op=op, left=left_meta, right=right_meta)
-        return GbDelayed(self, "apply", op, right, meta=meta, left=left)
+        return VectorExpression(self, "apply", op, right, meta=meta, left=left)
+
+    def reduce(self, op=monoid.plus, *, allow_empty=True):
+        from .scalar import ScalarExpression

-    def reduce(self, op=monoid.plus):
         meta = self._meta.reduce(op)
-        return GbDelayed(self, "reduce", op, meta=meta)
+        return ScalarExpression(self, "reduce", op, meta=meta, allow_empty=allow_empty)

     def build(self, indices, values, *, size=None, chunks=None, dup_op=None, clear=False):
         if clear:
@@ -461,11 +678,11 @@ def build(self, indices, values, *, size=None, chunks=None, dup_op=None, clear=F
         x = self._optional_dup()
         if type(indices) is list:
             if np.max(indices) >= self._size:
-                raise gb.exceptions.IndexOutOfBound
+                raise IndexOutOfBound
             indices = da.core.from_array(np.array(indices), name="indices-" + tokenize(indices))
         else:
             if da.max(indices).compute() >= self._size:
-                raise gb.exceptions.IndexOutOfBound
+                raise IndexOutOfBound

         if type(values) is list:
             values = da.core.from_array(np.array(values), name="values-" + tokenize(values))
@@ -504,67 +721,79 @@ def build(self, indices, values, *, size=None, chunks=None, dup_op=None, clear=F
         #     self.__init__(Vector.from_vector(vector)._delayed)

     def to_values(self, dtype=None, chunks="auto"):
+        dtype = lookup_dtype(self.dtype if dtype is None else dtype)
+        meta_i, meta_v = self._meta.to_values(dtype)
+        x = self._delayed
+        if type(x) is DOnion:
+            meta = np.array([])
+            result = x.getattr(meta, "to_values", dtype=dtype, chunks=chunks)
+            indices = 
result.getattr(meta_i, "__getitem__", 0) + values = result.getattr(meta_v, "__getitem__", 1) + return indices, values + + # get dask array of nvals for each chunk: nvals_array = da.core.blockwise( *(_nvals, "i"), *(x, "i"), adjust_chunks={"i": 1}, dtype=np.int64, meta=np.array([]) - ).compute() + ) - stops = np.cumsum(nvals_array) - starts = np.roll(stops, 1) + # accumulate dask array to get index-ranges of the output (indices, values) + stops_ = da.cumsum(nvals_array) # BEWARE: this function rechunks! + starts = da.roll(stops_, 1) + starts = starts.copy() if starts.size == 1 else starts # bug!! starts[0] = 0 - nnz = stops[-1] - - starts = starts.reshape(nvals_array.shape) - starts = da.from_array(starts, chunks=1, name="starts" + tokenize(starts)) - starts = da.core.Array(starts.dask, starts.name, x.chunks, starts.dtype, meta=x._meta) + nnz = stops_[-1] + starts = starts.rechunk(1) + stops_ = stops_.rechunk(1) + + def _to_values(x, starts, stops_, dtype, chunks, nnz): + # the following changes the `.chunks` attribute of `starts` and `stops_` so that + # `blockwise()` can align them with `x` + starts = da.core.Array(starts.dask, starts.name, x.chunks, starts.dtype, meta=x._meta) + stops_ = da.core.Array(stops_.dask, stops_.name, x.chunks, stops_.dtype, meta=x._meta) + + chunks = da.core.normalize_chunks(chunks, (nnz,), dtype=np.int64) + output_ranges = build_ranges_dask_array_from_chunks(chunks[0], "output_ranges-") + + gb_dtype = lookup_dtype(dtype) + dtype_ = np_dtype(gb_dtype) + index_offsets = build_chunk_offsets_dask_array(x, 0, "index_offset-") + x = da.core.blockwise( + *(VectorTupleExtractor, "ij"), + *(output_ranges, "j"), + *(x, "i"), + *(index_offsets, "i"), + *(starts, "i"), + *(stops_, "i"), + gb_dtype=gb_dtype, + dtype=dtype_, + meta=np.array([[]]), + ) + return da.reduction( + x, _identity, _flatten, axis=0, concatenate=False, dtype=dtype_, meta=np.array([]) + ) - stops = stops.reshape(nvals_array.shape) - stops = da.from_array(stops, chunks=1, name="stops" + tokenize(stops)) - stops = da.core.Array(stops.dask, stops.name, x.chunks, stops.dtype, meta=x._meta) + # since the size of the output (indices, values) depends on nnz, a delayed quantity, + # we need to return (indices, values) as DOnions (twice-delayed dask-arrays) + meta = np.array([]) + iv_donion = DOnion.sprout(nnz, meta, _to_values, x, starts, stops_, dtype, chunks) - chunks = da.core.normalize_chunks(chunks, (nnz,), dtype=np.int64) - output_ranges = build_ranges_dask_array_from_chunks(chunks[0], "output_ranges-") - - dtype_ = np_dtype(self.dtype) - index_offsets = build_chunk_offsets_dask_array(x, 0, "index_offset-") - x = da.core.blockwise( - *(VectorTupleExtractor, "ij"), - *(output_ranges, "j"), - *(x, "i"), - *(index_offsets, "i"), - *(starts, "i"), - *(stops, "i"), - gb_dtype=dtype, - dtype=dtype_, - meta=np.array([[]]), + dtype_i = np_dtype(lookup_dtype(meta_i.dtype)) + indices = iv_donion.deep_extract( + meta_i, da.map_blocks, _get_indices, dtype=dtype_i, meta=meta_i ) - x = da.reduction( - x, _identity, _flatten, axis=0, concatenate=False, dtype=dtype_, meta=np.array([]) + dtype_v = np_dtype(lookup_dtype(meta_v.dtype)) + values = iv_donion.deep_extract( + meta_v, da.map_blocks, _get_values, dtype=dtype_v, meta=meta_v ) - - meta_i, meta_v = self._meta.to_values(dtype) - indices = da.map_blocks(_get_indices, x, dtype=meta_i.dtype, meta=meta_i) - values = da.map_blocks(_get_values, x, dtype=meta_v.dtype, meta=meta_v) return indices, values - # delayed = self._delayed - # dtype_ = np_dtype(self.dtype) - # 
meta_i, meta_v = self._meta.to_values(dtype) - # meta = np.array([]) - # offsets = build_chunk_offsets_dask_array(delayed, 0, "index_offset-") - # x = da.map_blocks( - # TupleExtractor, delayed, offsets, gb_dtype=dtype, dtype=dtype_, meta=meta - # ) - # indices = da.map_blocks(_get_indices, x, dtype=meta_i.dtype, meta=meta) - # values = da.map_blocks(_get_values, x, dtype=meta_v.dtype, meta=meta) - # return indices, values - def isequal(self, other, *, check_dtype=False): - other = self._expect_type(other, Vector, within="isequal", argname="other") + other = self._expect_type(other, (Vector, gb.Vector), within="isequal", argname="other") return super().isequal(other, check_dtype=check_dtype) def isclose(self, other, *, rel_tol=1e-7, abs_tol=0.0, check_dtype=False): - other = self._expect_type(other, Vector, within="isclose", argname="other") + other = self._expect_type(other, (Vector, gb.Vector), within="isclose", argname="other") return super().isclose(other, rel_tol=rel_tol, abs_tol=abs_tol, check_dtype=check_dtype) def _delete_element(self, resolved_indexes): @@ -592,6 +821,180 @@ def _carg(self): Vector.ss = gb.utils.class_property(Vector.ss, ss) +class VectorExpression(GbDelayed): + __slots__ = () + output_type = gb.Vector + ndim = 1 + _is_scalar = False + + # automethods: + __and__ = gb.vector.VectorExpression.__and__ + __bool__ = gb.vector.VectorExpression.__bool__ + __or__ = gb.vector.VectorExpression.__or__ + _get_value = _automethods._get_value + S = gb.vector.VectorExpression.S + V = gb.vector.VectorExpression.V + apply = gb.vector.VectorExpression.apply + ewise_add = gb.vector.VectorExpression.ewise_add + ewise_mult = gb.vector.VectorExpression.ewise_mult + isclose = gb.vector.VectorExpression.isclose + isequal = gb.vector.VectorExpression.isequal + nvals = gb.vector.VectorExpression.nvals + reduce = gb.vector.VectorExpression.reduce + shape = gb.vector.VectorExpression.shape + size = gb.vector.VectorExpression.size + vxm = gb.vector.VectorExpression.vxm + + # infix sugar: + __abs__ = gb.vector.VectorExpression.__abs__ + __add__ = gb.vector.VectorExpression.__add__ + __divmod__ = gb.vector.VectorExpression.__divmod__ + __eq__ = gb.vector.VectorExpression.__eq__ + __floordiv__ = gb.vector.VectorExpression.__floordiv__ + __ge__ = gb.vector.VectorExpression.__ge__ + __gt__ = gb.vector.VectorExpression.__gt__ + __invert__ = gb.vector.VectorExpression.__invert__ + __le__ = gb.vector.VectorExpression.__le__ + __lt__ = gb.vector.VectorExpression.__lt__ + __mod__ = gb.vector.VectorExpression.__mod__ + __mul__ = gb.vector.VectorExpression.__mul__ + __ne__ = gb.vector.VectorExpression.__ne__ + __neg__ = gb.vector.VectorExpression.__neg__ + __pow__ = gb.vector.VectorExpression.__pow__ + __radd__ = gb.vector.VectorExpression.__radd__ + __rdivmod__ = gb.vector.VectorExpression.__rdivmod__ + __rfloordiv__ = gb.vector.VectorExpression.__rfloordiv__ + __rmod__ = gb.vector.VectorExpression.__rmod__ + __rmul__ = gb.vector.VectorExpression.__rmul__ + __rpow__ = gb.vector.VectorExpression.__rpow__ + __rsub__ = gb.vector.VectorExpression.__rsub__ + __rtruediv__ = gb.vector.VectorExpression.__rtruediv__ + __rxor__ = gb.vector.VectorExpression.__rxor__ + __sub__ = gb.vector.VectorExpression.__sub__ + __truediv__ = gb.vector.VectorExpression.__truediv__ + __xor__ = gb.vector.VectorExpression.__xor__ + + # bad sugar: + __itruediv__ = gb.vector.VectorExpression.__itruediv__ + __imul__ = gb.vector.VectorExpression.__imul__ + __imatmul__ = gb.vector.VectorExpression.__imatmul__ + __iadd__ = 
gb.vector.VectorExpression.__iadd__ + __iand__ = gb.vector.VectorExpression.__iand__ + __ipow__ = gb.vector.VectorExpression.__ipow__ + __imod__ = gb.vector.VectorExpression.__imod__ + __isub__ = gb.vector.VectorExpression.__isub__ + __ixor__ = gb.vector.VectorExpression.__ixor__ + __ifloordiv__ = gb.vector.VectorExpression.__ifloordiv__ + __ior__ = gb.vector.VectorExpression.__ior__ + + def __init__( + self, + parent, + method_name, + *args, + meta=None, + size=None, + **kwargs, + ): + super().__init__( + parent, + method_name, + *args, + meta=meta, + **kwargs, + ) + if size is None: + size = self.parent._size + self._size = size + + # def __getattr__(self, item): + # return getattr(gb.vector.VectorExpression, item) + + # def construct_output(self, dtype=None, *, name=None): + # if dtype is None: + # dtype = self.dtype + # size = 0 if self._size.is_dOnion else self._size + # return Vector.new(dtype, size, name=name) + + +class VectorIndexExpr(AmbiguousAssignOrExtract): + __slots__ = "_size" + ndim = 1 + output_type = gb.Vector + + def __init__(self, parent, resolved_indexes, size): + super().__init__(parent, resolved_indexes) + self._size = size + + @property + def size(self): + return self._size + + @property + def shape(self): + return (self._size,) + + # Begin auto-generated code: Vector + _get_value = _automethods._get_value + S = gb.vector.VectorIndexExpr.S + V = gb.vector.VectorIndexExpr.V + __and__ = gb.vector.VectorIndexExpr.__and__ + __contains__ = gb.vector.VectorIndexExpr.__contains__ + __or__ = gb.vector.VectorIndexExpr.__or__ + apply = gb.vector.VectorIndexExpr.apply + ewise_add = gb.vector.VectorIndexExpr.ewise_add + ewise_mult = gb.vector.VectorIndexExpr.ewise_mult + isclose = gb.vector.VectorIndexExpr.isclose + isequal = gb.vector.VectorIndexExpr.isequal + nvals = gb.vector.VectorIndexExpr.nvals + reduce = gb.vector.VectorIndexExpr.reduce + vxm = gb.vector.VectorIndexExpr.vxm + + # infix sugar: + __abs__ = gb.vector.VectorIndexExpr.__abs__ + __add__ = gb.vector.VectorIndexExpr.__add__ + __divmod__ = gb.vector.VectorIndexExpr.__divmod__ + __eq__ = gb.vector.VectorIndexExpr.__eq__ + __floordiv__ = gb.vector.VectorIndexExpr.__floordiv__ + __ge__ = gb.vector.VectorIndexExpr.__ge__ + __gt__ = gb.vector.VectorIndexExpr.__gt__ + __invert__ = gb.vector.VectorIndexExpr.__invert__ + __le__ = gb.vector.VectorIndexExpr.__le__ + __lt__ = gb.vector.VectorIndexExpr.__lt__ + __mod__ = gb.vector.VectorIndexExpr.__mod__ + __mul__ = gb.vector.VectorIndexExpr.__mul__ + __ne__ = gb.vector.VectorIndexExpr.__ne__ + __neg__ = gb.vector.VectorIndexExpr.__neg__ + __pow__ = gb.vector.VectorIndexExpr.__pow__ + __radd__ = gb.vector.VectorIndexExpr.__radd__ + __rdivmod__ = gb.vector.VectorIndexExpr.__rdivmod__ + __rfloordiv__ = gb.vector.VectorIndexExpr.__rfloordiv__ + __rmod__ = gb.vector.VectorIndexExpr.__rmod__ + __rmul__ = gb.vector.VectorIndexExpr.__rmul__ + __rpow__ = gb.vector.VectorIndexExpr.__rpow__ + __rsub__ = gb.vector.VectorIndexExpr.__rsub__ + __rtruediv__ = gb.vector.VectorIndexExpr.__rtruediv__ + __rxor__ = gb.vector.VectorIndexExpr.__rxor__ + __sub__ = gb.vector.VectorIndexExpr.__sub__ + __truediv__ = gb.vector.VectorIndexExpr.__truediv__ + __xor__ = gb.vector.VectorIndexExpr.__xor__ + + # bad sugar: + __array__ = gb.vector.VectorIndexExpr.__array__ + __bool__ = gb.vector.VectorIndexExpr.__bool__ + __iadd__ = gb.vector.VectorIndexExpr.__iadd__ + __iand__ = gb.vector.VectorIndexExpr.__iand__ + __ifloordiv__ = gb.vector.VectorIndexExpr.__ifloordiv__ + __imatmul__ = 
gb.vector.VectorIndexExpr.__imatmul__ + __imod__ = gb.vector.VectorIndexExpr.__imod__ + __imul__ = gb.vector.VectorIndexExpr.__imul__ + __ior__ = gb.vector.VectorIndexExpr.__ior__ + __ipow__ = gb.vector.VectorIndexExpr.__ipow__ + __isub__ = gb.vector.VectorIndexExpr.__isub__ + __itruediv__ = gb.vector.VectorIndexExpr.__itruediv__ + __ixor__ = gb.vector.VectorIndexExpr.__ixor__ + + def _chunk_diag( inner_vector, input_range, @@ -611,9 +1014,6 @@ def _chunk_diag( The returned matrix is either empty or contains a piece of the k-diagonal given by inner_vector """ - # This function creates a new matrix chunk with dimensions determined - # by the input k-diagonal vector chunk. The matrix chunk may or may - # not include the k-diagonal chunk vector = inner_vector.value vec_chunk = input_range[0] rows = row_range[0] @@ -747,14 +1147,23 @@ def _build_1D_chunk(inner_vector, out_index_range, fragments, dup_op=None): return InnerVector(inner_vector.value) -def _from_values1D(fragments, index_range, gb_dtype=None): +def _from_values1D(fragments, index_range, dup_op=None, gb_dtype=None): inds = np.concatenate([inds for (inds, _) in fragments]) vals = np.concatenate([vals for (_, vals) in fragments]) size = index_range[0].stop - index_range[0].start - return InnerVector(gb.Vector.from_values(inds, vals, size=size, dtype=gb_dtype)) + if inds.size == 0: + return InnerVector(gb.Vector.new(gb_dtype, size=size)) + return InnerVector(gb.Vector.from_values(inds, vals, size=size, dup_op=dup_op, dtype=gb_dtype)) + +def _pick1D(indices, values, index_range, size): + # validate indices: + indices = np.where(indices < 0, indices + size, indices) + bad_indices = (indices < 0) | (size <= indices) + if np.any(bad_indices): + raise IndexOutOfBound -def _pick1D(indices, values, index_range): + # filter into chunk: index_range = index_range[0] indices_in = (index_range.start <= indices) & (indices < index_range.stop) indices = indices[indices_in] - index_range.start @@ -823,4 +1232,6 @@ def _concat_vector(seq, axis=0): gb.utils._output_types[Vector] = gb.Vector +gb.utils._output_types[VectorExpression] = gb.Vector +gb.utils._output_types[VectorIndexExpr] = gb.Vector from .matrix import InnerMatrix # noqa isort:skip diff --git a/tests/from_grblas2/conftest.py b/tests/from_grblas2/conftest.py index 5d0e635..a9d4632 100644 --- a/tests/from_grblas2/conftest.py +++ b/tests/from_grblas2/conftest.py @@ -1,5 +1,55 @@ +import atexit +import functools +import itertools + +import numpy as np +import pytest + +import grblas as gb + + +def pytest_configure(config): + backend = config.getoption("--backend", "suitesparse") + blocking = config.getoption("--blocking", True) + record = config.getoption("--record", False) + mapnumpy = config.getoption("--mapnumpy", None) + if mapnumpy is None: # pragma: no branch + mapnumpy = np.random.rand() < 0.5 # heh + + gb.config.set(autocompute=False, mapnumpy=mapnumpy) + + gb.init(backend, blocking=blocking) + print( + f'Running tests with "{backend}" backend, blocking={blocking}, ' + f"record={record}, mapnumpy={mapnumpy}" + ) + if record: + rec = gb.Recorder() + rec.start() + + def save_records(): + with open("record.txt", "w") as f: # pragma: no cover + f.write("\n".join(rec.data)) + + # I'm sure there's a `pytest` way to do this... 
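+        # One "pytest way" (sketch, untested assumption) would be:
+        #
+        #     config.add_cleanup(save_records)
+        #
+        # which runs at the end of the test session instead of at interpreter exit.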
+ atexit.register(save_records) + for mod in [gb.unary, gb.binary, gb.monoid, gb.semiring, gb.op]: + for name in list(mod._delayed): + getattr(mod, name) + + +def pytest_runtest_setup(item): + if "slow" in item.keywords and not item.config.getoption("--runslow", True): # pragma: no cover + pytest.skip("need --runslow option to run") + + def autocompute(func): - return func + @functools.wraps(func) + def inner(*args, **kwargs): + with gb.config.set(autocompute=True): + return func(*args, **kwargs) + + return inner def compute(val): diff --git a/tests/from_grblas2/test_matrix.py b/tests/from_grblas2/test_matrix.py index 9d40744..d37b69a 100644 --- a/tests/from_grblas2/test_matrix.py +++ b/tests/from_grblas2/test_matrix.py @@ -4,6 +4,7 @@ import sys import weakref +import dask.array as da import dask_grblas import grblas import numpy as np @@ -22,6 +23,7 @@ from .conftest import autocompute, compute from dask_grblas import Matrix, Scalar, Vector # isort:skip +from dask_grblas.base import is_DOnion, like_dOnion @pytest.fixture @@ -42,9 +44,33 @@ def A(): return Matrix.from_values(*data) +@pytest.fixture +def A_dask(): + # 0 1 2 3 4 5 6 + # 0 [- 2 - 3 - - -] + # 1 [- - - - 8 - 4] + # 2 [- - - - - 1 -] + # 3 [3 - 3 - - - -] + # 4 [- - - - - 7 -] + # 5 [- - 1 - - - -] + # 6 [- - 5 7 3 - -] + data = [ + [3, 0, 3, 5, 6, 0, 6, 1, 6, 2, 4, 1], + [0, 1, 2, 2, 2, 3, 3, 4, 4, 5, 5, 6], + [3, 2, 3, 1, 5, 3, 7, 8, 3, 1, 7, 4], + ] + data = [da.from_array(np.array(a, dtype=np.int64)) for a in data] + return Matrix.from_values(*data) + + +@pytest.fixture +def As(A, A_dask): + return [A, A_dask] + + @pytest.fixture def A_chunks(): - return [7, 4, 3] + return [7, 3] @pytest.fixture @@ -53,6 +79,18 @@ def v(): return Vector.from_values(*data) +@pytest.fixture +def v_dask(): + data = [[1, 3, 4, 6], [1, 1, 2, 0]] + data = [da.from_array(a) for a in data] + return Vector.from_values(*data) + + +@pytest.fixture +def vs(v, v_dask): + return [v, v_dask] + + def test_new(): C = Matrix.new(dtypes.INT8, 17, 12) assert C.dtype == "INT8" @@ -61,30 +99,45 @@ def test_new(): assert C.ncols == 12 -def test_dup(A, A_chunks): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - C = A.dup() - assert C is not A - assert C.dtype == A.dtype - assert C.nvals == A.nvals - assert C.nrows == A.nrows - assert C.ncols == A.ncols - # Ensure they are not the same backend object - A[0, 0] = 1000 - assert C[0, 0].value != 1000 - # extended functionality - D = Matrix.from_values([0, 1], [0, 1], [0, 2.5], dtype=dtypes.FP64) - E = D.dup(dtype=dtypes.INT64) - assert E.isequal( - Matrix.from_values([0, 1], [0, 1], [0, 2], dtype=dtypes.INT64), check_dtype=True +def test_dup(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + C = A.dup() + assert C is not A + assert C.dtype == A.dtype + assert C.nvals == A.nvals + assert C.nrows == A.nrows + assert C.ncols == A.ncols + # Ensure they are not the same backend object + A[0, 0] = 1000 + assert A[0, 0].new() == 1000 + assert C[0, 0].new() != 1000 + + # extended functionality + Ds = [Matrix.from_values([0, 1], [0, 1], [0, 2.5], dtype=dtypes.FP64)] + Ds.append( + Matrix.from_values( + da.from_array([0, 1]), da.from_array([0, 1]), da.from_array([0, 2.5]), dtype=dtypes.FP64 ) - E = D.dup(mask=D.V) - assert E.isequal(Matrix.from_values([1], [1], [2.5], dtype=dtypes.FP64), check_dtype=True) - E = D.dup(dtype=dtypes.INT64, mask=D.V) - assert E.isequal(Matrix.from_values([1], [1], [2], dtype=dtypes.INT64), 
check_dtype=True) + ) + for D_ in Ds: + for chunks in A_chunks: + D = D_.dup() + D.rechunk(chunks=chunks, inplace=True) + E = D.dup(dtype=dtypes.INT64) + assert E.isequal( + Matrix.from_values([0, 1], [0, 1], [0, 2], dtype=dtypes.INT64), check_dtype=True + ) + E = D.dup(mask=D.V) + assert E.isequal( + Matrix.from_values([1], [1], [2.5], dtype=dtypes.FP64), check_dtype=True + ) + E = D.dup(dtype=dtypes.INT64, mask=D.V) + assert E.isequal( + Matrix.from_values([1], [1], [2], dtype=dtypes.INT64), check_dtype=True + ) def test_from_values(): @@ -103,7 +156,7 @@ def test_from_values(): assert C3.ncols == 3 assert C3.nvals == 2 # duplicates were combined assert C3.dtype == int - assert C3[1, 1].value == 6 # 2*3 + assert C3[1, 1].new() == 6 # 2*3 C3monoid = Matrix.from_values([0, 1, 1], [2, 1, 1], [1, 2, 3], nrows=10, dup_op=monoid.times) assert C3.isequal(C3monoid) @@ -139,139 +192,244 @@ def test_from_values(): Matrix.from_values([0], [1, 2], [0]) -@pytest.mark.xfail("'Needs investigation'", strict=True) -def test_from_values_scalar(): - C = Matrix.from_values([0, 1, 3], [1, 1, 2], 7) +def test_from_values_dask(): + rows = da.from_array(np.array([0, 1, 3])) + cols = da.from_array(np.array([1, 1, 2])) + vals = da.from_array(np.array([True, False, True])) + C = Matrix.from_values(rows, cols, vals) assert C.nrows == 4 assert C.ncols == 3 assert C.nvals == 3 - assert C.dtype == dtypes.INT64 - assert C.ss.is_iso - assert C.reduce_scalar(monoid.any).new() == 7 + assert C.dtype == bool + + vals = da.from_array(np.array([12.3, 12.4, 12.5])) + C2 = Matrix.from_values(rows, cols, vals, nrows=17, ncols=3) + assert C2.nrows == 17 + assert C2.ncols == 3 + assert C2.nvals == 3 + assert C2.dtype == float + + rows = da.from_array(np.array([0, 1, 1])) + cols = da.from_array(np.array([2, 1, 1])) + vals = da.from_array(np.array([1, 2, 3], dtype=np.int64)) + C3 = Matrix.from_values(rows, cols, vals, nrows=10, dup_op=binary.times) + assert C3.nrows == 10 + assert C3.ncols == 3 + assert C3.nvals == 2 # duplicates were combined + assert C3.dtype == int + assert C3[1, 1].new() == 6 # 2*3 + C3monoid = Matrix.from_values(rows, cols, vals, nrows=10, dup_op=monoid.times) + assert C3.isequal(C3monoid) + + vals = da.from_array(np.array([True, True, True])) + with pytest.raises(ValueError, match="Duplicate indices found"): + # Duplicate indices requires a dup_op + Matrix.from_values(rows, cols, vals).compute() + + rows = da.from_array(np.array([0, 1, 3])) + cols = da.from_array(np.array([1, 1, 2])) + vals = da.from_array(np.array([12.3, 12.4, 12.5])) + with pytest.raises(IndexOutOfBound): + # Specified ncols can't hold provided indexes + Matrix.from_values(rows, cols, vals, nrows=17, ncols=2).compute() + + empty_da = da.from_array(np.array([])) + with pytest.raises(ValueError, match="No row indices provided. Unable to infer nrows."): + Matrix.from_values(empty_da, empty_da, empty_da) + + # Changed: Assume empty value is float64 (like numpy) + # with pytest.raises(ValueError, match="No vals provided. 
Unable to determine type"):
+    empty1 = Matrix.from_values(empty_da, empty_da, empty_da, nrows=3, ncols=4)
+    assert empty1.dtype == dtypes.FP64
+    assert empty1.nrows == 3
+    assert empty1.ncols == 4
+    assert empty1.nvals == 0
+
+    with pytest.raises(ValueError, match="Unable to infer"):
+        Matrix.from_values(empty_da, empty_da, empty_da, dtype=dtypes.INT64)
+
+    zero_da = da.from_array(np.array([0]))
+    with pytest.raises(ValueError, match="Unable to infer"):
+        # could also raise b/c rows and columns are different sizes
+        Matrix.from_values(zero_da, empty_da, zero_da, dtype=dtypes.INT64)
+
+    C4 = Matrix.from_values(empty_da, empty_da, empty_da, nrows=3, ncols=4, dtype=dtypes.INT64)
+    C5 = Matrix.new(dtypes.INT64, nrows=3, ncols=4)
+    assert C4.isequal(C5, check_dtype=True)
+
+    cols = da.from_array(np.array([1, 2]))
+    with pytest.raises(
+        ValueError, match="`rows` and `columns` and `values` lengths must match: 1, 2, 1"
+    ):
+        Matrix.from_values(zero_da, cols, zero_da)
+
+
+def test_from_values_scalar():
+    Cs = [Matrix.from_values([0, 1, 3], [1, 1, 2], 7)]
+    Cs.append(
+        Matrix.from_values(
+            da.from_array([0, 1, 3]),
+            da.from_array([1, 1, 2]),
+            7,
+        )
+    )
+    for C in Cs:
+        assert C.nrows == 4
+        assert C.ncols == 3
+        assert C.nvals == 3
+        assert C.dtype == dtypes.INT64
+        # assert C.ss.is_iso
+        assert C.reduce_scalar(monoid.any).new() == 7

     # iso trumps duplicates
-    C = Matrix.from_values([0, 1, 3, 0], [1, 1, 2, 1], 7)
-    assert C.nrows == 4
-    assert C.ncols == 3
-    assert C.nvals == 3
-    assert C.dtype == dtypes.INT64
-    assert C.ss.is_iso
-    assert C.reduce_scalar(monoid.any).new() == 7
-    with pytest.raises(ValueError, match="dup_op must be None"):
-        Matrix.from_values([0, 1, 3, 0], [1, 1, 2, 1], 7, dup_op=binary.plus)
+    Cs = [Matrix.from_values([0, 1, 3, 0], [1, 1, 2, 1], 7)]
+    Cs.append(
+        Matrix.from_values(
+            da.from_array([0, 1, 3, 0]),
+            da.from_array([1, 1, 2, 1]),
+            7,
+        )
+    )
+    for C in Cs:
+        assert C.nrows == 4
+        assert C.ncols == 3
+        assert C.nvals == 3
+        assert C.dtype == dtypes.INT64
+        # assert C.ss.is_iso
+        assert C.reduce_scalar(monoid.any).new() == 7
+        with pytest.raises(ValueError, match="dup_op must be None"):
+            Matrix.from_values([0, 1, 3, 0], [1, 1, 2, 1], 7, dup_op=binary.plus)
+
+
+def test_clear(As, A_chunks):
+    for A_ in As:
+        for chunks in A_chunks:
+            A = A_.dup()
+            A.rechunk(chunks=chunks, inplace=True)
+            A.clear()
+            assert A.nvals == 0
+            assert A.nrows == 7
+            assert A.ncols == 7
-def test_clear(A, A_chunks):
-    A_ = A
-    for chunks in A_chunks:
-        A = A_.dup()
-        A.rechunk(chunks=chunks, inplace=True)
-        A.clear()
-        assert A.nvals == 0
-        assert A.nrows == 7
-        assert A.ncols == 7
+def test_resize(As, A_chunks):
+    for A_ in As:
+        for chunks in A_chunks:
+            A = A_.dup()
+            A.rechunk(chunks=chunks, inplace=True)
+            assert A.nrows == 7
+            assert A.ncols == 7
+            assert A.nvals.compute() == 12
+            A.resize(10, 11)
+            assert A.nrows == 10
+            assert A.ncols == 11
+            assert A.nvals == 12
+            assert compute(A[9, 9].new().value) is None
+            A.resize(4, 1)
+            assert A.nrows == 4
+            assert A.ncols == 1
+            assert A.nvals == 1
+
+        A = A_.dup()
+        assert A.nrows == 7
+        assert A.ncols == 7
+        assert A.nvals == 12
+        A.resize(6, 11, chunks=4)
+        assert A.nrows == 6
+        assert A.ncols == 11
+        assert A.nvals == 9
+        if not A.is_dOnion:
+            assert A._delayed.chunks == ((4, 2), (4, 4, 3))
+        else:
+            assert A._delayed.deep_extract(None, lambda x: x._delayed.chunks) == (
+                (4, 2),
+                (4, 4, 3),
+            )
+        assert compute(A[3, 2].new().value) == 3
+        assert compute(A[5, 7].new().value) is None
-def test_resize(A, A_chunks):
-    A_ = A
-    for chunks in A_chunks:
-        A = A_.dup()
-        A.rechunk(chunks=chunks, 
inplace=True) - assert A.nrows == 7 - assert A.ncols == 7 - assert A.nvals.compute() == 12 - A.resize(10, 11) - assert A.nrows == 10 - assert A.ncols == 11 - assert A.nvals.compute() == 12 - assert compute(A[9, 9].value) is None - A.resize(4, 1) - assert A.nrows == 4 - assert A.ncols == 1 - assert A.nvals.compute() == 1 + A = A_.dup() + A.resize(11, 3, chunks=4) + assert A.nrows == 11 + assert A.ncols == 3 + assert A.nvals == 5 + if type(A._delayed) is da.Array: + assert A._delayed.chunks == ((4, 4, 3), (3,)) + else: + assert A._delayed.deep_extract(None, lambda x: x._delayed.chunks) == ( + (4, 4, 3), + (3,), + ) + assert compute(A[3, 2].new().value) == 3 + assert compute(A[7, 2].new().value) is None - A = A_.dup() - assert A.nrows == 7 - assert A.ncols == 7 - assert A.nvals.compute() == 12 - A.resize(6, 11, chunks=4) - assert A.nrows == 6 - assert A.ncols == 11 - assert A.nvals.compute() == 9 - assert A._delayed.chunks == ((4, 2), (4, 4, 3)) - assert compute(A[3, 2].value) == 3 - assert compute(A[5, 7].value) is None +def test_rechunk(As, A_chunks): + for A_ in As: A = A_.dup() - A.resize(11, 3, chunks=4) - assert A.nrows == 11 - assert A.ncols == 3 - assert A.nvals.compute() == 5 - assert A._delayed.chunks == ((4, 4, 3), (3,)) - assert compute(A[3, 2].value) == 3 - assert compute(A[7, 2].value) is None - + for chunks in A_chunks + A_chunks[::-1]: + A.rechunk(chunks=chunks, inplace=True) + assert A.nrows == 7 + assert A.ncols == 7 + assert A.nvals == 12 -def test_rechunk(A, A_chunks): - A_ = A.dup() - for chunks in A_chunks + A_chunks[::-1]: - A_.rechunk(chunks=chunks, inplace=True) - assert A_.nrows == 7 - assert A_.ncols == 7 - assert A_.nvals.compute() == 12 +def test_nrows(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + assert A.nrows == 7 -def test_nrows(A, A_chunks): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - assert A.nrows == 7 +def test_ncols(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + assert A.ncols == 7 -def test_ncols(A, A_chunks): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - assert A.ncols == 7 +def test_nvals(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + assert A.nvals == 12 -def test_nvals(A, A_chunks): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - assert A.nvals == 12 +def test_build(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + assert A.nvals == 12 + A.clear() + A.build([0, 6], [0, 1], [1, 2]) + assert A.nvals == 2 + with pytest.raises(OutputNotEmpty): + A.build([1, 5], [2, 3], [3, 4]) + assert A.nvals == 2 # nothing should be modified + # We can clear though + A.build([1, 2, 5], [1, 2, 3], [2, 3, 4], clear=True) + assert A.nvals == 3 + A.clear() + if is_DOnion(A._delayed): + A.build([0, 11], [0, 0], [1, 1]) + with pytest.raises(IndexOutOfBound): + A.compute() + else: + with pytest.raises(IndexOutOfBound): + A.build([0, 11], [0, 0], [1, 1]) -def test_build(A, A_chunks): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - assert A.nvals == 12 - A.clear() - A.build([0, 6], [0, 1], [1, 2]) - assert A.nvals == 2 - with pytest.raises(OutputNotEmpty): - A.build([1, 5], [2, 3], [3, 4]) - assert A.nvals == 2 # nothing should be 
modified - # We can clear though - A.build([1, 2, 5], [1, 2, 3], [2, 3, 4], clear=True) - assert A.nvals == 3 - A.clear() - with pytest.raises(IndexOutOfBound): - A.build([0, 11], [0, 0], [1, 1]) - B = Matrix.new(int, nrows=2, ncols=2) - B.build([0, 11], [0, 0], [1, 1], nrows=12) - assert B.isequal(Matrix.from_values([0, 11], [0, 0], [1, 1], ncols=2)) - C = Matrix.new(int, nrows=2, ncols=2) - C.build([0, 0], [0, 11], [1, 1], ncols=12) - assert C.isequal(Matrix.from_values([0, 0], [0, 11], [1, 1], nrows=2)) + B = Matrix.new(int, nrows=2, ncols=2) + B.build([0, 11], [0, 0], [1, 1], nrows=12) + assert B.isequal(Matrix.from_values([0, 11], [0, 0], [1, 1], ncols=2)) + C = Matrix.new(int, nrows=2, ncols=2) + C.build([0, 0], [0, 11], [1, 1], ncols=12) + assert C.isequal(Matrix.from_values([0, 0], [0, 11], [1, 1], nrows=2)) -@pytest.mark.xfail("'Needs investigation'", strict=True) def test_build_scalar(A, A_chunks): A_ = A for chunks in A_chunks: @@ -283,7 +441,7 @@ def test_build_scalar(A, A_chunks): A.clear() A.ss.build_scalar([0, 6], [0, 1], 1) assert A.nvals == 2 - assert A.ss.is_iso + # assert A.ss.is_iso A.clear() with pytest.raises(ValueError, match="lengths must match"): A.ss.build_scalar([0, 6], [0, 1, 2], 1) @@ -291,116 +449,117 @@ def test_build_scalar(A, A_chunks): A.ss.build_scalar([0, 5], [0, 1], None) -def test_extract_values(A, A_chunks): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - rows, cols, vals = A.to_values(dtype=int) - rcv = set( - zip( - rows.compute(), - cols.compute(), - vals.compute(), +def test_extract_values(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + rows, cols, vals = A.to_values(dtype=int) + rcv = set( + zip( + rows.compute(), + cols.compute(), + vals.compute(), + ) ) - ) - expected = set( - zip( - (0, 0, 1, 1, 2, 3, 3, 4, 5, 6, 6, 6), - (1, 3, 4, 6, 5, 0, 2, 5, 2, 2, 3, 4), - (2, 3, 8, 4, 1, 3, 3, 7, 1, 5, 7, 3), + expected = set( + zip( + (0, 0, 1, 1, 2, 3, 3, 4, 5, 6, 6, 6), + (1, 3, 4, 6, 5, 0, 2, 5, 2, 2, 3, 4), + (2, 3, 8, 4, 1, 3, 3, 7, 1, 5, 7, 3), + ) ) - ) - assert rcv == expected - assert rows.dtype == np.uint64 - assert cols.dtype == np.uint64 - assert vals.dtype == np.int64 - Trows, Tcols, Tvals = A.T.to_values(dtype=float) - np.testing.assert_array_equal(rows, Tcols) - np.testing.assert_array_equal(cols, Trows) - np.testing.assert_array_equal(vals, Tvals) - assert Trows.dtype == np.uint64 - assert Tcols.dtype == np.uint64 - assert Tvals.dtype == np.float64 - - -def test_extract_element(A, A_chunks): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - assert A[3, 0].new() == 3 - assert A[1, 6].new() == 4 - assert A[1, 6].value == 4 - assert A.T[6, 1].value == 4 - s = A[0, 0].new() - assert compute(s.value) is None - assert s.dtype == "INT64" - s = A[1, 6].new(dtype=float) - assert s.value == 4.0 - assert s.dtype == "FP64" - - -def test_set_element(A, A_chunks): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - assert compute(A[1, 1].value) is None - assert A[3, 0].value == 3 - A[1, 1].update(21) - A[3, 0] << -5 - assert A[1, 1].value == 21 - assert A[3, 0].new() == -5 - - -def test_remove_element(A, A_chunks): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - assert A[3, 0].value == 3 - del A[3, 0] - assert compute(A[3, 0].value) is None - assert A[6, 3].value == 7 - with pytest.raises(TypeError, match="Remove 
Element only supports"): - del A[3:5, 3] - - -def test_mxm(A, A_chunks): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - C = A.mxm(A, semiring.plus_times).new() - result = Matrix.from_values( - [0, 0, 0, 0, 1, 1, 1, 1, 2, 3, 3, 3, 4, 5, 6, 6, 6], - [0, 2, 4, 6, 2, 3, 4, 5, 2, 1, 3, 5, 2, 5, 0, 2, 5], - [9, 9, 16, 8, 20, 28, 12, 56, 1, 6, 9, 3, 7, 1, 21, 21, 26], - ) - assert C.isequal(result) + assert rcv == expected + assert rows.dtype == np.uint64 + assert cols.dtype == np.uint64 + assert vals.dtype == np.int64 + Trows, Tcols, Tvals = A.T.to_values(dtype=float) + np.testing.assert_array_equal(rows.compute(), Tcols.compute()) + np.testing.assert_array_equal(cols.compute(), Trows.compute()) + np.testing.assert_array_equal(vals.compute(), Tvals.compute()) + assert Trows.dtype == np.uint64 + assert Tcols.dtype == np.uint64 + assert Tvals.dtype == np.float64 + + +def test_extract_element(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + assert A[3, 0].new() == 3 + assert A[1, 6].new() == 4 + with pytest.raises(TypeError, match="enable automatic"): + A[1, 6].value + assert A.T[6, 1].new() == 4 + s = A[0, 0].new() + assert compute(s.value) is None + assert s.dtype == "INT64" + s = A[1, 6].new(dtype=float) + assert s.value == 4.0 + assert s.dtype == "FP64" + + +def test_set_element(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + assert compute(A[1, 1].new().value) is None + assert A[3, 0].new() == 3 + A[1, 1].update(21) + A[3, 0] << -5 + assert A[1, 1].new() == 21 + assert A[3, 0].new() == -5 + + +def test_remove_element(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + assert A[3, 0].new() == 3 + del A[3, 0] + assert compute(A[3, 0].new().value) is None + assert A[6, 3].new() == 7 + with pytest.raises(TypeError, match="Remove Element only supports"): + del A[3:5, 3] + + +def test_mxm(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + C = A.mxm(A, semiring.plus_times).new() + result = Matrix.from_values( + [0, 0, 0, 0, 1, 1, 1, 1, 2, 3, 3, 3, 4, 5, 6, 6, 6], + [0, 2, 4, 6, 2, 3, 4, 5, 2, 1, 3, 5, 2, 5, 0, 2, 5], + [9, 9, 16, 8, 20, 28, 12, 56, 1, 6, 9, 3, 7, 1, 21, 21, 26], + ) + assert C.isequal(result) -def test_mxm_transpose(A, A_chunks): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - C = A.dup() - C << A.mxm(A.T, semiring.plus_times) - result = Matrix.from_values( - [0, 0, 1, 1, 2, 2, 3, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 6, 6], - [0, 6, 1, 6, 2, 4, 3, 5, 6, 2, 4, 3, 5, 6, 0, 1, 3, 5, 6], - [13, 21, 80, 24, 1, 7, 18, 3, 15, 7, 49, 3, 1, 5, 21, 24, 15, 5, 83], - ) - assert C.isequal(result) - C << A.T.mxm(A, semiring.plus_times) - result2 = Matrix.from_values( - [0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 6, 6], - [0, 2, 1, 3, 0, 2, 3, 4, 1, 2, 3, 4, 2, 3, 4, 6, 5, 4, 6], - [9, 9, 4, 6, 9, 35, 35, 15, 6, 35, 58, 21, 15, 21, 73, 32, 50, 32, 16], - ) - assert C.isequal(result2) +def test_mxm_transpose(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + C = A.dup() + C << A.mxm(A.T, semiring.plus_times) + result = Matrix.from_values( + [0, 0, 1, 1, 2, 2, 3, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 6, 6], + [0, 6, 1, 6, 2, 4, 3, 5, 6, 2, 4, 3, 5, 6, 0, 1, 3, 5, 6], + [13, 21, 80, 24, 1, 7, 18, 3, 
15, 7, 49, 3, 1, 5, 21, 24, 15, 5, 83], + ) + assert C.isequal(result) + C << A.T.mxm(A, semiring.plus_times) + result2 = Matrix.from_values( + [0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 6, 6], + [0, 2, 1, 3, 0, 2, 3, 4, 1, 2, 3, 4, 2, 3, 4, 6, 5, 4, 6], + [9, 9, 4, 6, 9, 35, 35, 15, 6, 35, 58, 21, 15, 21, 73, 32, 50, 32, 16], + ) + assert C.isequal(result2) def test_mxm_nonsquare(): @@ -408,431 +567,629 @@ def test_mxm_nonsquare(): B = Matrix.from_values([0, 2, 4], [0, 0, 0], [10, 20, 30], nrows=5, ncols=1) C = Matrix.new(A.dtype, nrows=1, ncols=1) C << A.mxm(B, semiring.max_plus) - assert C[0, 0].value == 33 + assert C[0, 0].new() == 33 C1 = A.mxm(B, semiring.max_plus).new() assert C1.isequal(C) C2 = A.T.mxm(B.T, semiring.max_plus).new() assert C2.nrows == 5 assert C2.ncols == 5 - -def test_mxm_mask(A, A_chunks): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - val_mask = Matrix.from_values([0, 3, 4], [2, 3, 2], [True, True, True], nrows=7, ncols=7) - struct_mask = Matrix.from_values([0, 3, 4], [2, 3, 2], [1, 0, 0], nrows=7, ncols=7) - C = A.dup() - C(val_mask.V) << A.mxm(A, semiring.plus_times) - result = Matrix.from_values( - [0, 0, 0, 1, 1, 2, 3, 3, 3, 4, 4, 5, 6, 6, 6], - [1, 2, 3, 4, 6, 5, 0, 2, 3, 2, 5, 2, 2, 3, 4], - [2, 9, 3, 8, 4, 1, 3, 3, 9, 7, 7, 1, 5, 7, 3], - ) - assert C.isequal(result) - C = A.dup() - C(~val_mask.V) << A.mxm(A, semiring.plus_times) - result2 = Matrix.from_values( - [0, 0, 0, 1, 1, 1, 1, 2, 3, 3, 5, 6, 6, 6], - [0, 4, 6, 2, 3, 4, 5, 2, 1, 5, 5, 0, 2, 5], - [9, 16, 8, 20, 28, 12, 56, 1, 6, 3, 1, 21, 21, 26], - ) - assert C.isequal(result2) - C = A.dup() - C(struct_mask.S, replace=True).update(A.mxm(A, semiring.plus_times)) - result3 = Matrix.from_values([0, 3, 4], [2, 3, 2], [9, 9, 7], nrows=7, ncols=7) - assert C.isequal(result3) - C2 = A.mxm(A, semiring.plus_times).new(mask=struct_mask.S) - assert C2.isequal(result3) - with pytest.raises(TypeError, match="Mask must indicate"): - A.mxm(A).new(mask=struct_mask) - - -def test_mxm_accum(A, A_chunks): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - A(binary.plus) << A.mxm(A, semiring.plus_times) - # fmt: off - result = Matrix.from_values( - [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 6, 6, 6], - [0, 1, 2, 3, 4, 6, 2, 3, 4, 5, 6, 2, 5, 0, 1, 2, 3, 5, 2, 5, 2, 5, 0, 2, 3, 4, 5], - [9, 2, 9, 3, 16, 8, 20, 28, 20, 56, 4, 1, 1, 3, 6, 3, 9, 3, 7, 7, 1, 1, 21, 26, 7, 3, 26], - ) - # fmt: on - assert A.isequal(result) - - -def test_mxv(A, A_chunks, v): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - w = A.mxv(v, semiring.plus_times).new() - result = Vector.from_values([0, 1, 6], [5, 16, 13]) - assert w.isequal(result) + A = Matrix.from_values( + da.from_array([0, 0, 0]), + da.from_array([0, 2, 4]), + da.from_array([1, 2, 3]), + nrows=1, + ncols=5, + ) + B = Matrix.from_values([0, 2, 4], [0, 0, 0], [10, 20, 30], nrows=5, ncols=1) + C = Matrix.new(A.dtype, nrows=1, ncols=1) + C << A.mxm(B, semiring.max_plus) + assert C[0, 0].new() == 33 + C1 = A.mxm(B, semiring.max_plus).new() + assert C1.isequal(C) + C2 = A.T.mxm(B.T, semiring.max_plus).new() + assert C2.nrows == 5 + assert C2.ncols == 5 -def test_ewise_mult(A, A_chunks): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - # Binary, Monoid, and Semiring - B = Matrix.from_values([0, 0, 5], [1, 2, 2], [5, 4, 8], nrows=7, ncols=7) - result = Matrix.from_values([0, 
5], [1, 2], [10, 8], nrows=7, ncols=7) - C = A.ewise_mult(B, binary.times).new() - assert C.isequal(result) - C() << A.ewise_mult(B, monoid.times) - assert C.isequal(result) - with pytest.raises(TypeError, match="Expected type: BinaryOp, Monoid"): - A.ewise_mult(B, semiring.plus_times) - - -def test_ewise_add(A, A_chunks): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - # Binary, Monoid, and Semiring - B = Matrix.from_values([0, 0, 5], [1, 2, 2], [5, 4, 8], nrows=7, ncols=7) - result = Matrix.from_values( - [0, 3, 0, 3, 5, 6, 0, 6, 1, 6, 2, 4, 1], - [2, 0, 1, 2, 2, 2, 3, 3, 4, 4, 5, 5, 6], - [4, 3, 5, 3, 8, 5, 3, 7, 8, 3, 1, 7, 4], - ) - with pytest.raises(TypeError, match="require_monoid"): - A.ewise_add(B, binary.second) - # surprising that SECOND(x, empty) == x, which is why user - # must opt-in to using binary ops in ewise_add - C = A.ewise_add(B, binary.second, require_monoid=False).new() - assert C.isequal(result) - C << A.ewise_add(B, monoid.max) - assert C.isequal(result) - C << A.ewise_add(B, binary.max) - assert C.isequal(result) - with pytest.raises(TypeError, match="Expected type: Monoid"): - A.ewise_add(B, semiring.max_minus) - - -def test_extract(A, A_chunks): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - C = Matrix.new(A.dtype, 3, 4) - result = Matrix.from_values( - [0, 0, 1, 2, 2, 2], [0, 2, 1, 1, 2, 3], [2, 3, 3, 5, 7, 3], nrows=3, ncols=4 - ) - C << A[[0, 3, 6], [1, 2, 3, 4]] - assert C.isequal(result) - C << A[0::3, 1:5] - assert C.isequal(result) - C << A[[0, 3, 6], 1:5:1] - assert C.isequal(result) - C2 = A[[0, 3, 6], [1, 2, 3, 4]].new() - assert C2.isequal(result) +def test_mxm_mask(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + val_mask = Matrix.from_values( + [0, 3, 4], [2, 3, 2], [True, True, True], nrows=7, ncols=7 + ) + struct_mask = Matrix.from_values([0, 3, 4], [2, 3, 2], [1, 0, 0], nrows=7, ncols=7) + C = A.dup() + C(val_mask.V) << A.mxm(A, semiring.plus_times) + result = Matrix.from_values( + [0, 0, 0, 1, 1, 2, 3, 3, 3, 4, 4, 5, 6, 6, 6], + [1, 2, 3, 4, 6, 5, 0, 2, 3, 2, 5, 2, 2, 3, 4], + [2, 9, 3, 8, 4, 1, 3, 3, 9, 7, 7, 1, 5, 7, 3], + ) + assert C.isequal(result) + C = A.dup() + C(~val_mask.V) << A.mxm(A, semiring.plus_times) + result2 = Matrix.from_values( + [0, 0, 0, 1, 1, 1, 1, 2, 3, 3, 5, 6, 6, 6], + [0, 4, 6, 2, 3, 4, 5, 2, 1, 5, 5, 0, 2, 5], + [9, 16, 8, 20, 28, 12, 56, 1, 6, 3, 1, 21, 21, 26], + ) + assert C.isequal(result2) + C = A.dup() + C(struct_mask.S, replace=True).update(A.mxm(A, semiring.plus_times)) + result3 = Matrix.from_values([0, 3, 4], [2, 3, 2], [9, 9, 7], nrows=7, ncols=7) + assert C.isequal(result3) + C2 = A.mxm(A, semiring.plus_times).new(mask=struct_mask.S) + assert C2.isequal(result3) + with pytest.raises(TypeError, match="Mask must indicate"): + A.mxm(A).new(mask=struct_mask) + + +def test_mxm_accum(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + A(binary.plus) << A.mxm(A, semiring.plus_times) + # fmt: off + result = Matrix.from_values( + [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 6, 6, 6], + [0, 1, 2, 3, 4, 6, 2, 3, 4, 5, 6, 2, 5, 0, 1, 2, 3, 5, 2, 5, 2, 5, 0, 2, 3, 4, 5], + [9, 2, 9, 3, 16, 8, 20, 28, 20, 56, 4, 1, 1, 3, 6, 3, 9, 3, 7, 7, 1, 1, 21, 26, 7, 3, 26], + ) + # fmt: on + assert A.isequal(result) -def test_extract_row(A, A_chunks): - A_ = A - for chunks in A_chunks: 
- A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - w = Vector.new(A.dtype, 3) - result = Vector.from_values([1, 2], [5, 3], size=3) - w << A[6, [0, 2, 4]] - assert w.isequal(result) - w << A[6, :5:2] - assert w.isequal(result) - w << A.T[[0, 2, 4], 6] - assert w.isequal(result) - w2 = A[6, [0, 2, 4]].new() - assert w2.isequal(result) - with pytest.raises(TypeError): - # Should be list, not tuple (although tuple isn't so bad) - A[6, (0, 2, 4)] - w3 = A[6, np.array([0, 2, 4])].new() - assert w3.isequal(result) - with pytest.raises(TypeError, match="Invalid dtype"): - A[6, np.array([0, 2, 4], dtype=float)] - with pytest.raises(TypeError, match="Invalid number of dimensions"): - A[6, np.array([[0, 2, 4]])] +def test_mxv(As, A_chunks, v): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + w = A.mxv(v, semiring.plus_times).new() + result = Vector.from_values([0, 1, 6], [5, 16, 13]) + assert w.isequal(result) -def test_extract_column(A, A_chunks): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - w = Vector.new(A.dtype, 3) - result = Vector.from_values([1, 2], [3, 1], size=3) - w << A[[1, 3, 5], 2] - assert w.isequal(result) - w << A[1:6:2, 2] - assert w.isequal(result) - w << A.T[2, [1, 3, 5]] - assert w.isequal(result) - w2 = A[1:6:2, 2].new() - assert w2.isequal(result) +def test_ewise_mult(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + # Binary, Monoid, and Semiring + B = Matrix.from_values([0, 0, 5], [1, 2, 2], [5, 4, 8], nrows=7, ncols=7) + result = Matrix.from_values([0, 5], [1, 2], [10, 8], nrows=7, ncols=7) + C = A.ewise_mult(B, binary.times).new() + assert C.isequal(result) + C() << A.ewise_mult(B, monoid.times) + assert C.isequal(result) + with pytest.raises(TypeError, match="Expected type: BinaryOp, Monoid"): + A.ewise_mult(B, semiring.plus_times) + + +def test_ewise_add(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + # Binary, Monoid, and Semiring + B = Matrix.from_values([0, 0, 5], [1, 2, 2], [5, 4, 8], nrows=7, ncols=7) + result = Matrix.from_values( + [0, 3, 0, 3, 5, 6, 0, 6, 1, 6, 2, 4, 1], + [2, 0, 1, 2, 2, 2, 3, 3, 4, 4, 5, 5, 6], + [4, 3, 5, 3, 8, 5, 3, 7, 8, 3, 1, 7, 4], + ) + with pytest.raises(TypeError, match="require_monoid"): + A.ewise_add(B, binary.second) + # surprising that SECOND(x, empty) == x, which is why user + # must opt-in to using binary ops in ewise_add + C = A.ewise_add(B, binary.second, require_monoid=False).new() + assert C.isequal(result) + C << A.ewise_add(B, monoid.max) + assert C.isequal(result) + C << A.ewise_add(B, binary.max) + assert C.isequal(result) + with pytest.raises(TypeError, match="Expected type: Monoid"): + A.ewise_add(B, semiring.max_minus) + + +def test_extract(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + C = Matrix.new(A.dtype, 3, 4) + result = Matrix.from_values( + [0, 0, 1, 2, 2, 2], [0, 2, 1, 1, 2, 3], [2, 3, 3, 5, 7, 3], nrows=3, ncols=4 + ) + C << A[[0, 3, 6], [1, 2, 3, 4]] + assert C.isequal(result) + C << A[0::3, 1:5] + assert C.isequal(result) + C << A[[0, 3, 6], 1:5:1] + assert C.isequal(result) + C2 = A[[0, 3, 6], [1, 2, 3, 4]].new() + assert C2.isequal(result) + + +def test_extract_row(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + w = Vector.new(A.dtype, 3) + 
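# row 6 of the 7x7 fixture holds 5 and 3 at columns 2 and 4, so extracting
+            # columns [0, 2, 4] leaves a size-3 vector with entries at positions 1 and 2
+            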
result = Vector.from_values([1, 2], [5, 3], size=3) + w << A[6, [0, 2, 4]] + assert w.isequal(result) + w << A[6, :5:2] + assert w.isequal(result) + w << A.T[[0, 2, 4], 6] + assert w.isequal(result) + w2 = A[6, [0, 2, 4]].new() + assert w2.isequal(result) + with pytest.raises(TypeError): + # Should be list, not tuple (although tuple isn't so bad) + A[6, (0, 2, 4)] + w3 = A[6, np.array([0, 2, 4])].new() + assert w3.isequal(result) + with pytest.raises(TypeError, match="Invalid dtype"): + A[6, np.array([0, 2, 4], dtype=float)] + with pytest.raises(TypeError, match="Invalid number of dimensions"): + A[6, np.array([[0, 2, 4]])] + + +def test_extract_column(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + w = Vector.new(A.dtype, 3) + result = Vector.from_values([1, 2], [3, 1], size=3) + w << A[[1, 3, 5], 2] + assert w.isequal(result) + w << A[1:6:2, 2] + assert w.isequal(result) + w << A.T[2, [1, 3, 5]] + assert w.isequal(result) + w2 = A[1:6:2, 2].new() + assert w2.isequal(result) def test_extract_input_mask(): # A M # 0 1 2 _ 0 1 # 3 4 5 2 3 _ - A = Matrix.from_values( - [0, 0, 0, 1, 1, 1], - [0, 1, 2, 0, 1, 2], - [0, 1, 2, 3, 4, 5], - ) - M = Matrix.from_values( - [0, 0, 1, 1], - [1, 2, 0, 1], - [0, 1, 2, 3], - ) - m = M[0, :].new() - MT = M.T.new() - # Matrix structure mask - result = A[0, [0, 1]].new(input_mask=M.S) - expected = Vector.from_values([1], [1]) - assert result.isequal(expected) - # again - result.clear() - result(input_mask=M.S) << A[0, [0, 1]] - assert result.isequal(expected) - - # Vector mask - result = A[0, [0, 1]].new(input_mask=m.S) - assert result.isequal(expected) - # again - result.clear() - result(input_mask=m.S) << A[0, [0, 1]] - assert result.isequal(expected) - - # Matrix value mask - result = A[0, [1, 2]].new(input_mask=M.V) - expected = Vector.from_values([1], [2], size=2) - assert result.isequal(expected) - # again - result.clear() - result(input_mask=M.V) << A[0, [1, 2]] - assert result.isequal(expected) - - with pytest.raises(ValueError, match="Shape of `input_mask` does not match shape of input"): - A[0, [0, 1]].new(input_mask=MT.S) - with pytest.raises(ValueError, match="Shape of `input_mask` does not match shape of input"): - m(input_mask=MT.S) << A[0, [0, 1]] - with pytest.raises( - ValueError, match="Size of `input_mask` Vector does not match ncols of Matrix" - ): - A[0, [0]].new(input_mask=expected.S) - with pytest.raises( - ValueError, match="Size of `input_mask` Vector does not match ncols of Matrix" - ): - m(input_mask=expected.S) << A[0, [0]] - with pytest.raises( - ValueError, match="Size of `input_mask` Vector does not match nrows of Matrix" - ): - A[[0], 0].new(input_mask=m.S) - with pytest.raises( - ValueError, match="Size of `input_mask` Vector does not match nrows of Matrix" - ): - m(input_mask=m.S) << A[[0], 0] - with pytest.raises( - TypeError, match="Got Vector `input_mask` when extracting a submatrix from a Matrix" - ): - A[[0], [0]].new(input_mask=expected.S) - with pytest.raises( - TypeError, match="Got Vector `input_mask` when extracting a submatrix from a Matrix" - ): - A(input_mask=expected.S) << A[[0], [0]] - with pytest.raises(TypeError, match="mask is not allowed for single element extraction"): - A[0, 0].new(input_mask=M.S) - with pytest.raises(TypeError, match="mask and input_mask arguments cannot both be given"): - A[0, [0, 1]].new(input_mask=M.S, mask=expected.S) - with pytest.raises(TypeError, match="mask and input_mask arguments cannot both be given"): - 
A(input_mask=M.S, mask=expected.S) - with pytest.raises(TypeError, match=r"Mask must indicate values \(M.V\) or structure \(M.S\)"): - A[0, [0, 1]].new(input_mask=M) - with pytest.raises(TypeError, match=r"Mask must indicate values \(M.V\) or structure \(M.S\)"): - A(input_mask=M) - with pytest.raises(TypeError, match="Mask object must be type Vector"): - expected[[0, 1]].new(input_mask=M.S) - with pytest.raises(TypeError, match="Mask object must be type Vector"): - expected(input_mask=M.S) << expected[[0, 1]] - with pytest.raises(TypeError, match=r"new\(\) got an unexpected keyword argument 'input_mask'"): - A.new(input_mask=M.S) - with pytest.raises(TypeError, match="`input_mask` argument may only be used for extract"): - A(input_mask=M.S) << A.apply(unary.ainv) - with pytest.raises(TypeError, match="`input_mask` argument may only be used for extract"): - A(input_mask=M.S)[[0], [0]] = 1 - with pytest.raises(TypeError, match="`input_mask` argument may only be used for extract"): - A(input_mask=M.S)[[0], [0]] + As = [ + Matrix.from_values( + [0, 0, 0, 1, 1, 1], + [0, 1, 2, 0, 1, 2], + [0, 1, 2, 3, 4, 5], + ) + ] + As += [ + Matrix.from_values( + da.from_array([0, 0, 0, 1, 1, 1]), + da.from_array([0, 1, 2, 0, 1, 2]), + da.from_array([0, 1, 2, 3, 4, 5]), + ) + ] + Ms = [ + Matrix.from_values( + [0, 0, 1, 1], + [1, 2, 0, 1], + [0, 1, 2, 3], + ) + ] + Ms += [ + Matrix.from_values( + da.from_array([0, 0, 1, 1]), + da.from_array([1, 2, 0, 1]), + da.from_array([0, 1, 2, 3]), + ) + ] + for A_ in As: + for M_ in Ms: + A = A_.dup() + M = M_.dup() + m = M[0, :].new() + MT = M.T.new() + # Matrix structure mask + result = A[0, [0, 1]].new(input_mask=M.S) + expected = Vector.from_values([1], [1]) + assert result.isequal(expected) + # again + result.clear() + result(input_mask=M.S) << A[0, [0, 1]] + assert result.isequal(expected) + + # Vector mask + result = A[0, [0, 1]].new(input_mask=m.S) + assert result.isequal(expected) + # again + result.clear() + result(input_mask=m.S) << A[0, [0, 1]] + assert result.isequal(expected) + + # Matrix value mask + result = A[0, [1, 2]].new(input_mask=M.V) + expected = Vector.from_values([1], [2], size=2) + assert result.isequal(expected) + # again + result.clear() + result(input_mask=M.V) << A[0, [1, 2]] + assert result.isequal(expected) + + with pytest.raises( + ValueError, match="Shape of `input_mask` does not match shape of input" + ): + A[0, [0, 1]].new(input_mask=MT.S).compute() - # With transpose input value - # Matrix structure mask - result = A.T[[0, 1], 0].new(input_mask=MT.S) - expected = Vector.from_values([1], [1]) - assert result.isequal(expected) - # again - result.clear() - result(input_mask=MT.S) << A.T[[0, 1], 0] - assert result.isequal(expected) - - # Vector mask - result = A.T[[0, 1], 0].new(input_mask=m.S) - assert result.isequal(expected) - # again - result.clear() - result(input_mask=m.S) << A.T[[0, 1], 0] - assert result.isequal(expected) - - # Matrix value mask - result = A.T[[1, 2], 0].new(input_mask=MT.V) - expected = Vector.from_values([1], [2], size=2) - assert result.isequal(expected) - # again - result.clear() - result(input_mask=MT.V) << A.T[[1, 2], 0] - assert result.isequal(expected) + with pytest.raises( + ValueError, match="Shape of `input_mask` does not match shape of input" + ): + m(input_mask=MT.S) << A[0, [0, 1]] + m.compute() + + with pytest.raises( + ValueError, match="Size of `input_mask` Vector does not match ncols of Matrix" + ): + A[0, [0]].new(input_mask=expected.S).compute() + + m = M[0, :].new() + with pytest.raises( + 
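# `expected` was built with size=2 while A has 3 columns; since dask is
+                # lazy, the size mismatch only surfaces once compute() is called
+                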
ValueError, match="Size of `input_mask` Vector does not match ncols of Matrix" + ): + m(input_mask=expected.S) << A[0, [0]] + m.compute() + + m = M[0, :].new() + with pytest.raises( + ValueError, match="Size of `input_mask` Vector does not match nrows of Matrix" + ): + A[[0], 0].new(input_mask=m.S).compute() + + m = M[0, :].new() + with pytest.raises( + ValueError, match="Size of `input_mask` Vector does not match nrows of Matrix" + ): + m(input_mask=m.S) << A[[0], 0] + m.compute() + + with pytest.raises( + TypeError, match="Got Vector `input_mask` when extracting a submatrix from a Matrix" + ): + A[[0], [0]].new(input_mask=expected.S).compute() + + with pytest.raises( + TypeError, match="Got Vector `input_mask` when extracting a submatrix from a Matrix" + ): + A(input_mask=expected.S) << A[[0], [0]] + A.compute() + + A = A_.dup() + with pytest.raises( + TypeError, match="mask is not allowed for single element extraction" + ): + A[0, 0].new(input_mask=M.S).compute() + + with pytest.raises( + TypeError, match="mask and input_mask arguments cannot both be given" + ): + A[0, [0, 1]].new(input_mask=M.S, mask=expected.S).compute() + + with pytest.raises( + TypeError, match="mask and input_mask arguments cannot both be given" + ): + A(input_mask=M.S, mask=expected.S).compute() + + with pytest.raises( + TypeError, match=r"Mask must indicate values \(M.V\) or structure \(M.S\)" + ): + A[0, [0, 1]].new(input_mask=M).compute() + + with pytest.raises( + TypeError, match=r"Mask must indicate values \(M.V\) or structure \(M.S\)" + ): + A(input_mask=M).compute() + + with pytest.raises(TypeError, match="Mask object must be type Vector"): + expected[[0, 1]].new(input_mask=M.S).compute() + + with pytest.raises(TypeError, match="Mask object must be type Vector"): + expected(input_mask=M.S) << expected[[0, 1]] + expected.compute() + + with pytest.raises( + TypeError, match=r"new\(\) got an unexpected keyword argument 'input_mask'" + ): + A.new(input_mask=M.S).compute() + + with pytest.raises( + TypeError, match="`input_mask` argument may only be used for extract" + ): + A(input_mask=M.S) << A.apply(unary.ainv) + A.compute() + + A = A_.dup() + with pytest.raises( + TypeError, match="`input_mask` argument may only be used for extract" + ): + A(input_mask=M.S)[[0], [0]] = 1 + A.compute() + + A = A_.dup() + with pytest.raises( + TypeError, match="`input_mask` argument may only be used for extract" + ): + A(input_mask=M.S)[[0], [0]] + A.compute() + + A = A_.dup() + m = M[0, :].new() + # With transpose input value + # Matrix structure mask + result = A.T[[0, 1], 0].new(input_mask=MT.S) + expected = Vector.from_values([1], [1]) + assert result.isequal(expected) + # again + result.clear() + result(input_mask=MT.S) << A.T[[0, 1], 0] + assert result.isequal(expected) + + # Vector mask + result = A.T[[0, 1], 0].new(input_mask=m.S) + assert result.isequal(expected) + # again + result.clear() + result(input_mask=m.S) << A.T[[0, 1], 0] + assert result.isequal(expected) + + # Matrix value mask + result = A.T[[1, 2], 0].new(input_mask=MT.V) + expected = Vector.from_values([1], [2], size=2) + assert result.isequal(expected) + # again + result.clear() + result(input_mask=MT.V) << A.T[[1, 2], 0] + assert result.isequal(expected) # With transpose input value # Matrix structure mask - A = Matrix.from_values( - [0, 0, 0, 1, 1, 1], - [0, 1, 2, 0, 1, 2], - [0, 1, 2, 3, 4, 5], + As = [ + Matrix.from_values( + [0, 0, 0, 1, 1, 1], + [0, 1, 2, 0, 1, 2], + [0, 1, 2, 3, 4, 5], + ) + ] + As += [ + Matrix.from_values( + da.from_array([0, 
0, 0, 1, 1, 1]), + da.from_array([0, 1, 2, 0, 1, 2]), + da.from_array([0, 1, 2, 3, 4, 5]), + ) + ] + Ms = [ + Matrix.from_values( + [0, 0, 1, 1], + [1, 2, 0, 1], + [0, 1, 2, 3], + ) + ] + Ms += [ + Matrix.from_values( + da.from_array([0, 0, 1, 1]), + da.from_array([1, 2, 0, 1]), + da.from_array([0, 1, 2, 3]), + ) + ] + for A_ in As: + for M_ in Ms: + A = A_.dup() + M = M_.dup() + A.rechunk(chunks=((1, 1), (2, 1)), inplace=True) + result = A.T[[0, 1], 0].new(input_mask=MT.S) + expected = Vector.from_values([1], [1]) + assert result.isequal(expected) + # again + result.clear() + result(input_mask=MT.S) << A.T[[0, 1], 0] + assert result.isequal(expected) + + # Vector mask + result = A.T[[0, 1], 0].new(input_mask=m.S) + assert result.isequal(expected) + # again + result.clear() + result(input_mask=m.S) << A.T[[0, 1], 0] + assert result.isequal(expected) + + # Matrix value mask + result = A.T[[1, 2], 0].new(input_mask=MT.V) + expected = Vector.from_values([1], [2], size=2) + assert result.isequal(expected) + # again + result.clear() + result(input_mask=MT.V) << A.T[[1, 2], 0] + assert result.isequal(expected) + + +def test_extract_with_matrix(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + with pytest.raises(TypeError, match="Invalid type for index"): + A[A.T, 1].new() + with pytest.raises(TypeError, match="Invalid type for index"): + A[A, [1]].new() + with pytest.raises(TypeError, match="Invalid type for index"): + A[[0], A.V].new() + + +def test_assign(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + B = Matrix.from_values([0, 0, 1], [0, 1, 0], [9, 8, 7]) + result = Matrix.from_values( + [0, 0, 2, 3, 0, 3, 5, 6, 0, 6, 1, 6, 4, 1], + [0, 5, 0, 0, 1, 2, 2, 2, 3, 3, 4, 4, 5, 6], + [9, 8, 7, 3, 2, 3, 1, 5, 3, 7, 8, 3, 7, 4], + ) + C = A.dup() + C()[[0, 2], [0, 5]] = B + assert C.isequal(result) + C = A.dup() + C[:3:2, :6:5]() << B + assert C.isequal(result) + with pytest.raises(TypeError, match="will make the Matrix dense"): + C << 1 + nvals = C.nvals + C(C.S) << 1 + assert C.nvals == nvals + assert C.reduce_scalar().new() == nvals + with pytest.raises(TypeError, match="Invalid type for index"): + C[C, [1]] = C + B = B.T.new() + C = A.dup() + C()[[0, 2], [0, 5]] = B.T + assert C.isequal(result) + C = A.dup() + C[:3:2, :6:5]() << B.T + assert C.isequal(result) + + B.rechunk(chunks=1) + C = A.dup() + C()[[0, 2], [0, 5]] = B.T + assert C.isequal(result) + C = A.dup() + C[:3:2, :6:5]() << B.T + assert C.isequal(result) + + +def test_assign_wrong_dims(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + B = Matrix.from_values([0, 0, 1], [0, 1, 0], [9, 8, 7]) + with pytest.raises(DimensionMismatch): + A[[0, 2, 4], [0, 5]] = B + A.compute() + + +def test_assign_row(As, A_chunks, v): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + result = Matrix.from_values( + [3, 3, 5, 6, 6, 1, 6, 2, 4, 1, 0, 0, 0, 0], + [0, 2, 2, 2, 3, 4, 4, 5, 5, 6, 1, 3, 4, 6], + [3, 3, 1, 5, 7, 8, 3, 1, 7, 4, 1, 1, 2, 0], + ) + C = A.dup() + C[0, :] = v + assert C.isequal(result) + + +def test_subassign_row_col(A_chunks): + A_0 = Matrix.from_values( + [0, 0, 0, 1, 1, 1, 2, 2, 2], + [0, 1, 2, 0, 1, 2, 0, 1, 2], + [0, 1, 2, 3, 4, 5, 6, 7, 8], ) - M = Matrix.from_values( - [0, 0, 1, 1], - [1, 2, 0, 1], - [0, 1, 2, 3], + A_1 = Matrix.from_values( + da.from_array([0, 0, 0, 1, 1, 1, 2, 2, 2]), + 
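# same COO data as A_0, supplied as dask arrays to exercise the lazy path
+        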
da.from_array([0, 1, 2, 0, 1, 2, 0, 1, 2]), + da.from_array([0, 1, 2, 3, 4, 5, 6, 7, 8]), ) - A.rechunk(chunks=((1, 1), (2, 1)), inplace=True) - result = A.T[[0, 1], 0].new(input_mask=MT.S) - expected = Vector.from_values([1], [1]) - assert result.isequal(expected) - # again - result.clear() - result(input_mask=MT.S) << A.T[[0, 1], 0] - assert result.isequal(expected) - - # Vector mask - result = A.T[[0, 1], 0].new(input_mask=m.S) - assert result.isequal(expected) - # again - result.clear() - result(input_mask=m.S) << A.T[[0, 1], 0] - assert result.isequal(expected) - - # Matrix value mask - result = A.T[[1, 2], 0].new(input_mask=MT.V) - expected = Vector.from_values([1], [2], size=2) - assert result.isequal(expected) - # again - result.clear() - result(input_mask=MT.V) << A.T[[1, 2], 0] - assert result.isequal(expected) - - -def test_extract_with_matrix(A, A_chunks): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - with pytest.raises(TypeError, match="Invalid type for index"): - A[A.T, 1].new() - with pytest.raises(TypeError, match="Invalid type for index"): - A[A, [1]].new() - with pytest.raises(TypeError, match="Invalid type for index"): - A[[0], A.V].new() + As = [A_0, A_1] + for A_ in As: + for chunks in [3, 2, 1]: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + m = Vector.from_values([1], [True]) + v = Vector.from_values([0, 1], [10, 20]) + + A[[0, 1], 0](m.S) << v + result1 = Matrix.from_values( + [0, 0, 0, 1, 1, 1, 2, 2, 2], + [0, 1, 2, 0, 1, 2, 0, 1, 2], + [0, 1, 2, 20, 4, 5, 6, 7, 8], + ) + assert A.isequal(result1) + A[1, [1, 2]](m.V, accum=binary.plus).update(v) + result2 = Matrix.from_values( + [0, 0, 0, 1, 1, 1, 2, 2, 2], + [0, 1, 2, 0, 1, 2, 0, 1, 2], + [0, 1, 2, 20, 4, 25, 6, 7, 8], + ) + assert A.isequal(result2) -def test_assign(A, A_chunks): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - B = Matrix.from_values([0, 0, 1], [0, 1, 0], [9, 8, 7]) - result = Matrix.from_values( - [0, 0, 2, 3, 0, 3, 5, 6, 0, 6, 1, 6, 4, 1], - [0, 5, 0, 0, 1, 2, 2, 2, 3, 3, 4, 4, 5, 6], - [9, 8, 7, 3, 2, 3, 1, 5, 3, 7, 8, 3, 7, 4], - ) - C = A.dup() - C()[[0, 2], [0, 5]] = B - assert C.isequal(result) - C = A.dup() - C[:3:2, :6:5]() << B - assert C.isequal(result) - with pytest.raises(TypeError, match="will make the Matrix dense"): - C << 1 - nvals = C.nvals - C(C.S) << 1 - assert C.nvals == nvals - assert C.reduce_scalar().new() == nvals - with pytest.raises(TypeError, match="Invalid type for index"): - C[C, [1]] = C - - B = B.T.new() - C = A.dup() - C()[[0, 2], [0, 5]] = B.T - assert C.isequal(result) - C = A.dup() - C[:3:2, :6:5]() << B.T - assert C.isequal(result) - - B.rechunk(chunks=1) - C = A.dup() - C()[[0, 2], [0, 5]] = B.T - assert C.isequal(result) - C = A.dup() - C[:3:2, :6:5]() << B.T - assert C.isequal(result) - - -def test_assign_wrong_dims(A, A_chunks): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - B = Matrix.from_values([0, 0, 1], [0, 1, 0], [9, 8, 7]) - with pytest.raises(DimensionMismatch): - A[[0, 2, 4], [0, 5]] = B + A[[0, 1], 0](m.S, binary.plus, replace=True) << v + result3 = Matrix.from_values( + [0, 0, 1, 1, 1, 2, 2, 2], + [1, 2, 0, 1, 2, 0, 1, 2], + [1, 2, 40, 4, 25, 6, 7, 8], + ) + assert A.isequal(result3) + + _A = A.dup() + with pytest.raises(DimensionMismatch): + A(m.S)[[0, 1], 0] << v + A.compute() + + A = _A + A[[0, 1], 0](m.S) << 99 + result4 = Matrix.from_values( + [0, 0, 1, 1, 1, 2, 2, 2], + [1, 2, 0, 1, 2, 0, 1, 
2], + [1, 2, 99, 4, 25, 6, 7, 8], + ) + assert A.isequal(result4) + A[[1, 2], 0](m.S, binary.plus, replace=True) << 100 + result5 = Matrix.from_values( + [0, 0, 1, 1, 2, 2, 2], + [1, 2, 1, 2, 0, 1, 2], + [1, 2, 4, 25, 106, 7, 8], + ) + assert A.isequal(result5) -def test_assign_row(A, A_chunks, v): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - result = Matrix.from_values( - [3, 3, 5, 6, 6, 1, 6, 2, 4, 1, 0, 0, 0, 0], - [0, 2, 2, 2, 3, 4, 4, 5, 5, 6, 1, 3, 4, 6], - [3, 3, 1, 5, 7, 8, 3, 1, 7, 4, 1, 1, 2, 0], - ) - C = A.dup() - C[0, :] = v - assert C.isequal(result) + A[2, [0, 1]](m.S) << -1 + result6 = Matrix.from_values( + [0, 0, 1, 1, 2, 2, 2], + [1, 2, 1, 2, 0, 1, 2], + [1, 2, 4, 25, 106, -1, 8], + ) + assert A.isequal(result6) -def test_subassign_row_col(A_chunks): - A = Matrix.from_values( +def test_subassign_matrix(): + A_0 = Matrix.from_values( [0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2], [0, 1, 2, 3, 4, 5, 6, 7, 8], ) - A_ = A - for chunks in [3, 2, 1]: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - m = Vector.from_values([1], [True]) - v = Vector.from_values([0, 1], [10, 20]) - - A[[0, 1], 0](m.S) << v + A_1 = Matrix.from_values( + da.from_array([0, 0, 0, 1, 1, 1, 2, 2, 2]), + da.from_array([0, 1, 2, 0, 1, 2, 0, 1, 2]), + da.from_array([0, 1, 2, 3, 4, 5, 6, 7, 8]), + ) + As = [A_0, A_1] + for A_i in As: + A = A_i.dup() + m = Matrix.from_values([1], [0], [True]) + v = Matrix.from_values([0, 1], [0, 0], [10, 20]) + mT = m.T.new() + + A[[0, 1], [0]](m.S) << v result1 = Matrix.from_values( [0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2], @@ -840,7 +1197,11 @@ def test_subassign_row_col(A_chunks): ) assert A.isequal(result1) - A[1, [1, 2]](m.V, accum=binary.plus).update(v) + A_ = A.dup() + _A = A.dup() + _A_ = A.dup() + + A[[1], [1, 2]](mT.V, accum=binary.plus) << v.T result2 = Matrix.from_values( [0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2], @@ -848,7 +1209,16 @@ def test_subassign_row_col(A_chunks): ) assert A.isequal(result2) - A[[0, 1], 0](m.S, binary.plus, replace=True) << v + A_[[1], 1:3](mT.V, accum=binary.plus) << v.T + assert A_.isequal(result2) + + _A[1:2, [1, 2]](mT.V, accum=binary.plus) << v.T + assert _A.isequal(result2) + + _A_[1:2, 1:3](mT.V, accum=binary.plus) << v.T + assert _A_.isequal(result2) + + A[[0, 1], [0]](m.S, binary.plus, replace=True) << v result3 = Matrix.from_values( [0, 0, 1, 1, 1, 2, 2, 2], [1, 2, 0, 1, 2, 0, 1, 2], @@ -856,10 +1226,13 @@ def test_subassign_row_col(A_chunks): ) assert A.isequal(result3) + A__ = A.dup() with pytest.raises(DimensionMismatch): - A(m.S)[[0, 1], 0] << v + A(m.S)[[0, 1], [0]] << v + A.compute() - A[[0, 1], 0](m.S) << 99 + A = A__ + A[[0, 1], [0]](m.S) << 99 result4 = Matrix.from_values( [0, 0, 1, 1, 1, 2, 2, 2], [1, 2, 0, 1, 2, 0, 1, 2], @@ -867,7 +1240,7 @@ def test_subassign_row_col(A_chunks): ) assert A.isequal(result4) - A[[1, 2], 0](m.S, binary.plus, replace=True) << 100 + A[[1, 2], [0]](m.S, binary.plus, replace=True) << 100 result5 = Matrix.from_values( [0, 0, 1, 1, 2, 2, 2], [1, 2, 1, 2, 0, 1, 2], @@ -875,7 +1248,7 @@ def test_subassign_row_col(A_chunks): ) assert A.isequal(result5) - A[2, [0, 1]](m.S) << -1 + A[[2], [0, 1]](mT.S) << -1 result6 = Matrix.from_values( [0, 0, 1, 1, 2, 2, 2], [1, 2, 1, 2, 0, 1, 2], @@ -884,715 +1257,705 @@ def test_subassign_row_col(A_chunks): assert A.isequal(result6) -def test_subassign_matrix(): - A = Matrix.from_values( - [0, 0, 0, 1, 1, 1, 2, 2, 2], - [0, 1, 2, 0, 1, 2, 0, 1, 2], - [0, 1, 2, 3, 
4, 5, 6, 7, 8], - ) - m = Matrix.from_values([1], [0], [True]) - v = Matrix.from_values([0, 1], [0, 0], [10, 20]) - mT = m.T.new() - - A[[0, 1], [0]](m.S) << v - result1 = Matrix.from_values( - [0, 0, 0, 1, 1, 1, 2, 2, 2], - [0, 1, 2, 0, 1, 2, 0, 1, 2], - [0, 1, 2, 20, 4, 5, 6, 7, 8], - ) - assert A.isequal(result1) - - A_ = A.dup() - _A = A.dup() - _A_ = A.dup() - - A[[1], [1, 2]](mT.V, accum=binary.plus) << v.T - result2 = Matrix.from_values( - [0, 0, 0, 1, 1, 1, 2, 2, 2], - [0, 1, 2, 0, 1, 2, 0, 1, 2], - [0, 1, 2, 20, 4, 25, 6, 7, 8], - ) - assert A.isequal(result2) - - A_[[1], 1:3](mT.V, accum=binary.plus) << v.T - assert A_.isequal(result2) - - _A[1:2, [1, 2]](mT.V, accum=binary.plus) << v.T - assert _A.isequal(result2) - - _A_[1:2, 1:3](mT.V, accum=binary.plus) << v.T - assert _A_.isequal(result2) - - A[[0, 1], [0]](m.S, binary.plus, replace=True) << v - result3 = Matrix.from_values( - [0, 0, 1, 1, 1, 2, 2, 2], - [1, 2, 0, 1, 2, 0, 1, 2], - [1, 2, 40, 4, 25, 6, 7, 8], - ) - assert A.isequal(result3) - - with pytest.raises(DimensionMismatch): - A(m.S)[[0, 1], [0]] << v - - A[[0, 1], [0]](m.S) << 99 - result4 = Matrix.from_values( - [0, 0, 1, 1, 1, 2, 2, 2], - [1, 2, 0, 1, 2, 0, 1, 2], - [1, 2, 99, 4, 25, 6, 7, 8], - ) - assert A.isequal(result4) - - A[[1, 2], [0]](m.S, binary.plus, replace=True) << 100 - result5 = Matrix.from_values( - [0, 0, 1, 1, 2, 2, 2], - [1, 2, 1, 2, 0, 1, 2], - [1, 2, 4, 25, 106, 7, 8], - ) - assert A.isequal(result5) +def test_assign_column(As, A_chunks, v): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + result = Matrix.from_values( + [3, 3, 5, 6, 0, 6, 1, 6, 2, 4, 1, 1, 3, 4, 6], + [0, 2, 2, 2, 3, 3, 4, 4, 5, 5, 6, 1, 1, 1, 1], + [3, 3, 1, 5, 3, 7, 8, 3, 1, 7, 4, 1, 1, 2, 0], + ) + C = A.dup() + C[:, 1] = v + assert C.isequal(result) - A[[2], [0, 1]](mT.S) << -1 - result6 = Matrix.from_values( - [0, 0, 1, 1, 2, 2, 2], - [1, 2, 1, 2, 0, 1, 2], - [1, 2, 4, 25, 106, -1, 8], - ) - assert A.isequal(result6) +def test_assign_row_scalar(As, A_chunks, v): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + C = A.dup() + C[0, :](v.S) << v + D = A.dup() + D(v.S)[0, :] << v + assert C.isequal(D) + + C[:, :](C.S) << 1 + + C_ = C.dup() + with pytest.raises( + TypeError, match="Unable to use Vector mask on Matrix assignment to a Matrix" + ): + C[:, :](v.S) << 1 + C.compute() -def test_assign_column(A, A_chunks, v): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - result = Matrix.from_values( - [3, 3, 5, 6, 0, 6, 1, 6, 2, 4, 1, 1, 3, 4, 6], - [0, 2, 2, 2, 3, 3, 4, 4, 5, 5, 6, 1, 1, 1, 1], - [3, 3, 1, 5, 3, 7, 8, 3, 1, 7, 4, 1, 1, 2, 0], - ) - C = A.dup() - C[:, 1] = v - assert C.isequal(result) + C = C_.dup() + with pytest.raises( + TypeError, + match="Unable to use Vector mask on single element assignment to a Matrix", + ): + C[0, 0](v.S) << 1 + C.compute() + C = C_.dup() + with pytest.raises(TypeError): + C[0, 0](v.S) << v + C.compute() -def test_assign_row_scalar(A, A_chunks, v): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - C = A.dup() - C[0, :](v.S) << v - D = A.dup() - D(v.S)[0, :] << v - assert C.isequal(D) + C = C_.dup() + with pytest.raises(TypeError): + C(v.S)[0, 0] << v + C.compute() - C[:, :](C.S) << 1 + C = C_.dup() + with pytest.raises(TypeError): + C[0, 0](C.S) << v + C.compute() - with pytest.raises( - TypeError, match="Unable to use Vector mask on Matrix assignment to 
a Matrix" - ): - C[:, :](v.S) << 1 - with pytest.raises( - TypeError, match="Unable to use Vector mask on single element assignment to a Matrix" - ): - C[0, 0](v.S) << 1 + C = C_.dup() + with pytest.raises(TypeError): + C(C.S)[0, 0] << v + C.compute() - with pytest.raises(TypeError): - C[0, 0](v.S) << v - with pytest.raises(TypeError): - C(v.S)[0, 0] << v - with pytest.raises(TypeError): - C[0, 0](C.S) << v - with pytest.raises(TypeError): - C(C.S)[0, 0] << v + C = C_.dup() + with pytest.raises(TypeError): + C[0, 0](v.S) << C + C.compute() - with pytest.raises(TypeError): - C[0, 0](v.S) << C - with pytest.raises(TypeError): - C[0, 0](C.S) << C - - C = A.dup() - C(v.S)[0, :] = 10 - result = Matrix.from_values( - [3, 0, 3, 5, 6, 0, 6, 1, 6, 2, 4, 1, 0, 0], - [0, 1, 2, 2, 2, 3, 3, 4, 4, 5, 5, 6, 4, 6], - [3, 10, 3, 1, 5, 10, 7, 8, 3, 1, 7, 4, 10, 10], - ) - assert C.isequal(result) + C = C_.dup() + with pytest.raises(TypeError): + C[0, 0](C.S) << C + C.compute() + + C = A.dup() + C(v.S)[0, :] = 10 + result = Matrix.from_values( + [3, 0, 3, 5, 6, 0, 6, 1, 6, 2, 4, 1, 0, 0], + [0, 1, 2, 2, 2, 3, 3, 4, 4, 5, 5, 6, 4, 6], + [3, 10, 3, 1, 5, 10, 7, 8, 3, 1, 7, 4, 10, 10], + ) + assert C.isequal(result) def test_assign_row_col_matrix_mask(): # A B v1 v2 # 0 1 4 _ 100 10 # 2 _ 0 5 20 - A = Matrix.from_values([0, 0, 1], [0, 1, 0], [0, 1, 2]) - B = Matrix.from_values([0, 1, 1], [0, 0, 1], [4, 0, 5]) - v1 = Vector.from_values([0], [100]) - v2 = Vector.from_values([0, 1], [10, 20]) - - # row assign - C = A.dup() - C(B.S)[0, :] << v2 - result = Matrix.from_values([0, 0, 1], [0, 1, 0], [10, 1, 2]) - assert C.isequal(result) - - C = A.dup() - C(B.S, accum=binary.plus)[1, :] = v2 - result = Matrix.from_values([0, 0, 1, 1], [0, 1, 0, 1], [0, 1, 12, 20]) - assert C.isequal(result) - - C = A.dup() - C(B.S, replace=True)[1, :] << v2 - result = Matrix.from_values([0, 1, 1], [0, 0, 1], [0, 10, 20]) - assert C.isequal(result) - - # col assign - C = A.dup() - C(B.S)[:, 0] = v2 - result = Matrix.from_values([0, 0, 1], [0, 1, 0], [10, 1, 20]) - assert C.isequal(result) - - C = A.dup() - C(B.S, accum=binary.plus)[:, 1] << v2 - result = Matrix.from_values([0, 0, 1, 1], [0, 1, 0, 1], [0, 1, 2, 20]) - assert C.isequal(result) - - C = A.dup() - C(B.S, replace=True)[:, 1] = v2 - result = Matrix.from_values([0, 1, 1], [0, 0, 1], [0, 2, 20]) - assert C.isequal(result) - - # row assign scalar (as a sanity check) - C = A.dup() - C(B.S)[0, :] = 100 - result = Matrix.from_values([0, 0, 1], [0, 1, 0], [100, 1, 2]) - assert C.isequal(result) - - C = A.dup() - C(B.S, accum=binary.plus)[1, :] << 100 - result = Matrix.from_values([0, 0, 1, 1], [0, 1, 0, 1], [0, 1, 102, 100]) - assert C.isequal(result) - - C = A.dup() - C(B.S, replace=True)[1, :] = 100 - result = Matrix.from_values([0, 1, 1], [0, 0, 1], [0, 100, 100]) - assert C.isequal(result) - - # col assign scalar (as a sanity check) - C = A.dup() - C(B.S)[:, 0] << 100 - result = Matrix.from_values([0, 0, 1], [0, 1, 0], [100, 1, 100]) - assert C.isequal(result) - - C = A.dup() - C(B.S, accum=binary.plus)[:, 1] = 100 - result = Matrix.from_values([0, 0, 1, 1], [0, 1, 0, 1], [0, 1, 2, 100]) - assert C.isequal(result) - - C = A.dup() - C(B.S, replace=True)[:, 1] << 100 - result = Matrix.from_values([0, 1, 1], [0, 0, 1], [0, 2, 100]) - assert C.isequal(result) - - # row subassign - C = A.dup() - C[0, :](v2.S) << v2 - result = Matrix.from_values([0, 0, 1], [0, 1, 0], [10, 20, 2]) - assert C.isequal(result) - - C = A.dup() - C[0, [0]](v1.S) << v1 - result = Matrix.from_values([0, 0, 1], 
[0, 1, 0], [100, 1, 2]) - assert C.isequal(result) - - with pytest.raises( - TypeError, match="Indices for subassign imply Vector submask, but got Matrix mask instead" - ): - C[0, :](B.S) << v2 - - # col subassign - C = A.dup() - C[:, 0](v2.S) << v2 - result = Matrix.from_values([0, 0, 1], [0, 1, 0], [10, 1, 20]) - assert C.isequal(result) - - C = A.dup() - C[[0], 0](v1.S) << v1 - result = Matrix.from_values([0, 0, 1], [0, 1, 0], [100, 1, 2]) - assert C.isequal(result) - - with pytest.raises( - TypeError, match="Indices for subassign imply Vector submask, but got Matrix mask instead" - ): - C[:, 0](B.S) << v2 - - # row subassign scalar - C = A.dup() - C[0, :](v2.S) << 100 - result = Matrix.from_values([0, 0, 1], [0, 1, 0], [100, 100, 2]) - assert C.isequal(result) - - C = A.dup() - C[0, [0]](v1.S) << 100 - result = Matrix.from_values([0, 0, 1], [0, 1, 0], [100, 1, 2]) - assert C.isequal(result) - - with pytest.raises( - TypeError, match="Indices for subassign imply Vector submask, but got Matrix mask instead" - ): - C[:, 0](B.S) << 100 - - # col subassign scalar - C = A.dup() - C[:, 0](v2.S) << 100 - result = Matrix.from_values([0, 0, 1], [0, 1, 0], [100, 1, 100]) - assert C.isequal(result) - - C = A.dup() - C[[0], 0](v1.S) << 100 - result = Matrix.from_values([0, 0, 1], [0, 1, 0], [100, 1, 2]) - assert C.isequal(result) - - with pytest.raises( - TypeError, match="Indices for subassign imply Vector submask, but got Matrix mask instead" - ): - C[:, 0](B.S) << 100 - - # Bad subassign - with pytest.raises(TypeError, match="Single element assign does not accept a submask"): - C[0, 0](B.S) << 100 - - -def test_assign_column_scalar(A, A_chunks, v): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - C = A.dup() - C[:, 0](v.S) << v - D = A.dup() - D(v.S)[:, 0] << v - assert C.isequal(D) - - C = A.dup() - C[:, 1] = v - C(v.S)[:, 1] = 10 - result = Matrix.from_values( - [3, 3, 5, 6, 0, 6, 1, 6, 2, 4, 1, 1, 3, 4, 6], - [0, 2, 2, 2, 3, 3, 4, 4, 5, 5, 6, 1, 1, 1, 1], - [3, 3, 1, 5, 3, 7, 8, 3, 1, 7, 4, 10, 10, 10, 10], - ) - assert C.isequal(result) + A_0 = Matrix.from_values([0, 0, 1], [0, 1, 0], [0, 1, 2]) + B_0 = Matrix.from_values([0, 1, 1], [0, 0, 1], [4, 0, 5]) + v1_0 = Vector.from_values([0], [100]) + v2_0 = Vector.from_values([0, 1], [10, 20]) - C(v.V, replace=True, accum=binary.plus)[:, 1] = 20 - result = Matrix.from_values( - [3, 3, 5, 6, 0, 6, 1, 6, 2, 4, 1, 1, 3, 4], - [0, 2, 2, 2, 3, 3, 4, 4, 5, 5, 6, 1, 1, 1], - [3, 3, 1, 5, 3, 7, 8, 3, 1, 7, 4, 30, 30, 30], - ) - assert C.isequal(result) + A_1 = Matrix.from_values( + da.from_array([0, 0, 1]), da.from_array([0, 1, 0]), da.from_array([0, 1, 2]) + ) + B_1 = Matrix.from_values( + da.from_array([0, 1, 1]), da.from_array([0, 0, 1]), da.from_array([4, 0, 5]) + ) + v1_1 = Vector.from_values(da.from_array([0]), da.from_array([100])) + v2_1 = Vector.from_values(da.from_array([0, 1]), da.from_array([10, 20])) + + As = [A_0, A_1] + Bs = [B_0, B_1] + v1s = [v1_0, v1_1] + v2s = [v2_0, v2_1] + + for A in As: + for B in Bs: + for v1 in v1s: + for v2 in v2s: + # row assign + C = A.dup() + C(B.S)[0, :] << v2 + result = Matrix.from_values([0, 0, 1], [0, 1, 0], [10, 1, 2]) + assert C.isequal(result) + + C = A.dup() + C(B.S, accum=binary.plus)[1, :] = v2 + result = Matrix.from_values([0, 0, 1, 1], [0, 1, 0, 1], [0, 1, 12, 20]) + assert C.isequal(result) + + C = A.dup() + C(B.S, replace=True)[1, :] << v2 + result = Matrix.from_values([0, 1, 1], [0, 0, 1], [0, 10, 20]) + assert C.isequal(result) + + # col assign + C = 
A.dup() + C(B.S)[:, 0] = v2 + result = Matrix.from_values([0, 0, 1], [0, 1, 0], [10, 1, 20]) + assert C.isequal(result) + + C = A.dup() + C(B.S, accum=binary.plus)[:, 1] << v2 + result = Matrix.from_values([0, 0, 1, 1], [0, 1, 0, 1], [0, 1, 2, 20]) + assert C.isequal(result) + + C = A.dup() + C(B.S, replace=True)[:, 1] = v2 + result = Matrix.from_values([0, 1, 1], [0, 0, 1], [0, 2, 20]) + assert C.isequal(result) + + # row assign scalar (as a sanity check) + C = A.dup() + C(B.S)[0, :] = 100 + result = Matrix.from_values([0, 0, 1], [0, 1, 0], [100, 1, 2]) + assert C.isequal(result) + + C = A.dup() + C(B.S, accum=binary.plus)[1, :] << 100 + result = Matrix.from_values([0, 0, 1, 1], [0, 1, 0, 1], [0, 1, 102, 100]) + assert C.isequal(result) + + C = A.dup() + C(B.S, replace=True)[1, :] = 100 + result = Matrix.from_values([0, 1, 1], [0, 0, 1], [0, 100, 100]) + assert C.isequal(result) + + # col assign scalar (as a sanity check) + C = A.dup() + C(B.S)[:, 0] << 100 + result = Matrix.from_values([0, 0, 1], [0, 1, 0], [100, 1, 100]) + assert C.isequal(result) + + C = A.dup() + C(B.S, accum=binary.plus)[:, 1] = 100 + result = Matrix.from_values([0, 0, 1, 1], [0, 1, 0, 1], [0, 1, 2, 100]) + assert C.isequal(result) + + C = A.dup() + C(B.S, replace=True)[:, 1] << 100 + result = Matrix.from_values([0, 1, 1], [0, 0, 1], [0, 2, 100]) + assert C.isequal(result) + + # row subassign + C = A.dup() + C[0, :](v2.S) << v2 + result = Matrix.from_values([0, 0, 1], [0, 1, 0], [10, 20, 2]) + assert C.isequal(result) + + C = A.dup() + C[0, [0]](v1.S) << v1 + result = Matrix.from_values([0, 0, 1], [0, 1, 0], [100, 1, 2]) + assert C.isequal(result) + + with pytest.raises( + TypeError, + match="Indices for subassign imply Vector submask, but got Matrix mask instead", + ): + C[0, :](B.S) << v2 + C.compute() + + # col subassign + C = A.dup() + C[:, 0](v2.S) << v2 + result = Matrix.from_values([0, 0, 1], [0, 1, 0], [10, 1, 20]) + assert C.isequal(result) + + C = A.dup() + C[[0], 0](v1.S) << v1 + result = Matrix.from_values([0, 0, 1], [0, 1, 0], [100, 1, 2]) + assert C.isequal(result) + + with pytest.raises( + TypeError, + match="Indices for subassign imply Vector submask, but got Matrix mask instead", + ): + C[:, 0](B.S) << v2 + C.compute() + + # row subassign scalar + C = A.dup() + C[0, :](v2.S) << 100 + result = Matrix.from_values([0, 0, 1], [0, 1, 0], [100, 100, 2]) + assert C.isequal(result) + + C = A.dup() + C[0, [0]](v1.S) << 100 + result = Matrix.from_values([0, 0, 1], [0, 1, 0], [100, 1, 2]) + assert C.isequal(result) + + with pytest.raises( + TypeError, + match="Indices for subassign imply Vector submask, but got Matrix mask instead", + ): + C[:, 0](B.S) << 100 + C.compute() + + # col subassign scalar + C = A.dup() + C[:, 0](v2.S) << 100 + result = Matrix.from_values([0, 0, 1], [0, 1, 0], [100, 1, 100]) + assert C.isequal(result) + + C = A.dup() + C[[0], 0](v1.S) << 100 + result = Matrix.from_values([0, 0, 1], [0, 1, 0], [100, 1, 2]) + assert C.isequal(result) + + with pytest.raises( + TypeError, + match="Indices for subassign imply Vector submask, but got Matrix mask instead", + ): + C[:, 0](B.S) << 100 + C.compute() + + # Bad subassign + C = A.dup() + with pytest.raises( + TypeError, match="Single element assign does not accept a submask" + ): + C[0, 0](B.S) << 100 + C.compute() + + +def test_assign_column_scalar(As, A_chunks, v): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + C = A.dup() + C[:, 0](v.S) << v + D = A.dup() + D(v.S)[:, 0] << v + assert 
C.isequal(D) + + C = A.dup() + C[:, 1] = v + C(v.S)[:, 1] = 10 + result = Matrix.from_values( + [3, 3, 5, 6, 0, 6, 1, 6, 2, 4, 1, 1, 3, 4, 6], + [0, 2, 2, 2, 3, 3, 4, 4, 5, 5, 6, 1, 1, 1, 1], + [3, 3, 1, 5, 3, 7, 8, 3, 1, 7, 4, 10, 10, 10, 10], + ) + assert C.isequal(result) + C(v.V, replace=True, accum=binary.plus)[:, 1] = 20 + result = Matrix.from_values( + [3, 3, 5, 6, 0, 6, 1, 6, 2, 4, 1, 1, 3, 4], + [0, 2, 2, 2, 3, 3, 4, 4, 5, 5, 6, 1, 1, 1], + [3, 3, 1, 5, 3, 7, 8, 3, 1, 7, 4, 30, 30, 30], + ) + assert C.isequal(result) -def test_assign_scalar(A, A_chunks): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - # Test block - result_block = Matrix.from_values( - [3, 0, 6, 0, 6, 6, 2, 4, 1, 1, 3, 5, 1, 3, 5], - [0, 1, 2, 3, 3, 4, 5, 5, 6, 2, 2, 2, 4, 4, 4], - [3, 2, 5, 3, 7, 3, 1, 7, 4, 0, 0, 0, 0, 0, 0], - ) - C = A.dup() - C[[1, 3, 5], [2, 4]] = 0 - assert C.isequal(result_block) - C = A.dup() - C[[1, 3, 5], [2, 4]] = Scalar.from_value(0) - assert C.isequal(result_block) - C = A.dup() - C[1::2, 2:5:2] = 0 - assert C.isequal(result_block) - C = A.dup() - C[1::2, 2:5:2] = Scalar.from_value(0) - assert C.isequal(result_block) - # Test row - result_row = Matrix.from_values( - [3, 0, 6, 0, 6, 6, 2, 4, 1, 3, 5, 1, 1], - [0, 1, 2, 3, 3, 4, 5, 5, 6, 2, 2, 2, 4], - [3, 2, 5, 3, 7, 3, 1, 7, 4, 3, 1, 0, 0], - ) - C = A.dup() - C[1, [2, 4]] = 0 - assert C.isequal(result_row) - C = A.dup() - C[1, 2] = Scalar.from_value(0) - C[1, 4] = Scalar.from_value(0) - assert C.isequal(result_row) - C = A.dup() - C[1, 2:5:2] = 0 - assert C.isequal(result_row) - # Test column - result_column = Matrix.from_values( - [3, 0, 6, 0, 6, 6, 2, 4, 1, 1, 1, 3, 5], - [0, 1, 2, 3, 3, 4, 5, 5, 6, 4, 2, 2, 2], - [3, 2, 5, 3, 7, 3, 1, 7, 4, 8, 0, 0, 0], - ) - C = A.dup() - C[[1, 3, 5], 2] = 0 - assert C.isequal(result_column) - C = A.dup() - C[1::2, 2] = 0 - assert C.isequal(result_column) +def test_assign_scalar(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + # Test block + result_block = Matrix.from_values( + [3, 0, 6, 0, 6, 6, 2, 4, 1, 1, 3, 5, 1, 3, 5], + [0, 1, 2, 3, 3, 4, 5, 5, 6, 2, 2, 2, 4, 4, 4], + [3, 2, 5, 3, 7, 3, 1, 7, 4, 0, 0, 0, 0, 0, 0], + ) + C = A.dup() + C[[1, 3, 5], [2, 4]] = 0 + assert C.isequal(result_block) + C = A.dup() + C[[1, 3, 5], [2, 4]] = Scalar.from_value(0) + assert C.isequal(result_block) + C = A.dup() + C[1::2, 2:5:2] = 0 + assert C.isequal(result_block) + C = A.dup() + C[1::2, 2:5:2] = Scalar.from_value(0) + assert C.isequal(result_block) + # Test row + result_row = Matrix.from_values( + [3, 0, 6, 0, 6, 6, 2, 4, 1, 3, 5, 1, 1], + [0, 1, 2, 3, 3, 4, 5, 5, 6, 2, 2, 2, 4], + [3, 2, 5, 3, 7, 3, 1, 7, 4, 3, 1, 0, 0], + ) + C = A.dup() + C[1, [2, 4]] = 0 + assert C.isequal(result_row) + C = A.dup() + C[1, 2] = Scalar.from_value(0) + C[1, 4] = Scalar.from_value(0) + assert C.isequal(result_row) + C = A.dup() + C[1, 2:5:2] = 0 + assert C.isequal(result_row) + # Test column + result_column = Matrix.from_values( + [3, 0, 6, 0, 6, 6, 2, 4, 1, 1, 1, 3, 5], + [0, 1, 2, 3, 3, 4, 5, 5, 6, 4, 2, 2, 2], + [3, 2, 5, 3, 7, 3, 1, 7, 4, 8, 0, 0, 0], + ) + C = A.dup() + C[[1, 3, 5], 2] = 0 + assert C.isequal(result_column) + C = A.dup() + C[1::2, 2] = 0 + assert C.isequal(result_column) -def test_assign_bad(A, A_chunks): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - with pytest.raises(TypeError, match="Bad type"): - A[0, 0] = object() - with 
pytest.raises(TypeError, match="Bad type"): - A[:, 0] = object() - with pytest.raises(TypeError, match="Bad type"): - A[0, :] = object() - with pytest.raises(TypeError, match="Bad type"): - A[:, :] = object() - with pytest.raises(TypeError, match="Bad type"): - A[0, 0] = A - with pytest.raises(TypeError, match="Bad type"): - A[:, 0] = A - with pytest.raises(TypeError, match="Bad type"): - A[0, :] = A - v = A[0, :].new() - with pytest.raises(TypeError, match="Bad type"): - A[0, 0] = v - with pytest.raises(TypeError, match="Bad type"): - A[:, :] = v - - -def test_apply(A, A_chunks): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - result = Matrix.from_values( - [3, 0, 3, 5, 6, 0, 6, 1, 6, 2, 4, 1], - [0, 1, 2, 2, 2, 3, 3, 4, 4, 5, 5, 6], - [-3, -2, -3, -1, -5, -3, -7, -8, -3, -1, -7, -4], - ) - C = A.apply(unary.ainv).new() - assert C.isequal(result) +def test_assign_bad(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + with pytest.raises(TypeError, match="Bad type"): + A[0, 0] = object() + A.compute() + A = A_.dup() + with pytest.raises(TypeError, match="Bad type"): + A[:, 0] = object() + A.compute() + A = A_.dup() + with pytest.raises(TypeError, match="Bad type"): + A[0, :] = object() + A.compute() + A = A_.dup() + with pytest.raises(TypeError, match="Bad type"): + A[:, :] = object() + A.compute() + A = A_.dup() + with pytest.raises(TypeError, match="Bad type"): + A[0, 0] = A + A.compute() + A = A_.dup() + with pytest.raises(TypeError, match="Bad type"): + A[:, 0] = A + A.compute() + A = A_.dup() + with pytest.raises(TypeError, match="Bad type"): + A[0, :] = A + A.compute() + A = A_.dup() + v = A[0, :].new() + with pytest.raises(TypeError, match="Bad type"): + A[0, 0] = v + A.compute() + A = A_.dup() + with pytest.raises(TypeError, match="Bad type"): + A[:, :] = v + A.compute() -def test_apply_binary(A, A_chunks): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - result_right = Matrix.from_values( - [3, 0, 3, 5, 6, 0, 6, 1, 6, 2, 4, 1], - [0, 1, 2, 2, 2, 3, 3, 4, 4, 5, 5, 6], - [1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1], - dtype=bool, - ) - w_right = A.apply(binary.gt, right=1).new() - w_right2 = A.apply(binary.gt, right=Scalar.from_value(1)).new() - assert w_right.isequal(result_right) - assert w_right2.isequal(result_right) - result_left = Matrix.from_values( - [3, 0, 3, 5, 6, 0, 6, 1, 6, 2, 4, 1], - [0, 1, 2, 2, 2, 3, 3, 4, 4, 5, 5, 6], - [5, 6, 5, 7, 3, 5, 1, 0, 5, 7, 1, 4], - ) - w_left = A.apply(binary.minus, left=8).new() - w_left2 = A.apply(binary.minus, left=Scalar.from_value(8)).new() - assert w_left.isequal(result_left) - assert w_left2.isequal(result_left) - with pytest.raises(TypeError): - A.apply(binary.plus, left=A) - with pytest.raises(TypeError): - A.apply(binary.plus, right=A) - with pytest.raises(TypeError, match="Cannot provide both"): - A.apply(binary.plus, left=1, right=1) - # allow monoids - w1 = A.apply(binary.plus, left=1).new() - w2 = A.apply(monoid.plus, left=1).new() - w3 = A.apply(monoid.plus, right=1).new() - assert w1.isequal(w2) - assert w1.isequal(w3) +def test_apply(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + result = Matrix.from_values( + [3, 0, 3, 5, 6, 0, 6, 1, 6, 2, 4, 1], + [0, 1, 2, 2, 2, 3, 3, 4, 4, 5, 5, 6], + [-3, -2, -3, -1, -5, -3, -7, -8, -3, -1, -7, -4], + ) + C = A.apply(unary.ainv).new() + assert C.isequal(result) -def 
test_reduce_row(A, A_chunks): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - result = Vector.from_values([0, 1, 2, 3, 4, 5, 6], [5, 12, 1, 6, 7, 1, 15]) - w = A.reduce_rowwise(monoid.plus).new() - assert w.isequal(result) - w2 = A.reduce_rowwise(binary.plus).new() - assert w2.isequal(result) +def test_apply_binary(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + result_right = Matrix.from_values( + [3, 0, 3, 5, 6, 0, 6, 1, 6, 2, 4, 1], + [0, 1, 2, 2, 2, 3, 3, 4, 4, 5, 5, 6], + [1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1], + dtype=bool, + ) + w_right = A.apply(binary.gt, right=1).new() + w_right2 = A.apply(binary.gt, right=Scalar.from_value(1)).new() + assert w_right.isequal(result_right) + assert w_right2.isequal(result_right) + result_left = Matrix.from_values( + [3, 0, 3, 5, 6, 0, 6, 1, 6, 2, 4, 1], + [0, 1, 2, 2, 2, 3, 3, 4, 4, 5, 5, 6], + [5, 6, 5, 7, 3, 5, 1, 0, 5, 7, 1, 4], + ) + w_left = A.apply(binary.minus, left=8).new() + w_left2 = A.apply(binary.minus, left=Scalar.from_value(8)).new() + assert w_left.isequal(result_left) + assert w_left2.isequal(result_left) + with pytest.raises(TypeError): + A.apply(binary.plus, left=A) + with pytest.raises(TypeError): + A.apply(binary.plus, right=A) + with pytest.raises(TypeError, match="Cannot provide both"): + A.apply(binary.plus, left=1, right=1) + # allow monoids + w1 = A.apply(binary.plus, left=1).new() + w2 = A.apply(monoid.plus, left=1).new() + w3 = A.apply(monoid.plus, right=1).new() + assert w1.isequal(w2) + assert w1.isequal(w3) -@pytest.mark.slow -def test_reduce_agg(A, A_chunks): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - result = Vector.from_values([0, 1, 2, 3, 4, 5, 6], [5, 12, 1, 6, 7, 1, 15]) - w1 = A.reduce_rowwise(agg.sum).new() - assert w1.isequal(result) - w2 = A.T.reduce_columnwise(agg.sum).new() - assert w2.isequal(result) - - counts = A.dup(dtype=bool).reduce_rowwise(monoid.plus[int]).new() - w3 = A.reduce_rowwise(agg.count).new() - assert w3.isequal(counts) - w4 = A.T.reduce_columnwise(agg.count).new() - assert w4.isequal(counts) - - Asquared = monoid.times(A & A).new() - squared = Asquared.reduce_rowwise(monoid.plus).new() - expected = unary.sqrt[float](squared).new() - w5 = A.reduce_rowwise(agg.hypot).new() - assert w5.isclose(expected) - w6 = A.reduce_rowwise(monoid.numpy.hypot[float]).new() - assert w6.isclose(expected) - w7 = Vector.new(w5.dtype, size=w5.size) - w7 << A.reduce_rowwise(agg.hypot) - assert w7.isclose(expected) - - w8 = A.reduce_rowwise(agg.logaddexp).new() - expected = A.reduce_rowwise(monoid.numpy.logaddexp[float]).new() - assert w8.isclose(w8) - - result = Vector.from_values([0, 1, 2, 3, 4, 5, 6], [3, 2, 9, 10, 11, 8, 4]) - w9 = A.reduce_columnwise(agg.sum).new() - assert w9.isequal(result) - w10 = A.T.reduce_rowwise(agg.sum).new() - assert w10.isequal(result) - - counts = A.dup(dtype=bool).reduce_columnwise(monoid.plus[int]).new() - w11 = A.reduce_columnwise(agg.count).new() - assert w11.isequal(counts) - w12 = A.T.reduce_rowwise(agg.count).new() - assert w12.isequal(counts) - - w13 = A.reduce_rowwise(agg.mean).new() - expected = Vector.from_values([0, 1, 2, 3, 4, 5, 6], [2.5, 6, 1, 3, 7, 1, 5]) - assert w13.isequal(expected) - w14 = A.reduce_columnwise(agg.mean).new() - expected = Vector.from_values([0, 1, 2, 3, 4, 5, 6], [3, 2, 3, 5, 5.5, 4, 4]) - assert w14.isequal(expected) - - w15 = A.reduce_rowwise(agg.exists).new() - w16 = 
A.reduce_columnwise(agg.exists).new() - expected = Vector.from_values([0, 1, 2, 3, 4, 5, 6], [1, 1, 1, 1, 1, 1, 1]) - assert w15.isequal(expected) - assert w16.isequal(expected) - - assert A.reduce_scalar(agg.sum).new() == 47 - assert A.reduce_scalar(agg.prod).new() == 1270080 - assert A.reduce_scalar(agg.count).new() == 12 - assert A.reduce_scalar(agg.count_nonzero).new() == 12 - assert A.reduce_scalar(agg.count_zero).new() == 0 - assert A.reduce_scalar(agg.sum_of_squares).new() == 245 - assert A.reduce_scalar(agg.hypot).new().isclose(245 ** 0.5) - assert A.reduce_scalar(agg.logaddexp).new().isclose(8.6071076) - assert A.reduce_scalar(agg.logaddexp2).new().isclose(9.2288187) - assert A.reduce_scalar(agg.mean).new().isclose(47 / 12) - assert A.reduce_scalar(agg.exists).new() == 1 - - silly = agg.Aggregator( - "silly", - composite=[agg.varp, agg.stdp], - finalize=lambda x, y: binary.times(x & y), - types=[agg.varp], - ) - v1 = A.reduce_rowwise(agg.varp).new() - v2 = A.reduce_rowwise(agg.stdp).new() - assert v1.isclose(binary.times(v2 & v2).new()) - v3 = A.reduce_rowwise(silly).new() - assert v3.isclose(binary.times(v1 & v2).new()) - s1 = A.reduce_scalar(agg.varp).new() - s2 = A.reduce_scalar(agg.stdp).new() - assert s1.isclose(s2.value.compute() * s2.value.compute()) - s3 = A.reduce_scalar(silly).new() - assert s3.isclose(s1.value.compute() * s2.value.compute()) +def test_reduce_row(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + result = Vector.from_values([0, 1, 2, 3, 4, 5, 6], [5, 12, 1, 6, 7, 1, 15]) + w = A.reduce_rowwise(monoid.plus).new() + assert w.isequal(result) + w2 = A.reduce_rowwise(binary.plus).new() + assert w2.isequal(result) -@pytest.mark.xfail("'Needs investigation'", strict=True) -def test_reduce_agg_argminmax(A, A_chunks): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - # reduce_rowwise - expected = Vector.from_values([0, 1, 2, 3, 4, 5, 6], [1, 6, 5, 0, 5, 2, 4]) - w1b = A.reduce_rowwise(agg.argmin).new() - assert w1b.isequal(expected) - w1c = A.T.reduce_columnwise(agg.argmin).new() - assert w1c.isequal(expected) - expected = Vector.from_values([0, 1, 2, 3, 4, 5, 6], [3, 4, 5, 0, 5, 2, 3]) - w2b = A.reduce_rowwise(agg.argmax).new() - assert w2b.isequal(expected) - w2c = A.T.reduce_columnwise(agg.argmax).new() - assert w2c.isequal(expected) - - # reduce_cols - expected = Vector.from_values([0, 1, 2, 3, 4, 5, 6], [3, 0, 5, 0, 6, 2, 1]) - w7b = A.reduce_columnwise(agg.argmin).new() - assert w7b.isequal(expected) - w7c = A.T.reduce_rowwise(agg.argmin).new() - assert w7c.isequal(expected) - expected = Vector.from_values([0, 1, 2, 3, 4, 5, 6], [3, 0, 6, 6, 1, 4, 1]) - w8b = A.reduce_columnwise(agg.argmax).new() - assert w8b.isequal(expected) - w8c = A.T.reduce_rowwise(agg.argmax).new() - assert w8c.isequal(expected) - - # reduce_scalar - with pytest.raises( - ValueError, match="Aggregator argmin may not be used with Matrix.reduce_scalar" - ): - A.reduce_scalar(agg.argmin) +@pytest.mark.slow +def test_reduce_agg(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + result = Vector.from_values([0, 1, 2, 3, 4, 5, 6], [5, 12, 1, 6, 7, 1, 15]) + w1 = A.reduce_rowwise(agg.sum).new() + assert w1.isequal(result) + w2 = A.T.reduce_columnwise(agg.sum).new() + assert w2.isequal(result) + + counts = A.dup(dtype=bool).reduce_rowwise(monoid.plus[int]).new() + w3 = A.reduce_rowwise(agg.count).new() + assert 
w3.isequal(counts) + w4 = A.T.reduce_columnwise(agg.count).new() + assert w4.isequal(counts) + + Asquared = monoid.times(A & A).new() + squared = Asquared.reduce_rowwise(monoid.plus).new() + expected = unary.sqrt[float](squared).new() + w5 = A.reduce_rowwise(agg.hypot).new() + assert w5.isclose(expected) + w6 = A.reduce_rowwise(monoid.numpy.hypot[float]).new() + assert w6.isclose(expected) + w7 = Vector.new(w5.dtype, size=w5.size) + w7 << A.reduce_rowwise(agg.hypot) + assert w7.isclose(expected) + + w8 = A.reduce_rowwise(agg.logaddexp).new() + expected = A.reduce_rowwise(monoid.numpy.logaddexp[float]).new() + assert w8.isclose(expected) + + result = Vector.from_values([0, 1, 2, 3, 4, 5, 6], [3, 2, 9, 10, 11, 8, 4]) + w9 = A.reduce_columnwise(agg.sum).new() + assert w9.isequal(result) + w10 = A.T.reduce_rowwise(agg.sum).new() + assert w10.isequal(result) + + counts = A.dup(dtype=bool).reduce_columnwise(monoid.plus[int]).new() + w11 = A.reduce_columnwise(agg.count).new() + assert w11.isequal(counts) + w12 = A.T.reduce_rowwise(agg.count).new() + assert w12.isequal(counts) + + w13 = A.reduce_rowwise(agg.mean).new() + expected = Vector.from_values([0, 1, 2, 3, 4, 5, 6], [2.5, 6, 1, 3, 7, 1, 5]) + assert w13.isequal(expected) + w14 = A.reduce_columnwise(agg.mean).new() + expected = Vector.from_values([0, 1, 2, 3, 4, 5, 6], [3, 2, 3, 5, 5.5, 4, 4]) + assert w14.isequal(expected) + + w15 = A.reduce_rowwise(agg.exists).new() + w16 = A.reduce_columnwise(agg.exists).new() + expected = Vector.from_values([0, 1, 2, 3, 4, 5, 6], [1, 1, 1, 1, 1, 1, 1]) + assert w15.isequal(expected) + assert w16.isequal(expected) + + assert A.reduce_scalar(agg.sum).new() == 47 + assert A.reduce_scalar(agg.prod).new() == 1270080 + assert A.reduce_scalar(agg.count).new() == 12 + assert A.reduce_scalar(agg.count_nonzero).new() == 12 + assert A.reduce_scalar(agg.count_zero).new() == 0 + assert A.reduce_scalar(agg.sum_of_squares).new() == 245 + assert A.reduce_scalar(agg.hypot).new().isclose(245 ** 0.5) + assert A.reduce_scalar(agg.logaddexp).new().isclose(8.6071076) + assert A.reduce_scalar(agg.logaddexp2).new().isclose(9.2288187) + assert A.reduce_scalar(agg.mean).new().isclose(47 / 12) + assert A.reduce_scalar(agg.exists).new() == 1 + + silly = agg.Aggregator( + "silly", + composite=[agg.varp, agg.stdp], + finalize=lambda x, y: binary.times(x & y), + types=[agg.varp], + ) + v1 = A.reduce_rowwise(agg.varp).new() + v2 = A.reduce_rowwise(agg.stdp).new() + assert v1.isclose(binary.times(v2 & v2).new()) + v3 = A.reduce_rowwise(silly).new() + assert v3.isclose(binary.times(v1 & v2).new()) + + s1 = A.reduce_scalar(agg.varp).new() + s2 = A.reduce_scalar(agg.stdp).new() + assert s1.isclose(s2.value.compute() * s2.value.compute()) + s3 = A.reduce_scalar(silly).new() + assert s3.isclose(s1.value.compute() * s2.value.compute()) + + +def test_reduce_agg_argminmax(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + # reduce_rowwise + expected = Vector.from_values([0, 1, 2, 3, 4, 5, 6], [1, 6, 5, 0, 5, 2, 4]) + w1b = A.reduce_rowwise(agg.argmin).new() + assert w1b.isequal(expected) + w1c = A.T.reduce_columnwise(agg.argmin).new() + assert w1c.isequal(expected) + expected = Vector.from_values([0, 1, 2, 3, 4, 5, 6], [3, 4, 5, 0, 5, 2, 3]) + w2b = A.reduce_rowwise(agg.argmax).new() + assert w2b.isequal(expected) + w2c = A.T.reduce_columnwise(agg.argmax).new() + assert w2c.isequal(expected) + + # reduce_cols + expected = Vector.from_values([0, 1, 2, 3, 4, 5, 6], [3, 0, 5, 0, 6, 2, 1]) +
w7b = A.reduce_columnwise(agg.argmin).new() + assert w7b.isequal(expected) + w7c = A.T.reduce_rowwise(agg.argmin).new() + assert w7c.isequal(expected) + expected = Vector.from_values([0, 1, 2, 3, 4, 5, 6], [3, 0, 6, 6, 1, 4, 1]) + w8b = A.reduce_columnwise(agg.argmax).new() + assert w8b.isequal(expected) + w8c = A.T.reduce_rowwise(agg.argmax).new() + assert w8c.isequal(expected) + + # reduce_scalar + with pytest.raises( + ValueError, match="Aggregator argmin may not be used with Matrix.reduce_scalar" + ): + A.reduce_scalar(agg.argmin) - silly = agg.Aggregator( - "silly", - composite=[agg.argmin, agg.argmax], - finalize=lambda x, y: binary.plus(x & y), - types=[agg.argmin], - ) - v1 = A.reduce_rowwise(agg.argmin).new() - v2 = A.reduce_rowwise(agg.argmax).new() - v3 = A.reduce_rowwise(silly).new() - assert v3.isequal(binary.plus(v1 & v2).new()) + silly = agg.Aggregator( + "silly", + composite=[agg.argmin, agg.argmax], + finalize=lambda x, y: binary.plus(x & y), + types=[agg.argmin], + ) + v1 = A.reduce_rowwise(agg.argmin).new() + v2 = A.reduce_rowwise(agg.argmax).new() + v3 = A.reduce_rowwise(silly).new() + assert v3.isequal(binary.plus(v1 & v2).new()) - v1 = A.reduce_columnwise(agg.argmin).new() - v2 = A.reduce_columnwise(agg.argmax).new() - v3 = A.reduce_columnwise(silly).new() - assert v3.isequal(binary.plus(v1 & v2).new()) + v1 = A.reduce_columnwise(agg.argmin).new() + v2 = A.reduce_columnwise(agg.argmax).new() + v3 = A.reduce_columnwise(silly).new() + assert v3.isequal(binary.plus(v1 & v2).new()) - with pytest.raises(ValueError, match="Aggregator"): - A.reduce_scalar(silly).new() + with pytest.raises(ValueError, match="Aggregator"): + A.reduce_scalar(silly).new() -@pytest.mark.xfail("'Needs investigation'", strict=True) -def test_reduce_agg_firstlast(A, A_chunks): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - # reduce_rowwise - w1 = A.reduce_rowwise(agg.first).new() - expected = Vector.from_values([0, 1, 2, 3, 4, 5, 6], [2, 8, 1, 3, 7, 1, 5]) - assert w1.isequal(expected) - w1b = A.T.reduce_columnwise(agg.first).new() - assert w1b.isequal(expected) - w2 = A.reduce_rowwise(agg.last).new() - expected = Vector.from_values([0, 1, 2, 3, 4, 5, 6], [3, 4, 1, 3, 7, 1, 3]) - assert w2.isequal(expected) - w2b = A.T.reduce_columnwise(agg.last).new() - assert w2b.isequal(expected) - - # reduce_columnwise - w3 = A.reduce_columnwise(agg.first).new() - expected = Vector.from_values([0, 1, 2, 3, 4, 5, 6], [3, 2, 3, 3, 8, 1, 4]) - assert w3.isequal(expected) - w3b = A.T.reduce_rowwise(agg.first).new() - assert w3b.isequal(expected) - w4 = A.reduce_columnwise(agg.last).new() - expected = Vector.from_values([0, 1, 2, 3, 4, 5, 6], [3, 2, 5, 7, 3, 7, 4]) - assert w4.isequal(expected) - w4b = A.T.reduce_rowwise(agg.last).new() - assert w4b.isequal(expected) - - # reduce_scalar - w5 = A.reduce_scalar(agg.first).new() - assert w5 == 2 - w6 = A.reduce_scalar(agg.last).new() - assert w6 == 3 - B = Matrix.new(float, nrows=2, ncols=3) - assert B.reduce_scalar(agg.first).new().is_empty - assert B.reduce_scalar(agg.last).new().is_empty - w7 = B.reduce_rowwise(agg.first).new() - assert w7.isequal(Vector.new(float, size=B.nrows)) - w8 = B.reduce_columnwise(agg.last).new() - assert w8.isequal(Vector.new(float, size=B.ncols)) - - silly = agg.Aggregator( - "silly", - composite=[agg.first, agg.last], - finalize=lambda x, y: binary.plus(x & y), - types=[agg.first], - ) - v1 = A.reduce_rowwise(agg.first).new() - v2 = A.reduce_rowwise(agg.last).new() - v3 = 
A.reduce_rowwise(silly).new() - assert v3.isequal(binary.plus(v1 & v2).new()) +def test_reduce_agg_firstlast(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + # reduce_rowwise + w1 = A.reduce_rowwise(agg.first).new() + expected = Vector.from_values([0, 1, 2, 3, 4, 5, 6], [2, 8, 1, 3, 7, 1, 5]) + assert w1.isequal(expected) + w1b = A.T.reduce_columnwise(agg.first).new() + assert w1b.isequal(expected) + w2 = A.reduce_rowwise(agg.last).new() + expected = Vector.from_values([0, 1, 2, 3, 4, 5, 6], [3, 4, 1, 3, 7, 1, 3]) + assert w2.isequal(expected) + w2b = A.T.reduce_columnwise(agg.last).new() + assert w2b.isequal(expected) + + # reduce_columnwise + w3 = A.reduce_columnwise(agg.first).new() + expected = Vector.from_values([0, 1, 2, 3, 4, 5, 6], [3, 2, 3, 3, 8, 1, 4]) + assert w3.isequal(expected) + w3b = A.T.reduce_rowwise(agg.first).new() + assert w3b.isequal(expected) + w4 = A.reduce_columnwise(agg.last).new() + expected = Vector.from_values([0, 1, 2, 3, 4, 5, 6], [3, 2, 5, 7, 3, 7, 4]) + assert w4.isequal(expected) + w4b = A.T.reduce_rowwise(agg.last).new() + assert w4b.isequal(expected) + + # reduce_scalar + w5 = A.reduce_scalar(agg.first).new() + assert w5 == 2 + w6 = A.reduce_scalar(agg.last).new() + assert w6 == 3 + B = Matrix.new(float, nrows=2, ncols=3) + assert B.reduce_scalar(agg.first).new().is_empty + assert B.reduce_scalar(agg.last).new().is_empty + w7 = B.reduce_rowwise(agg.first).new() + assert w7.isequal(Vector.new(float, size=B.nrows)) + w8 = B.reduce_columnwise(agg.last).new() + assert w8.isequal(Vector.new(float, size=B.ncols)) + + silly = agg.Aggregator( + "silly", + composite=[agg.first, agg.last], + finalize=lambda x, y: binary.plus(x & y), + types=[agg.first], + ) + v1 = A.reduce_rowwise(agg.first).new() + v2 = A.reduce_rowwise(agg.last).new() + v3 = A.reduce_rowwise(silly).new() + assert v3.isequal(binary.plus(v1 & v2).new()) - s1 = A.reduce_scalar(agg.first).new() - s2 = A.reduce_scalar(agg.last).new() - s3 = A.reduce_scalar(silly).new() - assert s3.isequal(s1.value.compute() + s2.value.compute()) + s1 = A.reduce_scalar(agg.first).new() + s2 = A.reduce_scalar(agg.last).new() + s3 = A.reduce_scalar(silly).new() + assert s3.isequal(s1.value.compute() + s2.value.compute()) -@pytest.mark.xfail("'Needs investigation'", strict=True) -def test_reduce_agg_firstlast_index(A, A_chunks): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - # reduce_rowwise - w1 = A.reduce_rowwise(agg.first_index).new() - expected = Vector.from_values([0, 1, 2, 3, 4, 5, 6], [1, 4, 5, 0, 5, 2, 2]) - assert w1.isequal(expected) - w1b = A.T.reduce_columnwise(agg.first_index).new() - assert w1b.isequal(expected) - w2 = A.reduce_rowwise(agg.last_index).new() - expected = Vector.from_values([0, 1, 2, 3, 4, 5, 6], [3, 6, 5, 2, 5, 2, 4]) - assert w2.isequal(expected) - w2b = A.T.reduce_columnwise(agg.last_index).new() - assert w2b.isequal(expected) - - # reduce_columnwise - w3 = A.reduce_columnwise(agg.first_index).new() - expected = Vector.from_values([0, 1, 2, 3, 4, 5, 6], [3, 0, 3, 0, 1, 2, 1]) - assert w3.isequal(expected) - w3b = A.T.reduce_rowwise(agg.first_index).new() - assert w3b.isequal(expected) - w4 = A.reduce_columnwise(agg.last_index).new() - expected = Vector.from_values([0, 1, 2, 3, 4, 5, 6], [3, 0, 6, 6, 6, 4, 1]) - assert w4.isequal(expected) - w4b = A.T.reduce_rowwise(agg.last_index).new() - assert w4b.isequal(expected) - - # reduce_scalar - with pytest.raises(ValueError, 
match="Aggregator first_index may not"): - A.reduce_scalar(agg.first_index).new() - with pytest.raises(ValueError, match="Aggregator last_index may not"): - A.reduce_scalar(agg.last_index).new() - - silly = agg.Aggregator( - "silly", - composite=[agg.first_index, agg.last_index], - finalize=lambda x, y: binary.plus(x & y), - types=[agg.first_index], - ) - v1 = A.reduce_rowwise(agg.first_index).new() - v2 = A.reduce_rowwise(agg.last_index).new() - v3 = A.reduce_rowwise(silly).new() - assert v3.isequal(binary.plus(v1 & v2).new()) +def test_reduce_agg_firstlast_index(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + # reduce_rowwise + w1 = A.reduce_rowwise(agg.first_index).new() + expected = Vector.from_values([0, 1, 2, 3, 4, 5, 6], [1, 4, 5, 0, 5, 2, 2]) + assert w1.isequal(expected) + w1b = A.T.reduce_columnwise(agg.first_index).new() + assert w1b.isequal(expected) + w2 = A.reduce_rowwise(agg.last_index).new() + expected = Vector.from_values([0, 1, 2, 3, 4, 5, 6], [3, 6, 5, 2, 5, 2, 4]) + assert w2.isequal(expected) + w2b = A.T.reduce_columnwise(agg.last_index).new() + assert w2b.isequal(expected) + + # reduce_columnwise + w3 = A.reduce_columnwise(agg.first_index).new() + expected = Vector.from_values([0, 1, 2, 3, 4, 5, 6], [3, 0, 3, 0, 1, 2, 1]) + assert w3.isequal(expected) + w3b = A.T.reduce_rowwise(agg.first_index).new() + assert w3b.isequal(expected) + w4 = A.reduce_columnwise(agg.last_index).new() + expected = Vector.from_values([0, 1, 2, 3, 4, 5, 6], [3, 0, 6, 6, 6, 4, 1]) + assert w4.isequal(expected) + w4b = A.T.reduce_rowwise(agg.last_index).new() + assert w4b.isequal(expected) + + # reduce_scalar + with pytest.raises(ValueError, match="Aggregator first_index may not"): + A.reduce_scalar(agg.first_index).new() + with pytest.raises(ValueError, match="Aggregator last_index may not"): + A.reduce_scalar(agg.last_index).new() + + silly = agg.Aggregator( + "silly", + composite=[agg.first_index, agg.last_index], + finalize=lambda x, y: binary.plus(x & y), + types=[agg.first_index], + ) + v1 = A.reduce_rowwise(agg.first_index).new() + v2 = A.reduce_rowwise(agg.last_index).new() + v3 = A.reduce_rowwise(silly).new() + assert v3.isequal(binary.plus(v1 & v2).new()) - with pytest.raises(ValueError, match="Aggregator"): - A.reduce_scalar(silly).new() + with pytest.raises(ValueError, match="Aggregator"): + A.reduce_scalar(silly).new() -@pytest.mark.xfail("'Needs investigation'", strict=True) def test_reduce_agg_empty(A_chunks): A = Matrix.new("UINT8", nrows=3, ncols=4) A_ = A @@ -1614,86 +1977,85 @@ def test_reduce_agg_empty(A_chunks): assert compute(s.value) is None -def test_reduce_row_udf(A, A_chunks): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - result = Vector.from_values([0, 1, 2, 3, 4, 5, 6], [5, 12, 1, 6, 7, 1, 15]) - binop = grblas.operator.BinaryOp.register_anonymous(lambda x, y: x + y) - with pytest.raises(NotImplementedException): - # Although allowed by the spec, SuiteSparse doesn't like user-defined binarops here - A.reduce_rowwise(binop).new() - # If the user creates a monoid from the binop, then we can use the monoid instead - monoid = grblas.operator.Monoid.register_anonymous(binop, 0) - w = A.reduce_rowwise(binop).new() - assert w.isequal(result) - w2 = A.reduce_rowwise(monoid).new() - assert w2.isequal(result) - - -def test_reduce_column(A, A_chunks): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - result = 
Vector.from_values([0, 1, 2, 3, 4, 5, 6], [3, 2, 9, 10, 11, 8, 4]) - w = A.reduce_columnwise(monoid.plus).new() - assert w.isequal(result) - w2 = A.reduce_columnwise(binary.plus).new() - assert w2.isequal(result) +def test_reduce_row_udf(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + result = Vector.from_values([0, 1, 2, 3, 4, 5, 6], [5, 12, 1, 6, 7, 1, 15]) + binop = grblas.operator.BinaryOp.register_anonymous(lambda x, y: x + y) + with pytest.raises(NotImplementedException): + # Although allowed by the spec, SuiteSparse doesn't like user-defined binarops here + A.reduce_rowwise(binop).new() + # If the user creates a monoid from the binop, then we can use the monoid instead + monoid = grblas.operator.Monoid.register_anonymous(binop, 0) + w = A.reduce_rowwise(binop).new() + assert w.isequal(result) + w2 = A.reduce_rowwise(monoid).new() + assert w2.isequal(result) + + +def test_reduce_column(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + result = Vector.from_values([0, 1, 2, 3, 4, 5, 6], [3, 2, 9, 10, 11, 8, 4]) + w = A.reduce_columnwise(monoid.plus).new() + assert w.isequal(result) + w2 = A.reduce_columnwise(binary.plus).new() + assert w2.isequal(result) -def test_reduce_scalar(A, A_chunks): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - s = A.reduce_scalar(monoid.plus).new() - assert s == 47 - assert A.reduce_scalar(binary.plus).new() == 47 - with pytest.raises(TypeError, match="Expected type: Monoid"): - A.reduce_scalar(binary.minus) - - # test dtype coercion - assert A.dtype == dtypes.INT64 - s = A.reduce_scalar().new(dtype=float) - assert s == 47.0 - assert s.dtype == dtypes.FP64 - t = Scalar.new(float) - t << A.reduce_scalar(monoid.plus) - assert t == 47.0 - t = Scalar.new(float) - t() << A.reduce_scalar(monoid.plus) - assert t == 47.0 - t(accum=binary.times) << A.reduce_scalar(monoid.plus) - assert t == 47 * 47 - assert A.reduce_scalar(monoid.plus[dtypes.UINT64]).new() == 47 - # Make sure we accumulate as a float, not int - t.value = 1.23 - t(accum=binary.plus) << A.reduce_scalar() - assert t == 48.23 - - -def test_transpose(A, A_chunks): +def test_reduce_scalar(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + s = A.reduce_scalar(monoid.plus).new() + assert s == 47 + assert A.reduce_scalar(binary.plus).new() == 47 + with pytest.raises(TypeError, match="Expected type: Monoid"): + A.reduce_scalar(binary.minus) + + # test dtype coercion + assert A.dtype == dtypes.INT64 + s = A.reduce_scalar().new(dtype=float) + assert s == 47.0 + assert s.dtype == dtypes.FP64 + t = Scalar.new(float) + t << A.reduce_scalar(monoid.plus) + assert t == 47.0 + t = Scalar.new(float) + t() << A.reduce_scalar(monoid.plus) + assert t == 47.0 + t(accum=binary.times) << A.reduce_scalar(monoid.plus) + assert t == 47 * 47 + assert A.reduce_scalar(monoid.plus[dtypes.UINT64]).new() == 47 + # Make sure we accumulate as a float, not int + t.value = 1.23 + t(accum=binary.plus) << A.reduce_scalar() + assert t == 48.23 + + +def test_transpose(As, A_chunks): # C << A.T - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - rows, cols, vals = A.to_values() - result = Matrix.from_values(cols, rows, vals) - C = Matrix.new(A.dtype, A.ncols, A.nrows) - C << A.T - assert C.isequal(result) - C2 = A.T.new() - assert C2.isequal(result) - assert A.T.T 
is A - C3 = A.T.new(dtype=float) - assert C3.isequal(result) + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + rows, cols, vals = A.to_values() + result = Matrix.from_values(cols, rows, vals) + C = Matrix.new(A.dtype, A.ncols, A.nrows) + C << A.T + assert C.isequal(result) + C2 = A.T.new() + assert C2.isequal(result) + assert A.T.T is A + C3 = A.T.new(dtype=float) + assert C3.isequal(result) -@pytest.mark.xfail("'Needs investigation'", strict=True) def test_kronecker(): # A 0 1 B 0 1 2 # 0 [1 -] 0 [- 2 3] @@ -1704,174 +2066,199 @@ def test_kronecker(): # 1 [8 - 4 - - - ] # 2 [- 4 6 - 6 9 ] # 3 [16 - 8 24 - 12] - A = Matrix.from_values([0, 1, 1], [0, 0, 1], [1, 2, 3]) - B = Matrix.from_values([0, 0, 1, 1], [1, 2, 0, 2], [2, 3, 8, 4]) + A0 = Matrix.from_values([0, 1, 1], [0, 0, 1], [1, 2, 3]) + A1 = Matrix.from_values( + da.from_array([0, 1, 1]), + da.from_array([0, 0, 1]), + da.from_array([1, 2, 3]), + ) + As = [A0, A1] + B0 = Matrix.from_values([0, 0, 1, 1], [1, 2, 0, 2], [2, 3, 8, 4]) + B1 = Matrix.from_values( + da.from_array([0, 0, 1, 1]), + da.from_array([1, 2, 0, 2]), + da.from_array([2, 3, 8, 4]), + ) + Bs = [B0, B1] result = Matrix.from_values( [0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3], [1, 2, 0, 2, 1, 2, 4, 5, 0, 2, 3, 5], [2, 3, 8, 4, 4, 6, 6, 9, 16, 8, 24, 12], ) - C = A.kronecker(B, binary.times).new() - assert C.isequal(result) + for A in As: + for B in Bs: + C = A.kronecker(B, binary.times).new() + assert C.isequal(result) -def test_simple_assignment(A, A_chunks): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - # C << A - C = Matrix.new(A.dtype, A.nrows, A.ncols) - C << A - assert C.isequal(A) - - -def test_assign_transpose(A, A_chunks): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - C = Matrix.new(A.dtype, A.ncols, A.nrows) - C << A.T - assert C.isequal(A.T.new()) - - with pytest.raises(TypeError): - C.T << A - with pytest.raises(TypeError, match="does not support item assignment"): - C.T[:, :] << A - with pytest.raises(AttributeError): - C[:, :].T << A +def test_simple_assignment(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + # C << A + C = Matrix.new(A.dtype, A.nrows, A.ncols) + C << A + assert C.isequal(A) - C = Matrix.new(A.dtype, A.ncols + 1, A.nrows + 1) - C[: A.ncols, : A.nrows] << A.T - assert C[: A.ncols, : A.nrows].new().isequal(A.T.new()) +def test_assign_transpose(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + C = Matrix.new(A.dtype, A.ncols, A.nrows) + C << A.T + assert C.isequal(A.T.new()) -def test_isequal(A, A_chunks, v): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - assert A.isequal(A) - with pytest.raises(TypeError, match="Matrix"): - A.isequal(v) # equality is not type-checking - C = Matrix.from_values([1], [1], [1]) - assert not C.isequal(A) - D = Matrix.from_values([1], [2], [1]) - assert not C.isequal(D) - D2 = Matrix.from_values([0], [2], [1], nrows=D.nrows, ncols=D.ncols) - assert not D2.isequal(D) - C2 = Matrix.from_values([1], [1], [1], nrows=7, ncols=7) - assert not C2.isequal(A) - C3 = Matrix.from_values( - [3, 0, 3, 5, 6, 0, 6, 1, 6, 2, 4, 1], - [0, 1, 2, 2, 2, 3, 3, 4, 4, 5, 5, 6], - [3.0, 2.0, 3.0, 1.0, 5.0, 3.0, 7.0, 8.0, 3.0, 1.0, 7.0, 4.0], - ) - assert not C3.isequal(A, check_dtype=True), "different datatypes are not equal" - C4 
= Matrix.from_values( - [3, 0, 3, 5, 6, 0, 6, 1, 6, 2, 4, 1], - [0, 1, 2, 2, 2, 3, 3, 4, 4, 5, 5, 6], - [3.0, 2.0, 3.0, 1.0, 5.0, 3.000000000000000001, 7.0, 8.0, 3.0, 1 - 1e-11, 7.0, 4.0], - ) - assert not C4.isequal(A) + with pytest.raises(TypeError): + C.T << A + with pytest.raises(TypeError, match="does not support item assignment"): + C.T[:, :] << A + with pytest.raises(TypeError, match="autocompute"): + C[:, :].T << A + + nrows, ncols = A.nrows, A.ncols + if A.is_dOnion: + nrows, ncols = nrows.compute(), ncols.compute() + C = Matrix.new(A.dtype, ncols + 1, nrows + 1) + C[:ncols, :nrows] << A.T + assert C[:ncols, :nrows].new().isequal(A.T.new()) + + +def test_isequal(As, A_chunks, v): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + assert A.isequal(A) + with pytest.raises(TypeError, match="Matrix"): + A.isequal(v) # equality is not type-checking + C = Matrix.from_values([1], [1], [1]) + assert not C.isequal(A) + D = Matrix.from_values([1], [2], [1]) + assert not C.isequal(D) + D2 = Matrix.from_values([0], [2], [1], nrows=D.nrows, ncols=D.ncols) + assert not D2.isequal(D) + C2 = Matrix.from_values([1], [1], [1], nrows=7, ncols=7) + assert not C2.isequal(A) + C3 = Matrix.from_values( + [3, 0, 3, 5, 6, 0, 6, 1, 6, 2, 4, 1], + [0, 1, 2, 2, 2, 3, 3, 4, 4, 5, 5, 6], + [3.0, 2.0, 3.0, 1.0, 5.0, 3.0, 7.0, 8.0, 3.0, 1.0, 7.0, 4.0], + ) + assert not C3.isequal(A, check_dtype=True), "different datatypes are not equal" + C4 = Matrix.from_values( + [3, 0, 3, 5, 6, 0, 6, 1, 6, 2, 4, 1], + [0, 1, 2, 2, 2, 3, 3, 4, 4, 5, 5, 6], + [3.0, 2.0, 3.0, 1.0, 5.0, 3.000000000000000001, 7.0, 8.0, 3.0, 1 - 1e-11, 7.0, 4.0], + ) + assert not C4.isequal(A) -@pytest.mark.slow -def test_isclose(A, A_chunks, v): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - assert A.isclose(A) - with pytest.raises(TypeError, match="Matrix"): - A.isclose(v) # equality is not type-checking - C = Matrix.from_values([1], [1], [1]) # wrong size - assert not C.isclose(A) - D = Matrix.from_values([1], [2], [1]) - assert not C.isclose(D) - D2 = Matrix.from_values([0], [2], [1], nrows=D.nrows, ncols=D.ncols) - assert not D2.isclose(D) - C2 = Matrix.from_values([1], [1], [1], nrows=7, ncols=7) # missing values - assert not C2.isclose(A) - C3 = Matrix.from_values( - [3, 0, 3, 5, 6, 0, 6, 1, 6, 2, 4, 1, 0], - [0, 1, 2, 2, 2, 3, 3, 4, 4, 5, 5, 6, 2], - [3, 2, 3, 1, 5, 3, 7, 8, 3, 1, 7, 4, 3], - ) # extra values - assert not C3.isclose(A) - C4 = Matrix.from_values( - [3, 0, 3, 5, 6, 0, 6, 1, 6, 2, 4, 1], - [0, 1, 2, 2, 2, 3, 3, 4, 4, 5, 5, 6], - [3.0, 2.0, 3.0, 1.0, 5.0, 3.0, 7.0, 8.0, 3.0, 1.0, 7.0, 4.0], - ) - assert not C4.isclose(A, check_dtype=True), "different datatypes are not equal" - # fmt: off - C5 = Matrix.from_values( - [3, 0, 3, 5, 6, 0, 6, 1, 6, 2, 4, 1], - [0, 1, 2, 2, 2, 3, 3, 4, 4, 5, 5, 6], - [3.0, 2.0, 3.0, 1.0, 5.0, 3.000000000000000001, 7.0, 8.0, 3.0, 1 - 1e-11, 7.0, 4.0], - ) - # fmt: on - assert C5.isclose(A) - C6 = Matrix.from_values( - [3, 0, 3, 5, 6, 0, 6, 1, 6, 2, 4, 1], - [0, 1, 2, 2, 2, 3, 3, 4, 4, 5, 5, 6], - [3.0, 2.000001, 3.0, 1.0, 5.0, 3.0, 7.0, 7.9999999, 3.0, 1.0, 7.0, 4.0], - ) - assert C6.isclose(A, rel_tol=1e-3) +def test_isclose(As, A_chunks, v): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + assert A.isclose(A) + with pytest.raises(TypeError, match="Matrix"): + A.isclose(v) # equality is not type-checking + C = Matrix.from_values([1], [1], [1]) # wrong size 
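+ # isclose, like isequal, compares shape and nvals before values, so this
+ # shape mismatch simply yields False rather than raising.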
+ assert not C.isclose(A) + D = Matrix.from_values([1], [2], [1]) + assert not C.isclose(D) + D2 = Matrix.from_values([0], [2], [1], nrows=D.nrows, ncols=D.ncols) + assert not D2.isclose(D) + C2 = Matrix.from_values([1], [1], [1], nrows=7, ncols=7) # missing values + assert not C2.isclose(A) + C3 = Matrix.from_values( + [3, 0, 3, 5, 6, 0, 6, 1, 6, 2, 4, 1, 0], + [0, 1, 2, 2, 2, 3, 3, 4, 4, 5, 5, 6, 2], + [3, 2, 3, 1, 5, 3, 7, 8, 3, 1, 7, 4, 3], + ) # extra values + assert not C3.isclose(A) + C4 = Matrix.from_values( + [3, 0, 3, 5, 6, 0, 6, 1, 6, 2, 4, 1], + [0, 1, 2, 2, 2, 3, 3, 4, 4, 5, 5, 6], + [3.0, 2.0, 3.0, 1.0, 5.0, 3.0, 7.0, 8.0, 3.0, 1.0, 7.0, 4.0], + ) + assert not C4.isclose(A, check_dtype=True), "different datatypes are not equal" + # fmt: off + C5 = Matrix.from_values( + [3, 0, 3, 5, 6, 0, 6, 1, 6, 2, 4, 1], + [0, 1, 2, 2, 2, 3, 3, 4, 4, 5, 5, 6], + [3.0, 2.0, 3.0, 1.0, 5.0, 3.000000000000000001, 7.0, 8.0, 3.0, 1 - 1e-11, 7.0, 4.0], + ) + # fmt: on + assert C5.isclose(A) + C6 = Matrix.from_values( + [3, 0, 3, 5, 6, 0, 6, 1, 6, 2, 4, 1], + [0, 1, 2, 2, 2, 3, 3, 4, 4, 5, 5, 6], + [3.0, 2.000001, 3.0, 1.0, 5.0, 3.0, 7.0, 7.9999999, 3.0, 1.0, 7.0, 4.0], + ) + assert C6.isclose(A, rel_tol=1e-3) -@pytest.mark.slow -@pytest.mark.xfail("'Needs investigation'", strict=True) -def test_transpose_equals(A, A_chunks): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - data = [ - [0, 1, 2, 2, 2, 3, 3, 4, 4, 5, 5, 6], - [3, 0, 3, 5, 6, 0, 6, 1, 6, 2, 4, 1], - [3, 2, 3, 1, 5, 3, 7, 8, 3, 1, 7, 4], - ] - B = Matrix.from_values(*data) - assert A.isequal(B.T) - assert B.isequal(A.T) - assert A.T.isequal(B) - assert A.T.isequal(A.T) - assert A.isclose(A) - assert A.isclose(B.T) - assert B.isclose(A.T) - assert A.T.isclose(B) - assert A.T.isclose(A.T) +def test_transpose_equals(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + data = [ + [0, 1, 2, 2, 2, 3, 3, 4, 4, 5, 5, 6], + [3, 0, 3, 5, 6, 0, 6, 1, 6, 2, 4, 1], + [3, 2, 3, 1, 5, 3, 7, 8, 3, 1, 7, 4], + ] + B = Matrix.from_values(*data) + assert A.isequal(B.T) + assert B.isequal(A.T) + assert A.T.isequal(B) + assert A.T.isequal(A.T) + assert A.isclose(A) + assert A.isclose(B.T) + assert B.isclose(A.T) + assert A.T.isclose(B) + assert A.T.isclose(A.T) -@pytest.mark.xfail("'Needs investigation'", strict=True) def test_transpose_exceptional(): - A = Matrix.from_values([0, 0, 1, 1], [0, 1, 0, 1], [True, True, False, True]) - B = Matrix.from_values([0, 0, 1, 1], [0, 1, 0, 1], [1, 2, 3, 4]) - - with pytest.raises(TypeError, match="not callable"): - B.T(mask=A.V) << B.ewise_mult(B, op=binary.plus) - with pytest.raises(AttributeError): - B(mask=A.T.V) << B.ewise_mult(B, op=binary.plus) - with pytest.raises(AttributeError): - B.T(mask=A.T.V) << B.ewise_mult(B, op=binary.plus) - with pytest.raises(TypeError, match="does not support item assignment"): - B.T[1, 0] << 10 - with pytest.raises(TypeError, match="not callable"): - B.T[1, 0]() << 10 - with pytest.raises(TypeError, match="not callable"): - B.T()[1, 0] << 10 - # with pytest.raises(AttributeError): - # should use new instead--Now okay. - assert B.T.dup().isequal(B.T.new()) - # Not exceptional, but while we're here... 
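+ # Build each fixture twice below: A0/B0 from in-memory lists and A1/B1 from
+ # dask arrays, so both construction paths are covered.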
- C = B.T.new(mask=A.V) - D = B.T.new() - D = D.dup(mask=A.V) - assert C.isequal(D) - assert C.isequal(Matrix.from_values([0, 0, 1], [0, 1, 1], [1, 3, 4])) + A0 = Matrix.from_values([0, 0, 1, 1], [0, 1, 0, 1], [True, True, False, True]) + B0 = Matrix.from_values([0, 0, 1, 1], [0, 1, 0, 1], [1, 2, 3, 4]) + A1 = Matrix.from_values( + da.from_array([0, 0, 1, 1]), + da.from_array([0, 1, 0, 1]), + da.from_array([True, True, False, True]), + ) + B1 = Matrix.from_values( + da.from_array([0, 0, 1, 1]), + da.from_array([0, 1, 0, 1]), + da.from_array([1, 2, 3, 4]), + ) + As, Bs = [A0, A1], [B0, B1] + for A in As: + for B in Bs: + with pytest.raises(TypeError, match="not callable"): + B.T(mask=A.V) << B.ewise_mult(B, op=binary.plus) + with pytest.raises(AttributeError): + B(mask=A.T.V) << B.ewise_mult(B, op=binary.plus) + with pytest.raises(AttributeError): + B.T(mask=A.T.V) << B.ewise_mult(B, op=binary.plus) + with pytest.raises(TypeError, match="does not support item assignment"): + B.T[1, 0] << 10 + with pytest.raises(TypeError, match="not callable"): + B.T[1, 0]() << 10 + with pytest.raises(TypeError, match="not callable"): + B.T()[1, 0] << 10 + # with pytest.raises(AttributeError): + # should use new instead--Now okay. + assert B.T.dup().isequal(B.T.new()) + # Not exceptional, but while we're here... + C = B.T.new(mask=A.V) + D = B.T.new() + D = D.dup(mask=A.V) + assert C.isequal(D) + assert C.isequal(Matrix.from_values([0, 0, 1], [0, 1, 1], [1, 3, 4])) def test_nested_matrix_operations(): @@ -1887,37 +2274,38 @@ def test_bad_init(): Matrix(None, float, name="bad_matrix") -@pytest.mark.xfail("'Needs investigation'", strict=True) -def test_equals(A, A_chunks): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - assert (A == A).new().reduce_scalar(monoid.land) +def test_equals(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + assert (A == A).new().reduce_scalar(monoid.land).new() -@pytest.mark.xfail("'Needs investigation'", strict=True) -def test_bad_update(A, A_chunks): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - with pytest.raises(TypeError, match="Assignment value must be a valid expression"): - A << None +def test_bad_update(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + with pytest.raises(TypeError, match="Assignment value must be a valid expression"): + A << None + A.compute() -def test_incompatible_shapes(A, A_chunks): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - B = A[:-1, :-1].new() - with pytest.raises(DimensionMismatch): - A.mxm(B) - with pytest.raises(DimensionMismatch): - A.ewise_add(B) - with pytest.raises(DimensionMismatch): - A.ewise_mult(B) +def test_incompatible_shapes(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + B = A[:-1, :-1].new() + with pytest.raises(DimensionMismatch): + A.mxm(B).new().compute() + A = A_.dup() + with pytest.raises(DimensionMismatch): + A.ewise_add(B).new().compute() + A = A_.dup() + with pytest.raises(DimensionMismatch): + A.ewise_mult(B).new().compute() @pytest.mark.xfail("'Needs investigation'", strict=True) @@ -2509,104 +2897,100 @@ def import_func(**kwargs): assert C_orig.ss.is_iso is do_iso -@pytest.mark.xfail("'Needs investigation'", strict=True) -def test_no_bool_or_eq(A, A_chunks): - A_ = A - for chunks in 
A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - with pytest.raises(TypeError, match="not defined"): - bool(A) - # with pytest.raises(TypeError, match="not defined"): - A == A - with pytest.raises(TypeError, match="not defined"): - bool(A.S) - with pytest.raises(TypeError, match="not defined"): - A.S == A.S - expr = A.ewise_mult(A) - with pytest.raises(TypeError, match="not defined"): - bool(expr) - with pytest.raises(TypeError, match="not enabled"): - expr == expr - assigner = A[1, 2]() - with pytest.raises(TypeError, match="not defined"): - bool(assigner) - with pytest.raises(TypeError, match="not defined"): - assigner == assigner - updater = A() - with pytest.raises(TypeError, match="not defined"): - bool(updater) - with pytest.raises(TypeError, match="not defined"): - updater == updater +def test_no_bool_or_eq(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + with pytest.raises(TypeError, match="not defined"): + bool(A) + # with pytest.raises(TypeError, match="not defined"): + A == A + with pytest.raises(TypeError, match="not defined"): + bool(A.S) + with pytest.raises(TypeError, match="not defined"): + A.S == A.S + expr = A.ewise_mult(A) + with pytest.raises(TypeError, match="not defined"): + bool(expr) + with pytest.raises(TypeError, match="not enabled"): + expr == expr + assigner = A[1, 2]() + with pytest.raises(TypeError, match="not defined"): + bool(assigner) + with pytest.raises(TypeError, match="not defined"): + assigner == assigner + updater = A() + with pytest.raises(TypeError, match="not defined"): + bool(updater) + with pytest.raises(TypeError, match="not defined"): + updater == updater @autocompute -@pytest.mark.xfail("'Needs investigation'", strict=True) -def test_bool_eq_on_scalar_expressions(A, A_chunks): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - expr = A.reduce_scalar() - assert expr == 47 - assert bool(expr) - assert int(expr) == 47 - assert float(expr) == 47.0 - assert range(expr) == range(47) - - expr = A[0, 1] - assert expr == 2 - assert bool(expr) - assert int(expr) == 2 - assert float(expr) == 2.0 - assert range(expr) == range(2) - - expr = A[0, [1, 1]] - with pytest.raises(TypeError, match="not defined"): - expr == expr - with pytest.raises(TypeError, match="not defined"): - bool(expr) - with pytest.raises(TypeError, match="not defined"): - int(expr) - with pytest.raises(TypeError, match="not defined"): - float(expr) - with pytest.raises(TypeError, match="not defined"): - range(expr) - - -@pytest.mark.xfail("'Needs investigation'", strict=True) -def test_bool_eq_on_scalar_expressions_no_auto(A, A_chunks): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - expr = A.reduce_scalar() - with pytest.raises(TypeError, match="autocompute"): - expr == 47 - with pytest.raises(TypeError, match="autocompute"): - bool(expr) - with pytest.raises(TypeError, match="autocompute"): - int(expr) +def test_bool_eq_on_scalar_expressions(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + expr = A.reduce_scalar() + assert expr == 47 + assert bool(expr) + assert int(expr) == 47 + assert float(expr) == 47.0 + assert range(expr) == range(47) + + expr = A[0, 1] + assert expr == 2 + assert bool(expr) + assert int(expr) == 2 + assert float(expr) == 2.0 + assert range(expr) == range(2) + + expr = A[0, [1, 1]] + # with pytest.raises(TypeError, match="not 
defined"): + expr == expr # Now okay + with pytest.raises(TypeError, match="not defined"): + bool(expr) + with pytest.raises(TypeError): + int(expr) + with pytest.raises(TypeError): + float(expr) + with pytest.raises(TypeError): + range(expr) -@pytest.mark.xfail("'Needs investigation'", strict=True) -def test_contains(A, A_chunks): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - assert (0, 1) in A - assert (1, 0) in A.T +def test_bool_eq_on_scalar_expressions_no_auto(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + expr = A.reduce_scalar() + with pytest.raises(TypeError, match="autocompute"): + expr == 47 + with pytest.raises(TypeError, match="autocompute"): + bool(expr) + with pytest.raises(TypeError, match="autocompute"): + int(expr) + + +def test_contains(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + assert (0, 1) in A + assert (1, 0) in A.T - assert (0, 1) not in A.T - assert (1, 0) not in A + assert (0, 1) not in A.T + assert (1, 0) not in A - with pytest.raises(TypeError): - 1 in A - with pytest.raises(TypeError): - (1,) in A.T - with pytest.raises(TypeError, match="Invalid index"): - (1, [1, 2]) in A + with pytest.raises(TypeError): + 1 in A + with pytest.raises(TypeError): + (1,) in A.T + with pytest.raises(TypeError, match="Invalid index"): + (1, [1, 2]) in A @pytest.mark.xfail("'Needs investigation'", strict=True) @@ -2845,111 +3229,109 @@ def test_nbytes(A, A_chunks): @autocompute -@pytest.mark.xfail("'Needs investigation'", strict=True) -def test_auto(A, A_chunks, v): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - expected = binary.land[bool](A & A).new() - B = A.dup(dtype=bool) - for expr in [(B & B), binary.land[bool](A & A)]: - assert expr.dtype == expected.dtype - assert expr.nrows == expected.nrows - assert expr.ncols == expected.ncols - assert expr.shape == expected.shape - assert expr.nvals == expected.nvals - assert expr.isclose(expected) - assert expected.isclose(expr) - assert expr.isequal(expected) - assert expected.isequal(expr) - assert expr.mxv(v).isequal(expected.mxv(v)) - assert expected.T.mxv(v).isequal(expr.T.mxv(v)) - for method in [ - # "ewise_add", - # "ewise_mult", - # "mxm", - # "__matmul__", - "__and__", - "__or__", - # "kronecker", - ]: - val1 = getattr(expected, method)(expected).new() - val2 = getattr(expected, method)(expr) - val3 = getattr(expr, method)(expected) - val4 = getattr(expr, method)(expr) - assert val1.isequal(val2) - assert val1.isequal(val3) - assert val1.isequal(val4) - for method in ["reduce_rowwise", "reduce_columnwise", "reduce_scalar"]: - s1 = getattr(expected, method)(monoid.lor).new() - s2 = getattr(expr, method)(monoid.lor) - assert s1.isequal(s2.new()) - assert s1.isequal(s2) - - expected = binary.times(A & A).new() - for expr in [binary.times(A & A)]: - assert expr.dtype == expected.dtype - assert expr.nrows == expected.nrows - assert expr.ncols == expected.ncols - assert expr.shape == expected.shape - assert expr.nvals == expected.nvals - assert expr.isclose(expected) - assert expected.isclose(expr) - assert expr.isequal(expected) - assert expected.isequal(expr) - assert expr.mxv(v).isequal(expected.mxv(v)) - assert expected.T.mxv(v).isequal(expr.T.mxv(v)) - for method in [ - "ewise_add", - "ewise_mult", - "mxm", - # "__matmul__", - # "__and__", - # "__or__", - "kronecker", - ]: - val1 = 
getattr(expected, method)(expected).new() - val2 = getattr(expected, method)(expr) - val3 = getattr(expr, method)(expected) - val4 = getattr(expr, method)(expr) - assert val1.isequal(val2) - assert val1.isequal(val3) - assert val1.isequal(val4) - for method in ["reduce_rowwise", "reduce_columnwise", "reduce_scalar"]: - s1 = getattr(expected, method)().new() - s2 = getattr(expr, method)() - assert s1.isequal(s2.new()) - assert s1.isequal(s2) - - expected = semiring.plus_times(A @ v).new() - for expr in [(A @ v), (v @ A.T), semiring.plus_times(A @ v)]: - assert expr.vxm(A).isequal(expected.vxm(A)) - assert expr.vxm(A).new(mask=expr.S).isequal(expected.vxm(A).new(mask=expected.S)) - assert expr.vxm(A).new(mask=expr.V).isequal(expected.vxm(A).new(mask=expected.V)) +def test_auto(As, A_chunks, v): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + expected = binary.land[bool](A & A).new() + B = A.dup(dtype=bool) + for expr in [(B & B), binary.land[bool](A & A)]: + assert expr.dtype == expected.dtype + assert expr.nrows == expected.nrows + assert expr.ncols == expected.ncols + assert expr.shape == expected.shape + assert expr.nvals == expected.nvals + assert expr.isclose(expected) + assert expected.isclose(expr) + assert expr.isequal(expected) + assert expected.isequal(expr) + assert expr.mxv(v).isequal(expected.mxv(v)) + assert expected.T.mxv(v).isequal(expr.T.mxv(v)) + for method in [ + # "ewise_add", + # "ewise_mult", + # "mxm", + # "__matmul__", + "__and__", + "__or__", + "kronecker", + ]: + val1 = getattr(expected, method)(expected).new() + val2 = getattr(expected, method)(expr) + val3 = getattr(expr, method)(expected) + val4 = getattr(expr, method)(expr) + assert val1.isequal(val2) + assert val1.isequal(val3) + assert val1.isequal(val4) + for method in ["reduce_rowwise", "reduce_columnwise", "reduce_scalar"]: + s1 = getattr(expected, method)(monoid.lor).new() + s2 = getattr(expr, method)(monoid.lor) + assert s1.isequal(s2.new()) + assert s1.isequal(s2) + + expected = binary.times(A & A).new() + for expr in [binary.times(A & A)]: + assert expr.dtype == expected.dtype + assert expr.nrows == expected.nrows + assert expr.ncols == expected.ncols + assert expr.shape == expected.shape + assert expr.nvals == expected.nvals + assert expr.isclose(expected) + assert expected.isclose(expr) + assert expr.isequal(expected) + assert expected.isequal(expr) + assert expr.mxv(v).isequal(expected.mxv(v)) + assert expected.T.mxv(v).isequal(expr.T.mxv(v)) + for method in [ + "ewise_add", + "ewise_mult", + "mxm", + # "__matmul__", + # "__and__", + # "__or__", + # "kronecker", + ]: + val1 = getattr(expected, method)(expected).new() + val2 = getattr(expected, method)(expr) + val3 = getattr(expr, method)(expected) + val4 = getattr(expr, method)(expr) + assert val1.isequal(val2) + assert val1.isequal(val3) + assert val1.isequal(val4) + for method in ["reduce_rowwise", "reduce_columnwise", "reduce_scalar"]: + s1 = getattr(expected, method)().new() + s2 = getattr(expr, method)() + assert s1.isequal(s2.new()) + assert s1.isequal(s2) + + expected = semiring.plus_times(A @ v).new() + for expr in [(A @ v), (v @ A.T), semiring.plus_times(A @ v)]: + assert expr.vxm(A).isequal(expected.vxm(A)) + assert expr.vxm(A).new(mask=expr.S).isequal(expected.vxm(A).new(mask=expected.S)) + assert expr.vxm(A).new(mask=expr.V).isequal(expected.vxm(A).new(mask=expected.V)) @autocompute -@pytest.mark.xfail("'Needs investigation'", strict=True) -def test_auto_assign(A, A_chunks): - A_ = A - 
for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - expected = A.dup() - B = A[1:4, 1:4].new(dtype=bool) - expr = B & B - expected[:3, :3] = expr.new() - A[:3, :3] = expr - assert expected.isequal(A) - with pytest.raises(TypeError): - # Not yet supported, but we could! +def test_auto_assign(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + expected = A.dup() + B = A[1:4, 1:4].new(dtype=bool) + expr = B & B + expected[:3, :3] = expr.new() + A[:3, :3] = expr + assert expected.isequal(A) + v = A[2:5, 5].new(dtype=bool) + expr = v & v + A[:3, 4] << expr + expected[:3, 4] << expr.new() + assert expected.isequal(A) + C = A[1:4, 1:4].new() A[:3, :3] = A[1:4, 1:4] - v = A[2:5, 5].new(dtype=bool) - expr = v & v - A[:3, 4] << expr - expected[:3, 4] << expr.new() - assert expected.isequal(A) + assert A[:3, :3].isequal(C) @autocompute @@ -3056,149 +3438,147 @@ def test_flatten(A, A_chunks): v.ss.reshape(A.shape + (1,)) -@pytest.mark.xfail("'Needs investigation'", strict=True) -def test_autocompute_argument_messages(A, A_chunks, v): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - with pytest.raises(TypeError, match="autocompute"): - A.ewise_mult(A & A) - with pytest.raises(TypeError, match="autocompute"): - A.mxv(A @ v) +def test_autocompute_argument_messages(As, A_chunks, v): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + with pytest.raises(TypeError, match="autocompute"): + A.ewise_mult(A & A) + with pytest.raises(TypeError, match="autocompute"): + A.mxv(A @ v) @autocompute -@pytest.mark.xfail("'Needs investigation'", strict=True) -def test_infix_sugar(A, A_chunks): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - assert type(A + 1) is not Matrix - assert binary.plus(A, 1).isequal(A + 1) - assert binary.plus(A.T, 1).isequal(A.T + 1) - assert binary.plus(1, A).isequal(1 + A) - assert binary.minus(A, 1).isequal(A - 1) - assert binary.minus(1, A).isequal(1 - A) - assert binary.times(A, 2).isequal(A * 2) - assert binary.times(2, A).isequal(2 * A) - assert binary.truediv(A, 2).isequal(A / 2) - assert binary.truediv(5, A).isequal(5 / A) - assert binary.floordiv(A, 2).isequal(A // 2) - assert binary.floordiv(5, A).isequal(5 // A) - assert binary.numpy.mod(A, 2).isequal(A % 2) - assert binary.numpy.mod(5, A).isequal(5 % A) - assert binary.pow(A, 2).isequal(A ** 2) - assert binary.pow(2, A).isequal(2 ** A) - assert binary.pow(A, 2).isequal(pow(A, 2)) - assert unary.ainv(A).isequal(-A) - assert unary.ainv(A.T).isequal(-A.T) - B = A.dup(dtype=bool) - assert unary.lnot(B).isequal(~B) - assert unary.lnot(B.T).isequal(~B.T) - with pytest.raises(TypeError): - assert unary.lnot(A).isequal(~A) - with pytest.raises(TypeError): - assert unary.lnot(A.T).isequal(~A.T) - assert binary.lxor(True, B).isequal(True ^ B) - assert binary.lxor(B, True).isequal(B ^ True) - with pytest.raises(TypeError): - A ^ True - with pytest.raises(TypeError): - A ^ B - with pytest.raises(TypeError): - 6 ^ B - assert binary.lt(A, 4).isequal(A < 4) - assert binary.le(A, 4).isequal(A <= 4) - assert binary.gt(A, 4).isequal(A > 4) - assert binary.ge(A, 4).isequal(A >= 4) - assert binary.eq(A, 4).isequal(A == 4) - assert binary.ne(A, 4).isequal(A != 4) - x, y = divmod(A, 3) - assert binary.floordiv(A, 3).isequal(x) - assert binary.numpy.mod(A, 3).isequal(y) - assert binary.fmod(A, 3).isequal(y) - assert 
A.isequal(binary.plus((3 * x) & y)) - x, y = divmod(-A, 3) - assert binary.floordiv(-A, 3).isequal(x) - assert binary.numpy.mod(-A, 3).isequal(y) - # assert binary.fmod(-A, 3).isequal(y) # The reason we use numpy.mod - assert (-A).isequal(binary.plus((3 * x) & y)) - x, y = divmod(3, A) - assert binary.floordiv(3, A).isequal(x) - assert binary.numpy.mod(3, A).isequal(y) - assert binary.fmod(3, A).isequal(y) - assert binary.plus(binary.times(A & x) & y).isequal(3 * unary.one(A)) - x, y = divmod(-3, A) - assert binary.floordiv(-3, A).isequal(x) - assert binary.numpy.mod(-3, A).isequal(y) - # assert binary.fmod(-3, A).isequal(y) # The reason we use numpy.mod - assert binary.plus(binary.times(A & x) & y).isequal(-3 * unary.one(A)) - - assert binary.eq(A & A).isequal(A == A) - assert binary.ne(A.T & A.T).isequal(A.T != A.T) - assert binary.lt(A & A.T).isequal(A < A.T) - assert binary.ge(A.T & A).isequal(A.T >= A) - - B = A.dup() - B += 1 - assert type(B) is Matrix - assert binary.plus(A, 1).isequal(B) - B = A.dup() - B -= 1 - assert type(B) is Matrix - assert binary.minus(A, 1).isequal(B) - B = A.dup() - B *= 2 - assert type(B) is Matrix - assert binary.times(A, 2).isequal(B) - B = A.dup(dtype=float) - B /= 2 - assert type(B) is Matrix - assert binary.truediv(A, 2).isequal(B) - B = A.dup() - B //= 2 - assert type(B) is Matrix - assert binary.floordiv(A, 2).isequal(B) - B = A.dup() - B %= 2 - assert type(B) is Matrix - assert binary.numpy.mod(A, 2).isequal(B) - B = A.dup() - B **= 2 - assert type(B) is Matrix - assert binary.pow(A, 2).isequal(B) - B = A.dup(dtype=bool) - B ^= True - assert type(B) is Matrix - assert B.isequal(~A.dup(dtype=bool)) - B = A.dup(dtype=bool) - B ^= B - assert type(B) is Matrix - assert not B.reduce_scalar(agg.any).new() - - expr = binary.plus(A & A) - assert unary.abs(expr).isequal(abs(expr)) - assert unary.ainv(expr).isequal(-expr) - with pytest.raises(TypeError): - assert unary.lnot(expr).isequal(~expr) - with pytest.raises(TypeError): - expr += 1 - with pytest.raises(TypeError): - expr -= 1 - with pytest.raises(TypeError): - expr *= 1 - with pytest.raises(TypeError): - expr /= 1 - with pytest.raises(TypeError): - expr //= 1 - with pytest.raises(TypeError): - expr %= 1 - with pytest.raises(TypeError): - expr **= 1 - with pytest.raises(TypeError): - expr ^= 1 +def test_infix_sugar(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + assert type(A + 1) is not Matrix + assert binary.plus(A, 1).isequal(A + 1) + assert binary.plus(A.T, 1).isequal(A.T + 1) + assert binary.plus(1, A).isequal(1 + A) + assert binary.minus(A, 1).isequal(A - 1) + assert binary.minus(1, A).isequal(1 - A) + assert binary.times(A, 2).isequal(A * 2) + assert binary.times(2, A).isequal(2 * A) + assert binary.truediv(A, 2).isequal(A / 2) + assert binary.truediv(5, A).isequal(5 / A) + assert binary.floordiv(A, 2).isequal(A // 2) + assert binary.floordiv(5, A).isequal(5 // A) + assert binary.numpy.mod(A, 2).isequal(A % 2) + assert binary.numpy.mod(5, A).isequal(5 % A) + assert binary.pow(A, 2).isequal(A ** 2) + assert binary.pow(2, A).isequal(2 ** A) + assert binary.pow(A, 2).isequal(pow(A, 2)) + assert unary.ainv(A).isequal(-A) + assert unary.ainv(A.T).isequal(-A.T) + B = A.dup(dtype=bool) + assert unary.lnot(B).isequal(~B) + assert unary.lnot(B.T).isequal(~B.T) + with pytest.raises(TypeError): + assert unary.lnot(A).isequal(~A) + with pytest.raises(TypeError): + assert unary.lnot(A.T).isequal(~A.T) + assert binary.lxor(True, 
B).isequal(True ^ B) + assert binary.lxor(B, True).isequal(B ^ True) + with pytest.raises(TypeError): + A ^ True + with pytest.raises(TypeError): + A ^ B + with pytest.raises(TypeError): + 6 ^ B + assert binary.lt(A, 4).isequal(A < 4) + assert binary.le(A, 4).isequal(A <= 4) + assert binary.gt(A, 4).isequal(A > 4) + assert binary.ge(A, 4).isequal(A >= 4) + assert binary.eq(A, 4).isequal(A == 4) + assert binary.ne(A, 4).isequal(A != 4) + x, y = divmod(A, 3) + assert binary.floordiv(A, 3).isequal(x) + assert binary.numpy.mod(A, 3).isequal(y) + assert binary.fmod(A, 3).isequal(y) + assert A.isequal(binary.plus((3 * x) & y)) + x, y = divmod(-A, 3) + assert binary.floordiv(-A, 3).isequal(x) + assert binary.numpy.mod(-A, 3).isequal(y) + # assert binary.fmod(-A, 3).isequal(y) # The reason we use numpy.mod + assert (-A).isequal(binary.plus((3 * x) & y)) + x, y = divmod(3, A) + assert binary.floordiv(3, A).isequal(x) + assert binary.numpy.mod(3, A).isequal(y) + assert binary.fmod(3, A).isequal(y) + assert binary.plus(binary.times(A & x) & y).isequal(3 * unary.one(A)) + x, y = divmod(-3, A) + assert binary.floordiv(-3, A).isequal(x) + assert binary.numpy.mod(-3, A).isequal(y) + # assert binary.fmod(-3, A).isequal(y) # The reason we use numpy.mod + assert binary.plus(binary.times(A & x) & y).isequal(-3 * unary.one(A)) + + assert binary.eq(A & A).isequal(A == A) + assert binary.ne(A.T & A.T).isequal(A.T != A.T) + assert binary.lt(A & A.T).isequal(A < A.T) + assert binary.ge(A.T & A).isequal(A.T >= A) + + B = A.dup() + B += 1 + assert type(B) is Matrix + assert binary.plus(A, 1).isequal(B) + B = A.dup() + B -= 1 + assert type(B) is Matrix + assert binary.minus(A, 1).isequal(B) + B = A.dup() + B *= 2 + assert type(B) is Matrix + assert binary.times(A, 2).isequal(B) + B = A.dup(dtype=float) + B /= 2 + assert type(B) is Matrix + assert binary.truediv(A, 2).isequal(B) + B = A.dup() + B //= 2 + assert type(B) is Matrix + assert binary.floordiv(A, 2).isequal(B) + B = A.dup() + B %= 2 + assert type(B) is Matrix + assert binary.numpy.mod(A, 2).isequal(B) + B = A.dup() + B **= 2 + assert type(B) is Matrix + assert binary.pow(A, 2).isequal(B) + B = A.dup(dtype=bool) + B ^= True + assert type(B) is Matrix + assert B.isequal(~A.dup(dtype=bool)) + B = A.dup(dtype=bool) + B ^= B + assert type(B) is Matrix + assert not B.reduce_scalar(agg.any).new() + + expr = binary.plus(A & A) + assert unary.abs(expr).isequal(abs(expr)) + assert unary.ainv(expr).isequal(-expr) + with pytest.raises(TypeError): + assert unary.lnot(expr).isequal(~expr) + with pytest.raises(TypeError): + expr += 1 + with pytest.raises(TypeError): + expr -= 1 + with pytest.raises(TypeError): + expr *= 1 + with pytest.raises(TypeError): + expr /= 1 + with pytest.raises(TypeError): + expr //= 1 + with pytest.raises(TypeError): + expr %= 1 + with pytest.raises(TypeError): + expr **= 1 + with pytest.raises(TypeError): + expr ^= 1 @pytest.mark.slow @@ -3511,15 +3891,15 @@ def test_deprecated(A, A_chunks): A.ss.scan_columns() -def test_ndim(A, A_chunks): - A_ = A - for chunks in A_chunks: - A = A_.dup() - A.rechunk(chunks=chunks, inplace=True) - assert A.ndim == 2 - assert A.ewise_mult(A).ndim == 2 - assert (A & A).ndim == 2 - assert (A @ A).ndim == 2 +def test_ndim(As, A_chunks): + for A_ in As: + for chunks in A_chunks: + A = A_.dup() + A.rechunk(chunks=chunks, inplace=True) + assert A.ndim == 2 + assert A.ewise_mult(A).ndim == 2 + assert (A & A).ndim == 2 + assert (A @ A).ndim == 2 @pytest.mark.xfail("'Needs investigation'", strict=True) diff --git 
a/tests/from_grblas2/test_vector.py b/tests/from_grblas2/test_vector.py index fdbd92f..c8bb84c 100644 --- a/tests/from_grblas2/test_vector.py +++ b/tests/from_grblas2/test_vector.py @@ -4,6 +4,7 @@ import sys import weakref +import dask.array as da import dask_grblas import grblas import numpy as np @@ -124,6 +125,138 @@ def test_from_values(): Vector.from_values([0], [1, 2]) +def test_from_values_dask(): + indices = da.from_array(np.array([0, 1, 3])) + values = da.from_array(np.array([True, False, True])) + u = Vector.from_values(indices, values) + assert u.size == 4 + assert u.nvals == 3 + assert u.dtype == bool + values = da.from_array(np.array([12.3, 12.4, 12.5])) + u2 = Vector.from_values(indices, values, size=17) + assert u2.size == 17 + assert u2.nvals == 3 + assert u2.dtype == float + indices = da.from_array(np.array([0, 1, 1])) + values = da.from_array(np.array([1, 2, 3], dtype=np.int64)) + u3 = Vector.from_values(indices, values, size=10, dup_op=binary.times) + assert u3.size == 10 + assert u3.nvals == 2 # duplicates were combined + assert u3.dtype == int + assert u3[1].value == 6 # 2*3 + values = da.from_array(np.array([True, True, True])) + with pytest.raises(ValueError, match="Duplicate indices found"): + # Duplicate indices requires a dup_op + Vector.from_values(indices, values).compute() + empty_da = da.from_array(np.array([])) + with pytest.raises(ValueError, match="No indices provided. Unable to infer size."): + Vector.from_values(empty_da, empty_da).compute() + + # Changed: Assume empty value is float64 (like numpy) + # with pytest.raises(ValueError, match="No values provided. Unable to determine type"): + w = Vector.from_values(empty_da, empty_da, size=10) + assert w.size == 10 + assert w.nvals == 0 + assert w.dtype == dtypes.FP64 + + with pytest.raises(ValueError, match="No indices provided. 
@@ -124,6 +125,138 @@ def test_from_values():
         Vector.from_values([0], [1, 2])


+def test_from_values_dask():
+    indices = da.from_array(np.array([0, 1, 3]))
+    values = da.from_array(np.array([True, False, True]))
+    u = Vector.from_values(indices, values)
+    assert u.size == 4
+    assert u.nvals == 3
+    assert u.dtype == bool
+    values = da.from_array(np.array([12.3, 12.4, 12.5]))
+    u2 = Vector.from_values(indices, values, size=17)
+    assert u2.size == 17
+    assert u2.nvals == 3
+    assert u2.dtype == float
+    indices = da.from_array(np.array([0, 1, 1]))
+    values = da.from_array(np.array([1, 2, 3], dtype=np.int64))
+    u3 = Vector.from_values(indices, values, size=10, dup_op=binary.times)
+    assert u3.size == 10
+    assert u3.nvals == 2  # duplicates were combined
+    assert u3.dtype == int
+    assert u3[1].value == 6  # 2*3
+    values = da.from_array(np.array([True, True, True]))
+    with pytest.raises(ValueError, match="Duplicate indices found"):
+        # Duplicate indices requires a dup_op
+        Vector.from_values(indices, values).compute()
+    empty_da = da.from_array(np.array([]))
+    with pytest.raises(ValueError, match="No indices provided. Unable to infer size."):
+        Vector.from_values(empty_da, empty_da).compute()
+
+    # Changed: Assume empty value is float64 (like numpy)
+    # with pytest.raises(ValueError, match="No values provided. Unable to determine type"):
+    w = Vector.from_values(empty_da, empty_da, size=10)
+    assert w.size == 10
+    assert w.nvals == 0
+    assert w.dtype == dtypes.FP64
+
+    with pytest.raises(ValueError, match="No indices provided. Unable to infer size"):
+        Vector.from_values(empty_da, empty_da, dtype=dtypes.INT64)
+    u4 = Vector.from_values(empty_da, empty_da, size=10, dtype=dtypes.INT64)
+    u5 = Vector.new(dtypes.INT64, size=10)
+    assert u4.isequal(u5, check_dtype=True)
+
+    # we check index dtype if given a dask array
+    indices = da.from_array(np.array([1.2, 3.4]))
+    values = da.from_array(np.array([1, 2]))
+    with pytest.raises(ValueError, match="indices must be integers, not float64"):
+        Vector.from_values(indices, values).compute()
+
+    # mis-matched sizes
+    indices = da.from_array(np.array([0]))
+    with pytest.raises(ValueError, match="`indices` and `values` lengths must match"):
+        Vector.from_values(indices, values).compute()
+
+
+def test_from_values_DOnion(v):
+    indices = da.from_array(np.array([0, 1, 3]))
+    values = da.from_array(np.array([True, False, True]))
+
+    # The following creates a Vector `u` with `type(u._delayed) == DOnion`
+    # because keyword argument `size` has not been specified:
+    u = Vector.from_values(indices, values)
+    assert u.size == 4
+    assert u.nvals == 3
+    assert u.dtype == bool
+
+    # The output of `.to_values()` is always a tuple of DOnions
+    indices, values = u.to_values()
+
+    # The following creates a Vector `v` with `type(v._delayed) == DOnion`
+    # because arguments `indices` and `values` are DOnions:
+    v = Vector.from_values(indices, values)
+    assert v.size == 4
+    assert v.nvals == 3
+    assert v.dtype == bool
+    values = da.from_array(np.array([12.3, 12.4, 12.5]))
+
+    # The following creates a Vector `u2` with `type(u2._delayed) == DOnion`
+    # because argument `indices` is a DOnion:
+    u2 = Vector.from_values(indices, values, size=17)
+    assert u2.size == 17
+    assert u2.nvals == 3
+    assert u2.dtype == float
+
+    indices = da.from_array(np.array([0, 1, 1]))
+    indices_ = da.from_array(np.array([1, 2, 3]))
+    i0 = Vector.from_values(indices_, indices)
+    _, indices = i0.to_values()
+    values = da.from_array(np.array([1, 2, 3], dtype=np.int64))
+
+    # The following creates a Vector `u3` with `type(u3._delayed) == DOnion`
+    # because arguments `indices` and `values` are DOnions:
+    u3 = Vector.from_values(indices, values, size=10, dup_op=binary.times)
+    assert u3.size == 10
+    assert u3.nvals == 2  # duplicates were combined
+    assert u3.dtype == int
+    assert u3[1].value == 6  # 2*3
+
+    values = da.from_array(np.array([True, True, True]))
+    with pytest.raises(ValueError, match="Duplicate indices found"):
+        # Duplicate indices requires a dup_op
+        Vector.from_values(indices, values).compute()
+    _, empty_da = Vector.new(float).to_values()
+    with pytest.raises(ValueError, match="No indices provided. Unable to infer size."):
+        Vector.from_values(empty_da, empty_da).compute()
+
+    # Changed: Assume empty value is float64 (like numpy)
+    # with pytest.raises(ValueError, match="No values provided. Unable to determine type"):
+    w = Vector.from_values(empty_da, empty_da, size=10)
+    assert w.size == 10
+    assert w.nvals == 0
+    assert w.dtype == dtypes.FP64
+
+    with pytest.raises(ValueError, match="No indices provided. Unable to infer size"):
+        Vector.from_values(empty_da, empty_da, dtype=dtypes.INT64).compute()
+    u4 = Vector.from_values(empty_da, empty_da, size=10, dtype=dtypes.INT64)
+    u5 = Vector.new(dtypes.INT64, size=10)
+    assert u4.isequal(u5, check_dtype=True)
+
+    # we check index dtype if given a dask array
+    indices = da.from_array(np.array([1.2, 3.4]))
+    values = da.from_array(np.array([1, 2]))
+    i0 = Vector.from_values(values, indices)
+    _, indices = i0.to_values()
+    with pytest.raises(ValueError, match="indices must be integers, not float64"):
+        Vector.from_values(indices, values).compute()
+
+    # mis-matched sizes
+    indices = da.from_array(np.array([0]))
+    i0 = Vector.from_values(indices, indices)
+    indices, _ = i0.to_values()
+    with pytest.raises(ValueError, match="`indices` and `values` lengths must match"):
+        Vector.from_values(indices, values).compute()
+
+
 def test_from_values_scalar():
     u = Vector.from_values([0, 1, 3], 7)
     assert u.size == 4
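test_from_values_DOnion above pins down when the DOnion wrapper appears: whenever a Vector's metadata (its size, say) is itself the result of a dask computation, `x._delayed` is a DOnion rather than a plain dask array, and `.to_values()` on such a Vector yields DOnions in turn. A sketch of the distinction, inferred from the comments in the test; like the test, it only inspects `type(x._delayed)`:

    import dask.array as da
    import numpy as np
    from dask_grblas import Vector

    indices = da.from_array(np.array([0, 1, 3]))
    values = da.from_array(np.array([True, False, True]))

    u = Vector.from_values(indices, values)          # size must be inferred lazily,
                                                     # so type(u._delayed) is DOnion
    w = Vector.from_values(indices, values, size=4)  # size known up front: no DOnion
    i, v = u.to_values()                             # a tuple of DOnions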
@@ -157,10 +290,10 @@ def test_resize(v):
     v.resize(20)
     assert v.size == 20
     assert v.nvals == 4
-    assert compute(v[19].value) is None
+    assert v[19].new().value == None
     v.resize(4)
     assert v.size == 4
-    assert v.nvals.compute() == 2
+    assert v.nvals == 2

     v = v_.dup()
     v.rechunk(chunks=2, inplace=True)
@@ -170,11 +303,11 @@
     v.resize(20, chunks=5)
     assert v.size == 20
     assert v.nvals == 4
-    assert compute(v[19].value) is None
+    assert v[19].new().value == None
     assert v._delayed.chunks == ((5, 5, 5, 5),)
     v.resize(4, chunks=3)
     assert v.size == 4
-    assert v.nvals.compute() == 2
+    assert v.nvals == 2
     assert v._delayed.chunks == ((3, 1),)
@@ -217,20 +350,20 @@ def test_build_scalar(v):

 def test_extract_values(v):
     idx, vals = v.to_values()
-    np.testing.assert_array_equal(idx, (1, 3, 4, 6))
-    np.testing.assert_array_equal(vals, (1, 1, 2, 0))
+    np.testing.assert_array_equal(idx.compute(), (1, 3, 4, 6))
+    np.testing.assert_array_equal(vals.compute(), (1, 1, 2, 0))
     assert idx.dtype == np.uint64
     assert vals.dtype == np.int64
     idx, vals = v.to_values(dtype=int)
-    np.testing.assert_array_equal(idx, (1, 3, 4, 6))
-    np.testing.assert_array_equal(vals, (1, 1, 2, 0))
+    np.testing.assert_array_equal(idx.compute(), (1, 3, 4, 6))
+    np.testing.assert_array_equal(vals.compute(), (1, 1, 2, 0))
     assert idx.dtype == np.uint64
     assert vals.dtype == np.int64
     idx, vals = v.to_values(dtype=float)
-    np.testing.assert_array_equal(idx, (1, 3, 4, 6))
-    np.testing.assert_array_equal(vals, (1, 1, 2, 0))
+    np.testing.assert_array_equal(idx.compute(), (1, 3, 4, 6))
+    np.testing.assert_array_equal(vals.compute(), (1, 1, 2, 0))
     assert idx.dtype == np.uint64
     assert vals.dtype == np.float64
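The test_extract_values changes above reflect that `to_values()` now returns lazy arrays: `.dtype` is still available eagerly, but the contents need an explicit `.compute()` before comparing with numpy. A minimal sketch:

    import numpy as np
    from dask_grblas import Vector

    v = Vector.from_values([1, 3, 4, 6], [1, 1, 2, 0])
    idx, vals = v.to_values()        # lazy outputs, not numpy arrays
    assert idx.dtype == np.uint64    # metadata needs no compute
    np.testing.assert_array_equal(idx.compute(), (1, 3, 4, 6))
    np.testing.assert_array_equal(vals.compute(), (1, 1, 2, 0))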
diff --git a/tests/test_functools.py b/tests/test_functools.py
new file mode 100644
index 0000000..92dcdd8
--- /dev/null
+++ b/tests/test_functools.py
@@ -0,0 +1,39 @@
+import pytest
+from functools import partial
+from dask_grblas.functools import flexible_partial, skip
+
+
+def func(a, b, c, d, e, f):
+    return a, b, c, d, e, f
+
+
+def funk(a, b, c, d, e, f, ka="a", kb="b", kc="c"):
+    return a, b, c, d, e, f, ka, kb, kc
+
+
+def test_flexible_partial():
+    # without keywords
+    part_func = flexible_partial(func, skip, 2, skip, skip, 5)
+    result = part_func(1, 3, 4, 6)
+    assert result == (1, 2, 3, 4, 5, 6)
+
+    # with keywords
+    part_funk = flexible_partial(funk, skip, 2, skip, skip, 5, kb="B")
+    result = part_funk(1, 3, 4, 6, kc="C")
+    assert result == (1, 2, 3, 4, 5, 6, "a", "B", "C")
+
+    # apply a 2nd `flexible_partial` on the first `flexible_partial`:
+    part_funk2 = flexible_partial(part_funk, 1, skip, 4, ka="A")
+    result = part_funk2(3, 6, kc="C")
+    assert result == (1, 2, 3, 4, 5, 6, "A", "B", "C")
+
+    # or apply a `partial` on the first `flexible_partial`:
+    part_funk2 = partial(part_funk, 1, 3, ka="A")
+    result = part_funk2(4, 6, kc="C")
+    assert result == (1, 2, 3, 4, 5, 6, "A", "B", "C")
+
+    # or apply a `flexible_partial` on a `partial`:
+    part_funk = partial(funk, 1, 2, kb="B")
+    part_funk2 = flexible_partial(part_funk, skip, 4, ka="A")
+    result = part_funk2(3, 5, 6, kc="C")
+    assert result == (1, 2, 3, 4, 5, 6, "A", "B", "C")
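flexible_partial, exercised above, generalizes functools.partial: the `skip` sentinel holds a positional slot open, and call-time arguments fill the skipped slots from left to right before extending past the frozen ones. A small usage sketch; `clamp` is a hypothetical function, not part of the library:

    from dask_grblas.functools import flexible_partial, skip

    def clamp(lo, x, hi):
        return max(lo, min(x, hi))

    clamp01 = flexible_partial(clamp, 0.0, skip, 1.0)  # freeze lo and hi, keep x open
    assert clamp01(1.7) == 1.0    # x=1.7 fills the skipped slot
    assert clamp01(-0.3) == 0.0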