From fe70198e59740c9e792fbba0cec1b4943ac30cce Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Fri, 13 Jan 2023 00:01:25 -0600 Subject: [PATCH 001/126] first try on compressed contraction --- qtensor/compression/CompressedTensor.py | 115 ++++++++++++++++++ qtensor/compression/__init__.py | 2 + qtensor/compression/compressed_contraction.py | 108 ++++++++++++++++ .../compression/test_compressed_contract.py | 54 ++++++++ 4 files changed, 279 insertions(+) create mode 100644 qtensor/compression/CompressedTensor.py create mode 100644 qtensor/compression/__init__.py create mode 100644 qtensor/compression/compressed_contraction.py create mode 100644 qtensor/compression/test_compressed_contract.py diff --git a/qtensor/compression/CompressedTensor.py b/qtensor/compression/CompressedTensor.py new file mode 100644 index 00000000..8ce30f84 --- /dev/null +++ b/qtensor/compression/CompressedTensor.py @@ -0,0 +1,115 @@ +import itertools +import numpy as np +import qtree +import io +from qtree.optimizer import Tensor, Var + +def iterate_indices(indices: list): + if len(indices)==0: + return [tuple()] + ranges = [range(v.size) for v in indices] + return itertools.product(*ranges) + +class Compressor(): + def compress(self, data): + print(f"Compressing len {data.size}") + comp = io.BytesIO() + np.savez_compressed(comp, data) + return comp + + def decompress(self, ptr): + ptr.seek(0) + print(f"Loading arr.") + return np.load(ptr)['arr_0'] + + +class CompressedTensor(Tensor): + """ + Extension of the Tensor class that holds compressed data + + The data array is split along several indices S into 2^|S| parts + + """ + def __init__(self, name, indices, + data_key=None, data=None, + slice_indices=[], + compressor=Compressor() + ): + """ + Initialize the tensor + name: str, + the name of the tensor. Used only for display/convenience. + May be not unique. + indices: tuple, + Indices of the tensor + shape: tuple, + shape of a tensor + data_key: int + Key to find tensor's data in the global storage + data: np.array + Actual data of the tensor. Default None. + Usually is not supplied at initialization. 
+ slice_indices: list[Var] + indices along which the tensor is split into chunks + """ + super().__init__(name, indices, data_key=data_key, data=data) + self.slice_indices = slice_indices + self.compressor = compressor + + def slice(self, indices: list): + """ + Slice the self.data along dimensions in `indices`, + store them compressed + """ + slice_dict = { + i: slice(None) for i in self.indices + } + data_chunks = [] + for ivals in iterate_indices(indices): + for ix, ival in zip(indices, ivals): + slice_dict[ix] = ival# slice(ival, ival+1) + dslice = self.data[tuple(slice_dict[i] for i in self.indices)] + + data_chunks.append( + self.compressor.compress(dslice) + ) + del dslice + self._data = data_chunks + self.slice_indices = indices + + @property + def array_indices(self): + return [x for x in self.indices if x not in self.slice_indices] + + def get_chunk(self, ivals): + dims = [v.size for v in self.slice_indices] + if len(ivals)==0: + flat_ix = 0 + else: + flat_ix = np.ravel_multi_index(ivals, dims) + ptr = self._data[flat_ix] + return self.compressor.decompress(ptr) + + def set_chunk(self, ivals, chunk:np.array): + if self._data is None: + self._data = np.empty(2**len(self.slice_indices), dtype=object) + dims = [v.size for v in self.slice_indices] + if len(ivals)==0: + flat_ix = 0 + else: + flat_ix = np.ravel_multi_index(ivals, dims) + self._data[flat_ix] = self.compressor.compress(chunk) + + def __str__(self): + array_ix = ','.join(map(str, self.array_indices)) + split_ix= ','.join(map(str, self.slice_indices)) + return f'{self._name}{{{split_ix}}}({array_ix})' + + def copy(self, name=None, indices=None, data_key=None, data=None): + raise NotImplementedError() + + def __repr__(self): + return self.__str__() + + + diff --git a/qtensor/compression/__init__.py b/qtensor/compression/__init__.py new file mode 100644 index 00000000..ea673018 --- /dev/null +++ b/qtensor/compression/__init__.py @@ -0,0 +1,2 @@ +from .CompressedTensor import CompressedTensor, Tensor +from .compressed_contraction import compressed_contract diff --git a/qtensor/compression/compressed_contraction.py b/qtensor/compression/compressed_contraction.py new file mode 100644 index 00000000..86eb8244 --- /dev/null +++ b/qtensor/compression/compressed_contraction.py @@ -0,0 +1,108 @@ +import numpy as np + +from qtensor.compression import CompressedTensor +from .CompressedTensor import Tensor, iterate_indices + +def compressed_contract(A:Tensor, B: Tensor, + result_ixs, contract_ixs, + mem_limit): + all_indices = list(set(A.indices).union(B.indices)) + all_indices.sort(key=int, reverse=True) + result_indices = list(set(all_indices) - set(contract_ixs)) + result_indices.sort(key=int, reverse=True) + to_small_int = lambda x: all_indices.index(x) + + exist_compressed = [] + for T in [A, B]: + if isinstance(T, CompressedTensor): + exist_compressed += T.slice_indices + + exist_compressed.sort(key=int, reverse=True) + need_compressed = result_indices[:-mem_limit] + print(f"Need compression: {need_compressed}") + add_compress = set(need_compressed) - set(exist_compressed) + remove_compress = set(exist_compressed) - set(need_compressed) + retain_compress = set(exist_compressed).intersection(need_compressed) + add_compress = list(add_compress) + remove_compress = list(remove_compress) + retain_compress = list(retain_compress) + + remove_compress.sort(key=int, reverse=True) + retain_compress.sort(key=int, reverse=True) + add_compress.sort(key=int, reverse=True) + + newT_name= 'C'+str(int(all_indices[0])) + + if 
len(need_compressed)==0 and len(exist_compressed)==0: + A_ixs = list(map(int, A.indices)) + B_ixs = list(map(int, B.indices)) + + result = np.einsum(A.data, A_ixs, B.data, B_ixs) + return Tensor(newT_name, all_indices, data=result) + + R = CompressedTensor(newT_name, + result_indices, + slice_indices=need_compressed + ) + + chunk_ixs = list(set(result_indices) - set(need_compressed)) + chunk_ixs.sort(key=int, reverse=True) + print(f"Chunk indices: {chunk_ixs}") + print(f"Add compression: {add_compress}, Retain compression: {retain_compress}, remove_compress: {remove_compress}") + slice_dict = {i: slice(None) for i in all_indices} + for iadd in iterate_indices(add_compress): + for iret in iterate_indices(retain_compress): + + chunk = np.empty(2**len(chunk_ixs), dtype=B.data.dtype) + chunk = chunk.reshape(*(v.size for v in chunk_ixs)) + for irm in iterate_indices(remove_compress): + for i, ival in zip(remove_compress, irm): + slice_dict[i] = ival#slice(ival, ival+1) + chunk_view = chunk[tuple( + slice_dict[i] for i in chunk_ixs + )] + if isinstance(A, CompressedTensor): + A_data = A.get_chunk(iret+irm) + A_ixs = A.array_indices + else: + A_data = A.data + A_ixs = A.indices + + # TODO: handle when A and B are sliced differently + if isinstance(B, CompressedTensor): + B_data = B.get_chunk(iret+irm) + B_ixs = B.array_indices + else: + B_data = B.data + B_ixs = B.indices + # -- + for ia, iaval in zip(add_compress, iadd): + slice_dict[ia] = iaval#slice(iaval, iaval+1) + ixsa = set(add_compress).intersection(B_ixs) + if len(ixsa): + B_data = B_data[tuple( + slice_dict[i] for i in B_ixs + )] + for _del in ixsa: + B_ixs = tuple(i for i in B_ixs if i!=_del) + + A_ixs = list(map(int, A_ixs)) + B_ixs = list(map(int, B_ixs)) + + C_ixs = list(map(int, [v for v in chunk_ixs if v not in exist_compressed])) + #print(f"A indices: {A_ixs}, B indices: {B_ixs}, C indices:{C_ixs}") + A_str = ''.join(chr(97+int(v)) for v in A_ixs) + B_str = ''.join(chr(97+int(v)) for v in B_ixs) + C_str = ''.join(chr(97+int(v)) for v in C_ixs) + expr = f"{A_str},{B_str}->{C_str}" + #np.einsum(A_data, A_ixs, B_data, B_ixs, C_ixs, out=chunk_view) + print(f"Expr: {expr}") + np.einsum(expr, A_data, B_data, out=chunk_view) + R.set_chunk(iadd+iret, chunk) + return R + + + + + + diff --git a/qtensor/compression/test_compressed_contract.py b/qtensor/compression/test_compressed_contract.py new file mode 100644 index 00000000..76a25e42 --- /dev/null +++ b/qtensor/compression/test_compressed_contract.py @@ -0,0 +1,54 @@ +from qtensor.compression import compressed_contract, CompressedTensor, Tensor +from qtree.optimizer import Var +import numpy as np + + +def test_compressed_contract(): + A_ixs = [Var(x) for x in [8,7,6,5,4,3, 2]] + A_comp = [Var(x) for x in [8, 7, 6]] + B_ixs = [Var(x) for x in [9, 3, 4, 2]] + contract_ixs = [Var(x) for x in [3,2]] + + A_data = np.ones(2**len(A_ixs)) + #A_data = np.random.randn(2**len(A_ixs)) + A_data = A_data.reshape(*(v.size for v in A_ixs)) + A_data[1, 1] *= 2 + A_data[0, 1] *= 2 + A_data[:, :, :, 1] *= 1.2 + B_data = np.ones(2**len(B_ixs))*1.2 + #B_data = np.random.randn(2**len(B_ixs))*1.2 + B_data = B_data.reshape(*(v.size for v in B_ixs)) + + A = CompressedTensor('A', A_ixs, data=A_data) + A.slice(A_comp) + B = Tensor('B', B_ixs, data=B_data) + print(f"Tensor A: {A}") + print(f"Tensor B: {B}") + + res_ixs = list(set(A_ixs).union(B_ixs) - set(contract_ixs)) + res_ixs.sort(key=int, reverse=True) + res = compressed_contract(A, B, res_ixs, contract_ixs, + mem_limit=3) + print(f"Resulting Tensor: {res}") + 
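+    # mem_limit appears to bound the number of uncompressed result indices:
+    # with mem_limit=3 the result is split along result_indices[:-3] and
+    # those chunks are stored compressed; with mem_limit=10 (below), larger
+    # than len(res_ixs), the whole result stays in a single chunk that can
+    # be read back with res.get_chunk(()).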
+ res = compressed_contract(A, B, res_ixs, contract_ixs, + mem_limit=10) + + print(f"Resulting Tensor: {res}") + print(res.get_chunk(()).flatten()) + + + A_str = ''.join(chr(97+int(v)) for v in A_ixs) + B_str = ''.join(chr(97+int(v)) for v in B_ixs) + C_str = ''.join(chr(97+int(v)) for v in res_ixs) + expr = f"{A_str},{B_str}->{C_str}" + C = np.einsum(expr, A_data, B_data) + print(f"Ground truth:") + print( C.flatten()) + + assert np.allclose(C, res.get_chunk(())) + print("Success!") + + +if __name__=="__main__": + test_compressed_contract() From 22199b8e7b717dee9a3c320eea7a8fc1c6aa968f Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Sat, 14 Jan 2023 09:01:51 -0600 Subject: [PATCH 002/126] add complexity estimation based on hypergraph --- qtensor/compression/__init__.py | 1 + qtensor/compression/cost_estimation.py | 173 ++++++++++++++++++ .../compression/test_compressed_contract.py | 2 +- qtensor/compression/test_cost_estimation.py | 45 +++++ 4 files changed, 220 insertions(+), 1 deletion(-) create mode 100644 qtensor/compression/cost_estimation.py create mode 100644 qtensor/compression/test_cost_estimation.py diff --git a/qtensor/compression/__init__.py b/qtensor/compression/__init__.py index ea673018..79eb76e0 100644 --- a/qtensor/compression/__init__.py +++ b/qtensor/compression/__init__.py @@ -1,2 +1,3 @@ from .CompressedTensor import CompressedTensor, Tensor from .compressed_contraction import compressed_contract +from .cost_estimation import compressed_contraction_cost diff --git a/qtensor/compression/cost_estimation.py b/qtensor/compression/cost_estimation.py new file mode 100644 index 00000000..49d00b85 --- /dev/null +++ b/qtensor/compression/cost_estimation.py @@ -0,0 +1,173 @@ +from dataclasses import dataclass +from functools import reduce +import numpy as np +from qtensor.optimisation import QtreeTensorNet +from typing import Iterable, Hashable, Dict + +Edge = Iterable[Hashable] +Hypergraph = Dict[Hashable, Edge] +# # self = hypergraph +# verts = set(sum(self.values(), [])) +# num_edges = len(self) + +@dataclass +class Cost: + use_log = True + flops: int + memory: int + width: int + compressions: int + decompressions: int + + def time(self, flops_second, compression_throughput, decompression_throughput, memory_limit): + """Returns the time in seconds to perform the contraction""" + return ( + self.flops / flops_second + + self.compressions *2**memory_limit/ compression_throughput + + self.decompressions *2**memory_limit/ decompression_throughput + ) + + def __add__(self, other): + return Cost( + self.flops + other.flops, + self.memory + other.memory, + max(self.width, other.width), + self.compressions + other.compressions, + self.decompressions + other.decompressions, + ) + + def format_number(self, n): + if self.use_log: + return f"{np.log2(n):.2f}" + else: + return f"{n}" + + def __str__(self): + flops_str = self.format_number(self.flops) + mems_str = self.format_number(self.memory) + return f"Cost(FLOPs={flops_str}, Memory={mems_str}, width={self.width}, compressions={self.compressions}, decompressions={self.decompressions})" + +def dual_hg(hg: Hypergraph) -> Hypergraph: + dual = {} + for iedge, edge in hg.items(): + for vert in edge: + if dual.get(vert) is None: + dual[vert] = [] + dual[vert].append(iedge) + return dual + +def tn2tn(tn: QtreeTensorNet, peo=None): + ignored_vars = tn.bra_vars + tn.ket_vars + # Vertices --> indices + # Edges --> tensors + dual_tn = { str(hex(id(t))):[x for x in t.indices if x not in ignored_vars] + for t in tn.tensors } + if peo: + dual_tn = { 
str(hex(id(t))):[peo.index(x) for x in t.indices if x not in ignored_vars] + for t in tn.tensors } + + # Vertices --> tensors + # Edges --> indices + TN = dual_hg(dual_tn) + return TN + +def pairwise_cost(indices, comp_ixs, mem_limit, contracted_ixs_count=0): + """ + Computes the cost of contracting a pair of tensors, assuming last + `contracted_ixs_count` indices are contrated + """ + all_indices = set().union(*indices) + next_indices = list(all_indices) + next_indices.sort(key=int, reverse=True) + next_indices = next_indices[:-contracted_ixs_count] + + if len(next_indices) > mem_limit or any(comp_ixs): + next_comp_ixs= next_indices[:-mem_limit] + rm_comp = set().union(*comp_ixs) - set(next_comp_ixs) + decompressions = 2**(len(rm_comp)+len(next_comp_ixs)) + compressions = 2**len(next_comp_ixs) + else: + next_comp_ixs = [] + decompressions = 0 + compressions = 0 + return ( + next_indices, + next_comp_ixs, + Cost( + memory = 2**len(next_indices), + flops = 2**len(all_indices), + width = len(next_indices), + compressions = compressions, + decompressions = decompressions, + ) + ) + + +def bucket_contract_cost(indices, comp_ixs, mem_limit): + ixs, compixs = indices[0], comp_ixs[0] + costs = [] + for i in range(1, len(indices)-1): + ixs, compixs, cost = pairwise_cost( + [ixs, indices[i]], + [compixs, comp_ixs[i]], + mem_limit, contracted_ixs_count=0 + ) + costs.append(cost) + new_ixs, new_comp_ixs, cost = pairwise_cost( + [ixs, indices[-1]], + [compixs, comp_ixs[-1]], + mem_limit, contracted_ixs_count=1 + ) + costs.append(cost) + return new_ixs, new_comp_ixs, sum(costs[1:], costs[0]) + +def contract_with_cost(TN, comp_ixs, dual_TN, vertex, mem_limit=30): + """ + Contracts vertex from TN + TN is a mapping from indices to [tensor] + """ + tensors = TN[vertex] + # contract + tensors.sort(key=lambda t: len(dual_TN[t])) + indices = [dual_TN[t] for t in tensors] + comp_itensors = [comp_ixs.get(t, []) for t in tensors] + _, compressed, cost = bucket_contract_cost(indices, comp_itensors, mem_limit) + + result_ixs = set().union(*indices) + result_ixs.remove(vertex) + # This can be random but should be unique + tensor_id = str(hex(id(vertex))) + comp_ixs[tensor_id] = compressed + # -- remove tensors + for t in tensors: + for v in dual_TN[t]: + TN[v].remove(t) + del dual_TN[t] + # remove vertex + for t in TN[vertex]: + dual_TN[t].remove(vertex) + del TN[vertex] + # -- add result + for ix in result_ixs: + if TN.get(ix) is None: + TN[ix] = [] + TN[ix].append(tensor_id) + dual_TN[tensor_id] = list(result_ixs) + # -- + return cost + + +def compressed_contraction_cost(tn, peo, mem_limit=None): + """ + Compute the cost of a contraction with compression. 
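+
+    Args (as used in the code below):
+        tn: QtreeTensorNet to estimate the contraction cost for
+        peo: elimination order (list of index Vars) to contract along
+        mem_limit: maximum number of uncompressed indices per intermediate
+            tensor; wider intermediates are assumed to be stored compressed
+
+    Returns:
+        list of Cost objects, one per contracted index in `peo`
+        (bra/ket indices are skipped)
+
+    Illustrative usage (mirrors test_cost_estimation.py; the mem_limit
+    value here is only an example):
+        costs = compressed_contraction_cost(tn, peo, mem_limit=25)
+        total = sum(costs[1:], costs[0])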
+ """ + TN = tn2tn(tn) + ignored_vars = tn.bra_vars + tn.ket_vars + peo = [x for x in peo if x not in ignored_vars] + costs = [] + dual_TN = dual_hg(TN) + comp_ixs = {} + for i in peo: + cost = contract_with_cost(TN, comp_ixs, dual_TN, i, mem_limit) + costs.append(cost) + return costs diff --git a/qtensor/compression/test_compressed_contract.py b/qtensor/compression/test_compressed_contract.py index 76a25e42..75ccb277 100644 --- a/qtensor/compression/test_compressed_contract.py +++ b/qtensor/compression/test_compressed_contract.py @@ -6,7 +6,7 @@ def test_compressed_contract(): A_ixs = [Var(x) for x in [8,7,6,5,4,3, 2]] A_comp = [Var(x) for x in [8, 7, 6]] - B_ixs = [Var(x) for x in [9, 3, 4, 2]] + B_ixs = [Var(x) for x in [10, 9, 3, 4, 2]] contract_ixs = [Var(x) for x in [3,2]] A_data = np.ones(2**len(A_ixs)) diff --git a/qtensor/compression/test_cost_estimation.py b/qtensor/compression/test_cost_estimation.py new file mode 100644 index 00000000..b0eb9b1e --- /dev/null +++ b/qtensor/compression/test_cost_estimation.py @@ -0,0 +1,45 @@ +import qtensor +import numpy as np +from qtensor.compression import compressed_contraction_cost +from qtensor.tests import get_test_problem +from qtensor.optimisation import QtreeTensorNet +from qtensor import QtreeQAOAComposer + + +def costs_to_csv(costs): + first_line = "flops, memory, width, compressions, decompressions, time" + lines = [first_line] + for i, c in enumerate(costs): + time = c.time(1e6, 1e5, 1e5, 13) + lines.append(f"[{i}]\t{c.flops},\t{c.memory},\t{c.width},\t {c.compressions},\t{c.decompressions},\t{time}") + return "\n".join(lines) + +def test_compressed_contraction_cost(): + G, gamma, beta = get_test_problem(n=15, p=3, d=4) + + composer = QtreeQAOAComposer( + graph=G, gamma=gamma, beta=beta) + composer.ansatz_state() + + tn = QtreeTensorNet.from_qtree_gates(composer.circuit) + max_time = 15 + opt = qtensor.toolbox.get_ordering_algo('rgreedy') + peo, t = opt.optimize(tn) + print(f"Contraction width: {opt.treewidth}") + costs = compressed_contraction_cost(tn, peo, mem_limit=opt.treewidth-5) + cost = sum(costs[1:], costs[0]) + print(costs_to_csv(costs)) + print(cost) + mems_lg, flops_lg = tn.simulation_cost(peo) + print(f'Contraction cost: {np.log2(sum(flops_lg))} flops, {np.log2(max(mems_lg))} memory') + ignored_vars = tn.bra_vars + tn.ket_vars + peo = [x for x in peo if x not in ignored_vars] + peo = list(map(int, peo)) + nodes, path = qtensor.utils.get_neighbors_path(tn.get_line_graph(), peo) + print("Path\n", path) + print("Path list comp\n", [c.width for c in costs]) + print("Maxw", max(path)) + assert opt.treewidth == cost.width+1 + +if __name__ == '__main__': + test_compressed_contraction_cost() From f5b1ee1a0dafc8cb5e8f29c803c12f57ed17be00 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Sat, 14 Jan 2023 09:07:58 -0600 Subject: [PATCH 003/126] update gitignore (wily, pre-commit) --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 82951579..8fd8aa8d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +.pre-commit-config.yaml .DS_Store # Byte-compiled / optimized / DLL files __pycache__/ From f258e68230c8e6cce50215b31349d14e6b6b1ac3 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Sat, 14 Jan 2023 10:38:16 -0600 Subject: [PATCH 004/126] more correct memory estimation with compression --- qtensor/compression/cost_estimation.py | 97 +++++++++++++++------ qtensor/compression/test_cost_estimation.py | 15 ++-- 2 files changed, 78 insertions(+), 34 deletions(-) diff --git 
a/qtensor/compression/cost_estimation.py b/qtensor/compression/cost_estimation.py index 49d00b85..fd39795f 100644 --- a/qtensor/compression/cost_estimation.py +++ b/qtensor/compression/cost_estimation.py @@ -30,7 +30,7 @@ def time(self, flops_second, compression_throughput, decompression_throughput, m def __add__(self, other): return Cost( self.flops + other.flops, - self.memory + other.memory, + max(self.memory, other.memory), max(self.width, other.width), self.compressions + other.compressions, self.decompressions + other.decompressions, @@ -56,45 +56,69 @@ def dual_hg(hg: Hypergraph) -> Hypergraph: dual[vert].append(iedge) return dual +def remove_vertices_tensors(TN, dual_TN, vertices=[], tensors=[]): + for t in tensors: + # -- remove tensor + for v in dual_TN[t]: + TN[v].remove(t) + del dual_TN[t] + + for vertex in vertices: + # remove vertex + for t in TN[vertex]: + dual_TN[t].remove(vertex) + del TN[vertex] + def tn2tn(tn: QtreeTensorNet, peo=None): ignored_vars = tn.bra_vars + tn.ket_vars # Vertices --> indices # Edges --> tensors dual_tn = { str(hex(id(t))):[x for x in t.indices if x not in ignored_vars] for t in tn.tensors } - if peo: - dual_tn = { str(hex(id(t))):[peo.index(x) for x in t.indices if x not in ignored_vars] - for t in tn.tensors } # Vertices --> tensors # Edges --> indices TN = dual_hg(dual_tn) return TN -def pairwise_cost(indices, comp_ixs, mem_limit, contracted_ixs_count=0): +def tensor_memory(indices, mem_limit, compression_ratio): + if len(indices) > mem_limit: + return 2**len(indices)/compression_ratio + else: + return 2**len(indices) +def pairwise_cost(indices, comp_ixs, contracted_ixs=[], + mem_limit=np.inf, + compression_ratio=30, + ): """ Computes the cost of contracting a pair of tensors, assuming last `contracted_ixs_count` indices are contrated """ + contracted_ixs_count = len(contracted_ixs) all_indices = set().union(*indices) next_indices = list(all_indices) next_indices.sort(key=int, reverse=True) - next_indices = next_indices[:-contracted_ixs_count] + for i in contracted_ixs: + next_indices.remove(i) if len(next_indices) > mem_limit or any(comp_ixs): next_comp_ixs= next_indices[:-mem_limit] rm_comp = set().union(*comp_ixs) - set(next_comp_ixs) - decompressions = 2**(len(rm_comp)+len(next_comp_ixs)) + decompressions = 2**(len(rm_comp) + len(next_comp_ixs)) compressions = 2**len(next_comp_ixs) else: next_comp_ixs = [] decompressions = 0 compressions = 0 + mem = 0 + for ilist in [next_indices]+indices: + mem += tensor_memory(ilist, mem_limit, compression_ratio) + return ( next_indices, next_comp_ixs, Cost( - memory = 2**len(next_indices), + memory = mem, flops = 2**len(all_indices), width = len(next_indices), compressions = compressions, @@ -103,25 +127,43 @@ def pairwise_cost(indices, comp_ixs, mem_limit, contracted_ixs_count=0): ) -def bucket_contract_cost(indices, comp_ixs, mem_limit): +def bucket_contract_cost(indices, comp_ixs, contracted_indices, **kwargs): + """ + Computes the cost of contracting a bucket of tensors + + Args: + indices: indices of tensors in the bucket + comp_ixs: indices that are compressed + contracted_indices: indices that are contracted + **kwargs: passed to pairwise_cost + """ ixs, compixs = indices[0], comp_ixs[0] costs = [] for i in range(1, len(indices)-1): ixs, compixs, cost = pairwise_cost( [ixs, indices[i]], [compixs, comp_ixs[i]], - mem_limit, contracted_ixs_count=0 + **kwargs ) costs.append(cost) + # -- contract last two tensors new_ixs, new_comp_ixs, cost = pairwise_cost( [ixs, indices[-1]], [compixs, comp_ixs[-1]], 
- mem_limit, contracted_ixs_count=1 + contracted_ixs=contracted_indices, + **kwargs, ) costs.append(cost) - return new_ixs, new_comp_ixs, sum(costs[1:], costs[0]) - -def contract_with_cost(TN, comp_ixs, dual_TN, vertex, mem_limit=30): + new_ixs = set().union(*indices) - set(contracted_indices) + sum_cost = sum(costs[1:], costs[0]) + sum_cost.width = len(new_ixs) + ## Naive Flops calculation + # sum_cost.flops = 2**len(set().union(*indices))*(len(indices)+1) + return new_ixs, new_comp_ixs, sum_cost + +def contract_with_cost(TN, comp_ixs, dual_TN, vertex, + mem_limit=np.inf, + compression_ratio=100): """ Contracts vertex from TN TN is a mapping from indices to [tensor] @@ -130,23 +172,22 @@ def contract_with_cost(TN, comp_ixs, dual_TN, vertex, mem_limit=30): # contract tensors.sort(key=lambda t: len(dual_TN[t])) indices = [dual_TN[t] for t in tensors] - comp_itensors = [comp_ixs.get(t, []) for t in tensors] - _, compressed, cost = bucket_contract_cost(indices, comp_itensors, mem_limit) + comp_indices = [comp_ixs.get(t, []) for t in tensors] + result_ixs, compressed, cost = bucket_contract_cost(indices, comp_indices, [vertex], + mem_limit=mem_limit, + compression_ratio=compression_ratio + ) + # calculate current memory + for t_id, indices in dual_TN.items(): + if t_id in tensors: + # these tensors are accounted in bucket_contract_cost + continue + cost.memory += tensor_memory(indices, mem_limit, compression_ratio) - result_ixs = set().union(*indices) - result_ixs.remove(vertex) # This can be random but should be unique tensor_id = str(hex(id(vertex))) comp_ixs[tensor_id] = compressed - # -- remove tensors - for t in tensors: - for v in dual_TN[t]: - TN[v].remove(t) - del dual_TN[t] - # remove vertex - for t in TN[vertex]: - dual_TN[t].remove(vertex) - del TN[vertex] + remove_vertices_tensors(TN, dual_TN, [vertex], tensors) # -- add result for ix in result_ixs: if TN.get(ix) is None: @@ -157,7 +198,7 @@ def contract_with_cost(TN, comp_ixs, dual_TN, vertex, mem_limit=30): return cost -def compressed_contraction_cost(tn, peo, mem_limit=None): +def compressed_contraction_cost(tn, peo, mem_limit=np.inf): """ Compute the cost of a contraction with compression. 
""" diff --git a/qtensor/compression/test_cost_estimation.py b/qtensor/compression/test_cost_estimation.py index b0eb9b1e..58bdd451 100644 --- a/qtensor/compression/test_cost_estimation.py +++ b/qtensor/compression/test_cost_estimation.py @@ -11,11 +11,11 @@ def costs_to_csv(costs): lines = [first_line] for i, c in enumerate(costs): time = c.time(1e6, 1e5, 1e5, 13) - lines.append(f"[{i}]\t{c.flops},\t{c.memory},\t{c.width},\t {c.compressions},\t{c.decompressions},\t{time}") + lines.append(f"[{i}]\t{c.flops},\t{round(c.memory)},\t{c.width},\t {c.compressions},\t{c.decompressions},\t{time}") return "\n".join(lines) def test_compressed_contraction_cost(): - G, gamma, beta = get_test_problem(n=15, p=3, d=4) + G, gamma, beta = get_test_problem(n=20, p=4, d=4) composer = QtreeQAOAComposer( graph=G, gamma=gamma, beta=beta) @@ -23,15 +23,18 @@ def test_compressed_contraction_cost(): tn = QtreeTensorNet.from_qtree_gates(composer.circuit) max_time = 15 - opt = qtensor.toolbox.get_ordering_algo('rgreedy') + opt = qtensor.toolbox.get_ordering_algo('greedy') peo, t = opt.optimize(tn) print(f"Contraction width: {opt.treewidth}") - costs = compressed_contraction_cost(tn, peo, mem_limit=opt.treewidth-5) + M_limit = opt.treewidth + 10 + costs = compressed_contraction_cost(tn, peo, mem_limit=M_limit) cost = sum(costs[1:], costs[0]) print(costs_to_csv(costs)) print(cost) + print(f'Contraction time estimate: {cost.time(1e6, 1e5, 1e5, M_limit)} seconds') mems_lg, flops_lg = tn.simulation_cost(peo) - print(f'Contraction cost: {np.log2(sum(flops_lg))} flops, {np.log2(max(mems_lg))} memory') + print(f'Contraction cost (old): {np.log2(sum(flops_lg))} flops, {np.log2(max(mems_lg))} memory') + print(f'Contraction time (old): {sum(flops_lg)/1e6} seconds') ignored_vars = tn.bra_vars + tn.ket_vars peo = [x for x in peo if x not in ignored_vars] peo = list(map(int, peo)) @@ -39,7 +42,7 @@ def test_compressed_contraction_cost(): print("Path\n", path) print("Path list comp\n", [c.width for c in costs]) print("Maxw", max(path)) - assert opt.treewidth == cost.width+1 + assert opt.treewidth == cost.width if __name__ == '__main__': test_compressed_contraction_cost() From 9540aaebc77ed594f1bb27dcc827358438d2a13c Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Sat, 14 Jan 2023 10:45:40 -0600 Subject: [PATCH 005/126] add compression_ratio to arguments --- qtensor/compression/cost_estimation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qtensor/compression/cost_estimation.py b/qtensor/compression/cost_estimation.py index fd39795f..dcc72b2d 100644 --- a/qtensor/compression/cost_estimation.py +++ b/qtensor/compression/cost_estimation.py @@ -198,7 +198,7 @@ def contract_with_cost(TN, comp_ixs, dual_TN, vertex, return cost -def compressed_contraction_cost(tn, peo, mem_limit=np.inf): +def compressed_contraction_cost(tn, peo, mem_limit=np.inf, compression_ratio=100): """ Compute the cost of a contraction with compression. 
""" @@ -209,6 +209,6 @@ def compressed_contraction_cost(tn, peo, mem_limit=np.inf): dual_TN = dual_hg(TN) comp_ixs = {} for i in peo: - cost = contract_with_cost(TN, comp_ixs, dual_TN, i, mem_limit) + cost = contract_with_cost(TN, comp_ixs, dual_TN, i, mem_limit, compression_ratio) costs.append(cost) return costs From 0062a50ea2113505b364a31205f4b51e6c0839a4 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Fri, 20 Jan 2023 20:02:06 -0600 Subject: [PATCH 006/126] adapt to use slicing --- qtensor/compression/cost_estimation.py | 34 +++++++++++++++---- qtensor/compression/test_cost_estimation.py | 36 +++++++++++++++++---- 2 files changed, 58 insertions(+), 12 deletions(-) diff --git a/qtensor/compression/cost_estimation.py b/qtensor/compression/cost_estimation.py index dcc72b2d..0d96c897 100644 --- a/qtensor/compression/cost_estimation.py +++ b/qtensor/compression/cost_estimation.py @@ -38,7 +38,8 @@ def __add__(self, other): def format_number(self, n): if self.use_log: - return f"{np.log2(n):.2f}" + # log from ints may result in error + return f"{np.log2(n*1.):.2f}" else: return f"{n}" @@ -70,11 +71,15 @@ def remove_vertices_tensors(TN, dual_TN, vertices=[], tensors=[]): del TN[vertex] def tn2tn(tn: QtreeTensorNet, peo=None): - ignored_vars = tn.bra_vars + tn.ket_vars + ignored_vars = list(map(int, tn.bra_vars + tn.ket_vars)) # Vertices --> indices # Edges --> tensors - dual_tn = { str(hex(id(t))):[x for x in t.indices if x not in ignored_vars] + dual_tn = { str(hex(id(t))):[x for x in t.indices if int(x) not in ignored_vars and x.size>1] for t in tn.tensors } + # clean up empty edges + for t in list(dual_tn.keys()): + if len(dual_tn[t]) == 0: + del dual_tn[t] # Vertices --> tensors # Edges --> indices @@ -198,13 +203,30 @@ def contract_with_cost(TN, comp_ixs, dual_TN, vertex, return cost +def convert_TN_peo(tn, peo): + """ + Convert qtensor.QtreeTensorNet to a hypergraph + relabel peo accordingly. + Args: + tn: qtensor.QtreeTensorNet + peo: list of indices + """ + TN = tn2tn(tn) + relabel_dict = {int(p):i for i, p in enumerate(peo)} + peo = [x for x in peo if int(x) not in ignored_vars] + ignored_vars = list(map(int, tn.bra_vars + tn.ket_vars)) + + TN = { + relabel_dict[int(v)]: ix for v, ix in TN.items() + } + peo = [relabel_dict[int(p)] for p in peo] + return TN, peo + def compressed_contraction_cost(tn, peo, mem_limit=np.inf, compression_ratio=100): """ Compute the cost of a contraction with compression. 
""" - TN = tn2tn(tn) - ignored_vars = tn.bra_vars + tn.ket_vars - peo = [x for x in peo if x not in ignored_vars] + TN, peo = convert_TN_peo(tn, peo) costs = [] dual_TN = dual_hg(TN) comp_ixs = {} diff --git a/qtensor/compression/test_cost_estimation.py b/qtensor/compression/test_cost_estimation.py index 58bdd451..33b4db33 100644 --- a/qtensor/compression/test_cost_estimation.py +++ b/qtensor/compression/test_cost_estimation.py @@ -26,20 +26,44 @@ def test_compressed_contraction_cost(): opt = qtensor.toolbox.get_ordering_algo('greedy') peo, t = opt.optimize(tn) print(f"Contraction width: {opt.treewidth}") - M_limit = opt.treewidth + 10 + M_limit = opt.treewidth - 6 + # -- Estimate compressed contraction costs = compressed_contraction_cost(tn, peo, mem_limit=M_limit) - cost = sum(costs[1:], costs[0]) + cost = sum(costs[2:], costs[0]) print(costs_to_csv(costs)) - print(cost) - print(f'Contraction time estimate: {cost.time(1e6, 1e5, 1e5, M_limit)} seconds') + # -- Estimate regular contraction mems_lg, flops_lg = tn.simulation_cost(peo) - print(f'Contraction cost (old): {np.log2(sum(flops_lg))} flops, {np.log2(max(mems_lg))} memory') - print(f'Contraction time (old): {sum(flops_lg)/1e6} seconds') ignored_vars = tn.bra_vars + tn.ket_vars peo = [x for x in peo if x not in ignored_vars] peo = list(map(int, peo)) nodes, path = qtensor.utils.get_neighbors_path(tn.get_line_graph(), peo) print("Path\n", path) + # -- Estimate sliced contraction + opt_par = qtensor.optimisation.SlicesOptimizer(max_tw=M_limit+1, max_slice=5) + peo, par_vars, tn = opt_par.optimize(tn) + print("Par vars", par_vars) + tn.slice({i: slice(0, 1) for i in par_vars}) + peo_sl= peo[:-len(par_vars)] + costs_sliced = compressed_contraction_cost(tn, peo_sl) + cost_sliced = sum(costs_sliced[1:], costs_sliced[0]) + runs_count = 2**len(par_vars) + # print flops and memory from sliced simulation cost + flops_run = cost_sliced.flops + mem_run = cost_sliced.memory + print(cost) + print(cost_sliced) + FLOP_perS = 1e9 + Throughput = 1e11 + print(f'Contraction cost (sliced): {np.log2(flops_run*runs_count*1.)} flops, {np.log2(mem_run*1.)} memory, {cost_sliced.width} width') + print(f'Contraction cost (old): {np.log2(sum(flops_lg))} flops, {np.log2(max(mems_lg))} memory') + mems_lg, flops_lg = tn.simulation_cost(peo) + print(f'Sliced contraction cost (old): {np.log2(sum(flops_lg)*runs_count)} flops, {np.log2(max(mems_lg))} memory') + + print(f'-- Compressed Contraction time estimate: {cost.time(FLOP_perS, Throughput, Throughput, M_limit)} seconds') + print(f'-- Sliced contraction time estimate: {runs_count*cost_sliced.time(FLOP_perS, Throughput, Throughput, M_limit)} seconds') + print(f'Contraction time (old): {sum(flops_lg)/FLOP_perS} seconds') + + print("Path list comp\n", [c.width for c in costs]) print("Maxw", max(path)) assert opt.treewidth == cost.width From 5708555b34b19743d3543fa8c648ece7cc1cdf94 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Fri, 20 Jan 2023 23:45:56 -0600 Subject: [PATCH 007/126] Add __getitem__ to CompressedTensor, generalize compressed contraction --- qtensor/compression/CompressedTensor.py | 45 ++++- qtensor/compression/compressed_contraction.py | 160 +++++++++--------- qtensor/compression/cost_estimation.py | 2 +- .../compression/test_compressed_contract.py | 2 +- qtensor/compression/test_compressed_tensor.py | 39 +++++ qtree | 2 +- 6 files changed, 165 insertions(+), 85 deletions(-) create mode 100644 qtensor/compression/test_compressed_tensor.py diff --git a/qtensor/compression/CompressedTensor.py 
b/qtensor/compression/CompressedTensor.py index 8ce30f84..7685047f 100644 --- a/qtensor/compression/CompressedTensor.py +++ b/qtensor/compression/CompressedTensor.py @@ -55,11 +55,17 @@ def __init__(self, name, indices, super().__init__(name, indices, data_key=data_key, data=data) self.slice_indices = slice_indices self.compressor = compressor + if data is not None: + self._dtype = data.dtype + else: + self._dtype = None - def slice(self, indices: list): + def compress_indices(self, indices: list): """ Slice the self.data along dimensions in `indices`, store them compressed + + Does not support compressing when already compressed """ slice_dict = { i: slice(None) for i in self.indices @@ -77,6 +83,13 @@ def slice(self, indices: list): self._data = data_chunks self.slice_indices = indices + @property + def dtype(self): + """ + DataType of wrapped chunks. + """ + return self._dtype + @property def array_indices(self): return [x for x in self.indices if x not in self.slice_indices] @@ -91,6 +104,13 @@ def get_chunk(self, ivals): return self.compressor.decompress(ptr) def set_chunk(self, ivals, chunk:np.array): + # -- Check for consistent data types between chunks + if self._dtype is None: + self._dtype = chunk.dtype + else: + assert self.dtype == chunk.dtype, f"Chunk dtype {chunk.dtype} does not match tensor dtype {self.dtype}" + # -- + if self._data is None: self._data = np.empty(2**len(self.slice_indices), dtype=object) dims = [v.size for v in self.slice_indices] @@ -100,6 +120,29 @@ def set_chunk(self, ivals, chunk:np.array): flat_ix = np.ravel_multi_index(ivals, dims) self._data[flat_ix] = self.compressor.compress(chunk) + def __getitem__(self, key): + """ + Get a slice of the tensor along the indices in `key` + Currently slicing over all compressed indices is required. + Slices over compressed indices must be ints + """ + slices_ints, new_indices = self._parse_getitem_key(key) + slice_dict = {} + chunk_slices_ints = [] + compression_ints = [] + for ix, ival in zip(self.indices, slices_ints): + slice_dict[ix] = ival + if ix in self.slice_indices: + compression_ints.append(ival) + else: + chunk_slices_ints.append(ival) + chunk = self.get_chunk(compression_ints) + new_name = f"{self.name}[sliced]" + # careful: chunk will not be collected even if slice is small + chunk_slice = chunk[tuple(chunk_slices_ints)] + return Tensor(new_name, new_indices, data=chunk_slice) + + def __str__(self): array_ix = ','.join(map(str, self.array_indices)) split_ix= ','.join(map(str, self.slice_indices)) diff --git a/qtensor/compression/compressed_contraction.py b/qtensor/compression/compressed_contraction.py index 86eb8244..89d8ba7c 100644 --- a/qtensor/compression/compressed_contraction.py +++ b/qtensor/compression/compressed_contraction.py @@ -3,6 +3,46 @@ from qtensor.compression import CompressedTensor from .CompressedTensor import Tensor, iterate_indices +# taken from numpy/core/einsumfunc.py +einsum_symbols = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' +einsum_symbols_set = set(einsum_symbols) + +def contract_two_tensors(A, B, T_out): + """ + Contract tensors A and B along their common indices and write result to T_out. + T_out tensor should be pre-allocated with data. 
+ + This takes care of the case where indices of A and B are Vars with large integer id + """ + result_indices = T_out.indices + out_buffer = T_out.data + max_id = 0 + A_ints = [] + B_ints = [] + + for a_i in A.indices: + a_int = int(a_i) + max_id = max(max_id, a_int) + A_ints.append(a_int) + + for b_i in B.indices: + b_int = int(b_i) + max_id = max(max_id, b_int) + B_ints.append(b_int) + + if max_id > len(einsum_symbols): + # -- relabel indices to small ints + all_indices = set(A_ints + B_ints) + relabel_dict_int = {i: j for j, i in enumerate(all_indices)} + A_ints = [relabel_dict_int[i] for i in A_ints] + B_ints = [relabel_dict_int[i] for i in B_ints] + result_ints = [relabel_dict_int[int(i)] for i in result_indices] + else: + result_ints = list(map(int, result_indices)) + + np.einsum(A.data, A_ints, B.data, B_ints, result_ints, out=out_buffer) + + def compressed_contract(A:Tensor, B: Tensor, result_ixs, contract_ixs, mem_limit): @@ -12,97 +52,55 @@ def compressed_contract(A:Tensor, B: Tensor, result_indices.sort(key=int, reverse=True) to_small_int = lambda x: all_indices.index(x) - exist_compressed = [] + # -- Find set of existing compressed that will be decompressed + exist_compressed = set() for T in [A, B]: if isinstance(T, CompressedTensor): - exist_compressed += T.slice_indices + exist_compressed.update(T.slice_indices) + # In this particular case, we need not to sort these indices, + # since the iteration over fast index gives same latency as over slow index + # Potential improvement: if A_S and B_S are different, run outer loop + # over min(A_S, B_S) and inner over the rest indices. This will reduce + # the number of decompressions. + # -- + - exist_compressed.sort(key=int, reverse=True) need_compressed = result_indices[:-mem_limit] print(f"Need compression: {need_compressed}") - add_compress = set(need_compressed) - set(exist_compressed) - remove_compress = set(exist_compressed) - set(need_compressed) - retain_compress = set(exist_compressed).intersection(need_compressed) - add_compress = list(add_compress) - remove_compress = list(remove_compress) - retain_compress = list(retain_compress) - - remove_compress.sort(key=int, reverse=True) - retain_compress.sort(key=int, reverse=True) - add_compress.sort(key=int, reverse=True) - - newT_name= 'C'+str(int(all_indices[0])) + new_tensor_name = 'C'+str(int(all_indices[0])) + # -- Early return: if no need to compress, do the regular contraction if len(need_compressed)==0 and len(exist_compressed)==0: - A_ixs = list(map(int, A.indices)) - B_ixs = list(map(int, B.indices)) - - result = np.einsum(A.data, A_ixs, B.data, B_ixs) - return Tensor(newT_name, all_indices, data=result) + C = Tensor.empty(new_tensor_name, result_indices) + contract_two_tensors(A, B, C) + return C + # -- - R = CompressedTensor(newT_name, + remove_compress = exist_compressed - set(need_compressed) + R = CompressedTensor(new_tensor_name, result_indices, slice_indices=need_compressed ) - chunk_ixs = list(set(result_indices) - set(need_compressed)) - chunk_ixs.sort(key=int, reverse=True) - print(f"Chunk indices: {chunk_ixs}") - print(f"Add compression: {add_compress}, Retain compression: {retain_compress}, remove_compress: {remove_compress}") - slice_dict = {i: slice(None) for i in all_indices} - for iadd in iterate_indices(add_compress): - for iret in iterate_indices(retain_compress): - - chunk = np.empty(2**len(chunk_ixs), dtype=B.data.dtype) - chunk = chunk.reshape(*(v.size for v in chunk_ixs)) - for irm in iterate_indices(remove_compress): - for i, ival in 
zip(remove_compress, irm): - slice_dict[i] = ival#slice(ival, ival+1) - chunk_view = chunk[tuple( - slice_dict[i] for i in chunk_ixs - )] - if isinstance(A, CompressedTensor): - A_data = A.get_chunk(iret+irm) - A_ixs = A.array_indices - else: - A_data = A.data - A_ixs = A.indices - - # TODO: handle when A and B are sliced differently - if isinstance(B, CompressedTensor): - B_data = B.get_chunk(iret+irm) - B_ixs = B.array_indices - else: - B_data = B.data - B_ixs = B.indices - # -- - for ia, iaval in zip(add_compress, iadd): - slice_dict[ia] = iaval#slice(iaval, iaval+1) - ixsa = set(add_compress).intersection(B_ixs) - if len(ixsa): - B_data = B_data[tuple( - slice_dict[i] for i in B_ixs - )] - for _del in ixsa: - B_ixs = tuple(i for i in B_ixs if i!=_del) - - A_ixs = list(map(int, A_ixs)) - B_ixs = list(map(int, B_ixs)) - - C_ixs = list(map(int, [v for v in chunk_ixs if v not in exist_compressed])) - #print(f"A indices: {A_ixs}, B indices: {B_ixs}, C indices:{C_ixs}") - A_str = ''.join(chr(97+int(v)) for v in A_ixs) - B_str = ''.join(chr(97+int(v)) for v in B_ixs) - C_str = ''.join(chr(97+int(v)) for v in C_ixs) - expr = f"{A_str},{B_str}->{C_str}" - #np.einsum(A_data, A_ixs, B_data, B_ixs, C_ixs, out=chunk_view) - print(f"Expr: {expr}") - np.einsum(expr, A_data, B_data, out=chunk_view) - R.set_chunk(iadd+iret, chunk) + result_chunk_ixs = result_indices[-mem_limit:] + print(f"Chunk indices: {result_chunk_ixs}, remove_compress: {remove_compress}") + slice_dict = {} + for r_i in iterate_indices(need_compressed): + for ix, sl in zip(need_compressed, r_i): + slice_dict[ix] = sl + chunk = np.empty(2**len(result_chunk_ixs), dtype=B.dtype) + chunk = chunk.reshape(*(v.size for v in result_chunk_ixs)) + for irm in iterate_indices(remove_compress): + for i, ival in zip(remove_compress, irm): + slice_dict[i] = ival#slice(ival, ival+1) + chunk_view = chunk[tuple( + slice_dict.get(i, slice(None)) for i in result_chunk_ixs + )] + A_slice = A[slice_dict] + B_slice = B[slice_dict] + + C_ixs = [v for v in result_chunk_ixs if v not in exist_compressed] + C = Tensor('tmp', indices=C_ixs, data=chunk_view) + contract_two_tensors(A_slice, B_slice, C) + R.set_chunk(r_i, chunk) return R - - - - - - diff --git a/qtensor/compression/cost_estimation.py b/qtensor/compression/cost_estimation.py index 0d96c897..a8e9ea8d 100644 --- a/qtensor/compression/cost_estimation.py +++ b/qtensor/compression/cost_estimation.py @@ -213,8 +213,8 @@ def convert_TN_peo(tn, peo): """ TN = tn2tn(tn) relabel_dict = {int(p):i for i, p in enumerate(peo)} - peo = [x for x in peo if int(x) not in ignored_vars] ignored_vars = list(map(int, tn.bra_vars + tn.ket_vars)) + peo = [x for x in peo if int(x) not in ignored_vars] TN = { relabel_dict[int(v)]: ix for v, ix in TN.items() diff --git a/qtensor/compression/test_compressed_contract.py b/qtensor/compression/test_compressed_contract.py index 75ccb277..d276e24a 100644 --- a/qtensor/compression/test_compressed_contract.py +++ b/qtensor/compression/test_compressed_contract.py @@ -20,7 +20,7 @@ def test_compressed_contract(): B_data = B_data.reshape(*(v.size for v in B_ixs)) A = CompressedTensor('A', A_ixs, data=A_data) - A.slice(A_comp) + A.compress_indices(A_comp) B = Tensor('B', B_ixs, data=B_data) print(f"Tensor A: {A}") print(f"Tensor B: {B}") diff --git a/qtensor/compression/test_compressed_tensor.py b/qtensor/compression/test_compressed_tensor.py new file mode 100644 index 00000000..e5e0f277 --- /dev/null +++ b/qtensor/compression/test_compressed_tensor.py @@ -0,0 +1,39 @@ +from 
qtensor.compression import CompressedTensor +from qtree.optimizer import Tensor, Var +from qtree.system_defs import NP_ARRAY_TYPE +import numpy as np + +def test_empty_tensor(): + shape = (2, 3, 4) + indices = [Var(i, size=s) for i, s in enumerate(shape)] + t = CompressedTensor.empty("myT", indices) + assert t.name == "myT" + assert t.indices == tuple(indices) + assert t.shape == shape + assert t.data.shape == shape + assert t.data.dtype == NP_ARRAY_TYPE + + t.compress_indices([indices[0]]) + assert t.dtype == NP_ARRAY_TYPE + + +def test_slice_tensor(): + shape = (2, 3, 4) + indices = [Var(i, size=s) for i, s in enumerate(shape)] + t = CompressedTensor.empty("myT", indices, dtype=np.uint32) + t.compress_indices([indices[0]]) + S = t[{indices[0]: 1, indices[1]: slice(0, 1)}] + assert S.data.shape == (1, 4) + assert indices[0] not in S.indices + assert int(indices[1]) == int(S.indices[0]) + assert indices[1] != S.indices[0] + assert indices[2] in S.indices + assert S.indices[1].size == 4 + assert np.allclose(t.get_chunk([1])[0:1], S.data) + + t = CompressedTensor.empty("myT", indices, dtype=np.uint32) + t.compress_indices([indices[0], indices[1]]) + S = t[1, 2] + assert indices[1] not in S.indices + assert np.allclose(t.get_chunk([1, 2]), S.data) + diff --git a/qtree b/qtree index bfe253df..ccbb4093 160000 --- a/qtree +++ b/qtree @@ -1 +1 @@ -Subproject commit bfe253df1cbaae6af0a5fd5198f237f3654819d6 +Subproject commit ccbb4093360da843bcb8282941aa22154b85e2af From 80985072778ffeabf15b950cebd4bf22249cd76d Mon Sep 17 00:00:00 2001 From: Milan Shah Date: Tue, 24 Jan 2023 15:17:24 -0500 Subject: [PATCH 008/126] Added cuSZx APIs to Compressor class --- qtensor/compression/CompressedTensor.py | 40 + .../MultiLevelCacheTableWideInterval.h | 54 + qtensor/compression/szx/include/cuszx_entry.h | 18 + qtensor/compression/szx/include/cuszx_float.h | 22 + .../compression/szx/include/cuszxd_float.h | 14 + qtensor/compression/szx/include/szx.h | 92 ++ .../szx/include/szx_BytesToolkit.h | 75 + .../compression/szx/include/szx_TypeManager.h | 35 + .../szx/include/szx_dataCompression.h | 67 + qtensor/compression/szx/include/szx_defines.h | 66 + qtensor/compression/szx/include/szx_double.h | 62 + qtensor/compression/szx/include/szx_float.h | 63 + qtensor/compression/szx/include/szx_rw.h | 89 ++ qtensor/compression/szx/include/szx_utility.h | 37 + qtensor/compression/szx/include/szxd_double.h | 29 + qtensor/compression/szx/include/szxd_float.h | 30 + qtensor/compression/szx/include/timingGPU.h | 31 + .../compression/szx/src/DynamicByteArray.c | 68 + .../compression/szx/src/DynamicDoubleArray.c | 57 + .../compression/szx/src/DynamicFloatArray.c | 57 + qtensor/compression/szx/src/DynamicIntArray.c | 57 + .../src/MultiLevelCacheTableWideInterval.c | 125 ++ qtensor/compression/szx/src/README_python.md | 30 + qtensor/compression/szx/src/cuszx_entry.cu | 978 ++++++++++++ qtensor/compression/szx/src/cuszx_float.cu | 392 +++++ qtensor/compression/szx/src/cuszx_wrapper.cu | 41 + qtensor/compression/szx/src/cuszx_wrapper.py | 122 ++ qtensor/compression/szx/src/cuszxd_float.cu | 341 ++++ qtensor/compression/szx/src/pred_quant.c | 0 qtensor/compression/szx/src/sz_p_q.c | 367 +++++ qtensor/compression/szx/src/szx.c | 439 ++++++ .../compression/szx/src/szx_BytesToolkit.c | 811 ++++++++++ qtensor/compression/szx/src/szx_TypeManager.c | 381 +++++ .../compression/szx/src/szx_dataCompression.c | 355 +++++ qtensor/compression/szx/src/szx_double.c | 1388 +++++++++++++++++ qtensor/compression/szx/src/szx_float.c | 975 ++++++++++++ 
qtensor/compression/szx/src/szx_rw.c | 1009 ++++++++++++ qtensor/compression/szx/src/szx_utility.c | 42 + qtensor/compression/szx/src/szxd_double.c | 1104 +++++++++++++ qtensor/compression/szx/src/szxd_float.c | 654 ++++++++ qtensor/compression/szx/src/timingGPU.cu | 45 + 41 files changed, 10662 insertions(+) create mode 100644 qtensor/compression/szx/include/MultiLevelCacheTableWideInterval.h create mode 100644 qtensor/compression/szx/include/cuszx_entry.h create mode 100644 qtensor/compression/szx/include/cuszx_float.h create mode 100644 qtensor/compression/szx/include/cuszxd_float.h create mode 100644 qtensor/compression/szx/include/szx.h create mode 100644 qtensor/compression/szx/include/szx_BytesToolkit.h create mode 100644 qtensor/compression/szx/include/szx_TypeManager.h create mode 100644 qtensor/compression/szx/include/szx_dataCompression.h create mode 100644 qtensor/compression/szx/include/szx_defines.h create mode 100644 qtensor/compression/szx/include/szx_double.h create mode 100644 qtensor/compression/szx/include/szx_float.h create mode 100644 qtensor/compression/szx/include/szx_rw.h create mode 100644 qtensor/compression/szx/include/szx_utility.h create mode 100644 qtensor/compression/szx/include/szxd_double.h create mode 100644 qtensor/compression/szx/include/szxd_float.h create mode 100644 qtensor/compression/szx/include/timingGPU.h create mode 100644 qtensor/compression/szx/src/DynamicByteArray.c create mode 100644 qtensor/compression/szx/src/DynamicDoubleArray.c create mode 100644 qtensor/compression/szx/src/DynamicFloatArray.c create mode 100644 qtensor/compression/szx/src/DynamicIntArray.c create mode 100644 qtensor/compression/szx/src/MultiLevelCacheTableWideInterval.c create mode 100644 qtensor/compression/szx/src/README_python.md create mode 100644 qtensor/compression/szx/src/cuszx_entry.cu create mode 100644 qtensor/compression/szx/src/cuszx_float.cu create mode 100644 qtensor/compression/szx/src/cuszx_wrapper.cu create mode 100644 qtensor/compression/szx/src/cuszx_wrapper.py create mode 100644 qtensor/compression/szx/src/cuszxd_float.cu create mode 100644 qtensor/compression/szx/src/pred_quant.c create mode 100644 qtensor/compression/szx/src/sz_p_q.c create mode 100644 qtensor/compression/szx/src/szx.c create mode 100644 qtensor/compression/szx/src/szx_BytesToolkit.c create mode 100644 qtensor/compression/szx/src/szx_TypeManager.c create mode 100644 qtensor/compression/szx/src/szx_dataCompression.c create mode 100644 qtensor/compression/szx/src/szx_double.c create mode 100644 qtensor/compression/szx/src/szx_float.c create mode 100644 qtensor/compression/szx/src/szx_rw.c create mode 100644 qtensor/compression/szx/src/szx_utility.c create mode 100644 qtensor/compression/szx/src/szxd_double.c create mode 100644 qtensor/compression/szx/src/szxd_float.c create mode 100644 qtensor/compression/szx/src/timingGPU.cu diff --git a/qtensor/compression/CompressedTensor.py b/qtensor/compression/CompressedTensor.py index 7685047f..1d1ed7b3 100644 --- a/qtensor/compression/CompressedTensor.py +++ b/qtensor/compression/CompressedTensor.py @@ -4,6 +4,10 @@ import io from qtree.optimizer import Tensor, Var +from szx.src.cuszx_wrapper import cuszx_host_compress, cuszx_host_decompress, cuszx_device_compress, cuszx_device_decompress + +CUSZX_BLOCKSIZE = 256 + def iterate_indices(indices: list): if len(indices)==0: return [tuple()] @@ -22,6 +26,42 @@ def decompress(self, ptr): print(f"Loading arr.") return np.load(ptr)['arr_0'] + + ### Compression API with cuSZx ### + # Parameters: + # - 
isCuPy = boolean, true if data is CuPy array, otherwise is numpy array + # - data = Numpy or Cupy ndarray, assumed to be 1-D, np.float32 type + # - num_elements = Number of floating point elements in data + # - r2r_error = relative-to-value-range error bound for lossy compression + # - r2r_threshold = relative-to-value-range threshold to floor values to zero + # Returns: + # - cmp_bytes = Unsigned char pointer to compressed bytes + # - outSize_ptr = Pointer to size_t representing length in bytes of cmp_bytes + def cuszx_compress(self, isCuPy, data, num_elements, r2r_error, r2r_threshold): + + if not isCuPy: + cmp_bytes, outSize_ptr = cuszx_host_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) + else: + cmp_bytes, outSize_ptr = cuszx_device_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) + return cmp_bytes, outSize_ptr + + ### Decompression API with cuSZx ### + # Parameters: + # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array + # - cmp_bytes = Unsigned char pointer to compressed bytes + # - num_elements = Number of floating point elements in original data + # Returns: + # - decompressed_data = Float32 pointer to decompressed data + # + # Notes: Use ctypes to cast decompressed data to Numpy or CuPy type + + def cuszx_decompress(self, isCuPy, cmp_bytes, num_elements): + if not isCuPy: + decompressed_data = cuszx_host_decompress(num_elements, cmp_bytes) + else: + decompressed_data = cuszx_device_decompress(num_elements, cmp_bytes) + + return decompressed_data class CompressedTensor(Tensor): """ diff --git a/qtensor/compression/szx/include/MultiLevelCacheTableWideInterval.h b/qtensor/compression/szx/include/MultiLevelCacheTableWideInterval.h new file mode 100644 index 00000000..853d14bc --- /dev/null +++ b/qtensor/compression/szx/include/MultiLevelCacheTableWideInterval.h @@ -0,0 +1,54 @@ +/** + * @file MultiLevelCacheTableWideInterval.h + * @author Xiangyu Zou, Tao Lu, Wen Xia, Xuan Wang, Weizhe Zhang, Sheng Di, Dingwen Tao + * @date Jan, 2019 + * @brief Header file for MultiLevelCacheTableWideInterval.c. + * (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. 
+ */ + + +#ifndef _MULTILEVELCACHETABLEWIDEINTERVAL_H +#define _MULTILEVELCACHETABLEWIDEINTERVAL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include +#include "stdio.h" + +typedef struct SubLevelTableWideInterval{ + uint64_t baseIndex; + uint64_t topIndex; + uint16_t* table; + uint16_t expoIndex; +} SubLevelTableWideInterval; + +typedef struct TopLevelTableWideInterval{ + uint16_t bits; + uint16_t baseIndex; + uint16_t topIndex; + struct SubLevelTableWideInterval* subTables; + double bottomBoundary; + double topBoundary; +} TopLevelTableWideInterval; + +void freeTopLevelTableWideInterval(struct TopLevelTableWideInterval* topTable); + +uint16_t MLCTWI_GetExpoIndex(double value); +uint16_t MLCTWI_GetRequiredBits(double precision); +uint64_t MLCTWI_GetMantiIndex(double value, int bits); + +double MLTCWI_RebuildDouble(uint16_t expo, uint64_t manti, int bits); +void MultiLevelCacheTableWideIntervalBuild(struct TopLevelTableWideInterval* topTable, double* precisionTable, int count, double precision, int plus_bits); +uint32_t MultiLevelCacheTableWideIntervalGetIndex(double value, struct TopLevelTableWideInterval* topLevelTable); +void MultiLevelCacheTableWideIntervalFree(struct TopLevelTableWideInterval* table); + +#ifdef __cplusplus +} +#endif + +#endif //_MULTILEVELCACHETABLEWIDEINTERVAL_H diff --git a/qtensor/compression/szx/include/cuszx_entry.h b/qtensor/compression/szx/include/cuszx_entry.h new file mode 100644 index 00000000..34638319 --- /dev/null +++ b/qtensor/compression/szx/include/cuszx_entry.h @@ -0,0 +1,18 @@ +#ifndef CUSZX_ENTRY_H +#define CUSZX_ENTRY_H + +#include +#include "cuszx_float.h" +#include "cuszxd_float.h" + +#define GPU + +extern "C" unsigned char* cuSZx_fast_compress_args_unpredictable_blocked_float(float *oriData, size_t *outSize, float absErrBound, size_t nbEle, int blockSize, float threshold); + +extern "C" void cuSZx_fast_decompress_args_unpredictable_blocked_float(float** newData, size_t nbEle, unsigned char* cmpBytes); + +extern "C" unsigned char* device_ptr_cuSZx_compress_float(float *oriData, size_t *outSize, float absErrBound, size_t nbEle, int blockSize, float threshold); + +extern "C" float* device_ptr_cuSZx_decompress_float(size_t nbEle, unsigned char* cmpBytes); + +#endif /* ----- #ifndef CUSZX_ENTRY_H ----- */ diff --git a/qtensor/compression/szx/include/cuszx_float.h b/qtensor/compression/szx/include/cuszx_float.h new file mode 100644 index 00000000..a933c2db --- /dev/null +++ b/qtensor/compression/szx/include/cuszx_float.h @@ -0,0 +1,22 @@ +#ifndef CUSZX_FLOAT_H +#define CUSZX_FLOAT_H + +#include + +// Utilities and system includes +#include // helper function CUDA error checking and initialization +#include // helper for shared functions common to CUDA Samples + +#define FULL_MASK 0xffffffff + +__device__ +void reduction(double sum1, double sum2, + double minDiff, double maxDiff, double sumDiff, double sumOfDiffSquare, + double minErr, double maxErr, double sumErr, double sumErrSqr); + +__global__ void compress_float(float *oriData, unsigned char *meta, short *offsets, unsigned char *midBytes, float absErrBound, int bs, size_t nb, size_t mSize, float sparsity_level, uint32_t *blk_idx, uint8_t *blk_subidx,float *blk_vals,float threshold, uint8_t *blk_sig); + +__global__ void get_numsig(uint64_t *num_sig); + +__global__ void apply_threshold(float *data, float threshold, size_t length); +#endif /* ----- #ifndef CUSZX_COMPRESS_FLOAT_H ----- */ diff --git a/qtensor/compression/szx/include/cuszxd_float.h 
b/qtensor/compression/szx/include/cuszxd_float.h new file mode 100644 index 00000000..b203f707 --- /dev/null +++ b/qtensor/compression/szx/include/cuszxd_float.h @@ -0,0 +1,14 @@ +#ifndef CUSZXD_FLOAT_H +#define CUSZXD_FLOAT_H + +#include + +// Utilities and system includes +#include // helper function CUDA error checking and initialization +#include // helper for shared functions common to CUDA Samples + +__global__ void decompress_float(unsigned char *data, int bs, size_t nc, size_t mSize); + +__global__ void decompress_state2(float *out, unsigned char* stateArray, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx,uint32_t blockSize, uint8_t *blk_sig); + +#endif /* ----- #ifndef CUSZX_DECOMPRESS_FLOAT_H ----- */ diff --git a/qtensor/compression/szx/include/szx.h b/qtensor/compression/szx/include/szx.h new file mode 100644 index 00000000..a6872950 --- /dev/null +++ b/qtensor/compression/szx/include/szx.h @@ -0,0 +1,92 @@ +/** + * @file szx.h + * @author Sheng Di + * @date April, 2022 + * @brief Header file for the whole compressor. + * (C) 2022 by Mathematics and Computer Science (MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + */ + +#ifndef _SZX_H +#define _SZX_H + +#include +#include +#include /* For gettimeofday(), in microseconds */ +#include /* For time(), in seconds */ +#include "szx_float.h" +#include "szx_rw.h" +#include "szx_utility.h" + +#ifdef _WIN32 +#define PATH_SEPARATOR ';' +#else +#define PATH_SEPARATOR ':' +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#include "szx_defines.h" +#include "szx_double.h" +#include "szxd_double.h" +#include "szx_float.h" +#include "szxd_float.h" +#include "szx_TypeManager.h" + +typedef union lint16 +{ + unsigned short usvalue; + short svalue; + unsigned char byte[2]; +} lint16; + +typedef union lint32 +{ + int ivalue; + unsigned int uivalue; + unsigned char byte[4]; +} lint32; + +typedef union lint64 +{ + long lvalue; + unsigned long ulvalue; + unsigned char byte[8]; +} lint64; + +typedef union ldouble +{ + double value; + unsigned long lvalue; + unsigned char byte[8]; +} ldouble; + +typedef union lfloat +{ + float value; + unsigned int ivalue; + unsigned char byte[4]; +} lfloat; + + +extern int versionNumber[4]; + +//-------------------key global variables-------------- +extern int dataEndianType; //*endian type of the data read from disk +extern int sysEndianType; //*sysEndianType is actually set automatically. 
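+
+//-------------------top-level API declarations--------------
+//(presumably implemented in src/szx.c, per the diffstat above)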
+ +int computeDimension(size_t r5, size_t r4, size_t r3, size_t r2, size_t r1); +size_t computeDataLength(size_t r5, size_t r4, size_t r3, size_t r2, size_t r1); +int filterDimension(size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t* correctedDimension); +unsigned char* SZ_fast_compress_args(int fastMode, int dataType, void *data, size_t *outSize, int errBoundMode, float absErrBound, +float relBoundRatio, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1); +void* SZ_fast_decompress_pred(int dataType, float* preData, unsigned char *curBytes, size_t byteLength, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1); +void* SZ_fast_decompress(int fastMode, int dataType, unsigned char *bytes, size_t byteLength, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1); + +#ifdef __cplusplus +} +#endif + +#endif /* ----- #ifndef _SZX_H ----- */ diff --git a/qtensor/compression/szx/include/szx_BytesToolkit.h b/qtensor/compression/szx/include/szx_BytesToolkit.h new file mode 100644 index 00000000..027afe97 --- /dev/null +++ b/qtensor/compression/szx/include/szx_BytesToolkit.h @@ -0,0 +1,75 @@ +/** + * @file szx_ByteToolkit.h + * @author Sheng Di + * @date Feb, 2022 + * @brief Header file for the ByteToolkit.c. + * (C) 2022 by Mathematics and Computer Science (MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + */ + +#ifndef _SZX_ByteToolkit_H +#define _SZX_ByteToolkit_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +void sz_writeBits_Fast_int8(unsigned char* buffer,uint64_t *bitPosPtr, int numBits, unsigned char data); +void sz_writeBits_Fast_int32(unsigned char* buffer,uint64_t *bitPosPtr, int numBits, int32_t data); +void sz_writeBits_Fast_int64(unsigned char* buffer,uint64_t *bitPosPtr, int numBits, int64_t data); +unsigned short bytesToUInt16_bigEndian(unsigned char* bytes); +unsigned int bytesToUInt32_bigEndian(unsigned char* bytes); +unsigned long bytesToUInt64_bigEndian(unsigned char* b); +short bytesToInt16_bigEndian(unsigned char* bytes); +int bytesToInt32_bigEndian(unsigned char* bytes); +long bytesToInt64_bigEndian(unsigned char* b); +int bytesToInt_bigEndian(unsigned char* bytes); +void intToBytes_bigEndian(unsigned char *b, unsigned int num); +void int64ToBytes_bigEndian(unsigned char *b, uint64_t num); +void int32ToBytes_bigEndian(unsigned char *b, uint32_t num); +void int16ToBytes_bigEndian(unsigned char *b, uint16_t num); +long bytesToLong_bigEndian(unsigned char* b); +void longToBytes_bigEndian(unsigned char *b, unsigned long num) ; +long doubleToOSEndianLong(double value); +int floatToOSEndianInt(float value); +short getExponent_float(float value); +short getPrecisionReqLength_float(float precision); +short getExponent_double(double value); +short getPrecisionReqLength_double(double precision); +unsigned char numberOfLeadingZeros_Int(int i); +unsigned char numberOfLeadingZeros_Long(long i); +unsigned char getLeadingNumbers_Int(int v1, int v2); +unsigned char getLeadingNumbers_Long(long v1, long v2); +short bytesToShort(unsigned char* bytes); +void shortToBytes(unsigned char* b, short value); +int bytesToInt(unsigned char* bytes); +long bytesToLong(unsigned char* bytes); +float bytesToFloat(unsigned char* bytes); +void floatToBytes(unsigned char *b, float num); +double bytesToDouble(unsigned char* bytes); +void doubleToBytes(unsigned char *b, double num); +int getMaskRightCode(int m); +int getLeftMovingCode(int kMod8); +int getRightMovingSteps(int kMod8, int resiBitLength); +int getRightMovingCode(int kMod8, 
int resiBitLength); +short* convertByteDataToShortArray(unsigned char* bytes, size_t byteLength); +unsigned short* convertByteDataToUShortArray(unsigned char* bytes, size_t byteLength); +void convertShortArrayToBytes(short* states, size_t stateLength, unsigned char* bytes); +void convertUShortArrayToBytes(unsigned short* states, size_t stateLength, unsigned char* bytes); +void convertIntArrayToBytes(int* states, size_t stateLength, unsigned char* bytes); +void convertUIntArrayToBytes(unsigned int* states, size_t stateLength, unsigned char* bytes); +void convertLongArrayToBytes(int64_t* states, size_t stateLength, unsigned char* bytes); +void convertULongArrayToBytes(uint64_t* states, size_t stateLength, unsigned char* bytes); +size_t bytesToSize(unsigned char* bytes); +void sizeToBytes(unsigned char* outBytes, size_t size); + +#ifdef __cplusplus +} +#endif + +#endif /* ----- #ifndef _SZX_ByteToolkit_H ----- */ + diff --git a/qtensor/compression/szx/include/szx_TypeManager.h b/qtensor/compression/szx/include/szx_TypeManager.h new file mode 100644 index 00000000..f4409104 --- /dev/null +++ b/qtensor/compression/szx/include/szx_TypeManager.h @@ -0,0 +1,35 @@ +/** + * @file TypeManager.h + * @author Sheng Di + * @date July, 2017 + * @brief Header file for the TypeManager.c. + * (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + */ + +#ifndef _SZX_TypeManager_H +#define _SZX_TypeManager_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +size_t convertIntArray2ByteArray_fast_1b_args(unsigned char* intArray, size_t intArrayLength, unsigned char *result); +size_t convertIntArray2ByteArray_fast_1b(unsigned char* intArray, size_t intArrayLength, unsigned char **result); +size_t convertIntArray2ByteArray_fast_1b_to_result(unsigned char* intArray, size_t intArrayLength, unsigned char *result); +void convertByteArray2IntArray_fast_1b_args(size_t intArrayLength, unsigned char* byteArray, size_t byteArrayLength, unsigned char* intArray); +void convertByteArray2IntArray_fast_1b(size_t intArrayLength, unsigned char* byteArray, size_t byteArrayLength, unsigned char **intArray); +size_t convertIntArray2ByteArray_fast_2b_args(unsigned char* timeStepType, size_t timeStepTypeLength, unsigned char *result); +size_t convertIntArray2ByteArray_fast_2b(unsigned char* timeStepType, size_t timeStepTypeLength, unsigned char **result); +void convertByteArray2IntArray_fast_2b(size_t stepLength, unsigned char* byteArray, size_t byteArrayLength, unsigned char **intArray); +int getLeftMovingSteps(size_t k, unsigned char resiBitLength); + +#ifdef __cplusplus +} +#endif + +#endif /* ----- #ifndef _SZX_TypeManager_H ----- */ + diff --git a/qtensor/compression/szx/include/szx_dataCompression.h b/qtensor/compression/szx/include/szx_dataCompression.h new file mode 100644 index 00000000..afce931b --- /dev/null +++ b/qtensor/compression/szx/include/szx_dataCompression.h @@ -0,0 +1,67 @@ +/** + * @file szx_dataCompression.h + * @author Sheng Di + * @date July, 2022 + * @brief Header file for the dataCompression.c. + * (C) 2022 by Mathematics and Computer Science (MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. 
+ */ + +#ifndef _SZX_DataCompression_H +#define _SZX_DataCompression_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "szx.h" +#include +#include + +#define computeMinMax(data) \ + for(i=1;idata_)\ + min = data_;\ + else if(max + +#ifndef _SZ_Double_H +#define _SZ_Double_H + +#ifdef __cplusplus +extern "C" { +#endif + +void SZ_fast_compress_args_unpredictable_one_block_double(double *oriData, size_t nbEle, float absErrBound, + unsigned char *outputBytes, int *outSize, + unsigned char *leadNumberArray_int, float medianValue, + float radius); + +size_t computeStateMedianRadius_double(double *oriData, size_t nbEle, float absErrBound, int blockSize, + unsigned char *stateArray, float *medianArray, float *radiusArray) ; + +void max_min_double(double *x, int n, double *tmp_max, double *tmp_min); + +void simd_max_min_double(double *x, int n, double *tmp_max, double *tmp_min); + +void computeStateMedianRadius_double2(double *oriData, size_t nbEle, float absErrBound, + unsigned char *state, float *median, float *radius) ; + +unsigned char * +SZ_fast_compress_args_unpredictable_blocked_double(double *oriData, size_t *outSize, float absErrBound, size_t nbEle, + int blockSize) ; + +unsigned char * +SZ_fast_compress_args_unpredictable_blocked_randomaccess_double_openmp(double *oriData, size_t *outSize, + float absErrBound, size_t nbEle, int blockSize) ; + + +unsigned char * +SZ_fast_compress_args_unpredictable_blocked_randomaccess_double(double *oriData, size_t *outSize, + float absErrBound, size_t nbEle, int blockSize) ; + +unsigned char * +SZ_fast_compress_args_unpredictable_double(double *data, size_t *outSize, float absErrBound, size_t r5, size_t r4, + size_t r3, size_t r2, size_t r1, float mValue, float radius); + +unsigned char *SZ_skip_compress_double(double *data, size_t dataLength, size_t *outSize) ; + +void computeReqLength_double(float realPrecision, short radExpo, int *reqLength, float *medianValue) ; + + + +#ifdef __cplusplus +} +#endif + +#endif /* ----- #ifndef _SZ_Double_H ----- */ + diff --git a/qtensor/compression/szx/include/szx_float.h b/qtensor/compression/szx/include/szx_float.h new file mode 100644 index 00000000..57e6388f --- /dev/null +++ b/qtensor/compression/szx/include/szx_float.h @@ -0,0 +1,63 @@ +/** + * @file sz_float.h + * @author Sheng Di + * @date July, 2017 + * @brief Header file for the sz_float.c. + * (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. 
+ */ + +#ifndef _SZ_Float_H +#define _SZ_Float_H + +#ifdef __cplusplus +extern "C" { +#endif + +unsigned char * SZ_fast_compress_args_with_prediction_float(float *pred, float *data, size_t *outSize, float absErrBound, size_t r5, + size_t r4, size_t r3, size_t r2, size_t r1, float medianValue, float radius); + +void SZ_fast_compress_args_unpredictable_one_block_float(float *oriData, size_t nbEle, float absErrBound, + unsigned char *outputBytes, int *outSize, + unsigned char *leadNumberArray_int, float medianValue, + float radius); + +size_t computeStateMedianRadius_float(float *oriData, size_t nbEle, float absErrBound, int blockSize, + unsigned char *stateArray, float *medianArray, float *radiusArray) ; + +void max_min_float(float *x, int n, float *tmp_max, float *tmp_min); + +void simd_max_min_float(float *x, int n, float *tmp_max, float *tmp_min); + +void computeStateMedianRadius_float2(float *oriData, size_t nbEle, float absErrBound, + unsigned char *state, float *median, float *radius) ; + +unsigned char * +SZ_fast_compress_args_unpredictable_blocked_float(float *oriData, size_t *outSize, float absErrBound, size_t nbEle, + int blockSize) ; + +unsigned char * +SZ_fast_compress_args_unpredictable_blocked_randomaccess_float_openmp(float *oriData, size_t *outSize, float absErrBound, + size_t nbEle, int blockSize) ; + + +unsigned char * +SZ_fast_compress_args_unpredictable_blocked_randomaccess_float(float *oriData, size_t *outSize, float absErrBound, + size_t nbEle, int blockSize) ; + +unsigned char * +SZ_fast_compress_args_unpredictable_float(float *data, size_t *outSize, float absErrBound, size_t r5, size_t r4, + size_t r3, size_t r2, size_t r1, float mValue, float radius); + +unsigned char *SZ_skip_compress_float(float *data, size_t dataLength, size_t *outSize) ; + +void computeReqLength_float(double realPrecision, short radExpo, int *reqLength, float *medianValue) ; + + + +#ifdef __cplusplus +} +#endif + +#endif /* ----- #ifndef _SZ_Float_H ----- */ + diff --git a/qtensor/compression/szx/include/szx_rw.h b/qtensor/compression/szx/include/szx_rw.h new file mode 100644 index 00000000..551dea0f --- /dev/null +++ b/qtensor/compression/szx/include/szx_rw.h @@ -0,0 +1,89 @@ +/** + * @file szx_rw.h + * @author Sheng Di + * @date Jan, 2022 + * @brief Header file for the whole io interface. + * (C) 2022 by Mathematics and Computer Science (MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. 
+ */ + +#ifndef _SZX_RW_H +#define _SZX_RW_H + +#include +#include + +#ifdef _WIN32 +#define PATH_SEPARATOR ';' +#else +#define PATH_SEPARATOR ':' +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +int checkFileExistance(char* filePath); + +float** create2DArray_float(size_t m, size_t n); +void free2DArray_float(float** data, size_t m); +float*** create3DArray_float(size_t p, size_t m, size_t n); +void free3DArray_float(float*** data, size_t p, size_t m); +double** create2DArray_double(size_t m, size_t n); +void free2DArray_double(double** data, size_t m); +double*** create3DArray_double(size_t p, size_t m, size_t n); +void free3DArray_double(double*** data, size_t p, size_t m); +size_t checkFileSize(char *srcFilePath, int *status); + +unsigned char *readByteData(char *srcFilePath, size_t *byteLength, int *status); +double *readDoubleData(char *srcFilePath, size_t *nbEle, int *status); +int8_t *readInt8Data(char *srcFilePath, size_t *nbEle, int *status); +int16_t *readInt16Data(char *srcFilePath, size_t *nbEle, int *status); +uint16_t *readUInt16Data(char *srcFilePath, size_t *nbEle, int *status); +int32_t *readInt32Data(char *srcFilePath, size_t *nbEle, int *status); +uint32_t *readUInt32Data(char *srcFilePath, size_t *nbEle, int *status); +int64_t *readInt64Data(char *srcFilePath, size_t *nbEle, int *status); +uint64_t *readUInt64Data(char *srcFilePath, size_t *nbEle, int *status); +float *readFloatData(char *srcFilePath, size_t *nbEle, int *status); +unsigned short* readShortData(char *srcFilePath, size_t *dataLength, int *status); + +double *readDoubleData_systemEndian(char *srcFilePath, size_t *nbEle, int *status); +int8_t *readInt8Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status); +int16_t *readInt16Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status); +uint16_t *readUInt16Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status); +int32_t *readInt32Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status); +uint32_t *readUInt32Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status); +int64_t *readInt64Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status); +uint64_t *readUInt64Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status); +float *readFloatData_systemEndian(char *srcFilePath, size_t *nbEle, int *status); + +void writeByteData(unsigned char *bytes, size_t byteLength, char *tgtFilePath, int *status); +void writeDoubleData(double *data, size_t nbEle, char *tgtFilePath, int *status); +void writeFloatData(float *data, size_t nbEle, char *tgtFilePath, int *status); +void writeData(void *data, int dataType, size_t nbEle, char *tgtFilePath, int *status); +void writeFloatData_inBytes(float *data, size_t nbEle, char* tgtFilePath, int *status); +void writeDoubleData_inBytes(double *data, size_t nbEle, char* tgtFilePath, int *status); +void writeShortData_inBytes(short *states, size_t stateLength, char *tgtFilePath, int *status); +void writeUShortData_inBytes(unsigned short *states, size_t stateLength, char *tgtFilePath, int *status); +void writeIntData_inBytes(int *states, size_t stateLength, char *tgtFilePath, int *status); +void writeUIntData_inBytes(unsigned int *states, size_t stateLength, char *tgtFilePath, int *status); +void writeLongData_inBytes(int64_t *states, size_t stateLength, char *tgtFilePath, int *status); +void writeULongData_inBytes(uint64_t *states, size_t stateLength, char *tgtFilePath, int *status); + +void writeStrings(int nbStr, char *str[], char *tgtFilePath, int *status); + +//void 
convertToPFM_float(float *data, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, int endianType, char *tgtFilePath, int *status); + +void checkfilesizec_(char *srcFilePath, int *len, size_t *filesize); +void readbytefile_(char *srcFilePath, int *len, unsigned char *bytes, size_t *byteLength); +void readdoublefile_(char *srcFilePath, int *len, double *data, size_t *nbEle); +void readfloatfile_(char *srcFilePath, int *len, float *data, size_t *nbEle); +void writebytefile_(unsigned char *bytes, size_t *byteLength, char *tgtFilePath, int *len); +void writedoublefile_(double *data, size_t *nbEle, char *tgtFilePath, int *len); +void writefloatfile_(float *data, size_t *nbEle, char *tgtFilePath, int *len); + +#ifdef __cplusplus +} +#endif + +#endif /* ----- #ifndef _SZX_RW_H ----- */ diff --git a/qtensor/compression/szx/include/szx_utility.h b/qtensor/compression/szx/include/szx_utility.h new file mode 100644 index 00000000..133c2816 --- /dev/null +++ b/qtensor/compression/szx/include/szx_utility.h @@ -0,0 +1,37 @@ +/** + * @file szx_utility.h + * @author Sheng Di + * @date Feb, 2022 + * @brief Header file for the utility.c. + * (C) 2022 by Mathematics and Computer Science (MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + */ + +#ifndef _SZX_UTILITY_H +#define _SZX_UTILITY_H + +#include "szx.h" + +#ifdef __cplusplus +extern "C" { +#endif + +//sihuan added: use a assistant struct to do sorting and swap that are easy to implement: should +//consider optimizing the performance later. +typedef struct sort_ast_particle{ + int64_t id; + float var[6]; +} sort_ast_particle; + +extern struct timeval sz_costStart; /*only used for recording the cost*/ +extern double sz_totalCost; + +void sz_cost_start(); +void sz_cost_end(); +void sz_cost_end_msg(char *); + +#ifdef __cplusplus +} +#endif + +#endif /* ----- #ifndef _SZX_UTILITY_H ----- */ diff --git a/qtensor/compression/szx/include/szxd_double.h b/qtensor/compression/szx/include/szxd_double.h new file mode 100644 index 00000000..4ea4be11 --- /dev/null +++ b/qtensor/compression/szx/include/szxd_double.h @@ -0,0 +1,29 @@ +/** + * @file szxd_double.h + * @author Sheng Di + * @date Feb, 2022 + * @brief Header file for the szd_double.c. + * (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. 
+ */ + +#ifndef _SZXD_Double_H +#define _SZXD_Double_H + +#ifdef __cplusplus +extern "C" { +#endif + +int SZ_fast_decompress_args_unpredictable_one_block_double(double* newData, size_t blockSize, unsigned char* cmpBytes); +void SZ_fast_decompress_args_unpredictable_blocked_double(double** newData, size_t nbEle, unsigned char* cmpBytes); +void SZ_fast_decompress_args_unpredictable_blocked_randomaccess_double(double** newData, size_t nbEle, unsigned char* cmpBytes); +void SZ_fast_decompress_args_unpredictable_blocked_randomaccess_double_openmp(double** newData, size_t nbEle, unsigned char* cmpBytes); + +void SZ_fast_decompress_args_unpredictable_double(double** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, +size_t cmpSize); + +#ifdef __cplusplus +} +#endif + +#endif /* ----- #ifndef _SZXD_Double_H ----- */ diff --git a/qtensor/compression/szx/include/szxd_float.h b/qtensor/compression/szx/include/szxd_float.h new file mode 100644 index 00000000..fbe0219d --- /dev/null +++ b/qtensor/compression/szx/include/szxd_float.h @@ -0,0 +1,30 @@ +/** + * @file szxd_float.h + * @author Sheng Di + * @date Feb, 2022 + * @brief Header file for the szd_float.c. + * (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + */ + +#ifndef _SZXD_Float_H +#define _SZXD_Float_H + +#ifdef __cplusplus +extern "C" { +#endif + +void SZ_fast_decompress_args_with_prediction_float(float** newData, float* pred, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize); +int SZ_fast_decompress_args_unpredictable_one_block_float(float* newData, size_t blockSize, unsigned char* cmpBytes); +void SZ_fast_decompress_args_unpredictable_blocked_float(float** newData, size_t nbEle, unsigned char* cmpBytes); +void SZ_fast_decompress_args_unpredictable_blocked_randomaccess_float(float** newData, size_t nbEle, unsigned char* cmpBytes); +void SZ_fast_decompress_args_unpredictable_blocked_randomaccess_float_openmp(float** newData, size_t nbEle, unsigned char* cmpBytes); + +void SZ_fast_decompress_args_unpredictable_float(float** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, +size_t cmpSize); + +#ifdef __cplusplus +} +#endif + +#endif /* ----- #ifndef _SZXD_Float_H ----- */ diff --git a/qtensor/compression/szx/include/timingGPU.h b/qtensor/compression/szx/include/timingGPU.h new file mode 100644 index 00000000..c6081682 --- /dev/null +++ b/qtensor/compression/szx/include/timingGPU.h @@ -0,0 +1,31 @@ +#ifndef __TIMING_CUH__ +#define __TIMING_CUH__ + +/**************/ +/* TIMING GPU */ +/**************/ + +// Events are a part of CUDA API and provide a system independent way to measure execution times on CUDA devices with approximately 0.5 +// microsecond precision. + +struct PrivateTimingGPU; + +class TimingGPU +{ + private: + PrivateTimingGPU *privateTimingGPU; + + public: + + TimingGPU(); + + ~TimingGPU(); + + void StartCounter(); + void StartCounterFlags(); + + float GetCounter(); + +}; // TimingGPU class + +#endif diff --git a/qtensor/compression/szx/src/DynamicByteArray.c b/qtensor/compression/szx/src/DynamicByteArray.c new file mode 100644 index 00000000..64b7d5c7 --- /dev/null +++ b/qtensor/compression/szx/src/DynamicByteArray.c @@ -0,0 +1,68 @@ +/** + * @file DynamicByteArray.c + * @author Sheng Di + * @date May, 2016 + * @brief Dynamic Byte Array + * (C) 2015 by Mathematics and Computer Science (MCS), Argonne National Laboratory. 
+ * See COPYRIGHT in top-level directory. + */ + +#include +#include +#include +#include "DynamicByteArray.h" + +void new_DBA(DynamicByteArray **dba, size_t cap) { + *dba = (DynamicByteArray *)malloc(sizeof(DynamicByteArray)); + (*dba)->size = 0; + (*dba)->capacity = cap; + (*dba)->array = (unsigned char*)malloc(sizeof(unsigned char)*cap); + } + +void convertDBAtoBytes(DynamicByteArray *dba, unsigned char** bytes) +{ + size_t size = dba->size; + if(size>0) + *bytes = (unsigned char*)malloc(size * sizeof(unsigned char)); + else + *bytes = NULL; + memcpy(*bytes, dba->array, size*sizeof(unsigned char)); +} + +void free_DBA(DynamicByteArray *dba) +{ + free(dba->array); + free(dba); +} + +inline unsigned char getDBA_Data(DynamicByteArray *dba, size_t pos) +{ + if(pos>=dba->size) + { + printf("Error: wrong position of DBA (impossible case unless bugs elsewhere in the code?).\n"); + exit(0); + } + return dba->array[pos]; +} + +inline void addDBA_Data(DynamicByteArray *dba, unsigned char value) +{ + if(dba->size==dba->capacity) + { + dba->capacity = dba->capacity << 1; + dba->array = (unsigned char *)realloc(dba->array, dba->capacity*sizeof(unsigned char)); + } + dba->array[dba->size] = value; + dba->size ++; +} + +inline void memcpyDBA_Data(DynamicByteArray *dba, unsigned char* data, size_t length) +{ + if(dba->size + length > dba->capacity) + { + dba->capacity = dba->size + length; + dba->array = (unsigned char *)realloc(dba->array, dba->capacity*sizeof(unsigned char)); + } + memcpy(&(dba->array[dba->size]), data, length); + dba->size += length; +} diff --git a/qtensor/compression/szx/src/DynamicDoubleArray.c b/qtensor/compression/szx/src/DynamicDoubleArray.c new file mode 100644 index 00000000..54bbb109 --- /dev/null +++ b/qtensor/compression/szx/src/DynamicDoubleArray.c @@ -0,0 +1,57 @@ +/** + * @file DynamicFloatArray.c + * @author Sheng Di + * @date May, 2016 + * @brief Dynamic Float Array + * (C) 2015 by Mathematics and Computer Science (MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + */ + +#include +#include +#include +#include "DynamicDoubleArray.h" + +void new_DDA(DynamicDoubleArray **dda, size_t cap) { + *dda = (DynamicDoubleArray *)malloc(sizeof(DynamicDoubleArray)); + (*dda)->size = 0; + (*dda)->capacity = cap; + (*dda)->array = (double*)malloc(sizeof(double)*cap); + } + +void convertDDAtoDoubles(DynamicDoubleArray *dba, double **data) +{ + size_t size = dba->size; + if(size>0) + *data = (double*)malloc(size * sizeof(double)); + else + *data = NULL; + memcpy(*data, dba->array, size*sizeof(double)); +} + +void free_DDA(DynamicDoubleArray *dda) +{ + free(dda->array); + free(dda); +} + +double getDDA_Data(DynamicDoubleArray *dda, size_t pos) +{ + if(pos>=dda->size) + { + printf("Error: wrong position of DIA.\n"); + exit(0); + } + return dda->array[pos]; +} + +void addDDA_Data(DynamicDoubleArray *dda, double value) +{ + if(dda->size==dda->capacity) + { + dda->capacity *= 2; + dda->array = (double *)realloc(dda->array, dda->capacity*sizeof(double)); + } + dda->array[dda->size] = value; + dda->size ++; +} diff --git a/qtensor/compression/szx/src/DynamicFloatArray.c b/qtensor/compression/szx/src/DynamicFloatArray.c new file mode 100644 index 00000000..1a80a488 --- /dev/null +++ b/qtensor/compression/szx/src/DynamicFloatArray.c @@ -0,0 +1,57 @@ +/** + * @file DynamicFloatArray.c + * @author Sheng Di + * @date May, 2016 + * @brief Dynamic Float Array + * (C) 2015 by Mathematics and Computer Science (MCS), Argonne National Laboratory. 
+ * See COPYRIGHT in top-level directory. + */ + +#include +#include +#include +#include "DynamicFloatArray.h" + +void new_DFA(DynamicFloatArray **dfa, size_t cap) { + *dfa = (DynamicFloatArray *)malloc(sizeof(DynamicFloatArray)); + (*dfa)->size = 0; + (*dfa)->capacity = cap; + (*dfa)->array = (float*)malloc(sizeof(float)*cap); + } + +void convertDFAtoFloats(DynamicFloatArray *dfa, float **data) +{ + size_t size = dfa->size; + if(size>0) + *data = (float*)malloc(size * sizeof(float)); + else + *data = NULL; + memcpy(*data, dfa->array, size*sizeof(float)); +} + +void free_DFA(DynamicFloatArray *dfa) +{ + free(dfa->array); + free(dfa); +} + +float getDFA_Data(DynamicFloatArray *dfa, size_t pos) +{ + if(pos>=dfa->size) + { + printf("Error: wrong position of DIA.\n"); + exit(0); + } + return dfa->array[pos]; +} + +void addDFA_Data(DynamicFloatArray *dfa, float value) +{ + if(dfa->size==dfa->capacity) + { + dfa->capacity *= 2; + dfa->array = (float *)realloc(dfa->array, dfa->capacity*sizeof(float)); + } + dfa->array[dfa->size] = value; + dfa->size++; +} diff --git a/qtensor/compression/szx/src/DynamicIntArray.c b/qtensor/compression/szx/src/DynamicIntArray.c new file mode 100644 index 00000000..347e3a18 --- /dev/null +++ b/qtensor/compression/szx/src/DynamicIntArray.c @@ -0,0 +1,57 @@ +/** + * @file DynamicIntArray.c + * @author Sheng Di + * @date May, 2016 + * @brief Dynamic Int Array + * (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + */ + +#include +#include +#include +#include "DynamicIntArray.h" + +void new_DIA(DynamicIntArray **dia, size_t cap) { + *dia = (DynamicIntArray *)malloc(sizeof(DynamicIntArray)); + (*dia)->size = 0; + (*dia)->capacity = cap; + (*dia)->array = (unsigned char*)malloc(sizeof(unsigned char)*cap); + } + +void convertDIAtoInts(DynamicIntArray *dia, unsigned char **data) +{ + size_t size = dia->size; + if(size>0) + *data = (unsigned char*)malloc(size * sizeof(char)); + else + *data = NULL; + memcpy(*data, dia->array, size*sizeof(unsigned char)); +} + +void free_DIA(DynamicIntArray *dia) +{ + free(dia->array); + free(dia); +} + +int getDIA_Data(DynamicIntArray *dia, size_t pos) +{ + if(pos>=dia->size) + { + printf("Error: wrong position of DIA.\n"); + exit(0); + } + return dia->array[pos]; +} + +inline void addDIA_Data(DynamicIntArray *dia, int value) +{ + if(dia->size==dia->capacity) + { + dia->capacity = dia->capacity << 1; + dia->array = (unsigned char *)realloc(dia->array, dia->capacity*sizeof(unsigned char)); + } + dia->array[dia->size] = (unsigned char)value; + dia->size ++; +} diff --git a/qtensor/compression/szx/src/MultiLevelCacheTableWideInterval.c b/qtensor/compression/szx/src/MultiLevelCacheTableWideInterval.c new file mode 100644 index 00000000..d137115f --- /dev/null +++ b/qtensor/compression/szx/src/MultiLevelCacheTableWideInterval.c @@ -0,0 +1,125 @@ +/** + * @file MultiLevelCacheTableWideInterval.h + * @author Xiangyu Zou, Tao Lu, Wen Xia, Xuan Wang, Weizhe Zhang, Sheng Di, Dingwen Tao + * @date Jan, 2019 + * @brief Header file. + * (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. 
+ */ + +#include +#include "MultiLevelCacheTableWideInterval.h" + +void freeTopLevelTableWideInterval(struct TopLevelTableWideInterval* topTable) +{ + for(int i=topTable->topIndex-topTable->baseIndex; i>=0; i--) + { + struct SubLevelTableWideInterval* processingSubTable = &topTable->subTables[i]; + free(processingSubTable->table); + } + free(topTable->subTables); +} + +uint16_t MLCTWI_GetExpoIndex(double value){ + uint64_t* ptr = (uint64_t*)&value; + return (*ptr) >> 52; +} + +uint16_t MLCTWI_GetRequiredBits(double precision){ + uint64_t* ptr = (uint64_t*)&precision; + return -(((*ptr) >> 52) - 1023); +} + +uint64_t MLCTWI_GetMantiIndex(double value, int bits){ + uint64_t* ptr = (uint64_t*)&value; + (*ptr) = (*ptr) << 12 >> 12; + int shift = 64 - 12 - bits; + if(shift > 0){ + return (*ptr) >> shift; + }else{ + return (*ptr); + } +} + +double MLTCWI_RebuildDouble(uint16_t expo, uint64_t manti, int bits){ + double result = 0; + uint64_t *ptr = (uint64_t*)&result; + *ptr = expo; + (*ptr) = (*ptr) << 52; + (*ptr) += (manti << (52-bits)); + return result; +} + +void MultiLevelCacheTableWideIntervalBuild(struct TopLevelTableWideInterval* topTable, double* precisionTable, int count, double precision, int plus_bits){ + uint16_t bits = MLCTWI_GetRequiredBits(precision) + plus_bits; + topTable->bits = bits; + topTable->bottomBoundary = precisionTable[1]/(1+precision); + topTable->topBoundary = precisionTable[count-1]/(1-precision); + topTable->baseIndex = MLCTWI_GetExpoIndex(topTable->bottomBoundary); + topTable->topIndex = MLCTWI_GetExpoIndex(topTable->topBoundary); + int subTableCount = topTable->topIndex - topTable->baseIndex + 1; + topTable->subTables = (struct SubLevelTableWideInterval*)malloc(sizeof(struct SubLevelTableWideInterval) * subTableCount); + memset(topTable->subTables, 0, sizeof(struct SubLevelTableWideInterval) * subTableCount); + + for(int i=topTable->topIndex-topTable->baseIndex; i>=0; i--){ + struct SubLevelTableWideInterval* processingSubTable = &topTable->subTables[i]; + + uint32_t maxIndex = 0; + for(int j=0; jtopIndex = maxIndex; + processingSubTable->baseIndex = 0; + + uint64_t subTableLength = processingSubTable->topIndex - processingSubTable-> baseIndex+ 1; + processingSubTable->table = (uint16_t*)malloc(sizeof(uint16_t) * subTableLength); + memset(processingSubTable->table, 0, sizeof(uint16_t) * subTableLength); + processingSubTable->expoIndex = topTable->baseIndex + i; + } + + + uint32_t index = 0; + bool flag = false; + for(uint16_t i = 0; i<=topTable->topIndex-topTable->baseIndex; i++){ + struct SubLevelTableWideInterval* processingSubTable = &topTable->subTables[i]; + uint16_t expoIndex = i+topTable->baseIndex; + for(uint32_t j = 0; j<=processingSubTable->topIndex - processingSubTable->baseIndex; j++){ + uint64_t mantiIndex = j + processingSubTable->baseIndex; + double sampleBottom = MLTCWI_RebuildDouble(expoIndex, mantiIndex, topTable->bits); + double sampleTop = MLTCWI_RebuildDouble(expoIndex, mantiIndex+1, topTable->bits); + double bottomBoundary = precisionTable[index] / (1+precision); + double topBoundary = precisionTable[index] / (1-precision); + if(sampleTop < topBoundary && sampleBottom > bottomBoundary){ + processingSubTable->table[j] = index; + flag = true; + }else{ + if(flag && index < count-1){ + index++; + processingSubTable->table[j] = index; + }else{ + processingSubTable->table[j] = 0; + } + } + } + } + +} + +uint32_t MultiLevelCacheTableWideIntervalGetIndex(double value, struct TopLevelTableWideInterval* topLevelTable){ + uint16_t expoIndex = 
MLCTWI_GetExpoIndex(value); + if(expoIndex <= topLevelTable->topIndex && expoIndex >= topLevelTable->baseIndex){ + struct SubLevelTableWideInterval* subLevelTable = &topLevelTable->subTables[expoIndex-topLevelTable->baseIndex]; + uint64_t mantiIndex = MLCTWI_GetMantiIndex(value, topLevelTable->bits); + return subLevelTable->table[mantiIndex - subLevelTable->baseIndex]; + + } + return 0; +} + +void MultiLevelCacheTableWideIntervalFree(struct TopLevelTableWideInterval* table){ + for(int i=0; itopIndex - table->baseIndex + 1; i++){ + free(table->subTables[i].table); + } + free(table->subTables); +} + diff --git a/qtensor/compression/szx/src/README_python.md b/qtensor/compression/szx/src/README_python.md new file mode 100644 index 00000000..e71bf518 --- /dev/null +++ b/qtensor/compression/szx/src/README_python.md @@ -0,0 +1,30 @@ +# Using the Python Wrapper for QC Compression +### Steps to Build: +1. Clone the repository, switch to threshold_integrate branch + +2. Change directory to "SZx/szx/src/" + +3. Run the following NVCC command: +nvcc --shared --compiler-options '-fPIC' -I ../include/ -I $CUDA_SAMPLES_PATH -o cuszx_wrapper.so *.cu *.c + + - $CUDA_SAMPLES_PATH should be the path to the include/ directory of CUDA's samples + +### Using the Python API: +**def cuszx_device_compress(oriData, outSize, absErrBound, nbEle, blockSize,threshold)** +- Parameters: + - oriData: CUPY array to be compressed, should be flattened to 1-D + - outSize: CTypes size_t pointer, will store the resulting compressed data size in bytes + - absErrBound: Float, the relative-to-value-range error bound for compression + - nbEle: Integer, number of data elements + - blockSize: Integer, cuSZx runtime parameter (recommended value = 256) + - threshold: Float, the relative-to-value-range threshold for compression +- Returns: + - o_bytes: GPU device pointer to compressed bytes + - outSize: See 'Parameters' + +**def cuszx_device_decompress(nbEle, cmpBytes)** +- Parameters: + - nbEle: Integer, number of data elements + - cmpBytes: GPU device pointer to compressed bytes +- Returns: + - newData: GPU float pointer (CTypes) to decompressed data \ No newline at end of file diff --git a/qtensor/compression/szx/src/cuszx_entry.cu b/qtensor/compression/szx/src/cuszx_entry.cu new file mode 100644 index 00000000..b6894760 --- /dev/null +++ b/qtensor/compression/szx/src/cuszx_entry.cu @@ -0,0 +1,978 @@ +#include "cuszx_entry.h" +#include "szx_defines.h" +#include "szx_BytesToolkit.h" +#include "szx_TypeManager.h" +#include "timingGPU.h" +#include "szx.h" + +#define SPARSITY_LEVEL 0.25 + +TimingGPU timer_GPU; +void bin(unsigned n) +{ + unsigned i; + for (i = 1 << 31; i > 0; i = i / 2) + (n & i) ? 
printf("1") : printf("0"); +} + +__host__ __device__ size_t convert_state_to_out(unsigned char* meta, size_t length, unsigned char *result){ + size_t out_length; + + if(length%4==0) + out_length = length/4; + else + out_length = length/4+1; + + for (size_t i = 0; i < out_length; i++) + { + uint8_t tmp = 0; + + for (size_t j = 0; j < 4; j++) + { + if (i*4 + j < length) + { + tmp |= (0x03 & meta[i*4+j]) << 2*j; + } + + } + result[i] = tmp; + } + return out_length; +} + +// nbBlocks, r, stateNBBytes, stateArray +__host__ __device__ size_t convert_out_to_state(size_t nbBlocks, unsigned char* cmp, unsigned char* out_state){ + size_t state_length; + if(nbBlocks%4==0) + state_length = nbBlocks/4; + else + state_length = nbBlocks/4+1; + + for (size_t i = 0; i < state_length; i++) + { + for (size_t j = 0; j < 4; j++) + { + if (4*i + j < nbBlocks) + { + out_state[4*i + j]= (cmp[i] >> 2*j) & 0x03; + } + + } + } + return nbBlocks; +} + +__host__ __device__ size_t convert_block2_to_out(unsigned char *result, uint32_t numBlocks, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig){ + size_t out_length = 0; + memcpy(result, blk_idx, numBlocks*sizeof(uint32_t)); + out_length += numBlocks*4; + memcpy(result+out_length, blk_vals, num_sig*sizeof(float)); + out_length += num_sig*sizeof(float); + memcpy(result+out_length, blk_subidx, num_sig*sizeof(uint8_t)); + out_length += num_sig*sizeof(uint8_t); + memcpy(result+out_length, blk_sig, numBlocks*sizeof(uint8_t)); + out_length+= numBlocks*sizeof(uint8_t); + + return out_length; +} + +__host__ __device__ size_t convert_out_to_block2(unsigned char *in_cmp, uint32_t numBlocks, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig){ + size_t out_length = 0; + memcpy(blk_idx, in_cmp, numBlocks*sizeof(uint32_t)); + out_length += numBlocks*4; + memcpy(blk_vals, in_cmp+out_length,num_sig*sizeof(float)); + out_length += num_sig*sizeof(float); + memcpy(blk_subidx, in_cmp+out_length, num_sig*sizeof(uint8_t)); + out_length += num_sig*sizeof(uint8_t); + memcpy(blk_sig, in_cmp+out_length, numBlocks*sizeof(uint8_t)); + out_length += numBlocks*sizeof(uint8_t); + + return out_length; +} + +int _post_proc(float *oriData, unsigned char *meta, short *offsets, unsigned char *midBytes, unsigned char *outBytes, size_t nbEle, int blockSize, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig) +{ + int out_size = 0; + + size_t nbConstantBlocks = 0; + size_t nbBlocks = nbEle/blockSize; + size_t ncBytes = blockSize/4; + size_t mSize = sizeof(float)+1+ncBytes; //Number of bytes for each data block's metadata. 
+ out_size += 5+sizeof(size_t)+sizeof(float)*nbBlocks; + if (nbBlocks%8==0) + out_size += nbBlocks/8; + else + out_size += nbBlocks/8+1; + int s0 = 0; + int s1 = 0; + int s2 = 0; + int s3 = 0; + for (int i=0; i>>(d_oriData, threshold, nbEle); + // cudaDeviceSynchronize(); + dim3 dimBlock(32, blockSize/32); + dim3 dimGrid(65536, 1); + const int sMemsize = blockSize * sizeof(float) + dimBlock.y * sizeof(int); + compress_float<<>>(d_oriData, d_meta, d_offsets, d_midBytes, absErrBound, blockSize, nbBlocks, mSize, sparsity_level, d_blk_idx, d_blk_subidx,d_blk_vals, threshold, d_blk_sig); + cudaError_t err = cudaGetLastError(); // Get error code + printf("CUDA Error: %s\n", cudaGetErrorString(err)); + printf("GPU compression timing: %f ms\n", timer_GPU.GetCounter()); + cudaDeviceSynchronize(); + get_numsig<<<1,1>>>(d_num_sig); + cudaDeviceSynchronize(); + + checkCudaErrors(cudaMemcpy(num_sig, d_num_sig, sizeof(uint64_t), cudaMemcpyDeviceToHost)); + + blk_idx = (uint32_t *)malloc(nbBlocks*sizeof(uint32_t)); + blk_vals= (float *)malloc((*num_sig)*sizeof(float)); + blk_subidx = (uint8_t *)malloc((*num_sig)*sizeof(uint8_t)); + blk_sig = (uint8_t *)malloc(nbBlocks*sizeof(uint8_t)); + + checkCudaErrors(cudaMemcpy(meta, d_meta, msz, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(offsets, d_offsets, nbBlocks*sizeof(short), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(midBytes, d_midBytes, mbsz, cudaMemcpyDeviceToHost)); + + + checkCudaErrors(cudaMemcpy(blk_idx, d_blk_idx, nbBlocks*sizeof(uint32_t), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(blk_vals,d_blk_vals, (*num_sig)*sizeof(float), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(blk_subidx,d_blk_subidx, (*num_sig)*sizeof(uint8_t), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(blk_sig,d_blk_sig, (nbBlocks)*sizeof(uint8_t), cudaMemcpyDeviceToHost)); + + size_t maxPreservedBufferSize = sizeof(float)*nbEle; + unsigned char* outBytes = (unsigned char*)malloc(maxPreservedBufferSize); + memset(outBytes, 0, maxPreservedBufferSize); + + outSize = (size_t *)malloc(sizeof(size_t)); + //outSize[0] = _post_proc(oriData, meta, offsets, midBytes, outBytes, nbEle, blockSize, *num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); + + *outSize = _post_proc(oriData, meta, offsets, midBytes, outBytes, nbEle, blockSize, *num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); +// printf("Beginning free\n"); + printf("outsize %p \n", outBytes); + free(blk_idx); + free(blk_subidx); + free(blk_vals); + free(meta); + free(offsets); + free(midBytes); + checkCudaErrors(cudaFree(d_meta)); + checkCudaErrors(cudaFree(d_offsets)); + checkCudaErrors(cudaFree(d_midBytes)); + return outBytes; +} + +void cuSZx_fast_decompress_args_unpredictable_blocked_float(float** newData, size_t nbEle, unsigned char* cmpBytes) +{ + uint32_t *blk_idx, *d_blk_idx; + uint8_t *blk_subidx, *d_blk_subidx; + uint8_t *blk_sig, *d_blk_sig; + float *blk_vals, *d_blk_vals; + size_t num_sig, *d_num_sig; + + *newData = (float*)malloc(sizeof(float)*nbEle); + memset(*newData, 0, sizeof(float)*nbEle); + + unsigned char* r = cmpBytes; + r += 4; + int blockSize = r[0]; //get block size + if(blockSize == 0)blockSize = 256; + r++; + size_t nbConstantBlocks = bytesToLong_bigEndian(r); //get number of constant blocks + r += sizeof(size_t); + num_sig = bytesToSize(r); + r += sizeof(size_t); + size_t nbBlocks = nbEle/blockSize; + size_t ncBlocks = 0; + size_t num_state2_blks = 0; + // size_t ncBlocks = nbBlocks - nbConstantBlocks; //get number of constant blocks + size_t stateNBBytes 
= nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; + size_t ncLeading = blockSize/4; + size_t mSize = sizeof(float)+1+ncLeading; //Number of bytes for each data block's metadata. + unsigned char* stateArray = (unsigned char*)malloc(nbBlocks); + unsigned char* d_stateArray; + cudaMalloc(&d_stateArray, nbBlocks); + float* constantMedianArray = (float*)malloc(nbConstantBlocks*sizeof(float)); + + + + blk_idx = (uint32_t *)malloc(nbBlocks*sizeof(uint32_t)); + blk_vals= (float *)malloc((num_sig)*sizeof(float)); + blk_subidx = (uint8_t *)malloc((num_sig)*sizeof(uint8_t)); + blk_sig = (uint8_t *)malloc(nbBlocks*sizeof(uint8_t)); + + printf("Converting state array\n"); + convert_out_to_state(nbBlocks, r, stateArray); + // convertByteArray2IntArray_fast_1b_args(nbBlocks, r, stateNBBytes, stateArray); //get the stateArray + for (size_t i = 0; i < nbBlocks; i++) + { + if (stateArray[i] == 2) + { + num_state2_blks++; + }else if(stateArray[i] == 3){ + ncBlocks++; + } + } + + r += stateNBBytes; + unsigned char* data = (unsigned char*)malloc(ncBlocks*blockSize*sizeof(float)); + memset(data, 0, ncBlocks*blockSize*sizeof(float)); + printf("converting block vals\n"); + size_t to_add = convert_out_to_block2(r, nbBlocks, (uint64_t)num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); + r+= to_add; + // checkCudaErrors(cudaMalloc((void **)&d_num_sig, sizeof(uint64_t))); + // num_sig = (uint64_t *)malloc(sizeof(uint64_t)); + checkCudaErrors(cudaMalloc((void **)&d_blk_idx, nbBlocks*sizeof(uint32_t))); + // blk_idx = malloc() + checkCudaErrors(cudaMalloc((void **)&d_blk_subidx, num_sig*sizeof(uint8_t))); + + checkCudaErrors(cudaMalloc((void **)&d_blk_vals, num_sig*sizeof(float))); + + checkCudaErrors(cudaMalloc((void **)&d_blk_sig, nbBlocks*sizeof(uint8_t))); + + checkCudaErrors(cudaMemcpy(d_blk_idx, blk_idx, nbBlocks*sizeof(uint32_t), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_blk_vals, blk_vals, (num_sig)*sizeof(float), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_blk_subidx, blk_subidx, (num_sig)*sizeof(uint8_t), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_stateArray, stateArray, nbBlocks, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_blk_sig, blk_sig, nbBlocks*sizeof(uint8_t), cudaMemcpyHostToDevice)); + + + size_t i = 0, j = 0, k = 0; //k is used to keep track of constant block index + memcpy((*newData)+nbBlocks*blockSize, r, (nbEle%blockSize)*sizeof(float)); + r += (nbEle%blockSize)*sizeof(float); + float* fr = (float*)r; //fr is the starting address of constant median values. 
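+    /*
+     * Descriptive note (added for clarity, not in the original cuSZx source):
+     * the order in which `r` has advanced above mirrors the compressed stream
+     * layout:
+     *   [4 bytes][1-byte block size][size_t nbConstantBlocks][size_t num_sig]
+     *   [2-bit per-block state array][sparse block data: blk_idx/vals/subidx/sig]
+     *   [nbEle % blockSize trailing raw floats]
+     * From here on, the remaining bytes are the constant-block medians (read in
+     * the loop below), then one short length per non-constant block, followed by
+     * the non-constant block payloads copied into `data`.
+     */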
+ for(i = 0;i < nbConstantBlocks;i++, j+=4) //get the median values for constant-value blocks + constantMedianArray[i] = fr[i]; + r += nbConstantBlocks*sizeof(float); + unsigned char* p = r + ncBlocks * sizeof(short); + for(i = 0;i < ncBlocks;i++){ + int leng = (int)bytesToShort(r)+mSize; + r += sizeof(short); + if (leng > blockSize*sizeof(float)) + { + printf("Warning: compressed block is larger than the original block!\n"); + exit(0); + } + memcpy(data+i*blockSize*sizeof(float), p, leng); + p += leng; + } + + unsigned char* d_data; + float *d_newdata; + checkCudaErrors(cudaMalloc((void**)&d_data, ncBlocks*blockSize*sizeof(float))); + checkCudaErrors(cudaMemcpy(d_data, data, ncBlocks*blockSize*sizeof(float), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMalloc(&d_newdata, nbBlocks*blockSize*sizeof(float))); + + timer_GPU.StartCounter(); + dim3 dimBlock(32, blockSize/32); + dim3 dimGrid(65536, 1); + const int sMemsize = blockSize * sizeof(float) + dimBlock.y * sizeof(int); + decompress_state2<<>>(d_newdata, d_stateArray,d_blk_idx, d_blk_vals, d_blk_subidx,blockSize, d_blk_sig); + decompress_float<<>>(d_data, blockSize, ncBlocks, mSize); + cudaError_t err = cudaGetLastError(); // Get error code + printf("CUDA Error: %s\n", cudaGetErrorString(err)); + printf("GPU decompression timing: %f ms\n", timer_GPU.GetCounter()); + cudaDeviceSynchronize(); + checkCudaErrors(cudaMemcpy(data, d_data, ncBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(*newData, d_newdata, nbBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); + float* fdata = (float*)data; + + int nb=0, nc=0; + for (i=0;i1) printf("data%i:%f\n",i, Median); + for (j=0;j>56); + b[1] = (unsigned char)(num>>48); + b[2] = (unsigned char)(num>>40); + b[3] = (unsigned char)(num>>32); + b[4] = (unsigned char)(num>>24); + b[5] = (unsigned char)(num>>16); + b[6] = (unsigned char)(num>>8); + b[7] = (unsigned char)(num); +// if(dataEndianType==LITTLE_ENDIAN_DATA) +// symTransform_8bytes(*b); +} + +__device__ inline void shortToBytes_d(unsigned char* b, short value) +{ + lint16 buf; + buf.svalue = value; + memcpy(b, buf.byte, 2); +} + +__global__ void device_post_proc(size_t *outSize, float *oriData, unsigned char *meta, short *offsets, unsigned char *midBytes, unsigned char *outBytes, size_t nbEle, int blockSize, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig) +{ + int out_size = 0; + + size_t nbConstantBlocks = 0; + size_t nbBlocks = nbEle/blockSize; + size_t ncBytes = blockSize/4; + size_t mSize = sizeof(float)+1+ncBytes; //Number of bytes for each data block's metadata. 
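+    /*
+     * Note (added for clarity, not in the original source): this kernel is the
+     * device-resident counterpart of _post_proc above. It is launched with a
+     * single thread (<<<1,1>>>) later in this file so that the packed output
+     * buffer can remain in GPU memory instead of being staged through host
+     * allocations and memcpys.
+     */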
+ out_size += 5+sizeof(size_t)+sizeof(float)*nbBlocks; + if (nbBlocks%8==0) + out_size += nbBlocks/8; + else + out_size += nbBlocks/8+1; + int s0 = 0; + int s1 = 0; + int s2 = 0; + int s3 = 0; + for (int i=0; i>>(d_oriData, threshold, nbEle); + // cudaDeviceSynchronize(); + dim3 dimBlock(32, blockSize/32); + dim3 dimGrid(65536, 1); + const int sMemsize = blockSize * sizeof(float) + dimBlock.y * sizeof(int); + compress_float<<>>(d_oriData, d_meta, d_offsets, d_midBytes, absErrBound, blockSize, nbBlocks, mSize, sparsity_level, d_blk_idx, d_blk_subidx,d_blk_vals, threshold, d_blk_sig); + cudaError_t err = cudaGetLastError(); // Get error code + printf("CUDA Error: %s\n", cudaGetErrorString(err)); + printf("GPU compression timing: %f ms\n", timer_GPU.GetCounter()); + cudaDeviceSynchronize(); + get_numsig<<<1,1>>>(d_num_sig); + cudaDeviceSynchronize(); + + checkCudaErrors(cudaMemcpy(num_sig, d_num_sig, sizeof(uint64_t), cudaMemcpyDeviceToHost)); + + // These are allocations and memcpys to host pointers, do not need them + + // blk_idx = (uint32_t *)malloc(nbBlocks*sizeof(uint32_t)); + // blk_vals= (float *)malloc((*num_sig)*sizeof(float)); + // blk_subidx = (uint8_t *)malloc((*num_sig)*sizeof(uint8_t)); + // blk_sig = (uint8_t *)malloc(nbBlocks*sizeof(uint8_t)); + + // checkCudaErrors(cudaMemcpy(meta, d_meta, msz, cudaMemcpyDeviceToHost)); + // checkCudaErrors(cudaMemcpy(offsets, d_offsets, nbBlocks*sizeof(short), cudaMemcpyDeviceToHost)); + // checkCudaErrors(cudaMemcpy(midBytes, d_midBytes, mbsz, cudaMemcpyDeviceToHost)); + + + // checkCudaErrors(cudaMemcpy(blk_idx, d_blk_idx, nbBlocks*sizeof(uint32_t), cudaMemcpyDeviceToHost)); + // checkCudaErrors(cudaMemcpy(blk_vals,d_blk_vals, (*num_sig)*sizeof(float), cudaMemcpyDeviceToHost)); + // checkCudaErrors(cudaMemcpy(blk_subidx,d_blk_subidx, (*num_sig)*sizeof(uint8_t), cudaMemcpyDeviceToHost)); + // checkCudaErrors(cudaMemcpy(blk_sig,d_blk_sig, (nbBlocks)*sizeof(uint8_t), cudaMemcpyDeviceToHost)); + + + size_t maxPreservedBufferSize = sizeof(float)*nbEle; + unsigned char *d_outBytes; + // unsigned char* outBytes = (unsigned char*)malloc(maxPreservedBufferSize); + // memset(outBytes, 0, maxPreservedBufferSize); + checkCudaErrors(cudaMalloc(&d_outBytes, maxPreservedBufferSize)); + + size_t *d_outSize; + + checkCudaErrors(cudaMalloc(&d_outSize, sizeof(size_t))); + + device_post_proc<<<1,1>>>(d_outSize, d_oriData, d_meta, d_offsets, d_midBytes, d_outBytes, nbEle, blockSize, *num_sig, d_blk_idx, d_blk_vals, d_blk_subidx, d_blk_sig); + + cudaDeviceSynchronize(); + checkCudaErrors(cudaMemcpy(outSize, d_outSize, sizeof(size_t), cudaMemcpyDeviceToHost)); + + // printf("completed compression\n"); + //free(blk_idx); + //free(blk_subidx); + //free(blk_vals); + // free(meta); + // free(offsets); + // free(midBytes); + checkCudaErrors(cudaFree(d_meta)); + checkCudaErrors(cudaFree(d_offsets)); + checkCudaErrors(cudaFree(d_midBytes)); +// printf("completed compression\n"); + return d_outBytes; +} + +__device__ inline long bytesToLong_bigEndian(unsigned char* b) { + long temp = 0; + long res = 0; + + res <<= 8; + temp = b[0] & 0xff; + res |= temp; + + res <<= 8; + temp = b[1] & 0xff; + res |= temp; + + res <<= 8; + temp = b[2] & 0xff; + res |= temp; + + res <<= 8; + temp = b[3] & 0xff; + res |= temp; + + res <<= 8; + temp = b[4] & 0xff; + res |= temp; + + res <<= 8; + temp = b[5] & 0xff; + res |= temp; + + res <<= 8; + temp = b[6] & 0xff; + res |= temp; + + res <<= 8; + temp = b[7] & 0xff; + res |= temp; + + return res; +} + +__device__ inline size_t 
bytesToSize(unsigned char* bytes) +{ + size_t result = bytesToLong_bigEndian(bytes);//8 + return result; +} + +__device__ inline short bytesToShort(unsigned char* bytes) +{ + lint16 buf; + memcpy(buf.byte, bytes, 2); + + return buf.svalue; +} + +__global__ void decompress_get_stats(float *newData, size_t nbEle, unsigned char* cmpBytes, + size_t *numSigValues, int *bs, + size_t *numConstantBlks, size_t *numBlks, + size_t *mSizeptr, unsigned char *newCmpBytes +){ + unsigned char* r = cmpBytes; + size_t num_sig; + r += 4; + int blockSize = r[0]; //get block size + + if(blockSize == 0)blockSize = 256; + r++; + size_t nbConstantBlocks = bytesToLong_bigEndian(r); //get number of constant blocks + r += sizeof(size_t); + num_sig = bytesToSize(r); + + r += sizeof(size_t); + size_t nbBlocks = nbEle/blockSize; + size_t ncBlocks = 0; + size_t num_state2_blks = 0; + // size_t ncBlocks = nbBlocks - nbConstantBlocks; //get number of constant blocks + size_t stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; + size_t ncLeading = blockSize/4; + size_t mSize = sizeof(float)+1+ncLeading; //Number of bytes for each data block's metadata. + + *mSizeptr = mSize; + + *numConstantBlks = nbConstantBlocks; + *numBlks = nbBlocks; + *numSigValues = num_sig; + *bs = blockSize; + newCmpBytes = r; + // printf("nb blocks: %d\n", nbBlocks); + +} + +__global__ void setup_data_stateArray(float *newData, size_t nbEle, unsigned char* r, + size_t num_sig, int blockSize, + size_t nbConstantBlocks, size_t nbBlocks, size_t *ncBlks, + unsigned char *stateArray, unsigned char *newR +){ + r += 4; + r++; + r += sizeof(size_t); + r += sizeof(size_t); + size_t ncBlocks = 0; + size_t stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; + size_t num_state2_blks = 0; + printf("Converting state array\n"); + // printf("cmp %d\n", (int)r[0]); + // printf("state %d\n", (int)stateArray[0]); + convert_out_to_state(nbBlocks, r, stateArray); + // printf("state %d\n", (int)stateArray[0]); + // convertByteArray2IntArray_fast_1b_args(nbBlocks, r, stateNBBytes, stateArray); //get the stateArray + for (size_t i = 0; i < nbBlocks; i++) + { + if (stateArray[i] == 2) + { + num_state2_blks++; + }else if(stateArray[i] == 3){ + ncBlocks++; + } + } + + r += stateNBBytes; + newR = r; + *ncBlks = ncBlocks; +} + +__global__ void decompress_startup(float *newData, size_t nbEle, unsigned char* r, + uint32_t *blk_idx, uint8_t *blk_subidx, uint8_t *blk_sig, + float *blk_vals, size_t num_sig, int blockSize, + size_t nbConstantBlocks, size_t nbBlocks, size_t ncBlocks, + unsigned char *stateArray, float* constantMedianArray, unsigned char *data, + size_t mSize, unsigned char *newCmpBytes +){ + /** + * Structures to return: + * blk_idx, blk_subidx, blk_sig, blk_vals, numSigValues (pointer) + * bs (pointer to blockSize), numConstantBlks (pointer), numBlks (pointer) + * ncBlks (pointer), stateArray, constantMedianArray + */ + + // size_t ncBlocks = 0; + // size_t stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; + // size_t num_state2_blks = 0; + // printf("Converting state array\n"); + // convert_out_to_state(nbBlocks, r, stateArray); + // printf("state %d\n", (int)stateArray[0]); + // // convertByteArray2IntArray_fast_1b_args(nbBlocks, r, stateNBBytes, stateArray); //get the stateArray + // for (size_t i = 0; i < nbBlocks; i++) + // { + // if (stateArray[i] == 2) + // { + // num_state2_blks++; + // }else if(stateArray[i] == 3){ + // ncBlocks++; + // } + // } + size_t stateNBBytes = nbBlocks%4==0 ? 
nbBlocks/4 : nbBlocks/4+1; + r += 4; + r++; + r += sizeof(size_t); + r += sizeof(size_t); + r += stateNBBytes; + // data = (unsigned char*)malloc(ncBlocks*blockSize*sizeof(float)); + // memset(data, 0, ncBlocks*blockSize*sizeof(float)); + // printf("converting block vals %d\n", data[0]); + size_t to_add = convert_out_to_block2(r, nbBlocks, (uint64_t)num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); + r+= to_add; + + size_t i = 0, j = 0, k = 0; //k is used to keep track of constant block index + + // printf("before mallocs in kernel\n"); + + memcpy((newData)+nbBlocks*blockSize, r, (nbEle%blockSize)*sizeof(float)); + + // printf("before mallocs in kernel\n"); + r += (nbEle%blockSize)*sizeof(float); + float* fr = (float*)r; //fr is the starting address of constant median values. + for(i = 0;i < nbConstantBlocks;i++, j+=4) //get the median values for constant-value blocks + constantMedianArray[i] = fr[i]; + r += nbConstantBlocks*sizeof(float); + unsigned char* p = r + ncBlocks * sizeof(short); + for(i = 0;i < ncBlocks;i++){ + int leng = (int)bytesToShort(r)+mSize; + r += sizeof(short); + if (leng > blockSize*sizeof(float)) + { + printf("Warning: compressed block is larger than the original block!\n"); + return; + // exit(0); + } + memcpy(data+i*blockSize*sizeof(float), p, leng); + p += leng; + } + + newCmpBytes = r; + // printf("before mallocs in kernel\n"); + + // printf("nb blocks: %d\n", nbBlocks); +} + +__global__ void decompress_post_proc(unsigned char *data, float *newData, int blockSize, + size_t nbBlocks, size_t ncBlocks, unsigned char *stateArray, + float *constantMedianArray +){ + // checkCudaErrors(cudaMemcpy(data, d_data, ncBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); + // checkCudaErrors(cudaMemcpy(*newData, d_newdata, nbBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); + float* fdata = (float*)data; + int i,j; + int nb=0, nc=0; + for (i=0;i1) printf("data%i:%f\n",i, Median); + for (j=0;j>>(newData, nbEle, cmpBytes, + num_sig, blockSize, + nbConstantBlocks, nbBlocks, + mSize, cmpBytes + ); + cudaDeviceSynchronize(); + + cudaError_t err = cudaGetLastError(); // Get error code + printf("CUDA Error: %s\n", cudaGetErrorString(err)); + checkCudaErrors(cudaMemcpy(&nbBlocks_h, nbBlocks, sizeof(size_t), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(&nbConstantBlocks_h, nbConstantBlocks, sizeof(size_t), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(&bs, blockSize, sizeof(int), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(&mSize_h, mSize, sizeof(size_t), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(&num_sig_h, num_sig, sizeof(size_t), cudaMemcpyDeviceToHost)); + + + checkCudaErrors(cudaMalloc((void**)&stateArray, nbBlocks_h)); + checkCudaErrors(cudaMalloc((void**)&constantMedianArray, nbConstantBlocks_h*sizeof(float))); + + checkCudaErrors(cudaMalloc((void**)&blk_idx, nbBlocks_h*sizeof(uint32_t))); + checkCudaErrors(cudaMalloc((void**)&blk_vals, num_sig_h*sizeof(float))); + checkCudaErrors(cudaMalloc((void**)&blk_subidx, num_sig_h*sizeof(uint8_t))); + checkCudaErrors(cudaMalloc((void**)&blk_sig, nbBlocks_h*sizeof(uint8_t))); + + setup_data_stateArray<<<1,1>>>(newData, nbEle, cmpBytes, + num_sig_h, bs, + nbConstantBlocks_h, nbBlocks_h, ncBlocks, + stateArray, cmpBytes + ); + cudaDeviceSynchronize(); + checkCudaErrors(cudaMemcpy(&ncBlocks_h, ncBlocks, sizeof(size_t), cudaMemcpyDeviceToHost)); + + checkCudaErrors(cudaMalloc((void**)&data, ncBlocks_h*bs*sizeof(float))); + // cmpBytes = newCmpBytes; + // data = (unsigned 
char*)malloc(ncBlocks*blockSize*sizeof(float)); + // memset(data, 0, ncBlocks*blockSize*sizeof(float)); + // stateArray = (unsigned char*)malloc(nbBlocks); + + // // unsigned char* d_stateArray; + // // cudaMalloc(&d_stateArray, nbBlocks); + // constantMedianArray = (float*)malloc(nbConstantBlocks*sizeof(float)); + + // blk_idx = (uint32_t *)malloc(nbBlocks*sizeof(uint32_t)); + // blk_vals= (float *)malloc((num_sig)*sizeof(float)); + // blk_subidx = (uint8_t *)malloc((num_sig)*sizeof(uint8_t)); + // blk_sig = (uint8_t *)malloc(nbBlocks*sizeof(uint8_t)); + + //test_nbBlks = (size_t *)malloc(sizeof(size_t)); + // printf("malloc\n"); + decompress_startup<<<1,1>>>(newData, nbEle, cmpBytes, + blk_idx, blk_subidx, blk_sig, + blk_vals, num_sig_h, bs, + nbConstantBlocks_h, nbBlocks_h, ncBlocks_h, + stateArray, constantMedianArray, data, mSize_h, cmpBytes); + cudaDeviceSynchronize(); + // cmpBytes = newCmpBytes; + + + + // unsigned char* d_data; + float *d_newdata; + // checkCudaErrors(cudaMalloc((void**)&d_data, ncBlocks*blockSize*sizeof(float))); + // checkCudaErrors(cudaMemcpy(d_data, data, ncBlocks*blockSize*sizeof(float), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMalloc(&d_newdata, nbBlocks_h*bs*sizeof(float))); + + timer_GPU.StartCounter(); + dim3 dimBlock(32, bs/32); + dim3 dimGrid(65536, 1); + const int sMemsize = bs * sizeof(float) + dimBlock.y * sizeof(int); + decompress_state2<<>>(d_newdata, stateArray,blk_idx, blk_vals, blk_subidx, bs, blk_sig); + decompress_float<<>>(data, bs, ncBlocks_h, mSize_h); + //err = cudaGetLastError(); // Get error code + //printf("CUDA Error: %s\n", cudaGetErrorString(err)); + //printf("GPU decompression timing: %f ms\n", timer_GPU.GetCounter()); + cudaDeviceSynchronize(); + + err = cudaGetLastError(); // Get error code + printf("CUDA Error: %s\n", cudaGetErrorString(err)); + printf("GPU decompression timing: %f ms\n", timer_GPU.GetCounter()); + // checkCudaErrors(cudaMemcpy(data, d_data, ncBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(newData, d_newdata, nbBlocks_h*bs*sizeof(float), cudaMemcpyDeviceToDevice)); + cudaFree(d_newdata); + + decompress_post_proc<<<1,1>>>(data, newData, bs, + nbBlocks_h, ncBlocks_h, stateArray, + constantMedianArray); + cudaDeviceSynchronize(); + + cudaFree(stateArray); + cudaFree(constantMedianArray); + cudaFree(data); + cudaFree(blk_idx); + cudaFree(blk_subidx); + cudaFree(blk_vals); + cudaFree(blk_sig); + return newData; + +} + diff --git a/qtensor/compression/szx/src/cuszx_float.cu b/qtensor/compression/szx/src/cuszx_float.cu new file mode 100644 index 00000000..48be2365 --- /dev/null +++ b/qtensor/compression/szx/src/cuszx_float.cu @@ -0,0 +1,392 @@ +#include +#include +#include "cuszx_float.h" + +#include + +namespace cg = cooperative_groups; + +#define MAX_BLK_SIZE 256 + +__device__ uint32_t num_state2; +__device__ uint64_t total_sig; + +__device__ +void gridReduction_cg(double *results) +{ + int tidx = threadIdx.x; + int tidy = threadIdx.y; + int bid = blockIdx.x; + + if (bid==0){ + double data = results[tidy*gridDim.x+tidx]; + + for (int i=(tidx+blockDim.x); i 0; offset /= 2) + { + if (tidy<2) data = min(data, __shfl_xor_sync(FULL_MASK, data, offset)); + else if (tidy<4) data = max(data, __shfl_xor_sync(FULL_MASK, data, offset)); + else data += __shfl_down_sync(FULL_MASK, data, offset); + } + + if (tidx==0) results[tidy*gridDim.x] = data; + } +} + +__device__ void _IntArray2ByteArray(int leadingNum, int mbase, unsigned char* meta) +{ + leadingNum = leadingNum << 
(3-threadIdx.x%4)*2; + for (int i = 1; i < 4; i *= 2) { + unsigned int mask = 0xffffffff; + leadingNum |= __shfl_down_sync(mask, leadingNum, i); + } + + if (threadIdx.x%4==0) + meta[mbase+threadIdx.y*8+threadIdx.x/4] = (unsigned char)leadingNum; + __syncthreads(); + + +} + +__device__ int _compute_reqLength(int redius, int absErrBound) +{ + int radExpo = (redius & 0x7F800000) >> 23; + radExpo -= 127; + int reqExpo = (absErrBound & 0x7F800000) >> 23; + reqExpo -= 127; + return 9+radExpo-reqExpo+1; +} + +__device__ int _shfl_scan(int lznum, int *sums) +{ + // Below is the basic structure of using a shfl instruction + // for a scan. + // Record "value" as a variable - we accumulate it along the way + int value = lznum; + + // Now accumulate in log steps up the chain + // compute sums, with another thread's value who is + // distance delta away (i). Note + // those threads where the thread 'i' away would have + // been out of bounds of the warp are unaffected. This + // creates the scan sum. + +#pragma unroll + for (int i = 1; i <= warpSize; i *= 2) { + unsigned int mask = 0xffffffff; + int n = __shfl_up_sync(mask, value, i); + + if (threadIdx.x >= i) value += n; + + } + + // value now holds the scan value for the individual thread + // next sum the largest values for each warp + + // write the sum of the warp to smem + if (threadIdx.x == warpSize - 1) { + sums[threadIdx.y] = value; + } + __syncthreads(); + + // + // scan sum the warp sums + // the same shfl scan operation, but performed on warp sums + // + if (threadIdx.y == 0 && threadIdx.x < blockDim.y) { + int warp_sum = sums[threadIdx.x]; + + int mask = (1 << blockDim.y) - 1; + for (int i = 1; i <= blockDim.y; i *= 2) { + //int n = __shfl_up_sync(mask, warp_sum, i, blockDim.y); + int n = __shfl_up_sync(mask, warp_sum, i); + if (threadIdx.x >= i) warp_sum += n; + } + + sums[threadIdx.x] = warp_sum; + } + __syncthreads(); + + // perform a uniform add across warps in the block + // read neighbouring warp's sum and add it to threads value + int blockSum = 0; + if (threadIdx.y > 0) { + blockSum = sums[threadIdx.y - 1]; + } + value += blockSum; + + return value; +} + +__device__ void _compute_oneBlock(unsigned long bbase, int mbase, int obase, int reqLength, float *value, int *ivalue, uchar4 *cvalue, int *sums, unsigned char *meta, short *offsets, unsigned char *midBytes) +{ + int reqBytesLength; + int rightShiftBits; + + + if (reqLength%8 != 0) + { + reqBytesLength = reqLength/8+1; + rightShiftBits = 8 - reqLength%8; + }else{ + reqBytesLength = reqLength/8; + rightShiftBits = 0; + } + + int cur_ivalue = (ivalue[threadIdx.y*blockDim.x+threadIdx.x] >> rightShiftBits) & ((1<<(32-rightShiftBits))-1); + ivalue[threadIdx.y*blockDim.x+threadIdx.x] = cur_ivalue; + __syncthreads(); + + int pre_ivalue = 0; + if (threadIdx.x!=0 || threadIdx.y!=0) pre_ivalue = ivalue[threadIdx.y*blockDim.x+threadIdx.x-1]; + pre_ivalue = cur_ivalue ^ pre_ivalue; + __syncthreads(); + + int leadingNum = 0; + if (reqBytesLength == 2) + { + if (pre_ivalue >> 16 == 0) leadingNum = 2; + else if (pre_ivalue >> 24 == 0) leadingNum = 1; + }else if (reqBytesLength == 3) + { + if (pre_ivalue >> 8 == 0) leadingNum = 3; + else if (pre_ivalue >> 16 == 0) leadingNum = 2; + else if (pre_ivalue >> 24 == 0) leadingNum = 1; + }else if (reqBytesLength == 1) + { + if (pre_ivalue >> 24 == 0) leadingNum = 1; + + }else if (reqBytesLength == 4) + { + if (pre_ivalue == 0) leadingNum = 4; + else if (pre_ivalue >> 8 == 0) leadingNum = 3; + else if (pre_ivalue >> 16 == 0) leadingNum = 2; + else if 
(pre_ivalue >> 24 == 0) leadingNum = 1; + } + //midBytes[bbase+threadIdx.y*blockDim.x+threadIdx.x] = leadingNum; + + int midByte_size = reqBytesLength - leadingNum; + int midByte_sum = _shfl_scan(midByte_size, sums); + uchar4 cur_cvalue = cvalue[threadIdx.y*blockDim.x+threadIdx.x]; + if (reqBytesLength == 2) + { + if (midByte_size == 1){ + midBytes[bbase+midByte_sum-1] = cur_cvalue.z; + }else if (midByte_size == 2){ + midBytes[bbase+midByte_sum-1] = cur_cvalue.w; + midBytes[bbase+midByte_sum-2] = cur_cvalue.z; + } + }else if (reqBytesLength == 3) + { + if (midByte_size == 1){ + midBytes[bbase+midByte_sum-1] = cur_cvalue.y; + }else if (midByte_size == 2){ + midBytes[bbase+midByte_sum-1] = cur_cvalue.z; + midBytes[bbase+midByte_sum-2] = cur_cvalue.y; + }else if (midByte_size == 3){ + midBytes[bbase+midByte_sum-1] = cur_cvalue.w; + midBytes[bbase+midByte_sum-2] = cur_cvalue.z; + midBytes[bbase+midByte_sum-3] = cur_cvalue.y; + } + }else if (reqBytesLength == 1) + { + if (midByte_size == 1) + midBytes[bbase+midByte_sum-1] = cur_cvalue.w; + }else if (reqBytesLength == 4) + { + if (midByte_size == 1){ + midBytes[bbase+midByte_sum-1] = cur_cvalue.x; + }else if (midByte_size == 2){ + midBytes[bbase+midByte_sum-1] = cur_cvalue.y; + midBytes[bbase+midByte_sum-2] = cur_cvalue.x; + }else if (midByte_size == 3){ + midBytes[bbase+midByte_sum-1] = cur_cvalue.z; + midBytes[bbase+midByte_sum-2] = cur_cvalue.y; + midBytes[bbase+midByte_sum-3] = cur_cvalue.x; + }else if (midByte_size == 4){ + midBytes[bbase+midByte_sum-1] = cur_cvalue.w; + midBytes[bbase+midByte_sum-2] = cur_cvalue.z; + midBytes[bbase+midByte_sum-3] = cur_cvalue.y; + midBytes[bbase+midByte_sum-4] = cur_cvalue.x; + } + } + + if (threadIdx.x==0 && threadIdx.y==0) meta[mbase] = (unsigned char)reqLength; + if (threadIdx.x==blockDim.x-1 && threadIdx.y==blockDim.y-1) offsets[obase] = (short)midByte_sum; + _IntArray2ByteArray(leadingNum, mbase+1, meta); + +} + +__global__ void apply_threshold(float *data, float threshold, size_t length){ + + if(threadIdx.x == 0 && blockIdx.x == 0){ + printf("tid threshold: %f\n", threshold); + } + + for (unsigned long tid = threadIdx.x+blockDim.x*blockIdx.x; tid < length; tid+=blockDim.x*gridDim.x) + { + if (fabs(data[tid]) <= threshold) + { + data[tid] = 0.0; + } + } +} + +__global__ void compress_float(float *oriData, unsigned char *meta, short *offsets, unsigned char *midBytes, float absErrBound, int bs, size_t nb, size_t mSize, float sparsity_level, uint32_t *blk_idx, uint8_t *blk_subidx,float *blk_vals, float threshold, uint8_t *blk_sig) +{ + int tidx = threadIdx.x; + int tidy = threadIdx.y; + int bid = blockIdx.x; + + float data, radius, medianValue; + unsigned mask; + unsigned char state; + extern __shared__ float shared[]; + + __shared__ float block_vals[MAX_BLK_SIZE]; + __shared__ uint8_t block_idxs[MAX_BLK_SIZE]; + __shared__ int num_sig; + __shared__ int index; + float* value = shared; + int* ivalue = (int*)shared; + uchar4* cvalue = (uchar4*)shared; + int* sums = &ivalue[bs]; + + //if(threadIdx.x == 0 && blockIdx.x == 0){ +// printf("tid threshold: %f\n", threshold); + // } + + for (unsigned long b=bid; b threshold) + { + int idx = atomicAdd(&num_sig, 1); + block_vals[idx] = oriData[i]; + block_idxs[idx] = (uint8_t) (0xff & (i - (b*bs))); + }else{ + oriData[i] = 0.0; + } + //if(fabs(old) > threshold && oriData[i] ==0.0){ + //printf("something wrong\n"); + //} + } + __syncthreads(); + + data = oriData[b*bs+tidy*warpSize+tidx]; + float Min = data; + float Max = data; + + for (int offset = warpSize/2; 
offset > 0; offset /= 2) + { + Min = min(Min, __shfl_xor_sync(FULL_MASK, Min, offset)); + Max = max(Max, __shfl_xor_sync(FULL_MASK, Max, offset)); + } + if (tidx==0){ + value[tidy] = Min; + value[blockDim.y+tidy] = Max; + } + __syncthreads(); + + if (tidy==0){ + if (tidx < blockDim.y){ + Min = value[tidx]; + Max = value[blockDim.y+tidx]; + } + + mask = __ballot_sync(FULL_MASK, tidx < blockDim.y); + for (int offset = blockDim.y/2; offset > 0; offset /= 2) + { + Min = min(Min, __shfl_xor_sync(mask, Min, offset)); + Max = max(Max, __shfl_xor_sync(mask, Max, offset)); + } + + if (tidx==0){ + radius = (Max - Min)/2; + value[0] = radius; + value[1] = Min + radius; + value[2] = absErrBound; + } + } + __syncthreads(); + + radius = value[0]; + medianValue = value[1]; + + if (num_sig==0) + { + state = 1; // All zeros + }else if( num_sig > 0 && radius <= absErrBound){ + state = 0; // Constant block with non zeros + } else if( ((float) num_sig/(float)bs) <= sparsity_level && num_sig > 0){ + state = 2; // Do grouping, store as-is with bitmap/index + } else{ + state = 3; // Do normal non-constant block + } + + + // state = radius <= absErrBound ? 0 : 1; + if (tidx==0){ + + meta[b] = state; + meta[nb+b*mSize] = cvalue[1].x; + meta[nb+b*mSize+1] = cvalue[1].y; + meta[nb+b*mSize+2] = cvalue[1].z; + meta[nb+b*mSize+3] = cvalue[1].w; + } + __syncthreads(); + int tid = tidx + tidy*blockDim.x; + //if(tid == 0) printf("s %d %d\n", b, (int)state); + if (state==2) + { + int idx = 0; + if (tidx ==0 && tidy == 0) + { + //printf("level: %f\n", ((float)num_sig/(float)bs)); + idx = atomicAdd(&num_state2, (uint32_t)num_sig); + blk_idx[b] = idx; // Store the index of where this block has values and indices within block + blk_sig[b] = (uint8_t) 0xff & num_sig; + index = idx; + } + __syncthreads(); + idx = index; + for (int i = tid; i < num_sig; i+=blockDim.x*blockDim.y) + { + blk_vals[idx+i] = block_vals[i]; // Store the value of the significant data point in the block + blk_subidx[idx+i] = block_idxs[i]; // Store the byte value of index within block of significant data point + //printf("blk %f %f , ind %d\n", block_vals[i], block_idxs[i], idx); + } + + } + + + if (state==3){ + int reqLength = _compute_reqLength(ivalue[0], ivalue[2]); + __syncthreads(); + value[tidy*blockDim.x+tidx] = data - medianValue; + __syncthreads(); + _compute_oneBlock(b*bs*sizeof(float), nb+b*mSize+4, b, reqLength, value, ivalue, cvalue, sums, meta, offsets, midBytes); + } + + } + +} + +__global__ void get_numsig(uint64_t *num_sig){ + *num_sig = (uint64_t)num_state2; +} diff --git a/qtensor/compression/szx/src/cuszx_wrapper.cu b/qtensor/compression/szx/src/cuszx_wrapper.cu new file mode 100644 index 00000000..b68ac8c9 --- /dev/null +++ b/qtensor/compression/szx/src/cuszx_wrapper.cu @@ -0,0 +1,41 @@ +#include "cuszx_entry.h" +#include "szx_defines.h" +#include "szx_BytesToolkit.h" +#include "szx_TypeManager.h" +#include "timingGPU.h" + +extern "C"{ + unsigned char* cuSZx_integrated_compress(float *data, float r2r_threshold, float r2r_err, size_t nbEle, int blockSize, size_t *outSize){ + float max,min; + unsigned char* bytes; + max = data[0]; + min = data[0]; + for (size_t i = 0; i < nbEle; i++) + { + if(data[i] > max) max = data[i]; + if(data[i] < min) min = data[i]; + } + + float threshold = r2r_threshold*(max-min); + float errBound = r2r_err*(max-min); + bytes = cuSZx_fast_compress_args_unpredictable_blocked_float(data, outSize, errBound, nbEle, blockSize, threshold); + // printf("outSize %p\n", bytes); + return bytes; + } + + float* 
cuSZx_integrated_decompress(unsigned char *bytes, size_t nbEle){ + // printf("test\n"); + float**data; + cuSZx_fast_decompress_args_unpredictable_blocked_float(data, nbEle, bytes); + return *data; + } + + unsigned char* cuSZx_device_compress(float *oriData, size_t *outSize, float absErrBound, size_t nbEle, int blockSize, float threshold){ + return device_ptr_cuSZx_compress_float(oriData, outSize, absErrBound, nbEle, blockSize, threshold); + } + + float* cuSZx_device_decompress(size_t nbEle, unsigned char* cmpBytes){ + return device_ptr_cuSZx_decompress_float(nbEle, cmpBytes); + } + +} diff --git a/qtensor/compression/szx/src/cuszx_wrapper.py b/qtensor/compression/szx/src/cuszx_wrapper.py new file mode 100644 index 00000000..15227432 --- /dev/null +++ b/qtensor/compression/szx/src/cuszx_wrapper.py @@ -0,0 +1,122 @@ +import numpy as np +import ctypes +from ctypes import * +import random +import cupy as cp + +LIB_PATH = './cuszx_wrapper.so' + +# unsigned char* cuSZx_integrated_compress(float *data, float r2r_threshold, float r2r_err, size_t nbEle, int blockSize, size_t *outSize) + +def get_host_compress(): + dll = ctypes.CDLL(LIB_PATH, mode=ctypes.RTLD_GLOBAL) + func = dll.cuSZx_integrated_compress + # Returns: unsigned char *bytes + # Needs: float *data, float r2r_threshold, float r2r_err, size_t nbEle, int blockSize, size_t *outSize + func.argtypes = [POINTER(c_float), c_float, c_float, c_size_t, c_int, POINTER(c_size_t)] + func.restype = POINTER(c_ubyte) + return func + +# float* cuSZx_integrated_decompress(unsigned char *bytes, size_t nbEle) + +def get_host_decompress(): + dll = ctypes.CDLL(LIB_PATH, mode=ctypes.RTLD_GLOBAL) + func = dll.cuSZx_integrated_decompress + # Returns: float *newData + # Needs: size_t nbEle, unsigned char *cmpBytes + func.argtypes = [POINTER(c_ubyte), c_size_t] + func.restype = POINTER(c_float) + return func + +def get_device_compress(): + dll = ctypes.CDLL(LIB_PATH, mode=ctypes.RTLD_GLOBAL) + func = dll.cuSZx_device_compress + # Returns: unsigned char *bytes + # Needs: float *oriData, size_t *outSize, float absErrBound, size_t nbEle, int blockSize, float threshold + func.argtypes = [POINTER(c_float), POINTER(c_size_t), c_float, c_size_t, c_int, c_float] + func.restype = POINTER(c_ubyte) + return func + +def get_device_decompress(): + dll = ctypes.CDLL(LIB_PATH, mode=ctypes.RTLD_GLOBAL) + func = dll.cuSZx_device_decompress + # Returns: float *newData + # Needs: size_t nbEle, unsigned char *cmpBytes + func.argtypes = [c_size_t, POINTER(c_ubyte)] + func.restype = POINTER(c_float) + return func + + +def cuszx_host_compress(oriData, absErrBound, nbEle, blockSize,threshold): + __cuszx_host_compress = get_host_compress() + + variable = ctypes.c_size_t(0) + outSize = ctypes.pointer(variable) + + oriData_p = ctypes.cast(oriData.data.ptr, ctypes.POINTER(c_float)) + + o_bytes = __cuszx_host_compress(oriData_p, outSize,np.float32(absErrBound), np.ulonglong(nbEle), np.int32(blockSize),np.float32(threshold)) + + return o_bytes, outSize + +def cuszx_host_decompress(nbEle, cmpBytes): + __cuszx_host_decompress=get_host_decompress() + + nbEle_p = ctypes.c_size_t(nbEle) + newData = __cuszx_host_decompress(nbEle_p,cmpBytes) + return newData + +def cuszx_device_compress(oriData, absErrBound, nbEle, blockSize,threshold): + __cuszx_device_compress = get_device_compress() + + variable = ctypes.c_size_t(0) + outSize = ctypes.pointer(variable) + + oriData_p = ctypes.cast(oriData.data.ptr, ctypes.POINTER(c_float)) + + o_bytes = __cuszx_device_compress(oriData_p, 
outSize,np.float32(absErrBound), np.ulonglong(nbEle), np.int32(blockSize),np.float32(threshold)) + + return o_bytes, outSize + + +def cuszx_device_decompress(nbEle, cmpBytes): + __cuszx_device_decompress=get_device_decompress() + + nbEle_p = ctypes.c_size_t(nbEle) + newData = __cuszx_device_decompress(nbEle_p,cmpBytes) + return newData + +### Example of device compress/decompress wrapper usage + +if __name__ == "__main__": + + DATA_SIZE = 1024 + MAX_D = 10.0 + MIN_D = -10.0 + RANGE = MAX_D - MIN_D + r2r_threshold = 0.1 + r2r_error = 0.1 + + in_vector = np.zeros((DATA_SIZE,)) + for i in range(0,int(DATA_SIZE/4)): + in_vector[i] = 0.0 + for i in range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): + in_vector[i] = 5.0 + for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): + in_vector[i] = random.uniform(MIN_D, MAX_D) + for i in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): + in_vector[i] = -7.0 + for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): + in_vector[i] = 0.001 + + + in_vector = in_vector.astype('float32') + in_vector_gpu = cp.asarray(in_vector) + + # variable = ctypes.c_size_t(0) + # outSize = ctypes.pointer(variable) + + o_bytes, outSize = cuszx_device_compress(in_vector_gpu, r2r_error, DATA_SIZE, 256, r2r_threshold) + print("Compress Success...starting decompress ") + d_bytes = cuszx_device_decompress(DATA_SIZE, o_bytes) + print("Decompress Success") diff --git a/qtensor/compression/szx/src/cuszxd_float.cu b/qtensor/compression/szx/src/cuszxd_float.cu new file mode 100644 index 00000000..3edd1ee3 --- /dev/null +++ b/qtensor/compression/szx/src/cuszxd_float.cu @@ -0,0 +1,341 @@ +#include +#include +#include "cuszxd_float.h" + +#include + +namespace cg = cooperative_groups; + +__device__ int _deshfl_scan(int lznum, int *sums) +{ + // Below is the basic structure of using a shfl instruction + // for a scan. + // Record "value" as a variable - we accumulate it along the way + int value = lznum; + + // Now accumulate in log steps up the chain + // compute sums, with another thread's value who is + // distance delta away (i). Note + // those threads where the thread 'i' away would have + // been out of bounds of the warp are unaffected. This + // creates the scan sum. 
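+    // A minimal sequential sketch of the same inclusive scan, for reference
+    // only (hypothetical helper, not part of this kernel). After it runs,
+    // element t holds lznum[0] + ... + lznum[t], which is what each warp lane
+    // ends up with here:
+    //
+    //   void inclusive_scan(int *v, int n) {
+    //       for (int i = 1; i < n; i++)
+    //           v[i] += v[i - 1];        /* running total up to element i */
+    //   }
+    //
+    // The warp-level version below reaches the same result in a logarithmic
+    // number of steps: on each iteration a lane adds the value held by the
+    // lane `i` positions below it, provided that lane exists.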
+ +#pragma unroll + for (int i = 1; i <= warpSize; i *= 2) { + unsigned int mask = 0xffffffff; + int n = __shfl_up_sync(mask, value, i); + + if (threadIdx.x >= i) value += n; + + } + + // value now holds the scan value for the individual thread + // next sum the largest values for each warp + + // write the sum of the warp to smem + if (threadIdx.x == warpSize - 1) { + sums[threadIdx.y] = value; + } + __syncthreads(); + + // + // scan sum the warp sums + // the same shfl scan operation, but performed on warp sums + // + if (threadIdx.y == 0 && threadIdx.x < blockDim.y) { + int warp_sum = sums[threadIdx.x]; + + int mask = (1 << blockDim.y) - 1; + for (int i = 1; i <= blockDim.y; i *= 2) { + //int n = __shfl_up_sync(mask, warp_sum, i, blockDim.y); + int n = __shfl_up_sync(mask, warp_sum, i); + if (threadIdx.x >= i) warp_sum += n; + } + + sums[threadIdx.x] = warp_sum; + } + __syncthreads(); + + // perform a uniform add across warps in the block + // read neighbouring warp's sum and add it to threads value + int blockSum = 0; + if (threadIdx.y > 0) { + blockSum = sums[threadIdx.y - 1]; + } + value += blockSum; + + return value; +} + +__device__ int _compareByte(int pre, int cur, int reqBytesLength) +{ + if (reqBytesLength == 2) + { + if ((pre&0x0000ff00) > (cur&0x0000ff00)){ + cur &= 0x000000ff; + cur |= (pre & 0x0000ff00); + } + if ((pre&0x000000ff) > (cur&0x000000ff)){ + cur &= 0x0000ff00; + cur |= (pre & 0x000000ff); + } + }else if (reqBytesLength == 3) + { + if ((pre&0x00ff0000) > (cur&0x00ff0000)){ + cur &= 0x0000ffff; + cur |= (pre & 0x00ff0000); + } + if ((pre&0x0000ff00) > (cur&0x0000ff00)){ + cur &= 0x00ff00ff; + cur |= (pre & 0x0000ff00); + } + if ((pre&0x000000ff) > (cur&0x000000ff)){ + cur &= 0x00ffff00; + cur |= (pre & 0x000000ff); + } + }else if (reqBytesLength == 1) + { + if (pre > cur) + cur = pre; + }else if (reqBytesLength == 4) + { + if ((pre&0xff000000) > (cur&0xff000000)){ + cur &= 0x00ffffff; + cur |= (pre & 0xff000000); + } + if ((pre&0x00ff0000) > (cur&0x00ff0000)){ + cur &= 0xff00ffff; + cur |= (pre & 0x00ff0000); + } + if ((pre&0x0000ff00) > (cur&0x0000ff00)){ + cur &= 0xffff00ff; + cur |= (pre & 0x0000ff00); + } + if ((pre&0x000000ff) > (cur&0x000000ff)){ + cur &= 0xffffff00; + cur |= (pre & 0x000000ff); + } + } + return cur; +} + +__device__ int _retrieve_leading(int pos, int reqBytesLength, int* sums) +{ +#pragma unroll + for (int i = 1; i <= warpSize; i *= 2) { + unsigned int mask = 0xffffffff; + int n = __shfl_up_sync(mask, pos, i); + if (threadIdx.x >= i) + pos = _compareByte(n, pos, reqBytesLength); + } + + if (threadIdx.x == warpSize - 1) + sums[threadIdx.y] = pos; + __syncthreads(); + + if (threadIdx.y == 0 && threadIdx.x < blockDim.y) { + int warp_pos = sums[threadIdx.x]; + + int mask = (1 << blockDim.y) - 1; + for (int i = 1; i <= blockDim.y; i *= 2) { + int n = __shfl_up_sync(mask, warp_pos, i); + if (threadIdx.x >= i) + warp_pos = _compareByte(n, warp_pos, reqBytesLength); + } + + sums[threadIdx.x] = warp_pos; + } + __syncthreads(); + + if (threadIdx.y > 0) { + int block_pos = sums[threadIdx.y - 1]; + pos = _compareByte(block_pos, pos, reqBytesLength); + } + + return pos; +} + +#define MAX_BLK_SIZE 256 + +__global__ void decompress_state2(float *out, unsigned char* stateArray, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx,uint32_t blockSize, uint8_t *blk_sig){ + int bid = blockIdx.x; + uint8_t state = stateArray[bid]; + + __shared__ float block_vals[MAX_BLK_SIZE]; + __shared__ uint8_t block_subidx[MAX_BLK_SIZE]; + // __shared__ char 
idx_taken[MAX_BLK_SIZE]; + __shared__ float s_out[MAX_BLK_SIZE]; + __shared__ int sig_count; + if (state != 2) + { + return; + } + + int local_sig = blk_sig[bid]; + int idx = blk_idx[bid]; + + for (size_t i = threadIdx.x; i < local_sig; i+=blockDim.x) + { + block_vals[i] = blk_vals[idx+i]; + block_subidx[i]=blk_subidx[idx+i]; + // idx_taken[block_subidx[i]] = 1; + atomicAdd(&sig_count, 1); + + } + + __syncthreads(); + + for (size_t i = threadIdx.x; i < blockSize; i+=blockDim.x) + { + s_out[i] = 0.0; + } + + __syncthreads(); + for (size_t i = threadIdx.x; i < local_sig; i+=blockDim.x) + { + s_out[block_subidx[i]] = block_vals[i]; + } + __syncthreads(); + for (size_t i = threadIdx.x; i < blockSize; i+=blockDim.x) + { + out[bid*blockSize+i] = s_out[i]; + } +} + +__global__ void decompress_float(unsigned char *data, int bs, size_t nc, size_t mSize) +{ + int tidx = threadIdx.x; + int tidy = threadIdx.y; + int tid = tidy*warpSize+tidx; + int bid = blockIdx.x; + + float medianValue; + unsigned char leadingNum; + extern __shared__ float shared[]; + float* value = shared; + int* ivalue = (int*)shared; + uchar4* c4value = (uchar4*)shared; + unsigned char* cvalue = (unsigned char*)shared; + int* sums = &ivalue[bs]; + int reqLength; + float* fbytes = (float*)data; + int reqBytesLength; + int rightShiftBits; + + + bool bi = false; + for (int b=bid; b>2)]; + leadingNum = (leadingNum >> (6-((tid&0x03)<<1))) & 0x03; + int midByte_size = reqBytesLength - leadingNum; + int midByte_sum = _deshfl_scan(midByte_size, sums); + + uchar4 tmp; + tmp.x = 0; + tmp.y = 0; + tmp.z = 0; + tmp.w = 0; + int pos = 0; + if (reqBytesLength == 2) + { + if (midByte_size == 1){ + tmp.z = cvalue[mSize+midByte_sum-1]; + pos |= tid<<8; + }else if (midByte_size == 2){ + tmp.w = cvalue[mSize+midByte_sum-1]; + tmp.z = cvalue[mSize+midByte_sum-2]; + pos |= tid; + pos |= tid<<8; + } + }else if (reqBytesLength == 3) + { + if (midByte_size == 1){ + tmp.y = cvalue[mSize+midByte_sum-1]; + pos |= tid<<16; + }else if (midByte_size == 2){ + tmp.z = cvalue[mSize+midByte_sum-1]; + tmp.y = cvalue[mSize+midByte_sum-2]; + pos |= tid<<8; + pos |= tid<<16; + }else if (midByte_size == 3){ + tmp.w = cvalue[mSize+midByte_sum-1]; + tmp.z = cvalue[mSize+midByte_sum-2]; + tmp.y = cvalue[mSize+midByte_sum-3]; + pos |= tid; + pos |= tid<<8; + pos |= tid<<16; + } + }else if (reqBytesLength == 1) + { + if (midByte_size == 1) + tmp.w = cvalue[mSize+midByte_sum-1]; + pos |= tid; + }else if (reqBytesLength == 4) + { + if (midByte_size == 1){ + tmp.x = cvalue[mSize+midByte_sum-1]; + pos |= tid<<24; + }else if (midByte_size == 2){ + tmp.y = cvalue[mSize+midByte_sum-1]; + tmp.x = cvalue[mSize+midByte_sum-2]; + pos |= tid<<16; + pos |= tid<<24; + }else if (midByte_size == 3){ + tmp.z = cvalue[mSize+midByte_sum-1]; + tmp.y = cvalue[mSize+midByte_sum-2]; + tmp.x = cvalue[mSize+midByte_sum-3]; + pos |= tid<<8; + pos |= tid<<16; + pos |= tid<<24; + }else if (midByte_size == 4){ + tmp.w = cvalue[mSize+midByte_sum-1]; + tmp.z = cvalue[mSize+midByte_sum-2]; + tmp.y = cvalue[mSize+midByte_sum-3]; + tmp.x = cvalue[mSize+midByte_sum-4]; + pos |= tid; + pos |= tid<<8; + pos |= tid<<16; + pos |= tid<<24; + } + } + __syncthreads(); + c4value[tid] = tmp; + + pos = _retrieve_leading(pos, reqBytesLength, sums); + + if (leadingNum == 2){ + tmp.w = c4value[pos&0xff].w; + tmp.z = c4value[(pos>>8)&0xff].z; + }else if (leadingNum == 3){ + tmp.w = c4value[pos&0xff].w; + tmp.z = c4value[(pos>>8)&0xff].z; + tmp.y = c4value[(pos>>16)&0xff].y; + }else if (leadingNum == 1){ + tmp.w = 
c4value[pos&0xff].w; + }else if (leadingNum == 4){ + tmp.w = c4value[pos&0xff].w; + tmp.z = c4value[(pos>>8)&0xff].z; + tmp.y = c4value[(pos>>16)&0xff].y; + tmp.x = c4value[pos>>24].x; + } + c4value[tid] = tmp; + __syncthreads(); + ivalue[tid] = ivalue[tid] << rightShiftBits; + + fbytes[b*bs+tid] = value[tid] + medianValue; + } +} diff --git a/qtensor/compression/szx/src/pred_quant.c b/qtensor/compression/szx/src/pred_quant.c new file mode 100644 index 00000000..e69de29b diff --git a/qtensor/compression/szx/src/sz_p_q.c b/qtensor/compression/szx/src/sz_p_q.c new file mode 100644 index 00000000..d6cb6017 --- /dev/null +++ b/qtensor/compression/szx/src/sz_p_q.c @@ -0,0 +1,367 @@ +#include +#include +#include +#include +void updateLossyCompElement_Double(unsigned char* curBytes, unsigned char* preBytes, + int reqBytesLength, int resiBitsLength, LossyCompressionElement *lce) +{ + int resiIndex, intMidBytes_Length = 0; + int leadingNum = compIdenticalLeadingBytesCount_double(preBytes, curBytes); //in fact, float is enough for both single-precision and double-precisiond ata. + int fromByteIndex = leadingNum; + int toByteIndex = reqBytesLength; //later on: should use "< toByteIndex" to tarverse.... + if(fromByteIndex < toByteIndex) + { + intMidBytes_Length = reqBytesLength - leadingNum; + memcpy(lce->integerMidBytes, &(curBytes[fromByteIndex]), intMidBytes_Length); + } + int resiBits = 0; + if(resiBitsLength!=0) + { + resiIndex = reqBytesLength; + if(resiIndex < 8) + resiBits = (curBytes[resiIndex] & 0xFF) >> (8-resiBitsLength); + } + lce->leadingZeroBytes = leadingNum; + lce->integerMidBytes_Length = intMidBytes_Length; + lce->resMidBitsLength = resiBitsLength; + lce->residualMidBits = resiBits; +} + +inline void longToBytes_bigEndian(unsigned char *b, unsigned long num) +{ + b[0] = (unsigned char)(num>>56); + b[1] = (unsigned char)(num>>48); + b[2] = (unsigned char)(num>>40); + b[3] = (unsigned char)(num>>32); + b[4] = (unsigned char)(num>>24); + b[5] = (unsigned char)(num>>16); + b[6] = (unsigned char)(num>>8); + b[7] = (unsigned char)(num); +// if(dataEndianType==LITTLE_ENDIAN_DATA) +// symTransform_8bytes(*b); +} + +void compressSingleDoubleValue(DoubleValueCompressElement *vce, double tgtValue, double precision, double medianValue, + int reqLength, int reqBytesLength, int resiBitsLength) +{ + double normValue = tgtValue - medianValue; + + ldouble lfBuf; + lfBuf.value = normValue; + + int ignBytesLength = 64 - reqLength; + if(ignBytesLength<0) + ignBytesLength = 0; + + long tmp_long = lfBuf.lvalue; + longToBytes_bigEndian(vce->curBytes, tmp_long); + + lfBuf.lvalue = (lfBuf.lvalue >> ignBytesLength)<data = lfBuf.value+medianValue; + vce->curValue = tmp_long; + vce->reqBytesLength = reqBytesLength; + vce->resiBitsLength = resiBitsLength; +} + +inline void intToBytes_bigEndian(unsigned char *b, unsigned int num) +{ + b[0] = (unsigned char)(num >> 24); + b[1] = (unsigned char)(num >> 16); + b[2] = (unsigned char)(num >> 8); + b[3] = (unsigned char)(num); + + //note: num >> xxx already considered endian_type... 
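+// Worked example (illustrative value only): intToBytes_bigEndian(b, 0x11223344)
+// yields b[0]=0x11, b[1]=0x22, b[2]=0x33, b[3]=0x44 on both little- and
+// big-endian hosts, since the shifts act on the numeric value rather than on
+// its in-memory byte layout.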
+//if(dataEndianType==LITTLE_ENDIAN_DATA) +// symTransform_4bytes(*b); //change to BIG_ENDIAN_DATA +} + +inline short computeReqLength_double_MSST19(double realPrecision) +{ + short reqExpo = getPrecisionReqLength_double(realPrecision); + return 12-reqExpo; +} + + +unsigned int optimize_intervals_double_1D_opt_MSST19(double *oriData, size_t dataLength, double realPrecision) +{ + size_t i = 0, radiusIndex; + double pred_value = 0; + double pred_err; + size_t *intervals = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t)); + memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(size_t)); + size_t totalSampleSize = 0;//dataLength/confparams_cpr->sampleDistance; + + double * data_pos = oriData + 2; + double divider = log2(1+realPrecision)*2; + int tempIndex = 0; + while(data_pos - oriData < dataLength){ + if(*data_pos == 0){ + data_pos += confparams_cpr->sampleDistance; + continue; + } + tempIndex++; + totalSampleSize++; + pred_value = data_pos[-1]; + pred_err = fabs((double)*data_pos / pred_value); + radiusIndex = (unsigned long)fabs(log2(pred_err)/divider+0.5); + if(radiusIndex>=confparams_cpr->maxRangeRadius) + radiusIndex = confparams_cpr->maxRangeRadius - 1; + intervals[radiusIndex]++; + + data_pos += confparams_cpr->sampleDistance; + } + //compute the appropriate number + size_t targetCount = totalSampleSize*confparams_cpr->predThreshold; + size_t sum = 0; + for(i=0;imaxRangeRadius;i++) + { + sum += intervals[i]; + if(sum>targetCount) + break; + } + if(i>=confparams_cpr->maxRangeRadius) + i = confparams_cpr->maxRangeRadius-1; + + unsigned int accIntervals = 2*(i+1); + unsigned int powerOf2 = roundUpToPowerOf2(accIntervals); + + if(powerOf2<64) + powerOf2 = 64; + + free(intervals); + return powerOf2; +} + + +TightDataPointStorageD* SZ_compress_double_1D_MDQ_MSST19(double *oriData, +size_t dataLength, double realPrecision, double valueRangeSize, double medianValue_f) +{ +#ifdef HAVE_TIMECMPR + double* decData = NULL; + if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION) + decData = (double*)(multisteps->hist_data); +#endif + + //struct ClockPoint clockPointBuild; + //TimeDurationStart("build", &clockPointBuild); + unsigned int quantization_intervals; + if(exe_params->optQuantMode==1) + quantization_intervals = optimize_intervals_double_1D_opt_MSST19(oriData, dataLength, realPrecision); + else + quantization_intervals = exe_params->intvCapacity; + //updateQuantizationInfo(quantization_intervals); + int intvRadius = quantization_intervals/2; + + double* precisionTable = (double*)malloc(sizeof(double) * quantization_intervals); + double inv = 2.0-pow(2, -(confparams_cpr->plus_bits)); + for(int i=0; iplus_bits); + + size_t i; + int reqLength; + double medianValue = medianValue_f; + //double medianInverse = 1 / medianValue_f; + //short radExpo = getExponent_double(realPrecision); + + reqLength = computeReqLength_double_MSST19(realPrecision); + + int* type = (int*) malloc(dataLength*sizeof(int)); + + double* spaceFillingValue = oriData; // + + DynamicIntArray *exactLeadNumArray; + new_DIA(&exactLeadNumArray, dataLength/2/8); + + DynamicByteArray *exactMidByteArray; + new_DBA(&exactMidByteArray, dataLength/2); + + DynamicIntArray *resiBitArray; + new_DIA(&resiBitArray, DynArrayInitLen); + + unsigned char preDataBytes[8]; + intToBytes_bigEndian(preDataBytes, 0); + + int reqBytesLength = reqLength/8; + int resiBitsLength = reqLength%8; + double last3CmprsData[3] = {0}; + + //size_t miss=0, hit=0; + + DoubleValueCompressElement *vce = 
(DoubleValueCompressElement*)malloc(sizeof(DoubleValueCompressElement)); + LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement)); + + //add the first data + type[0] = 0; + compressSingleDoubleValue_MSST19(vce, spaceFillingValue[0], realPrecision, reqLength, reqBytesLength, resiBitsLength); + updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce); + memcpy(preDataBytes,vce->curBytes,8); + addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce); + listAdd_double(last3CmprsData, vce->data); + //miss++; +#ifdef HAVE_TIMECMPR + if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION) + decData[0] = vce->data; +#endif + + //add the second data + type[1] = 0; + compressSingleDoubleValue_MSST19(vce, spaceFillingValue[1], realPrecision, reqLength, reqBytesLength, resiBitsLength); + updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce); + memcpy(preDataBytes,vce->curBytes,8); + addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce); + listAdd_double(last3CmprsData, vce->data); + //miss++; +#ifdef HAVE_TIMECMPR + if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION) + decData[1] = vce->data; +#endif + int state; + //double checkRadius; + double curData; + double pred = vce->data; + + double predRelErrRatio; + + const uint64_t top = levelTable.topIndex, base = levelTable.baseIndex; + const uint64_t range = top - base; + const int bits = levelTable.bits; + uint64_t* const buffer = (uint64_t*)&predRelErrRatio; + const int shift = 52-bits; + uint64_t expoIndex, mantiIndex; + uint16_t* tables[range+1]; + for(int i=0; i<=range; i++){ + tables[i] = levelTable.subTables[i].table; + } + + for(i=2;i> 52) - base; + if(expoIndex <= range){ + mantiIndex = (*buffer & 0x000fffffffffffff) >> shift; + state = tables[expoIndex][mantiIndex]; + }else{ + state = 0; + } + + if(state) + { + type[i] = state; + pred *= precisionTable[state]; + //hit++; + continue; + } + + //unpredictable data processing + type[i] = 0; + compressSingleDoubleValue_MSST19(vce, curData, realPrecision, reqLength, reqBytesLength, resiBitsLength); + updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce); + memcpy(preDataBytes,vce->curBytes,8); + addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce); + pred = vce->data; + //miss++; +#ifdef HAVE_TIMECMPR + if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION) + decData[i] = vce->data; +#endif + + }//end of for + +// printf("miss:%d, hit:%d\n", miss, hit); + + size_t exactDataNum = exactLeadNumArray->size; + + TightDataPointStorageD* tdps; + + new_TightDataPointStorageD(&tdps, dataLength, exactDataNum, + type, exactMidByteArray->array, exactMidByteArray->size, + exactLeadNumArray->array, + resiBitArray->array, resiBitArray->size, + resiBitsLength, + realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0); + tdps->plus_bits = confparams_cpr->plus_bits; + + //free memory + free_DIA(exactLeadNumArray); + free_DIA(resiBitArray); + free(type); + free(vce); + free(lce); + free(exactMidByteArray); //exactMidByteArray->array has been released in free_TightDataPointStorageF(tdps); + free(precisionTable); + freeTopLevelTableWideInterval(&levelTable); + return tdps; +} + + +void SZ_compress_args_double_NoCkRngeNoGzip_1D_pwr_pre_log_MSST19(unsigned char** newByteData, double *oriData, double pwrErrRatio, size_t dataLength, size_t *outSize, double valueRangeSize, double medianValue_f, 
+ unsigned char* signs, bool* positive, double min, double max, double nearZero){ + double multiplier = pow((1+pwrErrRatio), -3.0001); + for(int i=0; iminLogValue = nearZero / ((1+pwrErrRatio)*(1+pwrErrRatio)); + if(!(*positive)){ + unsigned char * comp_signs; + // compress signs + unsigned long signSize = sz_lossless_compress(ZSTD_COMPRESSOR, 3, signs, dataLength, &comp_signs); + tdps->pwrErrBoundBytes = comp_signs; + tdps->pwrErrBoundBytes_size = signSize; + } + else{ + tdps->pwrErrBoundBytes = NULL; + tdps->pwrErrBoundBytes_size = 0; + } + free(signs); + + convertTDPStoFlatBytes_double(tdps, newByteData, outSize); + if(*outSize>3 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 1 + sizeof(double)*dataLength) + SZ_compress_args_double_StoreOriData(oriData, dataLength, newByteData, outSize); + + free_TightDataPointStorageD(tdps); +} + +double computeRangeSize_double_MSST19(double* oriData, size_t size, double* valueRangeSize, double* medianValue, unsigned char * signs, bool* positive, double* nearZero) +{ + size_t i = 0; + double min = oriData[0]; + double max = min; + *nearZero = min; + + for(i=1;idata) + min = data; + else if(max +#include +#include +#include +#include "szx.h" +#include "szx_rw.h" + +int versionNumber[4] = {SZx_VER_MAJOR,SZx_VER_MINOR,SZx_VER_BUILD,SZx_VER_REVISION}; + +int dataEndianType = LITTLE_ENDIAN_DATA; //*endian type of the data read from disk +int sysEndianType = LITTLE_ENDIAN_SYSTEM; //*sysEndianType is actually set automatically. + +int computeDimension(size_t r5, size_t r4, size_t r3, size_t r2, size_t r1) +{ + int dimension; + if(r1==0) + { + dimension = 0; + } + else if(r2==0) + { + dimension = 1; + } + else if(r3==0) + { + dimension = 2; + } + else if(r4==0) + { + dimension = 3; + } + else if(r5==0) + { + dimension = 4; + } + else + { + dimension = 5; + } + return dimension; +} + +size_t computeDataLength(size_t r5, size_t r4, size_t r3, size_t r2, size_t r1) +{ + size_t dataLength; + if(r1==0) + { + dataLength = 0; + } + else if(r2==0) + { + dataLength = r1; + } + else if(r3==0) + { + dataLength = r1*r2; + } + else if(r4==0) + { + dataLength = r1*r2*r3; + } + else if(r5==0) + { + dataLength = r1*r2*r3*r4; + } + else + { + dataLength = r1*r2*r3*r4*r5; + } + return dataLength; +} + +/** + * @brief check dimension and correct it if needed + * @return 0 (didn't change dimension) + * 1 (dimension is changed) + * 2 (dimension is problematic) + **/ +int filterDimension(size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t* correctedDimension) +{ + int dimensionCorrected = 0; + int dim = computeDimension(r5, r4, r3, r2, r1); + correctedDimension[0] = r1; + correctedDimension[1] = r2; + correctedDimension[2] = r3; + correctedDimension[3] = r4; + correctedDimension[4] = r5; + size_t* c = correctedDimension; + if(dim==1) + { + if(r1<1) + return 2; + } + else if(dim==2) + { + if(r2==1) + { + c[1]= 0; + dimensionCorrected = 1; + } + if(r1==1) //remove this dimension + { + c[0] = c[1]; + c[1] = c[2]; + dimensionCorrected = 1; + } + } + else if(dim==3) + { + if(r3==1) + { + c[2] = 0; + dimensionCorrected = 1; + } + if(r2==1) + { + c[1] = c[2]; + c[2] = c[3]; + dimensionCorrected = 1; + } + if(r1==1) + { + c[0] = c[1]; + c[1] = c[2]; + c[2] = c[3]; + dimensionCorrected = 1; + } + } + else if(dim==4) + { + if(r4==1) + { + c[3] = 0; + dimensionCorrected = 1; + } + if(r3==1) + { + c[2] = c[3]; + c[3] = c[4]; + dimensionCorrected = 1; + } + if(r2==1) + { + c[1] = c[2]; + c[2] = c[3]; + c[3] = c[4]; + dimensionCorrected = 1; + } + if(r1==1) + { + c[0] = c[1]; + c[1] 
= c[2]; + c[2] = c[3]; + c[3] = c[4]; + dimensionCorrected = 1; + } + } + else if(dim==5) + { + if(r5==1) + { + c[4] = 0; + dimensionCorrected = 1; + } + if(r4==1) + { + c[3] = c[4]; + c[4] = 0; + dimensionCorrected = 1; + } + if(r3==1) + { + c[2] = c[3]; + c[3] = c[4]; + c[4] = 0; + dimensionCorrected = 1; + } + if(r2==1) + { + c[1] = c[2]; + c[2] = c[3]; + c[3] = c[4]; + c[4] = 0; + dimensionCorrected = 1; + } + if(r1==1) + { + c[0] = c[1]; + c[1] = c[2]; + c[2] = c[3]; + c[3] = c[4]; + c[4] = 0; + dimensionCorrected = 1; + } + } + + return dimensionCorrected; + +} + +unsigned char* SZ_fast_compress_args(int fastMode, int dataType, void *data, size_t *outSize, int errBoundMode, float absErrBound, +float relBoundRatio, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1) +{ + unsigned char* bytes = NULL; + size_t length = computeDataLength(r5, r4, r3, r2, r1); + size_t i = 0; + + if(dataType == SZ_FLOAT) + { + if(fastMode == SZx_WITH_BLOCK_FAST_CMPR || fastMode == SZx_RANDOMACCESS_FAST_CMPR || fastMode == SZx_OPENMP_FAST_CMPR) + { + float realPrecision = absErrBound; + if(errBoundMode==REL) + { + float* oriData = (float*)data; + float min = oriData[0]; + float max = oriData[0]; + for(i=0;iv) + min = v; + else if(maxv) + min = v; + else if(maxv) + min = v; + else if(maxv) + min = v; + else if(max +#include "szx.h" +#include "szx_BytesToolkit.h" +#include "szx_dataCompression.h" + +inline void sz_writeBits_Fast_int8(unsigned char* buffer,uint64_t *bitPosPtr, int numBits, unsigned char data) +{ + unsigned char mask = (1 << numBits)-1; + *(buffer + ((*bitPosPtr)>>3)) |= (data & mask) << ((*bitPosPtr) & (uint64_t)0x0000000000000007); + (*bitPosPtr) += numBits; +} + +inline void sz_writeBits_Fast_int32(unsigned char* buffer,uint64_t *bitPosPtr, int numBits, int32_t data) +{ + uint32_t mask = (1 << numBits)-1; + *(uint32_t*)(buffer + ((*bitPosPtr)>>3)) |= ((*(uint32_t*)&data)&mask) << ((*bitPosPtr) & (uint64_t)0x0000000000000007); + (*bitPosPtr) += numBits; +} + +inline void sz_writeBits_Fast_int64(unsigned char* buffer,uint64_t *bitPosPtr, int numBits, int64_t data) +{ + uint64_t mask = ((uint64_t)0x0000000000000001<>3)) |= ((*(uint64_t*)&data)&mask) << ((*bitPosPtr) & (uint64_t)0x0000000000000007); + (*bitPosPtr) += numBits; +} + + +inline unsigned short bytesToUInt16_bigEndian(unsigned char* bytes) +{ + int temp = 0; + unsigned short res = 0; + + temp = bytes[0] & 0xff; + res |= temp; + + res <<= 8; + temp = bytes[1] & 0xff; + res |= temp; + + return res; +} + +inline unsigned int bytesToUInt32_bigEndian(unsigned char* bytes) +{ + unsigned int temp = 0; + unsigned int res = 0; + + res <<= 8; + temp = bytes[0] & 0xff; + res |= temp; + + res <<= 8; + temp = bytes[1] & 0xff; + res |= temp; + + res <<= 8; + temp = bytes[2] & 0xff; + res |= temp; + + res <<= 8; + temp = bytes[3] & 0xff; + res |= temp; + + return res; +} + +inline unsigned long bytesToUInt64_bigEndian(unsigned char* b) { + unsigned long temp = 0; + unsigned long res = 0; + + res <<= 8; + temp = b[0] & 0xff; + res |= temp; + + res <<= 8; + temp = b[1] & 0xff; + res |= temp; + + res <<= 8; + temp = b[2] & 0xff; + res |= temp; + + res <<= 8; + temp = b[3] & 0xff; + res |= temp; + + res <<= 8; + temp = b[4] & 0xff; + res |= temp; + + res <<= 8; + temp = b[5] & 0xff; + res |= temp; + + res <<= 8; + temp = b[6] & 0xff; + res |= temp; + + res <<= 8; + temp = b[7] & 0xff; + res |= temp; + + return res; +} + +inline short bytesToInt16_bigEndian(unsigned char* bytes) +{ + int temp = 0; + short res = 0; + + temp = bytes[0] & 0xff; + res |= 
temp; + + res <<= 8; + temp = bytes[1] & 0xff; + res |= temp; + + return res; +} + +inline int bytesToInt32_bigEndian(unsigned char* bytes) +{ + int temp = 0; + int res = 0; + + res <<= 8; + temp = bytes[0] & 0xff; + res |= temp; + + res <<= 8; + temp = bytes[1] & 0xff; + res |= temp; + + res <<= 8; + temp = bytes[2] & 0xff; + res |= temp; + + res <<= 8; + temp = bytes[3] & 0xff; + res |= temp; + + return res; +} + +inline long bytesToInt64_bigEndian(unsigned char* b) { + long temp = 0; + long res = 0; + + res <<= 8; + temp = b[0] & 0xff; + res |= temp; + + res <<= 8; + temp = b[1] & 0xff; + res |= temp; + + res <<= 8; + temp = b[2] & 0xff; + res |= temp; + + res <<= 8; + temp = b[3] & 0xff; + res |= temp; + + res <<= 8; + temp = b[4] & 0xff; + res |= temp; + + res <<= 8; + temp = b[5] & 0xff; + res |= temp; + + res <<= 8; + temp = b[6] & 0xff; + res |= temp; + + res <<= 8; + temp = b[7] & 0xff; + res |= temp; + + return res; +} + +inline int bytesToInt_bigEndian(unsigned char* bytes) +{ + int temp = 0; + int res = 0; + + res <<= 8; + temp = bytes[0] & 0xff; + res |= temp; + + res <<= 8; + temp = bytes[1] & 0xff; + res |= temp; + + res <<= 8; + temp = bytes[2] & 0xff; + res |= temp; + + res <<= 8; + temp = bytes[3] & 0xff; + res |= temp; + + return res; +} + +/** + * @unsigned char *b the variable to store the converted bytes (length=4) + * @unsigned int num + * */ +inline void intToBytes_bigEndian(unsigned char *b, unsigned int num) +{ + b[0] = (unsigned char)(num >> 24); + b[1] = (unsigned char)(num >> 16); + b[2] = (unsigned char)(num >> 8); + b[3] = (unsigned char)(num); + + //note: num >> xxx already considered endian_type... +//if(dataEndianType==LITTLE_ENDIAN_DATA) +// symTransform_4bytes(*b); //change to BIG_ENDIAN_DATA +} + +inline void int64ToBytes_bigEndian(unsigned char *b, uint64_t num) +{ + b[0] = (unsigned char)(num>>56); + b[1] = (unsigned char)(num>>48); + b[2] = (unsigned char)(num>>40); + b[3] = (unsigned char)(num>>32); + b[4] = (unsigned char)(num>>24); + b[5] = (unsigned char)(num>>16); + b[6] = (unsigned char)(num>>8); + b[7] = (unsigned char)(num); +} + +inline void int32ToBytes_bigEndian(unsigned char *b, uint32_t num) +{ + b[0] = (unsigned char)(num >> 24); + b[1] = (unsigned char)(num >> 16); + b[2] = (unsigned char)(num >> 8); + b[3] = (unsigned char)(num); +} + +inline void int16ToBytes_bigEndian(unsigned char *b, uint16_t num) +{ + b[0] = (unsigned char)(num >> 8); + b[1] = (unsigned char)(num); +} + +/** + * @endianType: refers to the endian_type of unsigned char* b. 
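+ *
+ * Usage sketch (hypothetical buffer contents, shown only to illustrate the
+ * big-endian byte order this routine assumes):
+ *
+ *   unsigned char b[8] = {0, 0, 0, 0, 0, 0, 0, 42};
+ *   long v = bytesToLong_bigEndian(b);  // v == 42; b[0] is the most significant byte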
+ * */ +inline long bytesToLong_bigEndian(unsigned char* b) { + long temp = 0; + long res = 0; + + res <<= 8; + temp = b[0] & 0xff; + res |= temp; + + res <<= 8; + temp = b[1] & 0xff; + res |= temp; + + res <<= 8; + temp = b[2] & 0xff; + res |= temp; + + res <<= 8; + temp = b[3] & 0xff; + res |= temp; + + res <<= 8; + temp = b[4] & 0xff; + res |= temp; + + res <<= 8; + temp = b[5] & 0xff; + res |= temp; + + res <<= 8; + temp = b[6] & 0xff; + res |= temp; + + res <<= 8; + temp = b[7] & 0xff; + res |= temp; + + return res; +} + +inline void longToBytes_bigEndian(unsigned char *b, unsigned long num) +{ + b[0] = (unsigned char)(num>>56); + b[1] = (unsigned char)(num>>48); + b[2] = (unsigned char)(num>>40); + b[3] = (unsigned char)(num>>32); + b[4] = (unsigned char)(num>>24); + b[5] = (unsigned char)(num>>16); + b[6] = (unsigned char)(num>>8); + b[7] = (unsigned char)(num); +// if(dataEndianType==LITTLE_ENDIAN_DATA) +// symTransform_8bytes(*b); +} + + +inline long doubleToOSEndianLong(double value) +{ + ldouble buf; + buf.value = value; + return buf.lvalue; +} + +inline int floatToOSEndianInt(float value) +{ + lfloat buf; + buf.value = value; + return buf.ivalue; +} + +//TODO: debug: lfBuf.lvalue could be actually little_endian.... +inline short getExponent_float(float value) +{ + //int ivalue = floatToBigEndianInt(value); + + lfloat lbuf; + lbuf.value = value; + int ivalue = lbuf.ivalue; + + int expValue = (ivalue & 0x7F800000) >> 23; + expValue -= 127; + return (short)expValue; +} + +inline short getPrecisionReqLength_float(float precision) +{ + lfloat lbuf; + lbuf.value = precision; + int ivalue = lbuf.ivalue; + + int expValue = (ivalue & 0x7F800000) >> 23; + expValue -= 127; +// unsigned char the1stManBit = (unsigned char)((ivalue & 0x00400000) >> 22); +// if(the1stManBit==1) +// expValue--; + return (short)expValue; +} + +inline short getExponent_double(double value) +{ + //long lvalue = doubleToBigEndianLong(value); + + ldouble lbuf; + lbuf.value = value; + long lvalue = lbuf.lvalue; + + int expValue = (int)((lvalue & 0x7FF0000000000000) >> 52); + expValue -= 1023; + return (short)expValue; +} + +inline short getPrecisionReqLength_double(double precision) +{ + ldouble lbuf; + lbuf.value = precision; + long lvalue = lbuf.lvalue; + + int expValue = (int)((lvalue & 0x7FF0000000000000) >> 52); + expValue -= 1023; +// unsigned char the1stManBit = (unsigned char)((lvalue & 0x0008000000000000) >> 51); +// if(the1stManBit==1) +// expValue--; + return (short)expValue; +} + +inline unsigned char numberOfLeadingZeros_Int(int i) { + if (i == 0) + return 32; + unsigned char n = 1; + if (((unsigned int)i) >> 16 == 0) { n += 16; i <<= 16; } + if (((unsigned int)i) >> 24 == 0) { n += 8; i <<= 8; } + if (((unsigned int)i) >> 28 == 0) { n += 4; i <<= 4; } + if (((unsigned int)i) >> 30 == 0) { n += 2; i <<= 2; } + n -= ((unsigned int)i) >> 31; + return n; +} + +inline unsigned char numberOfLeadingZeros_Long(long i) { + if (i == 0) + return 64; + unsigned char n = 1; + int x = (int)(((unsigned long)i) >> 32); + if (x == 0) { n += 32; x = (int)i; } + if (((unsigned int)x) >> 16 == 0) { n += 16; x <<= 16; } + if (((unsigned int)x) >> 24 == 0) { n += 8; x <<= 8; } + if (((unsigned int)x) >> 28 == 0) { n += 4; x <<= 4; } + if (((unsigned int)x) >> 30 == 0) { n += 2; x <<= 2; } + n -= ((unsigned int)x) >> 31; + return n; +} + +inline unsigned char getLeadingNumbers_Int(int v1, int v2) +{ + int v = v1 ^ v2; + return (unsigned char)numberOfLeadingZeros_Int(v); +} + +inline unsigned char getLeadingNumbers_Long(long 
v1, long v2) +{ + long v = v1 ^ v2; + return (unsigned char)numberOfLeadingZeros_Long(v); +} + +/** + * By default, the endian type is OS endian type. + * */ +inline short bytesToShort(unsigned char* bytes) +{ + lint16 buf; + memcpy(buf.byte, bytes, 2); + + return buf.svalue; +} + +inline void shortToBytes(unsigned char* b, short value) +{ + lint16 buf; + buf.svalue = value; + memcpy(b, buf.byte, 2); +} + +inline int bytesToInt(unsigned char* bytes) +{ + lfloat buf; + memcpy(buf.byte, bytes, 4); + return buf.ivalue; +} + +inline long bytesToLong(unsigned char* bytes) +{ + ldouble buf; + memcpy(buf.byte, bytes, 8); + return buf.lvalue; +} + +//the byte to input is in the big-endian format +inline float bytesToFloat(unsigned char* bytes) +{ + lfloat buf; + memcpy(buf.byte, bytes, 4); + if(sysEndianType==LITTLE_ENDIAN_SYSTEM) + symTransform_4bytes(buf.byte); + return buf.value; +} + +inline void floatToBytes(unsigned char *b, float num) +{ + lfloat buf; + buf.value = num; + memcpy(b, buf.byte, 4); + if(sysEndianType==LITTLE_ENDIAN_SYSTEM) + symTransform_4bytes(b); +} + +//the byte to input is in the big-endian format +inline double bytesToDouble(unsigned char* bytes) +{ + ldouble buf; + memcpy(buf.byte, bytes, 8); + if(sysEndianType==LITTLE_ENDIAN_SYSTEM) + symTransform_8bytes(buf.byte); + return buf.value; +} + +inline void doubleToBytes(unsigned char *b, double num) +{ + ldouble buf; + buf.value = num; + memcpy(b, buf.byte, 8); + if(sysEndianType==LITTLE_ENDIAN_SYSTEM) + symTransform_8bytes(b); +} + + +inline int getMaskRightCode(int m) { + switch (m) { + case 1: + return 0x01; + case 2: + return 0x03; + case 3: + return 0x07; + case 4: + return 0x0F; + case 5: + return 0x1F; + case 6: + return 0x3F; + case 7: + return 0X7F; + case 8: + return 0XFF; + default: + return 0; + } +} + +inline int getLeftMovingCode(int kMod8) +{ + return getMaskRightCode(8 - kMod8); +} + +inline int getRightMovingSteps(int kMod8, int resiBitLength) { + return 8 - kMod8 - resiBitLength; +} + +inline int getRightMovingCode(int kMod8, int resiBitLength) +{ + int rightMovingSteps = 8 - kMod8 - resiBitLength; + if(rightMovingSteps < 0) + { + switch(-rightMovingSteps) + { + case 1: + return 0x80; + case 2: + return 0xC0; + case 3: + return 0xE0; + case 4: + return 0xF0; + case 5: + return 0xF8; + case 6: + return 0xFC; + case 7: + return 0XFE; + default: + return 0; + } + } + else //if(rightMovingSteps >= 0) + { + int a = getMaskRightCode(8 - kMod8); + int b = getMaskRightCode(8 - kMod8 - resiBitLength); + int c = a - b; + return c; + } +} + +short* convertByteDataToShortArray(unsigned char* bytes, size_t byteLength) +{ + lint16 ls; + size_t i, stateLength = byteLength/2; + short* states = (short*)malloc(stateLength*sizeof(short)); + if(sysEndianType==dataEndianType) + { + for(i=0;i +#include +#include "szx.h" + +size_t convertIntArray2ByteArray_fast_1b_args(unsigned char* intArray, size_t intArrayLength, unsigned char *result) +{ + size_t byteLength = 0; + size_t i, j; + if(intArrayLength%8==0) + byteLength = intArrayLength/8; + else + byteLength = intArrayLength/8+1; + + size_t n = 0; + int tmp, type; + for(i = 0;i0) + *result = (unsigned char*)malloc(byteLength*sizeof(unsigned char)); + else + *result = NULL; + size_t n = 0; + int tmp, type; + for(i = 0;i> 7; + intArray[n++] = (tmp & 0x40) >> 6; + intArray[n++] = (tmp & 0x20) >> 5; + intArray[n++] = (tmp & 0x10) >> 4; + intArray[n++] = (tmp & 0x08) >> 3; + intArray[n++] = (tmp & 0x04) >> 2; + intArray[n++] = (tmp & 0x02) >> 1; + intArray[n++] = (tmp & 0x01) >> 0; + 
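+		// Each source byte expands to eight 0/1 flags, most significant bit
+		// first; e.g. (illustrative value) 0xB4 unpacks to 1,0,1,1,0,1,0,0.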
} + + tmp = byteArray[i]; + if(n == intArrayLength) + return; + intArray[n++] = (tmp & 0x80) >> 7; + if(n == intArrayLength) + return; + intArray[n++] = (tmp & 0x40) >> 6; + if(n == intArrayLength) + return; + intArray[n++] = (tmp & 0x20) >> 5; + if(n == intArrayLength) + return; + intArray[n++] = (tmp & 0x10) >> 4; + if(n == intArrayLength) + return; + intArray[n++] = (tmp & 0x08) >> 3; + if(n == intArrayLength) + return; + intArray[n++] = (tmp & 0x04) >> 2; + if(n == intArrayLength) + return; + intArray[n++] = (tmp & 0x02) >> 1; + if(n == intArrayLength) + return; + intArray[n++] = (tmp & 0x01) >> 0; +} + +void convertByteArray2IntArray_fast_1b(size_t intArrayLength, unsigned char* byteArray, size_t byteArrayLength, unsigned char **intArray) +{ + if(intArrayLength > byteArrayLength*8) + { + printf("Error: intArrayLength > byteArrayLength*8\n"); + printf("intArrayLength=%zu, byteArrayLength = %zu", intArrayLength, byteArrayLength); + exit(0); + } + if(intArrayLength>0) + *intArray = (unsigned char*)malloc(intArrayLength*sizeof(unsigned char)); + else + *intArray = NULL; + + size_t n = 0, i; + int tmp; + for (i = 0; i < byteArrayLength-1; i++) + { + tmp = byteArray[i]; + (*intArray)[n++] = (tmp & 0x80) >> 7; + (*intArray)[n++] = (tmp & 0x40) >> 6; + (*intArray)[n++] = (tmp & 0x20) >> 5; + (*intArray)[n++] = (tmp & 0x10) >> 4; + (*intArray)[n++] = (tmp & 0x08) >> 3; + (*intArray)[n++] = (tmp & 0x04) >> 2; + (*intArray)[n++] = (tmp & 0x02) >> 1; + (*intArray)[n++] = (tmp & 0x01) >> 0; + } + + tmp = byteArray[i]; + if(n == intArrayLength) + return; + (*intArray)[n++] = (tmp & 0x80) >> 7; + if(n == intArrayLength) + return; + (*intArray)[n++] = (tmp & 0x40) >> 6; + if(n == intArrayLength) + return; + (*intArray)[n++] = (tmp & 0x20) >> 5; + if(n == intArrayLength) + return; + (*intArray)[n++] = (tmp & 0x10) >> 4; + if(n == intArrayLength) + return; + (*intArray)[n++] = (tmp & 0x08) >> 3; + if(n == intArrayLength) + return; + (*intArray)[n++] = (tmp & 0x04) >> 2; + if(n == intArrayLength) + return; + (*intArray)[n++] = (tmp & 0x02) >> 1; + if(n == intArrayLength) + return; + (*intArray)[n++] = (tmp & 0x01) >> 0; +} + + +inline size_t convertIntArray2ByteArray_fast_2b_args(unsigned char* timeStepType, size_t timeStepTypeLength, unsigned char *result) +{ + register unsigned char tmp = 0; + size_t i, j = 0, byteLength = 0; + if(timeStepTypeLength%4==0) + byteLength = timeStepTypeLength*2/8; + else + byteLength = timeStepTypeLength*2/8+1; + size_t n = 0; + if(timeStepTypeLength%4==0) + { + for(i = 0;i[01|10|11|00][....] 
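+ *
+ * Packing sketch (values for illustration only): four 2-bit codes go into each
+ * output byte with the first code in the two most significant bits, so the
+ * codes {1,2,3,0} pack into the single byte [01|10|11|00] = 0x6C. The matching
+ * unpacking routine below restores the codes by reading (byte & 0xC0) >> 6 first.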
+ * @param timeStepType + * @return + */ +size_t convertIntArray2ByteArray_fast_2b(unsigned char* timeStepType, size_t timeStepTypeLength, unsigned char **result) +{ + size_t i, j, byteLength = 0; + if(timeStepTypeLength%4==0) + byteLength = timeStepTypeLength*2/8; + else + byteLength = timeStepTypeLength*2/8+1; + if(byteLength>0) + *result = (unsigned char*)malloc(byteLength*sizeof(unsigned char)); + else + *result = NULL; + size_t n = 0; + for(i = 0;i byteArrayLength*4) + { + printf("Error: stepLength > byteArray.length*4\n"); + printf("stepLength=%zu, byteArray.length=%zu\n", stepLength, byteArrayLength); + exit(0); + } + if(stepLength>0) + *intArray = (unsigned char*)malloc(stepLength*sizeof(unsigned char)); + else + *intArray = NULL; + size_t i, n = 0; + + int mod4 = stepLength%4; + if(mod4==0) + { + for (i = 0; i < byteArrayLength; i++) { + unsigned char tmp = byteArray[i]; + (*intArray)[n++] = (tmp & 0xC0) >> 6; + (*intArray)[n++] = (tmp & 0x30) >> 4; + (*intArray)[n++] = (tmp & 0x0C) >> 2; + (*intArray)[n++] = tmp & 0x03; + } + } + else + { + size_t t = byteArrayLength - mod4; + for (i = 0; i < t; i++) { + unsigned char tmp = byteArray[i]; + (*intArray)[n++] = (tmp & 0xC0) >> 6; + (*intArray)[n++] = (tmp & 0x30) >> 4; + (*intArray)[n++] = (tmp & 0x0C) >> 2; + (*intArray)[n++] = tmp & 0x03; + } + unsigned char tmp = byteArray[i]; + switch(mod4) + { + case 1: + (*intArray)[n++] = (tmp & 0xC0) >> 6; + break; + case 2: + (*intArray)[n++] = (tmp & 0xC0) >> 6; + (*intArray)[n++] = (tmp & 0x30) >> 4; + break; + case 3: + (*intArray)[n++] = (tmp & 0xC0) >> 6; + (*intArray)[n++] = (tmp & 0x30) >> 4; + (*intArray)[n++] = (tmp & 0x0C) >> 2; + break; + } + } +} + + +inline int getLeftMovingSteps(size_t k, unsigned char resiBitLength) +{ + return 8 - k%8 - resiBitLength; +} + + diff --git a/qtensor/compression/szx/src/szx_dataCompression.c b/qtensor/compression/szx/src/szx_dataCompression.c new file mode 100644 index 00000000..d5130a93 --- /dev/null +++ b/qtensor/compression/szx/src/szx_dataCompression.c @@ -0,0 +1,355 @@ +/** + * @file double_compression.c + * @author Sheng Di, Dingwen Tao, Xin Liang, Xiangyu Zou, Tao Lu, Wen Xia, Xuan Wang, Weizhe Zhang + * @date April, 2016 + * @brief Compression Technique for double array + * (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. 
+ */ + +#include +#include +#include +#include +#include "szx.h" +#include "szx_dataCompression.h" +#include "szx_BytesToolkit.h" + +int computeByteSizePerIntValue(long valueRangeSize) +{ + if(valueRangeSize<=256) + return 1; + else if(valueRangeSize<=65536) + return 2; + else if(valueRangeSize<=4294967296) //2^32 + return 4; + else + return 8; +} + +long computeRangeSize_int(void* oriData, int dataType, size_t size, int64_t* valueRangeSize) +{ + size_t i = 0; + long max = 0, min = 0; + + if(dataType==SZ_UINT8) + { + unsigned char* data = (unsigned char*)oriData; + unsigned char data_; + min = data[0], max = min; + computeMinMax(data); + } + else if(dataType == SZ_INT8) + { + char* data = (char*)oriData; + char data_; + min = data[0], max = min; + computeMinMax(data); + } + else if(dataType == SZ_UINT16) + { + unsigned short* data = (unsigned short*)oriData; + unsigned short data_; + min = data[0], max = min; + computeMinMax(data); + } + else if(dataType == SZ_INT16) + { + short* data = (short*)oriData; + short data_; + min = data[0], max = min; + computeMinMax(data); + } + else if(dataType == SZ_UINT32) + { + unsigned int* data = (unsigned int*)oriData; + unsigned int data_; + min = data[0], max = min; + computeMinMax(data); + } + else if(dataType == SZ_INT32) + { + int* data = (int*)oriData; + int data_; + min = data[0], max = min; + computeMinMax(data); + } + else if(dataType == SZ_UINT64) + { + unsigned long* data = (unsigned long*)oriData; + unsigned long data_; + min = data[0], max = min; + computeMinMax(data); + } + else if(dataType == SZ_INT64) + { + long* data = (long *)oriData; + long data_; + min = data[0], max = min; + computeMinMax(data); + } + + *valueRangeSize = max - min; + return min; +} + +float computeRangeSize_float(float* oriData, size_t size, float* valueRangeSize, float* medianValue) +{ + size_t i = 0; + float min = oriData[0]; + float max = min; + for(i=1;idata) + min = data; + else if(maxdata) + min = data; + else if(maxb) + return a; + else + return b; +} + +float min_f(float a, float b) +{ + if(ab) + return a; + else + return b; +} + +double getRealPrecision_double(double valueRangeSize, int errBoundMode, double absErrBound, double relBoundRatio, int *status) +{ + int state = SZ_SCES; + double precision = 0; + if(errBoundMode==ABS||errBoundMode==ABS_OR_PW_REL||errBoundMode==ABS_AND_PW_REL) + precision = absErrBound; + else if(errBoundMode==REL||errBoundMode==REL_OR_PW_REL||errBoundMode==REL_AND_PW_REL) + precision = relBoundRatio*valueRangeSize; + else if(errBoundMode==ABS_AND_REL) + precision = min_d(absErrBound, relBoundRatio*valueRangeSize); + else if(errBoundMode==ABS_OR_REL) + precision = max_d(absErrBound, relBoundRatio*valueRangeSize); + else if(errBoundMode==PW_REL) + precision = 0; + else + { + printf("Error: error-bound-mode is incorrect!\n"); + state = SZ_BERR; + } + *status = state; + return precision; +} + +double getRealPrecision_float(float valueRangeSize, int errBoundMode, double absErrBound, double relBoundRatio, int *status) +{ + int state = SZ_SCES; + double precision = 0; + if(errBoundMode==ABS||errBoundMode==ABS_OR_PW_REL||errBoundMode==ABS_AND_PW_REL) + precision = absErrBound; + else if(errBoundMode==REL||errBoundMode==REL_OR_PW_REL||errBoundMode==REL_AND_PW_REL) + precision = relBoundRatio*valueRangeSize; + else if(errBoundMode==ABS_AND_REL) + precision = min_f(absErrBound, relBoundRatio*valueRangeSize); + else if(errBoundMode==ABS_OR_REL) + precision = max_f(absErrBound, relBoundRatio*valueRangeSize); + else if(errBoundMode==PW_REL) + 
precision = 0; + else + { + printf("Error: error-bound-mode is incorrect!\n"); + state = SZ_BERR; + } + *status = state; + return precision; +} + +double getRealPrecision_int(long valueRangeSize, int errBoundMode, double absErrBound, double relBoundRatio, int *status) +{ + int state = SZ_SCES; + double precision = 0; + if(errBoundMode==ABS||errBoundMode==ABS_OR_PW_REL||errBoundMode==ABS_AND_PW_REL) + precision = absErrBound; + else if(errBoundMode==REL||errBoundMode==REL_OR_PW_REL||errBoundMode==REL_AND_PW_REL) + precision = relBoundRatio*valueRangeSize; + else if(errBoundMode==ABS_AND_REL) + precision = min_f(absErrBound, relBoundRatio*valueRangeSize); + else if(errBoundMode==ABS_OR_REL) + precision = max_f(absErrBound, relBoundRatio*valueRangeSize); + else if(errBoundMode==PW_REL) + precision = -1; + else + { + printf("Error: error-bound-mode is incorrect!\n"); + state = SZ_BERR; + } + *status = state; + return precision; +} + +inline void symTransform_8bytes(unsigned char data[8]) +{ + unsigned char tmp = data[0]; + data[0] = data[7]; + data[7] = tmp; + + tmp = data[1]; + data[1] = data[6]; + data[6] = tmp; + + tmp = data[2]; + data[2] = data[5]; + data[5] = tmp; + + tmp = data[3]; + data[3] = data[4]; + data[4] = tmp; +} + +inline void symTransform_2bytes(unsigned char data[2]) +{ + unsigned char tmp = data[0]; + data[0] = data[1]; + data[1] = tmp; +} + +inline void symTransform_4bytes(unsigned char data[4]) +{ + unsigned char tmp = data[0]; + data[0] = data[3]; + data[3] = tmp; + + tmp = data[1]; + data[1] = data[2]; + data[2] = tmp; +} + +inline void compressInt8Value(int8_t tgtValue, int8_t minValue, int byteSize, unsigned char* bytes) +{ + uint8_t data = tgtValue - minValue; + memcpy(bytes, &data, byteSize); //byteSize==1 +} + +inline void compressInt16Value(int16_t tgtValue, int16_t minValue, int byteSize, unsigned char* bytes) +{ + uint16_t data = tgtValue - minValue; + unsigned char tmpBytes[2]; + int16ToBytes_bigEndian(tmpBytes, data); + memcpy(bytes, tmpBytes + 2 - byteSize, byteSize); +} + +inline void compressInt32Value(int32_t tgtValue, int32_t minValue, int byteSize, unsigned char* bytes) +{ + uint32_t data = tgtValue - minValue; + unsigned char tmpBytes[4]; + int32ToBytes_bigEndian(tmpBytes, data); + memcpy(bytes, tmpBytes + 4 - byteSize, byteSize); +} + +inline void compressInt64Value(int64_t tgtValue, int64_t minValue, int byteSize, unsigned char* bytes) +{ + uint64_t data = tgtValue - minValue; + unsigned char tmpBytes[8]; + int64ToBytes_bigEndian(tmpBytes, data); + memcpy(bytes, tmpBytes + 8 - byteSize, byteSize); +} + +inline void compressUInt8Value(uint8_t tgtValue, uint8_t minValue, int byteSize, unsigned char* bytes) +{ + uint8_t data = tgtValue - minValue; + memcpy(bytes, &data, byteSize); //byteSize==1 +} + +inline void compressUInt16Value(uint16_t tgtValue, uint16_t minValue, int byteSize, unsigned char* bytes) +{ + uint16_t data = tgtValue - minValue; + unsigned char tmpBytes[2]; + int16ToBytes_bigEndian(tmpBytes, data); + memcpy(bytes, tmpBytes + 2 - byteSize, byteSize); +} + +inline void compressUInt32Value(uint32_t tgtValue, uint32_t minValue, int byteSize, unsigned char* bytes) +{ + uint32_t data = tgtValue - minValue; + unsigned char tmpBytes[4]; + int32ToBytes_bigEndian(tmpBytes, data); + memcpy(bytes, tmpBytes + 4 - byteSize, byteSize); +} + +inline void compressUInt64Value(uint64_t tgtValue, uint64_t minValue, int byteSize, unsigned char* bytes) +{ + uint64_t data = tgtValue - minValue; + unsigned char tmpBytes[8]; + int64ToBytes_bigEndian(tmpBytes, 
data); + memcpy(bytes, tmpBytes + 8 - byteSize, byteSize); +} + +int compIdenticalLeadingBytesCount_double(unsigned char* preBytes, unsigned char* curBytes) +{ + int i, n = 0; + for(i=0;i<8;i++) + if(preBytes[i]==curBytes[i]) + n++; + else + break; + if(n>3) n = 3; + return n; +} + + +inline int compIdenticalLeadingBytesCount_float(unsigned char* preBytes, unsigned char* curBytes) +{ + int i, n = 0; + for(i=0;i<4;i++) + if(preBytes[i]==curBytes[i]) + n++; + else + break; + if(n>3) n = 3; + return n; +} diff --git a/qtensor/compression/szx/src/szx_double.c b/qtensor/compression/szx/src/szx_double.c new file mode 100644 index 00000000..34bd2b4d --- /dev/null +++ b/qtensor/compression/szx/src/szx_double.c @@ -0,0 +1,1388 @@ +/** + * @file szx_double.c + * @author Sheng Di, Kai Zhao + * @date Aug, 2022 + * @brief SZ_Init, Compression and Decompression functions + * (C) 2022 by Mathematics and Computer Science (MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + */ + + +#include +#include +#include +#include +#include +#include +#include "szx.h" +#include "szx_double.h" +#include "szx_BytesToolkit.h" +#include "szx_TypeManager.h" +#include + +#ifdef _OPENMP +#include "omp.h" +#endif + +#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) +#include +#endif + +inline void SZ_fast_compress_args_unpredictable_one_block_double(double *oriData, size_t nbEle, float absErrBound, + unsigned char *outputBytes, int *outSize, + unsigned char *leadNumberArray_int, float mValue, + float radius) { + double medianValue = mValue; + size_t totalSize = 0, i = 0; + + int reqLength; + + //compute median, value range, and radius + + short radExpo = getExponent_float(radius); + computeReqLength_double(absErrBound, radExpo, &reqLength, &mValue); + + int reqBytesLength = reqLength / 8; + int resiBitsLength = reqLength % 8; + int rightShiftBits = 0; + + size_t leadNumberArray_size = nbEle % 4 == 0 ? 
nbEle / 4 : nbEle / 4 + 1; + + register ldouble lfBuf_pre; + register ldouble lfBuf_cur; + lfBuf_pre.lvalue = 0; + + unsigned char *leadNumberArray = outputBytes + 1 + sizeof(float); + + unsigned char *exactMidbyteArray = leadNumberArray + leadNumberArray_size; + + if (resiBitsLength != 0) { + rightShiftBits = 8 - resiBitsLength; + reqBytesLength++; + } + + register unsigned char leadingNum = 0; + size_t residualMidBytes_size = 0; + if (sysEndianType == LITTLE_ENDIAN_SYSTEM) { + + if (reqBytesLength == 3) { + for (i = 0; i < nbEle; i++) { + leadingNum = 0; + lfBuf_cur.value = oriData[i] - medianValue; + + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre.lvalue = lfBuf_cur.lvalue ^ lfBuf_pre.lvalue; + + if (lfBuf_pre.lvalue >> 40 == 0) + leadingNum = 3; + else if (lfBuf_pre.lvalue >> 48 == 0) + leadingNum = 2; + else if (lfBuf_pre.lvalue >> 56 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[6]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[7]; + residualMidBytes_size += 3; + } else if (leadingNum == 1) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[6]; + residualMidBytes_size += 2; + } else if (leadingNum == 2) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[5]; + residualMidBytes_size++; + } + + lfBuf_pre = lfBuf_cur; + } + } else if (reqBytesLength == 2) { + for (i = 0; i < nbEle; i++) { + + leadingNum = 0; + lfBuf_cur.value = oriData[i] - medianValue; + + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre.lvalue = lfBuf_cur.lvalue ^ lfBuf_pre.lvalue; + + if (lfBuf_pre.lvalue >> 40 == 0) + leadingNum = 3; + else if (lfBuf_pre.lvalue >> 48 == 0) + leadingNum = 2; + else if (lfBuf_pre.lvalue >> 56 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[6]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[7]; + residualMidBytes_size += 2; + } else if (leadingNum == 1) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[6]; + residualMidBytes_size++; + } + + lfBuf_pre = lfBuf_cur; + } + } else if (reqBytesLength == 1) { + for (i = 0; i < nbEle; i++) { + leadingNum = 0; + lfBuf_cur.value = oriData[i] - medianValue; + + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre.lvalue = lfBuf_cur.lvalue ^ lfBuf_pre.lvalue; + + if (lfBuf_pre.lvalue >> 40 == 0) + leadingNum = 3; + else if (lfBuf_pre.lvalue >> 48 == 0) + leadingNum = 2; + else if (lfBuf_pre.lvalue >> 56 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[7]; + residualMidBytes_size++; + } + + lfBuf_pre = lfBuf_cur; + } + }else if(reqBytesLength == 4) { + for (i = 0; i < nbEle; i++) { + leadingNum = 0; + lfBuf_cur.value = oriData[i] - medianValue; + + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre.lvalue = lfBuf_cur.lvalue ^ lfBuf_pre.lvalue; + + if (lfBuf_pre.lvalue >> 40 == 0) + leadingNum = 3; + else if (lfBuf_pre.lvalue >> 48 == 0) + leadingNum = 2; + else if (lfBuf_pre.lvalue >> 56 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[4]; + 
exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[6]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[7]; + residualMidBytes_size += 4; + } else if (leadingNum == 1) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[6]; + residualMidBytes_size += 3; + } else if (leadingNum == 2) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[5]; + residualMidBytes_size += 2; + } else //leadingNum == 3 + { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[4]; + residualMidBytes_size++; + } + + lfBuf_pre = lfBuf_cur; + } + } + else if (reqBytesLength == 5) + { + for (i = 0; i < nbEle; i++) { + leadingNum = 0; + lfBuf_cur.value = oriData[i] - medianValue; + + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre.lvalue = lfBuf_cur.lvalue ^ lfBuf_pre.lvalue; + + if (lfBuf_pre.lvalue >> 40 == 0) + leadingNum = 3; + else if (lfBuf_pre.lvalue >> 48 == 0) + leadingNum = 2; + else if (lfBuf_pre.lvalue >> 56 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[6]; + exactMidbyteArray[residualMidBytes_size + 4] = lfBuf_cur.byte[7]; + residualMidBytes_size += 5; + } else if (leadingNum == 1) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[6]; + residualMidBytes_size += 4; + } else if (leadingNum == 2) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[5]; + residualMidBytes_size += 3; + } else if (leadingNum == 3) + { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[4]; + residualMidBytes_size += 2; + } + + lfBuf_pre = lfBuf_cur; + } + } + else if(reqBytesLength == 6) + { + for (i = 0; i < nbEle; i++) { + leadingNum = 0; + lfBuf_cur.value = oriData[i] - medianValue; + + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre.lvalue = lfBuf_cur.lvalue ^ lfBuf_pre.lvalue; + + if (lfBuf_pre.lvalue >> 40 == 0) + leadingNum = 3; + else if (lfBuf_pre.lvalue >> 48 == 0) + leadingNum = 2; + else if (lfBuf_pre.lvalue >> 56 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 4] = lfBuf_cur.byte[6]; + exactMidbyteArray[residualMidBytes_size + 5] = lfBuf_cur.byte[7]; + residualMidBytes_size += 6; + } else if (leadingNum == 1) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[2]; + 
exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 4] = lfBuf_cur.byte[6]; + residualMidBytes_size += 5; + } else if (leadingNum == 2) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[5]; + residualMidBytes_size += 4; + } else if (leadingNum == 3) + { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[4]; + residualMidBytes_size += 3; + } + + lfBuf_pre = lfBuf_cur; + } + } + else if(reqBytesLength == 7) + { + for (i = 0; i < nbEle; i++) { + leadingNum = 0; + lfBuf_cur.value = oriData[i] - medianValue; + + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre.lvalue = lfBuf_cur.lvalue ^ lfBuf_pre.lvalue; + + if (lfBuf_pre.lvalue >> 40 == 0) + leadingNum = 3; + else if (lfBuf_pre.lvalue >> 48 == 0) + leadingNum = 2; + else if (lfBuf_pre.lvalue >> 56 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 4] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 5] = lfBuf_cur.byte[6]; + exactMidbyteArray[residualMidBytes_size + 6] = lfBuf_cur.byte[7]; + residualMidBytes_size += 7; + } else if (leadingNum == 1) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 4] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 5] = lfBuf_cur.byte[6]; + residualMidBytes_size += 6; + } else if (leadingNum == 2) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 4] = lfBuf_cur.byte[5]; + residualMidBytes_size += 5; + } else if (leadingNum == 3) + { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[4]; + residualMidBytes_size += 4; + } + + lfBuf_pre = lfBuf_cur; + } + } + else //reqLength == 8 + { + for (i = 0; i < nbEle; i++) { + leadingNum = 0; + lfBuf_cur.value = oriData[i] - medianValue; + + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre.lvalue = lfBuf_cur.lvalue ^ lfBuf_pre.lvalue; + + if (lfBuf_pre.lvalue >> 40 == 0) + leadingNum = 3; + else if (lfBuf_pre.lvalue >> 48 == 0) + leadingNum = 2; + else if (lfBuf_pre.lvalue >> 56 == 0) 
+ leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[0]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 4] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 5] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 6] = lfBuf_cur.byte[6]; + exactMidbyteArray[residualMidBytes_size + 7] = lfBuf_cur.byte[7]; + residualMidBytes_size += 8; + } else if (leadingNum == 1) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[0]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 4] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 5] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 6] = lfBuf_cur.byte[6]; + residualMidBytes_size += 7; + } else if (leadingNum == 2) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[0]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 4] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 5] = lfBuf_cur.byte[5]; + residualMidBytes_size += 6; + } else if (leadingNum == 3) + { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[0]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 4] = lfBuf_cur.byte[4]; + residualMidBytes_size += 5; + } + + lfBuf_pre = lfBuf_cur; + } + } + + convertIntArray2ByteArray_fast_2b_args(leadNumberArray_int, nbEle, leadNumberArray); + int k = 0; + + unsigned char reqLengthB = (unsigned char) reqLength; + outputBytes[k] = reqLengthB; + k++; + floatToBytes(&(outputBytes[k]), mValue); + k += sizeof(float); + //sizeToBytes(&(outputBytes[k]), leadNumberArray_size); + //outputBytes[k] = leadNumberArray_size; //leadNumberArray_size can be calculated based on block size (=blockSize/4) + + totalSize = 1 + sizeof(float) + leadNumberArray_size + residualMidBytes_size; + } else { + + } + + *outSize = totalSize; + +} + +size_t computeStateMedianRadius_double(double *oriData, size_t nbEle, float absErrBound, int blockSize, + unsigned char *stateArray, float *medianArray, float *radiusArray) { + size_t nbConstantBlocks = 0; + size_t i = 0, j = 0; + size_t nbBlocks = nbEle / blockSize; + size_t offset = 0; + + for (i = 0; i < nbBlocks; i++) { + double min = oriData[offset]; + double max = oriData[offset]; + for (j = 1; j < blockSize; j++) { + double v = oriData[offset + j]; + if (min > v) + min = v; + else if (max < v) + max = v; + } + double valueRange = max - min; + double radius = valueRange / 2; + double medianValue = min + radius; + + if (radius <= absErrBound) { + stateArray[i] = 0; + nbConstantBlocks++; + } else + stateArray[i] = 1; + + stateArray[i] = radius <= absErrBound ? 
0 : 1; + medianArray[i] = (float)medianValue; + radiusArray[i] = (float)radius; + offset += blockSize; + } + + int remainCount = nbEle % blockSize; + if (remainCount != 0) { + double min = oriData[offset]; + double max = oriData[offset]; + for (j = 1; j < remainCount; j++) { + double v = oriData[offset + j]; + if (min > v) + min = v; + else if (max < v) + max = v; + } + double valueRange = max - min; + double radius = valueRange / 2; + double medianValue = min + radius; + if (radius <= absErrBound) { + stateArray[i] = 0; + nbConstantBlocks++; + } else + stateArray[i] = 1; + medianArray[i] = (float)medianValue; + radiusArray[i] = (float)radius; + } + return nbConstantBlocks; +} + + +void max_min_double(double *x, int n, double *tmp_max, double *tmp_min) { + for (size_t i = 0; i < n; i++) { + if (x[i] > *tmp_max) { + *tmp_max = x[i]; + } + if (x[i] < *tmp_min) { + *tmp_min = x[i]; + } + } +} + +void simd_max_min_double(double *x, int n, double *tmp_max, double *tmp_min) { + *tmp_max = x[0]; + *tmp_min = x[0]; +#ifdef __AVX512F__ + // printf("use avx512, n=%d \n", n); + int n16 = n & -16, i = 0, j=0; + if (n > 16) { + double *ptr_x = x; + __m512 max1 = _mm512_loadu_ps(ptr_x); +// __m512 max2 = _mm512_loadu_ps(ptr_x + 16); + __m512 min1 = max1; +// __m512 min2 = max2; + __m512 tmp1; +// __m512 tmp2; + for (; i < n16; i += 16) { + tmp1 = _mm512_loadu_ps(ptr_x); + max1 = _mm512_max_ps(tmp1, max1); + min1 = _mm512_min_ps(tmp1, min1); +// tmp2 = _mm512_loadu_ps(ptr_x+16); +// max2 = _mm512_max_ps(tmp2, max2); +// min2 = _mm512_min_ps(tmp2, min2); + ptr_x += 16; + } +// max1 = _mm512_max_ps(max1, max2); +// min1 = _mm512_min_ps(min1, min2); + __m256 max256 = _mm256_max_ps(_mm512_extractf32x8_ps(max1,0), _mm512_extractf32x8_ps(max1,1)); + __m128 max128 = _mm_max_ps(_mm256_extractf128_ps(max256,0), _mm256_extractf128_ps(max256,1)); + __m256 min256 = _mm256_min_ps(_mm512_extractf32x8_ps(min1,0), _mm512_extractf32x8_ps(min1,1)); + __m128 min128 = _mm_min_ps(_mm256_extractf128_ps(min256,0), _mm256_extractf128_ps(min256,1)); + for (j=0;j<4;j++){ + *tmp_max = *tmp_max < max128[j] ? max128[j] : *tmp_max; + *tmp_min = *tmp_min > min128[j] ? min128[j] : *tmp_min; + } + + if ( i < n ) { + max_min_double(ptr_x, n - i, tmp_max, tmp_min); + } + } else { + max_min_double(x, n, tmp_max, tmp_min); + } +#elif __AVX2__ +// printf("use avx2, n=%d \n", n); + // fflush(stdout); + int n16 = n & -16, i = 0; + if (n > 16) { + double *ptr_x = x; + __m256 max1 = _mm256_loadu_ps(ptr_x); + __m256 max2 = _mm256_loadu_ps(ptr_x + 8); + __m256 min1 = max1; + __m256 min2 = max2; + for (; i < n16; i += 16) { + max1 = _mm256_max_ps(_mm256_loadu_ps(ptr_x), max1); + min1 = _mm256_min_ps(_mm256_loadu_ps(ptr_x), min1); + max2 = _mm256_max_ps(_mm256_loadu_ps(ptr_x + 8), max2); + min2 = _mm256_min_ps(_mm256_loadu_ps(ptr_x + 8), min2); + ptr_x += 16; + } +// printf("%d %d %d\n", n, n16, i); +// exit(0); + max1 = _mm256_max_ps(max1, max2); + min1 = _mm256_min_ps(min1, min2); + for (int j = 0; j < 8; j++) { + *tmp_max = *tmp_max < max1[j] ? max1[j] : *tmp_max; + *tmp_min = *tmp_min > min1[j] ? 
min1[j] : *tmp_min; + } + if ( i < n ) { + max_min_double(ptr_x, n - i, tmp_max, tmp_min); + } + } else { + max_min_double(x, n, tmp_max, tmp_min); + } +#else + max_min_double(x, n, tmp_max, tmp_min); +#endif +} + +void computeStateMedianRadius_double2(double *oriData, size_t nbEle, float absErrBound, + unsigned char *state, float *median, float *radius) { + double min = oriData[0]; + double max = oriData[0]; + simd_max_min_double(oriData, nbEle, &max, &min); + + double valueRange = max - min; + *radius = valueRange / 2; + *median = min + *radius; + + if (*radius <= absErrBound) { + *state = 0; + } else { + *state = 1; + } +} + + +unsigned char * +SZ_fast_compress_args_unpredictable_blocked_double(double *oriData, size_t *outSize, float absErrBound, size_t nbEle, + int blockSize) { + double *op = oriData; + + *outSize = 0; + size_t maxPreservedBufferSize = + sizeof(double) * nbEle; //assume that the compressed data size would not exceed the original size + unsigned char *outputBytes = (unsigned char *) malloc(maxPreservedBufferSize); + memset(outputBytes, 0, maxPreservedBufferSize); + unsigned char *leadNumberArray_int = (unsigned char *) malloc(blockSize * sizeof(int)); + + size_t i = 0; + int oSize = 0; + + size_t nbBlocks = nbEle / blockSize; + size_t remainCount = nbEle % blockSize; + size_t stateNBBytes = + remainCount == 0 ? (nbBlocks % 8 == 0 ? nbBlocks / 8 : nbBlocks / 8 + 1) : ((nbBlocks + 1) % 8 == 0 ? + (nbBlocks + 1) / 8 : + (nbBlocks + 1) / 8 + 1); + size_t actualNBBlocks = remainCount == 0 ? nbBlocks : nbBlocks + 1; + + unsigned char *stateArray = (unsigned char *) malloc(actualNBBlocks); + float *medianArray = (float *) malloc(actualNBBlocks * sizeof(float)); + float *radiusArray = (float *) malloc(actualNBBlocks * sizeof(float)); + + size_t nbConstantBlocks = computeStateMedianRadius_double(oriData, nbEle, absErrBound, blockSize, stateArray, + medianArray, radiusArray); + + unsigned char *r = outputBytes; // + sizeof(size_t) + stateNBBytes; + r[0] = SZx_VER_MAJOR; + r[1] = SZx_VER_MINOR; + r[2] = 1; + r[3] = 0; // indicates this is not a random access version + r[4] = (unsigned char) blockSize; + r = r + 5; //1 byte + sizeToBytes(r, nbConstantBlocks); + r += sizeof(size_t); //r is the starting address of 'stateNBBytes' + + unsigned char *p = r + stateNBBytes; //p is the starting address of constant median values. + unsigned char *q = + p + sizeof(float) * nbConstantBlocks; //q is the starting address of the non-constant data sblocks + //3: versions, 1: metadata: state, 1: metadata: blockSize, sizeof(size_t): nbConstantBlocks, .... 
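+    // Layout of the buffer assembled above (blocked, non-random-access double path), roughly:
+    //   [r0..r4: SZx_VER_MAJOR, SZx_VER_MINOR, 1, 0 (no random access), blockSize]
+    //   [size_t nbConstantBlocks] [stateNBBytes: 1 bit per block, 1 = non-constant]
+    //   [one float median per constant block, written at p]
+    //   [variable-length payloads of the non-constant blocks, appended at q]
+    // Each non-constant payload (see SZ_fast_compress_args_unpredictable_one_block_double)
+    // begins with a 1-byte reqLength and a 4-byte median, followed by the packed 2-bit
+    // leading-number array and the residual mid bytes.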
+ *outSize += (3 + 1 + 1 + sizeof(size_t) + stateNBBytes + sizeof(float) * nbConstantBlocks); + + //printf("nbConstantBlocks = %zu, percent = %f\n", nbConstantBlocks, 1.0f*(nbConstantBlocks*blockSize)/nbEle); + for (i = 0; i < nbBlocks; i++, op += blockSize) { + if (stateArray[i]) { + SZ_fast_compress_args_unpredictable_one_block_double(op, blockSize, absErrBound, q, &oSize, + leadNumberArray_int, medianArray[i], radiusArray[i]); + q += oSize; + *outSize += oSize; + } else { + floatToBytes(p, medianArray[i]); + p += sizeof(float); + } + } + + if (remainCount != 0) { + if (stateArray[i]) { + SZ_fast_compress_args_unpredictable_one_block_double(op, remainCount, absErrBound, q, &oSize, + leadNumberArray_int, medianArray[i], radiusArray[i]); + *outSize += oSize; + } else { + floatToBytes(p, medianArray[i]); + } + + } + + convertIntArray2ByteArray_fast_1b_args(stateArray, actualNBBlocks, r); + + free(stateArray); + free(medianArray); + free(radiusArray); + free(leadNumberArray_int); + + return outputBytes; +} + +unsigned char * +SZ_fast_compress_args_unpredictable_blocked_randomaccess_double_openmp(double *oriData, size_t *outSize, float absErrBound, + size_t nbEle, int blockSize) { +#ifdef _OPENMP + printf("use openmp\n"); + +#ifdef __AVX512F__ + printf("use avx512\n"); +#elif __AVX2__ + printf("use avx2\n"); +#else +#endif + printf("blockSize = %d\n",blockSize); + sz_cost_start(); + double *op = oriData; + + size_t i = 0; + size_t nbBlocks = nbEle / blockSize; + size_t remainCount = nbEle % blockSize; + size_t actualNBBlocks = remainCount == 0 ? nbBlocks : nbBlocks + 1; + size_t stateNBBytes = (actualNBBlocks % 8 == 0 ? actualNBBlocks / 8 : actualNBBlocks / 8 + 1); + + unsigned char *stateArray = (unsigned char *) malloc(actualNBBlocks); + float *medianArray = (float *) malloc(actualNBBlocks * sizeof(float)); + + size_t nbNonConstantBlocks = 0; + + unsigned char *tmp_q = (unsigned char *) malloc(blockSize * sizeof(double) * actualNBBlocks); + int *outSizes = (int *) malloc(actualNBBlocks * sizeof(int)); + size_t *outSizesAccumlate = (size_t *) malloc(actualNBBlocks * sizeof(size_t)); + int *nbNonConstantBlockAccumlate = (int *) malloc(actualNBBlocks * sizeof(int)); + + (*outSize) = 0; + size_t maxPreservedBufferSize = + sizeof(double) * nbEle; //assume that the compressed data size would not exceed the original size + unsigned char *outputBytes = (unsigned char *) malloc(maxPreservedBufferSize); + memset(outputBytes, 0, maxPreservedBufferSize); + unsigned char *r = outputBytes; // + sizeof(size_t) + stateNBBytes; + r[0] = SZx_VER_MAJOR; + r[1] = SZx_VER_MINOR; + r[2] = 1; + r[3] = 1; //support random access decompression + r = r + 4; //4 byte + + int nbThreads = 1; + unsigned char *leadNumberArray_int; + size_t z0[200],z1[200]; + + size_t nbConstantBlocks; + unsigned char *R, *p, *q; + float *pf; + uint16_t *O; + +#pragma omp parallel +{ +#pragma omp single +{ + nbThreads = omp_get_num_threads(); + //printf("nbThreads = %d\n", nbThreads); + assert(nbThreads<200); + leadNumberArray_int = (unsigned char *) malloc(blockSize * sizeof(int) * nbThreads); + + //sz_cost_end_msg("sequential-1 malloc"); + //sz_cost_start(); +} +#pragma omp for reduction(+:nbNonConstantBlocks) schedule(static) + for (i = 0; i < nbBlocks; i++) { + float radius; + computeStateMedianRadius_double2(op + i * blockSize, blockSize, absErrBound, stateArray + i, medianArray + i, + &radius); + if (stateArray[i]) { + SZ_fast_compress_args_unpredictable_one_block_double(op + i * blockSize, blockSize, absErrBound, + tmp_q + i * 
blockSize * sizeof(float), outSizes + i, + leadNumberArray_int + + omp_get_thread_num() * blockSize * sizeof(int), + medianArray[i], radius); + outSizesAccumlate[i]=outSizes[i]; + nbNonConstantBlocks += 1; + }else{ + outSizes[i]=0; + outSizesAccumlate[i]=0; + } + } +#pragma omp single +{ +// sz_cost_end_msg("parallel-1 compress"); +// exit(0); + if (remainCount != 0) { + i = nbBlocks; + float radius; + computeStateMedianRadius_double2(op + i * blockSize, remainCount, absErrBound, stateArray + i, medianArray + i, + &radius); + if (stateArray[i]) { + SZ_fast_compress_args_unpredictable_one_block_double(op + i * blockSize, remainCount, absErrBound, + tmp_q + i * blockSize * sizeof(float), outSizes + i, + leadNumberArray_int, medianArray[i], radius); + outSizesAccumlate[i] = outSizes[i]; + nbNonConstantBlocks += 1; + }else{ + outSizesAccumlate[i] = 0; + outSizes[i]=0; + } + } + + nbConstantBlocks = actualNBBlocks - nbNonConstantBlocks; + + sizeToBytes(r, blockSize); + r += sizeof(size_t); + sizeToBytes(r, nbConstantBlocks); + r += sizeof(size_t); + O = (uint16_t*) r; //o is the starting address of 'block-size array' + R = r + nbNonConstantBlocks * sizeof(uint16_t); //R is the starting address of the state array + p = R + stateNBBytes; //p is the starting address of constant median values. + pf = (float *) p; + q = p + sizeof(float) * nbConstantBlocks; //q is the starting address of the non-constant data sblocks + // unsigned char *q0 = q; + // printf("%lu %lu %lu %lu\n",r-outputBytes, R-outputBytes, p-outputBytes, q-outputBytes); + // 3: versions, 1: metadata: state, 1: metadata: blockSize, sizeof(size_t): nbConstantBlocks, .... + *outSize = q - outputBytes; + +// sz_cost_start(); + +} + int tid = omp_get_thread_num(); + int lo = tid * actualNBBlocks / nbThreads; + int hi = (tid + 1) * actualNBBlocks / nbThreads; + int b; + nbNonConstantBlockAccumlate[lo]=stateArray[lo]; + for (b = lo+1; b < hi; b++){ + outSizesAccumlate[b] = outSizesAccumlate[b] + outSizesAccumlate[b-1]; + } + for (b = lo+1; b < hi; b++){ + nbNonConstantBlockAccumlate[b]=stateArray[b]+nbNonConstantBlockAccumlate[b-1]; + } + z0[tid] = outSizesAccumlate[hi-1]; + z1[tid] = nbNonConstantBlockAccumlate[hi-1]; + size_t offset0=0, offset1=0; +#pragma omp barrier + for (int j = 0; j < tid; j++) { + offset0+=z0[j]; + offset1+=z1[j]; + } + for (b = lo; b < hi; b++){ + outSizesAccumlate[b] = outSizesAccumlate[b] + offset0; + nbNonConstantBlockAccumlate[b] = nbNonConstantBlockAccumlate[b] + offset1; + } +#pragma omp single +{ +// sz_cost_end_msg("parallel-2 prefix sum"); +// sz_cost_start(); +}; +#pragma omp for schedule(static) + for (i = 0; i < actualNBBlocks; i++) { + if (stateArray[i]) { + memcpy(q+outSizesAccumlate[i]-outSizes[i], tmp_q + i * blockSize * sizeof(float), outSizes[i]); + O[nbNonConstantBlockAccumlate[i]-1]=outSizes[i]; + } else { + pf[i-nbNonConstantBlockAccumlate[i]]=medianArray[i]; + } + } +#pragma omp single +{ +// sz_cost_end_msg("parallel-3 memcpy"); +// sz_cost_start(); + + *outSize += outSizesAccumlate[actualNBBlocks-1]; + + convertIntArray2ByteArray_fast_1b_args(stateArray, actualNBBlocks, R); +// sz_cost_end_msg("sequential-2 int2byte"); +// sz_cost_start(); + free(nbNonConstantBlockAccumlate); + free(outSizesAccumlate); + free(leadNumberArray_int); + free(tmp_q); + free(medianArray); + free(stateArray); + free(outSizes); +// sz_cost_end_msg("sequential-3 free"); +// printf("blocksize = %d, actualNBBlocks = %lu\n", blockSize, actualNBBlocks); +// printf("nbConstantBlocks = %zu, percent = %f\n", 
nbConstantBlocks, 1.0f * (nbConstantBlocks * blockSize) / nbEle); +// printf("CR = %.3f, nbEle = %lu \n", nbEle*4.0/(*outSize), nbEle); +} +} + return outputBytes; +#else + return NULL; +#endif +} + +unsigned char * +SZ_fast_compress_args_unpredictable_blocked_randomaccess_double(double *oriData, size_t *outSize, float absErrBound, + size_t nbEle, int blockSize) { + double *op = oriData; + + *outSize = 0; + size_t maxPreservedBufferSize = + sizeof(double) * nbEle; //assume that the compressed data size would not exceed the original size + unsigned char *outputBytes = (unsigned char *) malloc(maxPreservedBufferSize); + memset(outputBytes, 0, maxPreservedBufferSize); + unsigned char *leadNumberArray_int = (unsigned char *) malloc(blockSize * sizeof(int)); + + size_t i = 0; + int oSize = 0; + + size_t nbBlocks = nbEle / blockSize; + size_t remainCount = nbEle % blockSize; + size_t actualNBBlocks = remainCount == 0 ? nbBlocks : nbBlocks + 1; + + size_t stateNBBytes = (actualNBBlocks % 8 == 0 ? actualNBBlocks / 8 : actualNBBlocks / 8 + 1); + + unsigned char *stateArray = (unsigned char *) malloc(actualNBBlocks); + float *medianArray = (float *) malloc(actualNBBlocks * sizeof(float)); + float *radiusArray = (float *) malloc(actualNBBlocks * sizeof(float)); + + size_t nbConstantBlocks = computeStateMedianRadius_double(oriData, nbEle, absErrBound, blockSize, stateArray, + medianArray, radiusArray); + + size_t nbNonConstantBlocks = actualNBBlocks - nbConstantBlocks; + + unsigned char *r = outputBytes; // + sizeof(size_t) + stateNBBytes; + r[0] = SZx_VER_MAJOR; + r[1] = SZx_VER_MINOR; + r[2] = 1; + r[3] = 1; //support random access decompression + r = r + 4; //1 byte + + sizeToBytes(r, blockSize); + r += sizeof(size_t); + sizeToBytes(r, nbConstantBlocks); + r += sizeof(size_t); //r is the starting address of 'block-size array' + uint16_t *O=(uint16_t*)r; + unsigned char *R = r + nbNonConstantBlocks*sizeof(uint16_t); //R is the starting address of the state array + unsigned char *p = R + stateNBBytes; //p is the starting address of constant median values. + unsigned char *q = + p + sizeof(float) * nbConstantBlocks; //q is the starting address of the non-constant data sblocks + //3: versions, 1: metadata: state, 1: metadata: blockSize, sizeof(size_t): nbConstantBlocks, .... 
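+    // Same overall structure as the blocked variant above, plus random-access metadata:
+    //   [r0..r3: SZx_VER_MAJOR, SZx_VER_MINOR, 1, 1] [size_t blockSize] [size_t nbConstantBlocks]
+    //   [O: one uint16_t compressed size per non-constant block]
+    //   [R: 1-bit state array] [p: float medians of constant blocks] [q: non-constant payloads]
+    // The per-block size array O lets a decoder locate any block by summing the recorded
+    // sizes, without decoding everything that precedes it.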
+ *outSize = q-outputBytes; + + size_t nonConstantBlockID = 0; + //printf("nbConstantBlocks = %zu, percent = %f\n", nbConstantBlocks, 1.0f*(nbConstantBlocks*blockSize)/nbEle); + for (i = 0; i < nbBlocks; i++, op += blockSize) { + if (stateArray[i]) { + SZ_fast_compress_args_unpredictable_one_block_double(op, blockSize, absErrBound, q, &oSize, + leadNumberArray_int, medianArray[i], radiusArray[i]); + q += oSize; + *outSize += oSize; + O[nonConstantBlockID++] = oSize; + } else { + floatToBytes(p, medianArray[i]); + p += sizeof(float); + } + } + + if (remainCount != 0) { + if (stateArray[i]) { + SZ_fast_compress_args_unpredictable_one_block_double(op, remainCount, absErrBound, q, &oSize, + leadNumberArray_int, medianArray[i], radiusArray[i]); + *outSize += oSize; + O[nonConstantBlockID] = oSize; + } else { + floatToBytes(p, medianArray[i]); + } + + } + + convertIntArray2ByteArray_fast_1b_args(stateArray, actualNBBlocks, R); + + free(leadNumberArray_int); + + return outputBytes; +} + + +unsigned char * +SZ_fast_compress_args_unpredictable_double(double *data, size_t *outSize, float absErrBound, size_t r5, size_t r4, + size_t r3, size_t r2, size_t r1, float mValue, float radius) { + size_t totalSize = 0; + double medianValue = mValue; + + size_t dataLength = computeDataLength(r5, r4, r3, r2, r1); + + size_t maxPreservedBufferSize = + sizeof(double) * dataLength; //assume that the compressed data size would not exceed the original size + + unsigned char *outputBytes = (unsigned char *) malloc(maxPreservedBufferSize); + memset(outputBytes, 0, maxPreservedBufferSize); + unsigned char *r = outputBytes; // + sizeof(size_t) + stateNBBytes; + r[0] = SZx_VER_MAJOR; + r[1] = SZx_VER_MINOR; + r[2] = 1; //SZx_VER_SUPERFAST + r[3] = 0; //support random access decompression + +// sz_cost_start(); + size_t i; + int reqLength; + short radExpo = getExponent_float(radius); + + computeReqLength_double(absErrBound, radExpo, &reqLength, &mValue); + + int reqBytesLength = reqLength / 8; + int resiBitsLength = reqLength % 8; + int rightShiftBits = 0; + + size_t leadNumberArray_size = dataLength % 4 == 0 ? 
dataLength / 4 : dataLength / 4 + 1; + + register ldouble lfBuf_pre; + register ldouble lfBuf_cur; + lfBuf_pre.lvalue = 0; + + unsigned char *leadNumberArray = outputBytes + 4 + 1 + sizeof(float) + sizeof(size_t); + + unsigned char *exactMidbyteArray = leadNumberArray + leadNumberArray_size; + + if (resiBitsLength != 0) { + rightShiftBits = 8 - resiBitsLength; + reqBytesLength++; + } + + register unsigned char leadingNum = 0; + + unsigned char *leadNumberArray_int = (unsigned char *) malloc(dataLength); + + size_t residualMidBytes_size = 0; + if (sysEndianType == LITTLE_ENDIAN_SYSTEM) { + if (reqBytesLength == 3) { + for (i = 0; i < dataLength; i++) { + leadingNum = 0; + lfBuf_cur.value = data[i] - medianValue; + + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre.lvalue = lfBuf_cur.lvalue ^ lfBuf_pre.lvalue; + + if (lfBuf_pre.lvalue >> 40 == 0) + leadingNum = 3; + else if (lfBuf_pre.lvalue >> 48 == 0) + leadingNum = 2; + else if (lfBuf_pre.lvalue >> 56 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[6]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[7]; + residualMidBytes_size += 3; + } else if (leadingNum == 1) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[6]; + residualMidBytes_size += 2; + } else if (leadingNum == 2) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[5]; + residualMidBytes_size++; + } + + lfBuf_pre = lfBuf_cur; + } + } else if (reqBytesLength == 2) { + for (i = 0; i < dataLength; i++) { + + leadingNum = 0; + lfBuf_cur.value = data[i] - medianValue; + + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre.lvalue = lfBuf_cur.lvalue ^ lfBuf_pre.lvalue; + + if (lfBuf_pre.lvalue >> 40 == 0) + leadingNum = 3; + else if (lfBuf_pre.lvalue >> 48 == 0) + leadingNum = 2; + else if (lfBuf_pre.lvalue >> 56 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[6]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[7]; + residualMidBytes_size += 2; + } else if (leadingNum == 1) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[6]; + residualMidBytes_size++; + } + + lfBuf_pre = lfBuf_cur; + } + } else if (reqBytesLength == 1) { + for (i = 0; i < dataLength; i++) { + leadingNum = 0; + lfBuf_cur.value = data[i] - medianValue; + + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre.lvalue = lfBuf_cur.lvalue ^ lfBuf_pre.lvalue; + + if (lfBuf_pre.lvalue >> 40 == 0) + leadingNum = 3; + else if (lfBuf_pre.lvalue >> 48 == 0) + leadingNum = 2; + else if (lfBuf_pre.lvalue >> 56 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[7]; + residualMidBytes_size++; + } + + lfBuf_pre = lfBuf_cur; + } + }else if(reqBytesLength == 4) { + for (i = 0; i < dataLength; i++) { + leadingNum = 0; + lfBuf_cur.value = data[i] - medianValue; + + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre.lvalue = lfBuf_cur.lvalue ^ lfBuf_pre.lvalue; + + if (lfBuf_pre.lvalue >> 40 == 0) + leadingNum = 3; + else if (lfBuf_pre.lvalue >> 48 == 0) + leadingNum = 2; + else if (lfBuf_pre.lvalue >> 56 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = 
leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[6]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[7]; + residualMidBytes_size += 4; + } else if (leadingNum == 1) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[6]; + residualMidBytes_size += 3; + } else if (leadingNum == 2) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[5]; + residualMidBytes_size += 2; + } else //leadingNum == 3 + { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[4]; + residualMidBytes_size++; + } + + lfBuf_pre = lfBuf_cur; + } + } + else if (reqBytesLength == 5) + { + for (i = 0; i < dataLength; i++) { + leadingNum = 0; + lfBuf_cur.value = data[i] - medianValue; + + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre.lvalue = lfBuf_cur.lvalue ^ lfBuf_pre.lvalue; + + if (lfBuf_pre.lvalue >> 40 == 0) + leadingNum = 3; + else if (lfBuf_pre.lvalue >> 48 == 0) + leadingNum = 2; + else if (lfBuf_pre.lvalue >> 56 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[6]; + exactMidbyteArray[residualMidBytes_size + 4] = lfBuf_cur.byte[7]; + residualMidBytes_size += 5; + } else if (leadingNum == 1) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[6]; + residualMidBytes_size += 4; + } else if (leadingNum == 2) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[5]; + residualMidBytes_size += 3; + } else if (leadingNum == 3) + { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[4]; + residualMidBytes_size += 2; + } + + lfBuf_pre = lfBuf_cur; + } + } + else if(reqBytesLength == 6) + { + for (i = 0; i < dataLength; i++) { + leadingNum = 0; + lfBuf_cur.value = data[i] - medianValue; + + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre.lvalue = lfBuf_cur.lvalue ^ lfBuf_pre.lvalue; + + if (lfBuf_pre.lvalue >> 40 == 0) + leadingNum = 3; + else if (lfBuf_pre.lvalue >> 48 == 0) + leadingNum = 2; + else if (lfBuf_pre.lvalue >> 56 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 4] = lfBuf_cur.byte[6]; + exactMidbyteArray[residualMidBytes_size + 5] = lfBuf_cur.byte[7]; + 
residualMidBytes_size += 6; + } else if (leadingNum == 1) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 4] = lfBuf_cur.byte[6]; + residualMidBytes_size += 5; + } else if (leadingNum == 2) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[5]; + residualMidBytes_size += 4; + } else if (leadingNum == 3) + { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[4]; + residualMidBytes_size += 3; + } + + lfBuf_pre = lfBuf_cur; + } + } + else if(reqBytesLength == 7) + { + for (i = 0; i < dataLength; i++) { + leadingNum = 0; + lfBuf_cur.value = data[i] - medianValue; + + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre.lvalue = lfBuf_cur.lvalue ^ lfBuf_pre.lvalue; + + if (lfBuf_pre.lvalue >> 40 == 0) + leadingNum = 3; + else if (lfBuf_pre.lvalue >> 48 == 0) + leadingNum = 2; + else if (lfBuf_pre.lvalue >> 56 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 4] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 5] = lfBuf_cur.byte[6]; + exactMidbyteArray[residualMidBytes_size + 6] = lfBuf_cur.byte[7]; + residualMidBytes_size += 7; + } else if (leadingNum == 1) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 4] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 5] = lfBuf_cur.byte[6]; + residualMidBytes_size += 6; + } else if (leadingNum == 2) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 4] = lfBuf_cur.byte[5]; + residualMidBytes_size += 5; + } else if (leadingNum == 3) + { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[4]; + residualMidBytes_size += 4; + } + + lfBuf_pre = lfBuf_cur; + } + } + else //reqLength == 8 + { + for (i = 0; i < dataLength; i++) { + leadingNum = 0; + lfBuf_cur.value = data[i] - medianValue; + + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre.lvalue = lfBuf_cur.lvalue ^ lfBuf_pre.lvalue; + + if 
(lfBuf_pre.lvalue >> 40 == 0) + leadingNum = 3; + else if (lfBuf_pre.lvalue >> 48 == 0) + leadingNum = 2; + else if (lfBuf_pre.lvalue >> 56 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[0]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 4] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 5] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 6] = lfBuf_cur.byte[6]; + exactMidbyteArray[residualMidBytes_size + 7] = lfBuf_cur.byte[7]; + residualMidBytes_size += 8; + } else if (leadingNum == 1) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[0]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 4] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 5] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 6] = lfBuf_cur.byte[6]; + residualMidBytes_size += 7; + } else if (leadingNum == 2) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[0]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 4] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 5] = lfBuf_cur.byte[5]; + residualMidBytes_size += 6; + } else if (leadingNum == 3) + { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[0]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 4] = lfBuf_cur.byte[4]; + residualMidBytes_size += 5; + } + + lfBuf_pre = lfBuf_cur; + } + } + + convertIntArray2ByteArray_fast_2b_args(leadNumberArray_int, dataLength, leadNumberArray); + + int k = 4; + + unsigned char reqLengthB = (unsigned char) reqLength; + outputBytes[k] = reqLengthB; + k++; + floatToBytes(&(outputBytes[k]), mValue); + k += sizeof(float); + sizeToBytes(&(outputBytes[k]), leadNumberArray_size); + + totalSize = 4 + 1 + sizeof(float) + sizeof(size_t) + leadNumberArray_size + residualMidBytes_size; + } else { + + } + + *outSize = totalSize; + + free(leadNumberArray_int); +// sz_cost_end(); +// printf("compression time = %f\n", sz_totalCost); + + return outputBytes; +} + +unsigned char *SZ_skip_compress_double(double *data, size_t dataLength, size_t *outSize) { + *outSize = dataLength * sizeof(double); + unsigned char *out = (unsigned char *) malloc(dataLength * sizeof(double)); + memcpy(out, data, dataLength * sizeof(double)); + return out; +} + +inline void computeReqLength_double(float realPrecision, short radExpo, int* reqLength, float* medianValue) +{ + short reqExpo = getPrecisionReqLength_double(realPrecision); + *reqLength = 12+radExpo - reqExpo; //radExpo-reqExpo == reqMantiLength + if(*reqLength<12) + *reqLength = 12; + if(*reqLength>64) + { + *reqLength = 64; + *medianValue = 0; + } +} + diff --git a/qtensor/compression/szx/src/szx_float.c 
b/qtensor/compression/szx/src/szx_float.c new file mode 100644 index 00000000..010c2e4d --- /dev/null +++ b/qtensor/compression/szx/src/szx_float.c @@ -0,0 +1,975 @@ +/** + * @file szx_float.c + * @author Sheng Di, Kai Zhao + * @date Aug, 2022 + * @brief SZ_Init, Compression and Decompression functions + * (C) 2022 by Mathematics and Computer Science (MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + */ + + +#include +#include +#include +#include +#include +#include +#include "szx.h" +#include "szx_float.h" +#include "szx_BytesToolkit.h" +#include "szx_TypeManager.h" +#include + +#ifdef _OPENMP +#include "omp.h" +#endif + +#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) +#include +#endif + +unsigned char * +SZ_fast_compress_args_with_prediction_float(float *pred, float *data, size_t *outSize, float absErrBound, size_t r5, + size_t r4, size_t r3, size_t r2, size_t r1, float medianValue, + float radius) { + size_t dataLength = computeDataLength(r5, r4, r3, r2, r1); + float *delta = (float *) malloc(sizeof(float) * dataLength); + size_t i = 0; + for (i = 0; i < dataLength; i++) + delta[i] = data[i] - pred[i]; + unsigned char *output = SZ_fast_compress_args_unpredictable_float(delta, outSize, absErrBound, r5, r4, r3, r2, r1, + medianValue, radius); + return output; +} + +inline void SZ_fast_compress_args_unpredictable_one_block_float(float *oriData, size_t nbEle, float absErrBound, + unsigned char *outputBytes, int *outSize, + unsigned char *leadNumberArray_int, float medianValue, + float radius) { + size_t totalSize = 0, i = 0; + + int reqLength; + + //compute median, value range, and radius + + short radExpo = getExponent_float(radius); + computeReqLength_float(absErrBound, radExpo, &reqLength, &medianValue); + + int reqBytesLength = reqLength / 8; + int resiBitsLength = reqLength % 8; + int rightShiftBits = 0; + + size_t leadNumberArray_size = nbEle % 4 == 0 ? 
nbEle / 4 : nbEle / 4 + 1; + + register lfloat lfBuf_pre; + register lfloat lfBuf_cur; + lfBuf_pre.ivalue = 0; + + unsigned char *leadNumberArray = outputBytes + 1 + sizeof(float); + + unsigned char *exactMidbyteArray = leadNumberArray + leadNumberArray_size; + + if (resiBitsLength != 0) { + rightShiftBits = 8 - resiBitsLength; + reqBytesLength++; + } + + register unsigned char leadingNum = 0; + size_t residualMidBytes_size = 0; + if (sysEndianType == LITTLE_ENDIAN_SYSTEM) { + if (reqBytesLength == 2) { + for (i = 0; i < nbEle; i++) { + leadingNum = 0; + lfBuf_cur.value = oriData[i] - medianValue; + + lfBuf_cur.ivalue = lfBuf_cur.ivalue >> rightShiftBits; + + lfBuf_pre.ivalue = lfBuf_cur.ivalue ^ lfBuf_pre.ivalue; + + if (lfBuf_pre.ivalue >> 8 == 0) + leadingNum = 3; + else if (lfBuf_pre.ivalue >> 16 == 0) + leadingNum = 2; + else if (lfBuf_pre.ivalue >> 24 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[3]; + residualMidBytes_size += 2; + } else if (leadingNum == 1) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[2]; + residualMidBytes_size++; + } + + lfBuf_pre = lfBuf_cur; + } + } else if (reqBytesLength == 3) { + for (i = 0; i < nbEle; i++) { + leadingNum = 0; + lfBuf_cur.value = oriData[i] - medianValue; + + lfBuf_cur.ivalue = lfBuf_cur.ivalue >> rightShiftBits; + + lfBuf_pre.ivalue = lfBuf_cur.ivalue ^ lfBuf_pre.ivalue; + + if (lfBuf_pre.ivalue >> 8 == 0) + leadingNum = 3; + else if (lfBuf_pre.ivalue >> 16 == 0) + leadingNum = 2; + else if (lfBuf_pre.ivalue >> 24 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[3]; + residualMidBytes_size += 3; + } else if (leadingNum == 1) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[2]; + residualMidBytes_size += 2; + } else if (leadingNum == 2) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[1]; + residualMidBytes_size++; + } + + lfBuf_pre = lfBuf_cur; + } + } else if (reqBytesLength == 1) { + for (i = 0; i < nbEle; i++) { + leadingNum = 0; + lfBuf_cur.value = oriData[i] - medianValue; + + lfBuf_cur.ivalue = lfBuf_cur.ivalue >> rightShiftBits; + + lfBuf_pre.ivalue = lfBuf_cur.ivalue ^ lfBuf_pre.ivalue; + + if (lfBuf_pre.ivalue >> 8 == 0) + leadingNum = 3; + else if (lfBuf_pre.ivalue >> 16 == 0) + leadingNum = 2; + else if (lfBuf_pre.ivalue >> 24 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[3]; + residualMidBytes_size++; + } + + lfBuf_pre = lfBuf_cur; + } + } else //reqBytesLength == 4 + { + for (i = 0; i < nbEle; i++) { + leadingNum = 0; + lfBuf_cur.value = oriData[i] - medianValue; + + lfBuf_cur.ivalue = lfBuf_cur.ivalue >> rightShiftBits; + + lfBuf_pre.ivalue = lfBuf_cur.ivalue ^ lfBuf_pre.ivalue; + + if (lfBuf_pre.ivalue >> 8 == 0) + leadingNum = 3; + else if (lfBuf_pre.ivalue >> 16 == 0) + leadingNum = 2; + else if (lfBuf_pre.ivalue >> 24 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[0]; + 
exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[3]; + residualMidBytes_size += 4; + } else if (leadingNum == 1) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[0]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[2]; + residualMidBytes_size += 3; + } else if (leadingNum == 2) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[0]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[1]; + residualMidBytes_size += 2; + } else //leadingNum == 3 + { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[0]; + residualMidBytes_size++; + } + + lfBuf_pre = lfBuf_cur; + } + } + + convertIntArray2ByteArray_fast_2b_args(leadNumberArray_int, nbEle, leadNumberArray); + int k = 0; + + unsigned char reqLengthB = (unsigned char) reqLength; + outputBytes[k] = reqLengthB; + k++; + floatToBytes(&(outputBytes[k]), medianValue); + k += sizeof(float); + //sizeToBytes(&(outputBytes[k]), leadNumberArray_size); + //outputBytes[k] = leadNumberArray_size; //leadNumberArray_size can be calculated based on block size (=blockSize/4) + + totalSize = 1 + sizeof(float) + leadNumberArray_size + residualMidBytes_size; + } else { + + } + + *outSize = totalSize; + +} + +size_t computeStateMedianRadius_float(float *oriData, size_t nbEle, float absErrBound, int blockSize, + unsigned char *stateArray, float *medianArray, float *radiusArray) { + size_t nbConstantBlocks = 0; + size_t i = 0, j = 0; + size_t nbBlocks = nbEle / blockSize; + size_t offset = 0; + + for (i = 0; i < nbBlocks; i++) { + float min = oriData[offset]; + float max = oriData[offset]; + for (j = 1; j < blockSize; j++) { + float v = oriData[offset + j]; + if (min > v) + min = v; + else if (max < v) + max = v; + } + float valueRange = max - min; + float radius = valueRange / 2; + float medianValue = min + radius; + + if (radius <= absErrBound) { + stateArray[i] = 0; + nbConstantBlocks++; + } else + stateArray[i] = 1; + + stateArray[i] = radius <= absErrBound ? 
0 : 1; + medianArray[i] = medianValue; + radiusArray[i] = radius; + offset += blockSize; + } + + int remainCount = nbEle % blockSize; + if (remainCount != 0) { + float min = oriData[offset]; + float max = oriData[offset]; + for (j = 1; j < remainCount; j++) { + float v = oriData[offset + j]; + if (min > v) + min = v; + else if (max < v) + max = v; + } + float valueRange = max - min; + float radius = valueRange / 2; + float medianValue = min + radius; + if (radius <= absErrBound) { + stateArray[i] = 0; + nbConstantBlocks++; + } else + stateArray[i] = 1; + medianArray[i] = medianValue; + radiusArray[i] = radius; + } + return nbConstantBlocks; +} + + +void max_min_float(float *x, int n, float *tmp_max, float *tmp_min) { + for (size_t i = 0; i < n; i++) { + if (x[i] > *tmp_max) { + *tmp_max = x[i]; + } + if (x[i] < *tmp_min) { + *tmp_min = x[i]; + } + } +} + +void simd_max_min_float(float *x, int n, float *tmp_max, float *tmp_min) { + *tmp_max = x[0]; + *tmp_min = x[0]; +#ifdef __AVX512F__ + // printf("use avx512, n=%d \n", n); + int n16 = n & -16, i = 0, j=0; + if (n > 16) { + float *ptr_x = x; + __m512 max1 = _mm512_loadu_ps(ptr_x); +// __m512 max2 = _mm512_loadu_ps(ptr_x + 16); + __m512 min1 = max1; +// __m512 min2 = max2; + __m512 tmp1; +// __m512 tmp2; + for (; i < n16; i += 16) { + tmp1 = _mm512_loadu_ps(ptr_x); + max1 = _mm512_max_ps(tmp1, max1); + min1 = _mm512_min_ps(tmp1, min1); +// tmp2 = _mm512_loadu_ps(ptr_x+16); +// max2 = _mm512_max_ps(tmp2, max2); +// min2 = _mm512_min_ps(tmp2, min2); + ptr_x += 16; + } +// max1 = _mm512_max_ps(max1, max2); +// min1 = _mm512_min_ps(min1, min2); + __m256 max256 = _mm256_max_ps(_mm512_extractf32x8_ps(max1,0), _mm512_extractf32x8_ps(max1,1)); + __m128 max128 = _mm_max_ps(_mm256_extractf128_ps(max256,0), _mm256_extractf128_ps(max256,1)); + __m256 min256 = _mm256_min_ps(_mm512_extractf32x8_ps(min1,0), _mm512_extractf32x8_ps(min1,1)); + __m128 min128 = _mm_min_ps(_mm256_extractf128_ps(min256,0), _mm256_extractf128_ps(min256,1)); + for (j=0;j<4;j++){ + *tmp_max = *tmp_max < max128[j] ? max128[j] : *tmp_max; + *tmp_min = *tmp_min > min128[j] ? min128[j] : *tmp_min; + } + + if ( i < n ) { + max_min_float(ptr_x, n - i, tmp_max, tmp_min); + } + } else { + max_min_float(x, n, tmp_max, tmp_min); + } +#elif __AVX2__ +// printf("use avx2, n=%d \n", n); + // fflush(stdout); + int n16 = n & -16, i = 0; + if (n > 16) { + float *ptr_x = x; + __m256 max1 = _mm256_loadu_ps(ptr_x); + __m256 max2 = _mm256_loadu_ps(ptr_x + 8); + __m256 min1 = max1; + __m256 min2 = max2; + for (; i < n16; i += 16) { + max1 = _mm256_max_ps(_mm256_loadu_ps(ptr_x), max1); + min1 = _mm256_min_ps(_mm256_loadu_ps(ptr_x), min1); + max2 = _mm256_max_ps(_mm256_loadu_ps(ptr_x + 8), max2); + min2 = _mm256_min_ps(_mm256_loadu_ps(ptr_x + 8), min2); + ptr_x += 16; + } +// printf("%d %d %d\n", n, n16, i); +// exit(0); + max1 = _mm256_max_ps(max1, max2); + min1 = _mm256_min_ps(min1, min2); + for (int j = 0; j < 8; j++) { + *tmp_max = *tmp_max < max1[j] ? max1[j] : *tmp_max; + *tmp_min = *tmp_min > min1[j] ? 
min1[j] : *tmp_min; + } + if ( i < n ) { + max_min_float(ptr_x, n - i, tmp_max, tmp_min); + } + } else { + max_min_float(x, n, tmp_max, tmp_min); + } +#else + max_min_float(x, n, tmp_max, tmp_min); +#endif +} + +void computeStateMedianRadius_float2(float *oriData, size_t nbEle, float absErrBound, + unsigned char *state, float *median, float *radius) { + float min = oriData[0]; + float max = oriData[0]; + simd_max_min_float(oriData, nbEle, &max, &min); + + float valueRange = max - min; + *radius = valueRange / 2; + *median = min + *radius; + + if (*radius <= absErrBound) { + *state = 0; + } else { + *state = 1; + } +} + + +unsigned char * +SZ_fast_compress_args_unpredictable_blocked_float(float *oriData, size_t *outSize, float absErrBound, size_t nbEle, + int blockSize) { + float *op = oriData; + + *outSize = 0; + size_t maxPreservedBufferSize = + sizeof(float) * nbEle; //assume that the compressed data size would not exceed the original size + unsigned char *outputBytes = (unsigned char *) malloc(maxPreservedBufferSize); + memset(outputBytes, 0, maxPreservedBufferSize); + unsigned char *leadNumberArray_int = (unsigned char *) malloc(blockSize * sizeof(int)); + + size_t i = 0; + int oSize = 0; + + size_t nbBlocks = nbEle / blockSize; + size_t remainCount = nbEle % blockSize; + size_t stateNBBytes = + remainCount == 0 ? (nbBlocks % 8 == 0 ? nbBlocks / 8 : nbBlocks / 8 + 1) : ((nbBlocks + 1) % 8 == 0 ? + (nbBlocks + 1) / 8 : + (nbBlocks + 1) / 8 + 1); + size_t actualNBBlocks = remainCount == 0 ? nbBlocks : nbBlocks + 1; + + unsigned char *stateArray = (unsigned char *) malloc(actualNBBlocks); + float *medianArray = (float *) malloc(actualNBBlocks * sizeof(float)); + float *radiusArray = (float *) malloc(actualNBBlocks * sizeof(float)); + + size_t nbConstantBlocks = computeStateMedianRadius_float(oriData, nbEle, absErrBound, blockSize, stateArray, + medianArray, radiusArray); + + unsigned char *r = outputBytes; // + sizeof(size_t) + stateNBBytes; + r[0] = SZx_VER_MAJOR; + r[1] = SZx_VER_MINOR; + r[2] = 1; + r[3] = 0; // indicates this is not a random access version + r[4] = (unsigned char) blockSize; + r = r + 5; //1 byte + sizeToBytes(r, nbConstantBlocks); + r += sizeof(size_t); //r is the starting address of 'stateNBBytes' + + unsigned char *p = r + stateNBBytes; //p is the starting address of constant median values. + unsigned char *q = + p + sizeof(float) * nbConstantBlocks; //q is the starting address of the non-constant data sblocks + //3: versions, 1: metadata: state, 1: metadata: blockSize, sizeof(size_t): nbConstantBlocks, .... 
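+    //... followed by stateNBBytes of 1-bit per-block state flags, one float median
+    //per constant block, and the variable-length compressed non-constant blocks
+    //appended at q (their sizes are added to *outSize in the loop below).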
+ *outSize += (3 + 1 + 1 + sizeof(size_t) + stateNBBytes + sizeof(float) * nbConstantBlocks); + + //printf("nbConstantBlocks = %zu, percent = %f\n", nbConstantBlocks, 1.0f*(nbConstantBlocks*blockSize)/nbEle); + for (i = 0; i < nbBlocks; i++, op += blockSize) { + if (stateArray[i]) { + SZ_fast_compress_args_unpredictable_one_block_float(op, blockSize, absErrBound, q, &oSize, + leadNumberArray_int, medianArray[i], radiusArray[i]); + q += oSize; + *outSize += oSize; + } else { + floatToBytes(p, medianArray[i]); + p += sizeof(float); + } + } + + if (remainCount != 0) { + if (stateArray[i]) { + SZ_fast_compress_args_unpredictable_one_block_float(op, remainCount, absErrBound, q, &oSize, + leadNumberArray_int, medianArray[i], radiusArray[i]); + *outSize += oSize; + } else { + floatToBytes(p, medianArray[i]); + } + + } + + convertIntArray2ByteArray_fast_1b_args(stateArray, actualNBBlocks, r); + + free(stateArray); + free(medianArray); + free(radiusArray); + free(leadNumberArray_int); + + return outputBytes; +} + +unsigned char * +SZ_fast_compress_args_unpredictable_blocked_randomaccess_float_openmp(float *oriData, size_t *outSize, float absErrBound, + size_t nbEle, int blockSize) { +#ifdef _OPENMP + printf("use openmp\n"); + +#ifdef __AVX512F__ + printf("use avx512\n"); +#elif __AVX2__ + printf("use avx2\n"); +#else +#endif + printf("blockSize = %d\n",blockSize); + sz_cost_start(); + float *op = oriData; + + size_t i = 0; + size_t nbBlocks = nbEle / blockSize; + size_t remainCount = nbEle % blockSize; + size_t actualNBBlocks = remainCount == 0 ? nbBlocks : nbBlocks + 1; + size_t stateNBBytes = (actualNBBlocks % 8 == 0 ? actualNBBlocks / 8 : actualNBBlocks / 8 + 1); + + unsigned char *stateArray = (unsigned char *) malloc(actualNBBlocks); + float *medianArray = (float *) malloc(actualNBBlocks * sizeof(float)); + + size_t nbNonConstantBlocks = 0; + + unsigned char *tmp_q = (unsigned char *) malloc(blockSize * sizeof(float) * actualNBBlocks); + int *outSizes = (int *) malloc(actualNBBlocks * sizeof(int)); + size_t *outSizesAccumlate = (size_t *) malloc(actualNBBlocks * sizeof(size_t)); + int *nbNonConstantBlockAccumlate = (int *) malloc(actualNBBlocks * sizeof(int)); + + (*outSize) = 0; + size_t maxPreservedBufferSize = + sizeof(float) * nbEle; //assume that the compressed data size would not exceed the original size + unsigned char *outputBytes = (unsigned char *) malloc(maxPreservedBufferSize); + memset(outputBytes, 0, maxPreservedBufferSize); + unsigned char *r = outputBytes; // + sizeof(size_t) + stateNBBytes; + r[0] = SZx_VER_MAJOR; + r[1] = SZx_VER_MINOR; + r[2] = 1; + r[3] = 1; //support random access decompression + r = r + 4; //4 byte + + int nbThreads = 1; + unsigned char *leadNumberArray_int; + size_t z0[200],z1[200]; + + size_t nbConstantBlocks; + unsigned char *R, *p, *q; + float *pf; + uint16_t *O; + +#pragma omp parallel +{ +#pragma omp single +{ + nbThreads = omp_get_num_threads(); + printf("nbThreads = %d\n", nbThreads); + assert(nbThreads<200); + leadNumberArray_int = (unsigned char *) malloc(blockSize * sizeof(int) * nbThreads); + + sz_cost_end_msg("sequential-1 malloc"); + sz_cost_start(); +} +#pragma omp for reduction(+:nbNonConstantBlocks) schedule(static) + for (i = 0; i < nbBlocks; i++) { + float radius; + computeStateMedianRadius_float2(op + i * blockSize, blockSize, absErrBound, stateArray + i, medianArray + i, + &radius); + if (stateArray[i]) { + SZ_fast_compress_args_unpredictable_one_block_float(op + i * blockSize, blockSize, absErrBound, + tmp_q + i * blockSize * 
sizeof(float), outSizes + i, + leadNumberArray_int + + omp_get_thread_num() * blockSize * sizeof(int), + medianArray[i], radius); + outSizesAccumlate[i]=outSizes[i]; + nbNonConstantBlocks += 1; + }else{ + outSizes[i]=0; + outSizesAccumlate[i]=0; + } + } +#pragma omp single +{ + sz_cost_end_msg("parallel-1 compress"); +// exit(0); + if (remainCount != 0) { + i = nbBlocks; + float radius; + computeStateMedianRadius_float2(op + i * blockSize, remainCount, absErrBound, stateArray + i, medianArray + i, + &radius); + if (stateArray[i]) { + SZ_fast_compress_args_unpredictable_one_block_float(op + i * blockSize, remainCount, absErrBound, + tmp_q + i * blockSize * sizeof(float), outSizes + i, + leadNumberArray_int, medianArray[i], radius); + outSizesAccumlate[i] = outSizes[i]; + nbNonConstantBlocks += 1; + }else{ + outSizesAccumlate[i] = 0; + outSizes[i]=0; + } + } + + nbConstantBlocks = actualNBBlocks - nbNonConstantBlocks; + + sizeToBytes(r, blockSize); + r += sizeof(size_t); + sizeToBytes(r, nbConstantBlocks); + r += sizeof(size_t); + O = (uint16_t*) r; //o is the starting address of 'block-size array' + R = r + nbNonConstantBlocks * sizeof(uint16_t); //R is the starting address of the state array + p = R + stateNBBytes; //p is the starting address of constant median values. + pf = (float *) p; + q = p + sizeof(float) * nbConstantBlocks; //q is the starting address of the non-constant data sblocks + // unsigned char *q0 = q; + // printf("%lu %lu %lu %lu\n",r-outputBytes, R-outputBytes, p-outputBytes, q-outputBytes); + // 3: versions, 1: metadata: state, 1: metadata: blockSize, sizeof(size_t): nbConstantBlocks, .... + *outSize = q - outputBytes; + + sz_cost_start(); + +} + int tid = omp_get_thread_num(); + int lo = tid * actualNBBlocks / nbThreads; + int hi = (tid + 1) * actualNBBlocks / nbThreads; + int b; + nbNonConstantBlockAccumlate[lo]=stateArray[lo]; + for (b = lo+1; b < hi; b++){ + outSizesAccumlate[b] = outSizesAccumlate[b] + outSizesAccumlate[b-1]; + } + for (b = lo+1; b < hi; b++){ + nbNonConstantBlockAccumlate[b]=stateArray[b]+nbNonConstantBlockAccumlate[b-1]; + } + z0[tid] = outSizesAccumlate[hi-1]; + z1[tid] = nbNonConstantBlockAccumlate[hi-1]; + size_t offset0=0, offset1=0; +#pragma omp barrier + for (int j = 0; j < tid; j++) { + offset0+=z0[j]; + offset1+=z1[j]; + } + for (b = lo; b < hi; b++){ + outSizesAccumlate[b] = outSizesAccumlate[b] + offset0; + nbNonConstantBlockAccumlate[b] = nbNonConstantBlockAccumlate[b] + offset1; + } +#pragma omp single +{ + sz_cost_end_msg("parallel-2 prefix sum"); + sz_cost_start(); +}; +#pragma omp for schedule(static) + for (i = 0; i < actualNBBlocks; i++) { + if (stateArray[i]) { + memcpy(q+outSizesAccumlate[i]-outSizes[i], tmp_q + i * blockSize * sizeof(float), outSizes[i]); + O[nbNonConstantBlockAccumlate[i]-1]=outSizes[i]; + } else { + pf[i-nbNonConstantBlockAccumlate[i]]=medianArray[i]; + } + } +#pragma omp single +{ + sz_cost_end_msg("parallel-3 memcpy"); + sz_cost_start(); + + *outSize += outSizesAccumlate[actualNBBlocks-1]; + + convertIntArray2ByteArray_fast_1b_args(stateArray, actualNBBlocks, R); + sz_cost_end_msg("sequential-2 int2byte"); + sz_cost_start(); + free(nbNonConstantBlockAccumlate); + free(outSizesAccumlate); + free(leadNumberArray_int); + free(tmp_q); + free(medianArray); + free(stateArray); + free(outSizes); + sz_cost_end_msg("sequential-3 free"); + printf("blocksize = %d, actualNBBlocks = %lu\n", blockSize, actualNBBlocks); + printf("nbConstantBlocks = %zu, percent = %f\n", nbConstantBlocks, 1.0f * (nbConstantBlocks * 
blockSize) / nbEle); + printf("CR = %.3f, nbEle = %lu \n", nbEle*4.0/(*outSize), nbEle); +} +} + return outputBytes; +#else + return NULL; +#endif +} + +unsigned char * +SZ_fast_compress_args_unpredictable_blocked_randomaccess_float(float *oriData, size_t *outSize, float absErrBound, + size_t nbEle, int blockSize) { + float *op = oriData; + + *outSize = 0; + size_t maxPreservedBufferSize = + sizeof(float) * nbEle; //assume that the compressed data size would not exceed the original size + unsigned char *outputBytes = (unsigned char *) malloc(maxPreservedBufferSize); + memset(outputBytes, 0, maxPreservedBufferSize); + unsigned char *leadNumberArray_int = (unsigned char *) malloc(blockSize * sizeof(int)); + + size_t i = 0; + int oSize = 0; + + size_t nbBlocks = nbEle / blockSize; + size_t remainCount = nbEle % blockSize; + size_t actualNBBlocks = remainCount == 0 ? nbBlocks : nbBlocks + 1; + + size_t stateNBBytes = (actualNBBlocks % 8 == 0 ? actualNBBlocks / 8 : actualNBBlocks / 8 + 1); + + unsigned char *stateArray = (unsigned char *) malloc(actualNBBlocks); + float *medianArray = (float *) malloc(actualNBBlocks * sizeof(float)); + float *radiusArray = (float *) malloc(actualNBBlocks * sizeof(float)); + + size_t nbConstantBlocks = computeStateMedianRadius_float(oriData, nbEle, absErrBound, blockSize, stateArray, + medianArray, radiusArray); + + size_t nbNonConstantBlocks = actualNBBlocks - nbConstantBlocks; + + unsigned char *r = outputBytes; // + sizeof(size_t) + stateNBBytes; + r[0] = SZx_VER_MAJOR; + r[1] = SZx_VER_MINOR; + r[2] = 1; + r[3] = 1; //support random access decompression + r = r + 4; //1 byte + + sizeToBytes(r, blockSize); + r += sizeof(size_t); + sizeToBytes(r, nbConstantBlocks); + r += sizeof(size_t); //r is the starting address of 'block-size array' + uint16_t *O=(uint16_t*)r; + unsigned char *R = r + nbNonConstantBlocks*sizeof(uint16_t); //R is the starting address of the state array + unsigned char *p = R + stateNBBytes; //p is the starting address of constant median values. + unsigned char *q = + p + sizeof(float) * nbConstantBlocks; //q is the starting address of the non-constant data sblocks + //3: versions, 1: metadata: state, 1: metadata: blockSize, sizeof(size_t): nbConstantBlocks, .... 
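+    //... here the header is: 4 version/flag bytes, blockSize and nbConstantBlocks
+    //stored as size_t, one uint16_t compressed size per non-constant block (O),
+    //stateNBBytes of 1-bit per-block state flags (R), one float median per
+    //constant block (p), then the compressed non-constant blocks starting at q.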
+ *outSize = q-outputBytes; + + size_t nonConstantBlockID = 0; + //printf("nbConstantBlocks = %zu, percent = %f\n", nbConstantBlocks, 1.0f*(nbConstantBlocks*blockSize)/nbEle); + for (i = 0; i < nbBlocks; i++, op += blockSize) { + if (stateArray[i]) { + SZ_fast_compress_args_unpredictable_one_block_float(op, blockSize, absErrBound, q, &oSize, + leadNumberArray_int, medianArray[i], radiusArray[i]); + q += oSize; + *outSize += oSize; + O[nonConstantBlockID++] = oSize; + } else { + floatToBytes(p, medianArray[i]); + p += sizeof(float); + } + } + + if (remainCount != 0) { + if (stateArray[i]) { + SZ_fast_compress_args_unpredictable_one_block_float(op, remainCount, absErrBound, q, &oSize, + leadNumberArray_int, medianArray[i], radiusArray[i]); + *outSize += oSize; + O[nonConstantBlockID] = oSize; + } else { + floatToBytes(p, medianArray[i]); + } + + } + + convertIntArray2ByteArray_fast_1b_args(stateArray, actualNBBlocks, R); + + free(leadNumberArray_int); + + return outputBytes; + +} + + +unsigned char * +SZ_fast_compress_args_unpredictable_float(float *data, size_t *outSize, float absErrBound, size_t r5, size_t r4, + size_t r3, size_t r2, size_t r1, float mValue, float radius) { + size_t totalSize = 0; + float medianValue = mValue; + + size_t dataLength = computeDataLength(r5, r4, r3, r2, r1); + + size_t maxPreservedBufferSize = + sizeof(float) * dataLength; //assume that the compressed data size would not exceed the original size + + unsigned char *outputBytes = (unsigned char *) malloc(maxPreservedBufferSize); + memset(outputBytes, 0, maxPreservedBufferSize); + unsigned char *r = outputBytes; // + sizeof(size_t) + stateNBBytes; + r[0] = SZx_VER_MAJOR; + r[1] = SZx_VER_MINOR; + r[2] = 1; //SZx_VER_SUPERFAST + r[3] = 0; //support random access decompression + +// sz_cost_start(); + size_t i; + int reqLength; + short radExpo = getExponent_float(radius); + + computeReqLength_float(absErrBound, radExpo, &reqLength, &medianValue); + + int reqBytesLength = reqLength / 8; + int resiBitsLength = reqLength % 8; + int rightShiftBits = 0; + + size_t leadNumberArray_size = dataLength % 4 == 0 ? 
dataLength / 4 : dataLength / 4 + 1; + + register lfloat lfBuf_pre; + register lfloat lfBuf_cur; + lfBuf_pre.ivalue = 0; + + unsigned char *leadNumberArray = outputBytes + 4 + 1 + sizeof(float) + sizeof(size_t); + + unsigned char *exactMidbyteArray = leadNumberArray + leadNumberArray_size; + + if (resiBitsLength != 0) { + rightShiftBits = 8 - resiBitsLength; + reqBytesLength++; + } + + register unsigned char leadingNum = 0; + + unsigned char *leadNumberArray_int = (unsigned char *) malloc(dataLength); + + size_t residualMidBytes_size = 0; + if (sysEndianType == LITTLE_ENDIAN_SYSTEM) { + if (reqBytesLength == 3) { + for (i = 0; i < dataLength; i++) { + leadingNum = 0; + lfBuf_cur.value = data[i] - medianValue; + + lfBuf_cur.ivalue = lfBuf_cur.ivalue >> rightShiftBits; + + lfBuf_pre.ivalue = lfBuf_cur.ivalue ^ lfBuf_pre.ivalue; + + if (lfBuf_pre.ivalue >> 8 == 0) + leadingNum = 3; + else if (lfBuf_pre.ivalue >> 16 == 0) + leadingNum = 2; + else if (lfBuf_pre.ivalue >> 24 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[3]; + residualMidBytes_size += 3; + } else if (leadingNum == 1) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[2]; + residualMidBytes_size += 2; + } else if (leadingNum == 2) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[1]; + residualMidBytes_size++; + } + + lfBuf_pre = lfBuf_cur; + } + } else if (reqBytesLength == 2) { + for (i = 0; i < dataLength; i++) { + + leadingNum = 0; + lfBuf_cur.value = data[i] - medianValue; + + lfBuf_cur.ivalue = lfBuf_cur.ivalue >> rightShiftBits; + + lfBuf_pre.ivalue = lfBuf_cur.ivalue ^ lfBuf_pre.ivalue; + + if (lfBuf_pre.ivalue >> 8 == 0) + leadingNum = 3; + else if (lfBuf_pre.ivalue >> 16 == 0) + leadingNum = 2; + else if (lfBuf_pre.ivalue >> 24 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[3]; + residualMidBytes_size += 2; + } else if (leadingNum == 1) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[2]; + residualMidBytes_size++; + } + + lfBuf_pre = lfBuf_cur; + } + } else if (reqBytesLength == 1) { + for (i = 0; i < dataLength; i++) { + leadingNum = 0; + lfBuf_cur.value = data[i] - medianValue; + + lfBuf_cur.ivalue = lfBuf_cur.ivalue >> rightShiftBits; + + lfBuf_pre.ivalue = lfBuf_cur.ivalue ^ lfBuf_pre.ivalue; + + if (lfBuf_pre.ivalue >> 8 == 0) + leadingNum = 3; + else if (lfBuf_pre.ivalue >> 16 == 0) + leadingNum = 2; + else if (lfBuf_pre.ivalue >> 24 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[3]; + residualMidBytes_size++; + } + + lfBuf_pre = lfBuf_cur; + } + }else //reqBytesLength == 4 + { + for (i = 0; i < dataLength; i++) { + leadingNum = 0; + lfBuf_cur.value = data[i] - medianValue; + + lfBuf_cur.ivalue = lfBuf_cur.ivalue >> rightShiftBits; + + lfBuf_pre.ivalue = lfBuf_cur.ivalue ^ lfBuf_pre.ivalue; + + if (lfBuf_pre.ivalue >> 8 == 0) + leadingNum = 3; + else if (lfBuf_pre.ivalue >> 16 == 0) + leadingNum = 2; + else if (lfBuf_pre.ivalue >> 24 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + 
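+                // leadingNum is the number of most-significant bytes of the XOR with the
+                // previous (shifted) value that are zero; only the remaining required
+                // bytes are appended to exactMidbyteArray below.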
if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[0]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[3]; + residualMidBytes_size += 4; + } else if (leadingNum == 1) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[0]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[2]; + residualMidBytes_size += 3; + } else if (leadingNum == 2) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[0]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[1]; + residualMidBytes_size += 2; + } else //leadingNum == 3 + { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[0]; + residualMidBytes_size++; + } + + lfBuf_pre = lfBuf_cur; + } + } + + convertIntArray2ByteArray_fast_2b_args(leadNumberArray_int, dataLength, leadNumberArray); + + int k = 4; + + unsigned char reqLengthB = (unsigned char) reqLength; + outputBytes[k] = reqLengthB; + k++; + floatToBytes(&(outputBytes[k]), medianValue); + k += sizeof(float); + sizeToBytes(&(outputBytes[k]), leadNumberArray_size); + + totalSize = 4 + 1 + sizeof(float) + sizeof(size_t) + leadNumberArray_size + residualMidBytes_size; + } else { + + } + + *outSize = totalSize; + + free(leadNumberArray_int); +// sz_cost_end(); +// printf("compression time = %f\n", sz_totalCost); + + return outputBytes; +} + +unsigned char *SZ_skip_compress_float(float *data, size_t dataLength, size_t *outSize) { + *outSize = dataLength * sizeof(float); + unsigned char *out = (unsigned char *) malloc(dataLength * sizeof(float)); + memcpy(out, data, dataLength * sizeof(float)); + return out; +} + +inline void computeReqLength_float(double realPrecision, short radExpo, int *reqLength, float *medianValue) { + short reqExpo = getPrecisionReqLength_double(realPrecision); + *reqLength = 9 + radExpo - reqExpo + 1; //radExpo-reqExpo == reqMantiLength + if (*reqLength < 9) + *reqLength = 9; + if (*reqLength > 32) { + *reqLength = 32; + *medianValue = 0; + } +} diff --git a/qtensor/compression/szx/src/szx_rw.c b/qtensor/compression/szx/src/szx_rw.c new file mode 100644 index 00000000..8e3e92a3 --- /dev/null +++ b/qtensor/compression/szx/src/szx_rw.c @@ -0,0 +1,1009 @@ +/** + * @file szx_rw.c + * @author Sheng Di + * @date April, 2022 + * @brief io interface for fortrance + * (C) 2022 by Mathematics and Computer Science (MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. 
+ */ + +#include +#include +#include +#include +#include + +#include "szx_rw.h" +#include "szx.h" +#include "szx_BytesToolkit.h" +#include "szx_dataCompression.h" + +int checkFileExistance(char* filePath) +{ + if( access( filePath, F_OK ) != -1 ) { + // file exists + return 1; + } else { + // file doesn't exist + return 0; + } +} + +float** create2DArray_float(size_t m, size_t n) +{ + size_t i=0; + float **data = (float**)malloc(sizeof(float*)*m); + for(i=0;i +#include +#include +#include +#include "szx_utility.h" + +struct timeval sz_costStart; /*only used for recording the cost*/ +double sz_totalCost = 0; + +void sz_cost_start() +{ + sz_totalCost = 0; + gettimeofday(&sz_costStart, NULL); +} + +void sz_cost_end() +{ + double elapsed; + struct timeval costEnd; + gettimeofday(&costEnd, NULL); + elapsed = ((costEnd.tv_sec*1000000+costEnd.tv_usec)-(sz_costStart.tv_sec*1000000+sz_costStart.tv_usec))/1000000.0; + sz_totalCost += elapsed; +} + +void sz_cost_end_msg(char *msg) +{ + double elapsed; + struct timeval costEnd; + gettimeofday(&costEnd, NULL); + elapsed = ((costEnd.tv_sec*1000000+costEnd.tv_usec)-(sz_costStart.tv_sec*1000000+sz_costStart.tv_usec))/1000000.0; + sz_totalCost += elapsed; + printf("timecost=%f, %s\n", elapsed, msg); +} diff --git a/qtensor/compression/szx/src/szxd_double.c b/qtensor/compression/szx/src/szxd_double.c new file mode 100644 index 00000000..b04bca20 --- /dev/null +++ b/qtensor/compression/szx/src/szxd_double.c @@ -0,0 +1,1104 @@ +/** + * @file szxd_double.c + * @author Sheng Di, Kai Zhao + * @date Feb, 2022 + * @brief + * (C) 2022 by Mathematics and Computer Science (MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + */ + +#include +#include +#include +#include "szxd_double.h" +#include "szx.h" +#include "szx_BytesToolkit.h" +#include "szx_TypeManager.h" +#ifdef _OPENMP +#include "omp.h" +#endif + +int SZ_fast_decompress_args_unpredictable_one_block_double(double* newData, size_t blockSize, unsigned char* cmpBytes) +{ + int cmpSize = 0; + size_t nbEle = blockSize; + + register double medianValue; + size_t leadNumArray_size = nbEle%4==0?nbEle/4:nbEle/4+1; + + size_t k = 0; + int reqLength = (int)cmpBytes[k]; + k++; + medianValue = (double)bytesToFloat(&(cmpBytes[k])); + k+=sizeof(float); + + unsigned char* leadNumArray = &(cmpBytes[k]); + k += leadNumArray_size; + unsigned char* residualMidBytes = &(cmpBytes[k]); + unsigned char* q = residualMidBytes; + + cmpSize = k; + + size_t i = 0, j = 0; + k = 0; + + register ldouble lfBuf_pre; + register ldouble lfBuf_cur; + + lfBuf_pre.lvalue = 0; + + int reqBytesLength, resiBitsLength; + register unsigned char leadingNum; + + reqBytesLength = reqLength/8; + resiBitsLength = reqLength%8; + int rightShiftBits = 0; + + if(resiBitsLength!=0) + { + rightShiftBits = 8 - resiBitsLength; + reqBytesLength ++; + } + + //sz_cost_start(); + if(sysEndianType==LITTLE_ENDIAN_SYSTEM) + { + //reqBytesLength must not be equal to 1 for double data + if(reqBytesLength == 3) + { + for(i=0;i < nbEle;i++) + { + lfBuf_cur.value = 0; + + j = (i >> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = (leadNumArray[j] >> (6 - k)) & 0x03; + + if(leadingNum == 1) + { + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + lfBuf_cur.byte[5] = q[0]; + lfBuf_cur.byte[6] = q[1]; + q += 2; + } + else if(leadingNum == 2) + { + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + lfBuf_cur.byte[5] = q[0]; + q += 1; + } + else if(leadingNum == 3) + { + lfBuf_cur.byte[5] = lfBuf_pre.byte[5]; + 
lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + } + else //==0 + { + lfBuf_cur.byte[5] = q[0]; + lfBuf_cur.byte[6] = q[1]; + lfBuf_cur.byte[7] = q[2]; + q += 3; + } + + lfBuf_cur.lvalue = lfBuf_cur.lvalue << rightShiftBits; + newData[i] = lfBuf_cur.value + medianValue; + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + } + } + else if(reqBytesLength == 2) + { + for(i=0;i < nbEle;i++) + { + lfBuf_cur.value = 0; + + j = (i >> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = (leadNumArray[j] >> (6 - k)) & 0x03; + + if(leadingNum == 1) + { + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + lfBuf_cur.byte[6] = q[0]; + q += 1; + } + else if(leadingNum >= 2) + { + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + } + else //==0 + { + lfBuf_cur.byte[6] = q[0]; + lfBuf_cur.byte[7] = q[1]; + q += 2; + } + + lfBuf_cur.lvalue = lfBuf_cur.lvalue << rightShiftBits; + newData[i] = lfBuf_cur.value + medianValue; + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + } + } + else if(reqBytesLength == 4) + { + for(i=0;i < nbEle;i++) + { + lfBuf_cur.value = 0; + + j = (i >> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = (leadNumArray[j] >> (6 - k)) & 0x03; + + if(leadingNum == 1) + { + lfBuf_cur.byte[4] = q[0]; + lfBuf_cur.byte[5] = q[1]; + lfBuf_cur.byte[6] = q[2]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 3; + } + else if(leadingNum == 2) + { + lfBuf_cur.byte[4] = q[0]; + lfBuf_cur.byte[5] = q[1]; + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 2; + } + else if(leadingNum == 3) + { + lfBuf_cur.byte[4] = q[0]; + lfBuf_cur.byte[5] = lfBuf_pre.byte[5]; + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 1; + } + else //==0 + { + lfBuf_cur.byte[4] = q[0]; + lfBuf_cur.byte[5] = q[1]; + lfBuf_cur.byte[6] = q[2]; + lfBuf_cur.byte[7] = q[3]; + q += 4; + } + + lfBuf_cur.lvalue = lfBuf_cur.lvalue << rightShiftBits; + newData[i] = lfBuf_cur.value + medianValue; + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + } + } + else if(reqBytesLength == 5) + { + for(i=0;i < nbEle;i++) + { + lfBuf_cur.value = 0; + + j = (i >> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = (leadNumArray[j] >> (6 - k)) & 0x03; + + if(leadingNum == 1) + { + lfBuf_cur.byte[3] = q[0]; + lfBuf_cur.byte[4] = q[1]; + lfBuf_cur.byte[5] = q[2]; + lfBuf_cur.byte[6] = q[3]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 4; + } + else if(leadingNum == 2) + { + lfBuf_cur.byte[3] = q[0]; + lfBuf_cur.byte[4] = q[1]; + lfBuf_cur.byte[5] = q[2]; + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 3; + } + else if(leadingNum == 3) + { + lfBuf_cur.byte[3] = q[0]; + lfBuf_cur.byte[4] = q[1]; + lfBuf_cur.byte[5] = lfBuf_pre.byte[5]; + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 2; + } + else //==0 + { + lfBuf_cur.byte[3] = q[0]; + lfBuf_cur.byte[4] = q[1]; + lfBuf_cur.byte[5] = q[2]; + lfBuf_cur.byte[6] = q[3]; + lfBuf_cur.byte[7] = q[3]; + q += 5; + } + + lfBuf_cur.lvalue = lfBuf_cur.lvalue << rightShiftBits; + newData[i] = lfBuf_cur.value + medianValue; + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + } + } + else if(reqBytesLength == 6) + { + for(i=0;i < nbEle;i++) + { + lfBuf_cur.value = 0; + + j = (i >> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = 
(leadNumArray[j] >> (6 - k)) & 0x03; + + if(leadingNum == 1) + { + lfBuf_cur.byte[2] = q[0]; + lfBuf_cur.byte[3] = q[1]; + lfBuf_cur.byte[4] = q[2]; + lfBuf_cur.byte[5] = q[3]; + lfBuf_cur.byte[6] = q[4]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 5; + } + else if(leadingNum == 2) + { + lfBuf_cur.byte[2] = q[0]; + lfBuf_cur.byte[3] = q[1]; + lfBuf_cur.byte[4] = q[2]; + lfBuf_cur.byte[5] = q[3]; + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 4; + } + else if(leadingNum == 3) + { + lfBuf_cur.byte[2] = q[0]; + lfBuf_cur.byte[3] = q[1]; + lfBuf_cur.byte[4] = q[2]; + lfBuf_cur.byte[5] = lfBuf_pre.byte[5]; + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 3; + } + else //==0 + { + lfBuf_cur.byte[2] = q[0]; + lfBuf_cur.byte[3] = q[1]; + lfBuf_cur.byte[4] = q[2]; + lfBuf_cur.byte[5] = q[3]; + lfBuf_cur.byte[6] = q[4]; + lfBuf_cur.byte[7] = q[5]; + q += 6; + } + + lfBuf_cur.lvalue = lfBuf_cur.lvalue << rightShiftBits; + newData[i] = lfBuf_cur.value + medianValue; + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + } + } + else if(reqBytesLength == 7) + { + for(i=0;i < nbEle;i++) + { + lfBuf_cur.value = 0; + + j = (i >> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = (leadNumArray[j] >> (6 - k)) & 0x03; + + if(leadingNum == 1) + { + lfBuf_cur.byte[1] = q[0]; + lfBuf_cur.byte[2] = q[1]; + lfBuf_cur.byte[3] = q[2]; + lfBuf_cur.byte[4] = q[3]; + lfBuf_cur.byte[5] = q[4]; + lfBuf_cur.byte[6] = q[5]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 6; + } + else if(leadingNum == 2) + { + lfBuf_cur.byte[1] = q[0]; + lfBuf_cur.byte[2] = q[1]; + lfBuf_cur.byte[3] = q[2]; + lfBuf_cur.byte[4] = q[3]; + lfBuf_cur.byte[5] = q[4]; + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 5; + } + else if(leadingNum == 3) + { + lfBuf_cur.byte[1] = q[0]; + lfBuf_cur.byte[2] = q[1]; + lfBuf_cur.byte[3] = q[2]; + lfBuf_cur.byte[4] = q[3]; + lfBuf_cur.byte[5] = lfBuf_pre.byte[5]; + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 4; + } + else //==0 + { + lfBuf_cur.byte[1] = q[0]; + lfBuf_cur.byte[2] = q[1]; + lfBuf_cur.byte[3] = q[2]; + lfBuf_cur.byte[4] = q[3]; + lfBuf_cur.byte[5] = q[4]; + lfBuf_cur.byte[6] = q[5]; + lfBuf_cur.byte[7] = q[6]; + q += 7; + } + + lfBuf_cur.lvalue = lfBuf_cur.lvalue << rightShiftBits; + newData[i] = lfBuf_cur.value + medianValue; + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + } + } + else //reqBytesLength == 8 + { + for(i=0;i < nbEle;i++) + { + lfBuf_cur.value = 0; + + j = (i >> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = (leadNumArray[j] >> (6 - k)) & 0x03; + + if(leadingNum == 1) + { + lfBuf_cur.byte[0] = q[0]; + lfBuf_cur.byte[1] = q[1]; + lfBuf_cur.byte[2] = q[2]; + lfBuf_cur.byte[3] = q[3]; + lfBuf_cur.byte[4] = q[4]; + lfBuf_cur.byte[5] = q[5]; + lfBuf_cur.byte[6] = q[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 7; + } + else if(leadingNum == 2) + { + lfBuf_cur.byte[0] = q[0]; + lfBuf_cur.byte[1] = q[1]; + lfBuf_cur.byte[2] = q[2]; + lfBuf_cur.byte[3] = q[3]; + lfBuf_cur.byte[4] = q[4]; + lfBuf_cur.byte[5] = q[5]; + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 6; + } + else if(leadingNum == 3) + { + lfBuf_cur.byte[0] = q[0]; + lfBuf_cur.byte[1] = q[1]; + lfBuf_cur.byte[2] = q[2]; + lfBuf_cur.byte[3] = q[3]; + lfBuf_cur.byte[4] = q[4]; + lfBuf_cur.byte[5] = lfBuf_pre.byte[5]; + 
lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 5; + } + else //==0 + { + lfBuf_cur.byte[0] = q[0]; + lfBuf_cur.byte[1] = q[1]; + lfBuf_cur.byte[2] = q[2]; + lfBuf_cur.byte[3] = q[3]; + lfBuf_cur.byte[4] = q[4]; + lfBuf_cur.byte[5] = q[5]; + lfBuf_cur.byte[6] = q[6]; + lfBuf_cur.byte[7] = q[7]; + q += 8; + } + + lfBuf_cur.lvalue = lfBuf_cur.lvalue << rightShiftBits; + newData[i] = lfBuf_cur.value + medianValue; + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + } + } + } + else + { + + } + + cmpSize += (q - residualMidBytes); //add the number of residualMidBytes + return cmpSize; +} + + +void SZ_fast_decompress_args_unpredictable_blocked_double(double** newData, size_t nbEle, unsigned char* cmpBytes) +{ + *newData = (double*)malloc(sizeof(double)*nbEle); + + unsigned char* r = cmpBytes; + r += 4; + int blockSize = r[0]; //get block size + r++; + size_t nbConstantBlocks = bytesToLong_bigEndian(r); //get number of constant blocks + r += sizeof(size_t); + + size_t nbBlocks = nbEle/blockSize; + size_t remainCount = nbEle%blockSize; + size_t stateNBBytes = remainCount == 0 ? (nbBlocks%8==0?nbBlocks/8:nbBlocks/8+1) : ((nbBlocks+1)%8==0? (nbBlocks+1)/8:(nbBlocks+1)/8+1); + size_t actualNBBlocks = remainCount==0 ? nbBlocks : nbBlocks+1; + unsigned char* stateArray = (unsigned char*)malloc(actualNBBlocks); + float* constantMedianArray = (float*)malloc(nbConstantBlocks*sizeof(float)); + + convertByteArray2IntArray_fast_1b_args(actualNBBlocks, r, stateNBBytes, stateArray); //get the stateArray + + unsigned char* p = r + stateNBBytes; //p is the starting address of constant median values. + + size_t i = 0, j = 0, k = 0; //k is used to keep track of constant block index + for(i = 0;i < nbConstantBlocks;i++, j+=4) //get the median values for constant-value blocks + constantMedianArray[i] = bytesToFloat(p+j); + + unsigned char* q = p + sizeof(float)*nbConstantBlocks; //q is the starting address of the non-constant data blocks + double* op = *newData; + + for(i=0;i> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = (leadNumArray[j] >> (6 - k)) & 0x03; + + if(leadingNum == 1) + { + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + lfBuf_cur.byte[5] = q[0]; + lfBuf_cur.byte[6] = q[1]; + q += 2; + } + else if(leadingNum == 2) + { + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + lfBuf_cur.byte[5] = q[0]; + q += 1; + } + else if(leadingNum == 3) + { + lfBuf_cur.byte[5] = lfBuf_pre.byte[5]; + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + } + else //==0 + { + lfBuf_cur.byte[5] = q[0]; + lfBuf_cur.byte[6] = q[1]; + lfBuf_cur.byte[7] = q[2]; + q += 3; + } + + lfBuf_cur.lvalue = lfBuf_cur.lvalue << rightShiftBits; + (*newData)[i] = lfBuf_cur.value + medianValue; + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + } + } + else if(reqBytesLength == 2) + { + for(i=0;i < nbEle;i++) + { + lfBuf_cur.value = 0; + + j = (i >> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = (leadNumArray[j] >> (6 - k)) & 0x03; + + if(leadingNum == 1) + { + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + lfBuf_cur.byte[6] = q[0]; + q += 1; + } + else if(leadingNum >= 2) + { + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + } + else //==0 + { + lfBuf_cur.byte[6] = q[0]; + lfBuf_cur.byte[7] = q[1]; + q += 2; + } + + lfBuf_cur.lvalue = lfBuf_cur.lvalue << rightShiftBits; + (*newData)[i] = lfBuf_cur.value + medianValue; + 
lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + } + } + else if(reqBytesLength == 4) + { + for(i=0;i < nbEle;i++) + { + lfBuf_cur.value = 0; + + j = (i >> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = (leadNumArray[j] >> (6 - k)) & 0x03; + + if(leadingNum == 1) + { + lfBuf_cur.byte[4] = q[0]; + lfBuf_cur.byte[5] = q[1]; + lfBuf_cur.byte[6] = q[2]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 3; + } + else if(leadingNum == 2) + { + lfBuf_cur.byte[4] = q[0]; + lfBuf_cur.byte[5] = q[1]; + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 2; + } + else if(leadingNum == 3) + { + lfBuf_cur.byte[4] = q[0]; + lfBuf_cur.byte[5] = lfBuf_pre.byte[5]; + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 1; + } + else //==0 + { + lfBuf_cur.byte[4] = q[0]; + lfBuf_cur.byte[5] = q[1]; + lfBuf_cur.byte[6] = q[2]; + lfBuf_cur.byte[7] = q[3]; + q += 4; + } + + lfBuf_cur.lvalue = lfBuf_cur.lvalue << rightShiftBits; + (*newData)[i] = lfBuf_cur.value + medianValue; + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + } + } + else if(reqBytesLength == 5) + { + for(i=0;i < nbEle;i++) + { + lfBuf_cur.value = 0; + + j = (i >> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = (leadNumArray[j] >> (6 - k)) & 0x03; + + if(leadingNum == 1) + { + lfBuf_cur.byte[3] = q[0]; + lfBuf_cur.byte[4] = q[1]; + lfBuf_cur.byte[5] = q[2]; + lfBuf_cur.byte[6] = q[3]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 4; + } + else if(leadingNum == 2) + { + lfBuf_cur.byte[3] = q[0]; + lfBuf_cur.byte[4] = q[1]; + lfBuf_cur.byte[5] = q[2]; + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 3; + } + else if(leadingNum == 3) + { + lfBuf_cur.byte[3] = q[0]; + lfBuf_cur.byte[4] = q[1]; + lfBuf_cur.byte[5] = lfBuf_pre.byte[5]; + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 2; + } + else //==0 + { + lfBuf_cur.byte[3] = q[0]; + lfBuf_cur.byte[4] = q[1]; + lfBuf_cur.byte[5] = q[2]; + lfBuf_cur.byte[6] = q[3]; + lfBuf_cur.byte[7] = q[3]; + q += 5; + } + + lfBuf_cur.lvalue = lfBuf_cur.lvalue << rightShiftBits; + (*newData)[i] = lfBuf_cur.value + medianValue; + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + } + } + else if(reqBytesLength == 6) + { + for(i=0;i < nbEle;i++) + { + lfBuf_cur.value = 0; + + j = (i >> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = (leadNumArray[j] >> (6 - k)) & 0x03; + + if(leadingNum == 1) + { + lfBuf_cur.byte[2] = q[0]; + lfBuf_cur.byte[3] = q[1]; + lfBuf_cur.byte[4] = q[2]; + lfBuf_cur.byte[5] = q[3]; + lfBuf_cur.byte[6] = q[4]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 5; + } + else if(leadingNum == 2) + { + lfBuf_cur.byte[2] = q[0]; + lfBuf_cur.byte[3] = q[1]; + lfBuf_cur.byte[4] = q[2]; + lfBuf_cur.byte[5] = q[3]; + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 4; + } + else if(leadingNum == 3) + { + lfBuf_cur.byte[2] = q[0]; + lfBuf_cur.byte[3] = q[1]; + lfBuf_cur.byte[4] = q[2]; + lfBuf_cur.byte[5] = lfBuf_pre.byte[5]; + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 3; + } + else //==0 + { + lfBuf_cur.byte[2] = q[0]; + lfBuf_cur.byte[3] = q[1]; + lfBuf_cur.byte[4] = q[2]; + lfBuf_cur.byte[5] = q[3]; + lfBuf_cur.byte[6] = q[4]; + lfBuf_cur.byte[7] = q[5]; + q += 6; + } + + lfBuf_cur.lvalue = lfBuf_cur.lvalue << rightShiftBits; + 
(*newData)[i] = lfBuf_cur.value + medianValue; + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + } + } + else if(reqBytesLength == 7) + { + for(i=0;i < nbEle;i++) + { + lfBuf_cur.value = 0; + + j = (i >> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = (leadNumArray[j] >> (6 - k)) & 0x03; + + if(leadingNum == 1) + { + lfBuf_cur.byte[1] = q[0]; + lfBuf_cur.byte[2] = q[1]; + lfBuf_cur.byte[3] = q[2]; + lfBuf_cur.byte[4] = q[3]; + lfBuf_cur.byte[5] = q[4]; + lfBuf_cur.byte[6] = q[5]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 6; + } + else if(leadingNum == 2) + { + lfBuf_cur.byte[1] = q[0]; + lfBuf_cur.byte[2] = q[1]; + lfBuf_cur.byte[3] = q[2]; + lfBuf_cur.byte[4] = q[3]; + lfBuf_cur.byte[5] = q[4]; + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 5; + } + else if(leadingNum == 3) + { + lfBuf_cur.byte[1] = q[0]; + lfBuf_cur.byte[2] = q[1]; + lfBuf_cur.byte[3] = q[2]; + lfBuf_cur.byte[4] = q[3]; + lfBuf_cur.byte[5] = lfBuf_pre.byte[5]; + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 4; + } + else //==0 + { + lfBuf_cur.byte[1] = q[0]; + lfBuf_cur.byte[2] = q[1]; + lfBuf_cur.byte[3] = q[2]; + lfBuf_cur.byte[4] = q[3]; + lfBuf_cur.byte[5] = q[4]; + lfBuf_cur.byte[6] = q[5]; + lfBuf_cur.byte[7] = q[6]; + q += 7; + } + + lfBuf_cur.lvalue = lfBuf_cur.lvalue << rightShiftBits; + (*newData)[i] = lfBuf_cur.value + medianValue; + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + } + } + else //reqBytesLength == 8 + { + for(i=0;i < nbEle;i++) + { + lfBuf_cur.value = 0; + + j = (i >> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = (leadNumArray[j] >> (6 - k)) & 0x03; + + if(leadingNum == 1) + { + lfBuf_cur.byte[0] = q[0]; + lfBuf_cur.byte[1] = q[1]; + lfBuf_cur.byte[2] = q[2]; + lfBuf_cur.byte[3] = q[3]; + lfBuf_cur.byte[4] = q[4]; + lfBuf_cur.byte[5] = q[5]; + lfBuf_cur.byte[6] = q[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 7; + } + else if(leadingNum == 2) + { + lfBuf_cur.byte[0] = q[0]; + lfBuf_cur.byte[1] = q[1]; + lfBuf_cur.byte[2] = q[2]; + lfBuf_cur.byte[3] = q[3]; + lfBuf_cur.byte[4] = q[4]; + lfBuf_cur.byte[5] = q[5]; + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 6; + } + else if(leadingNum == 3) + { + lfBuf_cur.byte[0] = q[0]; + lfBuf_cur.byte[1] = q[1]; + lfBuf_cur.byte[2] = q[2]; + lfBuf_cur.byte[3] = q[3]; + lfBuf_cur.byte[4] = q[4]; + lfBuf_cur.byte[5] = lfBuf_pre.byte[5]; + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 5; + } + else //==0 + { + lfBuf_cur.byte[0] = q[0]; + lfBuf_cur.byte[1] = q[1]; + lfBuf_cur.byte[2] = q[2]; + lfBuf_cur.byte[3] = q[3]; + lfBuf_cur.byte[4] = q[4]; + lfBuf_cur.byte[5] = q[5]; + lfBuf_cur.byte[6] = q[6]; + lfBuf_cur.byte[7] = q[7]; + q += 8; + } + + lfBuf_cur.lvalue = lfBuf_cur.lvalue << rightShiftBits; + (*newData)[i] = lfBuf_cur.value + medianValue; + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + } + } + + + + } + + //sz_cost_end(); + //printf("totalCost = %f\n", sz_totalCost); + //free(leadNum); + +} diff --git a/qtensor/compression/szx/src/szxd_float.c b/qtensor/compression/szx/src/szxd_float.c new file mode 100644 index 00000000..63d6ad6e --- /dev/null +++ b/qtensor/compression/szx/src/szxd_float.c @@ -0,0 +1,654 @@ +/** + * @file szxd_float.c + * @author Sheng Di, Kai Zhao + * @date Feb, 2022 + * @brief + * (C) 2022 by Mathematics and Computer Science 
(MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + */ + +#include +#include +#include +#include "szxd_float.h" +#include "szx.h" +#include "szx_BytesToolkit.h" +#include "szx_TypeManager.h" +#ifdef _OPENMP +#include "omp.h" +#endif + +void SZ_fast_decompress_args_with_prediction_float(float** newData, float* pred, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize) +{ + size_t nbEle = computeDataLength(r5, r4, r3, r2, r1); + SZ_fast_decompress_args_unpredictable_float(newData, r5, r4, r3, r2, r1, cmpBytes, cmpSize); + size_t i = 0; + for(i=0;i> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = (leadNumArray[j] >> (6 - k)) & 0x03; + + if(leadingNum == 1) + { + lfBuf_cur.byte[3] = lfBuf_pre.byte[3]; + lfBuf_cur.byte[1] = q[0]; + lfBuf_cur.byte[2] = q[1]; + q += 2; + } + else if(leadingNum == 2) + { + lfBuf_cur.byte[2] = lfBuf_pre.byte[2]; + lfBuf_cur.byte[3] = lfBuf_pre.byte[3]; + lfBuf_cur.byte[1] = q[0]; + q += 1; + } + else if(leadingNum == 3) + { + lfBuf_cur.byte[1] = lfBuf_pre.byte[1]; + lfBuf_cur.byte[2] = lfBuf_pre.byte[2]; + lfBuf_cur.byte[3] = lfBuf_pre.byte[3]; + } + else //==0 + { + lfBuf_cur.byte[1] = q[0]; + lfBuf_cur.byte[2] = q[1]; + lfBuf_cur.byte[3] = q[2]; + q += 3; + } + + lfBuf_cur.ivalue = lfBuf_cur.ivalue << rightShiftBits; + newData[i] = lfBuf_cur.value + medianValue; + lfBuf_cur.ivalue = lfBuf_cur.ivalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + } + } + else if(reqBytesLength == 2) + { + for(i=0;i < nbEle;i++) + { + lfBuf_cur.value = 0; + + j = (i >> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = (leadNumArray[j] >> (6 - k)) & 0x03; + + if(leadingNum == 1) + { + lfBuf_cur.byte[3] = lfBuf_pre.byte[3]; + lfBuf_cur.byte[2] = q[0]; + q += 1; + } + else if(leadingNum >= 2) + { + lfBuf_cur.byte[2] = lfBuf_pre.byte[2]; + lfBuf_cur.byte[3] = lfBuf_pre.byte[3]; + } + else //==0 + { + lfBuf_cur.byte[2] = q[0]; + lfBuf_cur.byte[3] = q[1]; + q += 2; + } + + lfBuf_cur.ivalue = lfBuf_cur.ivalue << rightShiftBits; + newData[i] = lfBuf_cur.value + medianValue; + lfBuf_cur.ivalue = lfBuf_cur.ivalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + } + } + else if(reqBytesLength == 1) + { + for(i=0;i < nbEle;i++) + { + lfBuf_cur.value = 0; + + j = (i >> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = (leadNumArray[j] >> (6 - k)) & 0x03; + + if(leadingNum != 0) //>=1 + { + lfBuf_cur.byte[3] = lfBuf_pre.byte[3]; + } + else //==0 + { + lfBuf_cur.byte[3] = q[0]; + q += 1; + } + + lfBuf_cur.ivalue = lfBuf_cur.ivalue << rightShiftBits; + newData[i] = lfBuf_cur.value + medianValue; + lfBuf_cur.ivalue = lfBuf_cur.ivalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + } + } + else //reqBytesLength == 4 + { + for(i=0;i < nbEle;i++) + { + lfBuf_cur.value = 0; + + j = (i >> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = (leadNumArray[j] >> (6 - k)) & 0x03; + + if(leadingNum == 1) + { + lfBuf_cur.byte[0] = q[0]; + lfBuf_cur.byte[1] = q[1]; + lfBuf_cur.byte[2] = q[2]; + lfBuf_cur.byte[3] = lfBuf_pre.byte[3]; + q += 3; + } + else if(leadingNum == 2) + { + lfBuf_cur.byte[0] = q[0]; + lfBuf_cur.byte[1] = q[1]; + lfBuf_cur.byte[2] = lfBuf_pre.byte[2]; + lfBuf_cur.byte[3] = lfBuf_pre.byte[3]; + q += 2; + } + else if(leadingNum == 3) + { + lfBuf_cur.byte[0] = q[0]; + lfBuf_cur.byte[1] = lfBuf_pre.byte[1]; + lfBuf_cur.byte[2] = lfBuf_pre.byte[2]; + lfBuf_cur.byte[3] = lfBuf_pre.byte[3]; + q += 1; + } + else //==0 + { + lfBuf_cur.byte[0] = q[0]; + lfBuf_cur.byte[1] = q[1]; + lfBuf_cur.byte[2] 
= q[2]; + lfBuf_cur.byte[3] = q[3]; + q += 4; + } + + lfBuf_cur.ivalue = lfBuf_cur.ivalue << rightShiftBits; + newData[i] = lfBuf_cur.value + medianValue; + lfBuf_cur.ivalue = lfBuf_cur.ivalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + } + } + } + else + { + + } + + cmpSize += (q - residualMidBytes); //add the number of residualMidBytes + return cmpSize; +} + + +void SZ_fast_decompress_args_unpredictable_blocked_float(float** newData, size_t nbEle, unsigned char* cmpBytes) +{ + *newData = (float*)malloc(sizeof(float)*nbEle); + + unsigned char* r = cmpBytes; + r += 4; + int blockSize = r[0]; //get block size + r++; + size_t nbConstantBlocks = bytesToLong_bigEndian(r); //get number of constant blocks + r += sizeof(size_t); + + size_t nbBlocks = nbEle/blockSize; + size_t remainCount = nbEle%blockSize; + size_t stateNBBytes = remainCount == 0 ? (nbBlocks%8==0?nbBlocks/8:nbBlocks/8+1) : ((nbBlocks+1)%8==0? (nbBlocks+1)/8:(nbBlocks+1)/8+1); + size_t actualNBBlocks = remainCount==0 ? nbBlocks : nbBlocks+1; + unsigned char* stateArray = (unsigned char*)malloc(actualNBBlocks); + float* constantMedianArray = (float*)malloc(nbConstantBlocks*sizeof(float)); + + convertByteArray2IntArray_fast_1b_args(actualNBBlocks, r, stateNBBytes, stateArray); //get the stateArray + + unsigned char* p = r + stateNBBytes; //p is the starting address of constant median values. + + size_t i = 0, j = 0, k = 0; //k is used to keep track of constant block index + for(i = 0;i < nbConstantBlocks;i++, j+=4) //get the median values for constant-value blocks + constantMedianArray[i] = bytesToFloat(p+j); + + unsigned char* q = p + sizeof(float)*nbConstantBlocks; //q is the starting address of the non-constant data blocks + float* op = *newData; + + for(i=0;i> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = (leadNumArray[j] >> (6 - k)) & 0x03; + + if (leadingNum == 1) { + lfBuf_cur.byte[3] = lfBuf_pre.byte[3]; + lfBuf_cur.byte[1] = q[0]; + lfBuf_cur.byte[2] = q[1]; + q += 2; + } else if (leadingNum == 2) { + lfBuf_cur.byte[2] = lfBuf_pre.byte[2]; + lfBuf_cur.byte[3] = lfBuf_pre.byte[3]; + lfBuf_cur.byte[1] = q[0]; + q += 1; + } else if (leadingNum == 3) { + lfBuf_cur.byte[1] = lfBuf_pre.byte[1]; + lfBuf_cur.byte[2] = lfBuf_pre.byte[2]; + lfBuf_cur.byte[3] = lfBuf_pre.byte[3]; + } else //==0 + { + lfBuf_cur.byte[1] = q[0]; + lfBuf_cur.byte[2] = q[1]; + lfBuf_cur.byte[3] = q[2]; + q += 3; + } + + lfBuf_cur.ivalue = lfBuf_cur.ivalue << rightShiftBits; + (*newData)[i] = lfBuf_cur.value + medianValue; + lfBuf_cur.ivalue = lfBuf_cur.ivalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + } + } else if (reqBytesLength == 2) { + for (i = 0; i < nbEle; i++) { + lfBuf_cur.value = 0; + + j = (i >> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = (leadNumArray[j] >> (6 - k)) & 0x03; + + if (leadingNum == 1) { + lfBuf_cur.byte[3] = lfBuf_pre.byte[3]; + lfBuf_cur.byte[2] = q[0]; + q += 1; + } else if (leadingNum >= 2) { + lfBuf_cur.byte[2] = lfBuf_pre.byte[2]; + lfBuf_cur.byte[3] = lfBuf_pre.byte[3]; + } else //==0 + { + lfBuf_cur.byte[2] = q[0]; + lfBuf_cur.byte[3] = q[1]; + q += 2; + } + + lfBuf_cur.ivalue = lfBuf_cur.ivalue << rightShiftBits; + (*newData)[i] = lfBuf_cur.value + medianValue; + lfBuf_cur.ivalue = lfBuf_cur.ivalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + + } + } else if (reqBytesLength == 1) { + for (i = 0; i < nbEle; i++) { + lfBuf_cur.value = 0; + + j = (i >> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = (leadNumArray[j] >> (6 - k)) & 0x03; + + if (leadingNum != 0) //>=1 + { + 
lfBuf_cur.byte[3] = lfBuf_pre.byte[3]; + } else //==0 + { + lfBuf_cur.byte[3] = q[0]; + q += 1; + } + + lfBuf_cur.ivalue = lfBuf_cur.ivalue << rightShiftBits; + (*newData)[i] = lfBuf_cur.value + medianValue; + lfBuf_cur.ivalue = lfBuf_cur.ivalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + } + } else { + for (i = 0; i < nbEle; i++) { + lfBuf_cur.value = 0; + + j = (i >> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = (leadNumArray[j] >> (6 - k)) & 0x03; + + if (leadingNum == 1) { + lfBuf_cur.byte[0] = q[0]; + lfBuf_cur.byte[1] = q[1]; + lfBuf_cur.byte[2] = q[2]; + lfBuf_cur.byte[3] = lfBuf_pre.byte[3]; + q += 3; + } else if (leadingNum == 2) { + lfBuf_cur.byte[0] = q[0]; + lfBuf_cur.byte[1] = q[1]; + lfBuf_cur.byte[2] = lfBuf_pre.byte[2]; + lfBuf_cur.byte[3] = lfBuf_pre.byte[3]; + q += 2; + } else if (leadingNum == 3) { + lfBuf_cur.byte[0] = q[0]; + lfBuf_cur.byte[1] = lfBuf_pre.byte[1]; + lfBuf_cur.byte[2] = lfBuf_pre.byte[2]; + lfBuf_cur.byte[3] = lfBuf_pre.byte[3]; + q += 1; + } else //==0 + { + lfBuf_cur.byte[0] = q[0]; + lfBuf_cur.byte[1] = q[1]; + lfBuf_cur.byte[2] = q[2]; + lfBuf_cur.byte[3] = q[3]; + q += 4; + } + + lfBuf_cur.ivalue = lfBuf_cur.ivalue << rightShiftBits; + (*newData)[i] = lfBuf_cur.value + medianValue; + lfBuf_cur.ivalue = lfBuf_cur.ivalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + } + } + } + + //sz_cost_end(); + //printf("totalCost = %f\n", sz_totalCost); + //free(leadNum); + +} diff --git a/qtensor/compression/szx/src/timingGPU.cu b/qtensor/compression/szx/src/timingGPU.cu new file mode 100644 index 00000000..dc390510 --- /dev/null +++ b/qtensor/compression/szx/src/timingGPU.cu @@ -0,0 +1,45 @@ +/**************/ +/* TIMING GPU */ +/**************/ + +#include "timingGPU.h" + +#include +#include + +struct PrivateTimingGPU { + cudaEvent_t start; + cudaEvent_t stop; +}; + +// default constructor +TimingGPU::TimingGPU() { privateTimingGPU = new PrivateTimingGPU; } + +// default destructor +TimingGPU::~TimingGPU() { } + +void TimingGPU::StartCounter() +{ + cudaEventCreate(&((*privateTimingGPU).start)); + cudaEventCreate(&((*privateTimingGPU).stop)); + cudaEventRecord((*privateTimingGPU).start,0); +} + +void TimingGPU::StartCounterFlags() +{ + int eventflags = cudaEventBlockingSync; + + cudaEventCreateWithFlags(&((*privateTimingGPU).start),eventflags); + cudaEventCreateWithFlags(&((*privateTimingGPU).stop),eventflags); + cudaEventRecord((*privateTimingGPU).start,0); +} + +// Gets the counter in ms +float TimingGPU::GetCounter() +{ + float time; + cudaEventRecord((*privateTimingGPU).stop, 0); + cudaEventSynchronize((*privateTimingGPU).stop); + cudaEventElapsedTime(&time,(*privateTimingGPU).start,(*privateTimingGPU).stop); + return time; +} From cb05f9f65e3be8184e273d74ab43f8de9dc63067 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Mon, 23 Jan 2023 00:00:58 -0600 Subject: [PATCH 009/126] first try: reverse order --- bench/qc_simulation/qtensor/run.py | 4 +- qtensor/contraction_backends/torch.py | 43 +++++++++++-- .../transposed_bucket_elimination.py | 60 +++++++++++++++++++ 3 files changed, 100 insertions(+), 7 deletions(-) create mode 100644 qtensor/merged_indices/transposed_bucket_elimination.py diff --git a/bench/qc_simulation/qtensor/run.py b/bench/qc_simulation/qtensor/run.py index 5a625dc2..116e4dbd 100644 --- a/bench/qc_simulation/qtensor/run.py +++ b/bench/qc_simulation/qtensor/run.py @@ -4,6 +4,7 @@ from test_circuits import gen_qaoa_maxcut_circuit import qtensor import qtree +from qtensor.merged_indices.transposed_bucket_elimination import 
bucket_elimination import numpy as np import pandas as pd import pyrofiler @@ -14,7 +15,7 @@ from qtensor.contraction_backends import get_backend, PerfBackend def bucket_contraction_report(tn, buckets, backend, - bucket_elimination=qtree.optimizer.bucket_elimination + bucket_elimination=bucket_elimination ): """ Returns: @@ -42,7 +43,6 @@ def get_buckets_tn(circ, backend, ordering_algo:str, batch_vars=0, seed=10): sim.prepare_buckets(circ, batch_vars=batch_vars) return sim.buckets, tn - ''' Function: Generate a collection of above report, and process them into final usable form I/O: ... -> processed data is a dict, directly usable by json diff --git a/qtensor/contraction_backends/torch.py b/qtensor/contraction_backends/torch.py index b3614fa3..8e37238d 100644 --- a/qtensor/contraction_backends/torch.py +++ b/qtensor/contraction_backends/torch.py @@ -17,6 +17,37 @@ def qtree2torch_tensor(tensor, data_dict): return tensor.copy(data=torch_t) +def get_einsum_expr(idx1, idx2): + """ + Takes two tuples of indices and returns an einsum expression + to evaluate the sum over repeating indices + + Parameters + ---------- + idx1 : list-like + indices of the first argument + idx2 : list-like + indices of the second argument + + Returns + ------- + expr : str + Einsum command to sum over indices repeating in idx1 + and idx2. + """ + result_indices = sorted(list(set(idx1 + idx2)), reverse=True) + # remap indices to reduce their order, as einsum does not like + # large numbers + idx_to_least_idx = {old_idx: new_idx for new_idx, old_idx + in enumerate(result_indices)} + + str1 = ''.join(qtree.utils.num_to_alpha(idx_to_least_idx[ii]) for ii in idx1) + str2 = ''.join(qtree.utils.num_to_alpha(idx_to_least_idx[ii]) for ii in idx2) + str3 = ''.join(qtree.utils.num_to_alpha(idx_to_least_idx[ii]) for ii in result_indices) + return str1 + ',' + str2 + '->' + str3 + + + class TorchBackend(ContractionBackend): def __init__(self, device='cpu'): self.device = device @@ -34,7 +65,7 @@ def process_bucket(self, bucket, no_sum=False): for tensor in bucket[1:]: - expr = qtree.utils.get_einsum_expr( + expr = get_einsum_expr( list(map(int, result_indices)), list(map(int, tensor.indices)) ) @@ -48,7 +79,8 @@ def process_bucket(self, bucket, no_sum=False): # Merge and sort indices and shapes result_indices = tuple(sorted( set(result_indices + tensor.indices), - key=int) + key=int, reverse=True + ) ) size = len(set(tensor.indices)) @@ -61,7 +93,8 @@ def process_bucket(self, bucket, no_sum=False): if len(result_indices) > 0: if not no_sum: # trim first index - first_index, *result_indices = result_indices + first_index = result_indices[-1] + result_indices = result_indices[:-1] else: first_index, *_ = result_indices tag = first_index.identity @@ -75,7 +108,7 @@ def process_bucket(self, bucket, no_sum=False): data=result_data) else: result = qtree.optimizer.Tensor(f'E{tag}', result_indices, - data=torch.sum(result_data, axis=0)) + data=torch.sum(result_data, axis=-1)) #print("summary:",sorted(self.exprs.items(), key=lambda x: x[1], reverse=True)) #print("stats:",self.width_bc) @@ -133,7 +166,7 @@ def get_sliced_buckets(self, buckets, data_dict, slice_dict): for tensor in bucket: # get data # sort tensor dimensions - transpose_order = np.argsort(list(map(int, tensor.indices))) + transpose_order = np.argsort(list(map(int, tensor.indices)))[::-1] data = data_dict[tensor.data_key] if not isinstance(data, torch.Tensor): if self.device == 'gpu' and torch.cuda.is_available(): diff --git 
a/qtensor/merged_indices/transposed_bucket_elimination.py b/qtensor/merged_indices/transposed_bucket_elimination.py new file mode 100644 index 00000000..41246fc9 --- /dev/null +++ b/qtensor/merged_indices/transposed_bucket_elimination.py @@ -0,0 +1,60 @@ +import itertools + +def bucket_elimination(buckets, process_bucket_fn, + n_var_nosum=0): + """ + Algorithm to evaluate a contraction of a large number of tensors. + The variables to contract over are assigned ``buckets`` which + hold tensors having respective variables. The algorithm + proceeds through contracting one variable at a time, thus we eliminate + buckets one by one. + + Parameters + ---------- + buckets : list of lists + process_bucket_fn : function + function that will process this kind of buckets + n_var_nosum : int, optional + number of variables that have to be left in the + result. Expected at the end of bucket list + Returns + ------- + result : numpy.array + """ + # import pdb + # pdb.set_trace() + n_var_contract = len(buckets) - n_var_nosum + + result = None + for n in range(n_var_contract): + bucket = buckets[n] + if len(bucket) > 0: + tensor = process_bucket_fn(bucket) + #-- Memory management + buckets[n] = [] + #-- + + if len(tensor.indices) > 0: + # tensor is not scalar. + # Move it to appropriate bucket + first_index = int(tensor.indices[-1]) + buckets[first_index].append(tensor) + else: # tensor is scalar + if result is not None: + result *= tensor + else: + result = tensor + # free up space, the tensors are no longer needed + buckets[n] = [] + + # form a single list of the rest if any + rest = list(itertools.chain.from_iterable(buckets[n_var_contract:])) + if len(rest) > 0: + # only multiply tensors + tensor = process_bucket_fn(rest, no_sum=True) + if result is not None: + result *= tensor + else: + result = tensor + return result + From 5482c68ee25f3a6c511da4fef9a22de59d506eba Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Wed, 25 Jan 2023 11:55:16 -0600 Subject: [PATCH 010/126] merge summation into last einsum in torch backend --- qtensor/contraction_backends/torch.py | 40 +++++++++++++++++---------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/qtensor/contraction_backends/torch.py b/qtensor/contraction_backends/torch.py index 8e37238d..05f72410 100644 --- a/qtensor/contraction_backends/torch.py +++ b/qtensor/contraction_backends/torch.py @@ -3,7 +3,7 @@ import numpy as np from qtree import np_framework from qtensor.contraction_backends import ContractionBackend -from qtensor.contraction_backends.numpy import get_einsum_expr + def qtree2torch_tensor(tensor, data_dict): """ Converts qtree tensor to pytorch tensor using data dict""" if isinstance(tensor.data, torch.Tensor): @@ -17,7 +17,7 @@ def qtree2torch_tensor(tensor, data_dict): return tensor.copy(data=torch_t) -def get_einsum_expr(idx1, idx2): +def get_einsum_expr(idx1, idx2, contract=0): """ Takes two tuples of indices and returns an einsum expression to evaluate the sum over repeating indices @@ -40,6 +40,7 @@ def get_einsum_expr(idx1, idx2): # large numbers idx_to_least_idx = {old_idx: new_idx for new_idx, old_idx in enumerate(result_indices)} + result_indices = result_indices[:len(result_indices)-contract] str1 = ''.join(qtree.utils.num_to_alpha(idx_to_least_idx[ii]) for ii in idx1) str2 = ''.join(qtree.utils.num_to_alpha(idx_to_least_idx[ii]) for ii in idx2) @@ -63,7 +64,7 @@ def process_bucket(self, bucket, no_sum=False): width = len(set(bucket[0].indices)) #print("w:",width) - for tensor in bucket[1:]: + for tensor in bucket[1:-1]: 
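For context on the reverse-index-order convention used here: indices are kept sorted in decreasing order, so the variable being eliminated is always the last axis of a tensor; `bucket_elimination` routes partial results via `tensor.indices[-1]` and the backend sums over `axis=-1`. Below is a minimal driver sketch of the new `transposed_bucket_elimination.bucket_elimination`, using hypothetical two-index tensors and a toy `process_bucket_fn`; real usage goes through the simulator and a contraction backend.

```python
# Sketch only: contract a two-tensor network with the reverse-order
# bucket elimination added in this patch series. The tensors and the
# process function below are illustrative, not part of the codebase.
import numpy as np
from qtree.optimizer import Tensor, Var
from qtensor.merged_indices.transposed_bucket_elimination import bucket_elimination

a, b = Var(0, size=2), Var(1, size=2)
# decreasing index order: Var(0), eliminated first, sits last
t1 = Tensor('T1', (b, a), data=np.random.rand(2, 2))
t2 = Tensor('T2', (b, a), data=np.random.rand(2, 2))
buckets = [[t1, t2], []]          # bucket i holds tensors containing Var(i)

def process_bucket(bucket, no_sum=False):
    # multiply element-wise (same index order assumed) and, unless no_sum,
    # sum over the last axis -- the variable assigned to this bucket
    data = bucket[0].data
    for t in bucket[1:]:
        data = data * t.data
    if no_sum:
        return Tensor('E', bucket[0].indices, data=data)
    return Tensor('E', bucket[0].indices[:-1], data=data.sum(axis=-1))

result = bucket_elimination(buckets, process_bucket)   # Tensor with no indices left
```

The `get_einsum_expr` helper above follows the same convention: assuming `num_to_alpha` maps 0, 1, 2 to a, b, c, index tuples `(7, 3)` and `(7, 1)` produce `'ab,ac->abc'`, and with `contract=1` the smallest index is dropped from the output, giving `'ab,ac->ab'`, so the trailing sum is merged into the einsum call.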
expr = get_einsum_expr( list(map(int, result_indices)), list(map(int, tensor.indices)) @@ -91,24 +92,33 @@ def process_bucket(self, bucket, no_sum=False): self.width_bc[width][0] = len(self.width_dict[width]) self.width_bc[width][1] += 1 + if len(bucket)>1: + tensor = bucket[-1] + expr = get_einsum_expr( + list(map(int, result_indices)), list(map(int, tensor.indices)) + , contract = 1 + ) + result_data = torch.einsum(expr, result_data, tensor.data) + result_indices = tuple(sorted( + set(result_indices + tensor.indices), + key=int, reverse=True + )) + else: + result_data = result_data.sum(axis=-1) + + + if len(result_indices) > 0: - if not no_sum: # trim first index - first_index = result_indices[-1] - result_indices = result_indices[:-1] - else: - first_index, *_ = result_indices + first_index = result_indices[-1] + result_indices = result_indices[:-1] tag = first_index.identity else: tag = 'f' result_indices = [] # reduce - if no_sum: - result = qtree.optimizer.Tensor(f'E{tag}', result_indices, - data=result_data) - else: - result = qtree.optimizer.Tensor(f'E{tag}', result_indices, - data=torch.sum(result_data, axis=-1)) + result = qtree.optimizer.Tensor(f'E{tag}', result_indices, + data=result_data) #print("summary:",sorted(self.exprs.items(), key=lambda x: x[1], reverse=True)) #print("stats:",self.width_bc) @@ -173,7 +183,7 @@ def get_sliced_buckets(self, buckets, data_dict, slice_dict): cuda = torch.device('cuda') data = torch.from_numpy(data).to(cuda) else: - data = torch.from_numpy(data) + data = torch.from_numpy(data.astype(np.complex128)) data = data.permute(tuple(transpose_order)) # transpose indices From 3ad55ac2f17c338ca23b3976de4d777bcf26518e Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Wed, 25 Jan 2023 12:20:10 -0600 Subject: [PATCH 011/126] update submodule --- qtree | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qtree b/qtree index bfe253df..ccbb4093 160000 --- a/qtree +++ b/qtree @@ -1 +1 @@ -Subproject commit bfe253df1cbaae6af0a5fd5198f237f3654819d6 +Subproject commit ccbb4093360da843bcb8282941aa22154b85e2af From 405d5ab8a78ce3ff59fd8d96e86e3ac5251c8015 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Wed, 25 Jan 2023 12:27:36 -0600 Subject: [PATCH 012/126] fix dtype for torch_gpu --- qtensor/contraction_backends/torch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qtensor/contraction_backends/torch.py b/qtensor/contraction_backends/torch.py index 05f72410..db0c88a5 100644 --- a/qtensor/contraction_backends/torch.py +++ b/qtensor/contraction_backends/torch.py @@ -181,7 +181,7 @@ def get_sliced_buckets(self, buckets, data_dict, slice_dict): if not isinstance(data, torch.Tensor): if self.device == 'gpu' and torch.cuda.is_available(): cuda = torch.device('cuda') - data = torch.from_numpy(data).to(cuda) + data = torch.from_numpy(data.astype(np.complex128)).to(cuda) else: data = torch.from_numpy(data.astype(np.complex128)) From 5c90eb68838425406f6d183e19ec99eb6e4f03d1 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Thu, 26 Jan 2023 21:12:56 -0600 Subject: [PATCH 013/126] Add ordering algo as parameter to run.py. 
Fix merged backend in torch --- bench/qc_simulation/qtensor/run.py | 17 +++++++----- qtensor/contraction_backends/torch.py | 37 +++++++++++---------------- 2 files changed, 25 insertions(+), 29 deletions(-) diff --git a/bench/qc_simulation/qtensor/run.py b/bench/qc_simulation/qtensor/run.py index 116e4dbd..9c7ae7e0 100644 --- a/bench/qc_simulation/qtensor/run.py +++ b/bench/qc_simulation/qtensor/run.py @@ -30,7 +30,8 @@ def bucket_contraction_report(tn, buckets, backend, buckets, perf_backend.process_bucket, n_var_nosum=len(tn.free_vars) ) - perf_backend.get_result_data(result).flatten() + result_data = perf_backend.get_result_data(result).flatten() + print("Result data:", result_data) # compute report_table rep_txt = perf_backend.gen_report(show=False) return perf_backend.report_table @@ -47,10 +48,10 @@ def get_buckets_tn(circ, backend, ordering_algo:str, batch_vars=0, seed=10): Function: Generate a collection of above report, and process them into final usable form I/O: ... -> processed data is a dict, directly usable by json ''' -def collect_process_be_pt_report(repeat: int, backend, circ): +def collect_process_be_pt_report(repeat: int, backend, circ, ordering_algo='greedy'): timing = pyrofiler.timing with timing(callback=lambda x: None) as gen: - buckets, tn = get_buckets_tn(circ, backend, 'rgreedy_0.02_10', batch_vars=0) + buckets, tn = get_buckets_tn(circ, backend, ordering_algo, batch_vars=0) tables = [] wall_start = time.time() @@ -71,8 +72,10 @@ def mean_mmax(x: list): return np.mean(x) def main(): - Ns = [24, 26, 28, 30] - p = 3 + Ns = [24] + p = 15 + ordering_algo = 'greedy' + repeats = 2 top_K = 15 backend_name = 'torch_cpu' print("backend: ", backend_name) @@ -80,7 +83,7 @@ def main(): print(f"N={N}") backend = get_backend(backend_name) circ = gen_qaoa_maxcut_circuit(N, p) - report = collect_process_be_pt_report(9, backend, circ) + report = collect_process_be_pt_report(repeats, backend, circ, ordering_algo=ordering_algo) stats = report[["time"]].groupby('step').agg(['mean', 'min', 'max', 'std']) stats = pd.concat([ @@ -101,7 +104,7 @@ def main(): report[["time"]].groupby('step').agg('mean'), report[["flop","FLOPS", 'result_size', 'bucket_len']].groupby('step').first() ], axis=1) - print(stats[['time', 'result_size', 'FLOPS']].groupby('result_size').agg(['mean', 'sum'])) + print(stats[['time', 'result_size', 'FLOPS']].groupby('result_size').agg(['mean', 'sum', 'count'])) print("Total time:") print(stats['time'].sum()) diff --git a/qtensor/contraction_backends/torch.py b/qtensor/contraction_backends/torch.py index db0c88a5..0636f096 100644 --- a/qtensor/contraction_backends/torch.py +++ b/qtensor/contraction_backends/torch.py @@ -3,6 +3,8 @@ import numpy as np from qtree import np_framework from qtensor.contraction_backends import ContractionBackend +import string +CHARS = string.ascii_lowercase + string.ascii_uppercase def qtree2torch_tensor(tensor, data_dict): """ Converts qtree tensor to pytorch tensor using data dict""" @@ -16,6 +18,18 @@ def qtree2torch_tensor(tensor, data_dict): data_dict[tensor.data_key] = torch_t return tensor.copy(data=torch_t) +def get_einsum_expr_bucket(bucket, all_indices_list, result_indices): + # converting elements to int will make stuff faster, + # but will drop support for char indices + # all_indices_list = [int(x) for x in all_indices] + # to_small_int = lambda x: all_indices_list.index(int(x)) + to_small_int = lambda x: all_indices_list.index(x) + expr = ','.join( + ''.join(CHARS[to_small_int(i)] for i in t.indices) + for t in bucket) +\ + 
'->'+''.join(CHARS[to_small_int(i)] for i in result_indices) + return expr + def get_einsum_expr(idx1, idx2, contract=0): """ @@ -55,14 +69,11 @@ def __init__(self, device='cpu'): self.dtype = ['float', 'double', 'complex64', 'complex128'] self.width_dict = [set() for i in range(30)] self.width_bc = [[0,0] for i in range(30)] #(#distinct_bc, #bc) - self.exprs = {} - def process_bucket(self, bucket, no_sum=False): result_indices = bucket[0].indices result_data = bucket[0].data width = len(set(bucket[0].indices)) - #print("w:",width) for tensor in bucket[1:-1]: @@ -70,12 +81,6 @@ def process_bucket(self, bucket, no_sum=False): list(map(int, result_indices)), list(map(int, tensor.indices)) ) - if expr not in self.exprs.keys(): - self.exprs[expr] = 1 - else: - self.exprs[expr] += 1 - - result_data = torch.einsum(expr, result_data, tensor.data) # Merge and sort indices and shapes result_indices = tuple(sorted( @@ -98,7 +103,6 @@ def process_bucket(self, bucket, no_sum=False): list(map(int, result_indices)), list(map(int, tensor.indices)) , contract = 1 ) - result_data = torch.einsum(expr, result_data, tensor.data) result_indices = tuple(sorted( set(result_indices + tensor.indices), key=int, reverse=True @@ -119,9 +123,6 @@ def process_bucket(self, bucket, no_sum=False): # reduce result = qtree.optimizer.Tensor(f'E{tag}', result_indices, data=result_data) - - #print("summary:",sorted(self.exprs.items(), key=lambda x: x[1], reverse=True)) - #print("stats:",self.width_bc) return result def process_bucket_merged(self, ixs, bucket, no_sum=False): @@ -146,13 +147,7 @@ def process_bucket_merged(self, ixs, bucket, no_sum=False): for i in range(len(tensors)): tensors[i] = tensors[i].type(torch.complex128) - expr = get_einsum_expr(bucket, all_indices_list, result_indices) - # print("expr:", expr) - if expr not in self.exprs.keys(): - self.exprs[expr] = 1 - else: - self.exprs[expr] += 1 - + expr = get_einsum_expr_bucket(bucket, all_indices_list, result_indices) expect = len(result_indices) result_data = torch.einsum(expr, *tensors) @@ -165,8 +160,6 @@ def process_bucket_merged(self, ixs, bucket, no_sum=False): result = qtree.optimizer.Tensor(f'E{tag}', result_indices, data=result_data) - # print("summary:",sorted(self.exprs.items(), key=lambda x: x[1], reverse=True)) - # print("# distinct buckets:", len(self.exprs)) return result def get_sliced_buckets(self, buckets, data_dict, slice_dict): From 1298fdad1aece4dc5bea3004706e17ac91b1358f Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Fri, 27 Jan 2023 19:20:50 -0600 Subject: [PATCH 014/126] update compression cost test --- qtensor/compression/test_cost_estimation.py | 29 ++++++++++++--------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/qtensor/compression/test_cost_estimation.py b/qtensor/compression/test_cost_estimation.py index 33b4db33..35c65801 100644 --- a/qtensor/compression/test_cost_estimation.py +++ b/qtensor/compression/test_cost_estimation.py @@ -4,31 +4,32 @@ from qtensor.tests import get_test_problem from qtensor.optimisation import QtreeTensorNet from qtensor import QtreeQAOAComposer +from qtensor.optimisation.Optimizer import TreeTrimSplitter def costs_to_csv(costs): first_line = "flops, memory, width, compressions, decompressions, time" lines = [first_line] for i, c in enumerate(costs): - time = c.time(1e6, 1e5, 1e5, 13) + time = c.time(1e11/16, 200e9/16, 200e9/15, 13) lines.append(f"[{i}]\t{c.flops},\t{round(c.memory)},\t{c.width},\t {c.compressions},\t{c.decompressions},\t{time}") return "\n".join(lines) def 
test_compressed_contraction_cost(): - G, gamma, beta = get_test_problem(n=20, p=4, d=4) + G, gamma, beta = get_test_problem(n=32, p=15, d=4) + opt = qtensor.toolbox.get_ordering_algo('naive') composer = QtreeQAOAComposer( graph=G, gamma=gamma, beta=beta) composer.ansatz_state() tn = QtreeTensorNet.from_qtree_gates(composer.circuit) - max_time = 15 - opt = qtensor.toolbox.get_ordering_algo('greedy') + #max_time = 15 peo, t = opt.optimize(tn) print(f"Contraction width: {opt.treewidth}") - M_limit = opt.treewidth - 6 + M_limit = opt.treewidth-6 # -- Estimate compressed contraction - costs = compressed_contraction_cost(tn, peo, mem_limit=M_limit) + costs = compressed_contraction_cost(tn, peo, mem_limit=M_limit, compression_ratio=64) cost = sum(costs[2:], costs[0]) print(costs_to_csv(costs)) # -- Estimate regular contraction @@ -39,7 +40,8 @@ def test_compressed_contraction_cost(): nodes, path = qtensor.utils.get_neighbors_path(tn.get_line_graph(), peo) print("Path\n", path) # -- Estimate sliced contraction - opt_par = qtensor.optimisation.SlicesOptimizer(max_tw=M_limit+1, max_slice=5) + opt_par = qtensor.optimisation.SlicesOptimizer(base_ordering=opt, max_tw=M_limit+1, max_slice=2+opt.treewidth-M_limit) + #opt_par = TreeTrimSplitter(base_ordering=opt, max_tw=M_limit+1, max_slice=5+opt.treewidth-M_limit) peo, par_vars, tn = opt_par.optimize(tn) print("Par vars", par_vars) tn.slice({i: slice(0, 1) for i in par_vars}) @@ -50,14 +52,15 @@ def test_compressed_contraction_cost(): # print flops and memory from sliced simulation cost flops_run = cost_sliced.flops mem_run = cost_sliced.memory - print(cost) - print(cost_sliced) - FLOP_perS = 1e9 - Throughput = 1e11 + print("M limit", M_limit) + print("Cost", cost) + print("Cost sliced", cost_sliced) + FLOP_perS = 1e12 + Throughput = 200e9/16 print(f'Contraction cost (sliced): {np.log2(flops_run*runs_count*1.)} flops, {np.log2(mem_run*1.)} memory, {cost_sliced.width} width') - print(f'Contraction cost (old): {np.log2(sum(flops_lg))} flops, {np.log2(max(mems_lg))} memory') + print(f'Contraction cost (old): {np.log2(sum(flops_lg)*1.)} flops, {np.log2(max(mems_lg))} memory') mems_lg, flops_lg = tn.simulation_cost(peo) - print(f'Sliced contraction cost (old): {np.log2(sum(flops_lg)*runs_count)} flops, {np.log2(max(mems_lg))} memory') + print(f'Sliced contraction cost (old): {np.log2(sum(flops_lg)*1.0*runs_count)} flops, {np.log2(max(mems_lg)*1.0)} memory') print(f'-- Compressed Contraction time estimate: {cost.time(FLOP_perS, Throughput, Throughput, M_limit)} seconds') print(f'-- Sliced contraction time estimate: {runs_count*cost_sliced.time(FLOP_perS, Throughput, Throughput, M_limit)} seconds') From e9cf7997a984f3f3e420094ef7d9167d459bfc44 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Fri, 27 Jan 2023 19:31:09 -0600 Subject: [PATCH 015/126] fix bug in profiler summary --- .../contraction_backends/performance_measurement_decorator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qtensor/contraction_backends/performance_measurement_decorator.py b/qtensor/contraction_backends/performance_measurement_decorator.py index eb676cb9..b365b969 100644 --- a/qtensor/contraction_backends/performance_measurement_decorator.py +++ b/qtensor/contraction_backends/performance_measurement_decorator.py @@ -39,7 +39,7 @@ def from_bucket_time(cls, bucket: list, time: float): def indices_info(self): """ String representation of bucket data""" info = "" - all_indices = sorted(sum(map(list, self.indices), []), key=int) + all_indices = 
sorted(list(set(sum(map(list, self.indices), []))), key=int) ix_to_char = {i:string.ascii_letters[j] for j, i in enumerate(all_indices)} for ix, strides in zip(self.indices, self.strides): tensor_info = "" From cdd724bd1e958113a908d9362e62e7b4b9284090 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Sat, 28 Jan 2023 00:15:09 -0600 Subject: [PATCH 016/126] fix torch backend bug --- qtensor/contraction_backends/torch.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/qtensor/contraction_backends/torch.py b/qtensor/contraction_backends/torch.py index 0636f096..009d7689 100644 --- a/qtensor/contraction_backends/torch.py +++ b/qtensor/contraction_backends/torch.py @@ -71,6 +71,7 @@ def __init__(self, device='cpu'): self.width_bc = [[0,0] for i in range(30)] #(#distinct_bc, #bc) def process_bucket(self, bucket, no_sum=False): + bucket.sort(key = lambda x: len(x.indices)) result_indices = bucket[0].indices result_data = bucket[0].data width = len(set(bucket[0].indices)) @@ -81,6 +82,7 @@ def process_bucket(self, bucket, no_sum=False): list(map(int, result_indices)), list(map(int, tensor.indices)) ) + result_data = torch.einsum(expr, result_data, tensor.data) # Merge and sort indices and shapes result_indices = tuple(sorted( @@ -103,6 +105,7 @@ def process_bucket(self, bucket, no_sum=False): list(map(int, result_indices)), list(map(int, tensor.indices)) , contract = 1 ) + result_data = torch.einsum(expr, result_data, tensor.data) result_indices = tuple(sorted( set(result_indices + tensor.indices), key=int, reverse=True From 737beaba26d6ff79333a34a8b9b1549bf16c89e2 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Thu, 23 Feb 2023 13:17:48 -0600 Subject: [PATCH 017/126] add minimal CUSZCompressor --- qtensor/compression/CompressedTensor.py | 39 +- qtensor/compression/szx/src/README_python.md | 4 +- qtensor/compression/szx/src/cuszx_wrapper.py | 2 +- qtensor/compression/szx/src/sz_p_q.c | 367 ------------------ qtensor/compression/test_compressed_tensor.py | 24 +- 5 files changed, 61 insertions(+), 375 deletions(-) delete mode 100644 qtensor/compression/szx/src/sz_p_q.c diff --git a/qtensor/compression/CompressedTensor.py b/qtensor/compression/CompressedTensor.py index 1d1ed7b3..ffba51a0 100644 --- a/qtensor/compression/CompressedTensor.py +++ b/qtensor/compression/CompressedTensor.py @@ -1,10 +1,12 @@ import itertools import numpy as np -import qtree import io -from qtree.optimizer import Tensor, Var +from qtree.optimizer import Tensor +from qtree.system_defs import NP_ARRAY_TYPE +import sys +sys.path.append("./szx/src") -from szx.src.cuszx_wrapper import cuszx_host_compress, cuszx_host_decompress, cuszx_device_compress, cuszx_device_decompress +from cuszx_wrapper import cuszx_host_compress, cuszx_host_decompress, cuszx_device_compress, cuszx_device_decompress CUSZX_BLOCKSIZE = 256 @@ -26,6 +28,27 @@ def decompress(self, ptr): print(f"Loading arr.") return np.load(ptr)['arr_0'] +class CUSZCompressor(): + def compress(self, data): + import cupy + if isinstance(data, cupy.ndarray): + isCuPy = True + else: + isCuPy = False + num_elements = data.size + r2r_error = 0.01 + r2r_threshold = 0.01 + cmp_bytes, outSize_ptr = self.cuszx_compress(isCuPy, data.flatten(), num_elements, r2r_error, r2r_threshold) + return (cmp_bytes, num_elements, isCuPy, data.shape) + + def decompress(self, obj): + import cupy + cmp_bytes, num_elements, isCuPy, shape = obj + decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, num_elements) + mem = cupy.cuda.UnownedMemory(decompressed_ptr, num_elements*8, self, device_id=0) 
+ mem_ptr = cupy.cuda.memory.MemoryPointer(mem, 0) + arr = cupy.ndarray(shape, dtype=np.float64, memptr=mem_ptr) + return arr ### Compression API with cuSZx ### # Parameters: @@ -100,6 +123,14 @@ def __init__(self, name, indices, else: self._dtype = None + @classmethod + def empty(cls, name, indices, slice_indices=[], compressor=Compressor(), dtype:type=NP_ARRAY_TYPE): + t = super().empty(name, indices, dtype) + t.compressor = compressor + if slice_indices: + t.compress_indices(slice_indices) + return t + def compress_indices(self, indices: list): """ Slice the self.data along dimensions in `indices`, @@ -143,7 +174,7 @@ def get_chunk(self, ivals): ptr = self._data[flat_ix] return self.compressor.decompress(ptr) - def set_chunk(self, ivals, chunk:np.array): + def set_chunk(self, ivals, chunk: np.ndarray): # -- Check for consistent data types between chunks if self._dtype is None: self._dtype = chunk.dtype diff --git a/qtensor/compression/szx/src/README_python.md b/qtensor/compression/szx/src/README_python.md index e71bf518..0754950d 100644 --- a/qtensor/compression/szx/src/README_python.md +++ b/qtensor/compression/szx/src/README_python.md @@ -5,7 +5,7 @@ 2. Change directory to "SZx/szx/src/" 3. Run the following NVCC command: -nvcc --shared --compiler-options '-fPIC' -I ../include/ -I $CUDA_SAMPLES_PATH -o cuszx_wrapper.so *.cu *.c +nvcc --shared --compiler-options '-fPIC' -I ../include/ -I $CUDA_SAMPLES_PATH -o libcuszx_wrapper.so *.cu *.c - $CUDA_SAMPLES_PATH should be the path to the include/ directory of CUDA's samples @@ -27,4 +27,4 @@ nvcc --shared --compiler-options '-fPIC' -I ../include/ -I $CUDA_SAMPLES_PATH -o - nbEle: Integer, number of data elements - cmpBytes: GPU device pointer to compressed bytes - Returns: - - newData: GPU float pointer (CTypes) to decompressed data \ No newline at end of file + - newData: GPU float pointer (CTypes) to decompressed data diff --git a/qtensor/compression/szx/src/cuszx_wrapper.py b/qtensor/compression/szx/src/cuszx_wrapper.py index 15227432..dae52eac 100644 --- a/qtensor/compression/szx/src/cuszx_wrapper.py +++ b/qtensor/compression/szx/src/cuszx_wrapper.py @@ -4,7 +4,7 @@ import random import cupy as cp -LIB_PATH = './cuszx_wrapper.so' +LIB_PATH = 'szx/src/libcuszx_wrapper.so' # unsigned char* cuSZx_integrated_compress(float *data, float r2r_threshold, float r2r_err, size_t nbEle, int blockSize, size_t *outSize) diff --git a/qtensor/compression/szx/src/sz_p_q.c b/qtensor/compression/szx/src/sz_p_q.c deleted file mode 100644 index d6cb6017..00000000 --- a/qtensor/compression/szx/src/sz_p_q.c +++ /dev/null @@ -1,367 +0,0 @@ -#include -#include -#include -#include -void updateLossyCompElement_Double(unsigned char* curBytes, unsigned char* preBytes, - int reqBytesLength, int resiBitsLength, LossyCompressionElement *lce) -{ - int resiIndex, intMidBytes_Length = 0; - int leadingNum = compIdenticalLeadingBytesCount_double(preBytes, curBytes); //in fact, float is enough for both single-precision and double-precisiond ata. - int fromByteIndex = leadingNum; - int toByteIndex = reqBytesLength; //later on: should use "< toByteIndex" to tarverse.... 
- if(fromByteIndex < toByteIndex) - { - intMidBytes_Length = reqBytesLength - leadingNum; - memcpy(lce->integerMidBytes, &(curBytes[fromByteIndex]), intMidBytes_Length); - } - int resiBits = 0; - if(resiBitsLength!=0) - { - resiIndex = reqBytesLength; - if(resiIndex < 8) - resiBits = (curBytes[resiIndex] & 0xFF) >> (8-resiBitsLength); - } - lce->leadingZeroBytes = leadingNum; - lce->integerMidBytes_Length = intMidBytes_Length; - lce->resMidBitsLength = resiBitsLength; - lce->residualMidBits = resiBits; -} - -inline void longToBytes_bigEndian(unsigned char *b, unsigned long num) -{ - b[0] = (unsigned char)(num>>56); - b[1] = (unsigned char)(num>>48); - b[2] = (unsigned char)(num>>40); - b[3] = (unsigned char)(num>>32); - b[4] = (unsigned char)(num>>24); - b[5] = (unsigned char)(num>>16); - b[6] = (unsigned char)(num>>8); - b[7] = (unsigned char)(num); -// if(dataEndianType==LITTLE_ENDIAN_DATA) -// symTransform_8bytes(*b); -} - -void compressSingleDoubleValue(DoubleValueCompressElement *vce, double tgtValue, double precision, double medianValue, - int reqLength, int reqBytesLength, int resiBitsLength) -{ - double normValue = tgtValue - medianValue; - - ldouble lfBuf; - lfBuf.value = normValue; - - int ignBytesLength = 64 - reqLength; - if(ignBytesLength<0) - ignBytesLength = 0; - - long tmp_long = lfBuf.lvalue; - longToBytes_bigEndian(vce->curBytes, tmp_long); - - lfBuf.lvalue = (lfBuf.lvalue >> ignBytesLength)<data = lfBuf.value+medianValue; - vce->curValue = tmp_long; - vce->reqBytesLength = reqBytesLength; - vce->resiBitsLength = resiBitsLength; -} - -inline void intToBytes_bigEndian(unsigned char *b, unsigned int num) -{ - b[0] = (unsigned char)(num >> 24); - b[1] = (unsigned char)(num >> 16); - b[2] = (unsigned char)(num >> 8); - b[3] = (unsigned char)(num); - - //note: num >> xxx already considered endian_type... 
-//if(dataEndianType==LITTLE_ENDIAN_DATA) -// symTransform_4bytes(*b); //change to BIG_ENDIAN_DATA -} - -inline short computeReqLength_double_MSST19(double realPrecision) -{ - short reqExpo = getPrecisionReqLength_double(realPrecision); - return 12-reqExpo; -} - - -unsigned int optimize_intervals_double_1D_opt_MSST19(double *oriData, size_t dataLength, double realPrecision) -{ - size_t i = 0, radiusIndex; - double pred_value = 0; - double pred_err; - size_t *intervals = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t)); - memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(size_t)); - size_t totalSampleSize = 0;//dataLength/confparams_cpr->sampleDistance; - - double * data_pos = oriData + 2; - double divider = log2(1+realPrecision)*2; - int tempIndex = 0; - while(data_pos - oriData < dataLength){ - if(*data_pos == 0){ - data_pos += confparams_cpr->sampleDistance; - continue; - } - tempIndex++; - totalSampleSize++; - pred_value = data_pos[-1]; - pred_err = fabs((double)*data_pos / pred_value); - radiusIndex = (unsigned long)fabs(log2(pred_err)/divider+0.5); - if(radiusIndex>=confparams_cpr->maxRangeRadius) - radiusIndex = confparams_cpr->maxRangeRadius - 1; - intervals[radiusIndex]++; - - data_pos += confparams_cpr->sampleDistance; - } - //compute the appropriate number - size_t targetCount = totalSampleSize*confparams_cpr->predThreshold; - size_t sum = 0; - for(i=0;imaxRangeRadius;i++) - { - sum += intervals[i]; - if(sum>targetCount) - break; - } - if(i>=confparams_cpr->maxRangeRadius) - i = confparams_cpr->maxRangeRadius-1; - - unsigned int accIntervals = 2*(i+1); - unsigned int powerOf2 = roundUpToPowerOf2(accIntervals); - - if(powerOf2<64) - powerOf2 = 64; - - free(intervals); - return powerOf2; -} - - -TightDataPointStorageD* SZ_compress_double_1D_MDQ_MSST19(double *oriData, -size_t dataLength, double realPrecision, double valueRangeSize, double medianValue_f) -{ -#ifdef HAVE_TIMECMPR - double* decData = NULL; - if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION) - decData = (double*)(multisteps->hist_data); -#endif - - //struct ClockPoint clockPointBuild; - //TimeDurationStart("build", &clockPointBuild); - unsigned int quantization_intervals; - if(exe_params->optQuantMode==1) - quantization_intervals = optimize_intervals_double_1D_opt_MSST19(oriData, dataLength, realPrecision); - else - quantization_intervals = exe_params->intvCapacity; - //updateQuantizationInfo(quantization_intervals); - int intvRadius = quantization_intervals/2; - - double* precisionTable = (double*)malloc(sizeof(double) * quantization_intervals); - double inv = 2.0-pow(2, -(confparams_cpr->plus_bits)); - for(int i=0; iplus_bits); - - size_t i; - int reqLength; - double medianValue = medianValue_f; - //double medianInverse = 1 / medianValue_f; - //short radExpo = getExponent_double(realPrecision); - - reqLength = computeReqLength_double_MSST19(realPrecision); - - int* type = (int*) malloc(dataLength*sizeof(int)); - - double* spaceFillingValue = oriData; // - - DynamicIntArray *exactLeadNumArray; - new_DIA(&exactLeadNumArray, dataLength/2/8); - - DynamicByteArray *exactMidByteArray; - new_DBA(&exactMidByteArray, dataLength/2); - - DynamicIntArray *resiBitArray; - new_DIA(&resiBitArray, DynArrayInitLen); - - unsigned char preDataBytes[8]; - intToBytes_bigEndian(preDataBytes, 0); - - int reqBytesLength = reqLength/8; - int resiBitsLength = reqLength%8; - double last3CmprsData[3] = {0}; - - //size_t miss=0, hit=0; - - DoubleValueCompressElement *vce = 
(DoubleValueCompressElement*)malloc(sizeof(DoubleValueCompressElement)); - LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement)); - - //add the first data - type[0] = 0; - compressSingleDoubleValue_MSST19(vce, spaceFillingValue[0], realPrecision, reqLength, reqBytesLength, resiBitsLength); - updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce); - memcpy(preDataBytes,vce->curBytes,8); - addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce); - listAdd_double(last3CmprsData, vce->data); - //miss++; -#ifdef HAVE_TIMECMPR - if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION) - decData[0] = vce->data; -#endif - - //add the second data - type[1] = 0; - compressSingleDoubleValue_MSST19(vce, spaceFillingValue[1], realPrecision, reqLength, reqBytesLength, resiBitsLength); - updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce); - memcpy(preDataBytes,vce->curBytes,8); - addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce); - listAdd_double(last3CmprsData, vce->data); - //miss++; -#ifdef HAVE_TIMECMPR - if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION) - decData[1] = vce->data; -#endif - int state; - //double checkRadius; - double curData; - double pred = vce->data; - - double predRelErrRatio; - - const uint64_t top = levelTable.topIndex, base = levelTable.baseIndex; - const uint64_t range = top - base; - const int bits = levelTable.bits; - uint64_t* const buffer = (uint64_t*)&predRelErrRatio; - const int shift = 52-bits; - uint64_t expoIndex, mantiIndex; - uint16_t* tables[range+1]; - for(int i=0; i<=range; i++){ - tables[i] = levelTable.subTables[i].table; - } - - for(i=2;i> 52) - base; - if(expoIndex <= range){ - mantiIndex = (*buffer & 0x000fffffffffffff) >> shift; - state = tables[expoIndex][mantiIndex]; - }else{ - state = 0; - } - - if(state) - { - type[i] = state; - pred *= precisionTable[state]; - //hit++; - continue; - } - - //unpredictable data processing - type[i] = 0; - compressSingleDoubleValue_MSST19(vce, curData, realPrecision, reqLength, reqBytesLength, resiBitsLength); - updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce); - memcpy(preDataBytes,vce->curBytes,8); - addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce); - pred = vce->data; - //miss++; -#ifdef HAVE_TIMECMPR - if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION) - decData[i] = vce->data; -#endif - - }//end of for - -// printf("miss:%d, hit:%d\n", miss, hit); - - size_t exactDataNum = exactLeadNumArray->size; - - TightDataPointStorageD* tdps; - - new_TightDataPointStorageD(&tdps, dataLength, exactDataNum, - type, exactMidByteArray->array, exactMidByteArray->size, - exactLeadNumArray->array, - resiBitArray->array, resiBitArray->size, - resiBitsLength, - realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0); - tdps->plus_bits = confparams_cpr->plus_bits; - - //free memory - free_DIA(exactLeadNumArray); - free_DIA(resiBitArray); - free(type); - free(vce); - free(lce); - free(exactMidByteArray); //exactMidByteArray->array has been released in free_TightDataPointStorageF(tdps); - free(precisionTable); - freeTopLevelTableWideInterval(&levelTable); - return tdps; -} - - -void SZ_compress_args_double_NoCkRngeNoGzip_1D_pwr_pre_log_MSST19(unsigned char** newByteData, double *oriData, double pwrErrRatio, size_t dataLength, size_t *outSize, double valueRangeSize, double medianValue_f, 
- unsigned char* signs, bool* positive, double min, double max, double nearZero){ - double multiplier = pow((1+pwrErrRatio), -3.0001); - for(int i=0; iminLogValue = nearZero / ((1+pwrErrRatio)*(1+pwrErrRatio)); - if(!(*positive)){ - unsigned char * comp_signs; - // compress signs - unsigned long signSize = sz_lossless_compress(ZSTD_COMPRESSOR, 3, signs, dataLength, &comp_signs); - tdps->pwrErrBoundBytes = comp_signs; - tdps->pwrErrBoundBytes_size = signSize; - } - else{ - tdps->pwrErrBoundBytes = NULL; - tdps->pwrErrBoundBytes_size = 0; - } - free(signs); - - convertTDPStoFlatBytes_double(tdps, newByteData, outSize); - if(*outSize>3 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 1 + sizeof(double)*dataLength) - SZ_compress_args_double_StoreOriData(oriData, dataLength, newByteData, outSize); - - free_TightDataPointStorageD(tdps); -} - -double computeRangeSize_double_MSST19(double* oriData, size_t size, double* valueRangeSize, double* medianValue, unsigned char * signs, bool* positive, double* nearZero) -{ - size_t i = 0; - double min = oriData[0]; - double max = min; - *nearZero = min; - - for(i=1;idata) - min = data; - else if(max Date: Fri, 24 Feb 2023 13:13:32 -0600 Subject: [PATCH 018/126] fix cusz pointer conversion --- qtensor/compression/CompressedTensor.py | 11 ++++++++++- qtensor/compression/test_compressed_tensor.py | 7 +++++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/qtensor/compression/CompressedTensor.py b/qtensor/compression/CompressedTensor.py index ffba51a0..847902b3 100644 --- a/qtensor/compression/CompressedTensor.py +++ b/qtensor/compression/CompressedTensor.py @@ -39,13 +39,22 @@ def compress(self, data): r2r_error = 0.01 r2r_threshold = 0.01 cmp_bytes, outSize_ptr = self.cuszx_compress(isCuPy, data.flatten(), num_elements, r2r_error, r2r_threshold) + print("returning compressed data") return (cmp_bytes, num_elements, isCuPy, data.shape) def decompress(self, obj): import cupy + import ctypes cmp_bytes, num_elements, isCuPy, shape = obj decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, num_elements) - mem = cupy.cuda.UnownedMemory(decompressed_ptr, num_elements*8, self, device_id=0) + # -- Workaround to convert GPU pointer to int + p_decompressed_ptr = ctypes.addressof(decompressed_ptr) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + decompressed_int = p_decompressed_int.contents + # -- + mem = cupy.cuda.UnownedMemory(decompressed_int.value, num_elements*8, self, device_id=0) mem_ptr = cupy.cuda.memory.MemoryPointer(mem, 0) arr = cupy.ndarray(shape, dtype=np.float64, memptr=mem_ptr) return arr diff --git a/qtensor/compression/test_compressed_tensor.py b/qtensor/compression/test_compressed_tensor.py index d86a511d..12dd7862 100644 --- a/qtensor/compression/test_compressed_tensor.py +++ b/qtensor/compression/test_compressed_tensor.py @@ -45,17 +45,20 @@ def test_slice_tensor(): @pytest.mark.parametrize(argnames=["shape", "compressor"], argvalues=[ ((2, 3, 4), Compressor()), - ((2, 3, 4), CUSZCompressor())] + ((2, 3, 4), CUSZCompressor()), + ((2,)*20, CUSZCompressor()) + ] ) def test_compressors(shape, compressor): import cupy - shape = (2, 3, 4) indices = [Var(i, size=s) for i, s in enumerate(shape)] data = cupy.random.randn(*shape) + print("Data size", data.nbytes) t = CompressedTensor("myT", indices, data=data, compressor=compressor) t.compress_indices([indices[0]]) s = t[1] + print('got chunk') 
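As a usage note for the `CUSZCompressor` exercised by this test: `compress` returns an opaque tuple `(cmp_bytes, num_elements, isCuPy, shape)` and `decompress` rebuilds a CuPy array around the raw device pointer returned by cuSZx, which is why the pointer-conversion fix reinterprets the ctypes pointer address as an integer before handing it to `cupy.cuda.UnownedMemory`. A minimal round-trip sketch follows, assuming a CUDA device and the compiled `libcuszx_wrapper.so` at the path hard-coded in `cuszx_wrapper.py`; the compression is lossy, bounded by the error settings hard-coded in `compress`.

```python
# Sketch only: direct round trip through CUSZCompressor, mirroring the
# (2,)*20 test case above. Requires cupy and the cuszx shared library.
import cupy
from qtensor.compression.CompressedTensor import CUSZCompressor

compressor = CUSZCompressor()
data = cupy.random.randn(*(2,) * 20)

blob = compressor.compress(data)        # (cmp_bytes, num_elements, isCuPy, shape)
restored = compressor.decompress(blob)  # cupy.ndarray reconstructed on the device

assert restored.shape == data.shape     # values agree only within the error bound
```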
assert s.data is not None assert np.allclose(t.get_chunk([1]), s.data) From e9f3981f555ce05d463d9b96019bd1ac40459dd2 Mon Sep 17 00:00:00 2001 From: Milan Kartik Shah Date: Fri, 24 Feb 2023 19:11:23 -0500 Subject: [PATCH 019/126] Fixed CUDA misaligned address error --- qtensor/compression/szx/src/cuszx_entry.cu | 75 +++++++++++++++----- qtensor/compression/szx/src/cuszx_wrapper.py | 6 +- 2 files changed, 59 insertions(+), 22 deletions(-) diff --git a/qtensor/compression/szx/src/cuszx_entry.cu b/qtensor/compression/szx/src/cuszx_entry.cu index b6894760..c9098471 100644 --- a/qtensor/compression/szx/src/cuszx_entry.cu +++ b/qtensor/compression/szx/src/cuszx_entry.cu @@ -86,7 +86,7 @@ __host__ __device__ size_t convert_out_to_block2(unsigned char *in_cmp, uint32_t out_length += num_sig*sizeof(uint8_t); memcpy(blk_sig, in_cmp+out_length, numBlocks*sizeof(uint8_t)); out_length += numBlocks*sizeof(uint8_t); - +// printf("outlength: %d\n",out_length); return out_length; } @@ -453,7 +453,7 @@ __global__ void device_post_proc(size_t *outSize, float *oriData, unsigned char if(meta[i]==2) s2++; if(meta[i]==3) s3++; } - printf("%d %d %d %d\n", s0, s1, s2, s3); + // printf("%d %d %d %d\n", s0, s1, s2, s3); out_size += (nbBlocks-nbConstantBlocks)*sizeof(short)+(nbEle%blockSize)*sizeof(float); //outBytes = (unsigned char*)malloc(out_size); @@ -495,7 +495,7 @@ __global__ void device_post_proc(size_t *outSize, float *oriData, unsigned char // return out_size; *outSize = (uint32_t) (nc-r_old); - printf("outBytes 0 %d\n", (int) outBytes[0]); + // printf("outBytes 0 %d\n", (int) outBytes[0]); // return (uint32_t) (nc-r_old); } @@ -685,7 +685,7 @@ __global__ void decompress_get_stats(float *newData, size_t nbEle, unsigned char unsigned char* r = cmpBytes; size_t num_sig; r += 4; - int blockSize = r[0]; //get block size + int blockSize = (int) r[0]; //get block size if(blockSize == 0)blockSize = 256; r++; @@ -709,7 +709,7 @@ __global__ void decompress_get_stats(float *newData, size_t nbEle, unsigned char *numSigValues = num_sig; *bs = blockSize; newCmpBytes = r; - // printf("nb blocks: %d\n", nbBlocks); + // printf("nb blocks: %d\n", nbBlocks); } @@ -718,6 +718,7 @@ __global__ void setup_data_stateArray(float *newData, size_t nbEle, unsigned cha size_t nbConstantBlocks, size_t nbBlocks, size_t *ncBlks, unsigned char *stateArray, unsigned char *newR ){ + blockSize = 256; r += 4; r++; r += sizeof(size_t); @@ -725,7 +726,7 @@ __global__ void setup_data_stateArray(float *newData, size_t nbEle, unsigned cha size_t ncBlocks = 0; size_t stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; size_t num_state2_blks = 0; - printf("Converting state array\n"); +// printf("Converting state array\n"); // printf("cmp %d\n", (int)r[0]); // printf("state %d\n", (int)stateArray[0]); convert_out_to_state(nbBlocks, r, stateArray); @@ -753,6 +754,8 @@ __global__ void decompress_startup(float *newData, size_t nbEle, unsigned char* unsigned char *stateArray, float* constantMedianArray, unsigned char *data, size_t mSize, unsigned char *newCmpBytes ){ + blockSize = 256; + size_t nb_tmp = (int) nbEle/256; /** * Structures to return: * blk_idx, blk_subidx, blk_sig, blk_vals, numSigValues (pointer) @@ -776,29 +779,54 @@ __global__ void decompress_startup(float *newData, size_t nbEle, unsigned char* // ncBlocks++; // } // } - size_t stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; - r += 4; + // size_t stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; + + size_t stateNBBytes = nb_tmp%4==0 ? 
nb_tmp/4 : nb_tmp/4+1; + //printf("%p\n", r); + r += 4; r++; r += sizeof(size_t); r += sizeof(size_t); + //printf("statenb %d %d\n", stateNBBytes, nb_tmp); r += stateNBBytes; // data = (unsigned char*)malloc(ncBlocks*blockSize*sizeof(float)); // memset(data, 0, ncBlocks*blockSize*sizeof(float)); - // printf("converting block vals %d\n", data[0]); + // printf("converting block vals %d\n", data[0]); size_t to_add = convert_out_to_block2(r, nbBlocks, (uint64_t)num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); r+= to_add; - size_t i = 0, j = 0, k = 0; //k is used to keep track of constant block index + size_t i = 0, j = 0, k = 0; //k is used to keep track of constant block index // printf("before mallocs in kernel\n"); memcpy((newData)+nbBlocks*blockSize, r, (nbEle%blockSize)*sizeof(float)); - // printf("before mallocs in kernel\n"); + //printf("before mallocs in kernel %p\n", r); r += (nbEle%blockSize)*sizeof(float); - float* fr = (float*)r; //fr is the starting address of constant median values. - for(i = 0;i < nbConstantBlocks;i++, j+=4) //get the median values for constant-value blocks - constantMedianArray[i] = fr[i]; + //printf("r: %p\n", r); + //printf("%d, %d, %d\n",nbEle, 256, nbEle%256); + unsigned char * fr = r; //fr is the starting address of constant median values. + + // printf("%p\n", r); + unsigned char tmp_r[4]; + tmp_r[0]=r[0]; + tmp_r[1]=r[1]; + tmp_r[2]=r[2]; + tmp_r[3]=r[3]; + + +// printf("nbconstant: %f\n", ((float*)tmp_r)[0]); + for(i = 0;i < nbConstantBlocks;i++, j+=4){ //get the median values for constant-value blocks + + tmp_r[0]=r[j]; + tmp_r[1]=r[j+1]; + tmp_r[2]=r[j+2]; + tmp_r[3]=r[j+3]; + float tmp = ((float*)tmp_r)[0]; +// printf("median: %f\n", tmp); + constantMedianArray[i] = tmp; + } + //printf("after constantmedian\n"); r += nbConstantBlocks*sizeof(float); unsigned char* p = r + ncBlocks * sizeof(short); for(i = 0;i < ncBlocks;i++){ @@ -810,12 +838,14 @@ __global__ void decompress_startup(float *newData, size_t nbEle, unsigned char* return; // exit(0); } +// printf("before memcpy\n"); memcpy(data+i*blockSize*sizeof(float), p, leng); - p += leng; + // printf("after memcpy\n"); + p += leng; } newCmpBytes = r; - // printf("before mallocs in kernel\n"); +// printf("before mallocs in kernel\n"); // printf("nb blocks: %d\n", nbBlocks); } @@ -842,6 +872,10 @@ __global__ void decompress_post_proc(unsigned char *data, float *newData, int bl nc++; } } + + //for(int k = 0; k < nbBlocks*blockSize;k++){ +// printf("%f\n", newData[k]); + // } } float* device_ptr_cuSZx_decompress_float(size_t nbEle, unsigned char* cmpBytes) @@ -885,7 +919,7 @@ float* device_ptr_cuSZx_decompress_float(size_t nbEle, unsigned char* cmpBytes) cudaDeviceSynchronize(); cudaError_t err = cudaGetLastError(); // Get error code - printf("CUDA Error: %s\n", cudaGetErrorString(err)); + //printf("CUDA Error: %s\n", cudaGetErrorString(err)); checkCudaErrors(cudaMemcpy(&nbBlocks_h, nbBlocks, sizeof(size_t), cudaMemcpyDeviceToHost)); checkCudaErrors(cudaMemcpy(&nbConstantBlocks_h, nbConstantBlocks, sizeof(size_t), cudaMemcpyDeviceToHost)); checkCudaErrors(cudaMemcpy(&bs, blockSize, sizeof(int), cudaMemcpyDeviceToHost)); @@ -907,6 +941,8 @@ float* device_ptr_cuSZx_decompress_float(size_t nbEle, unsigned char* cmpBytes) stateArray, cmpBytes ); cudaDeviceSynchronize(); + + // printf("%s\n", cudaGetErrorString(cudaGetLastError())); checkCudaErrors(cudaMemcpy(&ncBlocks_h, ncBlocks, sizeof(size_t), cudaMemcpyDeviceToHost)); checkCudaErrors(cudaMalloc((void**)&data, ncBlocks_h*bs*sizeof(float))); @@ -924,6 
+960,7 @@ float* device_ptr_cuSZx_decompress_float(size_t nbEle, unsigned char* cmpBytes) // blk_subidx = (uint8_t *)malloc((num_sig)*sizeof(uint8_t)); // blk_sig = (uint8_t *)malloc(nbBlocks*sizeof(uint8_t)); + //printf("%s\n", cudaGetErrorString(cudaGetLastError())); //test_nbBlks = (size_t *)malloc(sizeof(size_t)); // printf("malloc\n"); decompress_startup<<<1,1>>>(newData, nbEle, cmpBytes, @@ -934,12 +971,13 @@ float* device_ptr_cuSZx_decompress_float(size_t nbEle, unsigned char* cmpBytes) cudaDeviceSynchronize(); // cmpBytes = newCmpBytes; - + //printf("%s\n", cudaGetErrorString(cudaGetLastError())); // unsigned char* d_data; float *d_newdata; // checkCudaErrors(cudaMalloc((void**)&d_data, ncBlocks*blockSize*sizeof(float))); // checkCudaErrors(cudaMemcpy(d_data, data, ncBlocks*blockSize*sizeof(float), cudaMemcpyHostToDevice)); + //printf("nblocks: %d bs: %d\n", nbBlocks_h, bs); checkCudaErrors(cudaMalloc(&d_newdata, nbBlocks_h*bs*sizeof(float))); timer_GPU.StartCounter(); @@ -964,7 +1002,6 @@ float* device_ptr_cuSZx_decompress_float(size_t nbEle, unsigned char* cmpBytes) nbBlocks_h, ncBlocks_h, stateArray, constantMedianArray); cudaDeviceSynchronize(); - cudaFree(stateArray); cudaFree(constantMedianArray); cudaFree(data); diff --git a/qtensor/compression/szx/src/cuszx_wrapper.py b/qtensor/compression/szx/src/cuszx_wrapper.py index dae52eac..866437d2 100644 --- a/qtensor/compression/szx/src/cuszx_wrapper.py +++ b/qtensor/compression/szx/src/cuszx_wrapper.py @@ -4,7 +4,7 @@ import random import cupy as cp -LIB_PATH = 'szx/src/libcuszx_wrapper.so' +LIB_PATH = './libcuszx_wrapper.so' # unsigned char* cuSZx_integrated_compress(float *data, float r2r_threshold, float r2r_err, size_t nbEle, int blockSize, size_t *outSize) @@ -90,7 +90,7 @@ def cuszx_device_decompress(nbEle, cmpBytes): if __name__ == "__main__": - DATA_SIZE = 1024 + DATA_SIZE = int(1024) MAX_D = 10.0 MIN_D = -10.0 RANGE = MAX_D - MIN_D @@ -112,7 +112,7 @@ def cuszx_device_decompress(nbEle, cmpBytes): in_vector = in_vector.astype('float32') in_vector_gpu = cp.asarray(in_vector) - + # variable = ctypes.c_size_t(0) # outSize = ctypes.pointer(variable) From 62f0911a8e9189cfcd13158fc79421a814838425 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Sat, 25 Feb 2023 20:46:29 -0600 Subject: [PATCH 020/126] remove unused szx/src/Dynamic* files; minor tweak intest_cost_estimation --- .../szx/include/DynamicByteArray.h | 36 +++++++++++++++++++ .../szx/include/DynamicDoubleArray.h | 36 +++++++++++++++++++ .../szx/include/DynamicFloatArray.h | 35 ++++++++++++++++++ .../compression/szx/include/DynamicIntArray.h | 35 ++++++++++++++++++ qtensor/compression/test_cost_estimation.py | 2 +- 5 files changed, 143 insertions(+), 1 deletion(-) create mode 100644 qtensor/compression/szx/include/DynamicByteArray.h create mode 100644 qtensor/compression/szx/include/DynamicDoubleArray.h create mode 100644 qtensor/compression/szx/include/DynamicFloatArray.h create mode 100644 qtensor/compression/szx/include/DynamicIntArray.h diff --git a/qtensor/compression/szx/include/DynamicByteArray.h b/qtensor/compression/szx/include/DynamicByteArray.h new file mode 100644 index 00000000..9f50a2ac --- /dev/null +++ b/qtensor/compression/szx/include/DynamicByteArray.h @@ -0,0 +1,36 @@ +/** + * @file DynamicByteArray.h + * @author Sheng Di + * @date April, 2016 + * @brief Header file for Dynamic Byte Array. + * (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. 
+ */ + +#ifndef _DynamicByteArray_H +#define _DynamicByteArray_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include +typedef struct DynamicByteArray +{ + unsigned char* array; + size_t size; + size_t capacity; +} DynamicByteArray; + +void new_DBA(DynamicByteArray **dba, size_t cap); +void convertDBAtoBytes(DynamicByteArray *dba, unsigned char** bytes); +void free_DBA(DynamicByteArray *dba); +unsigned char getDBA_Data(DynamicByteArray *dba, size_t pos); +extern void addDBA_Data(DynamicByteArray *dba, unsigned char value); +extern void memcpyDBA_Data(DynamicByteArray *dba, unsigned char* data, size_t length); + +#ifdef __cplusplus +} +#endif + +#endif /* ----- #ifndef _DynamicByteArray_H ----- */ diff --git a/qtensor/compression/szx/include/DynamicDoubleArray.h b/qtensor/compression/szx/include/DynamicDoubleArray.h new file mode 100644 index 00000000..9a3ef4b6 --- /dev/null +++ b/qtensor/compression/szx/include/DynamicDoubleArray.h @@ -0,0 +1,36 @@ +/** + * @file DynamicDoubleArray.h + * @author Sheng Di + * @date April, 2016 + * @brief Header file for Dynamic Double Array. + * (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + */ + +#ifndef _DynamicDoubleArray_H +#define _DynamicDoubleArray_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +typedef struct DynamicDoubleArray +{ + double* array; + size_t size; + double capacity; +} DynamicDoubleArray; + +void new_DDA(DynamicDoubleArray **dda, size_t cap); +void convertDDAtoDoubles(DynamicDoubleArray *dba, double **data); +void free_DDA(DynamicDoubleArray *dda); +double getDDA_Data(DynamicDoubleArray *dda, size_t pos); +void addDDA_Data(DynamicDoubleArray *dda, double value); + +#ifdef __cplusplus +} +#endif + +#endif /* ----- #ifndef _DynamicDoubleArray_H ----- */ diff --git a/qtensor/compression/szx/include/DynamicFloatArray.h b/qtensor/compression/szx/include/DynamicFloatArray.h new file mode 100644 index 00000000..2770f786 --- /dev/null +++ b/qtensor/compression/szx/include/DynamicFloatArray.h @@ -0,0 +1,35 @@ +/** + * @file DynamicFloatArray.h + * @author Sheng Di + * @date April, 2016 + * @brief Header file for Dynamic Float Array. + * (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + */ + +#ifndef _DynamicFloatArray_H +#define _DynamicFloatArray_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include +typedef struct DynamicFloatArray +{ + float* array; + size_t size; + size_t capacity; +} DynamicFloatArray; + +void new_DFA(DynamicFloatArray **dfa, size_t cap); +void convertDFAtoFloats(DynamicFloatArray *dfa, float **data); +void free_DFA(DynamicFloatArray *dfa); +float getDFA_Data(DynamicFloatArray *dfa, size_t pos); +void addDFA_Data(DynamicFloatArray *dfa, float value); + +#ifdef __cplusplus +} +#endif + +#endif /* ----- #ifndef _DynamicFloatArray_H ----- */ diff --git a/qtensor/compression/szx/include/DynamicIntArray.h b/qtensor/compression/szx/include/DynamicIntArray.h new file mode 100644 index 00000000..b9c0a4f3 --- /dev/null +++ b/qtensor/compression/szx/include/DynamicIntArray.h @@ -0,0 +1,35 @@ +/** + * @file DynamicIntArray.h + * @author Sheng Di + * @date April, 2016 + * @brief Header file for Dynamic Int Array. + * (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. 
+ */ + +#ifndef _DynamicIntArray_H +#define _DynamicIntArray_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include +typedef struct DynamicIntArray +{ + unsigned char* array; //char* (one byte) is enough, don't have to be int* + size_t size; + size_t capacity; +} DynamicIntArray; + +void new_DIA(DynamicIntArray **dia, size_t cap); +void convertDIAtoInts(DynamicIntArray *dia, unsigned char **data); +void free_DIA(DynamicIntArray *dia); +int getDIA_Data(DynamicIntArray *dia, size_t pos); +extern void addDIA_Data(DynamicIntArray *dia, int value); + +#ifdef __cplusplus +} +#endif + +#endif /* ----- #ifndef _DynamicIntArray_H ----- */ diff --git a/qtensor/compression/test_cost_estimation.py b/qtensor/compression/test_cost_estimation.py index 35c65801..03957330 100644 --- a/qtensor/compression/test_cost_estimation.py +++ b/qtensor/compression/test_cost_estimation.py @@ -41,7 +41,7 @@ def test_compressed_contraction_cost(): print("Path\n", path) # -- Estimate sliced contraction opt_par = qtensor.optimisation.SlicesOptimizer(base_ordering=opt, max_tw=M_limit+1, max_slice=2+opt.treewidth-M_limit) - #opt_par = TreeTrimSplitter(base_ordering=opt, max_tw=M_limit+1, max_slice=5+opt.treewidth-M_limit) + opt_par = TreeTrimSplitter(base_ordering=opt, max_tw=M_limit+1, max_slice=5+opt.treewidth-M_limit) peo, par_vars, tn = opt_par.optimize(tn) print("Par vars", par_vars) tn.slice({i: slice(0, 1) for i in par_vars}) From 67cb1ff1348437cf87f8607d0fb6404fd4f7f37e Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Sun, 26 Feb 2023 18:35:36 -0600 Subject: [PATCH 021/126] rename merged_indices/ to contraction_algos to host variants of bucket elimination --- bench/qc_simulation/README.md | 37 +++++++++++++++++++ qtensor/FeynmanSimulator.py | 2 +- qtensor/MergedSimulator.py | 2 +- qtensor/Simulate.py | 6 +++ qtensor/__init__.py | 2 +- qtensor/contraction_algos/__init__.py | 2 + .../merged_bucket_elimination.py} | 0 .../transposed_bucket_elimination.py | 0 qtensor/merged_indices/__init__.py | 1 - qtensor/optimisation/Optimizer.py | 16 ++++---- 10 files changed, 57 insertions(+), 11 deletions(-) create mode 100644 bench/qc_simulation/README.md create mode 100644 qtensor/contraction_algos/__init__.py rename qtensor/{merged_indices/bucket_elimination.py => contraction_algos/merged_bucket_elimination.py} (100%) rename qtensor/{merged_indices => contraction_algos}/transposed_bucket_elimination.py (100%) delete mode 100644 qtensor/merged_indices/__init__.py diff --git a/bench/qc_simulation/README.md b/bench/qc_simulation/README.md new file mode 100644 index 00000000..f0866832 --- /dev/null +++ b/bench/qc_simulation/README.md @@ -0,0 +1,37 @@ +## Advanced usage + +It is possible to glob over inputs and vectorize over outputs +The globbing is possible over remote files + +``` +main.py process \ + gh://example.com/data/*/*.element \ + results/{X}/{in_file}_y{y}.r \ + -X=1,2 --Y=foo,bar +``` + +The parent directory for each out file will be created automatically + +## Examples + +### Easily manage simulation and estimation results + +After running preprocess, one can estimate runtime and compare that to actual time to simulate +```bash +# Assume 1GFlop (low-end cpu number) +./main.py estimate preprocess/bris/bris_\*.txt_oalgogreedy.circ estimations/bris/cpu --sim qtensor -M 27 -F 1e9 +./main.py estimate preprocess/bris/bris_\*.txt_oalgorgreedy.circ estimations/bris/cpu --sim qtensor -M 27 -F 1e9 + +rm -r simulations/bris/* +# Simulate Greedy +./main.py simulate preprocess/bris/bris_\*.txt_oalgogreedy.circ simulations/bris 
--sim qtensor -M 27 +# Simulate RGreedy +./main.py simulate preprocess/bris/bris_\*.txt_oalgorgreedy.circ simulations/bris --sim qtensor -M 27 +cat simulations/bris/*rgreedy* +cat estimations/bris/cpu/*rgreedy* +cat simulations/bris/*greedy* +cat estimations/bris/cpu/*greedy* +``` + +This shows how UNIX utilities are used to filter and present data. In SQL this would be something like +`SELECT * FROM simulations WHERE ordering_algo="greedy"`. diff --git a/qtensor/FeynmanSimulator.py b/qtensor/FeynmanSimulator.py index 83a28fcc..03e29360 100644 --- a/qtensor/FeynmanSimulator.py +++ b/qtensor/FeynmanSimulator.py @@ -161,7 +161,7 @@ def _parallel_unit(self, par_idx): self.merged_buckets = self.tn.buckets self.ibunch = self.ibunch - result = qtensor.merged_indices.bucket_elimination( + result = qtensor.contraction_algos.merged_bucket_elimination( self.tn.buckets, self.ibunch, self.backend.process_bucket_merged, diff --git a/qtensor/MergedSimulator.py b/qtensor/MergedSimulator.py index aa0d6818..34dee455 100644 --- a/qtensor/MergedSimulator.py +++ b/qtensor/MergedSimulator.py @@ -110,7 +110,7 @@ def simulate_batch(self, qc, batch_vars=0, peo=None, dry_run=False): if dry_run: return peo, max(width) - result = qtensor.merged_indices.bucket_elimination( + result = qtensor.contraction_algos.merged_bucket_elimination( self.tn.buckets, self.ibunch, self.backend.process_bucket_merged, diff --git a/qtensor/Simulate.py b/qtensor/Simulate.py index 0e271bd1..0d6e7c22 100644 --- a/qtensor/Simulate.py +++ b/qtensor/Simulate.py @@ -118,7 +118,13 @@ def prepare_buckets(self, qc, batch_vars=0, peo=None): raise ValueError(f'Treewidth {self.optimizer.treewidth} is larger than max_tw={self.max_tw}.') else: self.peo = peo + self._slice_relabel_buckets() + def _slice_relabel_buckets(self): + """ + Relabels peo according to bucket indices. 
+ Assumes self.tn and self.peo exists + """ all_indices = sum([list(t.indices) for bucket in self.tn.buckets for t in bucket], []) identity_map = {int(v): v for v in all_indices} self.peo = [identity_map[int(i)] for i in self.peo] diff --git a/qtensor/__init__.py b/qtensor/__init__.py index f30a7f7d..6d2be717 100644 --- a/qtensor/__init__.py +++ b/qtensor/__init__.py @@ -21,7 +21,7 @@ from qtensor import simplify_circuit from qtensor.simplify_circuit import simplify_qtree_circuit from qtensor import optimisation -from qtensor import merged_indices +from qtensor import contraction_algos from qtensor import problems from qtensor import MergedSimulator from qtensor import tools diff --git a/qtensor/contraction_algos/__init__.py b/qtensor/contraction_algos/__init__.py new file mode 100644 index 00000000..1523654c --- /dev/null +++ b/qtensor/contraction_algos/__init__.py @@ -0,0 +1,2 @@ +from .merged_bucket_elimination import bucket_elimination as merged_bucket_elimination +from .transposed_bucket_elimination import bucket_elimination as transposed_bucket_elimination diff --git a/qtensor/merged_indices/bucket_elimination.py b/qtensor/contraction_algos/merged_bucket_elimination.py similarity index 100% rename from qtensor/merged_indices/bucket_elimination.py rename to qtensor/contraction_algos/merged_bucket_elimination.py diff --git a/qtensor/merged_indices/transposed_bucket_elimination.py b/qtensor/contraction_algos/transposed_bucket_elimination.py similarity index 100% rename from qtensor/merged_indices/transposed_bucket_elimination.py rename to qtensor/contraction_algos/transposed_bucket_elimination.py diff --git a/qtensor/merged_indices/__init__.py b/qtensor/merged_indices/__init__.py deleted file mode 100644 index 5d2dadbf..00000000 --- a/qtensor/merged_indices/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .bucket_elimination import bucket_elimination diff --git a/qtensor/optimisation/Optimizer.py b/qtensor/optimisation/Optimizer.py index 85b7f0f7..d5cd37c8 100644 --- a/qtensor/optimisation/Optimizer.py +++ b/qtensor/optimisation/Optimizer.py @@ -235,24 +235,26 @@ def _get_ordering_ints(self, graph, inplace=True): #log.info('peo {}', self.peo) return peo, [self.treewidth] -class TamakiOptimizer(GreedyOptimizer): +class TamakiOptimizer(Optimizer): def __init__(self, max_width=None, *args, wait_time=5, **kwargs): super().__init__(*args, **kwargs) self.wait_time = wait_time self.max_width = max_width - def _get_ordering(self, graph, inplace=True): - node_names = nx.get_node_attributes(graph, 'name') - node_sizes = nx.get_node_attributes(graph, 'size') + def _get_ordering_ints(self, graph, inplace=True): peo, tw = qtree.graph_model.peo_calculation.get_upper_bound_peo_pace2017_interactive( graph, method="tamaki", max_time=self.wait_time, max_width=self.max_width) + return peo, [tw] - + def _get_ordering(self, graph, inplace=True): + node_names = nx.get_node_attributes(graph, 'name') + node_sizes = nx.get_node_attributes(graph, 'size') + peo, path = self._get_ordering_ints(graph, inplace=inplace) peo = [qtree.optimizer.Var(var, size=node_sizes[var], name=node_names[var]) for var in peo] - self.treewidth = tw - return peo, [tw] + self.treewidth = max(path) + return peo, path class TamakiExactOptimizer(GreedyOptimizer): def __init__(self, *args, **kwargs): From 976f699c553a3022ee47de5ea9a5327b299152f8 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Sun, 26 Feb 2023 18:46:19 -0600 Subject: [PATCH 022/126] add test for reversed order backend --- qtensor/contraction_algos/__init__.py | 1 + 
.../transposed_bucket_elimination.py | 13 +++++++++++++ qtensor/tests/test_bucket_backends.py | 13 +++++++++++++ 3 files changed, 27 insertions(+) diff --git a/qtensor/contraction_algos/__init__.py b/qtensor/contraction_algos/__init__.py index 1523654c..877bf819 100644 --- a/qtensor/contraction_algos/__init__.py +++ b/qtensor/contraction_algos/__init__.py @@ -1,2 +1,3 @@ from .merged_bucket_elimination import bucket_elimination as merged_bucket_elimination from .transposed_bucket_elimination import bucket_elimination as transposed_bucket_elimination +from .transposed_bucket_elimination import test_reverse_order_backend diff --git a/qtensor/contraction_algos/transposed_bucket_elimination.py b/qtensor/contraction_algos/transposed_bucket_elimination.py index 41246fc9..e09d9117 100644 --- a/qtensor/contraction_algos/transposed_bucket_elimination.py +++ b/qtensor/contraction_algos/transposed_bucket_elimination.py @@ -1,4 +1,17 @@ import itertools +import numpy as np +from qtree.optimizer import Tensor, Var + +def test_reverse_order_backend(backend): + """ + Duck-test if the tensors are with reverse index order + using slice_buckets method + """ + a, b = Var(1), Var(2) + test_b = [[Tensor('T', [a, b], data_key='k')]] + data_dict={'k': np.random.rand(2, 2)} + sliced = backend.get_sliced_buckets(test_b, data_dict, {a: slice(None), b: slice(None)}) + return sliced[0][0].indices[0] == b def bucket_elimination(buckets, process_bucket_fn, n_var_nosum=0): diff --git a/qtensor/tests/test_bucket_backends.py b/qtensor/tests/test_bucket_backends.py index f5facce9..b06c3005 100644 --- a/qtensor/tests/test_bucket_backends.py +++ b/qtensor/tests/test_bucket_backends.py @@ -2,10 +2,13 @@ from qtensor.contraction_backends import PerfNumpyBackend from qtensor.Simulate import CirqSimulator, QtreeSimulator +import qtensor import numpy as np import networkx as nx from qtensor.tests import get_test_problem +from qtensor.contraction_algos import test_reverse_order_backend + def test_profiled(capsys): G, gamma, beta = get_test_problem() @@ -25,3 +28,13 @@ def test_profiled(capsys): qtree_amp = result assert qtree_amp + +def test_reverse_order_switch(): + backend = qtensor.contraction_backends.get_backend('torch') + reverse = test_reverse_order_backend(backend) + assert reverse + + backend = qtensor.contraction_backends.get_backend('einsum') + reverse = test_reverse_order_backend(backend) + assert not reverse + From 701adc49977ce79a7daadbbb54f9f12dc285d05a Mon Sep 17 00:00:00 2001 From: Milan Kartik Shah Date: Sun, 26 Feb 2023 21:28:40 -0500 Subject: [PATCH 023/126] Updated device compress to use R2R error and threshold --- qtensor/compression/szx/src/cuszx_wrapper.py | 42 +++++++++++--------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/qtensor/compression/szx/src/cuszx_wrapper.py b/qtensor/compression/szx/src/cuszx_wrapper.py index 866437d2..e64fca24 100644 --- a/qtensor/compression/szx/src/cuszx_wrapper.py +++ b/qtensor/compression/szx/src/cuszx_wrapper.py @@ -4,7 +4,7 @@ import random import cupy as cp -LIB_PATH = './libcuszx_wrapper.so' +LIB_PATH = '/home/mkshah5/clean_QTensor/QTensor/qtensor/compression/szx/src/libcuszx_wrapper.so' # unsigned char* cuSZx_integrated_compress(float *data, float r2r_threshold, float r2r_err, size_t nbEle, int blockSize, size_t *outSize) @@ -68,10 +68,11 @@ def cuszx_host_decompress(nbEle, cmpBytes): def cuszx_device_compress(oriData, absErrBound, nbEle, blockSize,threshold): __cuszx_device_compress = get_device_compress() - + variable = ctypes.c_size_t(0) outSize = 
ctypes.pointer(variable) - + absErrBound = absErrBound*(cp.amax(oriData.get())-cp.amin(oriData.get())) + threshold = threshold*(cp.amax(oriData.get())-cp.amin(oriData.get())) oriData_p = ctypes.cast(oriData.data.ptr, ctypes.POINTER(c_float)) o_bytes = __cuszx_device_compress(oriData_p, outSize,np.float32(absErrBound), np.ulonglong(nbEle), np.int32(blockSize),np.float32(threshold)) @@ -90,24 +91,29 @@ def cuszx_device_decompress(nbEle, cmpBytes): if __name__ == "__main__": - DATA_SIZE = int(1024) + DATA_SIZE = int(2**26) MAX_D = 10.0 MIN_D = -10.0 RANGE = MAX_D - MIN_D - r2r_threshold = 0.1 - r2r_error = 0.1 - - in_vector = np.zeros((DATA_SIZE,)) - for i in range(0,int(DATA_SIZE/4)): - in_vector[i] = 0.0 - for i in range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): - in_vector[i] = 5.0 - for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): - in_vector[i] = random.uniform(MIN_D, MAX_D) - for i in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): - in_vector[i] = -7.0 - for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): - in_vector[i] = 0.001 + r2r_threshold = 0.01 + r2r_error = 0.01 + + in_vector = np.fromfile("real_tensor_d26.f32", dtype=np.float32) + print(np.max(in_vector)) + #range_vr = np.max(in_vector)-np.min(in_vector) + #r2r_threshold = r2r_threshold*range_vr + #r2r_error = r2r_error*range_vr +# in_vector = np.zeros((DATA_SIZE,)) +# for i in range(0,int(DATA_SIZE/4)): +# in_vector[i] = 0.0 +# for i in range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): +# in_vector[i] = 5.0 +# for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): +# in_vector[i] = random.uniform(MIN_D, MAX_D) +# for i in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): +# in_vector[i] = -7.0 +# for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): +# in_vector[i] = 0.001 in_vector = in_vector.astype('float32') From b873f6da3dee2e40f81fb0f2c86e98d7b0c1af90 Mon Sep 17 00:00:00 2001 From: Milan Kartik Shah Date: Sun, 26 Feb 2023 21:35:46 -0500 Subject: [PATCH 024/126] Reset cuszx library path in wrapper --- qtensor/compression/szx/src/cuszx_wrapper.py | 30 ++++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/qtensor/compression/szx/src/cuszx_wrapper.py b/qtensor/compression/szx/src/cuszx_wrapper.py index e64fca24..b237a8ec 100644 --- a/qtensor/compression/szx/src/cuszx_wrapper.py +++ b/qtensor/compression/szx/src/cuszx_wrapper.py @@ -4,7 +4,7 @@ import random import cupy as cp -LIB_PATH = '/home/mkshah5/clean_QTensor/QTensor/qtensor/compression/szx/src/libcuszx_wrapper.so' +LIB_PATH = './libcuszx_wrapper.so' # unsigned char* cuSZx_integrated_compress(float *data, float r2r_threshold, float r2r_err, size_t nbEle, int blockSize, size_t *outSize) @@ -91,29 +91,29 @@ def cuszx_device_decompress(nbEle, cmpBytes): if __name__ == "__main__": - DATA_SIZE = int(2**26) + DATA_SIZE = int(1024) MAX_D = 10.0 MIN_D = -10.0 RANGE = MAX_D - MIN_D r2r_threshold = 0.01 r2r_error = 0.01 - in_vector = np.fromfile("real_tensor_d26.f32", dtype=np.float32) - print(np.max(in_vector)) + #in_vector = np.fromfile("real_tensor_d26.f32", dtype=np.float32) + #print(np.max(in_vector)) #range_vr = np.max(in_vector)-np.min(in_vector) #r2r_threshold = r2r_threshold*range_vr #r2r_error = r2r_error*range_vr -# in_vector = np.zeros((DATA_SIZE,)) -# for i in range(0,int(DATA_SIZE/4)): -# in_vector[i] = 0.0 -# for i in range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): -# in_vector[i] = 5.0 -# for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): -# in_vector[i] = random.uniform(MIN_D, MAX_D) -# for i in range(int(3*DATA_SIZE/4), 
int(3*DATA_SIZE/4)+6): -# in_vector[i] = -7.0 -# for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): -# in_vector[i] = 0.001 + in_vector = np.zeros((DATA_SIZE,)) + for i in range(0,int(DATA_SIZE/4)): + in_vector[i] = 0.0 + for i in range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): + in_vector[i] = 5.0 + for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): + in_vector[i] = random.uniform(MIN_D, MAX_D) + for i in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): + in_vector[i] = -7.0 + for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): + in_vector[i] = 0.001 in_vector = in_vector.astype('float32') From b2ffd090d49a0e818fdae24525bd13fa7fdf656c Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Mon, 27 Feb 2023 11:41:21 -0600 Subject: [PATCH 025/126] add better slicing tools, integrate reverse contraction --- qtensor/Simulate.py | 7 ++- qtensor/contraction_algos/__init__.py | 15 ++++++ qtensor/contraction_backends/__init__.py | 1 + qtensor/contraction_backends/common.py | 49 +++++++++++++++++++ qtensor/contraction_backends/compression.py | 31 ++++++++++++ .../contraction_backends/tests/test_common.py | 48 ++++++++++++++++++ .../contraction_backends/tests/test_torch.py | 3 +- qtensor/contraction_backends/torch.py | 33 ++++--------- qtensor/tests/test_bucket_backends.py | 2 + qtensor/tools/benchmarking/simulators.py | 2 +- 10 files changed, 160 insertions(+), 31 deletions(-) create mode 100644 qtensor/contraction_backends/common.py create mode 100644 qtensor/contraction_backends/compression.py create mode 100644 qtensor/contraction_backends/tests/test_common.py diff --git a/qtensor/Simulate.py b/qtensor/Simulate.py index 0d6e7c22..285e51d1 100644 --- a/qtensor/Simulate.py +++ b/qtensor/Simulate.py @@ -1,6 +1,7 @@ import qtree from qtensor.tools.lazy_import import cirq from qtensor.contraction_backends import NumpyBackend, ContractionBackend +from qtensor.contraction_algos import bucket_elimination from qtensor.optimisation.TensorNet import QtreeTensorNet from qtensor.optimisation.Optimizer import DefaultOptimizer, Optimizer @@ -143,10 +144,8 @@ def _slice_relabel_buckets(self): def simulate_batch(self, qc, batch_vars=0, peo=None): self.prepare_buckets(qc, batch_vars, peo) - result = qtree.optimizer.bucket_elimination( - self.buckets, self.backend.process_bucket, - n_var_nosum=len(self.tn.free_vars) - ) + result = bucket_elimination(self.buckets, self.backend, + n_var_nosum=len(self.tn.free_vars)) return self.backend.get_result_data(result).flatten() def simulate(self, qc): diff --git a/qtensor/contraction_algos/__init__.py b/qtensor/contraction_algos/__init__.py index 877bf819..810026b0 100644 --- a/qtensor/contraction_algos/__init__.py +++ b/qtensor/contraction_algos/__init__.py @@ -1,3 +1,18 @@ +import qtree +from qtensor.contraction_backends import ContractionBackend + from .merged_bucket_elimination import bucket_elimination as merged_bucket_elimination from .transposed_bucket_elimination import bucket_elimination as transposed_bucket_elimination from .transposed_bucket_elimination import test_reverse_order_backend + +def bucket_elimination(buckets, backend:ContractionBackend, + n_var_nosum=0): + """ + Algorithm to evaluate a contraction of a large number of tensors. 
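+
+    Dispatches to the transposed implementation when the backend stores
+    tensor indices in reversed order, and to qtree.optimizer.bucket_elimination
+    otherwise. Call sketch (mirrors the usage in Simulate.simulate_batch;
+    illustrative only):
+
+        result = bucket_elimination(self.buckets, self.backend,
+                                    n_var_nosum=len(self.tn.free_vars))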
+ """ + if test_reverse_order_backend(backend): + return transposed_bucket_elimination(buckets, backend.process_bucket, n_var_nosum) + else: + return qtree.optimizer.bucket_elimination(buckets, backend.process_bucket, n_var_nosum) + + diff --git a/qtensor/contraction_backends/__init__.py b/qtensor/contraction_backends/__init__.py index 6d57b88c..d67bc884 100644 --- a/qtensor/contraction_backends/__init__.py +++ b/qtensor/contraction_backends/__init__.py @@ -1,5 +1,6 @@ #from torch._C import device from .base_class import ContractionBackend +from .common import slice_numpy_tensor from .numpy import NumpyBackend from .torch import TorchBackend from .cupy import CuPyBackend diff --git a/qtensor/contraction_backends/common.py b/qtensor/contraction_backends/common.py new file mode 100644 index 00000000..f787635e --- /dev/null +++ b/qtensor/contraction_backends/common.py @@ -0,0 +1,49 @@ +import numpy as np +from qtree.optimizer import Tensor + +def permute_np_tensor_data(data:np.ndarray, indices_in, indices_out): + """ + Permute the data of a numpy tensor to the given indices_out. + + Returns: + permuted data + """ + # permute indices + out_locs = {idx: i for i, idx in enumerate(indices_out)} + perm = [out_locs[i] for i in indices_in] + # permute tensor + return np.transpose(data, perm) + +def get_slice_bounds(slice_dict, indices): + """Slice a numpy tensor data + + + Returns: + tuple of slice bounds + """ + slice_bounds = tuple([ + slice_dict.get(i, slice(None)) for i in indices + ]) + return slice_bounds + +def slice_numpy_tensor(data:np.ndarray, indices_in, indices_out, slice_dict): + """ + Args: + data : np.ndarray + indices_in: list of `qtree.optimizer.Var` + indices_out: list of `qtree.optimizer.Var` + slice_dict: dict of `qtree.optimizer.Var` to `slice` + + Returns: + new data, new indices + """ + slice_bounds = get_slice_bounds(slice_dict, indices_in) + s_data = data[slice_bounds] + indices_sliced = [ + i for sl, i in zip(slice_bounds, indices_in) if not isinstance(sl, int) + ] + indices_sized = [v.copy(size=size) for v, size in zip(indices_sliced, s_data.shape)] + assert len(indices_out) == len(s_data.shape) + assert len(indices_sliced) == len(s_data.shape) + st_data = permute_np_tensor_data(s_data, indices_sliced, indices_out) + return st_data, indices_out diff --git a/qtensor/contraction_backends/compression.py b/qtensor/contraction_backends/compression.py new file mode 100644 index 00000000..812b97b5 --- /dev/null +++ b/qtensor/contraction_backends/compression.py @@ -0,0 +1,31 @@ +from qtensor.contraction_backends import ContractionBackend +from qtensor.compression import Compressor + +class CompressionBackend(ContractionBackend): + """ + Compression bucket contraction backend. + + This backend "decorates" another backend, by using compression in + pairwise contraction. If the result tensor has more than `max_tw` indices, + it is sliced and the contraction result is compressed before proceeding to + next slice. + """ + def __init__(self, backend, compressor:Compressor, max_tw:int): + """ + Arguments: + backend: the backend to use for contraction + compressor: the compressor to use for compression + max_tw: threshold for triggering compression. 
+ + """ + self.backend = backend + self.compressor = compressor + self.max_tw = max_tw + + def process_bucket(self, bucket): + + def get_sliced_buckets(self, buckets, slice_idx): + return buckets + + def get_result_data(self, result): + return result.data diff --git a/qtensor/contraction_backends/tests/test_common.py b/qtensor/contraction_backends/tests/test_common.py new file mode 100644 index 00000000..c5905932 --- /dev/null +++ b/qtensor/contraction_backends/tests/test_common.py @@ -0,0 +1,48 @@ +from qtensor.contraction_backends.common import slice_numpy_tensor +import numpy as np +from qtree.optimizer import Var + +def test_slice_numpy_tensor(): + shape = (2, 3, 4, 5) + indices_in = [Var(i, size=s) for i, s in enumerate(shape)] + data = np.random.rand(*shape) + data_ref = data.copy() + slice_dict = { + indices_in[0]: slice(None), + indices_in[1]: slice(1, 3), + indices_in[2]: 1, + indices_in[3]: slice(3, 4), + } + indices_out = [indices_in[3], indices_in[1], indices_in[0]] + new_data, new_indices = slice_numpy_tensor( + data, indices_in, indices_out, slice_dict + ) + assert new_data.shape == (1, 2, 2) + assert new_indices == indices_out + assert np.allclose(data, data_ref) + assert not np.allclose(new_data , data_ref[:, 1:3, 1, 3:4]) + assert np.allclose(new_data , data_ref[:, 1:3, 1, 3:4].transpose(2, 1, 0)) + assert np.allclose(new_data , data_ref.transpose()[3:4, 1, 1:3, :]) + +def test_slice_torch_tensor(): + import torch + shape = (2, 3, 4, 5) + indices_in = [Var(i, size=s) for i, s in enumerate(shape)] + data = torch.randn(*shape) + data_ref = data.clone() + slice_dict = { + indices_in[0]: slice(None), + indices_in[1]: slice(1, 3), + indices_in[2]: 1, + indices_in[3]: slice(3, 4), + } + indices_out = [indices_in[3], indices_in[1], indices_in[0]] + new_data, new_indices = slice_numpy_tensor( + data, indices_in, indices_out, slice_dict + ) + assert isinstance(new_data, torch.Tensor) + assert new_data.shape == (1, 2, 2) + assert new_indices == indices_out + assert np.allclose(data, data_ref) + assert not np.allclose(new_data , data_ref[:, 1:3, 1, 3:4]) + assert np.allclose(new_data , data_ref[:, 1:3, 1, 3:4].permute(2, 1, 0)) diff --git a/qtensor/contraction_backends/tests/test_torch.py b/qtensor/contraction_backends/tests/test_torch.py index df47d9d0..ec23edb8 100644 --- a/qtensor/contraction_backends/tests/test_torch.py +++ b/qtensor/contraction_backends/tests/test_torch.py @@ -56,13 +56,12 @@ def contract_tn(backend, search_len=1, test_problem_kwargs={}): print('selected_bucket', selected_bucket) result = backend.process_bucket(selected_bucket) - return result.data + return backend.get_result_data(result) # First test only simple buckets restr = contract_tn(btr, 1) resnp = contract_tn(bnp, 1) assert type(restr) is torch.Tensor - assert restr.dtype is torch.cfloat assert np.allclose(restr, resnp) diff --git a/qtensor/contraction_backends/torch.py b/qtensor/contraction_backends/torch.py index 009d7689..4d7ad448 100644 --- a/qtensor/contraction_backends/torch.py +++ b/qtensor/contraction_backends/torch.py @@ -3,6 +3,7 @@ import numpy as np from qtree import np_framework from qtensor.contraction_backends import ContractionBackend +from .common import slice_numpy_tensor import string CHARS = string.ascii_lowercase + string.ascii_uppercase @@ -172,41 +173,25 @@ def get_sliced_buckets(self, buckets, data_dict, slice_dict): for tensor in bucket: # get data # sort tensor dimensions - transpose_order = np.argsort(list(map(int, tensor.indices)))[::-1] + out_indices = 
list(sorted(tensor.indices, key=int, reverse=True)) data = data_dict[tensor.data_key] + # Works for torch tensors just fine + data, new_indices = slice_numpy_tensor(data, tensor.indices, out_indices, slice_dict) + if not isinstance(data, torch.Tensor): if self.device == 'gpu' and torch.cuda.is_available(): cuda = torch.device('cuda') data = torch.from_numpy(data.astype(np.complex128)).to(cuda) else: data = torch.from_numpy(data.astype(np.complex128)) - - data = data.permute(tuple(transpose_order)) - # transpose indices - indices_sorted = [tensor.indices[pp] - for pp in transpose_order] - + else: + data = data.type(torch.complex128) # slice data - slice_bounds = [] - for idx in indices_sorted: - try: - slice_bounds.append(slice_dict[idx]) - except KeyError: - slice_bounds.append(slice(None)) - - data = data[tuple(slice_bounds)] - - # update indices - indices_sliced = [idx.copy(size=size) for idx, size in - zip(indices_sorted, data.shape)] - indices_sliced = [i for sl, i in zip(slice_bounds, indices_sliced) if not isinstance(sl, int)] - assert len(data.shape) == len(indices_sliced) - sliced_bucket.append( - tensor.copy(indices=indices_sliced, data=data)) + tensor.copy(indices=new_indices, data=data)) sliced_buckets.append(sliced_bucket) return sliced_buckets def get_result_data(self, result): - return result.data + return np.transpose(result.data) diff --git a/qtensor/tests/test_bucket_backends.py b/qtensor/tests/test_bucket_backends.py index b06c3005..efac3a5f 100644 --- a/qtensor/tests/test_bucket_backends.py +++ b/qtensor/tests/test_bucket_backends.py @@ -38,3 +38,5 @@ def test_reverse_order_switch(): reverse = test_reverse_order_backend(backend) assert not reverse +def test_compression_backend(): + pass diff --git a/qtensor/tools/benchmarking/simulators.py b/qtensor/tools/benchmarking/simulators.py index 659bc775..fc06126f 100644 --- a/qtensor/tools/benchmarking/simulators.py +++ b/qtensor/tools/benchmarking/simulators.py @@ -299,7 +299,7 @@ def simulate_qaoa_energy(self, G, p, opt): with profiles.mem_util() as m: # should not consume all iterator at once for edge, (ibunch, merged_buckets) in zip(self.iterate_edges(G, p), opt): - edge_contribution = qtensor.merged_indices.bucket_elimination( + edge_contribution = qtensor.contraction_algos.merged_bucket_elimination( [x.copy() for x in merged_buckets], ibunch, sim.backend.process_bucket_merged, From a0e00e0e072a7f3562767b4edf52704a37d1c9b7 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Mon, 27 Feb 2023 11:46:33 -0600 Subject: [PATCH 026/126] rename to is_reverse_order_backend --- qtensor/contraction_algos/__init__.py | 4 ++-- qtensor/tests/test_bucket_backends.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/qtensor/contraction_algos/__init__.py b/qtensor/contraction_algos/__init__.py index 810026b0..724cbef2 100644 --- a/qtensor/contraction_algos/__init__.py +++ b/qtensor/contraction_algos/__init__.py @@ -3,14 +3,14 @@ from .merged_bucket_elimination import bucket_elimination as merged_bucket_elimination from .transposed_bucket_elimination import bucket_elimination as transposed_bucket_elimination -from .transposed_bucket_elimination import test_reverse_order_backend +from .transposed_bucket_elimination import is_reverse_order_backend def bucket_elimination(buckets, backend:ContractionBackend, n_var_nosum=0): """ Algorithm to evaluate a contraction of a large number of tensors. 
""" - if test_reverse_order_backend(backend): + if is_reverse_order_backend(backend): return transposed_bucket_elimination(buckets, backend.process_bucket, n_var_nosum) else: return qtree.optimizer.bucket_elimination(buckets, backend.process_bucket, n_var_nosum) diff --git a/qtensor/tests/test_bucket_backends.py b/qtensor/tests/test_bucket_backends.py index efac3a5f..8d3c2270 100644 --- a/qtensor/tests/test_bucket_backends.py +++ b/qtensor/tests/test_bucket_backends.py @@ -7,7 +7,7 @@ import networkx as nx from qtensor.tests import get_test_problem -from qtensor.contraction_algos import test_reverse_order_backend +from qtensor.contraction_algos import is_reverse_order_backend def test_profiled(capsys): @@ -31,11 +31,11 @@ def test_profiled(capsys): def test_reverse_order_switch(): backend = qtensor.contraction_backends.get_backend('torch') - reverse = test_reverse_order_backend(backend) + reverse = is_reverse_order_backend(backend) assert reverse backend = qtensor.contraction_backends.get_backend('einsum') - reverse = test_reverse_order_backend(backend) + reverse = is_reverse_order_backend(backend) assert not reverse def test_compression_backend(): From e14cf371e374b80e821e35e428300edb3c7a9b32 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Mon, 27 Feb 2023 11:47:12 -0600 Subject: [PATCH 027/126] rename to is_reverse_order_backend --- qtensor/contraction_algos/transposed_bucket_elimination.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qtensor/contraction_algos/transposed_bucket_elimination.py b/qtensor/contraction_algos/transposed_bucket_elimination.py index e09d9117..8b92d54c 100644 --- a/qtensor/contraction_algos/transposed_bucket_elimination.py +++ b/qtensor/contraction_algos/transposed_bucket_elimination.py @@ -2,7 +2,7 @@ import numpy as np from qtree.optimizer import Tensor, Var -def test_reverse_order_backend(backend): +def is_reverse_order_backend(backend): """ Duck-test if the tensors are with reverse index order using slice_buckets method From 8126f30aa51cbff0aaf6ef8cebd4888034217119 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Mon, 27 Feb 2023 17:28:01 -0600 Subject: [PATCH 028/126] fix dtype issue with test_compresssor --- qtensor/compression/CompressedTensor.py | 19 ++++-- qtensor/compression/__init__.py | 1 + qtensor/compression/compressed_contraction.py | 7 +- .../compression/szx/src/DynamicByteArray.c | 68 ------------------- .../compression/szx/src/DynamicDoubleArray.c | 57 ---------------- .../compression/szx/src/DynamicFloatArray.c | 57 ---------------- qtensor/compression/szx/src/DynamicIntArray.c | 57 ---------------- qtensor/compression/test_compressed_tensor.py | 7 +- 8 files changed, 27 insertions(+), 246 deletions(-) delete mode 100644 qtensor/compression/szx/src/DynamicByteArray.c delete mode 100644 qtensor/compression/szx/src/DynamicDoubleArray.c delete mode 100644 qtensor/compression/szx/src/DynamicFloatArray.c delete mode 100644 qtensor/compression/szx/src/DynamicIntArray.c diff --git a/qtensor/compression/CompressedTensor.py b/qtensor/compression/CompressedTensor.py index 847902b3..3db07abd 100644 --- a/qtensor/compression/CompressedTensor.py +++ b/qtensor/compression/CompressedTensor.py @@ -4,7 +4,10 @@ from qtree.optimizer import Tensor from qtree.system_defs import NP_ARRAY_TYPE import sys -sys.path.append("./szx/src") +from pathlib import Path +print(Path(__file__).parent/'szx/src/') +sys.path.append(Path(__file__).parent/'szx/src/') +sys.path.append('./szx/src') from cuszx_wrapper import cuszx_host_compress, cuszx_host_decompress, 
cuszx_device_compress, cuszx_device_decompress @@ -17,6 +20,13 @@ def iterate_indices(indices: list): return itertools.product(*ranges) class Compressor(): + def compress(self, data): + raise NotImplementedError + + def decompress(self, ptr): + raise NotImplementedError + +class NumpyCompressor(Compressor): def compress(self, data): print(f"Compressing len {data.size}") comp = io.BytesIO() @@ -28,7 +38,7 @@ def decompress(self, ptr): print(f"Loading arr.") return np.load(ptr)['arr_0'] -class CUSZCompressor(): +class CUSZCompressor(Compressor): def compress(self, data): import cupy if isinstance(data, cupy.ndarray): @@ -36,6 +46,7 @@ def compress(self, data): else: isCuPy = False num_elements = data.size + print("Num elements", num_elements) r2r_error = 0.01 r2r_threshold = 0.01 cmp_bytes, outSize_ptr = self.cuszx_compress(isCuPy, data.flatten(), num_elements, r2r_error, r2r_threshold) @@ -54,9 +65,9 @@ def decompress(self, obj): p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) decompressed_int = p_decompressed_int.contents # -- - mem = cupy.cuda.UnownedMemory(decompressed_int.value, num_elements*8, self, device_id=0) + mem = cupy.cuda.UnownedMemory(decompressed_int.value, num_elements*4, self, device_id=0) mem_ptr = cupy.cuda.memory.MemoryPointer(mem, 0) - arr = cupy.ndarray(shape, dtype=np.float64, memptr=mem_ptr) + arr = cupy.ndarray(shape, dtype=np.float32, memptr=mem_ptr) return arr ### Compression API with cuSZx ### diff --git a/qtensor/compression/__init__.py b/qtensor/compression/__init__.py index 79eb76e0..f11efed3 100644 --- a/qtensor/compression/__init__.py +++ b/qtensor/compression/__init__.py @@ -1,3 +1,4 @@ from .CompressedTensor import CompressedTensor, Tensor +from .CompressedTensor import Compressor, CUSZCompressor from .compressed_contraction import compressed_contract from .cost_estimation import compressed_contraction_cost diff --git a/qtensor/compression/compressed_contraction.py b/qtensor/compression/compressed_contraction.py index 89d8ba7c..bd03a077 100644 --- a/qtensor/compression/compressed_contraction.py +++ b/qtensor/compression/compressed_contraction.py @@ -44,8 +44,13 @@ def contract_two_tensors(A, B, T_out): def compressed_contract(A:Tensor, B: Tensor, - result_ixs, contract_ixs, + contract_ixs, mem_limit): + """ + Contract tensors A and B along `contract_ixs` and return the result + + The result tensor indices will be ordered from largest to smallest + """ all_indices = list(set(A.indices).union(B.indices)) all_indices.sort(key=int, reverse=True) result_indices = list(set(all_indices) - set(contract_ixs)) diff --git a/qtensor/compression/szx/src/DynamicByteArray.c b/qtensor/compression/szx/src/DynamicByteArray.c deleted file mode 100644 index 64b7d5c7..00000000 --- a/qtensor/compression/szx/src/DynamicByteArray.c +++ /dev/null @@ -1,68 +0,0 @@ -/** - * @file DynamicByteArray.c - * @author Sheng Di - * @date May, 2016 - * @brief Dynamic Byte Array - * (C) 2015 by Mathematics and Computer Science (MCS), Argonne National Laboratory. - * See COPYRIGHT in top-level directory. 
- */ - -#include -#include -#include -#include "DynamicByteArray.h" - -void new_DBA(DynamicByteArray **dba, size_t cap) { - *dba = (DynamicByteArray *)malloc(sizeof(DynamicByteArray)); - (*dba)->size = 0; - (*dba)->capacity = cap; - (*dba)->array = (unsigned char*)malloc(sizeof(unsigned char)*cap); - } - -void convertDBAtoBytes(DynamicByteArray *dba, unsigned char** bytes) -{ - size_t size = dba->size; - if(size>0) - *bytes = (unsigned char*)malloc(size * sizeof(unsigned char)); - else - *bytes = NULL; - memcpy(*bytes, dba->array, size*sizeof(unsigned char)); -} - -void free_DBA(DynamicByteArray *dba) -{ - free(dba->array); - free(dba); -} - -inline unsigned char getDBA_Data(DynamicByteArray *dba, size_t pos) -{ - if(pos>=dba->size) - { - printf("Error: wrong position of DBA (impossible case unless bugs elsewhere in the code?).\n"); - exit(0); - } - return dba->array[pos]; -} - -inline void addDBA_Data(DynamicByteArray *dba, unsigned char value) -{ - if(dba->size==dba->capacity) - { - dba->capacity = dba->capacity << 1; - dba->array = (unsigned char *)realloc(dba->array, dba->capacity*sizeof(unsigned char)); - } - dba->array[dba->size] = value; - dba->size ++; -} - -inline void memcpyDBA_Data(DynamicByteArray *dba, unsigned char* data, size_t length) -{ - if(dba->size + length > dba->capacity) - { - dba->capacity = dba->size + length; - dba->array = (unsigned char *)realloc(dba->array, dba->capacity*sizeof(unsigned char)); - } - memcpy(&(dba->array[dba->size]), data, length); - dba->size += length; -} diff --git a/qtensor/compression/szx/src/DynamicDoubleArray.c b/qtensor/compression/szx/src/DynamicDoubleArray.c deleted file mode 100644 index 54bbb109..00000000 --- a/qtensor/compression/szx/src/DynamicDoubleArray.c +++ /dev/null @@ -1,57 +0,0 @@ -/** - * @file DynamicFloatArray.c - * @author Sheng Di - * @date May, 2016 - * @brief Dynamic Float Array - * (C) 2015 by Mathematics and Computer Science (MCS), Argonne National Laboratory. - * See COPYRIGHT in top-level directory. - */ - -#include -#include -#include -#include "DynamicDoubleArray.h" - -void new_DDA(DynamicDoubleArray **dda, size_t cap) { - *dda = (DynamicDoubleArray *)malloc(sizeof(DynamicDoubleArray)); - (*dda)->size = 0; - (*dda)->capacity = cap; - (*dda)->array = (double*)malloc(sizeof(double)*cap); - } - -void convertDDAtoDoubles(DynamicDoubleArray *dba, double **data) -{ - size_t size = dba->size; - if(size>0) - *data = (double*)malloc(size * sizeof(double)); - else - *data = NULL; - memcpy(*data, dba->array, size*sizeof(double)); -} - -void free_DDA(DynamicDoubleArray *dda) -{ - free(dda->array); - free(dda); -} - -double getDDA_Data(DynamicDoubleArray *dda, size_t pos) -{ - if(pos>=dda->size) - { - printf("Error: wrong position of DIA.\n"); - exit(0); - } - return dda->array[pos]; -} - -void addDDA_Data(DynamicDoubleArray *dda, double value) -{ - if(dda->size==dda->capacity) - { - dda->capacity *= 2; - dda->array = (double *)realloc(dda->array, dda->capacity*sizeof(double)); - } - dda->array[dda->size] = value; - dda->size ++; -} diff --git a/qtensor/compression/szx/src/DynamicFloatArray.c b/qtensor/compression/szx/src/DynamicFloatArray.c deleted file mode 100644 index 1a80a488..00000000 --- a/qtensor/compression/szx/src/DynamicFloatArray.c +++ /dev/null @@ -1,57 +0,0 @@ -/** - * @file DynamicFloatArray.c - * @author Sheng Di - * @date May, 2016 - * @brief Dynamic Float Array - * (C) 2015 by Mathematics and Computer Science (MCS), Argonne National Laboratory. - * See COPYRIGHT in top-level directory. 
- */ - -#include -#include -#include -#include "DynamicFloatArray.h" - -void new_DFA(DynamicFloatArray **dfa, size_t cap) { - *dfa = (DynamicFloatArray *)malloc(sizeof(DynamicFloatArray)); - (*dfa)->size = 0; - (*dfa)->capacity = cap; - (*dfa)->array = (float*)malloc(sizeof(float)*cap); - } - -void convertDFAtoFloats(DynamicFloatArray *dfa, float **data) -{ - size_t size = dfa->size; - if(size>0) - *data = (float*)malloc(size * sizeof(float)); - else - *data = NULL; - memcpy(*data, dfa->array, size*sizeof(float)); -} - -void free_DFA(DynamicFloatArray *dfa) -{ - free(dfa->array); - free(dfa); -} - -float getDFA_Data(DynamicFloatArray *dfa, size_t pos) -{ - if(pos>=dfa->size) - { - printf("Error: wrong position of DIA.\n"); - exit(0); - } - return dfa->array[pos]; -} - -void addDFA_Data(DynamicFloatArray *dfa, float value) -{ - if(dfa->size==dfa->capacity) - { - dfa->capacity *= 2; - dfa->array = (float *)realloc(dfa->array, dfa->capacity*sizeof(float)); - } - dfa->array[dfa->size] = value; - dfa->size++; -} diff --git a/qtensor/compression/szx/src/DynamicIntArray.c b/qtensor/compression/szx/src/DynamicIntArray.c deleted file mode 100644 index 347e3a18..00000000 --- a/qtensor/compression/szx/src/DynamicIntArray.c +++ /dev/null @@ -1,57 +0,0 @@ -/** - * @file DynamicIntArray.c - * @author Sheng Di - * @date May, 2016 - * @brief Dynamic Int Array - * (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory. - * See COPYRIGHT in top-level directory. - */ - -#include -#include -#include -#include "DynamicIntArray.h" - -void new_DIA(DynamicIntArray **dia, size_t cap) { - *dia = (DynamicIntArray *)malloc(sizeof(DynamicIntArray)); - (*dia)->size = 0; - (*dia)->capacity = cap; - (*dia)->array = (unsigned char*)malloc(sizeof(unsigned char)*cap); - } - -void convertDIAtoInts(DynamicIntArray *dia, unsigned char **data) -{ - size_t size = dia->size; - if(size>0) - *data = (unsigned char*)malloc(size * sizeof(char)); - else - *data = NULL; - memcpy(*data, dia->array, size*sizeof(unsigned char)); -} - -void free_DIA(DynamicIntArray *dia) -{ - free(dia->array); - free(dia); -} - -int getDIA_Data(DynamicIntArray *dia, size_t pos) -{ - if(pos>=dia->size) - { - printf("Error: wrong position of DIA.\n"); - exit(0); - } - return dia->array[pos]; -} - -inline void addDIA_Data(DynamicIntArray *dia, int value) -{ - if(dia->size==dia->capacity) - { - dia->capacity = dia->capacity << 1; - dia->array = (unsigned char *)realloc(dia->array, dia->capacity*sizeof(unsigned char)); - } - dia->array[dia->size] = (unsigned char)value; - dia->size ++; -} diff --git a/qtensor/compression/test_compressed_tensor.py b/qtensor/compression/test_compressed_tensor.py index 12dd7862..3cf49e3d 100644 --- a/qtensor/compression/test_compressed_tensor.py +++ b/qtensor/compression/test_compressed_tensor.py @@ -52,7 +52,7 @@ def test_slice_tensor(): def test_compressors(shape, compressor): import cupy indices = [Var(i, size=s) for i, s in enumerate(shape)] - data = cupy.random.randn(*shape) + data = cupy.random.random(shape, dtype=np.float32)*.00001 print("Data size", data.nbytes) t = CompressedTensor("myT", indices, data=data, compressor=compressor) t.compress_indices([indices[0]]) @@ -60,5 +60,8 @@ def test_compressors(shape, compressor): s = t[1] print('got chunk') assert s.data is not None - assert np.allclose(t.get_chunk([1]), s.data) + ch = cupy.asnumpy(t.get_chunk([1])) + ref = cupy.asnumpy(s.data) + + assert np.allclose(ch, ref) From fd3888e41bf984b8a854f0ddbf0824b5a8fff279 Mon Sep 17 00:00:00 2001 From: Dan 
Lykov Date: Mon, 27 Feb 2023 21:23:13 -0600 Subject: [PATCH 029/126] first try on compressed backend --- qtensor/compression/CompressedTensor.py | 17 +++--- qtensor/compression/compressed_contraction.py | 8 +-- qtensor/compression/test_compressed_tensor.py | 27 ++++++---- qtensor/contraction_backends/compression.py | 52 +++++++++++++++++-- 4 files changed, 81 insertions(+), 23 deletions(-) diff --git a/qtensor/compression/CompressedTensor.py b/qtensor/compression/CompressedTensor.py index 3db07abd..fab843b1 100644 --- a/qtensor/compression/CompressedTensor.py +++ b/qtensor/compression/CompressedTensor.py @@ -47,17 +47,22 @@ def compress(self, data): isCuPy = False num_elements = data.size print("Num elements", num_elements) + # Adapt numele depending on itemsize + itemsize = data.dtype.itemsize + num_elements_eff = int(num_elements*itemsize/4) + r2r_error = 0.01 r2r_threshold = 0.01 - cmp_bytes, outSize_ptr = self.cuszx_compress(isCuPy, data.flatten(), num_elements, r2r_error, r2r_threshold) + dtype = data.dtype + cmp_bytes, outSize_ptr = self.cuszx_compress(isCuPy, data, num_elements_eff, r2r_error, r2r_threshold) print("returning compressed data") - return (cmp_bytes, num_elements, isCuPy, data.shape) + return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype) def decompress(self, obj): import cupy import ctypes - cmp_bytes, num_elements, isCuPy, shape = obj - decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, num_elements) + cmp_bytes, num_elements_eff, isCuPy, shape, dtype = obj + decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, num_elements_eff) # -- Workaround to convert GPU pointer to int p_decompressed_ptr = ctypes.addressof(decompressed_ptr) # cast to int64 pointer @@ -65,9 +70,9 @@ def decompress(self, obj): p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) decompressed_int = p_decompressed_int.contents # -- - mem = cupy.cuda.UnownedMemory(decompressed_int.value, num_elements*4, self, device_id=0) + mem = cupy.cuda.UnownedMemory(decompressed_int.value, num_elements_eff, self, device_id=0) mem_ptr = cupy.cuda.memory.MemoryPointer(mem, 0) - arr = cupy.ndarray(shape, dtype=np.float32, memptr=mem_ptr) + arr = cupy.ndarray(shape, dtype=dtype, memptr=mem_ptr) return arr ### Compression API with cuSZx ### diff --git a/qtensor/compression/compressed_contraction.py b/qtensor/compression/compressed_contraction.py index bd03a077..9d1d69b5 100644 --- a/qtensor/compression/compressed_contraction.py +++ b/qtensor/compression/compressed_contraction.py @@ -2,6 +2,7 @@ from qtensor.compression import CompressedTensor from .CompressedTensor import Tensor, iterate_indices +from .CompressedTensor import Compressor # taken from numpy/core/einsumfunc.py einsum_symbols = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' @@ -44,8 +45,8 @@ def contract_two_tensors(A, B, T_out): def compressed_contract(A:Tensor, B: Tensor, - contract_ixs, - mem_limit): + contract_ixs, mem_limit, + compressor:Compressor): """ Contract tensors A and B along `contract_ixs` and return the result @@ -84,7 +85,8 @@ def compressed_contract(A:Tensor, B: Tensor, remove_compress = exist_compressed - set(need_compressed) R = CompressedTensor(new_tensor_name, result_indices, - slice_indices=need_compressed + slice_indices=need_compressed, + compressor=compressor ) result_chunk_ixs = result_indices[-mem_limit:] diff --git a/qtensor/compression/test_compressed_tensor.py b/qtensor/compression/test_compressed_tensor.py index 3cf49e3d..23e86175 100644 --- 
a/qtensor/compression/test_compressed_tensor.py +++ b/qtensor/compression/test_compressed_tensor.py @@ -1,5 +1,5 @@ from qtensor.compression import CompressedTensor -from qtensor.compression.CompressedTensor import Compressor, CUSZCompressor +from qtensor.compression.CompressedTensor import NumpyCompressor, CUSZCompressor from qtree.optimizer import Var from qtree.system_defs import NP_ARRAY_TYPE import pytest @@ -42,26 +42,33 @@ def test_slice_tensor(): assert S.data is not None assert np.allclose(t.get_chunk([1, 2]), S.data) -@pytest.mark.parametrize(argnames=["shape", "compressor"], +@pytest.mark.parametrize(argnames=["shape", "compressor", "dtype"], argvalues=[ - ((2, 3, 4), Compressor()), - ((2, 3, 4), CUSZCompressor()), - ((2,)*20, CUSZCompressor()) + ((2, 3, 4), NumpyCompressor(), np.float32), + ((2, 3, 4), NumpyCompressor(), np.float64), + ((2, 3, 4), CUSZCompressor(), np.float32), + ((2, 3, 4), CUSZCompressor(), np.float64), + ((2, 3, 4), CUSZCompressor(), np.complex128), + ((2,)*20, CUSZCompressor(), np.float32), + ((2,)*20, CUSZCompressor(), np.float64) ] ) -def test_compressors(shape, compressor): +def test_compressors(shape, compressor, dtype): + print(shape, compressor, dtype) import cupy indices = [Var(i, size=s) for i, s in enumerate(shape)] - data = cupy.random.random(shape, dtype=np.float32)*.00001 - print("Data size", data.nbytes) + if dtype is np.complex128: + data = cupy.random.random(shape, dtype=np.float64) + 1j*cupy.random.random(shape, dtype=np.float64) + else: + data = cupy.random.random(shape, dtype=dtype) t = CompressedTensor("myT", indices, data=data, compressor=compressor) t.compress_indices([indices[0]]) + print("<--Compressed") s = t[1] - print('got chunk') + print("-->Decompressed") assert s.data is not None ch = cupy.asnumpy(t.get_chunk([1])) ref = cupy.asnumpy(s.data) assert np.allclose(ch, ref) - diff --git a/qtensor/contraction_backends/compression.py b/qtensor/contraction_backends/compression.py index 812b97b5..6f3dd502 100644 --- a/qtensor/contraction_backends/compression.py +++ b/qtensor/contraction_backends/compression.py @@ -1,5 +1,8 @@ from qtensor.contraction_backends import ContractionBackend from qtensor.compression import Compressor +from qtensor.compression.compressed_contraction import compressed_contract +from qtensor.contraction_backends.common import slice_numpy_tensor +from qtree.optimizer import Tensor class CompressionBackend(ContractionBackend): """ @@ -22,10 +25,51 @@ def __init__(self, backend, compressor:Compressor, max_tw:int): self.compressor = compressor self.max_tw = max_tw - def process_bucket(self, bucket): + def process_bucket(self, bucket, no_sum=False): + """ + Process a bucket. 
- def get_sliced_buckets(self, buckets, slice_idx): - return buckets + This uses `self.backend.process_bucket` in combination with + compression.compressed_contraction.compressed_contract + """ + bucket.sort(key=lambda x: len(x.indices)) + accum = bucket[0] + for t in bucket[1:-1]: + contract_ixs = set().union(*[t.indices, accum.indices]) + accum = compressed_contract( + accum, t, contract_ixs, self.max_tw, self.compressor + ) + if len(bucket)>1: + t = bucket[-1] + contract_ixs = sorted( + set().union(*[t.indices, accum.indices]) + , key=int, reverse=True + ) + contract_ixs = contract_ixs[:-1] + accum = compressed_contract( + accum, t, contract_ixs, self.max_tw, self.compressor + ) + return accum + else: + # This assumes large buckets with one element don't exist + result_data = accum.data.sum(axis=-1) + return Tensor(accum.name, accum.indices[:-1], data=result_data) + + def get_sliced_buckets(self, buckets, data_dict, slice_dict): + """ + Slice buckets accounding to `slice_dict` + + This delegates to `self.backend`, assuming that buckets don't have + tensors with more than `self.max_tw` indices. + """ + # Note: to support large tensors (more than `max_tw`), + # just iterate through sliced bucket tensors and compress if needed + return self.backend.get_sliced_buckets(buckets, data_dict, slice_dict) def get_result_data(self, result): - return result.data + """ + Get result data from `result` tensor. + + This assumes that the result has at most `self.max_tw` indices. + """ + return self.backend.get_result_data(result) From ecd18cf401f76c11f0576e4ab4339e4f2560420e Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Mon, 27 Feb 2023 22:08:28 -0600 Subject: [PATCH 030/126] don't test complex128 or float64. This is not supported --- qtensor/compression/test_compressed_tensor.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/qtensor/compression/test_compressed_tensor.py b/qtensor/compression/test_compressed_tensor.py index 23e86175..3780143d 100644 --- a/qtensor/compression/test_compressed_tensor.py +++ b/qtensor/compression/test_compressed_tensor.py @@ -50,7 +50,8 @@ def test_slice_tensor(): ((2, 3, 4), CUSZCompressor(), np.float64), ((2, 3, 4), CUSZCompressor(), np.complex128), ((2,)*20, CUSZCompressor(), np.float32), - ((2,)*20, CUSZCompressor(), np.float64) + ((2,)*20, CUSZCompressor(), np.complex64), + #((2,)*20, CUSZCompressor(), np.float64) ] ) def test_compressors(shape, compressor, dtype): @@ -59,6 +60,8 @@ def test_compressors(shape, compressor, dtype): indices = [Var(i, size=s) for i, s in enumerate(shape)] if dtype is np.complex128: data = cupy.random.random(shape, dtype=np.float64) + 1j*cupy.random.random(shape, dtype=np.float64) + elif dtype is np.complex64: + data = cupy.random.random(shape, dtype=np.float32) + 1j*cupy.random.random(shape, dtype=np.float32) else: data = cupy.random.random(shape, dtype=dtype) t = CompressedTensor("myT", indices, data=data, compressor=compressor) @@ -72,3 +75,4 @@ def test_compressors(shape, compressor, dtype): ref = cupy.asnumpy(s.data) assert np.allclose(ch, ref) + assert np.allclose(ch, data[1], rtol=0.1, atol=.01) From 77c27df0370f60d57b3f192c65daaa18a141744d Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Tue, 28 Feb 2023 13:37:03 -0600 Subject: [PATCH 031/126] reverse cupy order --- qtensor/contraction_backends/cupy.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/qtensor/contraction_backends/cupy.py b/qtensor/contraction_backends/cupy.py index b5a897d9..fee5b608 100644 --- 
a/qtensor/contraction_backends/cupy.py +++ b/qtensor/contraction_backends/cupy.py @@ -2,6 +2,7 @@ from qtensor.tools.lazy_import import cupy as cp from qtensor.contraction_backends import ContractionBackend from qtensor.contraction_backends.numpy import get_einsum_expr +from .common import slice_numpy_tensor class CuPyBackend(ContractionBackend): @@ -9,6 +10,7 @@ class CuPyBackend(ContractionBackend): # Replace all torch methods with cupy's analog def process_bucket(self, bucket, no_sum=False): + bucket.sort(key = lambda x: len(x.indices)) result_indices = bucket[0].indices result_data = bucket[0].data for tensor in bucket[1:]: @@ -25,12 +27,13 @@ def process_bucket(self, bucket, no_sum=False): # Merge and sort indices and shapes result_indices = tuple(sorted( set(result_indices + tensor.indices), - key=int) + key=int, reverse=True) ) if len(result_indices) > 0: if not no_sum: # trim first index - first_index, *result_indices = result_indices + contract_index, *result_indices = result_indices + result_indices = result_indices[:-1] else: first_index, *_ = result_indices tag = first_index.identity @@ -96,6 +99,7 @@ def get_sliced_buckets(self, buckets, data_dict, slice_dict): # cp.argsort requires input to be cp array #print(tensor.indices) transpose_order = cp.argsort(cp.asarray(list(map(int, tensor.indices)))).tolist() + transpose_order = list(reversed(transpose_order)) ''' Change 2: From 0a7c43f11f0452ea8d3cca102795a692ab5fc911 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Tue, 28 Feb 2023 13:56:34 -0600 Subject: [PATCH 032/126] reverse cupy tensor index ordering --- qtensor/contraction_backends/cupy.py | 16 ++++++++-------- qtensor/contraction_backends/tests/test_cupy.py | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/qtensor/contraction_backends/cupy.py b/qtensor/contraction_backends/cupy.py index fee5b608..4a3b4c56 100644 --- a/qtensor/contraction_backends/cupy.py +++ b/qtensor/contraction_backends/cupy.py @@ -1,8 +1,8 @@ import qtree from qtensor.tools.lazy_import import cupy as cp from qtensor.contraction_backends import ContractionBackend -from qtensor.contraction_backends.numpy import get_einsum_expr -from .common import slice_numpy_tensor +#from qtensor.contraction_backends.numpy import get_einsum_expr +from .common import slice_numpy_tensor, get_einsum_expr class CuPyBackend(ContractionBackend): @@ -15,7 +15,7 @@ def process_bucket(self, bucket, no_sum=False): result_data = bucket[0].data for tensor in bucket[1:]: - expr = qtree.utils.get_einsum_expr( + expr = get_einsum_expr( list(map(int, result_indices)), list(map(int, tensor.indices)) ) @@ -32,11 +32,11 @@ def process_bucket(self, bucket, no_sum=False): if len(result_indices) > 0: if not no_sum: # trim first index - contract_index, *result_indices = result_indices + contract_index = result_indices[-1] result_indices = result_indices[:-1] else: - first_index, *_ = result_indices - tag = first_index.identity + contract_index = result_indices[-1] + tag = contract_index.identity else: tag = 'f' result_indices = [] @@ -47,7 +47,7 @@ def process_bucket(self, bucket, no_sum=False): data=result_data) else: result = qtree.optimizer.Tensor(f'E{tag}', result_indices, - data=cp.sum(result_data, axis=0)) + data=cp.sum(result_data, axis=-1)) return result def process_bucket_merged(self, ixs, bucket, no_sum=False): @@ -140,4 +140,4 @@ def get_sliced_buckets(self, buckets, data_dict, slice_dict): return sliced_buckets def get_result_data(self, result): - return result.data + return cp.transpose(result.data) diff --git 
a/qtensor/contraction_backends/tests/test_cupy.py b/qtensor/contraction_backends/tests/test_cupy.py index 517fc074..a559ae94 100644 --- a/qtensor/contraction_backends/tests/test_cupy.py +++ b/qtensor/contraction_backends/tests/test_cupy.py @@ -64,7 +64,7 @@ def contract_tn(backend, search_len=1, test_problem_kwargs={}): print('selected_bucket', selected_bucket) result = backend.process_bucket(selected_bucket) - return result.data + return backend.get_result_data(result) restr = contract_tn(btr, 1) resnp = contract_tn(bnp, 1) From a6ee295772094c8f9e772350a15724cfeae90bda Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Tue, 28 Feb 2023 19:18:06 -0600 Subject: [PATCH 033/126] add compressor backend and tests for it; add cbe/common.py --- qtensor/compression/CompressedTensor.py | 16 ++- qtensor/compression/__init__.py | 4 +- qtensor/compression/compressed_contraction.py | 104 ++++++++++++++++-- qtensor/compression/szx/src/cuszx_wrapper.py | 3 +- .../compression/test_compressed_contract.py | 23 +++- qtensor/contraction_backends/__init__.py | 12 +- qtensor/contraction_backends/common.py | 31 ++++++ qtensor/contraction_backends/compression.py | 30 +++-- .../contraction_backends/tests/test_torch.py | 14 +-- qtensor/contraction_backends/torch.py | 31 +----- qtensor/tests/__init__.py | 7 ++ qtensor/tests/test_bucket_backends.py | 33 +++++- 12 files changed, 231 insertions(+), 77 deletions(-) diff --git a/qtensor/compression/CompressedTensor.py b/qtensor/compression/CompressedTensor.py index fab843b1..c1299d06 100644 --- a/qtensor/compression/CompressedTensor.py +++ b/qtensor/compression/CompressedTensor.py @@ -6,7 +6,7 @@ import sys from pathlib import Path print(Path(__file__).parent/'szx/src/') -sys.path.append(Path(__file__).parent/'szx/src/') +sys.path.append(str(Path(__file__).parent/'szx/src/')) sys.path.append('./szx/src') from cuszx_wrapper import cuszx_host_compress, cuszx_host_decompress, cuszx_device_compress, cuszx_device_decompress @@ -28,17 +28,19 @@ def decompress(self, ptr): class NumpyCompressor(Compressor): def compress(self, data): - print(f"Compressing len {data.size}") comp = io.BytesIO() np.savez_compressed(comp, data) return comp def decompress(self, ptr): ptr.seek(0) - print(f"Loading arr.") return np.load(ptr)['arr_0'] class CUSZCompressor(Compressor): + def __init__(self, r2r_error=1e-3, r2r_threshold=1e-3): + self.r2r_error = r2r_error + self.r2r_threshold = r2r_threshold + def compress(self, data): import cupy if isinstance(data, cupy.ndarray): @@ -46,16 +48,12 @@ def compress(self, data): else: isCuPy = False num_elements = data.size - print("Num elements", num_elements) # Adapt numele depending on itemsize itemsize = data.dtype.itemsize num_elements_eff = int(num_elements*itemsize/4) - r2r_error = 0.01 - r2r_threshold = 0.01 dtype = data.dtype - cmp_bytes, outSize_ptr = self.cuszx_compress(isCuPy, data, num_elements_eff, r2r_error, r2r_threshold) - print("returning compressed data") + cmp_bytes, outSize_ptr = self.cuszx_compress(isCuPy, data, num_elements_eff, self.r2r_error, self.r2r_threshold) return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype) def decompress(self, obj): @@ -121,7 +119,7 @@ class CompressedTensor(Tensor): def __init__(self, name, indices, data_key=None, data=None, slice_indices=[], - compressor=Compressor() + compressor:Compressor=NumpyCompressor() ): """ Initialize the tensor diff --git a/qtensor/compression/__init__.py b/qtensor/compression/__init__.py index f11efed3..6a5f7464 100644 --- a/qtensor/compression/__init__.py +++ 
b/qtensor/compression/__init__.py @@ -1,4 +1,4 @@ from .CompressedTensor import CompressedTensor, Tensor -from .CompressedTensor import Compressor, CUSZCompressor -from .compressed_contraction import compressed_contract +from .CompressedTensor import Compressor, NumpyCompressor, CUSZCompressor +from .compressed_contraction import compressed_contract, compressed_sum from .cost_estimation import compressed_contraction_cost diff --git a/qtensor/compression/compressed_contraction.py b/qtensor/compression/compressed_contraction.py index 9d1d69b5..c6648f24 100644 --- a/qtensor/compression/compressed_contraction.py +++ b/qtensor/compression/compressed_contraction.py @@ -8,7 +8,7 @@ einsum_symbols = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' einsum_symbols_set = set(einsum_symbols) -def contract_two_tensors(A, B, T_out): +def contract_two_tensors(A, B, T_out, einsum=np.einsum): """ Contract tensors A and B along their common indices and write result to T_out. T_out tensor should be pre-allocated with data. @@ -31,7 +31,7 @@ def contract_two_tensors(A, B, T_out): max_id = max(max_id, b_int) B_ints.append(b_int) - if max_id > len(einsum_symbols): + if max_id >= len(einsum_symbols): # -- relabel indices to small ints all_indices = set(A_ints + B_ints) relabel_dict_int = {i: j for j, i in enumerate(all_indices)} @@ -41,12 +41,21 @@ def contract_two_tensors(A, B, T_out): else: result_ints = list(map(int, result_indices)) - np.einsum(A.data, A_ints, B.data, B_ints, result_ints, out=out_buffer) + out = einsum(A.data, A_ints, B.data, B_ints, result_ints) + if len(result_ints)>0: + # This copying is reqiured because cupy doesn't support `out` argument. + out_buffer[:] = out + else: + out_buffer.fill(out) def compressed_contract(A:Tensor, B: Tensor, contract_ixs, mem_limit, - compressor:Compressor): + compressor:Compressor, + # These two functions are used to support many backends + einsum=np.einsum, + move_data=lambda x: x + ): """ Contract tensors A and B along `contract_ixs` and return the result @@ -56,7 +65,6 @@ def compressed_contract(A:Tensor, B: Tensor, all_indices.sort(key=int, reverse=True) result_indices = list(set(all_indices) - set(contract_ixs)) result_indices.sort(key=int, reverse=True) - to_small_int = lambda x: all_indices.index(x) # -- Find set of existing compressed that will be decompressed exist_compressed = set() @@ -72,15 +80,16 @@ def compressed_contract(A:Tensor, B: Tensor, need_compressed = result_indices[:-mem_limit] - print(f"Need compression: {need_compressed}") - new_tensor_name = 'C'+str(int(all_indices[0])) + new_tensor_name = 'C'+str(int(all_indices[-1])) # -- Early return: if no need to compress, do the regular contraction if len(need_compressed)==0 and len(exist_compressed)==0: C = Tensor.empty(new_tensor_name, result_indices) - contract_two_tensors(A, B, C) + C.data = move_data(C.data) + contract_two_tensors(A, B, C, einsum=einsum) return C # -- + print(f"Need compression: {need_compressed}") remove_compress = exist_compressed - set(need_compressed) R = CompressedTensor(new_tensor_name, @@ -97,6 +106,7 @@ def compressed_contract(A:Tensor, B: Tensor, slice_dict[ix] = sl chunk = np.empty(2**len(result_chunk_ixs), dtype=B.dtype) chunk = chunk.reshape(*(v.size for v in result_chunk_ixs)) + chunk = move_data(chunk) for irm in iterate_indices(remove_compress): for i, ival in zip(remove_compress, irm): slice_dict[i] = ival#slice(ival, ival+1) @@ -109,5 +119,81 @@ def compressed_contract(A:Tensor, B: Tensor, C_ixs = [v for v in result_chunk_ixs if v not in 
exist_compressed] C = Tensor('tmp', indices=C_ixs, data=chunk_view) contract_two_tensors(A_slice, B_slice, C) - R.set_chunk(r_i, chunk) + if len(need_compressed)==0: + R = Tensor(new_tensor_name, result_indices, data=chunk) + else: + R.set_chunk(r_i, chunk) + print('Return', R) + return R + +def compressed_sum(A:Tensor, sum_ixs, + compressor:Compressor, + mem_limit, + # These two functions are used to support many backends + einsum=np.einsum, + move_data=lambda x: x + ): + """ + The result tensor indices will be ordered from largest to smallest + """ + all_indices = list(set(A.indices)) + all_indices.sort(key=int, reverse=True) + result_indices = list(set(all_indices) - set(sum_ixs)) + result_indices.sort(key=int, reverse=True) + + # -- Find set of existing compressed that will be decompressed + exist_compressed = set() + if isinstance(A, CompressedTensor): + exist_compressed.update(A.slice_indices) + # In this particular case, we need not to sort these indices, + # since the iteration over fast index gives same latency as over slow index + # Potential improvement: if A_S and B_S are different, run outer loop + # over min(A_S, B_S) and inner over the rest indices. This will reduce + # the number of decompressions. + # -- + + need_compressed = result_indices[:-mem_limit] + new_tensor_name = 'C'+str(int(all_indices[-1])) + + # -- Early return: if no need to compress, do the regular contraction + if len(need_compressed)==0 and len(exist_compressed)==0: + C = Tensor.empty(new_tensor_name, result_indices) + sum_axes = [A.indices.index(i) for i in sum_ixs] + C.data = A.data.sum(axis=sum_axes) + return C + # -- + print(f"Need compression: {need_compressed}") + + remove_compress = exist_compressed - set(need_compressed) + R = CompressedTensor(new_tensor_name, + result_indices, + slice_indices=need_compressed, + compressor=compressor + ) + + result_chunk_ixs = result_indices[-mem_limit:] + print(f"Chunk indices: {result_chunk_ixs}, remove_compress: {remove_compress}") + slice_dict = {} + for r_i in iterate_indices(need_compressed): + for ix, sl in zip(need_compressed, r_i): + slice_dict[ix] = sl + chunk = np.empty(2**len(result_chunk_ixs), dtype=A.dtype) + chunk = chunk.reshape(*(v.size for v in result_chunk_ixs)) + chunk = move_data(chunk) + for irm in iterate_indices(remove_compress): + for i, ival in zip(remove_compress, irm): + slice_dict[i] = ival#slice(ival, ival+1) + chunk_view = chunk[tuple( + slice_dict.get(i, slice(None)) for i in result_chunk_ixs + )] + A_slice = A[slice_dict] + sum_axes = [A_slice.indices.index(i) for i in sum_ixs] + + C_ixs = [v for v in result_chunk_ixs if v not in exist_compressed] + C = Tensor('tmp', indices=C_ixs, data=chunk_view) + chunk_view[:] = A_slice.data.sum(axis=tuple(sum_axes)) + if len(need_compressed)==0: + R = Tensor(new_tensor_name, result_indices, data=chunk) + else: + R.set_chunk(r_i, chunk) return R diff --git a/qtensor/compression/szx/src/cuszx_wrapper.py b/qtensor/compression/szx/src/cuszx_wrapper.py index b237a8ec..15888fab 100644 --- a/qtensor/compression/szx/src/cuszx_wrapper.py +++ b/qtensor/compression/szx/src/cuszx_wrapper.py @@ -4,7 +4,8 @@ import random import cupy as cp -LIB_PATH = './libcuszx_wrapper.so' +from pathlib import Path +LIB_PATH = str(Path(__file__).parent/'libcuszx_wrapper.so') # unsigned char* cuSZx_integrated_compress(float *data, float r2r_threshold, float r2r_err, size_t nbEle, int blockSize, size_t *outSize) diff --git a/qtensor/compression/test_compressed_contract.py b/qtensor/compression/test_compressed_contract.py 
index d276e24a..c6c3abcb 100644 --- a/qtensor/compression/test_compressed_contract.py +++ b/qtensor/compression/test_compressed_contract.py @@ -1,4 +1,5 @@ -from qtensor.compression import compressed_contract, CompressedTensor, Tensor +from qtensor.compression import compressed_contract, compressed_sum, CompressedTensor, Tensor +from qtensor.compression import NumpyCompressor from qtree.optimizer import Var import numpy as np @@ -49,6 +50,26 @@ def test_compressed_contract(): assert np.allclose(C, res.get_chunk(())) print("Success!") +def test_compressed_sum(): + A_ixs = [Var(x) for x in [8,7,6,5,4,3, 2]] + A_comp = [Var(x) for x in [8, 7, 6]] + A_data = np.random.rand(2**len(A_ixs)) + #A_data = np.random.randn(2**len(A_ixs)) + A_data = A_data.reshape(*(v.size for v in A_ixs)) + A = CompressedTensor('A', A_ixs, data=A_data) + A.compress_indices(A_comp) + sum_indices = [Var(i) for i in [2, 4]] + + res = compressed_sum(A, sum_indices, NumpyCompressor(), mem_limit=4) + print(f"Resulting Tensor: {res}") + res_ref = np.sum(A_data, axis=tuple(A_ixs.index(i) for i in sum_indices)) + assert np.allclose(res.get_chunk((0, )), res_ref[0]) + assert not np.allclose(res.get_chunk((1, )), res_ref[0]) + + res = compressed_sum(res, [Var(5)], NumpyCompressor(), mem_limit=4) + assert isinstance(res, Tensor) + assert np.allclose(res.data, res_ref.sum(axis=3)) + if __name__=="__main__": test_compressed_contract() diff --git a/qtensor/contraction_backends/__init__.py b/qtensor/contraction_backends/__init__.py index d67bc884..adaa68d1 100644 --- a/qtensor/contraction_backends/__init__.py +++ b/qtensor/contraction_backends/__init__.py @@ -11,11 +11,14 @@ from .opt_einsum import OptEinusmBackend from .transpose_backend import NumpyTranspoedBackend, TorchTransposedBackend, CupyTransposedBackend, CutensorTransposedBackend from .performance_measurement_decorator import PerfNumpyBackend, PerfBackend, GPUPerfBackend +from .compression import CompressionBackend +from qtensor.compression import NumpyCompressor def get_backend(name): backend_dict = { 'mkl': CMKLExtendedBackend, 'einsum':NumpyBackend, + 'numpy':NumpyBackend, 'opt_einsum': OptEinusmBackend, 'torch_cpu': TorchBackend, 'torch_gpu': TorchBackend, @@ -27,7 +30,14 @@ def get_backend(name): 'tr_cupy': CupyTransposedBackend, 'tr_cutensor': CutensorTransposedBackend } - if name in ["torch_gpu", "tr_torch"]: + # -- add compression backend + compression_suffix = '_compressed' + ix = name.find(compression_suffix) + if ix != -1: + backend = get_backend(name[:ix]) + return CompressionBackend(backend, NumpyCompressor(), 30) + + if name in ["torch_gpu", "torch_cpu"]: return backend_dict['torch'](device = name[-3:]) else: return backend_dict[name]() diff --git a/qtensor/contraction_backends/common.py b/qtensor/contraction_backends/common.py index f787635e..bf132ca7 100644 --- a/qtensor/contraction_backends/common.py +++ b/qtensor/contraction_backends/common.py @@ -1,4 +1,5 @@ import numpy as np +import qtree from qtree.optimizer import Tensor def permute_np_tensor_data(data:np.ndarray, indices_in, indices_out): @@ -47,3 +48,33 @@ def slice_numpy_tensor(data:np.ndarray, indices_in, indices_out, slice_dict): assert len(indices_sliced) == len(s_data.shape) st_data = permute_np_tensor_data(s_data, indices_sliced, indices_out) return st_data, indices_out + +def get_einsum_expr(idx1, idx2, contract=0): + """ + Takes two tuples of indices and returns an einsum expression + to evaluate the sum over repeating indices + + Parameters + ---------- + idx1 : list-like + indices of the 
first argument + idx2 : list-like + indices of the second argument + + Returns + ------- + expr : str + Einsum command to sum over indices repeating in idx1 + and idx2. + """ + result_indices = sorted(list(set(idx1 + idx2)), reverse=True) + # remap indices to reduce their order, as einsum does not like + # large numbers + idx_to_least_idx = {old_idx: new_idx for new_idx, old_idx + in enumerate(result_indices)} + result_indices = result_indices[:len(result_indices)-contract] + + str1 = ''.join(qtree.utils.num_to_alpha(idx_to_least_idx[ii]) for ii in idx1) + str2 = ''.join(qtree.utils.num_to_alpha(idx_to_least_idx[ii]) for ii in idx2) + str3 = ''.join(qtree.utils.num_to_alpha(idx_to_least_idx[ii]) for ii in result_indices) + return str1 + ',' + str2 + '->' + str3 diff --git a/qtensor/contraction_backends/compression.py b/qtensor/contraction_backends/compression.py index 6f3dd502..bdac4b65 100644 --- a/qtensor/contraction_backends/compression.py +++ b/qtensor/contraction_backends/compression.py @@ -1,6 +1,6 @@ from qtensor.contraction_backends import ContractionBackend from qtensor.compression import Compressor -from qtensor.compression.compressed_contraction import compressed_contract +from qtensor.compression.compressed_contraction import compressed_contract, compressed_sum from qtensor.contraction_backends.common import slice_numpy_tensor from qtree.optimizer import Tensor @@ -25,6 +25,18 @@ def __init__(self, backend, compressor:Compressor, max_tw:int): self.compressor = compressor self.max_tw = max_tw + def _get_backend_specific_fns(self, backend): + ## Hacky way to extend backends + if 'cupy' in backend.__class__.__name__.lower(): + import cupy as cp + return cp.einsum, cp.array + elif 'torch' in backend.__class__.__name__.lower(): + import torch + return torch.einsum, torch.tensor + else: + import numpy as np + return np.einsum, lambda x: x + def process_bucket(self, bucket, no_sum=False): """ Process a bucket. 
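A minimal usage sketch of the `get_einsum_expr` helper that the hunks above move into `contraction_backends/common.py` (not part of the commit; it assumes plain integer index labels and that `qtree.utils.num_to_alpha` maps 0→'a', 1→'b', …):

```python
from qtensor.contraction_backends.common import get_einsum_expr

# Index labels are integers; the expression orders them from largest to smallest.
# contract=1 drops the single smallest index (here 2) from the output, so it is summed over.
expr = get_einsum_expr((8, 7, 3), (7, 3, 2), contract=1)
print(expr)  # expected 'abc,bcd->abc': 8->a, 7->b, 3->c, 2->d, with 'd' summed out
```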
@@ -32,28 +44,30 @@ def process_bucket(self, bucket, no_sum=False): This uses `self.backend.process_bucket` in combination with compression.compressed_contraction.compressed_contract """ + ctr_kw = dict(zip(['einsum', 'move_data'], self._get_backend_specific_fns(self.backend))) bucket.sort(key=lambda x: len(x.indices)) accum = bucket[0] for t in bucket[1:-1]: - contract_ixs = set().union(*[t.indices, accum.indices]) accum = compressed_contract( - accum, t, contract_ixs, self.max_tw, self.compressor + accum, t, [], self.max_tw, self.compressor, + **ctr_kw ) if len(bucket)>1: t = bucket[-1] - contract_ixs = sorted( + total_ixs = sorted( set().union(*[t.indices, accum.indices]) , key=int, reverse=True ) - contract_ixs = contract_ixs[:-1] accum = compressed_contract( - accum, t, contract_ixs, self.max_tw, self.compressor + accum, t, [total_ixs[-1]], self.max_tw, self.compressor + ,**ctr_kw ) return accum else: # This assumes large buckets with one element don't exist - result_data = accum.data.sum(axis=-1) - return Tensor(accum.name, accum.indices[:-1], data=result_data) + indices = (accum.indices[-1], ) + res = compressed_sum(accum, indices, self.compressor, self.max_tw, **ctr_kw) + return res def get_sliced_buckets(self, buckets, data_dict, slice_dict): """ diff --git a/qtensor/contraction_backends/tests/test_torch.py b/qtensor/contraction_backends/tests/test_torch.py index ec23edb8..32a4757d 100644 --- a/qtensor/contraction_backends/tests/test_torch.py +++ b/qtensor/contraction_backends/tests/test_torch.py @@ -3,19 +3,9 @@ import numpy as np from qtensor.contraction_backends import TorchBackend, NumpyBackend from qtensor import QtreeSimulator +from qtensor.tests import get_test_qaoa_ansatz_circ torch = pytest.importorskip('torch') -def get_test_qaoa_circ(n=10, p=2, d=3, type='random'): - G = qtensor.toolbox.random_graph(seed=10, degree=d, nodes=n, type=type) - print('Test problem: n, p, d', n, p, d) - gamma, beta = [np.pi/5]*p, [np.pi/2]*p - - composer = qtensor.DefaultQAOAComposer( - graph=G, gamma=gamma, beta=beta) - composer.ansatz_state() - return composer.circuit - - def get_test_qaoa_tn(n=10, p=2, d=3, type='random'): G = qtensor.toolbox.random_graph(seed=10, degree=d, nodes=n, type=type) print('Test problem: n, p, d', n, p, d) @@ -29,7 +19,7 @@ def get_test_qaoa_tn(n=10, p=2, d=3, type='random'): def test_simulation(): - circ = get_test_qaoa_circ(p=3) + circ = get_test_qaoa_ansatz_circ(p=3) btr = TorchBackend() bnp = NumpyBackend() simtr = QtreeSimulator(backend=btr) diff --git a/qtensor/contraction_backends/torch.py b/qtensor/contraction_backends/torch.py index 4d7ad448..2180be40 100644 --- a/qtensor/contraction_backends/torch.py +++ b/qtensor/contraction_backends/torch.py @@ -3,7 +3,7 @@ import numpy as np from qtree import np_framework from qtensor.contraction_backends import ContractionBackend -from .common import slice_numpy_tensor +from .common import slice_numpy_tensor, get_einsum_expr import string CHARS = string.ascii_lowercase + string.ascii_uppercase @@ -32,35 +32,6 @@ def get_einsum_expr_bucket(bucket, all_indices_list, result_indices): return expr -def get_einsum_expr(idx1, idx2, contract=0): - """ - Takes two tuples of indices and returns an einsum expression - to evaluate the sum over repeating indices - - Parameters - ---------- - idx1 : list-like - indices of the first argument - idx2 : list-like - indices of the second argument - - Returns - ------- - expr : str - Einsum command to sum over indices repeating in idx1 - and idx2. 
- """ - result_indices = sorted(list(set(idx1 + idx2)), reverse=True) - # remap indices to reduce their order, as einsum does not like - # large numbers - idx_to_least_idx = {old_idx: new_idx for new_idx, old_idx - in enumerate(result_indices)} - result_indices = result_indices[:len(result_indices)-contract] - - str1 = ''.join(qtree.utils.num_to_alpha(idx_to_least_idx[ii]) for ii in idx1) - str2 = ''.join(qtree.utils.num_to_alpha(idx_to_least_idx[ii]) for ii in idx2) - str3 = ''.join(qtree.utils.num_to_alpha(idx_to_least_idx[ii]) for ii in result_indices) - return str1 + ',' + str2 + '->' + str3 diff --git a/qtensor/tests/__init__.py b/qtensor/tests/__init__.py index 3c34f014..dff5904d 100644 --- a/qtensor/tests/__init__.py +++ b/qtensor/tests/__init__.py @@ -1,7 +1,14 @@ import networkx as nx +import qtensor import numpy as np from functools import lru_cache +def get_test_qaoa_ansatz_circ(n=10, p=2, d=3, type='random'): + G, gamma, beta = get_test_problem(n, p, d, type) + composer = qtensor.DefaultQAOAComposer( + graph=G, gamma=gamma, beta=beta) + composer.ansatz_state() + return composer.circuit @lru_cache(maxsize=2**12) def get_test_problem(n=10, p=2, d=3, type='random'): diff --git a/qtensor/tests/test_bucket_backends.py b/qtensor/tests/test_bucket_backends.py index 8d3c2270..d26ba8a5 100644 --- a/qtensor/tests/test_bucket_backends.py +++ b/qtensor/tests/test_bucket_backends.py @@ -1,11 +1,14 @@ from qtensor import QtreeQAOAComposer from qtensor.contraction_backends import PerfNumpyBackend - +from qtensor.contraction_backends import CuPyBackend, NumpyBackend, CompressionBackend +from qtensor.compression import NumpyCompressor, CUSZCompressor from qtensor.Simulate import CirqSimulator, QtreeSimulator + +import pytest import qtensor import numpy as np import networkx as nx -from qtensor.tests import get_test_problem +from qtensor.tests import get_test_problem, get_test_qaoa_ansatz_circ from qtensor.contraction_algos import is_reverse_order_backend @@ -38,5 +41,27 @@ def test_reverse_order_switch(): reverse = is_reverse_order_backend(backend) assert not reverse -def test_compression_backend(): - pass +ref_backend_name = 'cupy' +@pytest.mark.parametrize('circ', [ + get_test_qaoa_ansatz_circ(n=6, p=3), + get_test_qaoa_ansatz_circ(n=12, p=4), +]) +@pytest.mark.parametrize(['backend', 'atol'], [ + ('cupy', 1e-10), + ('torch', 1e-10), + ('cupy_compressed', 1e-10), + (CompressionBackend( + CuPyBackend(), + CUSZCompressor(r2r_error=1e-4, r2r_threshold=1e-5), + 11 ), + 1e-5) +]) +def test_backends(circ, backend, atol): + ref_backend = qtensor.contraction_backends.get_backend(ref_backend_name) + if isinstance(backend, str): + backend = qtensor.contraction_backends.get_backend(backend) + sim = QtreeSimulator(backend=backend) + res = sim.simulate(circ) + sim_ref = QtreeSimulator(backend=ref_backend) + res_ref = sim_ref.simulate(circ) + assert np.allclose(res, res_ref, atol=atol) From 9faae9544e30dd9d9ae763640cd9a437ae72e5ca Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Tue, 28 Feb 2023 19:36:13 -0600 Subject: [PATCH 034/126] fix the tests --- qtensor/compression/CompressedTensor.py | 2 +- qtensor/compression/test_compressed_contract.py | 13 +++++++------ qtensor/compression/test_compressed_tensor.py | 1 + qtensor/compression/test_cost_estimation.py | 6 +++--- 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/qtensor/compression/CompressedTensor.py b/qtensor/compression/CompressedTensor.py index c1299d06..4aecece6 100644 --- a/qtensor/compression/CompressedTensor.py +++ 
b/qtensor/compression/CompressedTensor.py @@ -147,7 +147,7 @@ def __init__(self, name, indices, self._dtype = None @classmethod - def empty(cls, name, indices, slice_indices=[], compressor=Compressor(), dtype:type=NP_ARRAY_TYPE): + def empty(cls, name, indices, slice_indices=[], compressor=NumpyCompressor(), dtype:type=NP_ARRAY_TYPE): t = super().empty(name, indices, dtype) t.compressor = compressor if slice_indices: diff --git a/qtensor/compression/test_compressed_contract.py b/qtensor/compression/test_compressed_contract.py index c6c3abcb..05af6de4 100644 --- a/qtensor/compression/test_compressed_contract.py +++ b/qtensor/compression/test_compressed_contract.py @@ -5,6 +5,7 @@ def test_compressed_contract(): + compressor = NumpyCompressor() A_ixs = [Var(x) for x in [8,7,6,5,4,3, 2]] A_comp = [Var(x) for x in [8, 7, 6]] B_ixs = [Var(x) for x in [10, 9, 3, 4, 2]] @@ -28,15 +29,15 @@ def test_compressed_contract(): res_ixs = list(set(A_ixs).union(B_ixs) - set(contract_ixs)) res_ixs.sort(key=int, reverse=True) - res = compressed_contract(A, B, res_ixs, contract_ixs, - mem_limit=3) + res = compressed_contract(A, B, contract_ixs, + mem_limit=3, compressor=compressor) print(f"Resulting Tensor: {res}") - res = compressed_contract(A, B, res_ixs, contract_ixs, - mem_limit=10) + res = compressed_contract(A, B, contract_ixs, + mem_limit=10, compressor=compressor) print(f"Resulting Tensor: {res}") - print(res.get_chunk(()).flatten()) + print(res.data.flatten()) A_str = ''.join(chr(97+int(v)) for v in A_ixs) @@ -47,7 +48,7 @@ def test_compressed_contract(): print(f"Ground truth:") print( C.flatten()) - assert np.allclose(C, res.get_chunk(())) + assert np.allclose(C, res.data) print("Success!") def test_compressed_sum(): diff --git a/qtensor/compression/test_compressed_tensor.py b/qtensor/compression/test_compressed_tensor.py index 3780143d..3136e7ee 100644 --- a/qtensor/compression/test_compressed_tensor.py +++ b/qtensor/compression/test_compressed_tensor.py @@ -51,6 +51,7 @@ def test_slice_tensor(): ((2, 3, 4), CUSZCompressor(), np.complex128), ((2,)*20, CUSZCompressor(), np.float32), ((2,)*20, CUSZCompressor(), np.complex64), + # Not supported: #((2,)*20, CUSZCompressor(), np.float64) ] ) diff --git a/qtensor/compression/test_cost_estimation.py b/qtensor/compression/test_cost_estimation.py index 03957330..0a9aa24e 100644 --- a/qtensor/compression/test_cost_estimation.py +++ b/qtensor/compression/test_cost_estimation.py @@ -16,7 +16,7 @@ def costs_to_csv(costs): return "\n".join(lines) def test_compressed_contraction_cost(): - G, gamma, beta = get_test_problem(n=32, p=15, d=4) + G, gamma, beta = get_test_problem(n=12, p=5, d=4) opt = qtensor.toolbox.get_ordering_algo('naive') composer = QtreeQAOAComposer( @@ -41,7 +41,7 @@ def test_compressed_contraction_cost(): print("Path\n", path) # -- Estimate sliced contraction opt_par = qtensor.optimisation.SlicesOptimizer(base_ordering=opt, max_tw=M_limit+1, max_slice=2+opt.treewidth-M_limit) - opt_par = TreeTrimSplitter(base_ordering=opt, max_tw=M_limit+1, max_slice=5+opt.treewidth-M_limit) + #opt_par = TreeTrimSplitter(base_ordering=opt, max_tw=M_limit+1, max_slice=5+opt.treewidth-M_limit) peo, par_vars, tn = opt_par.optimize(tn) print("Par vars", par_vars) tn.slice({i: slice(0, 1) for i in par_vars}) @@ -69,7 +69,7 @@ def test_compressed_contraction_cost(): print("Path list comp\n", [c.width for c in costs]) print("Maxw", max(path)) - assert opt.treewidth == cost.width + assert opt.treewidth == cost.width+1 if __name__ == '__main__': 
test_compressed_contraction_cost() From 514a54b0ab3b3c4d10d76062faf263df2d69bd1d Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Tue, 28 Feb 2023 19:55:26 -0600 Subject: [PATCH 035/126] use lazy_import for cupy --- qtensor/compression/CompressedTensor.py | 6 +++++- qtensor/compression/szx/src/cuszx_wrapper.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/qtensor/compression/CompressedTensor.py b/qtensor/compression/CompressedTensor.py index 4aecece6..9a92170e 100644 --- a/qtensor/compression/CompressedTensor.py +++ b/qtensor/compression/CompressedTensor.py @@ -9,7 +9,11 @@ sys.path.append(str(Path(__file__).parent/'szx/src/')) sys.path.append('./szx/src') -from cuszx_wrapper import cuszx_host_compress, cuszx_host_decompress, cuszx_device_compress, cuszx_device_decompress +try: + from cuszx_wrapper import cuszx_host_compress, cuszx_host_decompress, cuszx_device_compress, cuszx_device_decompress +except: + # Silently fail on missing build of cuszx + pass CUSZX_BLOCKSIZE = 256 diff --git a/qtensor/compression/szx/src/cuszx_wrapper.py b/qtensor/compression/szx/src/cuszx_wrapper.py index 15888fab..fd62e87d 100644 --- a/qtensor/compression/szx/src/cuszx_wrapper.py +++ b/qtensor/compression/szx/src/cuszx_wrapper.py @@ -2,7 +2,7 @@ import ctypes from ctypes import * import random -import cupy as cp +from qtensor.tools.lazy_import import cupy as cp from pathlib import Path LIB_PATH = str(Path(__file__).parent/'libcuszx_wrapper.so') From 091bc7930b71e54fcaab4856e58fe9a0fc0bdf14 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Wed, 1 Mar 2023 18:25:42 -0600 Subject: [PATCH 036/126] change _get_ordering_ints to get_ordering_ints --- qtensor/optimisation/Optimizer.py | 23 ++++++++++---------- qtensor/optimisation/RGreedy.py | 6 ++--- qtensor/optimisation/late_parallelisation.py | 6 ++--- 3 files changed, 17 insertions(+), 18 deletions(-) diff --git a/qtensor/optimisation/Optimizer.py b/qtensor/optimisation/Optimizer.py index d5cd37c8..96189a94 100644 --- a/qtensor/optimisation/Optimizer.py +++ b/qtensor/optimisation/Optimizer.py @@ -15,7 +15,7 @@ class Optimizer: - def _get_ordering_ints(self, graph, inplace=True): + def get_ordering_ints(self, graph, inplace=True): raise NotImplementedError def _get_ordering(self, graph: nx.Graph, inplace=True): @@ -27,7 +27,7 @@ def _get_ordering(self, graph: nx.Graph, inplace=True): """ node_names = nx.get_node_attributes(graph, 'name') node_sizes = nx.get_node_attributes(graph, 'size') - peo, path = self._get_ordering_ints(graph, inplace=inplace) + peo, path = self.get_ordering_ints(graph, inplace=inplace) # compatibility with slicing self.peo_ints = [int(x) for x in peo] @@ -69,7 +69,7 @@ def optimize(self, tensor_net): class WithoutOptimizer(Optimizer): - def _get_ordering_ints(self, graph, inplace=True): + def get_ordering_ints(self, graph, inplace=True): peo = sorted([int(v) for v in graph.nodes()]) # magic line peo = list(reversed(peo)) @@ -77,7 +77,7 @@ def _get_ordering_ints(self, graph, inplace=True): return peo, path class GreedyOptimizer(Optimizer): - def _get_ordering_ints(self, graph, free_vars=[]): + def get_ordering_ints(self, graph, free_vars=[]): #mapping = {a:b for a,b in zip(graph.nodes(), reversed(list(graph.nodes())))} #graph = nx.relabel_nodes(graph, mapping) peo_ints, path = utils.get_neighbors_peo(graph) @@ -188,7 +188,6 @@ def _get_max_tw(self): return int(np.log2(avail)) - 4 def _split_graph(self, p_graph, max_tw): - peo_ints, path = self.base_ordering._get_ordering_ints(p_graph) searcher = GreedyParvars(p_graph) while 
True: #nodes, path = utils.get_neighbors_path(graph, peo=peo_ints) @@ -207,22 +206,22 @@ def _split_graph(self, p_graph, max_tw): log.error('Memory is not enough. Max tw: {}', max_tw) raise Exception('Estimated OOM') - peo_ints, path = self.base_ordering._get_ordering_ints(p_graph) + self.peo_ints, path = self.base_ordering.get_ordering_ints(p_graph) self.treewidth = max(path) - return peo_ints, searcher.result + return self.peo_ints, searcher.result def optimize(self, tensor_net): peo, tn = super().optimize(tensor_net) return peo+self.parallel_vars, self.parallel_vars, tn - def _get_ordering_ints(self, graph, inplace=True): + def get_ordering_ints(self, graph, inplace=True): p_graph = copy.deepcopy(graph) max_tw = self._get_max_tw() log.info('Maximum treewidth: {}', max_tw) max_tw = max_tw - self.tw_bias - self.peo_ints, path = self.base_ordering._get_ordering_ints(p_graph) + self.peo_ints, path = self.base_ordering.get_ordering_ints(p_graph) self.treewidth = max(path) peo, par_vars = self._split_graph(p_graph, max_tw) @@ -241,7 +240,7 @@ def __init__(self, max_width=None, *args, wait_time=5, **kwargs): self.wait_time = wait_time self.max_width = max_width - def _get_ordering_ints(self, graph, inplace=True): + def get_ordering_ints(self, graph, inplace=True): peo, tw = qtree.graph_model.peo_calculation.get_upper_bound_peo_pace2017_interactive( graph, method="tamaki", max_time=self.wait_time, max_width=self.max_width) return peo, [tw] @@ -249,7 +248,7 @@ def _get_ordering_ints(self, graph, inplace=True): def _get_ordering(self, graph, inplace=True): node_names = nx.get_node_attributes(graph, 'name') node_sizes = nx.get_node_attributes(graph, 'size') - peo, path = self._get_ordering_ints(graph, inplace=inplace) + peo, path = self.get_ordering_ints(graph, inplace=inplace) peo = [qtree.optimizer.Var(var, size=node_sizes[var], name=node_names[var]) for var in peo] @@ -313,7 +312,7 @@ def _split_graph(self, p_graph, max_tw): pv_cnt = len(result) log.info('Parvars count: {}. 
Amps count: {}', pv_cnt, 2**pv_cnt) - peo_ints, path = self.base_ordering._get_ordering_ints(p_graph) + peo_ints, path = self.base_ordering.get_ordering_ints(p_graph) tw = max(path) log.info('Treewidth: {}', tw) self._slice_hist.append([pv_cnt, tw]) diff --git a/qtensor/optimisation/RGreedy.py b/qtensor/optimisation/RGreedy.py index 86a4dd02..ef46973d 100644 --- a/qtensor/optimisation/RGreedy.py +++ b/qtensor/optimisation/RGreedy.py @@ -34,7 +34,7 @@ def _get_ordering(self, graph, **kwargs): #graph = nx.convert_node_labels_to_integers(graph) node_names = nx.get_node_attributes(graph, 'name') node_sizes = nx.get_node_attributes(graph, 'size') - peo, path = self._get_ordering_ints(graph) + peo, path = self.get_ordering_ints(graph) peo = [qtree.optimizer.Var(var, size=node_sizes[var], name=node_names[var]) @@ -42,7 +42,7 @@ def _get_ordering(self, graph, **kwargs): #print('tw=', max(path)) return peo, path - def _get_ordering_ints(self, old_graph, free_vars=[]): + def get_ordering_ints(self, old_graph, free_vars=[]): best_peo = None best_width = np.inf best_widths = None @@ -94,7 +94,7 @@ def _get_ordering_ints(self, old_graph, free_vars=[]): class RGreedyOptimizerNk(RGreedyOptimizer): - def _get_ordering_ints(self, old_graph, free_vars=[]): + def get_ordering_ints(self, old_graph, free_vars=[]): best_peo = None best_width = np.inf best_widths = None diff --git a/qtensor/optimisation/late_parallelisation.py b/qtensor/optimisation/late_parallelisation.py index 7ce2a3ce..121b99d0 100644 --- a/qtensor/optimisation/late_parallelisation.py +++ b/qtensor/optimisation/late_parallelisation.py @@ -17,7 +17,7 @@ def slice_greedy(graph, p_bunch, ordering_algo='greedy'): """ Slice greedy and inplece """ orderer = qtn.toolbox.get_ordering_algo(ordering_algo) searcher = GreedyParvars(graph) - peo_ints, path = orderer._get_ordering_ints(graph) + peo_ints, path = orderer.get_ordering_ints(graph) for _ in range(p_bunch): error = searcher.step() pv_cnt = len(searcher.result) @@ -104,7 +104,7 @@ def find_slice_at_step(self, ordering, graph, p_bunch): # Room for optimization: do not copy graph sliced_graph = graph.copy() slice_vars = self.slicer(sliced_graph, p_bunch=p_bunch) - _peo, _path = self.orderer._get_ordering_ints(sliced_graph) + _peo, _path = self.orderer.get_ordering_ints(sliced_graph) step_tw = qtn.utils.n_neighbors(graph, node) + 1 largest_tw = max(step_tw, largest_tw) _tw = max(largest_tw, max(_path)) @@ -145,7 +145,7 @@ def optimize(self, tensor_net): else: current_graph = line_graph - current_ordering, tw_path = self.orderer._get_ordering_ints(current_graph) + current_ordering, tw_path = self.orderer.get_ordering_ints(current_graph) contraction_schedule = [] log.info(f"Initial treewidth: {max(tw_path)}") From 7a1ef39ac167fb8b72f719a96dffe6a7e890c875 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Wed, 1 Mar 2023 18:43:07 -0600 Subject: [PATCH 037/126] make adaptive optimizer compatibile with slicing --- qtensor/optimisation/adaptive.py | 65 ++++++++++++++++---------------- 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/qtensor/optimisation/adaptive.py b/qtensor/optimisation/adaptive.py index 3381017f..10864347 100644 --- a/qtensor/optimisation/adaptive.py +++ b/qtensor/optimisation/adaptive.py @@ -58,56 +58,55 @@ def __init__(self, max_time=np.inf, opt_sim_ratio=1.5): self.max_time = max_time self.opt_sim_ratio = opt_sim_ratio - def log_progress(self, rt, opt, etime): - width = opt.treewidth + def log_progress(self, rt, opt, etime, width): opt_name = opt.__class__.__name__ if 
hasattr(self, 'verbose'): print(f"Qtensor adaptive optimizer: Time={rt:.4f}, width={width}, optimizer={opt_name}, expected contraction time={etime}") - def optimize(self, tensor_net): + def get_ordering_ints(self, graph, inplace=False): start = time.time() naive = WithoutOptimizer() # first, optimize with naive ordering and check treewidth - res = naive.optimize(tensor_net) + peo, path = naive.get_ordering_ints(graph) + width = max(path) - e1 = expected_contraction_time(naive.treewidth) - self.log_progress(time.time()-start, naive, e1) + e1 = expected_contraction_time(width) + self.log_progress(time.time()-start, naive, e1, width) if not should_optimize_more(e1, time.time()-start, self.opt_sim_ratio): - self.treewidth = naive.treewidth - return res + return peo, path # Next, greedy opt = GreedyOptimizer() - res = opt.optimize(tensor_net) + peo, path = opt.get_ordering_ints(graph) + width = max(path) - e1 = expected_contraction_time(opt.treewidth) - self.log_progress(time.time()-start, opt, e1) + e1 = expected_contraction_time(width) + self.log_progress(time.time()-start, opt, e1, width) if not should_optimize_more(e1, time.time()-start, self.opt_sim_ratio): - self.treewidth = opt.treewidth - return res + return peo, path # Next, rgreedy - rgreedy_time = expected_contraction_time(opt.treewidth-1) + rgreedy_time = expected_contraction_time(width-1) while rgreedy_time<5: opt = RGreedyOptimizer(temp=.02, max_time=rgreedy_time) - res = opt.optimize(tensor_net) + peo, path = opt.get_ordering_ints(graph) + width = max(path) - e1 = expected_contraction_time(opt.treewidth) - self.log_progress(time.time()-start, opt, e1) + e1 = expected_contraction_time(width) + self.log_progress(time.time()-start, opt, e1, width) if not should_optimize_more(e1, time.time()-start, self.opt_sim_ratio): - self.treewidth = opt.treewidth - return res + return peo, path - rgreedy_time = expected_contraction_time(opt.treewidth-1) + rgreedy_time = expected_contraction_time(width-1) # Next, Tamaki max_simulatable = 32 - width = min(max_simulatable, opt.treewidth-1) + target_width = min(max_simulatable, width-1) while True: # terminate if reached max time - 1. No sense in running tamaki for 1 second # at this scale. @@ -115,7 +114,7 @@ def optimize(self, tensor_net): if spent_so_far > self.max_time: print("Adaptive ordering algo exceeded budget of", f"{self.max_time} seconds. Returning prematurely") - return res + return peo, path wait_time = min( expected_contraction_time(width), # reserve a second for tamaki overhead @@ -124,29 +123,31 @@ def optimize(self, tensor_net): # Tamaki may fail to process very large graphs if the budget is too small wait_time += 1 - opt = TamakiOptimizer(max_width=width, wait_time=wait_time) + opt = TamakiOptimizer(max_width=target_width, wait_time=wait_time) # Detect termination reason. 
# If terminated because reached max_width, then reduce the width # Othervise need more time start_opt = time.time() - t_out = opt.optimize(tensor_net) + t_peo, t_path = opt.get_ordering_ints(graph) + t_width = max(t_path) opt_duration = time.time() - start_opt # Record result if it's better than what we already have # (Sometimes it can decrease if we are close to time budget) - if opt.treewidth <= width: - res = t_out + if t_width <= target_width: + peo = t_peo + path = t_path + width = t_width - self.treewidth = opt.treewidth - e1 = expected_contraction_time(opt.treewidth) - self.log_progress(time.time()-start, opt, e1) + e1 = expected_contraction_time(width) + self.log_progress(time.time()-start, opt, e1, width) if not should_optimize_more(e1, time.time() - start, self.opt_sim_ratio): - return res + return peo, path # Do not reduce target treewidth if failed to converge to the previous one. if opt_duration < wait_time - 1: - width = opt.treewidth - 1 + target_width = width - 1 - return res + return peo, path From 19028974f40ea15570d81249058ec83505e3ec6a Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Wed, 1 Mar 2023 19:00:54 -0600 Subject: [PATCH 038/126] make common `update_peo_after_slice` --- qtensor/optimisation/Optimizer.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/qtensor/optimisation/Optimizer.py b/qtensor/optimisation/Optimizer.py index 96189a94..7af151f7 100644 --- a/qtensor/optimisation/Optimizer.py +++ b/qtensor/optimisation/Optimizer.py @@ -165,10 +165,12 @@ class SlicesOptimizer(Optimizer): def __init__(self, tw_bias=2, max_tw=None, max_slice=None , base_ordering='greedy' + , peo_after_slice_strategy='run-again' , **kwargs): self.tw_bias = tw_bias self.max_tw = max_tw self.max_slice = max_slice + self.peo_after_slice_strategy = peo_after_slice_strategy if isinstance(base_ordering, str): self.base_ordering = qtensor.toolbox.get_ordering_algo(base_ordering) else: @@ -187,12 +189,22 @@ def _get_max_tw(self): # tw = log(cost/16) = log(cost) - 4 return int(np.log2(avail)) - 4 + def _update_peo_after_slice(self, p_graph): + if self.peo_after_slice_strategy == 'run-again': + peo_ints, path = self.base_ordering.get_ordering_ints(p_graph) + elif self.peo_after_slice_strategy == 'TD-reuse': + pass + + self.peo_ints = peo_ints + self.treewidth = max(path) + log.info('Treewidth after slice: {}', self.treewidth) + return peo_ints, path + def _split_graph(self, p_graph, max_tw): searcher = GreedyParvars(p_graph) while True: #nodes, path = utils.get_neighbors_path(graph, peo=peo_ints) tw = self.treewidth - log.info('Treewidth: {}', tw) if tw < max_tw: log.info('Found parvars: {}', searcher.result) break @@ -206,8 +218,7 @@ def _split_graph(self, p_graph, max_tw): log.error('Memory is not enough. Max tw: {}', max_tw) raise Exception('Estimated OOM') - self.peo_ints, path = self.base_ordering.get_ordering_ints(p_graph) - self.treewidth = max(path) + self._update_peo_after_slice(p_graph) return self.peo_ints, searcher.result @@ -312,14 +323,10 @@ def _split_graph(self, p_graph, max_tw): pv_cnt = len(result) log.info('Parvars count: {}. 
Amps count: {}', pv_cnt, 2**pv_cnt) - peo_ints, path = self.base_ordering.get_ordering_ints(p_graph) + peo_ints, path = self._update_peo_after_slice(p_graph) tw = max(path) - log.info('Treewidth: {}', tw) self._slice_hist.append([pv_cnt, tw]) - delta = tw - max_tw - self.treewidth = tw - return peo_ints, result From 8dd69fcb10116998cbd664b8d6c7c1bdec3a63d1 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Thu, 2 Mar 2023 14:32:29 -0600 Subject: [PATCH 039/126] add source for qc_simulation bench --- bench/qc_simulation/.gitignore | 1 + bench/qc_simulation/README.md | 6 + bench/qc_simulation/main.py | 191 ++++++++++++++ bench/qc_simulation/requirements.txt | 6 + bench/qc_simulation/scripts/README.md | 36 +++ .../qc_simulation/scripts/download_from_gh.sh | 4 + .../scripts/generate_qaoa_maxcut.sh | 3 + .../scripts/http_unzip_on_the_fly.sh | 3 + .../scripts/preprocess_qtensor.sh | 3 + bench/qc_simulation/scripts/simple_test.sh | 3 + bench/qc_simulation/src/__init__.py | 1 + .../qc_simulation/src/circuit_gen/__init__.py | 0 bench/qc_simulation/src/circuit_gen/qaoa.py | 70 ++++++ bench/qc_simulation/src/simulators/qtensor.py | 237 ++++++++++++++++++ qtensor/Simulate.py | 3 +- qtensor/contraction_backends/cupy.py | 34 +-- 16 files changed, 570 insertions(+), 31 deletions(-) create mode 100644 bench/qc_simulation/.gitignore create mode 100755 bench/qc_simulation/main.py create mode 100644 bench/qc_simulation/requirements.txt create mode 100644 bench/qc_simulation/scripts/README.md create mode 100755 bench/qc_simulation/scripts/download_from_gh.sh create mode 100755 bench/qc_simulation/scripts/generate_qaoa_maxcut.sh create mode 100755 bench/qc_simulation/scripts/http_unzip_on_the_fly.sh create mode 100755 bench/qc_simulation/scripts/preprocess_qtensor.sh create mode 100755 bench/qc_simulation/scripts/simple_test.sh create mode 100644 bench/qc_simulation/src/__init__.py create mode 100644 bench/qc_simulation/src/circuit_gen/__init__.py create mode 100644 bench/qc_simulation/src/circuit_gen/qaoa.py create mode 100644 bench/qc_simulation/src/simulators/qtensor.py diff --git a/bench/qc_simulation/.gitignore b/bench/qc_simulation/.gitignore new file mode 100644 index 00000000..8fce6030 --- /dev/null +++ b/bench/qc_simulation/.gitignore @@ -0,0 +1 @@ +data/ diff --git a/bench/qc_simulation/README.md b/bench/qc_simulation/README.md index f0866832..354933be 100644 --- a/bench/qc_simulation/README.md +++ b/bench/qc_simulation/README.md @@ -35,3 +35,9 @@ cat estimations/bris/cpu/*greedy* This shows how UNIX utilities are used to filter and present data. In SQL this would be something like `SELECT * FROM simulations WHERE ordering_algo="greedy"`. 
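A rough pandas equivalent of that query over the `index.csv` files which `main.py` appends next to each produced output (a sketch only: apart from `input` and `output`, the columns are whatever sweep flags were passed to the stage, e.g. `--O=greedy,rgreedy` in the preprocess script, and the data path is assumed):

```python
import glob
import pandas as pd

# Every output directory gets an index.csv maintained by update_index() in main.py;
# gather them all and filter, mirroring the SQL example above.
files = glob.glob("data/**/index.csv", recursive=True)
df = pd.concat((pd.read_csv(f) for f in files), ignore_index=True)
print(df[df["O"] == "greedy"][["input", "output"]])
```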
+ +## Filetypes + +- `.txt` - gate sequence as in GRCS +- `.qasm` - openqasm file +- `.jsonterms` - json file of QAOA terms (`src/circuit_gen/qaoa.py`) diff --git a/bench/qc_simulation/main.py b/bench/qc_simulation/main.py new file mode 100755 index 00000000..141508dc --- /dev/null +++ b/bench/qc_simulation/main.py @@ -0,0 +1,191 @@ +#!/usr/bin/env python3 +import sys +from pathlib import Path +from functools import wraps +import fire +def log(*args): + print(f"[main.py] ", *args, file=sys.stderr, flush=True) + +# -- Utils + +import pandas as pd +import fsspec +import itertools +from dataclasses import dataclass +import io + +@dataclass +class File: + path: Path + f: io.IOBase + +def general_glob(urlpath, **kwargs): + """General glob function to handle local and remote paths.""" + filelist = fsspec.open_files(urlpath, **kwargs) + for file in filelist: + yield file + +def is_sequence(x): + if isinstance(x, str): + return False + try: + iter(x) + return True + except TypeError: + return False + +def dict_vector_iter(**d): + """ + For each value that is a list in dict d, iterate over all possible + combinations of values. + """ + keys = d.keys() + vals = d.values() + vector_keys = [k for k, v in zip(keys, vals) if is_sequence(v)] + vector_vals = [v for v in vals if is_sequence(v)] + for instance in itertools.product(*vector_vals): + p = dict(d) + p.update(zip(vector_keys, instance)) + yield p + +def general_indexed(in_path, out_path, func, fsspec_kwargs={}, **kwargs): + """ + Arguments: + in_path: a glob-like urlpath to pass to fsspec.open_files + out_path: a string to store the output into. Optionally, + can provide formatting arguments + If no formatting arguments provided, will be treated as a directory, + I.E `/{in_file}` + otherwise, will be treated as a file, I.E. `.format(**kwargs)` + For many input files, the {in_file} argument will be provided. + This will be passed as the second argument to the function + func: a function that takes two arguments, the first being the input + file object, and the second being the output file. 
+ fsspec_kwargs: kwargs to pass to fsspec.open_files + """ + # If no formatting arguments provided, treat as directory + if "{" not in out_path: + out_pattern = f"{out_path}/{{in_file}}" + else: + out_pattern = out_path + + def unit(kwargs): + in_file = kwargs.pop("in_file") + in_path = Path(in_file.path) + out_file = out_pattern.format( + in_path=in_path, + in_file=in_path.name, + **kwargs) + out_path = Path(out_file) + # make parent dir + out_path.parent.mkdir(parents=True, exist_ok=True) + with in_file.open() as f: + fl = File(in_path, f) + changed_out = func(fl, out_file, **kwargs) + + log(f"{in_file.path} -> [{func.__name__}] -> {changed_out}") + index_file = Path(changed_out).parent / "index.csv" + update_index(index_file, input=in_file.path, output=changed_out, **kwargs) + return changed_out + + + in_path = in_path.format(**kwargs) + files = iter(general_glob(in_path, **fsspec_kwargs)) + combinations = iter(dict_vector_iter(in_file=files, **kwargs)) + return list(map(unit, combinations)) + +def update_index(index_file, **kwargs): + df = pd.DataFrame(kwargs, index=[0]) + # check if index file exists + if not (file := Path(index_file)).exists(): + # create directories if needed + file.parent.mkdir(parents=True, exist_ok=True) + + print("Creating index file") + df.to_csv(index_file, header=True, index=False) + else: + df_exist = pd.read_csv(index_file, nrows=2) + if isinstance(df_exist, pd.DataFrame): + if df_exist.columns.tolist() != df.columns.tolist(): + raise ValueError("Index file already exists but has different columns") + # append to csv + print(f"Appending to index file {index_file}") + df.to_csv(index_file, mode="a", header=False, index=False) +# -- + +from src.simulators.qtensor import preprocess as qtensor_preprocess +from src.simulators.qtensor import estimate as qtensor_estimate +from src.simulators.qtensor import simulate as qtensor_simulate +from src.circuit_gen.qaoa import generate_maxcut + +# -- Main +sim_preprocessors = { + 'qtensor': qtensor_preprocess +} + +sim_estimators = { + 'qtensor': qtensor_estimate +} + +sim_simulators = { + 'qtensor': qtensor_simulate +} + +circ_generators = { + 'qaoa_maxcut': generate_maxcut +} +class Main: + + def echo(self, in_path, out_dir, **kwargs): + """ + Simple mapper that just echoes stuff + """ + @wraps(self.echo) + def unit(in_file, out_file, **kwargs): + with open(out_file, "wb") as f: + f.write(in_file.f.read()) + return out_file + general_indexed(in_path, out_dir, unit, **kwargs) + + def generate(self, out_dir, type, **kwargs): + @wraps(self.generate) + def unit(in_file, out_file, type, **kwargs): + circ_generators[type](out_file, **kwargs) + return out_file + general_indexed('/dev/null', out_dir, unit, type=type, **kwargs) + + def preprocess(self, in_path, out_dir, sim='qtensor', **kwargs): + @wraps(self.preprocess) + def unit(in_file, out_file, sim, **kwargs): + sim_preprocessors[sim](in_file, out_file, **kwargs) + return out_file + general_indexed(in_path, out_dir, unit, sim=sim, **kwargs) + + def estimate(self, in_path, out_dir, sim='qtensor', **kwargs): + """ + Estimate the parameters of a simulator + """ + @wraps(self.estimate) + def unit(in_file, out_file, sim, **kwargs): + sim_estimators[sim](in_file, out_file, **kwargs) + return out_file + general_indexed(in_path, out_dir, unit, sim=sim, **kwargs) + + if estimate.__doc__: + # Modify doc to include info about additional parameters + estimate.__doc__ += f"\n{qtensor_estimate.__doc__.replace('Arguments:', 'Additional:')}" + + def simulate(self, in_path, out_dir, 
sim='qtensor', **kwargs): + """ + Simulate the quantum circuit + """ + @wraps(self.simulate) + def unit(in_file, out_file, **kwargs): + sim_simulators[sim](in_file, out_file, **kwargs) + return out_file + general_indexed(in_path, out_dir, unit, sim=sim, **kwargs) + + +if __name__ == "__main__": + fire.core.Display = lambda lines, out: print(*lines, file=out) + fire.Fire(Main) diff --git a/bench/qc_simulation/requirements.txt b/bench/qc_simulation/requirements.txt new file mode 100644 index 00000000..520efee0 --- /dev/null +++ b/bench/qc_simulation/requirements.txt @@ -0,0 +1,6 @@ +fire +fsspec +pandas +qiskit +aiohttp +cupy diff --git a/bench/qc_simulation/scripts/README.md b/bench/qc_simulation/scripts/README.md new file mode 100644 index 00000000..d452df23 --- /dev/null +++ b/bench/qc_simulation/scripts/README.md @@ -0,0 +1,36 @@ +# Scripts + +These are example and helper scripts + +## Examples + +### Download via http, unpack on the fly + +``` +╰─λ ./scripts/http_unzip_on_the_fly.sh +[main.py] bris_5_24_0.txt -> [echo] -> circuits/bris/bris_5_24_0.txt_dummy1.circ +[main.py] bris_5_24_0.txt -> [echo] -> circuits/bris/bris_5_24_0.txt_dummy2.circ +[main.py] bris_5_28_0.txt -> [echo] -> circuits/bris/bris_5_28_0.txt_dummy1.circ +[main.py] bris_5_28_0.txt -> [echo] -> circuits/bris/bris_5_28_0.txt_dummy2.circ +[main.py] bris_5_32_0.txt -> [echo] -> circuits/bris/bris_5_32_0.txt_dummy1.circ +[main.py] bris_5_32_0.txt -> [echo] -> circuits/bris/bris_5_32_0.txt_dummy2.circ +[main.py] bris_5_36_0.txt -> [echo] -> circuits/bris/bris_5_36_0.txt_dummy1.circ +[main.py] bris_5_36_0.txt -> [echo] -> circuits/bris/bris_5_36_0.txt_dummy2.circ +[main.py] bris_5_40_0.txt -> [echo] -> circuits/bris/bris_5_40_0.txt_dummy1.circ +[main.py] bris_5_40_0.txt -> [echo] -> circuits/bris/bris_5_40_0.txt_dummy2.circ +╰─λ tree circuits/ +circuits/ +└── bris + ├── bris_5_24_0.txt_dummy1.circ + ├── bris_5_24_0.txt_dummy2.circ + ├── bris_5_28_0.txt_dummy1.circ + ├── bris_5_28_0.txt_dummy2.circ + ├── bris_5_32_0.txt_dummy1.circ + ├── bris_5_32_0.txt_dummy2.circ + ├── bris_5_36_0.txt_dummy1.circ + ├── bris_5_36_0.txt_dummy2.circ + ├── bris_5_40_0.txt_dummy1.circ + └── bris_5_40_0.txt_dummy2.circ + +2 directories, 10 file +``` diff --git a/bench/qc_simulation/scripts/download_from_gh.sh b/bench/qc_simulation/scripts/download_from_gh.sh new file mode 100755 index 00000000..2d7e3c51 --- /dev/null +++ b/bench/qc_simulation/scripts/download_from_gh.sh @@ -0,0 +1,4 @@ +#!/bin/bash +# +#./main.py echo tar://bris_5/bris*_24_0.txt::github://danlkv:GRCS@/inst/bristlecone/cz_v2/bris_5.tar.gz circuits/bris/\{in_file\}_dummy\{dummy\}.circ --dummy=1,2 +./main.py echo github://danlkv:GRCS@/inst/bristlecone/cz_v2/bris_11.tar.gz data/circuits/bris11/\{in_file\}.circ diff --git a/bench/qc_simulation/scripts/generate_qaoa_maxcut.sh b/bench/qc_simulation/scripts/generate_qaoa_maxcut.sh new file mode 100755 index 00000000..458b5237 --- /dev/null +++ b/bench/qc_simulation/scripts/generate_qaoa_maxcut.sh @@ -0,0 +1,3 @@ +#!/bin/bash +# +./main.py generate data/circuits/qaoa/maxcut_regular_N{N}_p{p} --type=qaoa_maxcut --N=8,12,16,24,32,48,64 --p=1,2,3,4,5 --d=3 diff --git a/bench/qc_simulation/scripts/http_unzip_on_the_fly.sh b/bench/qc_simulation/scripts/http_unzip_on_the_fly.sh new file mode 100755 index 00000000..17f21dfd --- /dev/null +++ b/bench/qc_simulation/scripts/http_unzip_on_the_fly.sh @@ -0,0 +1,3 @@ +#!/bin/bash +# +./main.py echo tar://*0.txt::https://github.com/danlkv/GRCS/raw/master/inst/bristlecone/cz_v2/bris_5.tar.gz 
data/circuits/bris/\{in_file\}_dummy{dummy}.circ --dummy=1,2 diff --git a/bench/qc_simulation/scripts/preprocess_qtensor.sh b/bench/qc_simulation/scripts/preprocess_qtensor.sh new file mode 100755 index 00000000..41a45b7d --- /dev/null +++ b/bench/qc_simulation/scripts/preprocess_qtensor.sh @@ -0,0 +1,3 @@ +#!/bin/bash +# +./main.py preprocess tar://*0.txt::https://github.com/danlkv/GRCS/raw/master/inst/bristlecone/cz_v2/bris_5.tar.gz data/preprocess/bris/\{in_file\}_oalgo{O}.circ --O=greedy,rgreedy --sim=qtensor diff --git a/bench/qc_simulation/scripts/simple_test.sh b/bench/qc_simulation/scripts/simple_test.sh new file mode 100755 index 00000000..ac8409cb --- /dev/null +++ b/bench/qc_simulation/scripts/simple_test.sh @@ -0,0 +1,3 @@ +#!/bin/bash +# +./main.py echo github://danlkv:GRCS@/inst/bristlecone/cz_v2/bris_9/*0.txt data/circuits/bris/\{in_file\}_dummy\{dummy\}.circ --dummy=1,2 diff --git a/bench/qc_simulation/src/__init__.py b/bench/qc_simulation/src/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/bench/qc_simulation/src/__init__.py @@ -0,0 +1 @@ + diff --git a/bench/qc_simulation/src/circuit_gen/__init__.py b/bench/qc_simulation/src/circuit_gen/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/bench/qc_simulation/src/circuit_gen/qaoa.py b/bench/qc_simulation/src/circuit_gen/qaoa.py new file mode 100644 index 00000000..6f5a002d --- /dev/null +++ b/bench/qc_simulation/src/circuit_gen/qaoa.py @@ -0,0 +1,70 @@ +import networkx +import numpy as np + +def generate_ibm_connectivity(arch): + """ + Generate a connectivity graph from an IBM architecture + + Args: + arch (str): one of ["eagle", "falcon"] + """ + supported_archs = ["eagle", "falcon"] + if arch not in supported_archs: + raise ValueError("Architecture {} not supported".format(arch)) + + def coupling_map_from_provider(p_class): + p = p_class() + graph = p.coupling_map.graph.to_undirected() + elist = list(graph.edge_list()) + G = networkx.from_edgelist(elist) + return G + + if arch == "eagle": + # IBM quantum volume 64 + from qiskit.providers.fake_provider import FakeWashingtonV2 + return coupling_map_from_provider(FakeWashingtonV2) + if arch == "eagle": + # IBM quantum volume 64 + from qiskit.providers.fake_provider import FakeCairoV2 + return coupling_map_from_provider(FakeCairoV2) + +def save_terms_format(file, terms): + """ + Save terms in a format that can be read by the qtensor simulator Takes a + list of terms in format `(coeff, [qubits])` and saves it to a file + """ + import json + filename = file + '.jsonterms' + with open(filename, "w") as f: + json.dump(terms, f) + return filename + +def generate_graph(n, d, type="random"): + if type == "random": + return networkx.random_regular_graph(d, n) + elif type[:4] == "ibm_": + arch = type[4:] + return generate_ibm_connectivity(arch) + +def generate_maxcut(out_file, N, p, d, graph_type='random', seed=None): + """ + Generate a random regular maxcut problem + + Args: + out_file (str): Path to output file + N (int): Number of nodes + p (int): Number of layers + d (int): Random regular graph degree + + Returns: + str: Path to output file + """ + G = generate_graph(N, d, graph_type) + terms = [] + for u, v in G.edges: + terms.append((1, (u, v))) + gamma = np.random.uniform(0, 2 * np.pi, p) + beta = np.random.uniform(0, np.pi, p) + pb = {"terms": terms, "gamma": gamma.tolist(), "beta": beta.tolist()} + + return save_terms_format(out_file, pb) diff --git a/bench/qc_simulation/src/simulators/qtensor.py 
b/bench/qc_simulation/src/simulators/qtensor.py new file mode 100644 index 00000000..99bbd3b3 --- /dev/null +++ b/bench/qc_simulation/src/simulators/qtensor.py @@ -0,0 +1,237 @@ +import qtensor +import qtree +import numpy as np + +# -- QAOA generic parser + +class QAOAComposer(qtensor.DefaultQAOAComposer): + def __init__(self, N, terms, **kwargs): + self.n_qubits = N + # from ccomp (Can't call DefaultQAOA Composer since need graph) + self.builder = self._get_builder() + # gamma and beta + self.params = kwargs + # + self.terms = terms + self.qubit_map = {n: i for i, n in enumerate(range(N))} + + def cost_operator_circuit(self, gamma): + for factor, term in self.terms: + t_mapped = [self.qubit_map[i] for i in term] + self.append_Z_term(term, gamma) + + def append_Z_term(self, term, gamma): + if len(term) == 2: + self.apply_gate(self.operators.ZZ, term[0], term[1], alpha=2*gamma) + #self.apply_gate(qtensor.OpFactory.ZZFull, term[0], term[1], alpha=2*gamma) + elif len(term) == 4: + self.apply_gate(self.operators.Z4, *term, alpha=2*gamma) + else: + raise ValueError(f"Invalid QAOA term length: {len(term)}") + + def mixer_operator(self, beta): + qubits = self.qubit_map.values() + for qubit in qubits: + self.x_term(qubit, beta) + +def parse_qaoa(data): + import json + data = json.loads(data) + terms = data["terms"] + gamma = np.array(data["gamma"])/np.pi/2 + beta = np.array(data["beta"])/np.pi + N = len(set(sum([t[1] for t in terms], []))) + composer = QAOAComposer(N, terms, gamma=gamma, beta=beta) + composer.ansatz_state() + return composer.circuit +# -- + +def read_circ(circ_f, type=None): + + if type is None: + type = circ_f.path.name.split(".")[-1] + + print("Reading circuit of type", type) + if type == "jsonterms": + b = circ_f.f.read() + return parse_qaoa(b) + + elif type == "qasm": + from qiskit import QuantumCircuit + b = circ_f.f.read() + str = b.decode('utf-8') + + qiskit_circuit = QuantumCircuit.from_qasm_str(str) + return qtree.operators.from_qiskit_circuit(qiskit_circuit) + else: + b = circ_f.f.read() + str = b.decode('utf-8') + import io + f = io.StringIO(str) + N, circ = qtree.operators.read_circuit_stream(f) + return sum(circ, []) + +def read_preps(prep_f): + import pickle + return pickle.load(prep_f.f) + +def write_preps(peo, prep_f): + import pickle + pickle.dump(peo, open(prep_f, 'wb')) + +def write_json(data, out_file): + import json + with open(out_file, 'w') as f: + json.dump(data, f) + # This newline plays nice when cat-ing multiple files + f.write('\n') + +def preprocess(in_file, out_file, O='greedy', S=None, M=30, after_slice='run-again'): + """ + Arguments: + in_file: input file + out_file: output file + O: ordering algorithm + S: slicing algorithm + M: Memory limit for slicing + """ + circ = read_circ(in_file) + tn = qtensor.optimisation.QtreeTensorNet.from_qtree_gates(circ) + opt = qtensor.toolbox.get_ordering_algo(O) + if S: + # ignore argument type mismatch for pyright -- opt can be `Optimizer` + # pyright: reportGeneralTypeIssues=false + opt = qtensor.optimisation.TreeTrimSplitter( + tw_bias=0, max_tw=M, base_ordering=opt, + peo_after_slice_strategy=after_slice + ) + + peo, par_vars, _ = opt.optimize(tn) + # --dbg + import networkx as nx + graph = tn.get_line_graph() + ignore_vars = tn.bra_vars + tn.ket_vars + for pv in par_vars: + graph.remove_node(int(pv)) + components = list(nx.connected_components(graph)) + print(f"Sliced graph # nodes: {graph.number_of_nodes()} and #components: {len(components)} with sizes {[len(c) for c in components]}") + print(f"peo size 
without par_vars and ignore_vars: {len(peo) - len(par_vars) - len(ignore_vars)}") + def inspect_node(g, n): + neighbors = sorted(list(g.neighbors(n))) + return f"{n} -> {len(neighbors)}({neighbors[0]}::{neighbors[-1]})" + # inspect first 10 nodes + graph, label_dict = qtree.graph_model.relabel_graph_nodes( + graph, dict(zip(opt.peo_ints, range(graph.number_of_nodes()))) + ) + for n in sorted(list(graph.nodes()))[127*2:127*4]: + print(inspect_node(graph, n), end='; ', flush=True) + print() + # -- + else: + peo, _ = opt.optimize(tn) + par_vars = [] + print("W", opt.treewidth) + # -- qtensor_estim + prep_data = (peo, par_vars, tn) + write_preps(prep_data, out_file) + + +def estimate(in_file, out_file, C=100, M=30, F=1e12, T=1e9, **kwargs): + """ + Arguments: + in_file: file with preprocessed data + out_file: file to write the results to + C: Compression ratio + M: Memory limit in log2(b/16) + F: assumed FLOPS + T: Throughput of compression + """ + from qtensor.compression.cost_estimation import compressed_contraction_cost, Cost + from dataclasses import asdict + import json + prep_data = read_preps(in_file) + peo, par_vars, tn = prep_data + + tn.slice({i: slice(0, 1) for i in par_vars}) + peo = peo[:len(peo) - len(par_vars)] + costs: list[Cost] = compressed_contraction_cost(tn, peo, mem_limit=M, compression_ratio=C) + totals: Cost = sum(costs[1:], costs[0]) + time = totals.time(F, T, T, M) + C = asdict(totals) + C['time'] = time*2**len(par_vars) + print("C", C) + out_file += ".json" + write_json(C, out_file) + return out_file + +def simulate(in_file, out_file, backend='einsum', compress=None, M=29, **kwargs): + """ + Args: + in_file: file with preprocessed data + out_file: file to write the results to + backend: backend to use + compress: compression algorithm + M: memory threshold for compression + """ + import time + from qtensor.contraction_algos import bucket_elimination + import cupy + cupy.cuda.profiler.start() + prep_data = read_preps(in_file) + peo, par_vars, tn = prep_data + + backend = qtensor.contraction_backends.get_backend(backend) + if compress is not None: + if compress == 'szx': + compressor = qtensor.compression.CUSZCompressor(r2r_error=1e-2, r2r_threshold=1e-2) + else: + raise ValueError(f"Unknown compression algorithm: {compress}") + backend = qtensor.contraction_backends.CompressionBackend(backend, compressor, M) + + relabelid = {} + for tensor in tn.tensors: + for i in tensor.indices: + relabelid[int(i)] = i + + slice_ext = {relabelid[int(i)]: 0 for i in par_vars} + + if len(par_vars) > 0: + print("Parvars", par_vars) + print(f"Detected {len(par_vars)} slice variables") + sim = qtensor.QtreeSimulator(backend=backend) + sim.tn = tn + sim.tn.backend = backend + sim.peo = peo + sim._slice_relabel_buckets(slice_ext) + buckets = sim.tn.buckets + # --dbg + ignore_vars = sim.tn.bra_vars + sim.tn.ket_vars + graph = qtree.graph_model.importers.buckets2graph(buckets, ignore_vars) + graph, label_dict = qtree.graph_model.relabel_graph_nodes( + graph, dict(zip(graph.nodes, np.array(list(graph.nodes)) - 127*2)) + ) + import networkx as nx + components = list(nx.connected_components(graph)) + print(f"Sliced graph # nodes: {graph.number_of_nodes()} and #components: {len(components)} with sizes {[len(c) for c in components]}") + print(f"peo size without par_vars and ignore_vars: {len(peo) - len(ignore_vars)}") + # -- + + start = time.time() + for i in range(2**0): + print(f"P {i}", end='', flush=True) + bcopy = [b[:] for b in buckets] + res = bucket_elimination( + bcopy, backend, + 
n_var_nosum=len(tn.free_vars) + ) + del bcopy + print("Result", res.data.flatten()[0]) + time.sleep(0.5) + print("Simulation result:", backend.get_result_data(res).flatten()[0]) + end = time.time() + print("D", end - start) + out_file += ".json" + C = {'time': end - start} + write_json(C, out_file) + cupy.cuda.profiler.stop() + return out_file diff --git a/qtensor/Simulate.py b/qtensor/Simulate.py index 285e51d1..0b24629b 100644 --- a/qtensor/Simulate.py +++ b/qtensor/Simulate.py @@ -121,7 +121,7 @@ def prepare_buckets(self, qc, batch_vars=0, peo=None): self.peo = peo self._slice_relabel_buckets() - def _slice_relabel_buckets(self): + def _slice_relabel_buckets(self, slice_extension={}): """ Relabels peo according to bucket indices. Assumes self.tn and self.peo exists @@ -133,6 +133,7 @@ def _slice_relabel_buckets(self): self._reorder_buckets() slice_dict = self._get_slice_dict() + slice_dict.update(slice_extension) #log.info('batch slice {}', slice_dict) sliced_buckets = self.tn.slice(slice_dict) diff --git a/qtensor/contraction_backends/cupy.py b/qtensor/contraction_backends/cupy.py index 4a3b4c56..61fca322 100644 --- a/qtensor/contraction_backends/cupy.py +++ b/qtensor/contraction_backends/cupy.py @@ -98,43 +98,17 @@ def get_sliced_buckets(self, buckets, data_dict, slice_dict): # transpose_order = np.argsort(list(map(int, tensor.indices))) # cp.argsort requires input to be cp array #print(tensor.indices) - transpose_order = cp.argsort(cp.asarray(list(map(int, tensor.indices)))).tolist() - transpose_order = list(reversed(transpose_order)) - - ''' - Change 2: - Original: Data is all converted into torch.tensor and use torch api, the results are in torch - New: Convert all data to CuPy.ndarray, will raise exceptional signal - ''' + out_indices = list(sorted(tensor.indices, key=int, reverse=True)) data = data_dict[tensor.data_key] + data, new_indices = slice_numpy_tensor(data, tensor.indices, out_indices, slice_dict) + # transpose indices try: data = cp.asarray(data) - data = data.transpose(tuple(transpose_order)) except: print("CuPy Backend doesn't support gradient.") - - # transpose indices - indices_sorted = [tensor.indices[pp] - for pp in transpose_order] - - # slice data - slice_bounds = [] - for idx in indices_sorted: - try: - slice_bounds.append(slice_dict[idx]) - except KeyError: - slice_bounds.append(slice(None)) - - data = data[tuple(slice_bounds)] - - # update indices - indices_sliced = [idx.copy(size=size) for idx, size in - zip(indices_sorted, data.shape)] - indices_sliced = [i for sl, i in zip(slice_bounds, indices_sliced) if not isinstance(sl, int)] - assert len(data.shape) == len(indices_sliced) sliced_bucket.append( - tensor.copy(indices=indices_sliced, data=data)) + tensor.copy(indices=new_indices, data=data)) sliced_buckets.append(sliced_bucket) return sliced_buckets From cdde85242becd19661bc13267766e35f30c157c2 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Thu, 2 Mar 2023 14:37:16 -0600 Subject: [PATCH 040/126] modify slicing algo to change update_peo strategy; support slice_ext_dict --- qtensor/Simulate.py | 4 +- qtensor/compression/compressed_contraction.py | 2 +- qtensor/contraction_backends/common.py | 3 +- qtensor/contraction_backends/cupy.py | 2 +- qtensor/optimisation/Optimizer.py | 39 ++++++++++++++++--- qtensor/optimisation/__init__.py | 2 +- 6 files changed, 41 insertions(+), 11 deletions(-) diff --git a/qtensor/Simulate.py b/qtensor/Simulate.py index 0b24629b..ec258e8b 100644 --- a/qtensor/Simulate.py +++ b/qtensor/Simulate.py @@ -131,10 +131,10 @@ def 
_slice_relabel_buckets(self, slice_extension={}): self.peo = [identity_map[int(i)] for i in self.peo] - self._reorder_buckets() + perm_dict = self._reorder_buckets() slice_dict = self._get_slice_dict() + slice_extension = {perm_dict[k]: v for k, v in slice_extension.items()} slice_dict.update(slice_extension) - #log.info('batch slice {}', slice_dict) sliced_buckets = self.tn.slice(slice_dict) #self.backend.pbar.set_total ( len(sliced_buckets)) diff --git a/qtensor/compression/compressed_contraction.py b/qtensor/compression/compressed_contraction.py index c6648f24..e2b3e527 100644 --- a/qtensor/compression/compressed_contraction.py +++ b/qtensor/compression/compressed_contraction.py @@ -158,7 +158,7 @@ def compressed_sum(A:Tensor, sum_ixs, # -- Early return: if no need to compress, do the regular contraction if len(need_compressed)==0 and len(exist_compressed)==0: C = Tensor.empty(new_tensor_name, result_indices) - sum_axes = [A.indices.index(i) for i in sum_ixs] + sum_axes = tuple([A.indices.index(i) for i in sum_ixs]) C.data = A.data.sum(axis=sum_axes) return C # -- diff --git a/qtensor/contraction_backends/common.py b/qtensor/contraction_backends/common.py index bf132ca7..d09441e7 100644 --- a/qtensor/contraction_backends/common.py +++ b/qtensor/contraction_backends/common.py @@ -44,7 +44,8 @@ def slice_numpy_tensor(data:np.ndarray, indices_in, indices_out, slice_dict): i for sl, i in zip(slice_bounds, indices_in) if not isinstance(sl, int) ] indices_sized = [v.copy(size=size) for v, size in zip(indices_sliced, s_data.shape)] - assert len(indices_out) == len(s_data.shape) + indices_out = [v for v in indices_out if not isinstance(slice_dict.get(v, None), int)] + assert len(indices_sized) == len(s_data.shape) assert len(indices_sliced) == len(s_data.shape) st_data = permute_np_tensor_data(s_data, indices_sliced, indices_out) return st_data, indices_out diff --git a/qtensor/contraction_backends/cupy.py b/qtensor/contraction_backends/cupy.py index 61fca322..ee4e4b6c 100644 --- a/qtensor/contraction_backends/cupy.py +++ b/qtensor/contraction_backends/cupy.py @@ -103,7 +103,7 @@ def get_sliced_buckets(self, buckets, data_dict, slice_dict): data, new_indices = slice_numpy_tensor(data, tensor.indices, out_indices, slice_dict) # transpose indices try: - data = cp.asarray(data) + data = cp.asarray(data, dtype=cp.complex64) except: print("CuPy Backend doesn't support gradient.") diff --git a/qtensor/optimisation/Optimizer.py b/qtensor/optimisation/Optimizer.py index 7af151f7..44ec946b 100644 --- a/qtensor/optimisation/Optimizer.py +++ b/qtensor/optimisation/Optimizer.py @@ -189,11 +189,40 @@ def _get_max_tw(self): # tw = log(cost/16) = log(cost) - 4 return int(np.log2(avail)) - 4 - def _update_peo_after_slice(self, p_graph): + def _update_peo_after_slice(self, p_graph, slice_vars): if self.peo_after_slice_strategy == 'run-again': peo_ints, path = self.base_ordering.get_ordering_ints(p_graph) elif self.peo_after_slice_strategy == 'TD-reuse': - pass + # Remove sliced vars from TD graph. 
Then, reconstruct peo from this TD + peo_old = self.peo_ints + peo_ints = [i for i in peo_old if i not in slice_vars] + nodes, path = qtensor.utils.get_neighbors_path(p_graph, peo_ints) + # -- Tree re-peo + g_components = list(nx.connected_components(p_graph)) + print(f"# of components: {len(g_components)}, # of nodes total: {p_graph.number_of_nodes()}, # of nodes per component: {[len(c) for c in g_components]}") + from qtree.graph_model.clique_trees import ( + get_tree_from_peo, get_peo_from_tree) + tree = get_tree_from_peo(p_graph, peo_ints) + clique_vertices = [] + print("Calling get_peo_from_tree") + # ---- re-create peo from tree + peo_recreate = [] + components = list(nx.connected_components(tree)) + print("# of components: ", len(components)) + for subtree in components: + peo_recreate += get_peo_from_tree(tree.subgraph(subtree).copy(), clique_vertices=clique_vertices) + # ---- + nodes, path_recreate = qtensor.utils.get_neighbors_path(p_graph, peo_recreate) + log.info(f"Re-created peo width from tree: {max(path_recreate)}") + if max(path_recreate) < max(path): + log.info("Re-created peo is better than old peo. Using new peo.") + peo_ints = peo_recreate + path = path_recreate + # -- + + else: + raise ValueError('Unknown peo_after_slice_strategy: {}' + .format(self.peo_after_slice_strategy)) self.peo_ints = peo_ints self.treewidth = max(path) @@ -206,7 +235,7 @@ def _split_graph(self, p_graph, max_tw): #nodes, path = utils.get_neighbors_path(graph, peo=peo_ints) tw = self.treewidth if tw < max_tw: - log.info('Found parvars: {}', searcher.result) + log.info(f'Found {len(searcher.result)} parvars: {searcher.result}') break if self.max_slice is not None: if len(searcher.result) > self.max_slice: @@ -218,7 +247,7 @@ def _split_graph(self, p_graph, max_tw): log.error('Memory is not enough. Max tw: {}', max_tw) raise Exception('Estimated OOM') - self._update_peo_after_slice(p_graph) + self._update_peo_after_slice(p_graph, searcher.result) return self.peo_ints, searcher.result @@ -323,7 +352,7 @@ def _split_graph(self, p_graph, max_tw): pv_cnt = len(result) log.info('Parvars count: {}. 
Amps count: {}', pv_cnt, 2**pv_cnt) - peo_ints, path = self._update_peo_after_slice(p_graph) + peo_ints, path = self._update_peo_after_slice(p_graph, result) tw = max(path) self._slice_hist.append([pv_cnt, tw]) delta = tw - max_tw diff --git a/qtensor/optimisation/__init__.py b/qtensor/optimisation/__init__.py index f36361c1..b9323eb5 100644 --- a/qtensor/optimisation/__init__.py +++ b/qtensor/optimisation/__init__.py @@ -2,7 +2,7 @@ from qtensor.optimisation.Optimizer import TamakiTrimSlicing, TamakiOptimizer from qtensor.optimisation.Optimizer import GreedyOptimizer, WithoutOptimizer -from qtensor.optimisation.Optimizer import Optimizer, SlicesOptimizer +from qtensor.optimisation.Optimizer import Optimizer, SlicesOptimizer, TreeTrimSplitter from qtensor.optimisation.Greedy import GreedyParvars from qtensor.optimisation.late_parallelisation import LateParOptimizer From e43c2a1b1caf599cd4f7c248ea02ec6b51350175 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Thu, 2 Mar 2023 14:43:45 -0600 Subject: [PATCH 041/126] move qtensor profile bench to subfolder --- bench/qc_simulation/qtensor/test_circuits.py | 20 ------------------- .../simulators/qtensor_profile.py} | 4 ++-- 2 files changed, 2 insertions(+), 22 deletions(-) delete mode 100644 bench/qc_simulation/qtensor/test_circuits.py rename bench/qc_simulation/{qtensor/run.py => src/simulators/qtensor_profile.py} (99%) diff --git a/bench/qc_simulation/qtensor/test_circuits.py b/bench/qc_simulation/qtensor/test_circuits.py deleted file mode 100644 index 884ca10c..00000000 --- a/bench/qc_simulation/qtensor/test_circuits.py +++ /dev/null @@ -1,20 +0,0 @@ -import qtensor -import numpy as np -import networkx as nx - -def get_qaoa_graph_params(n=10, p=2, d=3, type='random', seed=10): - if type == 'random': - G = nx.random_regular_graph(d, n, seed=seed) - elif type == 'grid2d': - G = nx.grid_2d_graph(n,n) - elif type == 'line': - G = nx.Graph() - G.add_edges_from(zip(range(n-1), range(1, n))) - gamma, beta = [np.pi/5]*p, [np.pi/2]*p - return G, gamma, beta - -def gen_qaoa_maxcut_circuit(n=10, p=2, d=3, type='random', seed=10): - G, gamma, beta = get_qaoa_graph_params(n, p, d, type, seed) - composer = qtensor.QtreeQAOAComposer(graph=G, gamma=gamma, beta=beta) - composer.ansatz_state() - return composer.circuit diff --git a/bench/qc_simulation/qtensor/run.py b/bench/qc_simulation/src/simulators/qtensor_profile.py similarity index 99% rename from bench/qc_simulation/qtensor/run.py rename to bench/qc_simulation/src/simulators/qtensor_profile.py index 9c7ae7e0..e48e1774 100644 --- a/bench/qc_simulation/qtensor/run.py +++ b/bench/qc_simulation/src/simulators/qtensor_profile.py @@ -72,8 +72,8 @@ def mean_mmax(x: list): return np.mean(x) def main(): - Ns = [24] - p = 15 + Ns = [30] + p = 10 ordering_algo = 'greedy' repeats = 2 top_K = 15 From e4113c309aa471ea2feb72c29e8b8d2e10983d2c Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Thu, 2 Mar 2023 14:50:16 -0600 Subject: [PATCH 042/126] move compression tests --- qtensor/compression/{ => tests}/test_compressed_contract.py | 0 qtensor/compression/{ => tests}/test_compressed_tensor.py | 0 qtensor/compression/{ => tests}/test_cost_estimation.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename qtensor/compression/{ => tests}/test_compressed_contract.py (100%) rename qtensor/compression/{ => tests}/test_compressed_tensor.py (100%) rename qtensor/compression/{ => tests}/test_cost_estimation.py (100%) diff --git a/qtensor/compression/test_compressed_contract.py 
b/qtensor/compression/tests/test_compressed_contract.py similarity index 100% rename from qtensor/compression/test_compressed_contract.py rename to qtensor/compression/tests/test_compressed_contract.py diff --git a/qtensor/compression/test_compressed_tensor.py b/qtensor/compression/tests/test_compressed_tensor.py similarity index 100% rename from qtensor/compression/test_compressed_tensor.py rename to qtensor/compression/tests/test_compressed_tensor.py diff --git a/qtensor/compression/test_cost_estimation.py b/qtensor/compression/tests/test_cost_estimation.py similarity index 100% rename from qtensor/compression/test_cost_estimation.py rename to qtensor/compression/tests/test_cost_estimation.py From 10ed7df9396e1bac82a397d1d957725afb89d513 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Thu, 2 Mar 2023 14:54:58 -0600 Subject: [PATCH 043/126] move Compressor to a separate file --- qtensor/compression/CompressedTensor.py | 104 +---------------- qtensor/compression/Compressor.py | 105 ++++++++++++++++++ qtensor/compression/__init__.py | 2 +- qtensor/compression/pytest.ini | 3 + .../tests/test_compressed_tensor.py | 2 +- 5 files changed, 111 insertions(+), 105 deletions(-) create mode 100644 qtensor/compression/Compressor.py create mode 100644 qtensor/compression/pytest.ini diff --git a/qtensor/compression/CompressedTensor.py b/qtensor/compression/CompressedTensor.py index 9a92170e..3f9181d2 100644 --- a/qtensor/compression/CompressedTensor.py +++ b/qtensor/compression/CompressedTensor.py @@ -1,21 +1,8 @@ import itertools import numpy as np -import io from qtree.optimizer import Tensor from qtree.system_defs import NP_ARRAY_TYPE -import sys -from pathlib import Path -print(Path(__file__).parent/'szx/src/') -sys.path.append(str(Path(__file__).parent/'szx/src/')) -sys.path.append('./szx/src') - -try: - from cuszx_wrapper import cuszx_host_compress, cuszx_host_decompress, cuszx_device_compress, cuszx_device_decompress -except: - # Silently fail on missing build of cuszx - pass - -CUSZX_BLOCKSIZE = 256 +from .Compressor import NumpyCompressor, Compressor def iterate_indices(indices: list): if len(indices)==0: @@ -23,95 +10,6 @@ def iterate_indices(indices: list): ranges = [range(v.size) for v in indices] return itertools.product(*ranges) -class Compressor(): - def compress(self, data): - raise NotImplementedError - - def decompress(self, ptr): - raise NotImplementedError - -class NumpyCompressor(Compressor): - def compress(self, data): - comp = io.BytesIO() - np.savez_compressed(comp, data) - return comp - - def decompress(self, ptr): - ptr.seek(0) - return np.load(ptr)['arr_0'] - -class CUSZCompressor(Compressor): - def __init__(self, r2r_error=1e-3, r2r_threshold=1e-3): - self.r2r_error = r2r_error - self.r2r_threshold = r2r_threshold - - def compress(self, data): - import cupy - if isinstance(data, cupy.ndarray): - isCuPy = True - else: - isCuPy = False - num_elements = data.size - # Adapt numele depending on itemsize - itemsize = data.dtype.itemsize - num_elements_eff = int(num_elements*itemsize/4) - - dtype = data.dtype - cmp_bytes, outSize_ptr = self.cuszx_compress(isCuPy, data, num_elements_eff, self.r2r_error, self.r2r_threshold) - return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype) - - def decompress(self, obj): - import cupy - import ctypes - cmp_bytes, num_elements_eff, isCuPy, shape, dtype = obj - decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, num_elements_eff) - # -- Workaround to convert GPU pointer to int - p_decompressed_ptr = 
ctypes.addressof(decompressed_ptr) - # cast to int64 pointer - # (effectively converting pointer to pointer to addr to pointer to int64) - p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) - decompressed_int = p_decompressed_int.contents - # -- - mem = cupy.cuda.UnownedMemory(decompressed_int.value, num_elements_eff, self, device_id=0) - mem_ptr = cupy.cuda.memory.MemoryPointer(mem, 0) - arr = cupy.ndarray(shape, dtype=dtype, memptr=mem_ptr) - return arr - - ### Compression API with cuSZx ### - # Parameters: - # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array - # - data = Numpy or Cupy ndarray, assumed to be 1-D, np.float32 type - # - num_elements = Number of floating point elements in data - # - r2r_error = relative-to-value-range error bound for lossy compression - # - r2r_threshold = relative-to-value-range threshold to floor values to zero - # Returns: - # - cmp_bytes = Unsigned char pointer to compressed bytes - # - outSize_ptr = Pointer to size_t representing length in bytes of cmp_bytes - def cuszx_compress(self, isCuPy, data, num_elements, r2r_error, r2r_threshold): - - if not isCuPy: - cmp_bytes, outSize_ptr = cuszx_host_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) - else: - cmp_bytes, outSize_ptr = cuszx_device_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) - return cmp_bytes, outSize_ptr - - ### Decompression API with cuSZx ### - # Parameters: - # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array - # - cmp_bytes = Unsigned char pointer to compressed bytes - # - num_elements = Number of floating point elements in original data - # Returns: - # - decompressed_data = Float32 pointer to decompressed data - # - # Notes: Use ctypes to cast decompressed data to Numpy or CuPy type - - def cuszx_decompress(self, isCuPy, cmp_bytes, num_elements): - if not isCuPy: - decompressed_data = cuszx_host_decompress(num_elements, cmp_bytes) - else: - decompressed_data = cuszx_device_decompress(num_elements, cmp_bytes) - - return decompressed_data class CompressedTensor(Tensor): """ diff --git a/qtensor/compression/Compressor.py b/qtensor/compression/Compressor.py new file mode 100644 index 00000000..2430a135 --- /dev/null +++ b/qtensor/compression/Compressor.py @@ -0,0 +1,105 @@ +import io +import sys +import numpy as np +from pathlib import Path +print(Path(__file__).parent/'szx/src/') +sys.path.append(str(Path(__file__).parent/'szx/src/')) +sys.path.append('./szx/src') + +try: + from cuszx_wrapper import cuszx_host_compress, cuszx_host_decompress, cuszx_device_compress, cuszx_device_decompress +except: + # Silently fail on missing build of cuszx + pass + +CUSZX_BLOCKSIZE = 256 + +class Compressor(): + def compress(self, data): + raise NotImplementedError + + def decompress(self, ptr): + raise NotImplementedError + +class NumpyCompressor(Compressor): + def compress(self, data): + comp = io.BytesIO() + np.savez_compressed(comp, data) + return comp + + def decompress(self, ptr): + ptr.seek(0) + return np.load(ptr)['arr_0'] + +class CUSZCompressor(Compressor): + def __init__(self, r2r_error=1e-3, r2r_threshold=1e-3): + self.r2r_error = r2r_error + self.r2r_threshold = r2r_threshold + + def compress(self, data): + import cupy + if isinstance(data, cupy.ndarray): + isCuPy = True + else: + isCuPy = False + num_elements = data.size + # Adapt numele depending on itemsize + itemsize = data.dtype.itemsize + num_elements_eff = int(num_elements*itemsize/4) + + dtype = 
data.dtype + cmp_bytes, outSize_ptr = self.cuszx_compress(isCuPy, data, num_elements_eff, self.r2r_error, self.r2r_threshold) + return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype) + + def decompress(self, obj): + import cupy + import ctypes + cmp_bytes, num_elements_eff, isCuPy, shape, dtype = obj + decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, num_elements_eff) + # -- Workaround to convert GPU pointer to int + p_decompressed_ptr = ctypes.addressof(decompressed_ptr) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + decompressed_int = p_decompressed_int.contents + # -- + mem = cupy.cuda.UnownedMemory(decompressed_int.value, num_elements_eff, self, device_id=0) + mem_ptr = cupy.cuda.memory.MemoryPointer(mem, 0) + arr = cupy.ndarray(shape, dtype=dtype, memptr=mem_ptr) + return arr + + ### Compression API with cuSZx ### + # Parameters: + # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array + # - data = Numpy or Cupy ndarray, assumed to be 1-D, np.float32 type + # - num_elements = Number of floating point elements in data + # - r2r_error = relative-to-value-range error bound for lossy compression + # - r2r_threshold = relative-to-value-range threshold to floor values to zero + # Returns: + # - cmp_bytes = Unsigned char pointer to compressed bytes + # - outSize_ptr = Pointer to size_t representing length in bytes of cmp_bytes + def cuszx_compress(self, isCuPy, data, num_elements, r2r_error, r2r_threshold): + + if not isCuPy: + cmp_bytes, outSize_ptr = cuszx_host_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) + else: + cmp_bytes, outSize_ptr = cuszx_device_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) + return cmp_bytes, outSize_ptr + + ### Decompression API with cuSZx ### + # Parameters: + # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array + # - cmp_bytes = Unsigned char pointer to compressed bytes + # - num_elements = Number of floating point elements in original data + # Returns: + # - decompressed_data = Float32 pointer to decompressed data + # + # Notes: Use ctypes to cast decompressed data to Numpy or CuPy type + + def cuszx_decompress(self, isCuPy, cmp_bytes, num_elements): + if not isCuPy: + decompressed_data = cuszx_host_decompress(num_elements, cmp_bytes) + else: + decompressed_data = cuszx_device_decompress(num_elements, cmp_bytes) + + return decompressed_data diff --git a/qtensor/compression/__init__.py b/qtensor/compression/__init__.py index 6a5f7464..181aa53b 100644 --- a/qtensor/compression/__init__.py +++ b/qtensor/compression/__init__.py @@ -1,4 +1,4 @@ +from .Compressor import Compressor, NumpyCompressor, CUSZCompressor from .CompressedTensor import CompressedTensor, Tensor -from .CompressedTensor import Compressor, NumpyCompressor, CUSZCompressor from .compressed_contraction import compressed_contract, compressed_sum from .cost_estimation import compressed_contraction_cost diff --git a/qtensor/compression/pytest.ini b/qtensor/compression/pytest.ini new file mode 100644 index 00000000..c24fe5bb --- /dev/null +++ b/qtensor/compression/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +filterwarnings = + ignore::DeprecationWarning diff --git a/qtensor/compression/tests/test_compressed_tensor.py b/qtensor/compression/tests/test_compressed_tensor.py index 3136e7ee..29ad3243 100644 --- 
a/qtensor/compression/tests/test_compressed_tensor.py +++ b/qtensor/compression/tests/test_compressed_tensor.py @@ -1,5 +1,5 @@ from qtensor.compression import CompressedTensor -from qtensor.compression.CompressedTensor import NumpyCompressor, CUSZCompressor +from qtensor.compression import NumpyCompressor, CUSZCompressor from qtree.optimizer import Var from qtree.system_defs import NP_ARRAY_TYPE import pytest From 9b787d17c0f877a44ed3e4310cad5519fec0754a Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Thu, 2 Mar 2023 19:50:11 -0600 Subject: [PATCH 044/126] add memory leak test --- qtensor/compression/Compressor.py | 90 ++++++++++++++++++- qtensor/compression/__init__.py | 2 +- qtensor/compression/tests/test_memory_leak.py | 28 ++++++ 3 files changed, 117 insertions(+), 3 deletions(-) create mode 100644 qtensor/compression/tests/test_memory_leak.py diff --git a/qtensor/compression/Compressor.py b/qtensor/compression/Compressor.py index 2430a135..88ac0f48 100644 --- a/qtensor/compression/Compressor.py +++ b/qtensor/compression/Compressor.py @@ -21,12 +21,75 @@ def compress(self, data): def decompress(self, ptr): raise NotImplementedError + def compress_size(self, ptr): + return ptr.nbytes + +# -- Debugging and profiling + +import time +from dataclasses import dataclass +@dataclass +class CompressMeasure: + time: float = 0 + size_in: int = 0 + size_out: int = 0 + label: str = '' + + def __str__(self): + compress_ratio = self.size_in / self.size_out + return (f'Measure: {self.time:.3f}s, ' + f'{self.size_in/1024**2:.2f}MB -> {self.size_out/1024**2:.2f}MB ({compress_ratio:.3f} in/out ratio)' + ) + +class ProfileCompressor(Compressor): + def __init__(self, compressor:Compressor, trace=True): + self.trace = trace + self.compressor = compressor + self.profile_data = {'compress': [], 'decompress': []} + + def compress(self, data): + start = time.time() + ptr = self.compressor.compress(data) + end = time.time() + out_size = self.compressor.compress_size(ptr) + cmeasure = CompressMeasure(end-start, data.nbytes, out_size) + self.profile_data['compress'].append(cmeasure) + if self.trace: + print(f'Compress: {cmeasure}') + return ptr + + def decompress(self, ptr): + start = time.time() + data = self.compressor.decompress(ptr) + end = time.time() + in_size = self.compressor.compress_size(ptr) + dmeasure = CompressMeasure(end-start, in_size, data.nbytes) + self.profile_data['decompress'].append(dmeasure) + if self.trace: + print(f'Decompress: {dmeasure}') + return data + + def get_profile_data(self): + return self.profile_data['compress'], self.profile_data['decompress'] + + def get_profile_stats(self): + compress, decompress = self.get_profile_data() + compress_time = sum([x.time for x in compress]) + decompress_time = sum([x.time for x in decompress]) + compress_ratios = np.mean([x.size_in/x.size_out for x in compress]) + compress_size = sum([x.size_out for x in compress]) + return compress_time, decompress_time, compress_size, compress_ratios +# -- + class NumpyCompressor(Compressor): def compress(self, data): comp = io.BytesIO() np.savez_compressed(comp, data) return comp + def compress_size(self, ptr): + return ptr.getbuffer().nbytes + def decompress(self, ptr): ptr.seek(0) return np.load(ptr)['arr_0'] @@ -35,6 +98,25 @@ class CUSZCompressor(Compressor): def __init__(self, r2r_error=1e-3, r2r_threshold=1e-3): self.r2r_error = r2r_error self.r2r_threshold = r2r_threshold + self.decompressed_own = [] + + def free_decompressed(self): + import cupy + print("Cleanup", len(self.decompressed_own)) + for x 
in self.decompressed_own: + print("CUDA Free", x) + cupy.cuda.runtime.free(x) + self.decompressed_own = [] + + def free_compressed(self, ptr): + import ctypes, cupy + cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = ptr + p_decompressed_ptr = ctypes.addressof(cmp_bytes) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + decompressed_int = p_decompressed_int.contents + cupy.cuda.runtime.free(decompressed_int.value) def compress(self, data): import cupy @@ -49,12 +131,15 @@ def compress(self, data): dtype = data.dtype cmp_bytes, outSize_ptr = self.cuszx_compress(isCuPy, data, num_elements_eff, self.r2r_error, self.r2r_threshold) - return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype) + return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr.contents.value) + + def compress_size(self, ptr): + return ptr[5] def decompress(self, obj): import cupy import ctypes - cmp_bytes, num_elements_eff, isCuPy, shape, dtype = obj + cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = obj decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, num_elements_eff) # -- Workaround to convert GPU pointer to int p_decompressed_ptr = ctypes.addressof(decompressed_ptr) @@ -63,6 +148,7 @@ def decompress(self, obj): p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) decompressed_int = p_decompressed_int.contents # -- + self.decompressed_own.append(decompressed_int.value) mem = cupy.cuda.UnownedMemory(decompressed_int.value, num_elements_eff, self, device_id=0) mem_ptr = cupy.cuda.memory.MemoryPointer(mem, 0) arr = cupy.ndarray(shape, dtype=dtype, memptr=mem_ptr) diff --git a/qtensor/compression/__init__.py b/qtensor/compression/__init__.py index 181aa53b..cf248bee 100644 --- a/qtensor/compression/__init__.py +++ b/qtensor/compression/__init__.py @@ -1,4 +1,4 @@ -from .Compressor import Compressor, NumpyCompressor, CUSZCompressor +from .Compressor import Compressor, NumpyCompressor, CUSZCompressor, ProfileCompressor from .CompressedTensor import CompressedTensor, Tensor from .compressed_contraction import compressed_contract, compressed_sum from .cost_estimation import compressed_contraction_cost diff --git a/qtensor/compression/tests/test_memory_leak.py b/qtensor/compression/tests/test_memory_leak.py new file mode 100644 index 00000000..b74934d4 --- /dev/null +++ b/qtensor/compression/tests/test_memory_leak.py @@ -0,0 +1,28 @@ +""" +Run `watch -n 0.1 nvidia-smi` and then run this test +""" +from qtensor.compression import CUSZCompressor +import cupy +import ctypes + +def free_compressed(ptr): + cmp_bytes, *_ = ptr + p_decompressed_ptr = ctypes.addressof(cmp_bytes) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + decompressed_int = p_decompressed_int.contents + cupy.cuda.runtime.free(decompressed_int.value) + +def test_leak(): + N = 1024*1024*8 # 8MB + a = cupy.zeros(N, dtype=float) + a[::1024] = .1 + + c = CUSZCompressor() + for i in range(100): + out = c.compress(a) + #b = c.decompress(out) + print(i, "Compressed size", c.compress_size(out)/1024**2, "MB") + #c.free_decompressed() + free_compressed(out) From e86e30331bb2cbb634da83daf654cfd4156f346a Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Thu, 2 Mar 2023 20:27:59 -0600 Subject: [PATCH 045/126] maybe fix the 
memory leak problem; update memory leak test --- qtensor/compression/szx/src/cuszx_entry.cu | 6 ++++++ qtensor/compression/tests/test_memory_leak.py | 7 ++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/qtensor/compression/szx/src/cuszx_entry.cu b/qtensor/compression/szx/src/cuszx_entry.cu index c9098471..35c91990 100644 --- a/qtensor/compression/szx/src/cuszx_entry.cu +++ b/qtensor/compression/szx/src/cuszx_entry.cu @@ -617,6 +617,12 @@ unsigned char* device_ptr_cuSZx_compress_float(float *oriData, size_t *outSize, // free(meta); // free(offsets); // free(midBytes); + checkCudaErrors(cudaFree(d_num_sig)); + checkCudaErrors(cudaFree(d_blk_idx)); + checkCudaErrors(cudaFree(d_blk_subidx)); + checkCudaErrors(cudaFree(d_blk_vals)); + checkCudaErrors(cudaFree(d_blk_sig)); + checkCudaErrors(cudaFree(d_meta)); checkCudaErrors(cudaFree(d_offsets)); checkCudaErrors(cudaFree(d_midBytes)); diff --git a/qtensor/compression/tests/test_memory_leak.py b/qtensor/compression/tests/test_memory_leak.py index b74934d4..84b096e8 100644 --- a/qtensor/compression/tests/test_memory_leak.py +++ b/qtensor/compression/tests/test_memory_leak.py @@ -15,14 +15,15 @@ def free_compressed(ptr): cupy.cuda.runtime.free(decompressed_int.value) def test_leak(): - N = 1024*1024*8 # 8MB + N = 1024*1024*8 # 64MB a = cupy.zeros(N, dtype=float) a[::1024] = .1 c = CUSZCompressor() for i in range(100): out = c.compress(a) - #b = c.decompress(out) print(i, "Compressed size", c.compress_size(out)/1024**2, "MB") - #c.free_decompressed() + b = c.decompress(out) + print(i, "Decompressed, 0, 1024", b[0], b[1024]) + c.free_decompressed() free_compressed(out) From 7d5818c0fbc4d3dd65f17bb553c950ae7a00ae1f Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Thu, 2 Mar 2023 22:13:42 -0600 Subject: [PATCH 046/126] update compression test_memory_leak --- qtensor/compression/tests/test_memory_leak.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/qtensor/compression/tests/test_memory_leak.py b/qtensor/compression/tests/test_memory_leak.py index 84b096e8..4ae02b93 100644 --- a/qtensor/compression/tests/test_memory_leak.py +++ b/qtensor/compression/tests/test_memory_leak.py @@ -15,15 +15,18 @@ def free_compressed(ptr): cupy.cuda.runtime.free(decompressed_int.value) def test_leak(): - N = 1024*1024*8 # 64MB + N = 1024*1024//2 # 32MB a = cupy.zeros(N, dtype=float) - a[::1024] = .1 + a[::1024] = .01 + for i in range(1000): + a[32*i] = .005*(i%5+1) - c = CUSZCompressor() - for i in range(100): + c = CUSZCompressor(r2r_error=1e-2, r2r_threshold=1e-2) + for i in range(200): out = c.compress(a) - print(i, "Compressed size", c.compress_size(out)/1024**2, "MB") + print(i, "Compression ratio", 4*N/c.compress_size(out)) b = c.decompress(out) + a[:] = b print(i, "Decompressed, 0, 1024", b[0], b[1024]) c.free_decompressed() free_compressed(out) From 7756340c19114daf50cc75a010604d9189909edb Mon Sep 17 00:00:00 2001 From: Milan Kartik Shah Date: Fri, 3 Mar 2023 00:16:31 -0500 Subject: [PATCH 047/126] Fixed incorrectly initialized variable; test_memory_leak returning consistent compression ratio now --- qtensor/compression/szx/src/cuszx_entry.cu | 15 ++++++++++++--- qtensor/compression/szx/src/cuszx_float.cu | 7 ++++--- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/qtensor/compression/szx/src/cuszx_entry.cu b/qtensor/compression/szx/src/cuszx_entry.cu index 35c91990..8474d110 100644 --- a/qtensor/compression/szx/src/cuszx_entry.cu +++ b/qtensor/compression/szx/src/cuszx_entry.cu @@ -116,7 +116,7 @@ 
int _post_proc(float *oriData, unsigned char *meta, short *offsets, unsigned cha if(meta[i]==2) s2++; if(meta[i]==3) s3++; } - printf("%d %d %d %d\n", s0, s1, s2, s3); +// printf("%d %d %d %d\n", s0, s1, s2, s3); out_size += (nbBlocks-nbConstantBlocks)*sizeof(short)+(nbEle%blockSize)*sizeof(float); //outBytes = (unsigned char*)malloc(out_size); @@ -453,11 +453,12 @@ __global__ void device_post_proc(size_t *outSize, float *oriData, unsigned char if(meta[i]==2) s2++; if(meta[i]==3) s3++; } - // printf("%d %d %d %d\n", s0, s1, s2, s3); + // printf("%d %d %d %d\n", s0, s1, s2, s3); out_size += (nbBlocks-nbConstantBlocks)*sizeof(short)+(nbEle%blockSize)*sizeof(float); //outBytes = (unsigned char*)malloc(out_size); unsigned char* r = outBytes; + // printf("outbytes %p\n",r); unsigned char* r_old = outBytes; r[0] = SZx_VER_MAJOR; r[1] = SZx_VER_MINOR; @@ -469,12 +470,20 @@ __global__ void device_post_proc(size_t *outSize, float *oriData, unsigned char longToBytes_bigEndian_d(r, nbConstantBlocks); r += sizeof(size_t); //sizeToBytes(r, (size_t) num_sig); + + // printf("outbytes %p\n",r); longToBytes_bigEndian_d(r, (unsigned long)num_sig); r += sizeof(size_t); r += convert_state_to_out(meta, nbBlocks, r); + // printf("num sig %d\n", num_sig); + // printf("outbytes %p\n",r); r += convert_block2_to_out(r, nbBlocks,num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); + + // printf("outbytes %p\n",r); memcpy(r, oriData+nbBlocks*blockSize, (nbEle%blockSize)*sizeof(float)); r += (nbEle%blockSize)*sizeof(float); + + // printf("outbytes %p\n",r); unsigned char* c = r; unsigned char* o = c+nbConstantBlocks*sizeof(float); unsigned char* nc = o+(nbBlocks-nbConstantBlocks)*sizeof(short); @@ -495,7 +504,7 @@ __global__ void device_post_proc(size_t *outSize, float *oriData, unsigned char // return out_size; *outSize = (uint32_t) (nc-r_old); - // printf("outBytes 0 %d\n", (int) outBytes[0]); + //printf("outsize kernel %ld\n", *outSize); // return (uint32_t) (nc-r_old); } diff --git a/qtensor/compression/szx/src/cuszx_float.cu b/qtensor/compression/szx/src/cuszx_float.cu index 48be2365..da6022f1 100644 --- a/qtensor/compression/szx/src/cuszx_float.cu +++ b/qtensor/compression/szx/src/cuszx_float.cu @@ -258,9 +258,10 @@ __global__ void compress_float(float *oriData, unsigned char *meta, short *offse uchar4* cvalue = (uchar4*)shared; int* sums = &ivalue[bs]; - //if(threadIdx.x == 0 && blockIdx.x == 0){ -// printf("tid threshold: %f\n", threshold); - // } + if(threadIdx.x == 0 && blockIdx.x == 0){ + num_state2=0; + total_sig=0; + } for (unsigned long b=bid; b Date: Fri, 3 Mar 2023 13:34:35 -0600 Subject: [PATCH 048/126] add memory cleanup operations and memory profile compressor/backend --- bench/qc_simulation/src/simulators/qtensor.py | 24 +++--- qtensor/compression/Compressor.py | 7 ++ qtensor/compression/compressed_contraction.py | 19 +++-- qtensor/contraction_backends/compression.py | 44 +++++++++- .../performance_measurement_decorator.py | 85 ++++++++++++++++++- 5 files changed, 159 insertions(+), 20 deletions(-) diff --git a/bench/qc_simulation/src/simulators/qtensor.py b/bench/qc_simulation/src/simulators/qtensor.py index 99bbd3b3..049f80e5 100644 --- a/bench/qc_simulation/src/simulators/qtensor.py +++ b/bench/qc_simulation/src/simulators/qtensor.py @@ -159,6 +159,7 @@ def estimate(in_file, out_file, C=100, M=30, F=1e12, T=1e9, **kwargs): time = totals.time(F, T, T, M) C = asdict(totals) C['time'] = time*2**len(par_vars) + C['slices'] = 2**len(par_vars) print("C", C) out_file += ".json" write_json(C, out_file) @@ 
-183,10 +184,13 @@ def simulate(in_file, out_file, backend='einsum', compress=None, M=29, **kwargs) backend = qtensor.contraction_backends.get_backend(backend) if compress is not None: if compress == 'szx': - compressor = qtensor.compression.CUSZCompressor(r2r_error=1e-2, r2r_threshold=1e-2) + compressor = qtensor.compression.CUSZCompressor(r2r_error=5e-2, r2r_threshold=5e-2) + compressor = qtensor.compression.ProfileCompressor(compressor) else: raise ValueError(f"Unknown compression algorithm: {compress}") backend = qtensor.contraction_backends.CompressionBackend(backend, compressor, M) + from qtensor.contraction_backends.performance_measurement_decorator import MemProfBackend + backend = MemProfBackend(backend) relabelid = {} for tensor in tn.tensors: @@ -205,15 +209,15 @@ def simulate(in_file, out_file, backend='einsum', compress=None, M=29, **kwargs) sim._slice_relabel_buckets(slice_ext) buckets = sim.tn.buckets # --dbg - ignore_vars = sim.tn.bra_vars + sim.tn.ket_vars - graph = qtree.graph_model.importers.buckets2graph(buckets, ignore_vars) - graph, label_dict = qtree.graph_model.relabel_graph_nodes( - graph, dict(zip(graph.nodes, np.array(list(graph.nodes)) - 127*2)) - ) - import networkx as nx - components = list(nx.connected_components(graph)) - print(f"Sliced graph # nodes: {graph.number_of_nodes()} and #components: {len(components)} with sizes {[len(c) for c in components]}") - print(f"peo size without par_vars and ignore_vars: {len(peo) - len(ignore_vars)}") + #ignore_vars = sim.tn.bra_vars + sim.tn.ket_vars + #graph = qtree.graph_model.importers.buckets2graph(buckets, ignore_vars) + #graph, label_dict = qtree.graph_model.relabel_graph_nodes( + #graph, dict(zip(graph.nodes, np.array(list(graph.nodes)) - 127*2)) + #) + #import networkx as nx + #components = list(nx.connected_components(graph)) + #print(f"Sliced graph # nodes: {graph.number_of_nodes()} and #components: {len(components)} with sizes {[len(c) for c in components]}") + #print(f"peo size without par_vars and ignore_vars: {len(peo) - len(ignore_vars)}") # -- start = time.time() diff --git a/qtensor/compression/Compressor.py b/qtensor/compression/Compressor.py index 88ac0f48..36bf1644 100644 --- a/qtensor/compression/Compressor.py +++ b/qtensor/compression/Compressor.py @@ -72,6 +72,13 @@ def decompress(self, ptr): def get_profile_data(self): return self.profile_data['compress'], self.profile_data['decompress'] + def get_profile_data_json(self): + compress, decompress = self.get_profile_data() + return { + 'compress': [c.asdict() for c in compress], + 'decompress': [c.asdict() for c in decompress], + } + def get_profile_stats(self): compress, decompress = self.get_profile_data() compress_time = sum([x.time for x in compress]) diff --git a/qtensor/compression/compressed_contraction.py b/qtensor/compression/compressed_contraction.py index e2b3e527..eb6ee0c2 100644 --- a/qtensor/compression/compressed_contraction.py +++ b/qtensor/compression/compressed_contraction.py @@ -101,12 +101,12 @@ def compressed_contract(A:Tensor, B: Tensor, result_chunk_ixs = result_indices[-mem_limit:] print(f"Chunk indices: {result_chunk_ixs}, remove_compress: {remove_compress}") slice_dict = {} + chunk = np.empty(2**len(result_chunk_ixs), dtype=B.dtype) + chunk = chunk.reshape(*(v.size for v in result_chunk_ixs)) + chunk = move_data(chunk) for r_i in iterate_indices(need_compressed): for ix, sl in zip(need_compressed, r_i): slice_dict[ix] = sl - chunk = np.empty(2**len(result_chunk_ixs), dtype=B.dtype) - chunk = chunk.reshape(*(v.size for v in 
result_chunk_ixs)) - chunk = move_data(chunk) for irm in iterate_indices(remove_compress): for i, ival in zip(remove_compress, irm): slice_dict[i] = ival#slice(ival, ival+1) @@ -119,6 +119,12 @@ def compressed_contract(A:Tensor, B: Tensor, C_ixs = [v for v in result_chunk_ixs if v not in exist_compressed] C = Tensor('tmp', indices=C_ixs, data=chunk_view) contract_two_tensors(A_slice, B_slice, C) + # Free temp slices + #import cupy + #print("Flags", A_slice.data.flags, B_slice.data.flags, C.data.flags) + #cupy.cuda.runtime.free(A_slice.data.data.ptr) + #cupy.cuda.runtime.free(B_slice.data.data.ptr) + compressor.compressor.free_decompressed() if len(need_compressed)==0: R = Tensor(new_tensor_name, result_indices, data=chunk) else: @@ -174,12 +180,12 @@ def compressed_sum(A:Tensor, sum_ixs, result_chunk_ixs = result_indices[-mem_limit:] print(f"Chunk indices: {result_chunk_ixs}, remove_compress: {remove_compress}") slice_dict = {} + chunk = np.empty(2**len(result_chunk_ixs), dtype=A.dtype) + chunk = chunk.reshape(*(v.size for v in result_chunk_ixs)) + chunk = move_data(chunk) for r_i in iterate_indices(need_compressed): for ix, sl in zip(need_compressed, r_i): slice_dict[ix] = sl - chunk = np.empty(2**len(result_chunk_ixs), dtype=A.dtype) - chunk = chunk.reshape(*(v.size for v in result_chunk_ixs)) - chunk = move_data(chunk) for irm in iterate_indices(remove_compress): for i, ival in zip(remove_compress, irm): slice_dict[i] = ival#slice(ival, ival+1) @@ -196,4 +202,5 @@ def compressed_sum(A:Tensor, sum_ixs, R = Tensor(new_tensor_name, result_indices, data=chunk) else: R.set_chunk(r_i, chunk) + compressor.compressor.free_decompressed() return R diff --git a/qtensor/contraction_backends/compression.py b/qtensor/contraction_backends/compression.py index bdac4b65..4df06f3a 100644 --- a/qtensor/contraction_backends/compression.py +++ b/qtensor/contraction_backends/compression.py @@ -1,5 +1,5 @@ from qtensor.contraction_backends import ContractionBackend -from qtensor.compression import Compressor +from qtensor.compression import Compressor, CompressedTensor, Tensor from qtensor.compression.compressed_contraction import compressed_contract, compressed_sum from qtensor.contraction_backends.common import slice_numpy_tensor from qtree.optimizer import Tensor @@ -46,6 +46,7 @@ def process_bucket(self, bucket, no_sum=False): """ ctr_kw = dict(zip(['einsum', 'move_data'], self._get_backend_specific_fns(self.backend))) bucket.sort(key=lambda x: len(x.indices)) + print("Processing bucket", bucket) accum = bucket[0] for t in bucket[1:-1]: accum = compressed_contract( @@ -58,15 +59,52 @@ def process_bucket(self, bucket, no_sum=False): set().union(*[t.indices, accum.indices]) , key=int, reverse=True ) - accum = compressed_contract( + accum_new = compressed_contract( accum, t, [total_ixs[-1]], self.max_tw, self.compressor ,**ctr_kw ) + # free data + import cupy + for t in [accum, t]: + if isinstance(t, CompressedTensor): + for c in t.data: + cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = c + import ctypes + p_decompressed_ptr = ctypes.addressof(cmp_bytes) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + decompressed_int = p_decompressed_int.contents + print("Freeing mem", decompressed_int.value) + cupy.cuda.runtime.free(decompressed_int.value) + t.compressor.compressor.free_decompressed() + #raise ValueError("Done") + else: + #print("PTR", t.data.data.ptr) + 
#cupy.cuda.runtime.free(t.data.data.ptr) + pass + + accum = accum_new + return accum else: - # This assumes large buckets with one element don't exist + if len(accum.indices) < 1: + return accum indices = (accum.indices[-1], ) res = compressed_sum(accum, indices, self.compressor, self.max_tw, **ctr_kw) + if isinstance(accum, CompressedTensor): + import cupy + for c in accum.data: + cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = c + import ctypes + p_decompressed_ptr = ctypes.addressof(cmp_bytes) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + decompressed_int = p_decompressed_int.contents + print("Freeing mem", decompressed_int.value) + cupy.cuda.runtime.free(decompressed_int.value) + accum.compressor.compressor.free_decompressed() return res def get_sliced_buckets(self, buckets, data_dict, slice_dict): diff --git a/qtensor/contraction_backends/performance_measurement_decorator.py b/qtensor/contraction_backends/performance_measurement_decorator.py index b365b969..e96b80d0 100644 --- a/qtensor/contraction_backends/performance_measurement_decorator.py +++ b/qtensor/contraction_backends/performance_measurement_decorator.py @@ -1,10 +1,93 @@ import numpy as np from dataclasses import dataclass -from qtensor.contraction_backends import ContractionBackend, NumpyBackend +from qtensor.contraction_backends import ContractionBackend, NumpyBackend, CompressionBackend from pyrofiler import timing from qtensor.tools.lazy_import import torch, pandas import string +# -- memory profiling +from weakref import WeakValueDictionary + +class MemProfBackend(ContractionBackend): + def __init__(self, backend=NumpyBackend(), print=True): + self.backend = backend + self.object_store = WeakValueDictionary() + self.object_keys = [] + self.print = print + self.max_mem = 0 + + def _print(self, *args, **kwargs): + if self.print: + print(*args, **kwargs) + + def check_store(self): + import cupy + mempool = cupy.get_default_memory_pool() + total_mem = 0 + deleted_keys = [] + for key in self.object_keys: + tensor = self.object_store.get(key, None) + if tensor is None: + #self._print("Tensor", key, "was deleted") + deleted_keys.append(key) + continue + else: + size = self.tensor_size(tensor) + total_mem += size + for key in deleted_keys: + self.object_keys.remove(key) + + if total_mem>1024**2: + self._print("Total memory usage", total_mem/1024/1024, "MB") + cupy_mem = mempool.used_bytes() + # get maximum memory usage + gpu_mem = cupy_mem + if isinstance(self.backend, CompressionBackend): + gpu_mem += 8*2**self.backend.max_tw + self.max_mem = max(self.max_mem, gpu_mem) + # -- + if cupy_mem>1024**2: + self._print("CuPy memory usage", cupy_mem/1024/1024, "MB. 
Total MB:", mempool.total_bytes()/1024**2) + + def tensor_size(self, tensor)->int: + from qtensor.compression import Tensor, CompressedTensor + if tensor.data is None: + return 0 + if isinstance(tensor, Tensor): + return tensor.data.nbytes + elif isinstance(tensor, CompressedTensor): + chunks = tensor.data + sizes = [tensor.compressor.compress_size(x) for x in chunks] + return sum(sizes) + else: + raise ValueError("Unknown tensor type") + + def add_tensor(self, tensor): + label = str(tensor) + self.object_store[label] = tensor + self.object_keys.append(label) + tsize = self.tensor_size(tensor) + if tsize>1024: + self._print("Added tensor with data size", tsize/1024, "KB") + self.check_store() + + def process_bucket(self, bucket, no_sum=False): + res = self.backend.process_bucket(bucket, no_sum=no_sum) + self.add_tensor(res) + return res + + def get_sliced_buckets(self, buckets, data_dict, slice_dict): + buckets = self.backend.get_sliced_buckets(buckets, data_dict, slice_dict) + for bucket in buckets: + for tensor in bucket: + self.add_tensor(tensor) + return buckets + + def get_result_data(self, result): + return self.backend.get_result_data(result) + +# -- + @dataclass class BucketContnractionStats: """ From 13b2c44150f43d4cabc41da87962953c8315014a Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Fri, 3 Mar 2023 13:34:53 -0600 Subject: [PATCH 049/126] update submodule --- qtree | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qtree b/qtree index ccbb4093..7b038d5a 160000 --- a/qtree +++ b/qtree @@ -1 +1 @@ -Subproject commit ccbb4093360da843bcb8282941aa22154b85e2af +Subproject commit 7b038d5a4cc1f9b5e0ede4b0e5740bff4b22153e From 53d1f888504828618f052c17b2e9b66b378dd5d0 Mon Sep 17 00:00:00 2001 From: Milan Shah Date: Sun, 5 Mar 2023 15:07:50 -0500 Subject: [PATCH 050/126] Updated post proc for compression --- qtensor/compression/szx/src/cuszx_entry.cu | 212 ++++++++++++++++++++- 1 file changed, 205 insertions(+), 7 deletions(-) diff --git a/qtensor/compression/szx/src/cuszx_entry.cu b/qtensor/compression/szx/src/cuszx_entry.cu index c9098471..e244d6ff 100644 --- a/qtensor/compression/szx/src/cuszx_entry.cu +++ b/qtensor/compression/szx/src/cuszx_entry.cu @@ -40,6 +40,24 @@ __host__ __device__ size_t convert_state_to_out(unsigned char* meta, size_t leng return out_length; } +__global__ void convert_state_to_out_kernel(unsigned char* meta, size_t length, unsigned char *result, size_t out_length){ + + + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < out_length; i += blockDim.x*gridDim.x){ + uint8_t tmp = 0; + + for (size_t j = 0; j < 4; j++) + { + if (i*4 + j < length) + { + tmp |= (0x03 & meta[i*4+j]) << 2*j; + } + + } + result[i] = tmp; + } +} + // nbBlocks, r, stateNBBytes, stateArray __host__ __device__ size_t convert_out_to_state(size_t nbBlocks, unsigned char* cmp, unsigned char* out_state){ size_t state_length; @@ -64,6 +82,7 @@ __host__ __device__ size_t convert_out_to_state(size_t nbBlocks, unsigned char* __host__ __device__ size_t convert_block2_to_out(unsigned char *result, uint32_t numBlocks, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig){ size_t out_length = 0; + memcpy(result, blk_idx, numBlocks*sizeof(uint32_t)); out_length += numBlocks*4; memcpy(result+out_length, blk_vals, num_sig*sizeof(float)); @@ -76,6 +95,49 @@ __host__ __device__ size_t convert_block2_to_out(unsigned char *result, uint32_t return out_length; } +__global__ void convert_block2_to_out_kernel(unsigned char *result, uint32_t numBlocks, 
uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig){ + + size_t out_length = 0; + unsigned char *tmp_result = result; + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < numBlocks; i += blockDim.x*gridDim.x){ + uint32_t local_blkidx = blk_idx[i]; + tmp_result[4*i] = (local_blkidx) & 0xff; + tmp_result[4*i+1] = (local_blkidx >> (8*1)) & 0xff; + tmp_result[4*i+2] = (local_blkidx >> (8*2)) & 0xff; + tmp_result[4*i+3] = (local_blkidx >> (8*3)) & 0xff; + } + // memcpy(result, blk_idx, numBlocks*sizeof(uint32_t)); + out_length += numBlocks*4; + tmp_result = result+out_length; + + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < num_sig; i += blockDim.x*gridDim.x){ + float value = blk_vals[i]; + tmp_result[4*i] = (value) & 0xff; + tmp_result[4*i+1] = (value >> (8*1)) & 0xff; + tmp_result[4*i+2] = (value >> (8*2)) & 0xff; + tmp_result[4*i+3] = (value >> (8*3)) & 0xff; + } + // memcpy(result+out_length, blk_vals, num_sig*sizeof(float)); + out_length += num_sig*sizeof(float); + tmp_result = result+out_length; + + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < num_sig; i += blockDim.x*gridDim.x){ + tmp_result[i] = blk_subidx[i]; + + } + + out_length += num_sig*sizeof(uint8_t); + tmp_result = result+out_length; + + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < numBlocks; i += blockDim.x*gridDim.x){ + tmp_result[i] = blk_sig[i]; + + } + out_length+= numBlocks*sizeof(uint8_t); + + // return out_length; +} + __host__ __device__ size_t convert_out_to_block2(unsigned char *in_cmp, uint32_t numBlocks, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig){ size_t out_length = 0; memcpy(blk_idx, in_cmp, numBlocks*sizeof(uint32_t)); @@ -420,6 +482,20 @@ __device__ inline void longToBytes_bigEndian_d(unsigned char *b, unsigned long n // symTransform_8bytes(*b); } +inline void longToBytes_bigEndian_memset(unsigned char *b, unsigned long num) +{ + checkCudaErrors(cudaMemset(&b[0], (unsigned char)(num>>56), sizeof(char))); + checkCudaErrors(cudaMemset(&b[1], (unsigned char)(num>>48), sizeof(char))); + checkCudaErrors(cudaMemset(&b[2], (unsigned char)(num>>40), sizeof(char))); + checkCudaErrors(cudaMemset(&b[3], (unsigned char)(num>>32), sizeof(char))); + checkCudaErrors(cudaMemset(&b[4], (unsigned char)(num>>24), sizeof(char))); + checkCudaErrors(cudaMemset(&b[5], (unsigned char)(num>>16), sizeof(char))); + checkCudaErrors(cudaMemset(&b[6], (unsigned char)(num>>8), sizeof(char))); + checkCudaErrors(cudaMemset(&b[7], (unsigned char)(num), sizeof(char))); +// if(dataEndianType==LITTLE_ENDIAN_DATA) +// symTransform_8bytes(*b); +} + __device__ inline void shortToBytes_d(unsigned char* b, short value) { lint16 buf; @@ -427,7 +503,125 @@ __device__ inline void shortToBytes_d(unsigned char* b, short value) memcpy(b, buf.byte, 2); } -__global__ void device_post_proc(size_t *outSize, float *oriData, unsigned char *meta, short *offsets, unsigned char *midBytes, unsigned char *outBytes, size_t nbEle, int blockSize, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig) +__global__ void getNumNonConstantBlocks(size_t nbBlocks, short *offsets, unsigned char *meta, int blockSize, size_t *nonconstant, int *out_size){ + for (int tid = blockDim.x*blockIdx.x + threadIdx.x; tid < nbBlocks; tid += blockDim.x*gridDim.x){ + if (meta[tid] == 3){ + atomicAdd(nonconstant, 1); + atomicAdd(out_size,1+(blockSize/4)+offsets[tid]); + } + } +} + +__global__ void ncblkCopy(unsigned char * c, unsigned char* 
o, unsigned char *nc, unsigned char* midBytes, unsigned char* meta, + size_t nbBlocks, int blockSize, short *offsets, size_t mSize) +{ + for (int i=blockDim.x*blockIdx.x + threadIdx.x; i>>(nbBlocks, offsets, meta, blockSize, nonconstant_d, out_size_d); + cudaDeviceSynchronize(); + + checkCudaErrors(cudaMemcpy(nonconstant_h, nonconstant_d, sizeof(size_t), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(&tmp_outsize, out_size_d, sizeof(int), cudaMemcpyDeviceToHost)); + + nbConstantBlocks = nbBlocks - nonconstant_h; + out_size_h+=tmp_outsize; + + out_size_h += (nbBlocks-nbConstantBlocks)*sizeof(short)+(nbEle%blockSize)*sizeof(float); + + //outBytes = (unsigned char*)malloc(out_size); + unsigned char* r = outBytes; + unsigned char* r_old = outBytes; + checkCudaErrors(cudaMemset(r, SZx_VER_MAJOR, sizeof(char))); + checkCudaErrors(cudaMemset(r+1, SZx_VER_MINOR, sizeof(char))); + checkCudaErrors(cudaMemset(r+2, 1, sizeof(char))); + checkCudaErrors(cudaMemset(r+3, 0, sizeof(char))); + checkCudaErrors(cudaMemset(r+4, blockSize, sizeof(char))); + + r=r+5; //1 byte + //sizeToBytes(r, nbConstantBlocks); + longToBytes_bigEndian_memset(r, nbConstantBlocks); + r += sizeof(size_t); + //sizeToBytes(r, (size_t) num_sig); + longToBytes_bigEndian_memset(r, (unsigned long)num_sig); + r += sizeof(size_t); + size_t out_length; + + if(nbBlocks%4==0) + out_length = nbBlocks/4; + else + out_length = nbBlocks/4+1; + + convert_state_to_out_kernel<<<20,256>>>(meta, nbBlocks, r, out_length); + r+=out_length; + convert_block2_to_out_kernel<<<20,256>>>(r, nbBlocks,num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); + r += nbBlocks*4 + num_sig*sizeof(float) + num_sig*sizeof(uint8_t) + nbBlocks*sizeof(uint8_t); + + checkCudaErrors(cudaMemcpy(r, oriData+nbBlocks*blockSize, (nbEle%blockSize)*sizeof(float), cudaMemcpyDeviceToDevice)); + // memcpy(r, oriData+nbBlocks*blockSize, (nbEle%blockSize)*sizeof(float)); + r += (nbEle%blockSize)*sizeof(float); + unsigned char* c = r; + unsigned char* o = c+nbConstantBlocks*sizeof(float); + unsigned char* nc = o+(nbBlocks-nbConstantBlocks)*sizeof(short); + ncblkCopy<<<20,256>>>(c, o, nc, midBytes, meta,nbBlocks, blockSize, offsets, mSize); + cudaDeviceSynchronize(); + checkCudaErrors(cudaMemset(outSize, (size_t)nc-r_old, sizeof(size_t))); + // *outSize = (size_t) (nc-r_old); + return outBytes; +} + +__global__ void device_post_proc(size_t *outSize, float *oriData, unsigned char *meta, + short *offsets, unsigned char *midBytes, unsigned char *outBytes, + size_t nbEle, int blockSize, uint64_t num_sig, uint32_t *blk_idx, + float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig) { int out_size = 0; @@ -494,7 +688,7 @@ __global__ void device_post_proc(size_t *outSize, float *oriData, unsigned char } // return out_size; - *outSize = (uint32_t) (nc-r_old); + *outSize = (size_t) (nc-r_old); // printf("outBytes 0 %d\n", (int) outBytes[0]); // return (uint32_t) (nc-r_old); } @@ -508,6 +702,7 @@ unsigned char* device_ptr_cuSZx_compress_float(float *oriData, size_t *outSize, * unsigned char* outBytes * */ + timer_GPU.StartCounter(); float sparsity_level = SPARSITY_LEVEL; @@ -561,16 +756,17 @@ unsigned char* device_ptr_cuSZx_compress_float(float *oriData, size_t *outSize, checkCudaErrors(cudaMalloc((void**)&d_midBytes, mbsz)); checkCudaErrors(cudaMemset(d_midBytes, 0, mbsz)); - timer_GPU.StartCounter(); + // apply_threshold<<<80,256>>>(d_oriData, threshold, nbEle); // cudaDeviceSynchronize(); dim3 dimBlock(32, blockSize/32); dim3 dimGrid(65536, 1); const int sMemsize = blockSize * sizeof(float) 
+ dimBlock.y * sizeof(int); + printf("Malloc end timestamp: %f ms\n", timer_GPU.GetCounter()); compress_float<<>>(d_oriData, d_meta, d_offsets, d_midBytes, absErrBound, blockSize, nbBlocks, mSize, sparsity_level, d_blk_idx, d_blk_subidx,d_blk_vals, threshold, d_blk_sig); cudaError_t err = cudaGetLastError(); // Get error code printf("CUDA Error: %s\n", cudaGetErrorString(err)); - printf("GPU compression timing: %f ms\n", timer_GPU.GetCounter()); + printf("GPU compression timestamp: %f ms\n", timer_GPU.GetCounter()); cudaDeviceSynchronize(); get_numsig<<<1,1>>>(d_num_sig); cudaDeviceSynchronize(); @@ -605,9 +801,10 @@ unsigned char* device_ptr_cuSZx_compress_float(float *oriData, size_t *outSize, checkCudaErrors(cudaMalloc(&d_outSize, sizeof(size_t))); - device_post_proc<<<1,1>>>(d_outSize, d_oriData, d_meta, d_offsets, d_midBytes, d_outBytes, nbEle, blockSize, *num_sig, d_blk_idx, d_blk_vals, d_blk_subidx, d_blk_sig); - - cudaDeviceSynchronize(); + // device_post_proc<<<1,1>>>(d_outSize, d_oriData, d_meta, d_offsets, d_midBytes, d_outBytes, nbEle, blockSize, *num_sig, d_blk_idx, d_blk_vals, d_blk_subidx, d_blk_sig); + better_post_proc(d_outSize, d_oriData, d_meta, d_offsets, d_midBytes, d_outBytes, nbEle, blockSize, *num_sig, d_blk_idx, d_blk_vals, d_blk_subidx, d_blk_sig); + // cudaDeviceSynchronize(); + checkCudaErrors(cudaMemcpy(outSize, d_outSize, sizeof(size_t), cudaMemcpyDeviceToHost)); // printf("completed compression\n"); @@ -621,6 +818,7 @@ unsigned char* device_ptr_cuSZx_compress_float(float *oriData, size_t *outSize, checkCudaErrors(cudaFree(d_offsets)); checkCudaErrors(cudaFree(d_midBytes)); // printf("completed compression\n"); + printf("Compression end timestamp: %f ms\n", timer_GPU.GetCounter()); return d_outBytes; } From a8c9e70c8be9b926d0116643c3e965f778717f58 Mon Sep 17 00:00:00 2001 From: Milan Shah Date: Sun, 5 Mar 2023 15:23:45 -0500 Subject: [PATCH 051/126] Compilation error fixes --- qtensor/compression/szx/src/cuszx_entry.cu | 29 +++++++++++----------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/qtensor/compression/szx/src/cuszx_entry.cu b/qtensor/compression/szx/src/cuszx_entry.cu index 5f2dd4a9..08d81fc7 100644 --- a/qtensor/compression/szx/src/cuszx_entry.cu +++ b/qtensor/compression/szx/src/cuszx_entry.cu @@ -112,10 +112,10 @@ __global__ void convert_block2_to_out_kernel(unsigned char *result, uint32_t num for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < num_sig; i += blockDim.x*gridDim.x){ float value = blk_vals[i]; - tmp_result[4*i] = (value) & 0xff; - tmp_result[4*i+1] = (value >> (8*1)) & 0xff; - tmp_result[4*i+2] = (value >> (8*2)) & 0xff; - tmp_result[4*i+3] = (value >> (8*3)) & 0xff; + tmp_result[4*i] = (unsigned char)((value) & 0xff); + tmp_result[4*i+1] = (unsigned char)((value >> (8*1)) & 0xff); + tmp_result[4*i+2] = (unsigned char)((value >> (8*2)) & 0xff); + tmp_result[4*i+3] = (unsigned char)((value >> (8*3)) & 0xff); } // memcpy(result+out_length, blk_vals, num_sig*sizeof(float)); out_length += num_sig*sizeof(float); @@ -503,7 +503,7 @@ __device__ inline void shortToBytes_d(unsigned char* b, short value) memcpy(b, buf.byte, 2); } -__global__ void getNumNonConstantBlocks(size_t nbBlocks, short *offsets, unsigned char *meta, int blockSize, size_t *nonconstant, int *out_size){ +__global__ void getNumNonConstantBlocks(size_t nbBlocks, short *offsets, unsigned char *meta, int blockSize, int *nonconstant, int *out_size){ for (int tid = blockDim.x*blockIdx.x + threadIdx.x; tid < nbBlocks; tid += blockDim.x*gridDim.x){ if (meta[tid] 
== 3){ atomicAdd(nonconstant, 1); @@ -531,7 +531,7 @@ __global__ void ncblkCopy(unsigned char * c, unsigned char* o, unsigned char *nc } } -void better_post_proc(size_t *outSize, float *oriData, unsigned char *meta, +size_t better_post_proc(size_t *outSize, float *oriData, unsigned char *meta, short *offsets, unsigned char *midBytes, unsigned char *outBytes, size_t nbEle, int blockSize, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig){ @@ -557,19 +557,19 @@ void better_post_proc(size_t *outSize, float *oriData, unsigned char *meta, else out_size_h += nbBlocks/8+1; - size_t *nonconstant_d, *nonconstant_h; + int *nonconstant_d, nonconstant_h; - checkCudaErrors(cudaMalloc((void **)&nonconstant_d, sizeof(size_t))); + checkCudaErrors(cudaMalloc((void **)&nonconstant_d, sizeof(int))); checkCudaErrors(cudaMalloc((void **)&out_size_d, sizeof(int))); - checkCudaErrors(cudaMemset(nonconstant_d, 0, sizeof(size_t))); + checkCudaErrors(cudaMemset(nonconstant_d, 0, sizeof(int))); checkCudaErrors(cudaMemset(out_size_d, 0, sizeof(int))); getNumNonConstantBlocks<<<40,64>>>(nbBlocks, offsets, meta, blockSize, nonconstant_d, out_size_d); cudaDeviceSynchronize(); - checkCudaErrors(cudaMemcpy(nonconstant_h, nonconstant_d, sizeof(size_t), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(&nonconstant_h, nonconstant_d, sizeof(int), cudaMemcpyDeviceToHost)); checkCudaErrors(cudaMemcpy(&tmp_outsize, out_size_d, sizeof(int), cudaMemcpyDeviceToHost)); nbConstantBlocks = nbBlocks - nonconstant_h; @@ -613,9 +613,10 @@ void better_post_proc(size_t *outSize, float *oriData, unsigned char *meta, unsigned char* nc = o+(nbBlocks-nbConstantBlocks)*sizeof(short); ncblkCopy<<<20,256>>>(c, o, nc, midBytes, meta,nbBlocks, blockSize, offsets, mSize); cudaDeviceSynchronize(); - checkCudaErrors(cudaMemset(outSize, (size_t)nc-r_old, sizeof(size_t))); + return (size_t) (nc-r_old); + // checkCudaErrors(cudaMemcpy(outSize, (size_t)(nc-r_old), sizeof(size_t))); // *outSize = (size_t) (nc-r_old); - return outBytes; + // return outBytes; } __global__ void device_post_proc(size_t *outSize, float *oriData, unsigned char *meta, @@ -811,10 +812,10 @@ unsigned char* device_ptr_cuSZx_compress_float(float *oriData, size_t *outSize, checkCudaErrors(cudaMalloc(&d_outSize, sizeof(size_t))); // device_post_proc<<<1,1>>>(d_outSize, d_oriData, d_meta, d_offsets, d_midBytes, d_outBytes, nbEle, blockSize, *num_sig, d_blk_idx, d_blk_vals, d_blk_subidx, d_blk_sig); - better_post_proc(d_outSize, d_oriData, d_meta, d_offsets, d_midBytes, d_outBytes, nbEle, blockSize, *num_sig, d_blk_idx, d_blk_vals, d_blk_subidx, d_blk_sig); + *outSize = better_post_proc(d_outSize, d_oriData, d_meta, d_offsets, d_midBytes, d_outBytes, nbEle, blockSize, *num_sig, d_blk_idx, d_blk_vals, d_blk_subidx, d_blk_sig); // cudaDeviceSynchronize(); - checkCudaErrors(cudaMemcpy(outSize, d_outSize, sizeof(size_t), cudaMemcpyDeviceToHost)); + // checkCudaErrors(cudaMemcpy(outSize, d_outSize, sizeof(size_t), cudaMemcpyDeviceToHost)); // printf("completed compression\n"); //free(blk_idx); From 50be22f1f8463f7ca2f54141b4c64922bc97dfcb Mon Sep 17 00:00:00 2001 From: Milan Shah Date: Sun, 5 Mar 2023 15:25:59 -0500 Subject: [PATCH 052/126] Compilation error fixes, integral type error --- qtensor/compression/szx/src/cuszx_entry.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/qtensor/compression/szx/src/cuszx_entry.cu b/qtensor/compression/szx/src/cuszx_entry.cu index 08d81fc7..c1c3a23c 100644 --- 
a/qtensor/compression/szx/src/cuszx_entry.cu +++ b/qtensor/compression/szx/src/cuszx_entry.cu @@ -112,10 +112,10 @@ __global__ void convert_block2_to_out_kernel(unsigned char *result, uint32_t num for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < num_sig; i += blockDim.x*gridDim.x){ float value = blk_vals[i]; - tmp_result[4*i] = (unsigned char)((value) & 0xff); - tmp_result[4*i+1] = (unsigned char)((value >> (8*1)) & 0xff); - tmp_result[4*i+2] = (unsigned char)((value >> (8*2)) & 0xff); - tmp_result[4*i+3] = (unsigned char)((value >> (8*3)) & 0xff); + tmp_result[(int)4*i] = (unsigned char)((value) & 0xff); + tmp_result[(int)4*i+1] = (unsigned char)((value >> (8*1)) & 0xff); + tmp_result[(int)4*i+2] = (unsigned char)((value >> (8*2)) & 0xff); + tmp_result[(int)4*i+3] = (unsigned char)((value >> (8*3)) & 0xff); } // memcpy(result+out_length, blk_vals, num_sig*sizeof(float)); out_length += num_sig*sizeof(float); From 12dd9c0e89500008b44c2f28196835b5a87f27ed Mon Sep 17 00:00:00 2001 From: Milan Kartik Shah Date: Sun, 5 Mar 2023 15:56:04 -0500 Subject: [PATCH 053/126] Updated post_proc to faster kernels --- qtensor/compression/szx/src/cuszx_entry.cu | 24 +++++++++++++--------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/qtensor/compression/szx/src/cuszx_entry.cu b/qtensor/compression/szx/src/cuszx_entry.cu index c1c3a23c..62aa10d3 100644 --- a/qtensor/compression/szx/src/cuszx_entry.cu +++ b/qtensor/compression/szx/src/cuszx_entry.cu @@ -112,10 +112,12 @@ __global__ void convert_block2_to_out_kernel(unsigned char *result, uint32_t num for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < num_sig; i += blockDim.x*gridDim.x){ float value = blk_vals[i]; - tmp_result[(int)4*i] = (unsigned char)((value) & 0xff); - tmp_result[(int)4*i+1] = (unsigned char)((value >> (8*1)) & 0xff); - tmp_result[(int)4*i+2] = (unsigned char)((value >> (8*2)) & 0xff); - tmp_result[(int)4*i+3] = (unsigned char)((value >> (8*3)) & 0xff); + memcpy(&tmp_result[4*i], &value, sizeof(float)); + //unsigned char *v = () + //tmp_result[(int)4*i] = (unsigned char)((value) & 0xff); + //tmp_result[(int)4*i+1] = (unsigned char)((value >> (8*1)) & 0xff); + //tmp_result[(int)4*i+2] = (unsigned char)((value >> (8*2)) & 0xff); + //tmp_result[(int)4*i+3] = (unsigned char)((value >> (8*3)) & 0xff); } // memcpy(result+out_length, blk_vals, num_sig*sizeof(float)); out_length += num_sig*sizeof(float); @@ -772,11 +774,11 @@ unsigned char* device_ptr_cuSZx_compress_float(float *oriData, size_t *outSize, dim3 dimBlock(32, blockSize/32); dim3 dimGrid(65536, 1); const int sMemsize = blockSize * sizeof(float) + dimBlock.y * sizeof(int); - printf("Malloc end timestamp: %f ms\n", timer_GPU.GetCounter()); + //printf("Malloc end timestamp: %f ms\n", timer_GPU.GetCounter()); compress_float<<>>(d_oriData, d_meta, d_offsets, d_midBytes, absErrBound, blockSize, nbBlocks, mSize, sparsity_level, d_blk_idx, d_blk_subidx,d_blk_vals, threshold, d_blk_sig); cudaError_t err = cudaGetLastError(); // Get error code - printf("CUDA Error: %s\n", cudaGetErrorString(err)); - printf("GPU compression timestamp: %f ms\n", timer_GPU.GetCounter()); + // printf("CUDA Error: %s\n", cudaGetErrorString(err)); + //printf("GPU compression timestamp: %f ms\n", timer_GPU.GetCounter()); cudaDeviceSynchronize(); get_numsig<<<1,1>>>(d_num_sig); cudaDeviceSynchronize(); @@ -811,11 +813,11 @@ unsigned char* device_ptr_cuSZx_compress_float(float *oriData, size_t *outSize, checkCudaErrors(cudaMalloc(&d_outSize, sizeof(size_t))); - // 
device_post_proc<<<1,1>>>(d_outSize, d_oriData, d_meta, d_offsets, d_midBytes, d_outBytes, nbEle, blockSize, *num_sig, d_blk_idx, d_blk_vals, d_blk_subidx, d_blk_sig); + // device_post_proc<<<1,1>>>(d_outSize, d_oriData, d_meta, d_offsets, d_midBytes, d_outBytes, nbEle, blockSize, *num_sig, d_blk_idx, d_blk_vals, d_blk_subidx, d_blk_sig); *outSize = better_post_proc(d_outSize, d_oriData, d_meta, d_offsets, d_midBytes, d_outBytes, nbEle, blockSize, *num_sig, d_blk_idx, d_blk_vals, d_blk_subidx, d_blk_sig); - // cudaDeviceSynchronize(); + //cudaDeviceSynchronize(); - // checkCudaErrors(cudaMemcpy(outSize, d_outSize, sizeof(size_t), cudaMemcpyDeviceToHost)); + //checkCudaErrors(cudaMemcpy(outSize, d_outSize, sizeof(size_t), cudaMemcpyDeviceToHost)); // printf("completed compression\n"); //free(blk_idx); @@ -835,6 +837,8 @@ unsigned char* device_ptr_cuSZx_compress_float(float *oriData, size_t *outSize, checkCudaErrors(cudaFree(d_midBytes)); // printf("completed compression\n"); printf("Compression end timestamp: %f ms\n", timer_GPU.GetCounter()); + + printf("CUDA Error: %s\n", cudaGetErrorString(err)); return d_outBytes; } From 43adb666fb2d24f19afe73f5337bb26972a0b718 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Tue, 7 Mar 2023 02:57:42 -0600 Subject: [PATCH 054/126] add memory prof and fix reversed backend --- bench/qc_simulation/src/simulators/qtensor.py | 7 ++- qtensor/compression/Compressor.py | 6 +-- .../merged_bucket_elimination.py | 44 ++++++++++++------- .../performance_measurement_decorator.py | 3 +- 4 files changed, 40 insertions(+), 20 deletions(-) diff --git a/bench/qc_simulation/src/simulators/qtensor.py b/bench/qc_simulation/src/simulators/qtensor.py index 049f80e5..e29ede10 100644 --- a/bench/qc_simulation/src/simulators/qtensor.py +++ b/bench/qc_simulation/src/simulators/qtensor.py @@ -235,7 +235,12 @@ def simulate(in_file, out_file, backend='einsum', compress=None, M=29, **kwargs) end = time.time() print("D", end - start) out_file += ".json" - C = {'time': end - start} + C = {'time': 2**len(par_vars)*(end - start)} + C['memory'] = backend.max_mem + if compress is not None: + if isinstance(compressor, qtensor.compression.ProfileCompressor): + C['compression'] = compressor.get_profile_data_json() + write_json(C, out_file) cupy.cuda.profiler.stop() return out_file diff --git a/qtensor/compression/Compressor.py b/qtensor/compression/Compressor.py index 36bf1644..8669cd5e 100644 --- a/qtensor/compression/Compressor.py +++ b/qtensor/compression/Compressor.py @@ -27,7 +27,7 @@ def compress_size(self, ptr): # -- Debugging and profiling import time -from dataclasses import dataclass +from dataclasses import dataclass, asdict @dataclass class CompressMeasure: time: float = 0 @@ -75,8 +75,8 @@ def get_profile_data(self): def get_profile_data_json(self): compress, decompress = self.get_profile_data() return { - 'compress': [c.asdict() for c in compress], - 'decompress': [c.asdict() for c in decompress], + 'compress': [asdict(c) for c in compress], + 'decompress': [asdict(c) for c in decompress], } def get_profile_stats(self): diff --git a/qtensor/contraction_algos/merged_bucket_elimination.py b/qtensor/contraction_algos/merged_bucket_elimination.py index 933dbd65..8b92d54c 100644 --- a/qtensor/contraction_algos/merged_bucket_elimination.py +++ b/qtensor/contraction_algos/merged_bucket_elimination.py @@ -1,6 +1,19 @@ import itertools +import numpy as np +from qtree.optimizer import Tensor, Var -def bucket_elimination(buckets, ibunch, process_bucket_fn, +def 
is_reverse_order_backend(backend): + """ + Duck-test if the tensors are with reverse index order + using slice_buckets method + """ + a, b = Var(1), Var(2) + test_b = [[Tensor('T', [a, b], data_key='k')]] + data_dict={'k': np.random.rand(2, 2)} + sliced = backend.get_sliced_buckets(test_b, data_dict, {a: slice(None), b: slice(None)}) + return sliced[0][0].indices[0] == b + +def bucket_elimination(buckets, process_bucket_fn, n_var_nosum=0): """ Algorithm to evaluate a contraction of a large number of tensors. @@ -12,9 +25,8 @@ def bucket_elimination(buckets, ibunch, process_bucket_fn, Parameters ---------- buckets : list of lists - ibunch : list of lists of indices to contract. process_bucket_fn : function - function that will process buckets, takes list of indices to contract + buckets + function that will process this kind of buckets n_var_nosum : int, optional number of variables that have to be left in the result. Expected at the end of bucket list @@ -22,35 +34,37 @@ def bucket_elimination(buckets, ibunch, process_bucket_fn, ------- result : numpy.array """ + # import pdb + # pdb.set_trace() n_var_contract = len(buckets) - n_var_nosum - assert len(ibunch) == len(buckets), "Buckets length should be same as ibunch length" result = None - for ixs, bucket in zip(ibunch, buckets[:n_var_contract]): + for n in range(n_var_contract): + bucket = buckets[n] if len(bucket) > 0: - tensor = process_bucket_fn(ixs, bucket) + tensor = process_bucket_fn(bucket) + #-- Memory management + buckets[n] = [] + #-- + if len(tensor.indices) > 0: # tensor is not scalar. # Move it to appropriate bucket - smallest_ix = min([int(x) for x in tensor.indices]) - appended = False - for j, ixs in enumerate(ibunch): - if smallest_ix in map(int, ixs): - buckets[j].append(tensor) - appended = True - if not appended: - raise Exception('Algorithmic error, investigate.') + first_index = int(tensor.indices[-1]) + buckets[first_index].append(tensor) else: # tensor is scalar if result is not None: result *= tensor else: result = tensor + # free up space, the tensors are no longer needed + buckets[n] = [] # form a single list of the rest if any rest = list(itertools.chain.from_iterable(buckets[n_var_contract:])) if len(rest) > 0: # only multiply tensors - tensor = process_bucket_fn([], rest, no_sum=True) + tensor = process_bucket_fn(rest, no_sum=True) if result is not None: result *= tensor else: diff --git a/qtensor/contraction_backends/performance_measurement_decorator.py b/qtensor/contraction_backends/performance_measurement_decorator.py index e96b80d0..4ea2cff9 100644 --- a/qtensor/contraction_backends/performance_measurement_decorator.py +++ b/qtensor/contraction_backends/performance_measurement_decorator.py @@ -1,6 +1,7 @@ import numpy as np from dataclasses import dataclass -from qtensor.contraction_backends import ContractionBackend, NumpyBackend, CompressionBackend +from qtensor.contraction_backends import ContractionBackend, NumpyBackend +from qtensor.contraction_backends.compression import CompressionBackend from pyrofiler import timing from qtensor.tools.lazy_import import torch, pandas import string From 708bc8d079fd8b8dc5a6e4ccb45a671517b304c9 Mon Sep 17 00:00:00 2001 From: Milan Shah Date: Tue, 7 Mar 2023 16:03:51 -0500 Subject: [PATCH 055/126] Fixed median value array bug --- qtensor/compression/szx/src/cuszx_entry.cu | 298 ++++++++++++++++++++- 1 file changed, 289 insertions(+), 9 deletions(-) diff --git a/qtensor/compression/szx/src/cuszx_entry.cu b/qtensor/compression/szx/src/cuszx_entry.cu index 
62aa10d3..d76b5e5d 100644 --- a/qtensor/compression/szx/src/cuszx_entry.cu +++ b/qtensor/compression/szx/src/cuszx_entry.cu @@ -4,6 +4,8 @@ #include "szx_TypeManager.h" #include "timingGPU.h" #include "szx.h" +#include +#include #define SPARSITY_LEVEL 0.25 @@ -58,6 +60,26 @@ __global__ void convert_state_to_out_kernel(unsigned char* meta, size_t length, } } +__global__ void convert_out_to_state_kernel(size_t nbBlocks, unsigned char* cmp, unsigned char* out_state, size_t state_length, int *num_state2blks, int *ncBlocks){ + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < state_length; i += blockDim.x*gridDim.x){ + for (size_t j = 0; j < 4; j++) + { + if (4*i + j < nbBlocks) + { + out_state[4*i + j]= (cmp[i] >> 2*j) & 0x03; + if (out_state[4*i+j] == 2) + { + atomicAdd(num_state2blks, 1); + }else if(out_state[4*i+j]==3){ + atomicAdd(ncBlocks, 1); + } + + } + + } + } +} + // nbBlocks, r, stateNBBytes, stateArray __host__ __device__ size_t convert_out_to_state(size_t nbBlocks, unsigned char* cmp, unsigned char* out_state){ size_t state_length; @@ -112,7 +134,7 @@ __global__ void convert_block2_to_out_kernel(unsigned char *result, uint32_t num for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < num_sig; i += blockDim.x*gridDim.x){ float value = blk_vals[i]; - memcpy(&tmp_result[4*i], &value, sizeof(float)); + memcpy(&tmp_result[4*i], &value, sizeof(float)); //unsigned char *v = () //tmp_result[(int)4*i] = (unsigned char)((value) & 0xff); //tmp_result[(int)4*i+1] = (unsigned char)((value >> (8*1)) & 0xff); @@ -140,6 +162,50 @@ __global__ void convert_block2_to_out_kernel(unsigned char *result, uint32_t num // return out_length; } +__global__ void convert_out_to_block2_kernel(unsigned char *in_cmp, uint32_t numBlocks, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig){ + size_t out_length = 0; + + unsigned char *tmp_result = in_cmp; + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < numBlocks; i += blockDim.x*gridDim.x){ + + uint32_t local_blkidx = (tmp_result[4*i] & 0xff) | ((tmp_result[4*i+1] & 0xff) << (8*1)) + | ((tmp_result[4*i+2] & 0xff) << (8*2)) | ((tmp_result[4*i+3] & 0xff) << (8*3)); + blk_idx[i] = local_blkidx; + } + // memcpy(result, blk_idx, numBlocks*sizeof(uint32_t)); + out_length += numBlocks*4; + tmp_result = in_cmp+out_length; + + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < num_sig; i += blockDim.x*gridDim.x){ + float value = 0.0; + memcpy(&value, &tmp_result[4*i], sizeof(float)); + blk_vals[i] = value; + + //unsigned char *v = () + //tmp_result[(int)4*i] = (unsigned char)((value) & 0xff); + //tmp_result[(int)4*i+1] = (unsigned char)((value >> (8*1)) & 0xff); + //tmp_result[(int)4*i+2] = (unsigned char)((value >> (8*2)) & 0xff); + //tmp_result[(int)4*i+3] = (unsigned char)((value >> (8*3)) & 0xff); + } + // memcpy(result+out_length, blk_vals, num_sig*sizeof(float)); + out_length += num_sig*sizeof(float); + tmp_result = in_cmp+out_length; + + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < num_sig; i += blockDim.x*gridDim.x){ + blk_subidx[i] = tmp_result[i]; + + } + + out_length += num_sig*sizeof(uint8_t); + tmp_result = in_cmp+out_length; + + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < numBlocks; i += blockDim.x*gridDim.x){ + blk_sig[i] = tmp_result[i]; + + } + out_length+= numBlocks*sizeof(uint8_t); +} + __host__ __device__ size_t convert_out_to_block2(unsigned char *in_cmp, uint32_t numBlocks, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig){ size_t 
out_length = 0; memcpy(blk_idx, in_cmp, numBlocks*sizeof(uint32_t)); @@ -522,6 +588,9 @@ __global__ void ncblkCopy(unsigned char * c, unsigned char* o, unsigned char *nc if (meta[i]==0 || meta[i] == 1){ memcpy(c, meta+(nbBlocks+i*mSize), sizeof(float)); c += sizeof(float); + // float g; + // memcpy(&g, (meta+(nbBlocks+i*mSize)),sizeof(float)); + // printf("%d %f\n",i,g); }else if(meta[i] == 3){ shortToBytes_d(o, offsets[i]); o += sizeof(short); @@ -568,7 +637,7 @@ size_t better_post_proc(size_t *outSize, float *oriData, unsigned char *meta, checkCudaErrors(cudaMemset(out_size_d, 0, sizeof(int))); - getNumNonConstantBlocks<<<40,64>>>(nbBlocks, offsets, meta, blockSize, nonconstant_d, out_size_d); + getNumNonConstantBlocks<<<40,256>>>(nbBlocks, offsets, meta, blockSize, nonconstant_d, out_size_d); cudaDeviceSynchronize(); checkCudaErrors(cudaMemcpy(&nonconstant_h, nonconstant_d, sizeof(int), cudaMemcpyDeviceToHost)); @@ -602,9 +671,9 @@ size_t better_post_proc(size_t *outSize, float *oriData, unsigned char *meta, else out_length = nbBlocks/4+1; - convert_state_to_out_kernel<<<20,256>>>(meta, nbBlocks, r, out_length); + convert_state_to_out_kernel<<<40,256>>>(meta, nbBlocks, r, out_length); r+=out_length; - convert_block2_to_out_kernel<<<20,256>>>(r, nbBlocks,num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); + convert_block2_to_out_kernel<<<40,256>>>(r, nbBlocks,num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); r += nbBlocks*4 + num_sig*sizeof(float) + num_sig*sizeof(uint8_t) + nbBlocks*sizeof(uint8_t); checkCudaErrors(cudaMemcpy(r, oriData+nbBlocks*blockSize, (nbEle%blockSize)*sizeof(float), cudaMemcpyDeviceToDevice)); @@ -613,7 +682,7 @@ size_t better_post_proc(size_t *outSize, float *oriData, unsigned char *meta, unsigned char* c = r; unsigned char* o = c+nbConstantBlocks*sizeof(float); unsigned char* nc = o+(nbBlocks-nbConstantBlocks)*sizeof(short); - ncblkCopy<<<20,256>>>(c, o, nc, midBytes, meta,nbBlocks, blockSize, offsets, mSize); + ncblkCopy<<<1,1>>>(c, o, nc, midBytes, meta,nbBlocks, blockSize, offsets, mSize); cudaDeviceSynchronize(); return (size_t) (nc-r_old); // checkCudaErrors(cudaMemcpy(outSize, (size_t)(nc-r_old), sizeof(size_t))); @@ -689,7 +758,11 @@ __global__ void device_post_proc(size_t *outSize, float *oriData, unsigned char if (meta[i]==0 || meta[i] == 1){ memcpy(c, meta+(nbBlocks+i*mSize), sizeof(float)); c += sizeof(float); - }else if(meta[i] == 3){ + + // float g; + // memcpy(&g, (c-sizeof(float)),sizeof(float)); + // printf("%d %f\n",i,g); + }else if(meta[i] == 3){ shortToBytes_d(o, offsets[i]); o += sizeof(short); memcpy(nc, meta+(nbBlocks+i*mSize), mSize); @@ -813,7 +886,7 @@ unsigned char* device_ptr_cuSZx_compress_float(float *oriData, size_t *outSize, checkCudaErrors(cudaMalloc(&d_outSize, sizeof(size_t))); - // device_post_proc<<<1,1>>>(d_outSize, d_oriData, d_meta, d_offsets, d_midBytes, d_outBytes, nbEle, blockSize, *num_sig, d_blk_idx, d_blk_vals, d_blk_subidx, d_blk_sig); + // device_post_proc<<<1,1>>>(d_outSize, d_oriData, d_meta, d_offsets, d_midBytes, d_outBytes, nbEle, blockSize, *num_sig, d_blk_idx, d_blk_vals, d_blk_subidx, d_blk_sig); *outSize = better_post_proc(d_outSize, d_oriData, d_meta, d_offsets, d_midBytes, d_outBytes, nbEle, blockSize, *num_sig, d_blk_idx, d_blk_vals, d_blk_subidx, d_blk_sig); //cudaDeviceSynchronize(); @@ -931,6 +1004,48 @@ __global__ void decompress_get_stats(float *newData, size_t nbEle, unsigned char } + void setup_data_stateArray_better(float *newData, size_t nbEle, unsigned char* r, + size_t num_sig, int blockSize, + 
size_t nbConstantBlocks, size_t nbBlocks, size_t *ncBlks, + unsigned char *stateArray, unsigned char *newR +){ + + //printf("ma\n"); + blockSize = 256; + r += 4; + r++; + r += sizeof(size_t); + r += sizeof(size_t); + int ncBlocks, *ncBlocks_d; + size_t stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; + int num_state2_blks, *num_state2_d; + checkCudaErrors(cudaMalloc((void **)&num_state2_d, sizeof(int))); + checkCudaErrors(cudaMalloc((void **)&ncBlocks_d, sizeof(int))); + checkCudaErrors(cudaMemset(num_state2_d, 0, sizeof(int))); + checkCudaErrors(cudaMemset(ncBlocks_d, 0, sizeof(int))); + + //printf("ma2\n"); +// printf("Converting state array\n"); + // printf("cmp %d\n", (int)r[0]); + // printf("state %d\n", (int)stateArray[0]); + // convert_out_to_state(nbBlocks, r, stateArray); + convert_out_to_state_kernel<<<40,256>>>(nbBlocks,r,stateArray,stateNBBytes, + num_state2_d, ncBlocks_d); + // printf("state %d\n", (int)stateArray[0]); + // convertByteArray2IntArray_fast_1b_args(nbBlocks, r, stateNBBytes, stateArray); //get the stateArray + cudaDeviceSynchronize(); + + //printf("ma3\n"); + r += stateNBBytes; + newR = r; + cudaMemcpy(&ncBlocks, ncBlocks_d, sizeof(int), cudaMemcpyDeviceToHost); + + //printf("ma4\n"); + *ncBlks = ncBlocks; + + //printf("ma4\n"); + } + __global__ void setup_data_stateArray(float *newData, size_t nbEle, unsigned char* r, size_t num_sig, int blockSize, size_t nbConstantBlocks, size_t nbBlocks, size_t *ncBlks, @@ -948,6 +1063,7 @@ __global__ void setup_data_stateArray(float *newData, size_t nbEle, unsigned cha // printf("cmp %d\n", (int)r[0]); // printf("state %d\n", (int)stateArray[0]); convert_out_to_state(nbBlocks, r, stateArray); + // convert_out_to_state_kernel<<<40,256>>>(nbBlocks,r,stateArray,stateNBBytes); // printf("state %d\n", (int)stateArray[0]); // convertByteArray2IntArray_fast_1b_args(nbBlocks, r, stateNBBytes, stateArray); //get the stateArray for (size_t i = 0; i < nbBlocks; i++) @@ -965,6 +1081,96 @@ __global__ void setup_data_stateArray(float *newData, size_t nbEle, unsigned cha *ncBlks = ncBlocks; } +__global__ void decomp_startup_kernel(unsigned char* r, size_t nbConstantBlocks, +unsigned char *data, int blockSize, size_t mSize, size_t ncBlocks, float *constantMedianArray){ + unsigned char * fr = r; //fr is the starting address of constant median values. 
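+    /* Editor's note, added as a hedged annotation: the stream layout below is
+     * inferred from the pointer walk in decompress_startup_better and in this
+     * kernel, not from an authoritative format spec. By the time r reaches this
+     * kernel it has already been advanced past
+     *   4 + 1 + 2*sizeof(size_t) header bytes,
+     *   the 2-bit-per-block state array (stateNBBytes),
+     *   the block2 section (nbBlocks*4 + num_sig*4 + num_sig + nbBlocks bytes), and
+     *   the (nbEle % blockSize) trailing floats copied straight into newData.
+     * What remains is consumed here: nbConstantBlocks float medians into
+     * constantMedianArray, then ncBlocks shorts giving each non-constant block's
+     * compressed length, followed by the non-constant payloads themselves. */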
+ int i = 0, j = 0, k = 0; + // printf("%p\n", r); + unsigned char tmp_r[4]; + tmp_r[0]=fr[0]; + tmp_r[1]=fr[1]; + tmp_r[2]=fr[2]; + tmp_r[3]=fr[3]; + + +// printf("nbconstant: %f\n", ((float*)tmp_r)[0]); +// nbConstantBlocks + for(i = blockDim.x*blockIdx.x + threadIdx.x; i < nbConstantBlocks; i += blockDim.x*gridDim.x, j+=4){ //get the median values for constant-value blocks + + tmp_r[0]=fr[j]; + tmp_r[1]=fr[j+1]; + tmp_r[2]=fr[j+2]; + tmp_r[3]=fr[j+3]; + float tmp = ((float*)tmp_r)[0]; + constantMedianArray[i] = tmp; + // printf("%d %f\n", i, tmp); + } + + fr += nbConstantBlocks*sizeof(float); + unsigned char* p = fr + ncBlocks * sizeof(short); + for(i = blockDim.x*blockIdx.x + threadIdx.x;i < ncBlocks;i += blockDim.x*gridDim.x){ + int leng = (int)bytesToShort(fr)+mSize; + fr += sizeof(short); + if (leng > blockSize*sizeof(float)) + { + printf("Warning: compressed block is larger than the original block!\n"); + return; + // exit(0); + } + memcpy(data+i*blockSize*sizeof(float), p, leng); + + p += leng; + } +} + +void decompress_startup_better(float *newData, size_t nbEle, unsigned char* r, + uint32_t *blk_idx, uint8_t *blk_subidx, uint8_t *blk_sig, + float *blk_vals, size_t num_sig, int blockSize, + size_t nbConstantBlocks, size_t nbBlocks, size_t ncBlocks, + unsigned char *stateArray, float* constantMedianArray, unsigned char *data, + size_t mSize, unsigned char *newCmpBytes +){ + blockSize = 256; + size_t nb_tmp = (int) nbEle/256; + /** + * Structures to return: + * blk_idx, blk_subidx, blk_sig, blk_vals, numSigValues (pointer) + * bs (pointer to blockSize), numConstantBlks (pointer), numBlks (pointer) + * ncBlks (pointer), stateArray, constantMedianArray + */ + + + size_t stateNBBytes = nb_tmp%4==0 ? nb_tmp/4 : nb_tmp/4+1; + + r += 4; + r++; + r += sizeof(size_t); + r += sizeof(size_t); + + r += stateNBBytes; + + convert_out_to_block2_kernel<<<40,256>>>(r, nbBlocks, (uint64_t)num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); + size_t to_add = nbBlocks*4 + num_sig*sizeof(float) + num_sig*sizeof(uint8_t) + nbBlocks*sizeof(uint8_t); + r+= to_add; + + size_t i = 0, j = 0, k = 0; //k is used to keep track of constant block index + + // printf("before mallocs in kernel\n"); + checkCudaErrors(cudaMemcpy((newData)+nbBlocks*blockSize, r, (nbEle%blockSize)*sizeof(float), cudaMemcpyDeviceToDevice)); + // memcpy((newData)+nbBlocks*blockSize, r, (nbEle%blockSize)*sizeof(float)); + + //printf("before mallocs in kernel %p\n", r); + r += (nbEle%blockSize)*sizeof(float); + //printf("r: %p\n", r); + //printf("%d, %d, %d\n",nbEle, 256, nbEle%256); + decomp_startup_kernel<<<40,256>>>(r, nbConstantBlocks,data, blockSize, mSize, ncBlocks, constantMedianArray); + cudaDeviceSynchronize(); + r += nbConstantBlocks*sizeof(float); + + newCmpBytes = r; + +} + __global__ void decompress_startup(float *newData, size_t nbEle, unsigned char* r, uint32_t *blk_idx, uint8_t *blk_subidx, uint8_t *blk_sig, float *blk_vals, size_t num_sig, int blockSize, @@ -1043,6 +1249,8 @@ __global__ void decompress_startup(float *newData, size_t nbEle, unsigned char* float tmp = ((float*)tmp_r)[0]; // printf("median: %f\n", tmp); constantMedianArray[i] = tmp; + + // printf("%d %f\n", i, tmp); } //printf("after constantmedian\n"); r += nbConstantBlocks*sizeof(float); @@ -1068,6 +1276,56 @@ __global__ void decompress_startup(float *newData, size_t nbEle, unsigned char* // printf("nb blocks: %d\n", nbBlocks); } +__global__ void cBlkCopy_decompress(int nb, float* constantMedianArray, float *newData, int blockSize, int i){ + int j; + float 
Median = constantMedianArray[nb]; + // j = threadIdx.x; j < blockSize; j += blockDim.x + for (j = threadIdx.x; j < blockSize; j += blockDim.x) + *((newData)+i*blockSize+j) = Median; +} + +__global__ void ncBlkCopy_decompress(int blockSize, float *newData, int nc, float *fdata, int i){ + int j; + for (j = threadIdx.x; j < blockSize; j += blockDim.x) + *((newData)+i*blockSize+j) = fdata[nc*blockSize+j]; +} + +void decompress_post_proc_better(unsigned char *data, float *newData, int blockSize, + size_t nbBlocks, size_t ncBlocks, unsigned char *stateArray, + float *constantMedianArray +){ + // checkCudaErrors(cudaMemcpy(data, d_data, ncBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); + // checkCudaErrors(cudaMemcpy(*newData, d_newdata, nbBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); + float* fdata = (float*)data; + int i,j; + int nb=0, nc=0; + //printf("h1\n"); + for (i=0;i>>(nb, constantMedianArray, newData, blockSize, i); + nb++; + }else if(state==3){ + ncBlkCopy_decompress<<<1,256>>>(blockSize, newData, nc, fdata, i); + nc++; + } + } + cudaDeviceSynchronize(); + //for(int k = 0; k < nbBlocks*blockSize;k++){ +// printf("%f\n", newData[k]); + // } +} + +__global__ void print_newdata(float *newData, size_t nbBlocks, int blockSize){ + for (size_t i = 0; i < nbBlocks*blockSize; i++) + { + printf("%f\n", newData[i]); + } + +} + __global__ void decompress_post_proc(unsigned char *data, float *newData, int blockSize, size_t nbBlocks, size_t ncBlocks, unsigned char *stateArray, float *constantMedianArray @@ -1077,15 +1335,36 @@ __global__ void decompress_post_proc(unsigned char *data, float *newData, int bl float* fdata = (float*)data; int i,j; int nb=0, nc=0; + // if (blockIdx.x == 0) + // { + // for (i=0;i1) printf("data%i:%f\n",i, Median); + // for (j = threadIdx.x; j < blockSize; j += blockDim.x) + // *((newData)+i*blockSize+j) = Median; + // nb++; + // } + // } + // }else{ + // for (i=0;i1) printf("data%i:%f\n",i, Median); - for (j=0;j>>(newData, nbBlocks_h, bs); cudaFree(stateArray); cudaFree(constantMedianArray); cudaFree(data); From 5307d8e4b9c6a5b24ca44de0e87970f04fdcfccc Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Thu, 9 Mar 2023 22:00:09 +0000 Subject: [PATCH 056/126] add instructions on how to use main.py --- bench/qc_simulation/README.md | 34 ++++++++++++------ .../3reg_N256_p1.jsonterms_Otamaki_120_M30 | Bin 0 -> 221346 bytes .../scripts/http_unzip_on_the_fly.sh | 2 +- 3 files changed, 24 insertions(+), 12 deletions(-) create mode 100644 bench/qc_simulation/data/preprocess/qaoa_maxcut/3reg_N256_p1.jsonterms_Otamaki_120_M30 diff --git a/bench/qc_simulation/README.md b/bench/qc_simulation/README.md index 354933be..7e0c1877 100644 --- a/bench/qc_simulation/README.md +++ b/bench/qc_simulation/README.md @@ -1,18 +1,16 @@ -## Advanced usage -It is possible to glob over inputs and vectorize over outputs -The globbing is possible over remote files +## Examples -``` -main.py process \ - gh://example.com/data/*/*.element \ - results/{X}/{in_file}_y{y}.r \ - -X=1,2 --Y=foo,bar -``` +1. generate or download circuits: -The parent directory for each out file will be created automatically +* As tar `./main.py echo github://danlkv:GRCS@/inst/bristlecone/cz_v2/bris_11.tar.gz data/circuits/bris11/\{in_file\}.circ` (need to unzip) +* Using http and (unzip on the fly)[./scripts/http_unzip_on_the_fly.sh] +* generate `./main.py generate data/circuits/qaoa/maxcut_regular_N{N}_p{p} --type=qaoa_maxcut --N=8,12,16,24,32,48,64 --p=1,2,3,4,5 --d=3` -## Examples +2. 
preprocess using both of `greedy` and `rgreedy` algorithms: +`./main.py preprocess data/circuits/qaoa/maxcut_regular\* data/preprocess/maxcut/\{in_file\}_oalgo{O}.circ --O=greedy,rgreedy --sim=qtensor +` +3. Simulate: `./main.py simulate ./data/preprocess/maxcut/maxcut_regular\* data/simulations/maxcut/{in_file}_comp_m{M} --sim qtensor -M 25 --backend=cupy --compress=szx` ### Easily manage simulation and estimation results @@ -41,3 +39,17 @@ This shows how UNIX utilities are used to filter and present data. In SQL this w - `.txt` - gate sequence as in GRCS - `.qasm` - openqasm file - `.jsonterms` - json file of QAOA terms (`src/circuit_gen/qaoa.py`) + +## Advanced usage + +It is possible to glob over inputs and vectorize over outputs +The globbing is possible over remote files + +``` +main.py process \ + gh://example.com/data/*/*.element \ + results/{X}/{in_file}_y{y}.r \ + -X=1,2 --Y=foo,bar +``` + +The parent directory for each out file will be created automatically diff --git a/bench/qc_simulation/data/preprocess/qaoa_maxcut/3reg_N256_p1.jsonterms_Otamaki_120_M30 b/bench/qc_simulation/data/preprocess/qaoa_maxcut/3reg_N256_p1.jsonterms_Otamaki_120_M30 new file mode 100644 index 0000000000000000000000000000000000000000..7485593d6c2f70fb2d093097dbec69fcede11938 GIT binary patch literal 221346 zcmb@Pd038J7xtT_sAR|xnU!d`=bKU}L&-daiUumuAY}@bjG0Rzq)Fz;JVnWnvCO10 zL`jAeD)RB{-DjQecN~uIkKR8V@8S75*L7XzzV=>g?S0?(Qk7^$1x1BF|8t5mjMVlI z4Dj$U@%0Pz^6^^W5fBxrJbqe0)aEFo(5R5#|Glw`m%GR8K(E00Q4R_!k*Y3pep6BQ zO27Z#>}ft8zi}@Yv#3Z7m*4-&#l>^l9M7oL>L2b88kRd?(U`fep2|$eQ_Vq9#j}yU zlBfEge_BWBVUj-2yfyb+vj(g&!Q}s8^X%UTB#qa10!*VGQ~CXP&Hm%s#?7l- zaPzwgaGI23KAb4BGgK<_9Rgev$}uBc+PHRJrF?7`a7`)4oalHv_=J+_>D9n#QI1)m z+O+zO`S=Hlz-d#Cc_CY|=xfu;cAmiLP>z|gs?p-bVmb+f(gJGm!t zx|Cyf6rDSn-YbV6*z_q3v^CP3Xy#2*_ZC(S{hH}i0 zp|y6N;_vUw?A+U7yXswWA#KBWZo+<_h=V?!f6&j`>kOJT1ENQL90~ zwWl2OW79y}M!C-(;cL`^a6;yX=iplgkNeH60zCuDF+Yaq-csFl#uCS0N6IljV%P3d zyPr_226~++$NUg>Te{Eo{x*N$I#Z7M;r4b`+UW_=nZR|S9P?wovTaW9PJYh7b)_8h zBR}D1@1dHuI16>79P`6yTWX$WlH)_r>rOf5$0fJwd(S%*drFZ4$B=F@v*Xg-<^pFzIp#;RmFG_zE40}QoGIm) zA0M<0?X7}V-3QK$a?Fp377mL;SLJmD&YW`056gwKF5bTzdIvZG;lzBta-aS3M2|Km zzzHeG{5TyDK5hB-F0+9XQI7fX>2kL;+Zoa0fD==W`EhcgnXsk$)v3TqD98NJUt2p? 
z=el1Ma8k-KKV06=`+4HXr;otND98Mmu4^;y-jOdlz{x4c{D_WoNc6Wk)gHJ$l;iW& z6IFMsi3q|N2$Lq~2YqM{CYYdzP;UvtDK*!l)w-L{?MF_u2hNIe%n$jteQh^BPek9WDaZWq5%)X)tzAhS==G-@^J9$h(b69C z8jk{Q0Ogn;X&<(#wHV7A;VK5PqcHk4z2Xu4(F@2h&U5I9@PF+c3W zt&bFBT-gcSAi_zRAD1T`u!so`&;)KU<(MCZ;ZODBW7GQqH-vJ`kA5i;vXK{O*#I|` za?FqPa|w+*YZsNFs^(QWaPIY->YsX$8FIz5V8qZNfoPlufXtCZx75n)rap5(aXNs^ zlGMhMvu~$nV5L6;K<3Go7oKhFE_dAxik<*6Q__t3>PfC@VKM3jAaiAU{H5pXD*8Bs zqBnrdmZ9M~_n&IqdP=h+UMx$fTnwRo0c6g+kq>L0>vn25fPMfnXA~U_?IhXna5D4`u#B7oAamx@+zrvk&TC=a5(pr3ru18N;w`<4j&L(` z0c6f}=(pT@P~CJKxIq9iXT1GR?#$^iK^+u>0c6hTjlE|YXEgH!<|3wABg#a>V&R%g{cEPT)1i(cAGG}&f z^2}Xz#6k>U2!PC)kIj{GJU$D309*_pb7s;8vGRaJz0txY05WG@JSZw2a^G(nC@uw% zIa96B*5dbo0W&~e9qt@7T8-U9JWX|lnH!dUf-cTIjD*$B9489_0 zwdU=~v!J*VK<3QsfJ4fbt!Lt^unIut%qbh03_K<3QM$aRk_b*JMTz6L<% z%=sVpE=;HkyaI}m6y$~c5xvAl8_(Rr1a&Qd%$aW!ZP&Nd>~<0q*8#|!8G6W8y*Ot+ zc5T-K$egJ>IG)b0&PYPUERp2R#OGD}c=8TVQNz)wXCD@?Y0U&ecaONma z>-jaRaE&_wWX@da60~R1xM~c&3qa<~lCD{EzgM|p%^6QYUfg#+{8-<#?^q=Wy&FL0 zjK{j|)$&-?Q~(nIWX|ll7@w^)CPx>*L;#sHjaE;y3zhXa0ALb;%o**ZrC;1)^>COa z1IV1&F@Jyj)P%>kK`{kD=8Q|tj@EnUv~vP*4}i>>xJ{G7%PU-PRY(PpIU^}tapIFr z_C`?L3m|hQs&V}48|uOe0QXUlm-##HDa?1@u=hED`vGLmY%o6NaY*?vzB2~^WX`<2 zXz?U_YgGa$9t4m%^J)6s#Wte3dH@~*kU6t*>W$YkRhMIWeHcLIjF!pnqwS`K;oOr3 zAalmrwyLinH{23J9|4d#bH1@mshQ_f*WNY>5@l?GzX7*cV~?0xHZWDUICCf zuXyH`=nKMoL4{QH)UmW%dt`ShcdjKYa9tA4V+Bk5p1IV25TXCkz%4jt# z_i_Ma&Kz}LQM}5^Xg%D_4FH)lhlV_!Y%uKkLjZ3A$ef8vm|i@&r4DZ8-vW?1bNZ~r z@YK8|9YFCmfXo^Bnvpio`q$P1cn3h{OwU@y>3v?0TLs`<0GTsV+vZ^t&cxv)l?xzq zCc7%7I%C`T7ND3%LEZz1xW9B&MdF2^>o z0+2al+0=jBvL{M@0KNc_IrF{h$;6A+590xR2_SQ(H2n3_cV)|l0Qd?(=1i>DI1A+$ zrr3#l4Ip!7W$$&)fhuw=Ns9qw&KTbCvMo65jzz^A3i2L+bITt4AAdA)fNOjUAakar zq99p3^~^;8O8{if%*}GSVPQBI%e_(nnKK9M?dCj-*^i3v0A$W&sg=~UdQ*XuR2hKG znR}Ht*Pi;=5=(&h05WGn_rIIA@JqNI+)O!u%$ZgrX5O_nKZIq<2LPEfN>6@9ol*(C z4vHTEWX_x)rmvv#_8qR*pD4(C0ISnqD~rc&#jN==fXtZ|1~+TQesWfY&=mkOXB;LL zs?ChQ5DVZJ0GTuIhd7oV@$fDNuo6J#OpnE1t}T4M5sQi{0GTuE+W*+c@Mzx%OZt6QP+Um#fb?3syxd2sv z{mQ;B>^3_#DpT&&Q2}xTd)2>M>>ew-DK4I`xCzz(d$qq58+ta*m2qtj*js8+> z+(o@m*s8H{Dy*XR>VGLV=z=~5^-J+GdH`CD|59wkh28jaR?}3!8&DdYQt_=nhFE*; z!hUJ5yUPI6Bp7@8HLqHgJiwj5psSW|+r=kGXb>6r@W)X9Kp z5sVq&WVBG4XD^0kioG_$m;-(nr=3weRt6{h>~#pnEcm=}M%NVy$05tKZ$>cY!4Cfk zdjcfZ>)_gS3C2w5^~z#?jOq{Aj^0IgC$>(0r9Hr#v8N zOM)>M_OyMr-5_=l%$@eF2*zBf8|$MpGBfNBsOb@mxuDo2TW?^?jS~TDO)%y{WdCa` zryPH=2e38-V=k!czSdozbSeU{wgh7?%$6oOzwe|l6R>s!V=k=uFsf(yl=+aM+UpaH zzY4P*)eJAa@dvf`1Y<6kjL-?WW}aySScktD&sWZ7S8Di_-dqWo0l}CH&&BfnMGw?r zSH->~!I%p-D$n~oNE(PQb0>l^7oO^rU)F0ZnFm2T6O6e~ctz6m%3MQlz`78Ox$tJq zJMY)Yb6`WvzAM3)3mHcKWrqs|utj3ujbO}$I^`P)#__+*AZT}jF&7@Vx8HEI+1>tt z^&l8?!FgqeefN$B;Bzzli}BnfyEb*?>b4mIP%|PJbHQTW=$I-qQ!I%q2cV1YQdw7^NTw8B~F&EB;ou9I)*$8|W zObEtYC^s(dueH3_O;9r>7;`}(=jYDVhcuuvw>KjgbKzm`hbLi+Mwfz`Il-6R#$0%E+*~C$W)JMN*^3Fr zT$uh~;_M$uckhClgka2thp*mD)O5T9skptAV9bTnnwG($*~e~ynv7t~1=E7JhvG~O zJ^>~t7<0j{<*}=IZ{EYXB>O%DV=hRan)Pnl!U=DoFTt1#Zh0GTFDNUIgrF9GF~09$ zcR$I-V1!{9V3q`9F3dZm?R<0e0RzDL5sbNDHaj`%lIt%aU{(ZUE}V?Jtf?5+ega_D z1Y<6g2g(i|)oTI;x_y6wF&At@s|tLQcS5OQKY(D&g}M{-BQ_>2gf6H3K!PzBLKYNX z5R5N0hikJT7<0kUYW+dqBWBPkwznl1bK&GS)BP=mmtF<6L4Ps6$FS&2=XWpLNBaOa zm|)C>eLEKTPf+-71=tXRF&DNDzFO-m(J%pQD8ZNu_J_R>s;Qp!1I&(K%!M_dWJAn% zt=J5hJ;9g@nksJxs#=^yvxX6jxo}En{jTpGukg)sAQ*GueY3|&9g3bef}q0*#$1?X zfA?vk-FuvSM-Ys;P}K85T}j{OnAwl~i}C%0yKSq&=Zm`I+&hY3+=Z4MUj_B@fpc>9 zqY1`b*w_2>!PhsIq1qUNF&8%7y`uGfc_Gx6_G1agTzE5Si`D3ly+6YP8AmYYLMsiA zL*r{ljR9;t!I%qQI<)$n=J}%|U=s+&Tv(ISJu5CpHwLhY1Y<6IjGZLksrLOIV2%W1 zE}V%ds7_e@DGsnne=)vyaA(o3#X@EOOu!}+jJZ&FC|!ERD6$B!DFkCKs5$k0^~H6z z8DLHXV=f%mF->a9=Dbe;-o?i^G7 
zQ?;?0dD;f;14DOd;l+7Sj`=W9z3rzfKQA-^z3G%=MzrbvzK78p6C6u3D94--7+dW$ z_FKFb^gJoYtSA;s`daDt@B+?@a?A^9`hZ@AS1U0y_of^(qn#+QXzttTI67xij=7Qh z{(#f96dhcpXHkyXaeUA0p4I_N`oQ)1P>%Vb8DlvAYtX2-z|AHc-yV4F@nmjh`Dbn5 zd@0BL$T}3}Z~5&SX03jdV}7W7t1RjB^Z8=X^QRp1qu{C0$S1N(I=}@`j`S6LZ{GgKhum>+u&WT@tJlEY~u`?-{3eoX0Oc;b?-GWK?YD98Me zl&KDMjn~C2A((Q^kBF`JA1y41>$$Plw*F}RA|&8=59GAWh*Ji{4kCC zrEq^-K^us>igL^k?~~D<&n${yhs1t0<(MB^H$={FEDtyVdTS`h{P;NBTz{;_gYm#c z5{}o{p8fkgu9(~e+XQPV$Nb128J|5&7}*2#)=`f6F(78Sw%Y4CbAek=Ip#-^X#L?I z+Wm0Z+CVwxM|k4ng*lSTTA;U)a?Fpc^Tt-&Vg&Dj+eA6$M@!-Et~x7@V!vZE<(MB1 z$teR*eU#wz97Q?iM_|F0i%k+2&xg3tlw*EWhw8-jNVLJ0#}>lzYP-4Rju&r+WLSV+ z4CR;~p(mm$FZ-6P1}>Iz%#V&WU0$E9UXuXaR?0Cy=z{)Sa$2nS|Be4$3hu<<(MD- z$>+|!l&|;%+%C#7KZMKIYnDenJPKSq;dtHcJH;~Cu*2d`!0o0S^W#+B)SZc~-Y5Z= zKsn~eJnIWbD{a061D8lS=0{k0vyvv?=4AqxL^zT zzQ@yNw8!*xAK`cfzV+!<$@y)0o^XBpDaZWSJi5PHAOGyHz#X6*^Fv#uV9}XZMj5~z zq#X0Z=;aUBw>w_81?~{#m>*9jCaMKRH^c7iVahQ-`VUiSS23(D&XZ}BV}6Xbmc4I0 zH0uY%JwiF=hiidlO8T_jSlb zYkqB)srIpbpm&^d%#S-MDO0mP9p4Px3Cb})HjfE;r{xzm3b>P$V}5KiGh6K$q=#wt zDatWFj`<}FDvWR)3woz1$NW$XtvVv8Vdu2?@IKKLSBRTQIp&8~ZT73}Kj-5* zm_<3}M|j2Vd?hOl-1fOlIp#-R*R9quot%$B+-%A*KWeHTn62A)9p}j_lw*FhuhO=0 zvD=R8)>X+hlgR{DX4w(Yn4azY;0xXZk73!MU19y{h%#WKh3j6P#{X_uVEy^)J232jWov?S- zCg5&Uj`=Z8=#antM_D9rcPPjFs5@ufzi5&ZtE)%UZ`GxJPLdr2esvjy1S3TJQ>xv@E zF+YC!CH0(=Yk3mx;|1lIABzq|?kPF=;x%wDDaZWSm7QfI)8Cy0+$+j4KlV)S; zxAqKnRL%pggmTOeiO4#uZ~DQlz?D*t`LW+SGwDjXZ3S@eD98MGn^9O{c1wukxr}nm z51%tZX73J<8VY*vDaYRrk0ocUr#s_&R8BeOM}Ak!)|J9jIIn%69P{J3hUX|d!*R|K z_ao()AM%7sr!WQMFTi~w9Iww`bu;>YWB0>i;6781`B51A(=T+N_bcw{u)Ya9iN>^X+p6jiS6+!>ZTheM#qwz2wK_{x$sTS5Kc(1s{VpgC%GgX|`K$V2=QXB~F*q0#MVUE!JWER34~QHRG7A*%Q$-1T;G09%q#sjR z2|WdeG@>Zy&#~(?L-pgHo@YRD1W^?7=XhhhYQguE54!=Zs8>`A`n}7?^H%+hm#%cb z{ka8@O^KvH!k&0d8-s|2+rF0p7+No>8ss9Avv=WQAN7ZCpPT&va4}I7%Gldg{n+S~ zujVa$X>Jomxs(l`?@Gp*=1r2ufTB57G-JbO{lXrZJNyH^0rVq^GCl*0mpF=we}=*8 z?696H@;PPg;`_6I__;&d-eCe&6tmDfQ@@+MZhg`mLZ=c%k$~OInMd~?evCSUk0+TZ z%7tt|2F>t}9qCX3E40HTq9_!x0eLgUdEemRFjxW|PEtiN8<1w!nSqjPTXf(WhfqZU z8<0OrGL5T4>eK=3O%!G3>>B5v5-xZppP>R^9#IrY*fm~g61yvDXX-%!CsIW*1D)b@ zZ02l?hCYMC9;zr{p?$(Sp13=5;X_c2CW>-%HXhYlH(9^QW(Q;@4&FpjC}HDKezkD8 z->%SPP&6lsGOl>TEjRm#s0-wP4!4P-h$|-E-4}9l=aGC+lu|{0Go$?awT{Z(qYU6R zs>mNtzTeq*8GSWiyTsuUQIyNr;|UQR8*_el(tS`gB#Ht#d*I_V_3RXokqiFLg>Mu$R;V*{lCBI>Qy3C!w#oFku3|#l5;(W`?@>;#q~r{&L{D=%7yE; zti06-KtG}=&Oc`k*rsq9_wEFh^-Z z-Y~O0nC@LCiXt%!ecLG_N3;A%7KC0z73ECP!N<{Mw{Ra$_TfZPC}WE0T8|nJv6sFA z#g0^wA029Y_Hba(VijCx3W=gf${zT6>ljIL#|P~|@mswjOHvgJr$spEO0lVz1rO<7E7t-mt;%kVzCpyfOw>J*EyoW1=YF zX^QLi>y6*|32~vBP84OltSEO;>Aqm_a?Ikk5=9X&RHjXm%?o(el4_i}4}Ti3h>P@gCY_)I#g^YNq4eI2?3C?<+B0lS&08%;}>Yn2=TFo!6L z#O(20R=Kafez+|bd?odY{|#~e)+`#ky*BsY5zH#|h@^nOHJ+ujGy;!j4}sX@>LvdS zaWTsQ9PWuf*u6dD3*bJYC=#$7po`|=>n7ex)&OoGigLa#d@`w;b58#ibb}mxiK39N z3kS}bR0dC)j9KnsswiUDXkHsJVc)7q*avqQNfm|c8tcLjubSj#hG|?|q9~QK86bIw zSAPee$+&hE6Gf4X%_$+WjT4U73Kqj%&mxL4DGPmhwc_SnXr*;>GTJ z=i@uh-@6u$14$JkDd6{QUwuXr++86C!FtKx7b)e9DPx004#RfzDh9BYD2jPwN=>8e zb?4_E@4Q;;&atMvvxBmtF^f;wGXf=W9~uOM{#0F0{sM!G|ac`6_pzDC%HQ zQMD5&9;b>THhLDjm>R})GTa8B9aZF8EQxXL(EXkA1Ne?8iul2VBT5+?9v)7u0?>je%6L;_WUa!;;fj&4qu`K56?vP)E8=eK zyz3t@&;3-d$d+t2=>{z=@3zwUbTq`)CXxa^&f?#8jyG-Y`x3ww^^$*HY~Ji%IQD11 zH|-2^0W2emBHljN>AL5F`nWTh09p}68Be=P9&Q_+608FoCvs!Pey+P`X=DOm z7EzQ-*xR*#rd;yrfE7+Y7DQ2K&feFdgKv&2v5bnjU z=Yt}%@RQB-mXc0vze?04tD0u|CWvmA9Gf@=r2i|7k0_m?JBiv~# zB8qa}kSl1FzxhDF#W;%_5JdrR$X(v<5k6hJIrjM1QANJVw`ybV60O09yx|)CiK38i z^7-CZ{rPOS?PvfEh@zBF&V64dNMn;0VgXx36h(Zx4+&R){%x_(SWpZmiZXsPvbf?_+ODByb~kx$P*4fQ*( 
z3E+OJC}c^gW9If<%e2)o!yZc%h5U4lo(UvOmRQ#Id?6p5^tBpbN3{~W*c>L+w^Y$Vh`Xvs>rK_3fH0=aYHq+ z>{>w-g}hq0^mIYdyK`Z1>d#>rQIzv0tKFBq1q<}Y^`wZ7xaL)A+E=lDe$p`sXS=$y9 zy?jhTQkP2dL4G#dZ--6sb$lDF>m`3bNg3b!jhr=bjBK*sB~Z*DiXy(7{KD__IqTET zRsh~2igMn;YV`EYG40xPEV~3mQOJ8@ch#C?u0M1EH#dWbqKv0!I85Gz;Ar({c>qJq`6_2}QtT#RAa|RS8L{Z2UHMMN5-VA-H z3E)hkDC5(V+q$z`1O}6P02oCSMP_W`leez?I`hWbNdR6UigG@3pHhl5AA&OGA!Pux}dA4rxWK16THjKlZE? z=iEa?P|i!Ag8WV5_YNgvAo3`pDCEVIUB}XX*D4k@1~8Q>@A5oO?W?aYfyId?BpJ0pj5>XVfW?Uni9ea=WJUkg9ZzY0q-qCpY z%I!$w=58qfPA7sw-Xj@fU$-=Nc?>2M8bndb8)DDPx(w;nrFID@RuM%pZ?T>8SLrl0 zs}Ig)-|H3G9M1ao3f~_u-``l{Do8dWk^;U7(Q?m$Y0ECzeg|+{z2u*zi!X*To6ntD zvqc|Y84sc;9!IBUR*Q-nO=DlfF9M277UF zRFR(vHXqUZsL7&E{-Eea75ORSs2>5|I|l1d1yDc~W&Chl@~6g)4O<#u_wyD}6!F7x zF2#}`W9I9v2SsnH$XB>?t@Bn)Uv?A|!)U6=*USU24F+2;%EL0Zv|f?rdu;9r{d#Za zvbMr;5W5wT6!3wjATVm%J9BGi08i9Q{xO?340BWTst3%F;i@o{C<=MQ@agZZ`#YLx z$3W4SD9U)g_qJ|}>zjCu8~`^EMG>!@jeSN4tZ(=q1u&B+%6U8f=XLEH&IkNf0BA`R zh5VRtFO>-=mwX-g48Xfok)Kbo+P}e2X}9W208ObPKc6B??;BX{oPb%%ZmP&1k715m zv$GmiSQ3z~xaEgtjM&GCuDq7%jDX=-we6z+*&F#AnjhJ5tPt8c&P{ z@F-D~^FzTOZG+}-9IJs-t^-jN@?*x@56UXK_s+yF$#9}5?y4$iX_b&seOcbSj-}hz8{UZUUpWOibK@`ROe1U;czGtPS3eMt&L{Y{;^@Cw) z(^^L3YxsyLig@S=+D}4K7wp5EF`pjo?ss($9XViN2chk#B44mOr0CXlNvuT+k5fgyU_Z4q zn^QaE1un0riK3jJjf;MuqvK|H1W$<#B8ozOHcqkQu`2Ds;yG}QONgS3XCIE0qlc~X z96JTTrBsohmujB=am|L}@mPL5Ac}I{mYO>(s7kqeq8cdnB#HvwmWsPK+hg+C9j^e~ zNfr6cOrPjxJ3qQ{djO{qMInDY56>1kWJMM|2e2zql=0NnPjOfO^KIX50`M786!By@ z@0xq2^PU=zSkRzgqQm z?TvF!H=-!u^^9eIkMvZH+XfI?ohVBAj5{;dFrp@A(;5K35k)bdaa{@)+N>ygk6VF# zh@y-;^M3jMdn03?<7}2q6h+*bBQvdbIjwicl=Li7l=EESLh+$(9X>`G!8O_tMIp~V zdVlMjwKvIcD1cpvqKx0n$tQOYUfbFIEP(k$QN$llzk}t&ew|jwk2Q?JM}aaQmB z%*x)Yvuw~)AXSN^fIo3V`Mh82AGXBBu()3GPjbc^)5~6`4=f$1stAg0sUmN6dwJ-d z5m>0;-fMTFDB~4mnz+1ip|W3FP<%oZMZ6|`ZWX?1twYab05=mwIq$r#Tz%t7SVt}F z+s`73Lf&~T&%86XOQ(QQptzVQ%6Lc2d6s45prCFzONCNJ-eYs9@>Wip(GG9s2vL;t zvchb~(ul6tMz4aJ-KR8-%6lFZyQ9M_M1RFUs>l{fR;epRNQ35w&WBH!zJoIUHl z#OsRyz&S)w##`NwW@L@OcyaAx03)d)Z=NUpG90W~q7?&RJW-VM<9ipUn4B2%enKpO zE>w{p-!ooyt9bfmS0@0c5Jee3uQ2dK|H+9LKH;uUB2g6a^9s-1OFmq*Gsb-8L%kxa zXju7rt;_zbP8-~@A=ZpY3iyB=FsF@%_@D+>#QF7-e_k5C#kXyWPQm&IXLrKgbfJoT zi|_GJnP*Suy*LFp6Ga)XKiVnITX*WE3)UaIiK2+tAGvvUZX;J(Vh#D0D9ZU^fKmmg zw;#TL4TNiKO%w(EFhJ)lQ*o(0sSdzNL{Y|D5N|(y+ZJKhZ90I-L{Y?>8k2M;xb z8xu(ZPn@l!#|P*Z)!?y#HT9Bz%;wYh^{&G;=Pj?W0mVR~DCE=l{oSeN+p|AnXTgOi z%6OYZLFl2aH>&XxP>d&vB7Ro$$?f!_5|1Nj&3U3I=WUYN^TbE*C)~kVd;n1t^2SSY zO~*kNISQ2!dOT5-@k5=9>}NHfF}&+y0QV9_5kDsC|E;yp@u;)b0Di1jWbYx{OTH@c z>et!5YypruL{h*XWXR38%9p=&n*d-zz2tw37yH1dV4BC$-mM(481F> zycu@{6nhdy8E+#;?)y4weq&`X03Q%V5pPTvdaBLrIO0)109A>ilus(sCoVbZ-9t74 zSVI)WeBCT-Wbn8ox-GsnH;AH~7m-^1`zNi|Ino6bMMP1^%hGGN?hb8}oLvFnE~?0r z_MC{5cg(8YmjO7HD)KVGW9uE~LlLX6=VL|`Wqjm57@AO4VItcLig$>jh`$R1`nE6e zXq?>}z%Z)F%c%<|QoY-3)WrsW2vHRB0xf7nnp&C9rskm7nkY*7?sOaVx4lC3&us_r zEm0Kl{q9@MD{nLkYKeV-fkaWpcT_T^GoqjMyq5%u=cppzQORjD>u~p5wJiWlB#Lr= z5@DK|#nq9%Gw{WpLKKDkY{Tf#xz=7L&2fe-q>6k=I%Q@e-t8KJ6J$rCD5ziL&b>;= zuM%QwdOlH<@s~?;f_i?M%H!p5Gs}shkRMsFS=_u&p~qcplqwKKDbMcPv|PC6+-sFM zQ2a#{#XP%D^_ZD7b@HMw0FEGvGQM%Q$nkc{v~K5i0+>b=MSLsJZ=+@El--^z^JZo2TwS9Ua`yl2% zcZi~#=M{T)4UK!JZR-M|&8Q;JE0TvFIzK1poH~GVqA265+)BN>=Nk<+$p!EVQ55lY zb6%Ho?Ve6wi*>+NqA2I1V(|D|O1IO0%>+dmQ55pSADw1+teCJ#0~3|0L{Y}8A0L$o z!@^Zo7CDU#1Sy*|5&TiF_}@kHQwv}Ae!Xkx z{sbFs%0yDYPc1B(x7R!`>3=?dva(+Czm$(9k-mDh4}Y}li(@>PC<^&NGwhdoxbMI< z6X9-}5k)B<0~_FX+7P0)SVk zB5ygRxgVQ#riq(3fX#`blrM90T+U54?|6DBfbXdyUpRXf+-!3@x&{l*H$+j+*X-M6 zZztt1tT_RSZHS_Pui1ODdQNDh^6D~xFF}zx)5+y)<)C0toJSOeynpsQydvtb(*0ck4j_s$Ug6&hj``wJoP~!1&l5!v z?-^J*ZkVricH$lh#r 
zE~mP$;E?K)L0IdyAd&)p+fBM0T@mN6IexCIhoPskQD{_1l%c1ZgLb{>@7G4k`wL4V zgUgpEyvMe|C!#6h{e>%?OU<6=9J>UM@-b1A^R9gF5@BbFhvjSldk{qd@AF?MaCR%W z93};DAyJg^^e1>nM%EaO?Ktf&BZ@+v{&c=GSU+gtphckQK^1u=&}96A)WUP!3INz^d5$g`TY z<1UR&+na{7%4DJ_x12wK>RC`!4a>Ab|jiRBlkfnq696miAJTCU|D3L|h$I!F}dym{xV zR&_n~-B`StQl-D&xG3e5#e(?A?1!$ZpCR;l3bJz@jeI70T<-cExf6tw@RjE4uy>nY z#O=hhLn##G8~rufq7(f}UCSVH90hsW{!)AMhr5RNv8oBDAm4Xh*f!AUVE7CiWuXAF z!w7FaBo5SEd1N4j_NE};I5=5wYJSQrZ5-g^2`J{{HaGU+*r{&~%RzA;0VVt;oLQB8 zsz+;A>`rSEP|PQ17m@DNto@z7f#N6%^32)Sue4GBjKv)Rlu?jBN5#IAGOv7YJO)62 z0!sLs;J!cR!Tb9<2)-bol<&ZHcYKmQ;>S&Vx1IvXIx5Axa=XrR9#*HVzz?e2;t z;YE#Z;A%LTfD*pGW~>w{eRm6e3yNI{DCQOH(>k4N{bnA)_E2vM@@CiSkUopkP8EIx z#bFfWQ+Rn#v+25x){O@60Rg4_IX-N4y{3t=SvG)62q@tvk`1G;MD&_8_9=kT6y*Ca zfe9PVzL_ur^I;JI#XMHbmh}IP3r)bg>wk?|L_ADl#gzqp`H*NoA#V%QE2;G!`Vm?30u6{e*p~goO zz*_{A^4+PxL1V7Bd4FLyfX4_Z;U(m8w0AYTh3bW)lm_I-zEUak^QT0d|rCUn`d=m$Po6Izw~%R;t#XoxVp|HpoCvzYD}8P_<`$>!!=GIpqOtkcFs4n(JUJh4WJ(ZCA?Fi zvux!A+p&Ef0XT?&VxDs4H=iu){i`oFO8ZfeAA9xu>JjnDy*IWr6e-B>x@~@4*a%hg zA_yHxKndU2DD`_};uxCY1K@iCO8NXKwotj}GT=UD&2<#yp9%;b5FB^nM;n~OzfzEI ztG2XAHc;}D#X#sv0NFnFn6XDkJsQ1bA%I^1WWA4H!g*CyYs=aL_!&SpKSo~`+89hI z#DV((K$aFROY0OJ^JM=~P^|E7G_W25c=Udm>MeQI1!vsp1eEZD zqD|XftW7(0=muQl2m*?E__*;tRL)v$hQea3KLDJR{BZNw-Pr^u-szZv>R`-0aOxB^~8}rs@D%5>U+d z#Z7lNxjVfe6yHJ*0!sKC?)~h6xbghSOF^-kfKpy__3Sat6ypgf;b%!s)lQ0d{yEqUz+3`K`JpAF{3QKP>CR&T zyhcI3A6Id`k#5wD!X*H%C7^^K3SMMq-f#V(4y^!mq9Ffp&aNYGLcf%48wcQ73i6a| zV~d!}p9TlxmV6%qiurb2v->wPEQ|8=K(Px2xz~3`1bpqjXeEN12q@uM$`kpW725|I zodw131eEdveAnU=6W0zky98hq0VN#l`7Gy4ZcH0oK4wsmj~>lq$;_|VVfD*o~Fr)KTZ@rKkL7=#fg1ptVe6V5qV9QaZ z03N3xZ_h3oT{6XgQ`t@ccTkX*d(Y0_I@)+e{7e8hQjj+)qt_fC6dBwVA5RYgig_pQ zi_K{L9lhV;2sZ?f4YNTrPA>KQcE1`z_oX18NNeVXR9{b@JODrm1$miL=Wd!6t6+uO zGlvN%;k(1Xzm93@^g+!qlR8R4o^rW08{Nd}(i&{p*%MIA6}P>}Q|vo>@ou=rMFf=a z5#D9V`r@vF88}mz5Kzn~sp&V}XWvwqj}6}z6yzz_?9SQq^^y|IA+#QVtduGrk03!$};SaoW?#`TuGJgXAZ3rmlC#K6b?CCjJ>y`?DLjh!? 
z=az5vm*u`M5mcif-@Yoe%p$5sRkG#jPP90!sMkc`|tP z2iu_+F?CfZpqRg0JA>8xA6wp83qq?fYrJ04hPAfHGRiw`W8DZKp#;C>47qchD@hncsy)*H7p(g0*lmz5_X zDnd%``-7qv1$l+^Qh(c($cgD501hUgn1?>LK5jvQV9Fi(2`|t@=ENBQ?o7wd7FD)qwyBuSGy`fXdMdjKE=a+OVq4JF4G5aGzIyghPZ=T zmS57pNC50lK|V>n%9KnOrl;ag?jr(9`Lw6$(?)sI&roep>`6c|Pc+NVZaA>2_VhXc z=MzxE(`$Wuy}0wjp_pD96Hv^@W8mnM#}>P_!=_6%0j2!2i;^2l59f&vVZTS8fMUK& z8nK|+lT#P}=VU=M3i3u}@5QGQ;=rd)7;_-L;aTmaufD)efB-xt{t2I7)076dz zkoEQ+ja%Hot?(u`Q^ycc%sU@)g^f@A$6xdV#gPEAgk5dVYSl&Wb#c332nG2`H`h)* z<5x`kf(7Ii0!sNQvImBbgL~ZjiRsJ$0*d*uo#r!355G!#nF-f;nt&31itI`DJBREy z|KqigfMVX0>NR4YQ|7(*SlXKb$g;gx4!r}u=igWcp;HMc;hEHva*fcH)0_1Nup0rz ze7m60idHug;-=LA*a1NHz;zalE6d1@dJABC0NK%^Db9q@8B2Jzkkx`2?=9IRV9do#}Z#D}Gz;7(8gZ zhJX@YKrXs{v)7tYKaD|g6$SZoG}a3L)gwd`r{+ck6!UFBZ})eJC-#)%+@lO2%W?Or z)*d}|G-W=7{#Ad{jh)kGyG;*O=gfMi?}J&K0y#V<;0sM{KfgzDihvTn8?SgvczED)aTMfl*Wqn*WNL@G}Ts zo_#nrU~|7;0B)cl_qu(H2S1FzU&XxeHUXu)M0)S;GhEH&%ok9cLqG{%utOs3lul?A zPXN%Bf_x?|I@dnn)YSbef1hl2iZb;6s->XdUH#{eAAkP%_2&;I?caZ>{{F)wvV(u1 z$Lu-20VckFfnGjdbEXA)`OY>O``_OU_Xv!N)EX5S;NkK2FE&T9!_twBT<5yY^az|2 z^*f4Fl&4~(8iS(h2~YJ%MTe+Jbr-MM?p|&lbE3S}l@%1oVe5atB+H{B8@sqq3!LUM z(_?;=cgx?GyITF_9nV0OpEDzV|3}sT{$teeNDUX?0C$go|FG5SAwQSRoTU6yKG$`c zQxf3)>yT{`-4g8!oI7 zhW+=4FdHsxFAUoRh7_Bn{{Heu!-X})uuUN>``q5&--U0uumTMG?@s|XTv#!N{r3-f zHC$L3hW+=~6f|7ez8Lo3-%`?WVf$g&e}9>B!-egSVgLQD;tdzp2E+dQQ*R9yb})wh z_vgABF037f{r6XSHe6T-468?HDSl-4?{DTdT-cEqwlxj=|9s(1!-XA#VgLOZ*Mf0od%GGgtRseP2M?L$Qh$HGr{Tg*!La%emUZy{{#JCug>}KO|NgXb!-aLlu>byT zpN0$Tfng2cA+s|qe}Cet;lg@i*nfW^NyCMmiDCc!-9Zf(b~c9XOdm2oM)CLeT^cT| zKZgDHH174DW*e1Cr_rs2W{W7uvGmd#Rszgx87!Y;tD-61TSrT+dPN5h2; z!La}SkbJ|1U5a51;cnRp{&AXmc8XIK)aJT2T-b07`|l6OH(c1|81~;^``d6~S7BIV zc*v~cbKQQu@f$zk?~l4QT-Znq`|qy}YPhiLF>G&gw?h6Ig?OdYO>!rW`h7m3;lggh zuqF_e%~F5A@V4Q?Mq}82e?xr3g^k6q|Ne5Mh6}qL!VF{vn=*3%eV`3gK>9F7@}Pg&Qtx5{CWvR|7U&*gY6l40p?BslPwp)^K6>Vc38F zI8eidJ&0ld{oT_I7d8#U{`(8V8ZK-)hLuxq`R8>0{<=}ag*|~``#@NhOL>=!@sg;< zgg4}LaT>$+g|KXvI&iG?Mkkj;zYhjC+})nTuoe)O_E6%R`F?gP4BZ$KOvzZAM$++YXf1~ zEYbybtcJVW3Jg0O?v~9`e}9{z;lfs7 z*bxwxEv(01MNTz!8yBBUXG8(J!D4NV%_ zdueM@A$6rh(V$W(4NcNsB26SMN+m4~X`rR0NE(_*e)l-%bkCb#UUAR;{?Q-b*T;FD z_w#JC^ z$1}Hj5Vr7(VEIK`1Hu;m83TT>`Ve*k5E;*<0^?pdM|paQ2krU6)`hTzf2flmY<&o8 z$wo`w7LY#1#1FO+gq;Xru}euG58(%E2w^7ySnN{LR}=ZcHixj20W5Z@Y^SElxAdx7 z^DL>B5Y`I7VwaLWM$9kTHW1btz+#t@E*|FxYYbs+04$zMNuP)22ipO{P64oZ#V37? 
zn;)zxgtY~**rlv2zc{sO_Im@*lIjX!r?Rl*?0J%N_^y)HDsDVsdqCJ}0G7Oz!U_NuyHxwDNrN4ByZiHm?FV7)04#Q?av90Otp;^I@q`@+VW$IF>{8Mf z6ZuuL8HBY5u-K)}KleWR{CVaCp3x42urmNGb}92a+XCjkv`XR$I}*aq1hCkpjzl#7 z>N0lRuYZE|Q7xfT0{!=ougZ*u_znOc+f{~dsVSFrYA*gK{C|WU2VrLcSZr4_@2lrL z-rGyOjfG!rSwL7v7M9$GAbp>eAM7Lu>jYr2U1bfgk=(!ib@9zgez4XMb~b>;b|rmu zkRPlqgq;Imv0Y{LYiH|pYiBc_(FzdO8Ngz@%33_+N5I8;?|8!6L)gMU^TSVM2M9YC zM2qc8`r-^fSSJWO55Qu(`qXoZ?}Moc;&pWVV4WfCd;p7GYDT3cuAdhk5pTET2RjeK zE&#CDrS>!*qZNFx#$cYXt`OFhg(Zj3y&Rj4IbmLjYb7s&ux@1oEJ}6PYAmhz+#uG82WMb`s7jK^)UQI zUJYU00W6+WN#CmG2fG%+E&;IErKIot@q^s}VV43}>{6lS;j7Y~OG#fS;s?78!Y*fF$su&zl&JXn`dtK zKv+)zi(Trw$^12&8>|=ayxi{fvDX)e1XZ&)o7O!OEC-P|syPkn=pn^Tq?3*#R(T*i+4El zi}oIb-3nl_OHCSUGS~5OnHM}^A41q|ENp?#fAWhq5yEZI}5>=ieCqdYq02aH{fhG}gbxWCT;R*W^!tMgF*ritFoO;!{x+>Qd zUowQ<4PdcLNgwj&S5heub`OBXE|qvY^m&R+$Zwt^r$N}g02aH{-MIc%Lp)90dBT2x zu=@ZkcB!kQZ{Jd@yivTij-SY%A*>&O#V+;KI_1NK{)VG?!hVIY`&n3W2t8!s+@Y6L zH;9)v^NaQeg!Koo*rmqmjJEPgIKs8xW3T81ZVbqE^*V6jWNSI(@zZ1a7t(bj^n#{n#Msa1aO zUN`&psWH!J>p<900E=D9S6!vyg^3T-dBWC%uqOa4cB!g616uYo9PPsswjqQK1F+bo z9+?_BW!t{chV_6&f_Vwu7)|0W5Z@J!i7RDm$O#x{2twN6&9C^1p99qwNG?BLFORsT+FJgBsk*7QeZ{Z=T!*!kz=L*rmFRSwF8_ zjGB1uG(Xtx5cWKP#V&Oq)9K^MoA$VJ`yMJeShwx$|IHlb~@tQ3pfRODrn+jLPircANbNq=}c&^Q)+# z5HV&)u z6nQL!y#ipdNzFBgsQze**GitS6CmtW0E@@(3P3nn5_`ArC zr5o~uwSutM0W3DD+Y>x)Xs0Je@r0cMVQ&CfY*Lri?rW|a+-p8h*l7?p2EgK}RCnVe zTm9TJw0Xi#hp;zUSaJrvO+><|2Iopg^MsuVVQ&Fg>{6D=?S2iPS7jYfSVstZ8^B_h z@{2v>c4}7$*UP--K-fC~7Q0mUjx|$kw4%8l>YNKw*|K~8uE3)_Vcj9@JphYcD(dt>l`6NBV|l_Z zgRu7jEOx0)ZmUC2sCj6ptBjmJ_4}Vr7k-p^**lM*PCawTOsUY0E=BJ?wa)_ivwA0dBX01uulLicB#%s z(+B3B(d7CVXg7p?3ShBI^)}sK-ubRc2cFUHgRsv4EOsg1n(wa6dYQp>gVi6xCIMLN zQs=t*w)3z3fa@WXKnVMsg(XMN-Ii)T{jtz~1W%C29q|{i2d}KC+sl@ z`x3xnmpWQw)Zrfs9&=p-g+SO>02aHHewA-uyZneSTkqVwbYdy<7Ite_tE% zggpgelL0JtDL0c1=3&{bxIP9t3t`^?SnN`!xtR~^u5Q6~^ZXoyeG6c*OWmsYHAcg^ z64#xY3lKI1z+#s&G_BM)x92ae8>~?fHWk2Pmr5u%Io+m&8P}bfD-iY_3rmikhp0xb ze;9;5*~f2R>NAuQ~GUC7!T1A?$kqi(RV8vzvOieX_V-Kye4crUO{) zQgOTchIP9=Zz#`b;~?w@0E=B})Yg`9`hhy-c)~t_upa>|b}6gKXNR|h8z1Ben*d=y z0a)x(SL&u3J-u<_B2U=I5cV^G#V)0{V7_tg^Tu4CP&|XM82}c$RO`y8{2b%`xVqE} z2>S)VVwWm&rRGevyCKup4)-zs~hPM_6LB)CT00-LC+P>-*Y{k{0YMT1hCkoP7OGw zt8ydDmS?y90%0=&EHuC4q|!9>6Hm{RLpLN$EF!GWDye8rKcSD!%E;~;C(qop5Jb9^7URD~Im-G=` zM%5g^3%gPefB&j}p|+7yE_w1QNqH^R#=KnT9&8!6-%5P$!tnY9cwv)Po7})5!>FmL zGQ89%ubD(I=`*|xuWY~zk6G3#DNCPg#OzjvmpbJ&tu)rFQTw^M>mF7TkLoeJvH&kU zgg1X$CU)?Xl;_IuDn)r|Yhqr#|9r@o;gtz^VXtV?a9+<2hCelw;iW-&Z6N(o`qD4M z>nGr42-ZOW?~@vr9kuliV)yRa4T`U-eqzaKv0_|W)an-?g< zs|@AUg7in}bHq%2eF40%-{-cT`|X=oT1{nmm8HDQNZm`{D`t3Q0AASd@6`;>UDbTd z7-e{sqr6;59{=0Y`a$>TAyYvfKLcLa?|m+}?x}B{Bo6RO%qPlIUZ==7wC8QzJuNEE zdq~K)Yk{V3;X@D;5t6OiHDh2$$9#!F6CuN{3|@#VQ0_e>*Df< z;g!zvYJ>g0j-%PTr^!E8D5F_X^nCva{pC2g>MGneqxF^;M!(<7&+=ZLJIRN(H>I-_PH(Ci>xIb>>+>ud0;SDdH1#qgu30ZNL9F(CaPWh5g>&^{jdL zJByXd@Y18a_LBNK5Y@z7L+uru-@gI8u-|u?k}+mLy$u%1@Tx|6%_MqFv(Y`M z&?_16!hSzu=>-qh@pC>Z!>c;wWk>u=`ua3eU#|f#?DtFjtS=@%3)NSKR}IRmI;nf< z^VAHlS1hmA*za2`KDm6^u#^+Z@Ty699U%2JMi|!YOLp7MAdfErFYNdB<5#GJy*tbt z?d9ncwJ5KpB#%!FE2SnqDb)w?dI5N0zi%)mG1TnW%?irsrB8VcC%mLDU^97q4tQa| zcWf5?(yZ+^=2Zfodex@98k0P3i+Hu~$+y-4K(8dg3;Vspvn_E~9o{Ik&s2x``(pnE*m@aDzjC|Q?I&|R}9Ic^pS0b*Hgd?`@L<5%glvU z)axr#Uj~%dVWL+;Ka;6VEDqiRdOZQWu-}_^xlm=%D+A^@Hc!3kQC{&xFX=Pgj9!lc zFYNbCwXR%IZNomj=gF%+<<*SnC4I-6;q?gc!hWB;`)<9t7JZpxusnG+puF_R{7d@K zH^VED<<$!NeMNJd0B4ns3iZc^lvfP#iSy4ate#z7Vhrjl0r0|p-!1w4?I%&&L<3f0 zUE7HA@+JEK(x<{1z2X5c?DyLCdL=jJxR+OkS7XZSDbZ_2tIHn@2c3Eh^m+(*VZV?2 z#GIjb&L3_NodE}nHKxtzDX?Lh5cT=cJ%&$S~tBhW)DKB+0&yl{$&ggX$@WOuowe-9z zQ#7?cD8s7_<&{hNjdffoL 
zu;1&a%^zsfe>U^7B+vS4M|qi%`qD1ftIL~y55?&*!|OWWh5i0w!g{}7MuVBRDS7fT zro7Ba9=}CCxUeGUKuf^u8sLTfKI6KEu6In*B4zSuLV4K{|Egnav%-91Y51J)D&U3v zetU+w!DiQA3ay9QQ(ogp9;I)~GkLrMcwxVP&?v#B;k}$lW%TMmdDSPrDt&yO;T6sD zGQxh}E@p91T$^JG)qO|GYd4vHNnfXDcwGj(u;1sdAF!>ObN?`9^y)-;T_?Px&(|}& zq5v=K_pX&s)Q%I_&`vJ^-yQZYZBq5 zR&&>-Iw{Y@$14o4i+~sQd!MY8R;r8Us42s%3+2_3_!kW-=_wp z9z2$??~pRQx>8=5q`n%j4g0xsY5!EfD-!U+e&2rY%9nB3ofMj{cB8!d5WV7#wei?? zyFA<{KM#0ezpt3yI=<`foqLqgt2^cOn2baIS+#q2x=})$9y9fI4)DtJdky>1Hka%D z{HzSE9+Xx(Nn`10Q#u@~v2P!s6#-~ryN^z5r?>PdNZA#yeQQ8((~eusX5 zS2)9~U3+Zz8~*dX{Hc$NxDTs@P4uF?K9XU3&g`4+Pj0?|$A-=VUfAw4Dxcctpl#7u z8D709FD;^%O6?Qh%J*sO1oS!scwxIwjD9yL`c3)U%JAw#c{L=wN?dO4TK1aWc);s4 z;Dzm8Bl zYn&p59wl`mog0Ok1J)8bo;|k@_nCAT6UovrFPEf#G!=@WOs?Rbg0} zE*Do|!Wq1vtyxNog_XQ~DK95dU;S&RcD|P3ss(r*1H7=`x4c{aqvci4!OHL&MtOZBuPZo}E`RThb*6ZF zoyp@-zzh3*h{+;GSCAyazl0$zcD7xw$a^wRF0Qz~>*hSzAys|Vq=>cf}g51pJ_16~JMUdGt( zhb?pHX?1z-c4c^tp}ckyUR$f{g|6DuQ@o#q$zuTEh5bIOe4CS!)q+%WlzST zn*kPw`nW9d1HAkJFS31aw=1%~deu$J@ES*Xy&~(O-PXPD^?rA}C*ZXo@WOuIx@3y3 z5YjGZ80Wa+L zpGRuBw$Z4q&}(y(D6i^7uZQ8!2Su%U{uJ=q1$be<->$Z$Ta%0P|5HY<$&}X}GCpQI zHBG*ySJfKu+R5^2hyDKXxm(AzR_w7;hL;uPcEAh!eR#JxhjTWTYn0K;hVt?ydR=&Sx$K%( zv&B>Oj9%LSFYNbbKNfrM^3+l2b*m|qmj~fBe8BI;TbBEZr-2w=TLCZZ_W>UU1T1;F zR-t3awv?AX;bmp{#i>=Z-x~n0Er1vH`)!Rjtr?^8@_;gVoJx7UCH*nUIeb@1YZW)Z z%NOv%e*eMyeU%;eS}F7z>om&i7MbTG{a!)|wRIl;{?u;;~<2V%X+QjlAuh-VG^9uWV_iLguyzD8jO+>Hr z&%KX6f1U~F6B_|9?DvW7@2uF~qj9)0yk<~d?u3{5ooxYgUs@#ry*2<|*zbK-hxK0L zQqNKuUNb4L)`ZuQh~{5i#*Py|WX06|dcX_&ea-(K{4)G`RiVd^4wP3A>5mz{rKVig zsks>NS_gPxzaO&7#Hjzr4eylEYZm1-hRpA0-dE3gytmgdz-ukwh5decpGKj5Ooyf_ z!^@HKnn3zv%d4)X?8n`x0eE=41`?tTI4S1~q zys+P=I>a|0;~ptK*HvQwem3QGhv;>GZ(z&q(I4SHz-qt?`@QS!n@v{_%`sGl*Br|0 z0oAKtJ6oq)JDUN$Rsmkv?}MfW%{H#zLp-;sgkH{+R~YG!S&N7K2)I~J>}HHltYmqS z*J};SEzND{rY$C130^Lg*Ge*vZqzw2sAG@Sa9-#McwxW4J92#X_iE80%J7;?c^xIZ zo-WPs4pP0p8|bwH@WOt-XH2i{m$rKjREF0)%BwN)uTMRv_&%7LFcR=u4tQa||MK|5 ziSC;^=PJW%KILUVc+IG^#P##SBQTF1fEV`ruXA?Y@JO4buMDpRl-B{mYftkrTEPcv zh>tOtx?cu(VZXnzsrBpakL<(^btUTFmGY`c`lFX)(=jK^E5ZJ_6!5}+zo6Re#z&Xh ziW_)J@N%QPDv-K&HVZD5X1`HM=e>@1oEPA=81TY=U+O~Aj;d}qZz#iS zG38aBjE@yVKdxS%JPPhlEMj?)*J~U3T=8f+cElrPc)3$v?u6H%Z$imn+v?STUJC&) z?Dv^vYkpa=+oY;8yp~X2!%5xOSl)hdnab5;053Pd3;Vr?PCyyIj_(v2x0h00$)xT> z%f)N;9bo(l@Nxybu-_XjT9;g}m90YS$7Ph)U7}arJAG<6FX-(Gcr5_Du;1Uc5AC_X z&KKr3Ql7`jJt(hlgjeq!9)62%gggVh<^x{X?|)ki>tkZBuFyJdIpx)Y@anm+VuvJ~ zSh$Zl5Aec%Ut>U(9p_s2YphJ&ub{m4692mRX#Vqe5lealz2*X5*za9T6Hl(4R6)Fl zPKj~Ilk&15yuO>vU$eQvda-PnamWSm!hZkW#cfn(@*dIXl;E|J@-ij+g(q*SEFJp! 
literal 0
HcmV?d00001

diff --git a/bench/qc_simulation/scripts/http_unzip_on_the_fly.sh b/bench/qc_simulation/scripts/http_unzip_on_the_fly.sh
index 17f21dfd..2c7a379e 100755
--- a/bench/qc_simulation/scripts/http_unzip_on_the_fly.sh
+++ b/bench/qc_simulation/scripts/http_unzip_on_the_fly.sh
@@ -1,3 +1,3 @@
 #!/bin/bash
 #
-./main.py echo tar://*0.txt::https://github.com/danlkv/GRCS/raw/master/inst/bristlecone/cz_v2/bris_5.tar.gz data/circuits/bris/\{in_file\}_dummy{dummy}.circ --dummy=1,2
+./main.py echo tar://*0.txt::https://github.com/danlkv/GRCS/raw/master/inst/bristlecone/cz_v2/bris_5.tar.gz data/circuits/bris/\{in_file\}.circ

From d89181cecbce9729533e50128463e337bf25a1ec Mon Sep 17 00:00:00 2001
From: Dan Lykov
Date: Thu, 9 Mar 2023 22:03:46 +0000
Subject: [PATCH 057/126] fix link in readme

---
 bench/qc_simulation/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bench/qc_simulation/README.md b/bench/qc_simulation/README.md
index 7e0c1877..be78054a 100644
--- a/bench/qc_simulation/README.md
+++ b/bench/qc_simulation/README.md
@@ -4,7 +4,7 @@
 1. generate or download circuits:
 * As tar `./main.py echo github://danlkv:GRCS@/inst/bristlecone/cz_v2/bris_11.tar.gz data/circuits/bris11/\{in_file\}.circ` (need to unzip)
-* Using http and (unzip on the fly)[./scripts/http_unzip_on_the_fly.sh]
+* Using http and [unzip on the fly](./scripts/http_unzip_on_the_fly.sh)
 * generate `./main.py generate data/circuits/qaoa/maxcut_regular_N{N}_p{p} --type=qaoa_maxcut --N=8,12,16,24,32,48,64 --p=1,2,3,4,5 --d=3`
 2.
preprocess using both of `greedy` and `rgreedy` algorithms: From 16920967ebd54bf4f9b7fc15b7aa26908e72b9ee Mon Sep 17 00:00:00 2001 From: Milan Kartik Shah Date: Fri, 17 Mar 2023 01:09:06 -0400 Subject: [PATCH 058/126] Updated compress pipeline for throughput improvement --- qtensor/compression/szx/src/cuszx_entry.cu | 3264 +++++++++++--------- 1 file changed, 1751 insertions(+), 1513 deletions(-) diff --git a/qtensor/compression/szx/src/cuszx_entry.cu b/qtensor/compression/szx/src/cuszx_entry.cu index d76b5e5d..56f278e6 100644 --- a/qtensor/compression/szx/src/cuszx_entry.cu +++ b/qtensor/compression/szx/src/cuszx_entry.cu @@ -1,1513 +1,1751 @@ -#include "cuszx_entry.h" -#include "szx_defines.h" -#include "szx_BytesToolkit.h" -#include "szx_TypeManager.h" -#include "timingGPU.h" -#include "szx.h" -#include -#include - -#define SPARSITY_LEVEL 0.25 - -TimingGPU timer_GPU; -void bin(unsigned n) -{ - unsigned i; - for (i = 1 << 31; i > 0; i = i / 2) - (n & i) ? printf("1") : printf("0"); -} - -__host__ __device__ size_t convert_state_to_out(unsigned char* meta, size_t length, unsigned char *result){ - size_t out_length; - - if(length%4==0) - out_length = length/4; - else - out_length = length/4+1; - - for (size_t i = 0; i < out_length; i++) - { - uint8_t tmp = 0; - - for (size_t j = 0; j < 4; j++) - { - if (i*4 + j < length) - { - tmp |= (0x03 & meta[i*4+j]) << 2*j; - } - - } - result[i] = tmp; - } - return out_length; -} - -__global__ void convert_state_to_out_kernel(unsigned char* meta, size_t length, unsigned char *result, size_t out_length){ - - - for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < out_length; i += blockDim.x*gridDim.x){ - uint8_t tmp = 0; - - for (size_t j = 0; j < 4; j++) - { - if (i*4 + j < length) - { - tmp |= (0x03 & meta[i*4+j]) << 2*j; - } - - } - result[i] = tmp; - } -} - -__global__ void convert_out_to_state_kernel(size_t nbBlocks, unsigned char* cmp, unsigned char* out_state, size_t state_length, int *num_state2blks, int *ncBlocks){ - for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < state_length; i += blockDim.x*gridDim.x){ - for (size_t j = 0; j < 4; j++) - { - if (4*i + j < nbBlocks) - { - out_state[4*i + j]= (cmp[i] >> 2*j) & 0x03; - if (out_state[4*i+j] == 2) - { - atomicAdd(num_state2blks, 1); - }else if(out_state[4*i+j]==3){ - atomicAdd(ncBlocks, 1); - } - - } - - } - } -} - -// nbBlocks, r, stateNBBytes, stateArray -__host__ __device__ size_t convert_out_to_state(size_t nbBlocks, unsigned char* cmp, unsigned char* out_state){ - size_t state_length; - if(nbBlocks%4==0) - state_length = nbBlocks/4; - else - state_length = nbBlocks/4+1; - - for (size_t i = 0; i < state_length; i++) - { - for (size_t j = 0; j < 4; j++) - { - if (4*i + j < nbBlocks) - { - out_state[4*i + j]= (cmp[i] >> 2*j) & 0x03; - } - - } - } - return nbBlocks; -} - -__host__ __device__ size_t convert_block2_to_out(unsigned char *result, uint32_t numBlocks, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig){ - size_t out_length = 0; - - memcpy(result, blk_idx, numBlocks*sizeof(uint32_t)); - out_length += numBlocks*4; - memcpy(result+out_length, blk_vals, num_sig*sizeof(float)); - out_length += num_sig*sizeof(float); - memcpy(result+out_length, blk_subidx, num_sig*sizeof(uint8_t)); - out_length += num_sig*sizeof(uint8_t); - memcpy(result+out_length, blk_sig, numBlocks*sizeof(uint8_t)); - out_length+= numBlocks*sizeof(uint8_t); - - return out_length; -} - -__global__ void convert_block2_to_out_kernel(unsigned char *result, uint32_t numBlocks, 
uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig){ - - size_t out_length = 0; - unsigned char *tmp_result = result; - for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < numBlocks; i += blockDim.x*gridDim.x){ - uint32_t local_blkidx = blk_idx[i]; - tmp_result[4*i] = (local_blkidx) & 0xff; - tmp_result[4*i+1] = (local_blkidx >> (8*1)) & 0xff; - tmp_result[4*i+2] = (local_blkidx >> (8*2)) & 0xff; - tmp_result[4*i+3] = (local_blkidx >> (8*3)) & 0xff; - } - // memcpy(result, blk_idx, numBlocks*sizeof(uint32_t)); - out_length += numBlocks*4; - tmp_result = result+out_length; - - for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < num_sig; i += blockDim.x*gridDim.x){ - float value = blk_vals[i]; - memcpy(&tmp_result[4*i], &value, sizeof(float)); - //unsigned char *v = () - //tmp_result[(int)4*i] = (unsigned char)((value) & 0xff); - //tmp_result[(int)4*i+1] = (unsigned char)((value >> (8*1)) & 0xff); - //tmp_result[(int)4*i+2] = (unsigned char)((value >> (8*2)) & 0xff); - //tmp_result[(int)4*i+3] = (unsigned char)((value >> (8*3)) & 0xff); - } - // memcpy(result+out_length, blk_vals, num_sig*sizeof(float)); - out_length += num_sig*sizeof(float); - tmp_result = result+out_length; - - for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < num_sig; i += blockDim.x*gridDim.x){ - tmp_result[i] = blk_subidx[i]; - - } - - out_length += num_sig*sizeof(uint8_t); - tmp_result = result+out_length; - - for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < numBlocks; i += blockDim.x*gridDim.x){ - tmp_result[i] = blk_sig[i]; - - } - out_length+= numBlocks*sizeof(uint8_t); - - // return out_length; -} - -__global__ void convert_out_to_block2_kernel(unsigned char *in_cmp, uint32_t numBlocks, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig){ - size_t out_length = 0; - - unsigned char *tmp_result = in_cmp; - for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < numBlocks; i += blockDim.x*gridDim.x){ - - uint32_t local_blkidx = (tmp_result[4*i] & 0xff) | ((tmp_result[4*i+1] & 0xff) << (8*1)) - | ((tmp_result[4*i+2] & 0xff) << (8*2)) | ((tmp_result[4*i+3] & 0xff) << (8*3)); - blk_idx[i] = local_blkidx; - } - // memcpy(result, blk_idx, numBlocks*sizeof(uint32_t)); - out_length += numBlocks*4; - tmp_result = in_cmp+out_length; - - for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < num_sig; i += blockDim.x*gridDim.x){ - float value = 0.0; - memcpy(&value, &tmp_result[4*i], sizeof(float)); - blk_vals[i] = value; - - //unsigned char *v = () - //tmp_result[(int)4*i] = (unsigned char)((value) & 0xff); - //tmp_result[(int)4*i+1] = (unsigned char)((value >> (8*1)) & 0xff); - //tmp_result[(int)4*i+2] = (unsigned char)((value >> (8*2)) & 0xff); - //tmp_result[(int)4*i+3] = (unsigned char)((value >> (8*3)) & 0xff); - } - // memcpy(result+out_length, blk_vals, num_sig*sizeof(float)); - out_length += num_sig*sizeof(float); - tmp_result = in_cmp+out_length; - - for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < num_sig; i += blockDim.x*gridDim.x){ - blk_subidx[i] = tmp_result[i]; - - } - - out_length += num_sig*sizeof(uint8_t); - tmp_result = in_cmp+out_length; - - for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < numBlocks; i += blockDim.x*gridDim.x){ - blk_sig[i] = tmp_result[i]; - - } - out_length+= numBlocks*sizeof(uint8_t); -} - -__host__ __device__ size_t convert_out_to_block2(unsigned char *in_cmp, uint32_t numBlocks, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig){ - size_t 
out_length = 0; - memcpy(blk_idx, in_cmp, numBlocks*sizeof(uint32_t)); - out_length += numBlocks*4; - memcpy(blk_vals, in_cmp+out_length,num_sig*sizeof(float)); - out_length += num_sig*sizeof(float); - memcpy(blk_subidx, in_cmp+out_length, num_sig*sizeof(uint8_t)); - out_length += num_sig*sizeof(uint8_t); - memcpy(blk_sig, in_cmp+out_length, numBlocks*sizeof(uint8_t)); - out_length += numBlocks*sizeof(uint8_t); -// printf("outlength: %d\n",out_length); - return out_length; -} - -int _post_proc(float *oriData, unsigned char *meta, short *offsets, unsigned char *midBytes, unsigned char *outBytes, size_t nbEle, int blockSize, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig) -{ - int out_size = 0; - - size_t nbConstantBlocks = 0; - size_t nbBlocks = nbEle/blockSize; - size_t ncBytes = blockSize/4; - size_t mSize = sizeof(float)+1+ncBytes; //Number of bytes for each data block's metadata. - out_size += 5+sizeof(size_t)+sizeof(float)*nbBlocks; - if (nbBlocks%8==0) - out_size += nbBlocks/8; - else - out_size += nbBlocks/8+1; - int s0 = 0; - int s1 = 0; - int s2 = 0; - int s3 = 0; - for (int i=0; i>>(d_oriData, threshold, nbEle); - // cudaDeviceSynchronize(); - dim3 dimBlock(32, blockSize/32); - dim3 dimGrid(65536, 1); - const int sMemsize = blockSize * sizeof(float) + dimBlock.y * sizeof(int); - compress_float<<>>(d_oriData, d_meta, d_offsets, d_midBytes, absErrBound, blockSize, nbBlocks, mSize, sparsity_level, d_blk_idx, d_blk_subidx,d_blk_vals, threshold, d_blk_sig); - cudaError_t err = cudaGetLastError(); // Get error code - printf("CUDA Error: %s\n", cudaGetErrorString(err)); - printf("GPU compression timing: %f ms\n", timer_GPU.GetCounter()); - cudaDeviceSynchronize(); - get_numsig<<<1,1>>>(d_num_sig); - cudaDeviceSynchronize(); - - checkCudaErrors(cudaMemcpy(num_sig, d_num_sig, sizeof(uint64_t), cudaMemcpyDeviceToHost)); - - blk_idx = (uint32_t *)malloc(nbBlocks*sizeof(uint32_t)); - blk_vals= (float *)malloc((*num_sig)*sizeof(float)); - blk_subidx = (uint8_t *)malloc((*num_sig)*sizeof(uint8_t)); - blk_sig = (uint8_t *)malloc(nbBlocks*sizeof(uint8_t)); - - checkCudaErrors(cudaMemcpy(meta, d_meta, msz, cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(offsets, d_offsets, nbBlocks*sizeof(short), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(midBytes, d_midBytes, mbsz, cudaMemcpyDeviceToHost)); - - - checkCudaErrors(cudaMemcpy(blk_idx, d_blk_idx, nbBlocks*sizeof(uint32_t), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(blk_vals,d_blk_vals, (*num_sig)*sizeof(float), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(blk_subidx,d_blk_subidx, (*num_sig)*sizeof(uint8_t), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(blk_sig,d_blk_sig, (nbBlocks)*sizeof(uint8_t), cudaMemcpyDeviceToHost)); - - size_t maxPreservedBufferSize = sizeof(float)*nbEle; - unsigned char* outBytes = (unsigned char*)malloc(maxPreservedBufferSize); - memset(outBytes, 0, maxPreservedBufferSize); - - outSize = (size_t *)malloc(sizeof(size_t)); - //outSize[0] = _post_proc(oriData, meta, offsets, midBytes, outBytes, nbEle, blockSize, *num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); - - *outSize = _post_proc(oriData, meta, offsets, midBytes, outBytes, nbEle, blockSize, *num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); -// printf("Beginning free\n"); - printf("outsize %p \n", outBytes); - free(blk_idx); - free(blk_subidx); - free(blk_vals); - free(meta); - free(offsets); - free(midBytes); - checkCudaErrors(cudaFree(d_meta)); - 
checkCudaErrors(cudaFree(d_offsets)); - checkCudaErrors(cudaFree(d_midBytes)); - return outBytes; -} - -void cuSZx_fast_decompress_args_unpredictable_blocked_float(float** newData, size_t nbEle, unsigned char* cmpBytes) -{ - uint32_t *blk_idx, *d_blk_idx; - uint8_t *blk_subidx, *d_blk_subidx; - uint8_t *blk_sig, *d_blk_sig; - float *blk_vals, *d_blk_vals; - size_t num_sig, *d_num_sig; - - *newData = (float*)malloc(sizeof(float)*nbEle); - memset(*newData, 0, sizeof(float)*nbEle); - - unsigned char* r = cmpBytes; - r += 4; - int blockSize = r[0]; //get block size - if(blockSize == 0)blockSize = 256; - r++; - size_t nbConstantBlocks = bytesToLong_bigEndian(r); //get number of constant blocks - r += sizeof(size_t); - num_sig = bytesToSize(r); - r += sizeof(size_t); - size_t nbBlocks = nbEle/blockSize; - size_t ncBlocks = 0; - size_t num_state2_blks = 0; - // size_t ncBlocks = nbBlocks - nbConstantBlocks; //get number of constant blocks - size_t stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; - size_t ncLeading = blockSize/4; - size_t mSize = sizeof(float)+1+ncLeading; //Number of bytes for each data block's metadata. - unsigned char* stateArray = (unsigned char*)malloc(nbBlocks); - unsigned char* d_stateArray; - cudaMalloc(&d_stateArray, nbBlocks); - float* constantMedianArray = (float*)malloc(nbConstantBlocks*sizeof(float)); - - - - blk_idx = (uint32_t *)malloc(nbBlocks*sizeof(uint32_t)); - blk_vals= (float *)malloc((num_sig)*sizeof(float)); - blk_subidx = (uint8_t *)malloc((num_sig)*sizeof(uint8_t)); - blk_sig = (uint8_t *)malloc(nbBlocks*sizeof(uint8_t)); - - printf("Converting state array\n"); - convert_out_to_state(nbBlocks, r, stateArray); - // convertByteArray2IntArray_fast_1b_args(nbBlocks, r, stateNBBytes, stateArray); //get the stateArray - for (size_t i = 0; i < nbBlocks; i++) - { - if (stateArray[i] == 2) - { - num_state2_blks++; - }else if(stateArray[i] == 3){ - ncBlocks++; - } - } - - r += stateNBBytes; - unsigned char* data = (unsigned char*)malloc(ncBlocks*blockSize*sizeof(float)); - memset(data, 0, ncBlocks*blockSize*sizeof(float)); - printf("converting block vals\n"); - size_t to_add = convert_out_to_block2(r, nbBlocks, (uint64_t)num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); - r+= to_add; - // checkCudaErrors(cudaMalloc((void **)&d_num_sig, sizeof(uint64_t))); - // num_sig = (uint64_t *)malloc(sizeof(uint64_t)); - checkCudaErrors(cudaMalloc((void **)&d_blk_idx, nbBlocks*sizeof(uint32_t))); - // blk_idx = malloc() - checkCudaErrors(cudaMalloc((void **)&d_blk_subidx, num_sig*sizeof(uint8_t))); - - checkCudaErrors(cudaMalloc((void **)&d_blk_vals, num_sig*sizeof(float))); - - checkCudaErrors(cudaMalloc((void **)&d_blk_sig, nbBlocks*sizeof(uint8_t))); - - checkCudaErrors(cudaMemcpy(d_blk_idx, blk_idx, nbBlocks*sizeof(uint32_t), cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_blk_vals, blk_vals, (num_sig)*sizeof(float), cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_blk_subidx, blk_subidx, (num_sig)*sizeof(uint8_t), cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_stateArray, stateArray, nbBlocks, cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_blk_sig, blk_sig, nbBlocks*sizeof(uint8_t), cudaMemcpyHostToDevice)); - - - size_t i = 0, j = 0, k = 0; //k is used to keep track of constant block index - memcpy((*newData)+nbBlocks*blockSize, r, (nbEle%blockSize)*sizeof(float)); - r += (nbEle%blockSize)*sizeof(float); - float* fr = (float*)r; //fr is the starting address of constant median values. 
- for(i = 0;i < nbConstantBlocks;i++, j+=4) //get the median values for constant-value blocks - constantMedianArray[i] = fr[i]; - r += nbConstantBlocks*sizeof(float); - unsigned char* p = r + ncBlocks * sizeof(short); - for(i = 0;i < ncBlocks;i++){ - int leng = (int)bytesToShort(r)+mSize; - r += sizeof(short); - if (leng > blockSize*sizeof(float)) - { - printf("Warning: compressed block is larger than the original block!\n"); - exit(0); - } - memcpy(data+i*blockSize*sizeof(float), p, leng); - p += leng; - } - - unsigned char* d_data; - float *d_newdata; - checkCudaErrors(cudaMalloc((void**)&d_data, ncBlocks*blockSize*sizeof(float))); - checkCudaErrors(cudaMemcpy(d_data, data, ncBlocks*blockSize*sizeof(float), cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMalloc(&d_newdata, nbBlocks*blockSize*sizeof(float))); - - timer_GPU.StartCounter(); - dim3 dimBlock(32, blockSize/32); - dim3 dimGrid(65536, 1); - const int sMemsize = blockSize * sizeof(float) + dimBlock.y * sizeof(int); - decompress_state2<<>>(d_newdata, d_stateArray,d_blk_idx, d_blk_vals, d_blk_subidx,blockSize, d_blk_sig); - decompress_float<<>>(d_data, blockSize, ncBlocks, mSize); - cudaError_t err = cudaGetLastError(); // Get error code - printf("CUDA Error: %s\n", cudaGetErrorString(err)); - printf("GPU decompression timing: %f ms\n", timer_GPU.GetCounter()); - cudaDeviceSynchronize(); - checkCudaErrors(cudaMemcpy(data, d_data, ncBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(*newData, d_newdata, nbBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); - float* fdata = (float*)data; - - int nb=0, nc=0; - for (i=0;i1) printf("data%i:%f\n",i, Median); - for (j=0;j>56); - b[1] = (unsigned char)(num>>48); - b[2] = (unsigned char)(num>>40); - b[3] = (unsigned char)(num>>32); - b[4] = (unsigned char)(num>>24); - b[5] = (unsigned char)(num>>16); - b[6] = (unsigned char)(num>>8); - b[7] = (unsigned char)(num); -// if(dataEndianType==LITTLE_ENDIAN_DATA) -// symTransform_8bytes(*b); -} - -inline void longToBytes_bigEndian_memset(unsigned char *b, unsigned long num) -{ - checkCudaErrors(cudaMemset(&b[0], (unsigned char)(num>>56), sizeof(char))); - checkCudaErrors(cudaMemset(&b[1], (unsigned char)(num>>48), sizeof(char))); - checkCudaErrors(cudaMemset(&b[2], (unsigned char)(num>>40), sizeof(char))); - checkCudaErrors(cudaMemset(&b[3], (unsigned char)(num>>32), sizeof(char))); - checkCudaErrors(cudaMemset(&b[4], (unsigned char)(num>>24), sizeof(char))); - checkCudaErrors(cudaMemset(&b[5], (unsigned char)(num>>16), sizeof(char))); - checkCudaErrors(cudaMemset(&b[6], (unsigned char)(num>>8), sizeof(char))); - checkCudaErrors(cudaMemset(&b[7], (unsigned char)(num), sizeof(char))); -// if(dataEndianType==LITTLE_ENDIAN_DATA) -// symTransform_8bytes(*b); -} - -__device__ inline void shortToBytes_d(unsigned char* b, short value) -{ - lint16 buf; - buf.svalue = value; - memcpy(b, buf.byte, 2); -} - -__global__ void getNumNonConstantBlocks(size_t nbBlocks, short *offsets, unsigned char *meta, int blockSize, int *nonconstant, int *out_size){ - for (int tid = blockDim.x*blockIdx.x + threadIdx.x; tid < nbBlocks; tid += blockDim.x*gridDim.x){ - if (meta[tid] == 3){ - atomicAdd(nonconstant, 1); - atomicAdd(out_size,1+(blockSize/4)+offsets[tid]); - } - } -} - -__global__ void ncblkCopy(unsigned char * c, unsigned char* o, unsigned char *nc, unsigned char* midBytes, unsigned char* meta, - size_t nbBlocks, int blockSize, short *offsets, size_t mSize) -{ - for (int i=blockDim.x*blockIdx.x + threadIdx.x; i>>(nbBlocks, 
offsets, meta, blockSize, nonconstant_d, out_size_d); - cudaDeviceSynchronize(); - - checkCudaErrors(cudaMemcpy(&nonconstant_h, nonconstant_d, sizeof(int), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(&tmp_outsize, out_size_d, sizeof(int), cudaMemcpyDeviceToHost)); - - nbConstantBlocks = nbBlocks - nonconstant_h; - out_size_h+=tmp_outsize; - - out_size_h += (nbBlocks-nbConstantBlocks)*sizeof(short)+(nbEle%blockSize)*sizeof(float); - - //outBytes = (unsigned char*)malloc(out_size); - unsigned char* r = outBytes; - unsigned char* r_old = outBytes; - checkCudaErrors(cudaMemset(r, SZx_VER_MAJOR, sizeof(char))); - checkCudaErrors(cudaMemset(r+1, SZx_VER_MINOR, sizeof(char))); - checkCudaErrors(cudaMemset(r+2, 1, sizeof(char))); - checkCudaErrors(cudaMemset(r+3, 0, sizeof(char))); - checkCudaErrors(cudaMemset(r+4, blockSize, sizeof(char))); - - r=r+5; //1 byte - //sizeToBytes(r, nbConstantBlocks); - longToBytes_bigEndian_memset(r, nbConstantBlocks); - r += sizeof(size_t); - //sizeToBytes(r, (size_t) num_sig); - longToBytes_bigEndian_memset(r, (unsigned long)num_sig); - r += sizeof(size_t); - size_t out_length; - - if(nbBlocks%4==0) - out_length = nbBlocks/4; - else - out_length = nbBlocks/4+1; - - convert_state_to_out_kernel<<<40,256>>>(meta, nbBlocks, r, out_length); - r+=out_length; - convert_block2_to_out_kernel<<<40,256>>>(r, nbBlocks,num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); - r += nbBlocks*4 + num_sig*sizeof(float) + num_sig*sizeof(uint8_t) + nbBlocks*sizeof(uint8_t); - - checkCudaErrors(cudaMemcpy(r, oriData+nbBlocks*blockSize, (nbEle%blockSize)*sizeof(float), cudaMemcpyDeviceToDevice)); - // memcpy(r, oriData+nbBlocks*blockSize, (nbEle%blockSize)*sizeof(float)); - r += (nbEle%blockSize)*sizeof(float); - unsigned char* c = r; - unsigned char* o = c+nbConstantBlocks*sizeof(float); - unsigned char* nc = o+(nbBlocks-nbConstantBlocks)*sizeof(short); - ncblkCopy<<<1,1>>>(c, o, nc, midBytes, meta,nbBlocks, blockSize, offsets, mSize); - cudaDeviceSynchronize(); - return (size_t) (nc-r_old); - // checkCudaErrors(cudaMemcpy(outSize, (size_t)(nc-r_old), sizeof(size_t))); - // *outSize = (size_t) (nc-r_old); - // return outBytes; -} - -__global__ void device_post_proc(size_t *outSize, float *oriData, unsigned char *meta, - short *offsets, unsigned char *midBytes, unsigned char *outBytes, - size_t nbEle, int blockSize, uint64_t num_sig, uint32_t *blk_idx, - float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig) -{ - int out_size = 0; - - size_t nbConstantBlocks = 0; - size_t nbBlocks = nbEle/blockSize; - size_t ncBytes = blockSize/4; - size_t mSize = sizeof(float)+1+ncBytes; //Number of bytes for each data block's metadata. 
- out_size += 5+sizeof(size_t)+sizeof(float)*nbBlocks; - if (nbBlocks%8==0) - out_size += nbBlocks/8; - else - out_size += nbBlocks/8+1; - int s0 = 0; - int s1 = 0; - int s2 = 0; - int s3 = 0; - for (int i=0; i>>(d_oriData, threshold, nbEle); - // cudaDeviceSynchronize(); - dim3 dimBlock(32, blockSize/32); - dim3 dimGrid(65536, 1); - const int sMemsize = blockSize * sizeof(float) + dimBlock.y * sizeof(int); - //printf("Malloc end timestamp: %f ms\n", timer_GPU.GetCounter()); - compress_float<<>>(d_oriData, d_meta, d_offsets, d_midBytes, absErrBound, blockSize, nbBlocks, mSize, sparsity_level, d_blk_idx, d_blk_subidx,d_blk_vals, threshold, d_blk_sig); - cudaError_t err = cudaGetLastError(); // Get error code - // printf("CUDA Error: %s\n", cudaGetErrorString(err)); - //printf("GPU compression timestamp: %f ms\n", timer_GPU.GetCounter()); - cudaDeviceSynchronize(); - get_numsig<<<1,1>>>(d_num_sig); - cudaDeviceSynchronize(); - - checkCudaErrors(cudaMemcpy(num_sig, d_num_sig, sizeof(uint64_t), cudaMemcpyDeviceToHost)); - - // These are allocations and memcpys to host pointers, do not need them - - // blk_idx = (uint32_t *)malloc(nbBlocks*sizeof(uint32_t)); - // blk_vals= (float *)malloc((*num_sig)*sizeof(float)); - // blk_subidx = (uint8_t *)malloc((*num_sig)*sizeof(uint8_t)); - // blk_sig = (uint8_t *)malloc(nbBlocks*sizeof(uint8_t)); - - // checkCudaErrors(cudaMemcpy(meta, d_meta, msz, cudaMemcpyDeviceToHost)); - // checkCudaErrors(cudaMemcpy(offsets, d_offsets, nbBlocks*sizeof(short), cudaMemcpyDeviceToHost)); - // checkCudaErrors(cudaMemcpy(midBytes, d_midBytes, mbsz, cudaMemcpyDeviceToHost)); - - - // checkCudaErrors(cudaMemcpy(blk_idx, d_blk_idx, nbBlocks*sizeof(uint32_t), cudaMemcpyDeviceToHost)); - // checkCudaErrors(cudaMemcpy(blk_vals,d_blk_vals, (*num_sig)*sizeof(float), cudaMemcpyDeviceToHost)); - // checkCudaErrors(cudaMemcpy(blk_subidx,d_blk_subidx, (*num_sig)*sizeof(uint8_t), cudaMemcpyDeviceToHost)); - // checkCudaErrors(cudaMemcpy(blk_sig,d_blk_sig, (nbBlocks)*sizeof(uint8_t), cudaMemcpyDeviceToHost)); - - - size_t maxPreservedBufferSize = sizeof(float)*nbEle; - unsigned char *d_outBytes; - // unsigned char* outBytes = (unsigned char*)malloc(maxPreservedBufferSize); - // memset(outBytes, 0, maxPreservedBufferSize); - checkCudaErrors(cudaMalloc(&d_outBytes, maxPreservedBufferSize)); - - size_t *d_outSize; - - checkCudaErrors(cudaMalloc(&d_outSize, sizeof(size_t))); - - // device_post_proc<<<1,1>>>(d_outSize, d_oriData, d_meta, d_offsets, d_midBytes, d_outBytes, nbEle, blockSize, *num_sig, d_blk_idx, d_blk_vals, d_blk_subidx, d_blk_sig); - *outSize = better_post_proc(d_outSize, d_oriData, d_meta, d_offsets, d_midBytes, d_outBytes, nbEle, blockSize, *num_sig, d_blk_idx, d_blk_vals, d_blk_subidx, d_blk_sig); - //cudaDeviceSynchronize(); - - //checkCudaErrors(cudaMemcpy(outSize, d_outSize, sizeof(size_t), cudaMemcpyDeviceToHost)); - - // printf("completed compression\n"); - //free(blk_idx); - //free(blk_subidx); - //free(blk_vals); - // free(meta); - // free(offsets); - // free(midBytes); - checkCudaErrors(cudaFree(d_num_sig)); - checkCudaErrors(cudaFree(d_blk_idx)); - checkCudaErrors(cudaFree(d_blk_subidx)); - checkCudaErrors(cudaFree(d_blk_vals)); - checkCudaErrors(cudaFree(d_blk_sig)); - - checkCudaErrors(cudaFree(d_meta)); - checkCudaErrors(cudaFree(d_offsets)); - checkCudaErrors(cudaFree(d_midBytes)); -// printf("completed compression\n"); - printf("Compression end timestamp: %f ms\n", timer_GPU.GetCounter()); - - printf("CUDA Error: %s\n", cudaGetErrorString(err)); - return 
d_outBytes; -} - -__device__ inline long bytesToLong_bigEndian(unsigned char* b) { - long temp = 0; - long res = 0; - - res <<= 8; - temp = b[0] & 0xff; - res |= temp; - - res <<= 8; - temp = b[1] & 0xff; - res |= temp; - - res <<= 8; - temp = b[2] & 0xff; - res |= temp; - - res <<= 8; - temp = b[3] & 0xff; - res |= temp; - - res <<= 8; - temp = b[4] & 0xff; - res |= temp; - - res <<= 8; - temp = b[5] & 0xff; - res |= temp; - - res <<= 8; - temp = b[6] & 0xff; - res |= temp; - - res <<= 8; - temp = b[7] & 0xff; - res |= temp; - - return res; -} - -__device__ inline size_t bytesToSize(unsigned char* bytes) -{ - size_t result = bytesToLong_bigEndian(bytes);//8 - return result; -} - -__device__ inline short bytesToShort(unsigned char* bytes) -{ - lint16 buf; - memcpy(buf.byte, bytes, 2); - - return buf.svalue; -} - -__global__ void decompress_get_stats(float *newData, size_t nbEle, unsigned char* cmpBytes, - size_t *numSigValues, int *bs, - size_t *numConstantBlks, size_t *numBlks, - size_t *mSizeptr, unsigned char *newCmpBytes -){ - unsigned char* r = cmpBytes; - size_t num_sig; - r += 4; - int blockSize = (int) r[0]; //get block size - - if(blockSize == 0)blockSize = 256; - r++; - size_t nbConstantBlocks = bytesToLong_bigEndian(r); //get number of constant blocks - r += sizeof(size_t); - num_sig = bytesToSize(r); - - r += sizeof(size_t); - size_t nbBlocks = nbEle/blockSize; - size_t ncBlocks = 0; - size_t num_state2_blks = 0; - // size_t ncBlocks = nbBlocks - nbConstantBlocks; //get number of constant blocks - size_t stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; - size_t ncLeading = blockSize/4; - size_t mSize = sizeof(float)+1+ncLeading; //Number of bytes for each data block's metadata. - - *mSizeptr = mSize; - - *numConstantBlks = nbConstantBlocks; - *numBlks = nbBlocks; - *numSigValues = num_sig; - *bs = blockSize; - newCmpBytes = r; - // printf("nb blocks: %d\n", nbBlocks); - -} - - void setup_data_stateArray_better(float *newData, size_t nbEle, unsigned char* r, - size_t num_sig, int blockSize, - size_t nbConstantBlocks, size_t nbBlocks, size_t *ncBlks, - unsigned char *stateArray, unsigned char *newR -){ - - //printf("ma\n"); - blockSize = 256; - r += 4; - r++; - r += sizeof(size_t); - r += sizeof(size_t); - int ncBlocks, *ncBlocks_d; - size_t stateNBBytes = nbBlocks%4==0 ? 
nbBlocks/4 : nbBlocks/4+1; - int num_state2_blks, *num_state2_d; - checkCudaErrors(cudaMalloc((void **)&num_state2_d, sizeof(int))); - checkCudaErrors(cudaMalloc((void **)&ncBlocks_d, sizeof(int))); - checkCudaErrors(cudaMemset(num_state2_d, 0, sizeof(int))); - checkCudaErrors(cudaMemset(ncBlocks_d, 0, sizeof(int))); - - //printf("ma2\n"); -// printf("Converting state array\n"); - // printf("cmp %d\n", (int)r[0]); - // printf("state %d\n", (int)stateArray[0]); - // convert_out_to_state(nbBlocks, r, stateArray); - convert_out_to_state_kernel<<<40,256>>>(nbBlocks,r,stateArray,stateNBBytes, - num_state2_d, ncBlocks_d); - // printf("state %d\n", (int)stateArray[0]); - // convertByteArray2IntArray_fast_1b_args(nbBlocks, r, stateNBBytes, stateArray); //get the stateArray - cudaDeviceSynchronize(); - - //printf("ma3\n"); - r += stateNBBytes; - newR = r; - cudaMemcpy(&ncBlocks, ncBlocks_d, sizeof(int), cudaMemcpyDeviceToHost); - - //printf("ma4\n"); - *ncBlks = ncBlocks; - - //printf("ma4\n"); - } - -__global__ void setup_data_stateArray(float *newData, size_t nbEle, unsigned char* r, - size_t num_sig, int blockSize, - size_t nbConstantBlocks, size_t nbBlocks, size_t *ncBlks, - unsigned char *stateArray, unsigned char *newR -){ - blockSize = 256; - r += 4; - r++; - r += sizeof(size_t); - r += sizeof(size_t); - size_t ncBlocks = 0; - size_t stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; - size_t num_state2_blks = 0; -// printf("Converting state array\n"); - // printf("cmp %d\n", (int)r[0]); - // printf("state %d\n", (int)stateArray[0]); - convert_out_to_state(nbBlocks, r, stateArray); - // convert_out_to_state_kernel<<<40,256>>>(nbBlocks,r,stateArray,stateNBBytes); - // printf("state %d\n", (int)stateArray[0]); - // convertByteArray2IntArray_fast_1b_args(nbBlocks, r, stateNBBytes, stateArray); //get the stateArray - for (size_t i = 0; i < nbBlocks; i++) - { - if (stateArray[i] == 2) - { - num_state2_blks++; - }else if(stateArray[i] == 3){ - ncBlocks++; - } - } - - r += stateNBBytes; - newR = r; - *ncBlks = ncBlocks; -} - -__global__ void decomp_startup_kernel(unsigned char* r, size_t nbConstantBlocks, -unsigned char *data, int blockSize, size_t mSize, size_t ncBlocks, float *constantMedianArray){ - unsigned char * fr = r; //fr is the starting address of constant median values. 
- int i = 0, j = 0, k = 0; - // printf("%p\n", r); - unsigned char tmp_r[4]; - tmp_r[0]=fr[0]; - tmp_r[1]=fr[1]; - tmp_r[2]=fr[2]; - tmp_r[3]=fr[3]; - - -// printf("nbconstant: %f\n", ((float*)tmp_r)[0]); -// nbConstantBlocks - for(i = blockDim.x*blockIdx.x + threadIdx.x; i < nbConstantBlocks; i += blockDim.x*gridDim.x, j+=4){ //get the median values for constant-value blocks - - tmp_r[0]=fr[j]; - tmp_r[1]=fr[j+1]; - tmp_r[2]=fr[j+2]; - tmp_r[3]=fr[j+3]; - float tmp = ((float*)tmp_r)[0]; - constantMedianArray[i] = tmp; - // printf("%d %f\n", i, tmp); - } - - fr += nbConstantBlocks*sizeof(float); - unsigned char* p = fr + ncBlocks * sizeof(short); - for(i = blockDim.x*blockIdx.x + threadIdx.x;i < ncBlocks;i += blockDim.x*gridDim.x){ - int leng = (int)bytesToShort(fr)+mSize; - fr += sizeof(short); - if (leng > blockSize*sizeof(float)) - { - printf("Warning: compressed block is larger than the original block!\n"); - return; - // exit(0); - } - memcpy(data+i*blockSize*sizeof(float), p, leng); - - p += leng; - } -} - -void decompress_startup_better(float *newData, size_t nbEle, unsigned char* r, - uint32_t *blk_idx, uint8_t *blk_subidx, uint8_t *blk_sig, - float *blk_vals, size_t num_sig, int blockSize, - size_t nbConstantBlocks, size_t nbBlocks, size_t ncBlocks, - unsigned char *stateArray, float* constantMedianArray, unsigned char *data, - size_t mSize, unsigned char *newCmpBytes -){ - blockSize = 256; - size_t nb_tmp = (int) nbEle/256; - /** - * Structures to return: - * blk_idx, blk_subidx, blk_sig, blk_vals, numSigValues (pointer) - * bs (pointer to blockSize), numConstantBlks (pointer), numBlks (pointer) - * ncBlks (pointer), stateArray, constantMedianArray - */ - - - size_t stateNBBytes = nb_tmp%4==0 ? nb_tmp/4 : nb_tmp/4+1; - - r += 4; - r++; - r += sizeof(size_t); - r += sizeof(size_t); - - r += stateNBBytes; - - convert_out_to_block2_kernel<<<40,256>>>(r, nbBlocks, (uint64_t)num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); - size_t to_add = nbBlocks*4 + num_sig*sizeof(float) + num_sig*sizeof(uint8_t) + nbBlocks*sizeof(uint8_t); - r+= to_add; - - size_t i = 0, j = 0, k = 0; //k is used to keep track of constant block index - - // printf("before mallocs in kernel\n"); - checkCudaErrors(cudaMemcpy((newData)+nbBlocks*blockSize, r, (nbEle%blockSize)*sizeof(float), cudaMemcpyDeviceToDevice)); - // memcpy((newData)+nbBlocks*blockSize, r, (nbEle%blockSize)*sizeof(float)); - - //printf("before mallocs in kernel %p\n", r); - r += (nbEle%blockSize)*sizeof(float); - //printf("r: %p\n", r); - //printf("%d, %d, %d\n",nbEle, 256, nbEle%256); - decomp_startup_kernel<<<40,256>>>(r, nbConstantBlocks,data, blockSize, mSize, ncBlocks, constantMedianArray); - cudaDeviceSynchronize(); - r += nbConstantBlocks*sizeof(float); - - newCmpBytes = r; - -} - -__global__ void decompress_startup(float *newData, size_t nbEle, unsigned char* r, - uint32_t *blk_idx, uint8_t *blk_subidx, uint8_t *blk_sig, - float *blk_vals, size_t num_sig, int blockSize, - size_t nbConstantBlocks, size_t nbBlocks, size_t ncBlocks, - unsigned char *stateArray, float* constantMedianArray, unsigned char *data, - size_t mSize, unsigned char *newCmpBytes -){ - blockSize = 256; - size_t nb_tmp = (int) nbEle/256; - /** - * Structures to return: - * blk_idx, blk_subidx, blk_sig, blk_vals, numSigValues (pointer) - * bs (pointer to blockSize), numConstantBlks (pointer), numBlks (pointer) - * ncBlks (pointer), stateArray, constantMedianArray - */ - - // size_t ncBlocks = 0; - // size_t stateNBBytes = nbBlocks%4==0 ? 
nbBlocks/4 : nbBlocks/4+1; - // size_t num_state2_blks = 0; - // printf("Converting state array\n"); - // convert_out_to_state(nbBlocks, r, stateArray); - // printf("state %d\n", (int)stateArray[0]); - // // convertByteArray2IntArray_fast_1b_args(nbBlocks, r, stateNBBytes, stateArray); //get the stateArray - // for (size_t i = 0; i < nbBlocks; i++) - // { - // if (stateArray[i] == 2) - // { - // num_state2_blks++; - // }else if(stateArray[i] == 3){ - // ncBlocks++; - // } - // } - // size_t stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; - - size_t stateNBBytes = nb_tmp%4==0 ? nb_tmp/4 : nb_tmp/4+1; - //printf("%p\n", r); - r += 4; - r++; - r += sizeof(size_t); - r += sizeof(size_t); - //printf("statenb %d %d\n", stateNBBytes, nb_tmp); - r += stateNBBytes; - // data = (unsigned char*)malloc(ncBlocks*blockSize*sizeof(float)); - // memset(data, 0, ncBlocks*blockSize*sizeof(float)); - // printf("converting block vals %d\n", data[0]); - size_t to_add = convert_out_to_block2(r, nbBlocks, (uint64_t)num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); - r+= to_add; - - size_t i = 0, j = 0, k = 0; //k is used to keep track of constant block index - - // printf("before mallocs in kernel\n"); - - memcpy((newData)+nbBlocks*blockSize, r, (nbEle%blockSize)*sizeof(float)); - - //printf("before mallocs in kernel %p\n", r); - r += (nbEle%blockSize)*sizeof(float); - //printf("r: %p\n", r); - //printf("%d, %d, %d\n",nbEle, 256, nbEle%256); - unsigned char * fr = r; //fr is the starting address of constant median values. - - // printf("%p\n", r); - unsigned char tmp_r[4]; - tmp_r[0]=r[0]; - tmp_r[1]=r[1]; - tmp_r[2]=r[2]; - tmp_r[3]=r[3]; - - -// printf("nbconstant: %f\n", ((float*)tmp_r)[0]); - for(i = 0;i < nbConstantBlocks;i++, j+=4){ //get the median values for constant-value blocks - - tmp_r[0]=r[j]; - tmp_r[1]=r[j+1]; - tmp_r[2]=r[j+2]; - tmp_r[3]=r[j+3]; - float tmp = ((float*)tmp_r)[0]; -// printf("median: %f\n", tmp); - constantMedianArray[i] = tmp; - - // printf("%d %f\n", i, tmp); - } - //printf("after constantmedian\n"); - r += nbConstantBlocks*sizeof(float); - unsigned char* p = r + ncBlocks * sizeof(short); - for(i = 0;i < ncBlocks;i++){ - int leng = (int)bytesToShort(r)+mSize; - r += sizeof(short); - if (leng > blockSize*sizeof(float)) - { - printf("Warning: compressed block is larger than the original block!\n"); - return; - // exit(0); - } -// printf("before memcpy\n"); - memcpy(data+i*blockSize*sizeof(float), p, leng); - // printf("after memcpy\n"); - p += leng; - } - - newCmpBytes = r; -// printf("before mallocs in kernel\n"); - - // printf("nb blocks: %d\n", nbBlocks); -} - -__global__ void cBlkCopy_decompress(int nb, float* constantMedianArray, float *newData, int blockSize, int i){ - int j; - float Median = constantMedianArray[nb]; - // j = threadIdx.x; j < blockSize; j += blockDim.x - for (j = threadIdx.x; j < blockSize; j += blockDim.x) - *((newData)+i*blockSize+j) = Median; -} - -__global__ void ncBlkCopy_decompress(int blockSize, float *newData, int nc, float *fdata, int i){ - int j; - for (j = threadIdx.x; j < blockSize; j += blockDim.x) - *((newData)+i*blockSize+j) = fdata[nc*blockSize+j]; -} - -void decompress_post_proc_better(unsigned char *data, float *newData, int blockSize, - size_t nbBlocks, size_t ncBlocks, unsigned char *stateArray, - float *constantMedianArray -){ - // checkCudaErrors(cudaMemcpy(data, d_data, ncBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); - // checkCudaErrors(cudaMemcpy(*newData, d_newdata, nbBlocks*blockSize*sizeof(float), 
cudaMemcpyDeviceToHost)); - float* fdata = (float*)data; - int i,j; - int nb=0, nc=0; - //printf("h1\n"); - for (i=0;i>>(nb, constantMedianArray, newData, blockSize, i); - nb++; - }else if(state==3){ - ncBlkCopy_decompress<<<1,256>>>(blockSize, newData, nc, fdata, i); - nc++; - } - } - cudaDeviceSynchronize(); - //for(int k = 0; k < nbBlocks*blockSize;k++){ -// printf("%f\n", newData[k]); - // } -} - -__global__ void print_newdata(float *newData, size_t nbBlocks, int blockSize){ - for (size_t i = 0; i < nbBlocks*blockSize; i++) - { - printf("%f\n", newData[i]); - } - -} - -__global__ void decompress_post_proc(unsigned char *data, float *newData, int blockSize, - size_t nbBlocks, size_t ncBlocks, unsigned char *stateArray, - float *constantMedianArray -){ - // checkCudaErrors(cudaMemcpy(data, d_data, ncBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); - // checkCudaErrors(cudaMemcpy(*newData, d_newdata, nbBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); - float* fdata = (float*)data; - int i,j; - int nb=0, nc=0; - // if (blockIdx.x == 0) - // { - // for (i=0;i1) printf("data%i:%f\n",i, Median); - // for (j = threadIdx.x; j < blockSize; j += blockDim.x) - // *((newData)+i*blockSize+j) = Median; - // nb++; - // } - // } - // }else{ - // for (i=0;i1) printf("data%i:%f\n",i, Median); - for (j = threadIdx.x; j < blockSize; j += blockDim.x) - *((newData)+i*blockSize+j) = Median; - nb++; - }else if(stateArray[i]==3){ - for (j = threadIdx.x; j < blockSize; j += blockDim.x) - *((newData)+i*blockSize+j) = fdata[nc*blockSize+j]; - nc++; - } - } - - //for(int k = 0; k < nbBlocks*blockSize;k++){ -// printf("%f\n", newData[k]); - // } -} - -float* device_ptr_cuSZx_decompress_float(size_t nbEle, unsigned char* cmpBytes) -{ - /** - * Assume the following are device pointers - * - * unsigned char* cmpBytes - * float** newData - * - */ - - uint32_t *blk_idx; - uint8_t *blk_subidx; - uint8_t *blk_sig; - float *blk_vals, *constantMedianArray; - size_t *num_sig, *mSize, mSize_h, num_sig_h; - int *blockSize, bs; - size_t *nbConstantBlocks, *nbBlocks, *ncBlocks, nbBlocks_h, ncBlocks_h, nbConstantBlocks_h; - unsigned char *stateArray, *data; - float *newData; - - unsigned char *oldCmpBytes = cmpBytes; - //*newData = (float*)malloc(sizeof(float)*nbEle); -// printf("cmpbytes check %d\n", (int)cmpBytes[0]); -// printf("new check %f\n", *newData[0]); - // printf("malloc\n"); - checkCudaErrors(cudaMalloc((void**)&num_sig, sizeof(size_t))); - checkCudaErrors(cudaMalloc((void**)&blockSize, sizeof(int))); - checkCudaErrors(cudaMalloc((void**)&nbConstantBlocks, sizeof(size_t))); - checkCudaErrors(cudaMalloc((void**)&nbBlocks, sizeof(size_t))); - checkCudaErrors(cudaMalloc((void**)&ncBlocks, sizeof(size_t))); - checkCudaErrors(cudaMalloc((void**)&mSize, sizeof(size_t))); - checkCudaErrors(cudaMalloc((void**)&newData, sizeof(float)*nbEle)); - - decompress_get_stats<<<1,1>>>(newData, nbEle, cmpBytes, - num_sig, blockSize, - nbConstantBlocks, nbBlocks, - mSize, cmpBytes - ); - cudaDeviceSynchronize(); - - cudaError_t err = cudaGetLastError(); // Get error code - //printf("CUDA Error: %s\n", cudaGetErrorString(err)); - checkCudaErrors(cudaMemcpy(&nbBlocks_h, nbBlocks, sizeof(size_t), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(&nbConstantBlocks_h, nbConstantBlocks, sizeof(size_t), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(&bs, blockSize, sizeof(int), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(&mSize_h, mSize, sizeof(size_t), cudaMemcpyDeviceToHost)); - 
checkCudaErrors(cudaMemcpy(&num_sig_h, num_sig, sizeof(size_t), cudaMemcpyDeviceToHost)); - - - checkCudaErrors(cudaMalloc((void**)&stateArray, nbBlocks_h)); - checkCudaErrors(cudaMalloc((void**)&constantMedianArray, nbConstantBlocks_h*sizeof(float))); - - checkCudaErrors(cudaMalloc((void**)&blk_idx, nbBlocks_h*sizeof(uint32_t))); - checkCudaErrors(cudaMalloc((void**)&blk_vals, num_sig_h*sizeof(float))); - checkCudaErrors(cudaMalloc((void**)&blk_subidx, num_sig_h*sizeof(uint8_t))); - checkCudaErrors(cudaMalloc((void**)&blk_sig, nbBlocks_h*sizeof(uint8_t))); - - setup_data_stateArray<<<1,1>>>(newData, nbEle, cmpBytes, - num_sig_h, bs, - nbConstantBlocks_h, nbBlocks_h, ncBlocks, - stateArray, cmpBytes - ); - cudaDeviceSynchronize(); - - // printf("%s\n", cudaGetErrorString(cudaGetLastError())); - checkCudaErrors(cudaMemcpy(&ncBlocks_h, ncBlocks, sizeof(size_t), cudaMemcpyDeviceToHost)); - - checkCudaErrors(cudaMalloc((void**)&data, ncBlocks_h*bs*sizeof(float))); - // cmpBytes = newCmpBytes; - // data = (unsigned char*)malloc(ncBlocks*blockSize*sizeof(float)); - // memset(data, 0, ncBlocks*blockSize*sizeof(float)); - // stateArray = (unsigned char*)malloc(nbBlocks); - - // // unsigned char* d_stateArray; - // // cudaMalloc(&d_stateArray, nbBlocks); - // constantMedianArray = (float*)malloc(nbConstantBlocks*sizeof(float)); - - // blk_idx = (uint32_t *)malloc(nbBlocks*sizeof(uint32_t)); - // blk_vals= (float *)malloc((num_sig)*sizeof(float)); - // blk_subidx = (uint8_t *)malloc((num_sig)*sizeof(uint8_t)); - // blk_sig = (uint8_t *)malloc(nbBlocks*sizeof(uint8_t)); - - //printf("%s\n", cudaGetErrorString(cudaGetLastError())); - //test_nbBlks = (size_t *)malloc(sizeof(size_t)); - // printf("malloc\n"); - decompress_startup<<<1,1>>>(newData, nbEle, cmpBytes, - blk_idx, blk_subidx, blk_sig, - blk_vals, num_sig_h, bs, - nbConstantBlocks_h, nbBlocks_h, ncBlocks_h, - stateArray, constantMedianArray, data, mSize_h, cmpBytes); - cudaDeviceSynchronize(); - // cmpBytes = newCmpBytes; - - //printf("%s\n", cudaGetErrorString(cudaGetLastError())); - - // unsigned char* d_data; - float *d_newdata; - // checkCudaErrors(cudaMalloc((void**)&d_data, ncBlocks*blockSize*sizeof(float))); - // checkCudaErrors(cudaMemcpy(d_data, data, ncBlocks*blockSize*sizeof(float), cudaMemcpyHostToDevice)); - //printf("nblocks: %d bs: %d\n", nbBlocks_h, bs); - checkCudaErrors(cudaMalloc(&d_newdata, nbBlocks_h*bs*sizeof(float))); - - timer_GPU.StartCounter(); - dim3 dimBlock(32, bs/32); - dim3 dimGrid(65536, 1); - const int sMemsize = bs * sizeof(float) + dimBlock.y * sizeof(int); - decompress_state2<<>>(d_newdata, stateArray,blk_idx, blk_vals, blk_subidx, bs, blk_sig); - decompress_float<<>>(data, bs, ncBlocks_h, mSize_h); - //err = cudaGetLastError(); // Get error code - //printf("CUDA Error: %s\n", cudaGetErrorString(err)); - //printf("GPU decompression timing: %f ms\n", timer_GPU.GetCounter()); - cudaDeviceSynchronize(); - - err = cudaGetLastError(); // Get error code - printf("CUDA Error: %s\n", cudaGetErrorString(err)); - printf("GPU decompression timing: %f ms\n", timer_GPU.GetCounter()); - // checkCudaErrors(cudaMemcpy(data, d_data, ncBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(newData, d_newdata, nbBlocks_h*bs*sizeof(float), cudaMemcpyDeviceToDevice)); - cudaFree(d_newdata); - - decompress_post_proc<<<1,1>>>(data, newData, bs, - nbBlocks_h, ncBlocks_h, stateArray, - constantMedianArray); - cudaDeviceSynchronize(); - print_newdata<<<1,1>>>(newData, nbBlocks_h, bs); - 
cudaFree(stateArray); - cudaFree(constantMedianArray); - cudaFree(data); - cudaFree(blk_idx); - cudaFree(blk_subidx); - cudaFree(blk_vals); - cudaFree(blk_sig); - return newData; - -} - +#include "cuszx_entry.h" +#include "szx_defines.h" +#include "szx_BytesToolkit.h" +#include "szx_TypeManager.h" +#include "timingGPU.h" +#include "szx.h" +#include +#include +#include + +#define SPARSITY_LEVEL 0.25 + +TimingGPU timer_GPU; +void bin(unsigned n) +{ + unsigned i; + for (i = 1 << 31; i > 0; i = i / 2) + (n & i) ? printf("1") : printf("0"); +} + +__host__ __device__ size_t convert_state_to_out(unsigned char* meta, size_t length, unsigned char *result){ + size_t out_length; + + if(length%4==0) + out_length = length/4; + else + out_length = length/4+1; + + for (size_t i = 0; i < out_length; i++) + { + uint8_t tmp = 0; + + for (size_t j = 0; j < 4; j++) + { + if (i*4 + j < length) + { + tmp |= (0x03 & meta[i*4+j]) << 2*j; + } + + } + result[i] = tmp; + } + return out_length; +} + +__global__ void convert_state_to_out_kernel(unsigned char* meta, size_t length, unsigned char *result, size_t out_length){ + + + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < out_length; i += blockDim.x*gridDim.x){ + uint8_t tmp = 0; + + for (size_t j = 0; j < 4; j++) + { + if (i*4 + j < length) + { + tmp |= (0x03 & meta[i*4+j]) << 2*j; + } + + } + result[i] = tmp; + } +} + +__global__ void convert_out_to_state_kernel(size_t nbBlocks, unsigned char* cmp, unsigned char* out_state, size_t state_length, int *num_state2blks, int *ncBlocks){ + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < state_length; i += blockDim.x*gridDim.x){ + for (size_t j = 0; j < 4; j++) + { + if (4*i + j < nbBlocks) + { + out_state[4*i + j]= (cmp[i] >> 2*j) & 0x03; + if (out_state[4*i+j] == 2) + { + atomicAdd(num_state2blks, 1); + }else if(out_state[4*i+j]==3){ + atomicAdd(ncBlocks, 1); + } + + } + + } + } +} + +// nbBlocks, r, stateNBBytes, stateArray +__host__ __device__ size_t convert_out_to_state(size_t nbBlocks, unsigned char* cmp, unsigned char* out_state){ + size_t state_length; + if(nbBlocks%4==0) + state_length = nbBlocks/4; + else + state_length = nbBlocks/4+1; + + for (size_t i = 0; i < state_length; i++) + { + for (size_t j = 0; j < 4; j++) + { + if (4*i + j < nbBlocks) + { + out_state[4*i + j]= (cmp[i] >> 2*j) & 0x03; + } + + } + } + return nbBlocks; +} + +__host__ __device__ size_t convert_block2_to_out(unsigned char *result, uint32_t numBlocks, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig){ + size_t out_length = 0; + + memcpy(result, blk_idx, numBlocks*sizeof(uint32_t)); + out_length += numBlocks*4; + memcpy(result+out_length, blk_vals, num_sig*sizeof(float)); + out_length += num_sig*sizeof(float); + memcpy(result+out_length, blk_subidx, num_sig*sizeof(uint8_t)); + out_length += num_sig*sizeof(uint8_t); + memcpy(result+out_length, blk_sig, numBlocks*sizeof(uint8_t)); + out_length+= numBlocks*sizeof(uint8_t); + + return out_length; +} + +__global__ void convert_block2_to_out_kernel(unsigned char *result, uint32_t numBlocks, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig){ + + size_t out_length = 0; + unsigned char *tmp_result = result; + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < numBlocks; i += blockDim.x*gridDim.x){ + uint32_t local_blkidx = blk_idx[i]; + tmp_result[4*i] = (local_blkidx) & 0xff; + tmp_result[4*i+1] = (local_blkidx >> (8*1)) & 0xff; + tmp_result[4*i+2] = (local_blkidx >> (8*2)) & 0xff; + 
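        /* [Editor's illustrative sketch -- not part of the original patch.]
         * The stores around this point spell each uint32_t block index out as
         * four little-endian bytes by hand, so the device kernel emits the same
         * layout as the host-side memcpy in convert_block2_to_out(). A minimal
         * host round trip of that layout (assuming little-endian byte order, as
         * the code does):
         *
         *   uint32_t idx = 0x01020304u;
         *   unsigned char buf[4];
         *   for (int b = 0; b < 4; b++) buf[b] = (unsigned char)((idx >> (8*b)) & 0xff);
         *   uint32_t back = (uint32_t)buf[0] | ((uint32_t)buf[1] << 8)
         *                 | ((uint32_t)buf[2] << 16) | ((uint32_t)buf[3] << 24);
         *   // back == idx; this mirrors the decode in convert_out_to_block2_kernel.
         */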
tmp_result[4*i+3] = (local_blkidx >> (8*3)) & 0xff; + } + // memcpy(result, blk_idx, numBlocks*sizeof(uint32_t)); + out_length += numBlocks*4; + tmp_result = result+out_length; + + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < num_sig; i += blockDim.x*gridDim.x){ + float value = blk_vals[i]; + memcpy(&tmp_result[4*i], &value, sizeof(float)); + //unsigned char *v = () + //tmp_result[(int)4*i] = (unsigned char)((value) & 0xff); + //tmp_result[(int)4*i+1] = (unsigned char)((value >> (8*1)) & 0xff); + //tmp_result[(int)4*i+2] = (unsigned char)((value >> (8*2)) & 0xff); + //tmp_result[(int)4*i+3] = (unsigned char)((value >> (8*3)) & 0xff); + } + // memcpy(result+out_length, blk_vals, num_sig*sizeof(float)); + out_length += num_sig*sizeof(float); + tmp_result = result+out_length; + + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < num_sig; i += blockDim.x*gridDim.x){ + tmp_result[i] = blk_subidx[i]; + + } + + out_length += num_sig*sizeof(uint8_t); + tmp_result = result+out_length; + + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < numBlocks; i += blockDim.x*gridDim.x){ + tmp_result[i] = blk_sig[i]; + + } + out_length+= numBlocks*sizeof(uint8_t); + + // return out_length; +} + +__global__ void convert_out_to_block2_kernel(unsigned char *in_cmp, uint32_t numBlocks, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig){ + size_t out_length = 0; + + unsigned char *tmp_result = in_cmp; + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < numBlocks; i += blockDim.x*gridDim.x){ + + uint32_t local_blkidx = (tmp_result[4*i] & 0xff) | ((tmp_result[4*i+1] & 0xff) << (8*1)) + | ((tmp_result[4*i+2] & 0xff) << (8*2)) | ((tmp_result[4*i+3] & 0xff) << (8*3)); + blk_idx[i] = local_blkidx; + } + // memcpy(result, blk_idx, numBlocks*sizeof(uint32_t)); + out_length += numBlocks*4; + tmp_result = in_cmp+out_length; + + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < num_sig; i += blockDim.x*gridDim.x){ + float value = 0.0; + memcpy(&value, &tmp_result[4*i], sizeof(float)); + blk_vals[i] = value; + + //unsigned char *v = () + //tmp_result[(int)4*i] = (unsigned char)((value) & 0xff); + //tmp_result[(int)4*i+1] = (unsigned char)((value >> (8*1)) & 0xff); + //tmp_result[(int)4*i+2] = (unsigned char)((value >> (8*2)) & 0xff); + //tmp_result[(int)4*i+3] = (unsigned char)((value >> (8*3)) & 0xff); + } + // memcpy(result+out_length, blk_vals, num_sig*sizeof(float)); + out_length += num_sig*sizeof(float); + tmp_result = in_cmp+out_length; + + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < num_sig; i += blockDim.x*gridDim.x){ + blk_subidx[i] = tmp_result[i]; + + } + + out_length += num_sig*sizeof(uint8_t); + tmp_result = in_cmp+out_length; + + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < numBlocks; i += blockDim.x*gridDim.x){ + blk_sig[i] = tmp_result[i]; + + } + out_length+= numBlocks*sizeof(uint8_t); +} + +__host__ __device__ size_t convert_out_to_block2(unsigned char *in_cmp, uint32_t numBlocks, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig){ + size_t out_length = 0; + memcpy(blk_idx, in_cmp, numBlocks*sizeof(uint32_t)); + out_length += numBlocks*4; + memcpy(blk_vals, in_cmp+out_length,num_sig*sizeof(float)); + out_length += num_sig*sizeof(float); + memcpy(blk_subidx, in_cmp+out_length, num_sig*sizeof(uint8_t)); + out_length += num_sig*sizeof(uint8_t); + memcpy(blk_sig, in_cmp+out_length, numBlocks*sizeof(uint8_t)); + out_length += numBlocks*sizeof(uint8_t); +// printf("outlength: 
%d\n",out_length); + return out_length; +} + +int _post_proc(float *oriData, unsigned char *meta, short *offsets, unsigned char *midBytes, unsigned char *outBytes, size_t nbEle, int blockSize, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig) +{ + int out_size = 0; + + size_t nbConstantBlocks = 0; + size_t nbBlocks = nbEle/blockSize; + size_t ncBytes = blockSize/4; + size_t mSize = sizeof(float)+1+ncBytes; //Number of bytes for each data block's metadata. + out_size += 5+sizeof(size_t)+sizeof(float)*nbBlocks; + if (nbBlocks%8==0) + out_size += nbBlocks/8; + else + out_size += nbBlocks/8+1; + int s0 = 0; + int s1 = 0; + int s2 = 0; + int s3 = 0; + for (int i=0; i>>(d_oriData, threshold, nbEle); + // cudaDeviceSynchronize(); + dim3 dimBlock(32, blockSize/32); + dim3 dimGrid(65536, 1); + const int sMemsize = blockSize * sizeof(float) + dimBlock.y * sizeof(int); + compress_float<<>>(d_oriData, d_meta, d_offsets, d_midBytes, absErrBound, blockSize, nbBlocks, mSize, sparsity_level, d_blk_idx, d_blk_subidx,d_blk_vals, threshold, d_blk_sig); + cudaError_t err = cudaGetLastError(); // Get error code + printf("CUDA Error: %s\n", cudaGetErrorString(err)); + printf("GPU compression timing: %f ms\n", timer_GPU.GetCounter()); + cudaDeviceSynchronize(); + get_numsig<<<1,1>>>(d_num_sig); + cudaDeviceSynchronize(); + + checkCudaErrors(cudaMemcpy(num_sig, d_num_sig, sizeof(uint64_t), cudaMemcpyDeviceToHost)); + + blk_idx = (uint32_t *)malloc(nbBlocks*sizeof(uint32_t)); + blk_vals= (float *)malloc((*num_sig)*sizeof(float)); + blk_subidx = (uint8_t *)malloc((*num_sig)*sizeof(uint8_t)); + blk_sig = (uint8_t *)malloc(nbBlocks*sizeof(uint8_t)); + + checkCudaErrors(cudaMemcpy(meta, d_meta, msz, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(offsets, d_offsets, nbBlocks*sizeof(short), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(midBytes, d_midBytes, mbsz, cudaMemcpyDeviceToHost)); + + + checkCudaErrors(cudaMemcpy(blk_idx, d_blk_idx, nbBlocks*sizeof(uint32_t), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(blk_vals,d_blk_vals, (*num_sig)*sizeof(float), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(blk_subidx,d_blk_subidx, (*num_sig)*sizeof(uint8_t), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(blk_sig,d_blk_sig, (nbBlocks)*sizeof(uint8_t), cudaMemcpyDeviceToHost)); + + size_t maxPreservedBufferSize = sizeof(float)*nbEle; + unsigned char* outBytes = (unsigned char*)malloc(maxPreservedBufferSize); + memset(outBytes, 0, maxPreservedBufferSize); + + outSize = (size_t *)malloc(sizeof(size_t)); + //outSize[0] = _post_proc(oriData, meta, offsets, midBytes, outBytes, nbEle, blockSize, *num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); + + *outSize = _post_proc(oriData, meta, offsets, midBytes, outBytes, nbEle, blockSize, *num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); +// printf("Beginning free\n"); + printf("outsize %p \n", outBytes); + free(blk_idx); + free(blk_subidx); + free(blk_vals); + free(meta); + free(offsets); + free(midBytes); + checkCudaErrors(cudaFree(d_meta)); + checkCudaErrors(cudaFree(d_offsets)); + checkCudaErrors(cudaFree(d_midBytes)); + return outBytes; +} + +void cuSZx_fast_decompress_args_unpredictable_blocked_float(float** newData, size_t nbEle, unsigned char* cmpBytes) +{ + uint32_t *blk_idx, *d_blk_idx; + uint8_t *blk_subidx, *d_blk_subidx; + uint8_t *blk_sig, *d_blk_sig; + float *blk_vals, *d_blk_vals; + size_t num_sig, *d_num_sig; + + *newData = (float*)malloc(sizeof(float)*nbEle); + memset(*newData, 0, 
sizeof(float)*nbEle); + + unsigned char* r = cmpBytes; + r += 4; + int blockSize = r[0]; //get block size + if(blockSize == 0)blockSize = 256; + r++; + size_t nbConstantBlocks = bytesToLong_bigEndian(r); //get number of constant blocks + r += sizeof(size_t); + num_sig = bytesToSize(r); + r += sizeof(size_t); + size_t nbBlocks = nbEle/blockSize; + size_t ncBlocks = 0; + size_t num_state2_blks = 0; + // size_t ncBlocks = nbBlocks - nbConstantBlocks; //get number of constant blocks + size_t stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; + size_t ncLeading = blockSize/4; + size_t mSize = sizeof(float)+1+ncLeading; //Number of bytes for each data block's metadata. + unsigned char* stateArray = (unsigned char*)malloc(nbBlocks); + unsigned char* d_stateArray; + cudaMalloc(&d_stateArray, nbBlocks); + float* constantMedianArray = (float*)malloc(nbConstantBlocks*sizeof(float)); + + + + blk_idx = (uint32_t *)malloc(nbBlocks*sizeof(uint32_t)); + blk_vals= (float *)malloc((num_sig)*sizeof(float)); + blk_subidx = (uint8_t *)malloc((num_sig)*sizeof(uint8_t)); + blk_sig = (uint8_t *)malloc(nbBlocks*sizeof(uint8_t)); + + printf("Converting state array\n"); + convert_out_to_state(nbBlocks, r, stateArray); + // convertByteArray2IntArray_fast_1b_args(nbBlocks, r, stateNBBytes, stateArray); //get the stateArray + for (size_t i = 0; i < nbBlocks; i++) + { + if (stateArray[i] == 2) + { + num_state2_blks++; + }else if(stateArray[i] == 3){ + ncBlocks++; + } + } + + r += stateNBBytes; + unsigned char* data = (unsigned char*)malloc(ncBlocks*blockSize*sizeof(float)); + memset(data, 0, ncBlocks*blockSize*sizeof(float)); + printf("converting block vals\n"); + size_t to_add = convert_out_to_block2(r, nbBlocks, (uint64_t)num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); + r+= to_add; + // checkCudaErrors(cudaMalloc((void **)&d_num_sig, sizeof(uint64_t))); + // num_sig = (uint64_t *)malloc(sizeof(uint64_t)); + checkCudaErrors(cudaMalloc((void **)&d_blk_idx, nbBlocks*sizeof(uint32_t))); + // blk_idx = malloc() + checkCudaErrors(cudaMalloc((void **)&d_blk_subidx, num_sig*sizeof(uint8_t))); + + checkCudaErrors(cudaMalloc((void **)&d_blk_vals, num_sig*sizeof(float))); + + checkCudaErrors(cudaMalloc((void **)&d_blk_sig, nbBlocks*sizeof(uint8_t))); + + checkCudaErrors(cudaMemcpy(d_blk_idx, blk_idx, nbBlocks*sizeof(uint32_t), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_blk_vals, blk_vals, (num_sig)*sizeof(float), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_blk_subidx, blk_subidx, (num_sig)*sizeof(uint8_t), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_stateArray, stateArray, nbBlocks, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_blk_sig, blk_sig, nbBlocks*sizeof(uint8_t), cudaMemcpyHostToDevice)); + + + size_t i = 0, j = 0, k = 0; //k is used to keep track of constant block index + memcpy((*newData)+nbBlocks*blockSize, r, (nbEle%blockSize)*sizeof(float)); + r += (nbEle%blockSize)*sizeof(float); + float* fr = (float*)r; //fr is the starting address of constant median values. 
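    /* [Editor's note -- an illustrative sketch inferred from the parsing code
     * below, not an authoritative format description.] After the trailing
     * remainder elements, the stream holds nbConstantBlocks median floats, then
     * ncBlocks 2-byte lengths, then the packed non-constant block payloads. The
     * walk performed below is equivalent to (names lenTable/payload invented
     * here for illustration):
     *
     *   unsigned char *lenTable = r + nbConstantBlocks * sizeof(float);
     *   unsigned char *payload  = lenTable + ncBlocks * sizeof(short);
     *   for (size_t b = 0; b < ncBlocks; b++) {
     *       short stored;
     *       memcpy(&stored, lenTable + b * sizeof(short), sizeof(short)); // cf. bytesToShort()
     *       size_t len = (size_t)stored + mSize;   // metadata + encoded bytes of block b
     *       // copy `len` bytes from `payload` into block b's staging area, then
     *       payload += len;
     *   }
     */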
+ for(i = 0;i < nbConstantBlocks;i++, j+=4) //get the median values for constant-value blocks + constantMedianArray[i] = fr[i]; + r += nbConstantBlocks*sizeof(float); + unsigned char* p = r + ncBlocks * sizeof(short); + for(i = 0;i < ncBlocks;i++){ + int leng = (int)bytesToShort(r)+mSize; + r += sizeof(short); + if (leng > blockSize*sizeof(float)) + { + printf("Warning: compressed block is larger than the original block!\n"); + exit(0); + } + memcpy(data+i*blockSize*sizeof(float), p, leng); + p += leng; + } + + unsigned char* d_data; + float *d_newdata; + checkCudaErrors(cudaMalloc((void**)&d_data, ncBlocks*blockSize*sizeof(float))); + checkCudaErrors(cudaMemcpy(d_data, data, ncBlocks*blockSize*sizeof(float), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMalloc(&d_newdata, nbBlocks*blockSize*sizeof(float))); + + timer_GPU.StartCounter(); + dim3 dimBlock(32, blockSize/32); + dim3 dimGrid(65536, 1); + const int sMemsize = blockSize * sizeof(float) + dimBlock.y * sizeof(int); + decompress_state2<<>>(d_newdata, d_stateArray,d_blk_idx, d_blk_vals, d_blk_subidx,blockSize, d_blk_sig); + decompress_float<<>>(d_data, blockSize, ncBlocks, mSize); + cudaError_t err = cudaGetLastError(); // Get error code + printf("CUDA Error: %s\n", cudaGetErrorString(err)); + printf("GPU decompression timing: %f ms\n", timer_GPU.GetCounter()); + cudaDeviceSynchronize(); + checkCudaErrors(cudaMemcpy(data, d_data, ncBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(*newData, d_newdata, nbBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); + float* fdata = (float*)data; + + int nb=0, nc=0; + for (i=0;i1) printf("data%i:%f\n",i, Median); + for (j=0;j>56); + b[1] = (unsigned char)(num>>48); + b[2] = (unsigned char)(num>>40); + b[3] = (unsigned char)(num>>32); + b[4] = (unsigned char)(num>>24); + b[5] = (unsigned char)(num>>16); + b[6] = (unsigned char)(num>>8); + b[7] = (unsigned char)(num); +// if(dataEndianType==LITTLE_ENDIAN_DATA) +// symTransform_8bytes(*b); +} + +inline void longToBytes_bigEndian_memset(unsigned char *b, unsigned long num) +{ + checkCudaErrors(cudaMemset(&b[0], (unsigned char)(num>>56), sizeof(char))); + checkCudaErrors(cudaMemset(&b[1], (unsigned char)(num>>48), sizeof(char))); + checkCudaErrors(cudaMemset(&b[2], (unsigned char)(num>>40), sizeof(char))); + checkCudaErrors(cudaMemset(&b[3], (unsigned char)(num>>32), sizeof(char))); + checkCudaErrors(cudaMemset(&b[4], (unsigned char)(num>>24), sizeof(char))); + checkCudaErrors(cudaMemset(&b[5], (unsigned char)(num>>16), sizeof(char))); + checkCudaErrors(cudaMemset(&b[6], (unsigned char)(num>>8), sizeof(char))); + checkCudaErrors(cudaMemset(&b[7], (unsigned char)(num), sizeof(char))); +// if(dataEndianType==LITTLE_ENDIAN_DATA) +// symTransform_8bytes(*b); +} + +__device__ inline void shortToBytes_d(unsigned char* b, short value) +{ + lint16 buf; + buf.svalue = value; + memcpy(b, buf.byte, 2); +} + + + +__global__ void getNumNonConstantBlocks(size_t nbBlocks, short *offsets, unsigned char *meta, int blockSize, int *nonconstant, int *out_size){ + for (int tid = blockDim.x*blockIdx.x + threadIdx.x; tid < nbBlocks; tid += blockDim.x*gridDim.x){ + if (meta[tid] == 3){ + atomicAdd(nonconstant, 1); + atomicAdd(out_size,1+(blockSize/4)+offsets[tid]); + } + } +} + +__global__ void generateFlags(unsigned char *states, uint64_t *cBlk_flags, uint64_t *ncBlk_flags,uint64_t* offset_indices,short* offsets, size_t nbBlocks){ + for (int tid = blockDim.x*blockIdx.x + threadIdx.x; tid < nbBlocks; tid += blockDim.x*gridDim.x){ 
+ if (states[tid] == 0 || states[tid] == 1) + { + cBlk_flags[tid] = 1; + ncBlk_flags[tid] = 0; + offset_indices[tid] = 0; + }else if(states[tid]==3){ + ncBlk_flags[tid] = 1; + cBlk_flags[tid] = 0; + offset_indices[tid] = (uint64_t) offsets[tid]; + }else{ + cBlk_flags[tid] = 0; + ncBlk_flags[tid] = 0; + offset_indices[tid] = 0; + } + + } +} + +__global__ void nccopy_kernel2(unsigned char * c, unsigned char* o, unsigned char *nc, unsigned char* midBytes, unsigned char* meta, + size_t nbBlocks, int blockSize, short *offsets, size_t mSize, uint64_t *cBlk_indices, uint64_t *ncBlk_indices, uint64_t* offset_indices){ + // printf("blockdim %d blockidx %d threadidx %d griddim %d\n", blockDim.x, blockIdx.x, threadIdx.x, gridDim.x); + int i; + int num_threads = (blockDim.x*gridDim.x); + int tid = blockDim.x*blockIdx.x + threadIdx.x; + int blocks_per_thread = nbBlocks/num_threads; + int start_idx = tid*blocks_per_thread; + int end_idx = start_idx+blocks_per_thread; + + if (tid == num_threads-1) + { + end_idx = nbBlocks; + } + + unsigned char* tmp_o = o+(sizeof(short)*ncBlk_indices[start_idx]); + unsigned char* tmp_nc= nc+(mSize*ncBlk_indices[i] + offset_indices[i]*ncBlk_indices[i]); + for (i=start_idx; i>>(meta, cBlk_indices, ncBlk_indices, offset_indices, offsets, nbBlocks); + cudaDeviceSynchronize(); + + thrust::exclusive_scan(thrust::device, cBlk_indices, cBlk_indices + nbBlocks, cBlk_indices, 0); + thrust::exclusive_scan(thrust::device, ncBlk_indices, ncBlk_indices + nbBlocks, ncBlk_indices, 0); + thrust::exclusive_scan(thrust::device, offset_indices, offset_indices + nbBlocks, offset_indices, 0); + + nccopy_kernel<<<40,256>>>(c, o, nc, midBytes, meta, nbBlocks, blockSize, offsets, mSize, cBlk_indices,ncBlk_indices,offset_indices); + // nccopy_kernel2<<<1,1>>>(c, o, nc, midBytes, meta, nbBlocks, blockSize, offsets, mSize, cBlk_indices,ncBlk_indices,offset_indices); + + cudaDeviceSynchronize(); + // printf("%s\n", cudaGetErrorString(cudaGetLastError())); + // set_nc<<<1,1>>>(nc, offsets, offset_indices, ncBlk_indices, mSize, nbBlocks); + // cudaDeviceSynchronize(); + // printf("ncblockcpy: %f ms\n", timer2.GetCounter()); + checkCudaErrors(cudaFree(cBlk_indices)); + checkCudaErrors(cudaFree(ncBlk_indices)); + checkCudaErrors(cudaFree(offset_indices)); +} + +void ncblkCopy_h(unsigned char * c, unsigned char* o, unsigned char *nc, unsigned char* midBytes, unsigned char* meta, + size_t nbBlocks, int blockSize, short *offsets, size_t mSize){ + unsigned char *tmp_states; + unsigned char *ncold = nc; + uint64_t col_off = 0; + short *tmp_offsets; + tmp_offsets = (short*)malloc(sizeof(short)*nbBlocks); + tmp_states = (unsigned char *)malloc(sizeof(char)*nbBlocks); + checkCudaErrors(cudaMemcpy(tmp_states, meta, sizeof(char)*nbBlocks, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(tmp_offsets,offsets,sizeof(short)*nbBlocks,cudaMemcpyDeviceToHost)); + cudaStream_t stream[3]; + cudaStreamCreate(&stream[0]); + cudaStreamCreate(&stream[1]); + cudaStreamCreate(&stream[2]); + + //printf("here\n"); + //checkCudaErrors(cudaMemcpy((void**)&d_offsets, nbBlocks*sizeof(short))); + for (int i = 0; i < nbBlocks; i++) + { + if(tmp_states[i]==3){ + // shortToBytes_d(o, offsets[i]); + // buf = (unsigned char*) + +// printf("here2\n"); + cudaMemcpyAsync(o, offsets+i, 2, cudaMemcpyDeviceToDevice, stream[0]); + o += sizeof(short); + + // printf("here2.1\n"); + // printf("offsets %ld\n", col_off); + cudaMemcpyAsync(nc, meta+(nbBlocks+i*mSize), mSize, cudaMemcpyDeviceToDevice, stream[1]); + // memcpy(nc, 
meta+(nbBlocks+i*mSize), mSize); + + nc += mSize; + + // printf("here2.2\n"); + //checkCudaErrors(cudaMemcpy(buf, offsets+i, sizeof(short), cudaMemcpyDeviceToHost)); + + // //printf("here2.3 %d\n", buf); + cudaMemcpyAsync(nc, midBytes+(i*blockSize*sizeof(float)), (int)tmp_offsets[i], cudaMemcpyDeviceToDevice, stream[2]); + // memcpy(nc, midBytes+(i*blockSize*sizeof(float)), offsets[i]); + nc += tmp_offsets[i]; + col_off+=tmp_offsets[i]; + +/// printf("here2.4\n"); + } + } + cudaStreamDestroy(stream[0]); + cudaStreamDestroy(stream[1]); + cudaStreamDestroy(stream[2]); + + free(tmp_states); + free(tmp_offsets); +} + +__global__ void ncblkCopy(unsigned char * c, unsigned char* o, unsigned char *nc, unsigned char* midBytes, unsigned char* meta, + size_t nbBlocks, int blockSize, short *offsets, size_t mSize) +{ + for (int i=blockDim.x*blockIdx.x + threadIdx.x; i>>(nbBlocks, offsets, meta, blockSize, nonconstant_d, out_size_d); + cudaDeviceSynchronize(); + + checkCudaErrors(cudaMemcpy(&nonconstant_h, nonconstant_d, sizeof(int), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(&tmp_outsize, out_size_d, sizeof(int), cudaMemcpyDeviceToHost)); + + nbConstantBlocks = nbBlocks - nonconstant_h; + out_size_h+=tmp_outsize; + + out_size_h += (nbBlocks-nbConstantBlocks)*sizeof(short)+(nbEle%blockSize)*sizeof(float); + + //outBytes = (unsigned char*)malloc(out_size); + unsigned char* r = outBytes; + unsigned char* r_old = outBytes; + checkCudaErrors(cudaMemset(r, SZx_VER_MAJOR, sizeof(char))); + checkCudaErrors(cudaMemset(r+1, SZx_VER_MINOR, sizeof(char))); + checkCudaErrors(cudaMemset(r+2, 1, sizeof(char))); + checkCudaErrors(cudaMemset(r+3, 0, sizeof(char))); + checkCudaErrors(cudaMemset(r+4, blockSize, sizeof(char))); + + r=r+5; //1 byte + //sizeToBytes(r, nbConstantBlocks); + longToBytes_bigEndian_memset(r, nbConstantBlocks); + r += sizeof(size_t); + //sizeToBytes(r, (size_t) num_sig); + longToBytes_bigEndian_memset(r, (unsigned long)num_sig); + r += sizeof(size_t); + size_t out_length; + + if(nbBlocks%4==0) + out_length = nbBlocks/4; + else + out_length = nbBlocks/4+1; + + convert_state_to_out_kernel<<<40,256>>>(meta, nbBlocks, r, out_length); + r+=out_length; + convert_block2_to_out_kernel<<<40,256>>>(r, nbBlocks,num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); + r += nbBlocks*4 + num_sig*sizeof(float) + num_sig*sizeof(uint8_t) + nbBlocks*sizeof(uint8_t); + + checkCudaErrors(cudaMemcpy(r, oriData+nbBlocks*blockSize, (nbEle%blockSize)*sizeof(float), cudaMemcpyDeviceToDevice)); + // memcpy(r, oriData+nbBlocks*blockSize, (nbEle%blockSize)*sizeof(float)); + r += (nbEle%blockSize)*sizeof(float); + unsigned char* c = r; + unsigned char* o = c+nbConstantBlocks*sizeof(float); + unsigned char* nc = o+(nbBlocks-nbConstantBlocks)*sizeof(short); + // ncblkCopy<<<1,1>>>(c, o, nc, midBytes, meta,nbBlocks, blockSize, offsets, mSize); + + ncblkCopy_h(c, o, nc, midBytes, meta,nbBlocks, blockSize, offsets, mSize); + ncblkCopy_fast(c, o, nc, midBytes, meta,nbBlocks, blockSize, offsets, mSize); + // cudaDeviceSynchronize(); + return (size_t) (nc-r_old); + // checkCudaErrors(cudaMemcpy(outSize, (size_t)(nc-r_old), sizeof(size_t))); + // *outSize = (size_t) (nc-r_old); + // return outBytes; +} + +__global__ void device_post_proc(size_t *outSize, float *oriData, unsigned char *meta, + short *offsets, unsigned char *midBytes, unsigned char *outBytes, + size_t nbEle, int blockSize, uint64_t num_sig, uint32_t *blk_idx, + float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig) +{ + int out_size = 0; + + size_t 
nbConstantBlocks = 0; + size_t nbBlocks = nbEle/blockSize; + size_t ncBytes = blockSize/4; + size_t mSize = sizeof(float)+1+ncBytes; //Number of bytes for each data block's metadata. + out_size += 5+sizeof(size_t)+sizeof(float)*nbBlocks; + if (nbBlocks%8==0) + out_size += nbBlocks/8; + else + out_size += nbBlocks/8+1; + int s0 = 0; + int s1 = 0; + int s2 = 0; + int s3 = 0; + for (int i=0; i>>(d_oriData, threshold, nbEle); + // cudaDeviceSynchronize(); + dim3 dimBlock(32, blockSize/32); + dim3 dimGrid(65536, 1); + const int sMemsize = blockSize * sizeof(float) + dimBlock.y * sizeof(int); + //printf("Malloc end timestamp: %f ms\n", timer_GPU.GetCounter()); + compress_float<<>>(d_oriData, d_meta, d_offsets, d_midBytes, absErrBound, blockSize, nbBlocks, mSize, sparsity_level, d_blk_idx, d_blk_subidx,d_blk_vals, threshold, d_blk_sig); + cudaError_t err = cudaGetLastError(); // Get error code + // printf("CUDA Error: %s\n", cudaGetErrorString(err)); + //printf("GPU compression timestamp: %f ms\n", timer_GPU.GetCounter()); + cudaDeviceSynchronize(); + get_numsig<<<1,1>>>(d_num_sig); + cudaDeviceSynchronize(); + + checkCudaErrors(cudaMemcpy(num_sig, d_num_sig, sizeof(uint64_t), cudaMemcpyDeviceToHost)); + + // These are allocations and memcpys to host pointers, do not need them + + // blk_idx = (uint32_t *)malloc(nbBlocks*sizeof(uint32_t)); + // blk_vals= (float *)malloc((*num_sig)*sizeof(float)); + // blk_subidx = (uint8_t *)malloc((*num_sig)*sizeof(uint8_t)); + // blk_sig = (uint8_t *)malloc(nbBlocks*sizeof(uint8_t)); + + // checkCudaErrors(cudaMemcpy(meta, d_meta, msz, cudaMemcpyDeviceToHost)); + // checkCudaErrors(cudaMemcpy(offsets, d_offsets, nbBlocks*sizeof(short), cudaMemcpyDeviceToHost)); + // checkCudaErrors(cudaMemcpy(midBytes, d_midBytes, mbsz, cudaMemcpyDeviceToHost)); + + + // checkCudaErrors(cudaMemcpy(blk_idx, d_blk_idx, nbBlocks*sizeof(uint32_t), cudaMemcpyDeviceToHost)); + // checkCudaErrors(cudaMemcpy(blk_vals,d_blk_vals, (*num_sig)*sizeof(float), cudaMemcpyDeviceToHost)); + // checkCudaErrors(cudaMemcpy(blk_subidx,d_blk_subidx, (*num_sig)*sizeof(uint8_t), cudaMemcpyDeviceToHost)); + // checkCudaErrors(cudaMemcpy(blk_sig,d_blk_sig, (nbBlocks)*sizeof(uint8_t), cudaMemcpyDeviceToHost)); + + + size_t maxPreservedBufferSize = sizeof(float)*nbEle; + unsigned char *d_outBytes; + // unsigned char* outBytes = (unsigned char*)malloc(maxPreservedBufferSize); + // memset(outBytes, 0, maxPreservedBufferSize); + checkCudaErrors(cudaMalloc(&d_outBytes, maxPreservedBufferSize)); + + size_t *d_outSize; + + checkCudaErrors(cudaMalloc(&d_outSize, sizeof(size_t))); + + // device_post_proc<<<1,1>>>(d_outSize, d_oriData, d_meta, d_offsets, d_midBytes, d_outBytes, nbEle, blockSize, *num_sig, d_blk_idx, d_blk_vals, d_blk_subidx, d_blk_sig); + *outSize = better_post_proc(d_outSize, d_oriData, d_meta, d_offsets, d_midBytes, d_outBytes, nbEle, blockSize, *num_sig, d_blk_idx, d_blk_vals, d_blk_subidx, d_blk_sig); + //cudaDeviceSynchronize(); + + //checkCudaErrors(cudaMemcpy(outSize, d_outSize, sizeof(size_t), cudaMemcpyDeviceToHost)); + + // printf("completed compression\n"); + //free(blk_idx); + //free(blk_subidx); + //free(blk_vals); + // free(meta); + // free(offsets); + // free(midBytes); + checkCudaErrors(cudaFree(d_num_sig)); + checkCudaErrors(cudaFree(d_blk_idx)); + checkCudaErrors(cudaFree(d_blk_subidx)); + checkCudaErrors(cudaFree(d_blk_vals)); + checkCudaErrors(cudaFree(d_blk_sig)); + + checkCudaErrors(cudaFree(d_meta)); + checkCudaErrors(cudaFree(d_offsets)); + 
checkCudaErrors(cudaFree(d_midBytes)); +// printf("completed compression\n"); + printf("Compression end timestamp: %f ms\n", timer_GPU.GetCounter()); + + printf("CUDA Error: %s\n", cudaGetErrorString(err)); + return d_outBytes; +} + +__device__ inline long bytesToLong_bigEndian(unsigned char* b) { + long temp = 0; + long res = 0; + + res <<= 8; + temp = b[0] & 0xff; + res |= temp; + + res <<= 8; + temp = b[1] & 0xff; + res |= temp; + + res <<= 8; + temp = b[2] & 0xff; + res |= temp; + + res <<= 8; + temp = b[3] & 0xff; + res |= temp; + + res <<= 8; + temp = b[4] & 0xff; + res |= temp; + + res <<= 8; + temp = b[5] & 0xff; + res |= temp; + + res <<= 8; + temp = b[6] & 0xff; + res |= temp; + + res <<= 8; + temp = b[7] & 0xff; + res |= temp; + + return res; +} + +__device__ inline size_t bytesToSize(unsigned char* bytes) +{ + size_t result = bytesToLong_bigEndian(bytes);//8 + return result; +} + +__device__ inline short bytesToShort(unsigned char* bytes) +{ + lint16 buf; + memcpy(buf.byte, bytes, 2); + + return buf.svalue; +} + +__global__ void decompress_get_stats(float *newData, size_t nbEle, unsigned char* cmpBytes, + size_t *numSigValues, int *bs, + size_t *numConstantBlks, size_t *numBlks, + size_t *mSizeptr, unsigned char *newCmpBytes +){ + unsigned char* r = cmpBytes; + size_t num_sig; + r += 4; + int blockSize = (int) r[0]; //get block size + + if(blockSize == 0)blockSize = 256; + r++; + size_t nbConstantBlocks = bytesToLong_bigEndian(r); //get number of constant blocks + r += sizeof(size_t); + num_sig = bytesToSize(r); + + r += sizeof(size_t); + size_t nbBlocks = nbEle/blockSize; + size_t ncBlocks = 0; + size_t num_state2_blks = 0; + // size_t ncBlocks = nbBlocks - nbConstantBlocks; //get number of constant blocks + size_t stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; + size_t ncLeading = blockSize/4; + size_t mSize = sizeof(float)+1+ncLeading; //Number of bytes for each data block's metadata. + + *mSizeptr = mSize; + + *numConstantBlks = nbConstantBlocks; + *numBlks = nbBlocks; + *numSigValues = num_sig; + *bs = blockSize; + newCmpBytes = r; + // printf("nb blocks: %d\n", nbBlocks); + +} + + void setup_data_stateArray_better(float *newData, size_t nbEle, unsigned char* r, + size_t num_sig, int blockSize, + size_t nbConstantBlocks, size_t nbBlocks, size_t *ncBlks, + unsigned char *stateArray, unsigned char *newR +){ + + //printf("ma\n"); + blockSize = 256; + r += 4; + r++; + r += sizeof(size_t); + r += sizeof(size_t); + int ncBlocks, *ncBlocks_d; + size_t stateNBBytes = nbBlocks%4==0 ? 
nbBlocks/4 : nbBlocks/4+1; + int num_state2_blks, *num_state2_d; + checkCudaErrors(cudaMalloc((void **)&num_state2_d, sizeof(int))); + checkCudaErrors(cudaMalloc((void **)&ncBlocks_d, sizeof(int))); + checkCudaErrors(cudaMemset(num_state2_d, 0, sizeof(int))); + checkCudaErrors(cudaMemset(ncBlocks_d, 0, sizeof(int))); + + //printf("ma2\n"); +// printf("Converting state array\n"); + // printf("cmp %d\n", (int)r[0]); + // printf("state %d\n", (int)stateArray[0]); + // convert_out_to_state(nbBlocks, r, stateArray); + convert_out_to_state_kernel<<<40,256>>>(nbBlocks,r,stateArray,stateNBBytes, + num_state2_d, ncBlocks_d); + // printf("state %d\n", (int)stateArray[0]); + // convertByteArray2IntArray_fast_1b_args(nbBlocks, r, stateNBBytes, stateArray); //get the stateArray + cudaDeviceSynchronize(); + + //printf("ma3\n"); + r += stateNBBytes; + newR = r; + cudaMemcpy(&ncBlocks, ncBlocks_d, sizeof(int), cudaMemcpyDeviceToHost); + + //printf("ma4\n"); + *ncBlks = ncBlocks; + + //printf("ma4\n"); + } + +__global__ void setup_data_stateArray(float *newData, size_t nbEle, unsigned char* r, + size_t num_sig, int blockSize, + size_t nbConstantBlocks, size_t nbBlocks, size_t *ncBlks, + unsigned char *stateArray, unsigned char *newR +){ + blockSize = 256; + r += 4; + r++; + r += sizeof(size_t); + r += sizeof(size_t); + size_t ncBlocks = 0; + size_t stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; + size_t num_state2_blks = 0; +// printf("Converting state array\n"); + // printf("cmp %d\n", (int)r[0]); + // printf("state %d\n", (int)stateArray[0]); + convert_out_to_state(nbBlocks, r, stateArray); + // convert_out_to_state_kernel<<<40,256>>>(nbBlocks,r,stateArray,stateNBBytes); + // printf("state %d\n", (int)stateArray[0]); + // convertByteArray2IntArray_fast_1b_args(nbBlocks, r, stateNBBytes, stateArray); //get the stateArray + for (size_t i = 0; i < nbBlocks; i++) + { + if (stateArray[i] == 2) + { + num_state2_blks++; + }else if(stateArray[i] == 3){ + ncBlocks++; + } + } + + r += stateNBBytes; + newR = r; + *ncBlks = ncBlocks; +} + +__global__ void decomp_startup_kernel(unsigned char* r, size_t nbConstantBlocks, +unsigned char *data, int blockSize, size_t mSize, size_t ncBlocks, float *constantMedianArray){ + unsigned char * fr = r; //fr is the starting address of constant median values. 
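    /* [Editor's note -- illustrative, not part of the original patch.]
     * The medians sit at an arbitrary byte offset inside the compressed stream,
     * so they may not be 4-byte aligned; dereferencing a casted float* there is
     * undefined behaviour and can fault on the GPU. Staging the four bytes in a
     * local buffer, as done just below with tmp_r, avoids that. An equivalent
     * device-side helper (a sketch, not in the original code):
     *
     *   __device__ inline float load_unaligned_float(const unsigned char *src) {
     *       float v;
     *       memcpy(&v, src, sizeof(float));   // byte-wise copy, no alignment requirement
     *       return v;
     *   }
     */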
+ int i = 0, j = 0, k = 0; + // printf("%p\n", r); + unsigned char tmp_r[4]; + tmp_r[0]=fr[0]; + tmp_r[1]=fr[1]; + tmp_r[2]=fr[2]; + tmp_r[3]=fr[3]; + + +// printf("nbconstant: %f\n", ((float*)tmp_r)[0]); +// nbConstantBlocks + for(i = blockDim.x*blockIdx.x + threadIdx.x; i < nbConstantBlocks; i += blockDim.x*gridDim.x, j+=4){ //get the median values for constant-value blocks + + tmp_r[0]=fr[j]; + tmp_r[1]=fr[j+1]; + tmp_r[2]=fr[j+2]; + tmp_r[3]=fr[j+3]; + float tmp = ((float*)tmp_r)[0]; + constantMedianArray[i] = tmp; + // printf("%d %f\n", i, tmp); + } + + fr += nbConstantBlocks*sizeof(float); + unsigned char* p = fr + ncBlocks * sizeof(short); + for(i = blockDim.x*blockIdx.x + threadIdx.x;i < ncBlocks;i += blockDim.x*gridDim.x){ + int leng = (int)bytesToShort(fr)+mSize; + fr += sizeof(short); + if (leng > blockSize*sizeof(float)) + { + printf("Warning: compressed block is larger than the original block!\n"); + return; + // exit(0); + } + memcpy(data+i*blockSize*sizeof(float), p, leng); + + p += leng; + } +} + +void decompress_startup_better(float *newData, size_t nbEle, unsigned char* r, + uint32_t *blk_idx, uint8_t *blk_subidx, uint8_t *blk_sig, + float *blk_vals, size_t num_sig, int blockSize, + size_t nbConstantBlocks, size_t nbBlocks, size_t ncBlocks, + unsigned char *stateArray, float* constantMedianArray, unsigned char *data, + size_t mSize, unsigned char *newCmpBytes +){ + blockSize = 256; + size_t nb_tmp = (int) nbEle/256; + /** + * Structures to return: + * blk_idx, blk_subidx, blk_sig, blk_vals, numSigValues (pointer) + * bs (pointer to blockSize), numConstantBlks (pointer), numBlks (pointer) + * ncBlks (pointer), stateArray, constantMedianArray + */ + + + size_t stateNBBytes = nb_tmp%4==0 ? nb_tmp/4 : nb_tmp/4+1; + + r += 4; + r++; + r += sizeof(size_t); + r += sizeof(size_t); + + r += stateNBBytes; + + convert_out_to_block2_kernel<<<40,256>>>(r, nbBlocks, (uint64_t)num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); + size_t to_add = nbBlocks*4 + num_sig*sizeof(float) + num_sig*sizeof(uint8_t) + nbBlocks*sizeof(uint8_t); + r+= to_add; + + size_t i = 0, j = 0, k = 0; //k is used to keep track of constant block index + + // printf("before mallocs in kernel\n"); + checkCudaErrors(cudaMemcpy((newData)+nbBlocks*blockSize, r, (nbEle%blockSize)*sizeof(float), cudaMemcpyDeviceToDevice)); + // memcpy((newData)+nbBlocks*blockSize, r, (nbEle%blockSize)*sizeof(float)); + + //printf("before mallocs in kernel %p\n", r); + r += (nbEle%blockSize)*sizeof(float); + //printf("r: %p\n", r); + //printf("%d, %d, %d\n",nbEle, 256, nbEle%256); + decomp_startup_kernel<<<40,256>>>(r, nbConstantBlocks,data, blockSize, mSize, ncBlocks, constantMedianArray); + cudaDeviceSynchronize(); + r += nbConstantBlocks*sizeof(float); + + newCmpBytes = r; + +} + +__global__ void decompress_startup(float *newData, size_t nbEle, unsigned char* r, + uint32_t *blk_idx, uint8_t *blk_subidx, uint8_t *blk_sig, + float *blk_vals, size_t num_sig, int blockSize, + size_t nbConstantBlocks, size_t nbBlocks, size_t ncBlocks, + unsigned char *stateArray, float* constantMedianArray, unsigned char *data, + size_t mSize, unsigned char *newCmpBytes +){ + blockSize = 256; + size_t nb_tmp = (int) nbEle/256; + /** + * Structures to return: + * blk_idx, blk_subidx, blk_sig, blk_vals, numSigValues (pointer) + * bs (pointer to blockSize), numConstantBlks (pointer), numBlks (pointer) + * ncBlks (pointer), stateArray, constantMedianArray + */ + + // size_t ncBlocks = 0; + // size_t stateNBBytes = nbBlocks%4==0 ? 
nbBlocks/4 : nbBlocks/4+1; + // size_t num_state2_blks = 0; + // printf("Converting state array\n"); + // convert_out_to_state(nbBlocks, r, stateArray); + // printf("state %d\n", (int)stateArray[0]); + // // convertByteArray2IntArray_fast_1b_args(nbBlocks, r, stateNBBytes, stateArray); //get the stateArray + // for (size_t i = 0; i < nbBlocks; i++) + // { + // if (stateArray[i] == 2) + // { + // num_state2_blks++; + // }else if(stateArray[i] == 3){ + // ncBlocks++; + // } + // } + // size_t stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; + + size_t stateNBBytes = nb_tmp%4==0 ? nb_tmp/4 : nb_tmp/4+1; + //printf("%p\n", r); + r += 4; + r++; + r += sizeof(size_t); + r += sizeof(size_t); + //printf("statenb %d %d\n", stateNBBytes, nb_tmp); + r += stateNBBytes; + // data = (unsigned char*)malloc(ncBlocks*blockSize*sizeof(float)); + // memset(data, 0, ncBlocks*blockSize*sizeof(float)); + // printf("converting block vals %d\n", data[0]); + size_t to_add = convert_out_to_block2(r, nbBlocks, (uint64_t)num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); + r+= to_add; + + size_t i = 0, j = 0, k = 0; //k is used to keep track of constant block index + + // printf("before mallocs in kernel\n"); + + memcpy((newData)+nbBlocks*blockSize, r, (nbEle%blockSize)*sizeof(float)); + + //printf("before mallocs in kernel %p\n", r); + r += (nbEle%blockSize)*sizeof(float); + //printf("r: %p\n", r); + //printf("%d, %d, %d\n",nbEle, 256, nbEle%256); + unsigned char * fr = r; //fr is the starting address of constant median values. + + // printf("%p\n", r); + unsigned char tmp_r[4]; + tmp_r[0]=r[0]; + tmp_r[1]=r[1]; + tmp_r[2]=r[2]; + tmp_r[3]=r[3]; + + +// printf("nbconstant: %f\n", ((float*)tmp_r)[0]); + for(i = 0;i < nbConstantBlocks;i++, j+=4){ //get the median values for constant-value blocks + + tmp_r[0]=r[j]; + tmp_r[1]=r[j+1]; + tmp_r[2]=r[j+2]; + tmp_r[3]=r[j+3]; + float tmp = ((float*)tmp_r)[0]; +// printf("median: %f\n", tmp); + constantMedianArray[i] = tmp; + + // printf("%d %f\n", i, tmp); + } + //printf("after constantmedian\n"); + r += nbConstantBlocks*sizeof(float); + unsigned char* p = r + ncBlocks * sizeof(short); + for(i = 0;i < ncBlocks;i++){ + int leng = (int)bytesToShort(r)+mSize; + r += sizeof(short); + if (leng > blockSize*sizeof(float)) + { + printf("Warning: compressed block is larger than the original block!\n"); + return; + // exit(0); + } +// printf("before memcpy\n"); + memcpy(data+i*blockSize*sizeof(float), p, leng); + // printf("after memcpy\n"); + p += leng; + } + + newCmpBytes = r; +// printf("before mallocs in kernel\n"); + + // printf("nb blocks: %d\n", nbBlocks); +} + +__global__ void cBlkCopy_decompress(int nb, float* constantMedianArray, float *newData, int blockSize, int i){ + int j; + float Median = constantMedianArray[nb]; + // j = threadIdx.x; j < blockSize; j += blockDim.x + for (j = threadIdx.x; j < blockSize; j += blockDim.x) + *((newData)+i*blockSize+j) = Median; +} + +__global__ void ncBlkCopy_decompress(int blockSize, float *newData, int nc, float *fdata, int i){ + int j; + for (j = threadIdx.x; j < blockSize; j += blockDim.x) + *((newData)+i*blockSize+j) = fdata[nc*blockSize+j]; +} + +void decompress_post_proc_better(unsigned char *data, float *newData, int blockSize, + size_t nbBlocks, size_t ncBlocks, unsigned char *stateArray, + float *constantMedianArray +){ + // checkCudaErrors(cudaMemcpy(data, d_data, ncBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); + // checkCudaErrors(cudaMemcpy(*newData, d_newdata, nbBlocks*blockSize*sizeof(float), 
cudaMemcpyDeviceToHost)); + float* fdata = (float*)data; + int i,j; + int nb=0, nc=0; + //printf("h1\n"); + for (i=0;i>>(nb, constantMedianArray, newData, blockSize, i); + nb++; + }else if(state==3){ + ncBlkCopy_decompress<<<1,256>>>(blockSize, newData, nc, fdata, i); + nc++; + } + } + cudaDeviceSynchronize(); + //for(int k = 0; k < nbBlocks*blockSize;k++){ +// printf("%f\n", newData[k]); + // } +} + +__global__ void print_newdata(float *newData, size_t nbBlocks, int blockSize){ + for (size_t i = 0; i < nbBlocks*blockSize; i++) + { + printf("%f\n", newData[i]); + } + +} + +__global__ void decompress_post_proc(unsigned char *data, float *newData, int blockSize, + size_t nbBlocks, size_t ncBlocks, unsigned char *stateArray, + float *constantMedianArray +){ + // checkCudaErrors(cudaMemcpy(data, d_data, ncBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); + // checkCudaErrors(cudaMemcpy(*newData, d_newdata, nbBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); + float* fdata = (float*)data; + int i,j; + int nb=0, nc=0; + // if (blockIdx.x == 0) + // { + // for (i=0;i1) printf("data%i:%f\n",i, Median); + // for (j = threadIdx.x; j < blockSize; j += blockDim.x) + // *((newData)+i*blockSize+j) = Median; + // nb++; + // } + // } + // }else{ + // for (i=0;i1) printf("data%i:%f\n",i, Median); + for (j = threadIdx.x; j < blockSize; j += blockDim.x) + *((newData)+i*blockSize+j) = Median; + nb++; + }else if(stateArray[i]==3){ + for (j = threadIdx.x; j < blockSize; j += blockDim.x) + *((newData)+i*blockSize+j) = fdata[nc*blockSize+j]; + nc++; + } + } + + //for(int k = 0; k < nbBlocks*blockSize;k++){ +// printf("%f\n", newData[k]); + // } +} + +float* device_ptr_cuSZx_decompress_float(size_t nbEle, unsigned char* cmpBytes) +{ + /** + * Assume the following are device pointers + * + * unsigned char* cmpBytes + * float** newData + * + */ + + uint32_t *blk_idx; + uint8_t *blk_subidx; + uint8_t *blk_sig; + float *blk_vals, *constantMedianArray; + size_t *num_sig, *mSize, mSize_h, num_sig_h; + int *blockSize, bs; + size_t *nbConstantBlocks, *nbBlocks, *ncBlocks, nbBlocks_h, ncBlocks_h, nbConstantBlocks_h; + unsigned char *stateArray, *data; + float *newData; + + unsigned char *oldCmpBytes = cmpBytes; + //*newData = (float*)malloc(sizeof(float)*nbEle); +// printf("cmpbytes check %d\n", (int)cmpBytes[0]); +// printf("new check %f\n", *newData[0]); + // printf("malloc\n"); + checkCudaErrors(cudaMalloc((void**)&num_sig, sizeof(size_t))); + checkCudaErrors(cudaMalloc((void**)&blockSize, sizeof(int))); + checkCudaErrors(cudaMalloc((void**)&nbConstantBlocks, sizeof(size_t))); + checkCudaErrors(cudaMalloc((void**)&nbBlocks, sizeof(size_t))); + checkCudaErrors(cudaMalloc((void**)&ncBlocks, sizeof(size_t))); + checkCudaErrors(cudaMalloc((void**)&mSize, sizeof(size_t))); + checkCudaErrors(cudaMalloc((void**)&newData, sizeof(float)*nbEle)); + + decompress_get_stats<<<1,1>>>(newData, nbEle, cmpBytes, + num_sig, blockSize, + nbConstantBlocks, nbBlocks, + mSize, cmpBytes + ); + cudaDeviceSynchronize(); + + cudaError_t err = cudaGetLastError(); // Get error code + //printf("CUDA Error: %s\n", cudaGetErrorString(err)); + checkCudaErrors(cudaMemcpy(&nbBlocks_h, nbBlocks, sizeof(size_t), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(&nbConstantBlocks_h, nbConstantBlocks, sizeof(size_t), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(&bs, blockSize, sizeof(int), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(&mSize_h, mSize, sizeof(size_t), cudaMemcpyDeviceToHost)); + 
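    /* [Editor's note -- illustrative, not part of the original patch.]
     * decompress_get_stats<<<1,1>>> derives these sizes on the device, so each
     * scalar has to be copied back before the host can size the cudaMalloc
     * calls that follow. The generic pattern, with hypothetical names:
     *
     *   size_t *d_n; size_t h_n;
     *   cudaMalloc(&d_n, sizeof(size_t));
     *   some_kernel<<<1,1>>>(d_n);                                   // kernel writes *d_n
     *   cudaMemcpy(&h_n, d_n, sizeof(size_t), cudaMemcpyDeviceToHost);
     *   // h_n can now drive host-side allocations.
     */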
checkCudaErrors(cudaMemcpy(&num_sig_h, num_sig, sizeof(size_t), cudaMemcpyDeviceToHost)); + + + checkCudaErrors(cudaMalloc((void**)&stateArray, nbBlocks_h)); + checkCudaErrors(cudaMalloc((void**)&constantMedianArray, nbConstantBlocks_h*sizeof(float))); + + checkCudaErrors(cudaMalloc((void**)&blk_idx, nbBlocks_h*sizeof(uint32_t))); + checkCudaErrors(cudaMalloc((void**)&blk_vals, num_sig_h*sizeof(float))); + checkCudaErrors(cudaMalloc((void**)&blk_subidx, num_sig_h*sizeof(uint8_t))); + checkCudaErrors(cudaMalloc((void**)&blk_sig, nbBlocks_h*sizeof(uint8_t))); + + setup_data_stateArray<<<1,1>>>(newData, nbEle, cmpBytes, + num_sig_h, bs, + nbConstantBlocks_h, nbBlocks_h, ncBlocks, + stateArray, cmpBytes + ); + cudaDeviceSynchronize(); + + // printf("%s\n", cudaGetErrorString(cudaGetLastError())); + checkCudaErrors(cudaMemcpy(&ncBlocks_h, ncBlocks, sizeof(size_t), cudaMemcpyDeviceToHost)); + + checkCudaErrors(cudaMalloc((void**)&data, ncBlocks_h*bs*sizeof(float))); + // cmpBytes = newCmpBytes; + // data = (unsigned char*)malloc(ncBlocks*blockSize*sizeof(float)); + // memset(data, 0, ncBlocks*blockSize*sizeof(float)); + // stateArray = (unsigned char*)malloc(nbBlocks); + + // // unsigned char* d_stateArray; + // // cudaMalloc(&d_stateArray, nbBlocks); + // constantMedianArray = (float*)malloc(nbConstantBlocks*sizeof(float)); + + // blk_idx = (uint32_t *)malloc(nbBlocks*sizeof(uint32_t)); + // blk_vals= (float *)malloc((num_sig)*sizeof(float)); + // blk_subidx = (uint8_t *)malloc((num_sig)*sizeof(uint8_t)); + // blk_sig = (uint8_t *)malloc(nbBlocks*sizeof(uint8_t)); + + //printf("%s\n", cudaGetErrorString(cudaGetLastError())); + //test_nbBlks = (size_t *)malloc(sizeof(size_t)); + // printf("malloc\n"); + decompress_startup<<<1,1>>>(newData, nbEle, cmpBytes, + blk_idx, blk_subidx, blk_sig, + blk_vals, num_sig_h, bs, + nbConstantBlocks_h, nbBlocks_h, ncBlocks_h, + stateArray, constantMedianArray, data, mSize_h, cmpBytes); + cudaDeviceSynchronize(); + // cmpBytes = newCmpBytes; + + //printf("%s\n", cudaGetErrorString(cudaGetLastError())); + + // unsigned char* d_data; + float *d_newdata; + // checkCudaErrors(cudaMalloc((void**)&d_data, ncBlocks*blockSize*sizeof(float))); + // checkCudaErrors(cudaMemcpy(d_data, data, ncBlocks*blockSize*sizeof(float), cudaMemcpyHostToDevice)); + //printf("nblocks: %d bs: %d\n", nbBlocks_h, bs); + checkCudaErrors(cudaMalloc(&d_newdata, nbBlocks_h*bs*sizeof(float))); + + timer_GPU.StartCounter(); + dim3 dimBlock(32, bs/32); + dim3 dimGrid(65536, 1); + const int sMemsize = bs * sizeof(float) + dimBlock.y * sizeof(int); + decompress_state2<<>>(d_newdata, stateArray,blk_idx, blk_vals, blk_subidx, bs, blk_sig); + decompress_float<<>>(data, bs, ncBlocks_h, mSize_h); + //err = cudaGetLastError(); // Get error code + //printf("CUDA Error: %s\n", cudaGetErrorString(err)); + //printf("GPU decompression timing: %f ms\n", timer_GPU.GetCounter()); + cudaDeviceSynchronize(); + + err = cudaGetLastError(); // Get error code + printf("CUDA Error: %s\n", cudaGetErrorString(err)); + printf("GPU decompression timing: %f ms\n", timer_GPU.GetCounter()); + // checkCudaErrors(cudaMemcpy(data, d_data, ncBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(newData, d_newdata, nbBlocks_h*bs*sizeof(float), cudaMemcpyDeviceToDevice)); + cudaFree(d_newdata); + + decompress_post_proc<<<1,1>>>(data, newData, bs, + nbBlocks_h, ncBlocks_h, stateArray, + constantMedianArray); + cudaDeviceSynchronize(); +// print_newdata<<<1,1>>>(newData, nbBlocks_h, bs); + 
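    /* [Editor's note -- illustrative, not part of the original patch.]
     * Only the scratch buffers are freed below; newData is returned still
     * resident on the device, so the decompressed tensor can feed further GPU
     * work without a host round trip. A caller-side sketch (usage inferred,
     * not taken from the source):
     *
     *   float *d_vals = device_ptr_cuSZx_decompress_float(nbEle, d_cmpBytes);
     *   my_consumer_kernel<<<grid, block>>>(d_vals, nbEle);  // hypothetical consumer
     *   cudaFree(d_vals);                                    // caller releases the result
     */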
cudaFree(stateArray);
+    cudaFree(constantMedianArray);
+    cudaFree(data);
+    cudaFree(blk_idx);
+    cudaFree(blk_subidx);
+    cudaFree(blk_vals);
+    cudaFree(blk_sig);
+    return newData;
+
+}
+

From a091ae3ed0c0b5a6dcbd06dbe6cb1369e47d6272 Mon Sep 17 00:00:00 2001
From: Dan Lykov
Date: Fri, 17 Mar 2023 07:10:44 +0000
Subject: [PATCH 059/126] add new preprocess data for tests

---
 .../3reg_N52_p3.jsonterms_Otamaki_3_M30 | Bin 0 -> 77527 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 bench/qc_simulation/data/preprocess/qaoa_maxcut/3reg_N52_p3.jsonterms_Otamaki_3_M30

diff --git a/bench/qc_simulation/data/preprocess/qaoa_maxcut/3reg_N52_p3.jsonterms_Otamaki_3_M30 b/bench/qc_simulation/data/preprocess/qaoa_maxcut/3reg_N52_p3.jsonterms_Otamaki_3_M30
new file mode 100644
index 0000000000000000000000000000000000000000..71f0e809101fcc711b1aac87f8289bef33db61cc
GIT binary patch
literal 77527
z?65IA)Ji|UeS@3F+xPnGmF@6o#X?%ScObNOMynR)g9YAS?wgSD)ejOciJIJh5!ygT ztJcx#>lZv8)OB*uD_iTL{nx0up9gZYuIYIoMG-rKm>p^z4QjV9-@P&QbG@=1FNvBw zxe(gQj8?6qaU-)GdbD;l>E4z>{}0=nmqbk#>Ji#tN()~Os-19Qb@Os78ojc$FQJt| zBeWrmR;{BqW3C3T-q+i(PTS6P7nh>E&S|!CiBL%YVjq_=*BnXm4-7Le|%gcB}<|&wp|$VV|@`ardHi$wkZfFSb{wY#wWT+%o=&| z29RKZyfIN1iCLg61^@|GXB$bnNX%+wQ5Z&et$_r~Q;fd4NX&8$(Gf_n zmc!_!i^QyL5cz-v3mXhW7l~PTAa1~n$EpJ(N*9Tl#TR~H38wIk=T}lzA6jNgT=W7G z%!eBhx-2pC%i=P)3zN%6M_nXlj#m5%B$%Kz+UX)O6Ph9qxC^tHMr&OpW_D1N2NFyT z8ZC5@n3*k+0Z1^dWi-`AVy2x$S|Gukli}1wV&;TIbs)inkkL>ViJ4##b|As*icwD& ziJ9FHHGu?E8%7;nBxdSC_yY;%9SnIY(eltT^8(@=Oh!x!7}a%=n3wjV3|N9U_QpH9 zNX#2`@eC}%>vSVj7m0b@Eex;(@4Ai3x=75sXmKAb!Ha0au8YLHC>AZi61*igD(WIJ zuZ2YyAi+Cfqr5H>^AcDz1rodgHp=KCF|T~ZDToQ)_Zt4XNX!dfkqa!r+g+oiE)w%P zSNH-6-sKv_bdi{sx8gCl3vX_X!n#Pzn^h43mf*Fj@rEuE^Bz@P1WWJ|)ySud#Jn{X z$AJW|OpQFcNX#ouaSKTBKGVphi^ROZ6qO++czbF1=^`<&D@8FN!MjQ$t1c4ra#Gv@ zcj3*Xk^aAtbV~mJ_`j&6%-temA|tx>>>86i+;wB#*>y>ll(%nepZ;+HUHe5x1@wvS z9n&*1Iyy2gIVoF2bX3=VeFw%w$GYH?lan&Ifyqgk`gT*0$;pm1W0Gt8bV(kY+&sCo z!^e@6(TNtfhPHMw!)#tomj{^S1PNE4IXC82h) zG-*bLsWQ-%Kh?h{yI>-nY=jPRi5G9Xen@sv)%9OEeN9Q9uB$B=Sw&sfzrBdIM$?Z@ z_1`@wCM<1i+=>N7z7D%szsP`XM{k9IAFP7hCN$b z-e244y}mcMruaSVfBe4HO<3#|2qW%1IWex*jj%O0mi37H|I_hc;hKGMMYps&88*51rycr~xv@3H?_vMrcV%JcS0G&9;$p9FCtnIH zJ>|#Nr^}t$I;m|)=`qI-nZJjn{5>V!ZQ)0*M8|)%uzRii8LK|LvGqdvqK!IsJGwQ+ z?_vMrcV*$+|3X;DJ!p#ENnhDLIbQJpx60`vEr?C>?GhasF(@*6pldZLZOk(k8jFo( z#!6$gvDVmNY&NzU+l^hu9^*&jXXBu8*f?gKG|m`jjSI#lFpOSCUt@p~V+=Ni7{iT`#s@~ckzgbl6OB)d zDaLeTrZL-?>skuCM@L1Oyh)k5M0V{J)wf%6Qi%aEQGEyYi*_xDePf~{yT Date: Sun, 19 Mar 2023 17:49:47 -0400 Subject: [PATCH 060/126] Updated decompression for faster pre- and post-processing --- qtensor/compression/szx/src/cuszx_entry.cu | 190 +++++++++++++++++---- 1 file changed, 159 insertions(+), 31 deletions(-) diff --git a/qtensor/compression/szx/src/cuszx_entry.cu b/qtensor/compression/szx/src/cuszx_entry.cu index 56f278e6..07755c30 100644 --- a/qtensor/compression/szx/src/cuszx_entry.cu +++ b/qtensor/compression/szx/src/cuszx_entry.cu @@ -1320,7 +1320,7 @@ __global__ void setup_data_stateArray(float *newData, size_t nbEle, unsigned cha } __global__ void decomp_startup_kernel(unsigned char* r, size_t nbConstantBlocks, -unsigned char *data, int blockSize, size_t mSize, size_t ncBlocks, float *constantMedianArray){ +unsigned char *data, int blockSize, size_t mSize, size_t ncBlocks, float *constantMedianArray, uint64_t* g_leng){ unsigned char * fr = r; //fr is the starting address of constant median values. 
int i = 0, j = 0, k = 0; // printf("%p\n", r); @@ -1333,31 +1333,68 @@ unsigned char *data, int blockSize, size_t mSize, size_t ncBlocks, float *consta // printf("nbconstant: %f\n", ((float*)tmp_r)[0]); // nbConstantBlocks - for(i = blockDim.x*blockIdx.x + threadIdx.x; i < nbConstantBlocks; i += blockDim.x*gridDim.x, j+=4){ //get the median values for constant-value blocks + for(i = blockDim.x*blockIdx.x + threadIdx.x; i < nbConstantBlocks; i += blockDim.x*gridDim.x){ //get the median values for constant-value blocks - tmp_r[0]=fr[j]; - tmp_r[1]=fr[j+1]; - tmp_r[2]=fr[j+2]; - tmp_r[3]=fr[j+3]; + tmp_r[0]=fr[4*i]; + tmp_r[1]=fr[4*i+1]; + tmp_r[2]=fr[4*i+2]; + tmp_r[3]=fr[4*i+3]; float tmp = ((float*)tmp_r)[0]; constantMedianArray[i] = tmp; - // printf("%d %f\n", i, tmp); + //printf("%d %f\n", i, tmp); } + + +/** PROBLEM AREA, CAN FIX WITH PARALLELIZATION BUT WATCH *FR and *P **/ + // if(threadIdx.x==0 && blockIdx.x==0){ fr += nbConstantBlocks*sizeof(float); unsigned char* p = fr + ncBlocks * sizeof(short); - for(i = blockDim.x*blockIdx.x + threadIdx.x;i < ncBlocks;i += blockDim.x*gridDim.x){ + unsigned char* basefr = fr; + unsigned char* basep = p; + for(i = blockDim.x*blockIdx.x + threadIdx.x;i < ncBlocks;i+=blockDim.x*gridDim.x){ + fr = basefr+(sizeof(short)*i); int leng = (int)bytesToShort(fr)+mSize; - fr += sizeof(short); + g_leng[i] = (uint64_t)leng; + // fr += sizeof(short); if (leng > blockSize*sizeof(float)) { printf("Warning: compressed block is larger than the original block!\n"); return; // exit(0); } + // memcpy(data+i*blockSize*sizeof(float), p, leng); + + // p += leng; + } + + // } +} + +__global__ void decompress_ncblk_kernel(unsigned char* r, size_t nbConstantBlocks, +unsigned char *data, int blockSize, size_t mSize, size_t ncBlocks, float *constantMedianArray, uint64_t* g_leng){ + unsigned char * fr = r; + fr += nbConstantBlocks*sizeof(float); + unsigned char* p = fr + ncBlocks * sizeof(short); + unsigned char* basefr = fr; + unsigned char* basep = p; + + for(int i = blockDim.x*blockIdx.x + threadIdx.x;i < ncBlocks;i+=blockDim.x*gridDim.x){ + fr = basefr+(sizeof(short)*i); + int leng = (int)bytesToShort(fr)+mSize; + // g_leng[i] = leng; + // // fr += sizeof(short); + // if (leng > blockSize*sizeof(float)) + // { + // printf("Warning: compressed block is larger than the original block!\n"); + // return; + // // exit(0); + // } + p = basep + g_leng[i]; + memcpy(data+i*blockSize*sizeof(float), p, leng); - p += leng; + // p += leng; } } @@ -1370,6 +1407,7 @@ void decompress_startup_better(float *newData, size_t nbEle, unsigned char* r, ){ blockSize = 256; size_t nb_tmp = (int) nbEle/256; + uint64_t* g_leng; /** * Structures to return: * blk_idx, blk_subidx, blk_sig, blk_vals, numSigValues (pointer) @@ -1379,7 +1417,7 @@ void decompress_startup_better(float *newData, size_t nbEle, unsigned char* r, size_t stateNBBytes = nb_tmp%4==0 ? 
nb_tmp/4 : nb_tmp/4+1; - + r += 4; r++; r += sizeof(size_t); @@ -1395,14 +1433,21 @@ void decompress_startup_better(float *newData, size_t nbEle, unsigned char* r, // printf("before mallocs in kernel\n"); checkCudaErrors(cudaMemcpy((newData)+nbBlocks*blockSize, r, (nbEle%blockSize)*sizeof(float), cudaMemcpyDeviceToDevice)); + checkCudaErrors(cudaMalloc(&g_leng, sizeof(uint64_t)*ncBlocks)); // memcpy((newData)+nbBlocks*blockSize, r, (nbEle%blockSize)*sizeof(float)); //printf("before mallocs in kernel %p\n", r); r += (nbEle%blockSize)*sizeof(float); //printf("r: %p\n", r); //printf("%d, %d, %d\n",nbEle, 256, nbEle%256); - decomp_startup_kernel<<<40,256>>>(r, nbConstantBlocks,data, blockSize, mSize, ncBlocks, constantMedianArray); + decomp_startup_kernel<<<40,256>>>(r, nbConstantBlocks,data, blockSize, mSize, ncBlocks, constantMedianArray, g_leng); + cudaDeviceSynchronize(); + + thrust::exclusive_scan(thrust::device, g_leng, g_leng + ncBlocks, g_leng, 0); + + decompress_ncblk_kernel<<<40,256>>>(r, nbConstantBlocks, data, blockSize, mSize, ncBlocks, constantMedianArray, g_leng); cudaDeviceSynchronize(); + cudaFree(g_leng); r += nbConstantBlocks*sizeof(float); newCmpBytes = r; @@ -1564,6 +1609,65 @@ __global__ void print_newdata(float *newData, size_t nbBlocks, int blockSize){ } +__global__ void generateNbNc(size_t nbBlocks, size_t ncBlocks, unsigned char *stateArray, uint64_t* nbs, uint64_t* ncs){ + for(int i = blockDim.x*blockIdx.x + threadIdx.x;i < nbBlocks;i+=blockDim.x*gridDim.x){ + unsigned char state = stateArray[i]; + if(state==0||state==1){ + nbs[i] = 1; + ncs[i] = 0; + }else if(state==3){ + nbs[i] = 0; + ncs[i] = 1; + }else{ + nbs[i] = 0; + ncs[i] = 0; + } + } +} + +__global__ void decompress_final_set(unsigned char *data, float *newData, int blockSize, + size_t nbBlocks, size_t ncBlocks, unsigned char *stateArray, + float *constantMedianArray, uint64_t* nb, uint64_t* nc){ + float* fdata = (float*)data; + for (int i = blockIdx.x;i < nbBlocks;i+=gridDim.x){ + if (stateArray[i]==0 || stateArray[i]==1){ + float Median = constantMedianArray[nb[i]]; + // if (Median>1) printf("data%i:%f\n",i, Median); + for (int j = threadIdx.x; j < blockSize; j += blockDim.x) + *((newData)+i*blockSize+j) = Median; + // nb++; + }else if(stateArray[i]==3){ + for (int j = threadIdx.x; j < blockSize; j += blockDim.x) + *((newData)+i*blockSize+j) = fdata[nc[i]*blockSize+j]; + // nc++; + } + __syncthreads(); + } +} + +void decompress_post_proc_fast(unsigned char *data, float *newData, int blockSize, + size_t nbBlocks, size_t ncBlocks, unsigned char *stateArray, + float *constantMedianArray +){ + // checkCudaErrors(cudaMemcpy(data, d_data, ncBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); + // checkCudaErrors(cudaMemcpy(*newData, d_newdata, nbBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); + + int i,j; + uint64_t *nb, *nc; + checkCudaErrors(cudaMalloc(&nb, sizeof(uint64_t)*nbBlocks)); + checkCudaErrors(cudaMalloc(&nc, sizeof(uint64_t)*nbBlocks)); + + generateNbNc<<<40,256>>>(nbBlocks, ncBlocks, stateArray, nb,nc); + cudaDeviceSynchronize(); + thrust::exclusive_scan(thrust::device, nb, nb + nbBlocks, nb, 0); + thrust::exclusive_scan(thrust::device, nc, nc + nbBlocks, nc, 0); + + decompress_final_set<<>>(data, newData, blockSize,nbBlocks, ncBlocks, stateArray,constantMedianArray, nb, nc); + cudaDeviceSynchronize(); + cudaFree(nb); + cudaFree(nc); +} + __global__ void decompress_post_proc(unsigned char *data, float *newData, int blockSize, size_t nbBlocks, size_t ncBlocks, unsigned char 
*stateArray, float *constantMedianArray @@ -1632,7 +1736,7 @@ float* device_ptr_cuSZx_decompress_float(size_t nbEle, unsigned char* cmpBytes) size_t *nbConstantBlocks, *nbBlocks, *ncBlocks, nbBlocks_h, ncBlocks_h, nbConstantBlocks_h; unsigned char *stateArray, *data; float *newData; - + timer_GPU.StartCounter(); unsigned char *oldCmpBytes = cmpBytes; //*newData = (float*)malloc(sizeof(float)*nbEle); // printf("cmpbytes check %d\n", (int)cmpBytes[0]); @@ -1670,15 +1774,24 @@ float* device_ptr_cuSZx_decompress_float(size_t nbEle, unsigned char* cmpBytes) checkCudaErrors(cudaMalloc((void**)&blk_subidx, num_sig_h*sizeof(uint8_t))); checkCudaErrors(cudaMalloc((void**)&blk_sig, nbBlocks_h*sizeof(uint8_t))); - setup_data_stateArray<<<1,1>>>(newData, nbEle, cmpBytes, - num_sig_h, bs, - nbConstantBlocks_h, nbBlocks_h, ncBlocks, - stateArray, cmpBytes - ); - cudaDeviceSynchronize(); + unsigned char* tmp_r = cmpBytes; + unsigned char* newR; + setup_data_stateArray_better(newData, nbEle, tmp_r, + num_sig_h, bs, + nbConstantBlocks_h, nbBlocks_h, &ncBlocks_h, + stateArray, newR); + + + + // setup_data_stateArray<<<1,1>>>(newData, nbEle, cmpBytes, + // num_sig_h, bs, + // nbConstantBlocks_h, nbBlocks_h, ncBlocks, + // stateArray, cmpBytes + // ); + // cudaDeviceSynchronize(); // printf("%s\n", cudaGetErrorString(cudaGetLastError())); - checkCudaErrors(cudaMemcpy(&ncBlocks_h, ncBlocks, sizeof(size_t), cudaMemcpyDeviceToHost)); + // checkCudaErrors(cudaMemcpy(&ncBlocks_h, ncBlocks, sizeof(size_t), cudaMemcpyDeviceToHost)); checkCudaErrors(cudaMalloc((void**)&data, ncBlocks_h*bs*sizeof(float))); // cmpBytes = newCmpBytes; @@ -1698,12 +1811,23 @@ float* device_ptr_cuSZx_decompress_float(size_t nbEle, unsigned char* cmpBytes) //printf("%s\n", cudaGetErrorString(cudaGetLastError())); //test_nbBlks = (size_t *)malloc(sizeof(size_t)); // printf("malloc\n"); - decompress_startup<<<1,1>>>(newData, nbEle, cmpBytes, + + + tmp_r = cmpBytes; + decompress_startup_better(newData, nbEle, tmp_r, blk_idx, blk_subidx, blk_sig, blk_vals, num_sig_h, bs, - nbConstantBlocks_h, nbBlocks_h, ncBlocks_h, - stateArray, constantMedianArray, data, mSize_h, cmpBytes); - cudaDeviceSynchronize(); + nbConstantBlocks_h, nbBlocks_h, ncBlocks_h, + stateArray, constantMedianArray, data, + mSize_h, newR); + + + //decompress_startup<<<1,1>>>(newData, nbEle, cmpBytes, + // blk_idx, blk_subidx, blk_sig, + // blk_vals, num_sig_h, bs, + // nbConstantBlocks_h, nbBlocks_h, ncBlocks_h, + // stateArray, constantMedianArray, data, mSize_h, cmpBytes); + //cudaDeviceSynchronize(); // cmpBytes = newCmpBytes; //printf("%s\n", cudaGetErrorString(cudaGetLastError())); @@ -1715,7 +1839,7 @@ float* device_ptr_cuSZx_decompress_float(size_t nbEle, unsigned char* cmpBytes) //printf("nblocks: %d bs: %d\n", nbBlocks_h, bs); checkCudaErrors(cudaMalloc(&d_newdata, nbBlocks_h*bs*sizeof(float))); - timer_GPU.StartCounter(); + dim3 dimBlock(32, bs/32); dim3 dimGrid(65536, 1); const int sMemsize = bs * sizeof(float) + dimBlock.y * sizeof(int); @@ -1726,18 +1850,22 @@ float* device_ptr_cuSZx_decompress_float(size_t nbEle, unsigned char* cmpBytes) //printf("GPU decompression timing: %f ms\n", timer_GPU.GetCounter()); cudaDeviceSynchronize(); - err = cudaGetLastError(); // Get error code - printf("CUDA Error: %s\n", cudaGetErrorString(err)); - printf("GPU decompression timing: %f ms\n", timer_GPU.GetCounter()); + // checkCudaErrors(cudaMemcpy(data, d_data, ncBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); checkCudaErrors(cudaMemcpy(newData, d_newdata, 
nbBlocks_h*bs*sizeof(float), cudaMemcpyDeviceToDevice)); cudaFree(d_newdata); - decompress_post_proc<<<1,1>>>(data, newData, bs, + // decompress_post_proc<<<1,1>>>(data, newData, bs, + // nbBlocks_h, ncBlocks_h, stateArray, + // constantMedianArray); + // cudaDeviceSynchronize(); + decompress_post_proc_fast(data, newData, bs, nbBlocks_h, ncBlocks_h, stateArray, constantMedianArray); - cudaDeviceSynchronize(); -// print_newdata<<<1,1>>>(newData, nbBlocks_h, bs); + err = cudaGetLastError(); // Get error code + printf("CUDA Error: %s\n", cudaGetErrorString(err)); + printf("GPU decompression timing: %f ms\n", timer_GPU.GetCounter()); + // print_newdata<<<1,1>>>(newData, nbBlocks_h, bs); cudaFree(stateArray); cudaFree(constantMedianArray); cudaFree(data); From 0af7bc6b8139b02392c00cdb3ad145dcf9b02283 Mon Sep 17 00:00:00 2001 From: Milan Shah Date: Thu, 23 Mar 2023 14:20:35 -0400 Subject: [PATCH 061/126] Improved compression throughput further --- qtensor/compression/szx/src/cuszx_entry.cu | 41 ++++++++++++---------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/qtensor/compression/szx/src/cuszx_entry.cu b/qtensor/compression/szx/src/cuszx_entry.cu index 07755c30..9ebf7a79 100644 --- a/qtensor/compression/szx/src/cuszx_entry.cu +++ b/qtensor/compression/szx/src/cuszx_entry.cu @@ -662,25 +662,30 @@ __global__ void nccopy_kernel(unsigned char * c, unsigned char* o, unsigned char // memcpy(&g, (meta+(nbBlocks+i*mSize)),sizeof(float)); // printf("%d %f\n",i,g); } - // else if(meta[i] == 3){ + else if(meta[i] == 3){ - // // printf("ncblk 1\n"); - // shortToBytes_d(o+(sizeof(short)*ncBlk_indices[i]), offsets[i]); - // // o += sizeof(short); - - // // printf("ncblk 2 nbBlocks %d %d \n", nbBlocks, i); - // printf("nbBlkindices %ld offset_indices %ld\n", ncBlk_indices[i], offset_indices[i]); - // // printf(" test 1%c\n",meta+(nbBlocks+i*mSize)); - // // printf("test 2%c\n", nc+(mSize*ncBlk_indices[i] + offset_indices[i]*ncBlk_indices[i])); - // memcpy(nc+((mSize + offset_indices[i])*ncBlk_indices[i]), meta+(nbBlocks+i*mSize), mSize); - // // // nc += mSize; + // printf("ncblk 1\n"); + shortToBytes_d(o+(sizeof(short)*ncBlk_indices[i]), offsets[i]); + // o += sizeof(short); + + // printf("ncblk 2 nbBlocks %d %d \n", nbBlocks, i); + // printf("nbBlkindices %ld offset_indices %ld\n", ncBlk_indices[i], offset_indices[i]); + // printf(" test 1%c\n",meta+(nbBlocks+i*mSize)); + // printf("test 2%c\n", nc+(mSize*ncBlk_indices[i] + offset_indices[i]*ncBlk_indices[i])); + memcpy(nc+((mSize*ncBlk_indices[i] + offset_indices[i])), meta+(nbBlocks+i*mSize), mSize); + // // nc += mSize; - // // printf("ncblk 3\n"); - // memcpy(nc+((mSize+mSize + offset_indices[i])*ncBlk_indices[i]), midBytes+(i*blockSize*sizeof(float)), offsets[i]); - // // // nc += offsets[i]; + // printf("ncblk 3\n"); + memcpy(nc+(((mSize*ncBlk_indices[i])+mSize + offset_indices[i])), midBytes+(i*blockSize*sizeof(float)), offsets[i]); + // // nc += offsets[i]; - // // printf("ncblk 4\n"); - // } + // printf("ncblk 4\n"); + } + if (i==nbBlocks-1) + { + nc = nc+(((mSize*ncBlk_indices[i])+mSize + offset_indices[i]))+offsets[i]; + } + } } @@ -919,7 +924,7 @@ size_t better_post_proc(size_t *outSize, float *oriData, unsigned char *meta, unsigned char* nc = o+(nbBlocks-nbConstantBlocks)*sizeof(short); // ncblkCopy<<<1,1>>>(c, o, nc, midBytes, meta,nbBlocks, blockSize, offsets, mSize); - ncblkCopy_h(c, o, nc, midBytes, meta,nbBlocks, blockSize, offsets, mSize); + // ncblkCopy_h(c, o, nc, midBytes, meta,nbBlocks, blockSize, offsets, mSize); 
ncblkCopy_fast(c, o, nc, midBytes, meta,nbBlocks, blockSize, offsets, mSize); // cudaDeviceSynchronize(); return (size_t) (nc-r_old); @@ -1865,7 +1870,7 @@ float* device_ptr_cuSZx_decompress_float(size_t nbEle, unsigned char* cmpBytes) err = cudaGetLastError(); // Get error code printf("CUDA Error: %s\n", cudaGetErrorString(err)); printf("GPU decompression timing: %f ms\n", timer_GPU.GetCounter()); - // print_newdata<<<1,1>>>(newData, nbBlocks_h, bs); + // print_newdata<<<1,1>>>(newData, nbBlocks_h, bs); cudaFree(stateArray); cudaFree(constantMedianArray); cudaFree(data); From 1ade10af05353fc7392453ba4f29087909b4848a Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Thu, 23 Mar 2023 19:23:32 +0000 Subject: [PATCH 062/126] add small bench analysis script --- .../analysis/compression_scaling_analysis.py | 44 +++++++++++++++++++ .../analysis/simple_compression_report.py | 36 +++++++++++++++ bench/qc_simulation/src/simulators/qtensor.py | 12 +++-- 3 files changed, 89 insertions(+), 3 deletions(-) create mode 100644 bench/qc_simulation/analysis/compression_scaling_analysis.py create mode 100644 bench/qc_simulation/analysis/simple_compression_report.py diff --git a/bench/qc_simulation/analysis/compression_scaling_analysis.py b/bench/qc_simulation/analysis/compression_scaling_analysis.py new file mode 100644 index 00000000..a1d5d0e3 --- /dev/null +++ b/bench/qc_simulation/analysis/compression_scaling_analysis.py @@ -0,0 +1,44 @@ +import glob +import pandas as pd +import json +import numpy as np +import sys + +def fmt_unit(x, unit): + return str(np.round(x, 2)) + " " + unit + +def main(): + glob_pat = sys.argv[1] + filenames = glob.glob(glob_pat) + filenames = sorted(filenames) + + for file in filenames: + data = json.load(open(file)) + stats = {} + for atr in ["compress", "decompress"]: + items = data["compression"][atr] + if len(items)==0: + continue + df = pd.DataFrame(items) + df["CR"] = df["size_in"]/df["size_out"] + df["T"] = df["size_in"]/df["time"] + stats["mean " + atr+" CR"] = df["CR"].mean() + stats["mean " + atr+" Throughput"] = fmt_unit(df["T"].mean( )/1e9, "GB/s") + stats[atr+" Count"] = len(df) + + _res = data["result"] + stats["result"] = (_res["Re"] , _res["Im"]) + stats["Time"] = fmt_unit(data["time"],'s') + stats["Memory"] = str(data["memory"]/1024/1024) + " MB" + print(file) + _prefix = " " + last = lambda x: x==len(stats.items())-1 + char = lambda i: "⎬ " if not last(i) else "┕ " + print("\n".join([ + _prefix+char(i) + " = ".join(map(str, items)) + for i, items in enumerate(stats.items()) + ])) + + +if __name__=="__main__": + main() diff --git a/bench/qc_simulation/analysis/simple_compression_report.py b/bench/qc_simulation/analysis/simple_compression_report.py new file mode 100644 index 00000000..864574c5 --- /dev/null +++ b/bench/qc_simulation/analysis/simple_compression_report.py @@ -0,0 +1,36 @@ +import pandas as pd +import json +import sys + +def main(): + file = sys.argv[1] + data = json.load(open(file)) + rows = [] + for item in data['compression']['compress']: + k = item.copy() + k['type']='compress' + rows.append(k) + + for item in data['compression']['decompress']: + k = item.copy() + k['type']='decompress' + rows.append(k) + + if len(rows) == 0: + print("Rows:\n", rows) + return + df = pd.DataFrame(rows) + dfc = df[df['type'] == 'compress'] + dfd = df[df['type'] == 'decompress'] + + for d in [dfc, dfd]: + d['Throughput'] = d['size_in'] / d['time'] + d['CR'] = d['size_in'] / d['size_out'] + + print("Compression:") + print(dfc.describe([0.5])) + print("Decompression:") + 
print(dfd.describe([0.5])) + +if __name__=="__main__": + main() diff --git a/bench/qc_simulation/src/simulators/qtensor.py b/bench/qc_simulation/src/simulators/qtensor.py index e29ede10..9b1a65fa 100644 --- a/bench/qc_simulation/src/simulators/qtensor.py +++ b/bench/qc_simulation/src/simulators/qtensor.py @@ -184,7 +184,7 @@ def simulate(in_file, out_file, backend='einsum', compress=None, M=29, **kwargs) backend = qtensor.contraction_backends.get_backend(backend) if compress is not None: if compress == 'szx': - compressor = qtensor.compression.CUSZCompressor(r2r_error=5e-2, r2r_threshold=5e-2) + compressor = qtensor.compression.CUSZCompressor(r2r_error=1e-3, r2r_threshold=1e-3) compressor = qtensor.compression.ProfileCompressor(compressor) else: raise ValueError(f"Unknown compression algorithm: {compress}") @@ -231,12 +231,18 @@ def simulate(in_file, out_file, backend='einsum', compress=None, M=29, **kwargs) del bcopy print("Result", res.data.flatten()[0]) time.sleep(0.5) - print("Simulation result:", backend.get_result_data(res).flatten()[0]) + sim_result = backend.get_result_data(res).flatten()[0] + print("Simulation result:", sim_result) end = time.time() - print("D", end - start) + print("Elapsed", end - start) out_file += ".json" C = {'time': 2**len(par_vars)*(end - start)} + C['elapsed'] = (end - start) C['memory'] = backend.max_mem + C['result'] = { + "Re": np.real(sim_result).tolist(), + "Im": np.imag(sim_result).tolist() + } if compress is not None: if isinstance(compressor, qtensor.compression.ProfileCompressor): C['compression'] = compressor.get_profile_data_json() From 14f6d47c90a7a61c1fc406b60d4c858b3193fe88 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Thu, 23 Mar 2023 19:25:09 +0000 Subject: [PATCH 063/126] add usage of simple simulation analysis to README.md --- bench/qc_simulation/README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/bench/qc_simulation/README.md b/bench/qc_simulation/README.md index be78054a..b7616296 100644 --- a/bench/qc_simulation/README.md +++ b/bench/qc_simulation/README.md @@ -53,3 +53,15 @@ main.py process \ ``` The parent directory for each out file will be created automatically + + +## Analysis + +Simple simulation analysis script: `analysis/compression_scaling_analysis.py`. 
+Accepts a glob pattern for simulation output files
+
+Usage:
+
+```
+python analysis/compression_scaling_analysis.py ./data/simulations/maxcut/file\*
+```

From 7072f3678a945eb11d944b59f50275551bed1130 Mon Sep 17 00:00:00 2001
From: Milan Shah
Date: Thu, 23 Mar 2023 16:17:14 -0400
Subject: [PATCH 064/126] Added definitions of blocks and threads for kernel launches

---
 qtensor/compression/szx/src/cuszx_entry.cu | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/qtensor/compression/szx/src/cuszx_entry.cu b/qtensor/compression/szx/src/cuszx_entry.cu
index 9ebf7a79..ff961eec 100644
--- a/qtensor/compression/szx/src/cuszx_entry.cu
+++ b/qtensor/compression/szx/src/cuszx_entry.cu
@@ -9,6 +9,8 @@
 #include
 #define SPARSITY_LEVEL 0.25
+#define BLOCKS 40
+#define THREADS_PER_BLOCK 256
 TimingGPU timer_GPU;
 void bin(unsigned n)
@@ -746,14 +748,14 @@ void ncblkCopy_fast(unsigned char * c, unsigned char* o, unsigned char *nc, unsi
 checkCudaErrors(cudaMalloc(&ncBlk_indices, sizeof(uint64_t)*nbBlocks));
 checkCudaErrors(cudaMalloc(&offset_indices, sizeof(uint64_t)*nbBlocks));
- generateFlags<<<40,256>>>(meta, cBlk_indices, ncBlk_indices, offset_indices, offsets, nbBlocks);
+ generateFlags<<<BLOCKS,THREADS_PER_BLOCK>>>(meta, cBlk_indices, ncBlk_indices, offset_indices, offsets, nbBlocks);
 cudaDeviceSynchronize();
 thrust::exclusive_scan(thrust::device, cBlk_indices, cBlk_indices + nbBlocks, cBlk_indices, 0);
 thrust::exclusive_scan(thrust::device, ncBlk_indices, ncBlk_indices + nbBlocks, ncBlk_indices, 0);
 thrust::exclusive_scan(thrust::device, offset_indices, offset_indices + nbBlocks, offset_indices, 0);
- nccopy_kernel<<<40,256>>>(c, o, nc, midBytes, meta, nbBlocks, blockSize, offsets, mSize, cBlk_indices,ncBlk_indices,offset_indices);
+ nccopy_kernel<<<BLOCKS,THREADS_PER_BLOCK>>>(c, o, nc, midBytes, meta, nbBlocks, blockSize, offsets, mSize, cBlk_indices,ncBlk_indices,offset_indices);
 // nccopy_kernel2<<<1,1>>>(c, o, nc, midBytes, meta, nbBlocks, blockSize, offsets, mSize, cBlk_indices,ncBlk_indices,offset_indices);
 cudaDeviceSynchronize();
@@ -877,7 +879,7 @@ size_t better_post_proc(size_t *outSize, float *oriData, unsigned char *meta,
 checkCudaErrors(cudaMemset(out_size_d, 0, sizeof(int)));
- getNumNonConstantBlocks<<<40,256>>>(nbBlocks, offsets, meta, blockSize, nonconstant_d, out_size_d);
+ getNumNonConstantBlocks<<<BLOCKS,THREADS_PER_BLOCK>>>(nbBlocks, offsets, meta, blockSize, nonconstant_d, out_size_d);
 cudaDeviceSynchronize();
 checkCudaErrors(cudaMemcpy(&nonconstant_h, nonconstant_d, sizeof(int), cudaMemcpyDeviceToHost));
@@ -911,9 +913,9 @@ size_t better_post_proc(size_t *outSize, float *oriData, unsigned char *meta,
 else
 out_length = nbBlocks/4+1;
- convert_state_to_out_kernel<<<40,256>>>(meta, nbBlocks, r, out_length);
+ convert_state_to_out_kernel<<<BLOCKS,THREADS_PER_BLOCK>>>(meta, nbBlocks, r, out_length);
 r+=out_length;
- convert_block2_to_out_kernel<<<40,256>>>(r, nbBlocks,num_sig, blk_idx, blk_vals, blk_subidx, blk_sig);
+ convert_block2_to_out_kernel<<<BLOCKS,THREADS_PER_BLOCK>>>(r, nbBlocks,num_sig, blk_idx, blk_vals, blk_subidx, blk_sig);
 r += nbBlocks*4 + num_sig*sizeof(float) + num_sig*sizeof(uint8_t) + nbBlocks*sizeof(uint8_t);
 checkCudaErrors(cudaMemcpy(r, oriData+nbBlocks*blockSize, (nbEle%blockSize)*sizeof(float), cudaMemcpyDeviceToDevice));
@@ -1272,7 +1274,7 @@ __global__ void decompress_get_stats(float *newData, size_t nbEle, unsigned char
 // printf("cmp %d\n", (int)r[0]);
 // printf("state %d\n", (int)stateArray[0]);
 // convert_out_to_state(nbBlocks, r, stateArray);
- convert_out_to_state_kernel<<<40,256>>>(nbBlocks,r,stateArray,stateNBBytes,
+ convert_out_to_state_kernel<<<BLOCKS,THREADS_PER_BLOCK>>>(nbBlocks,r,stateArray,stateNBBytes,
 num_state2_d, ncBlocks_d);
 // printf("state %d\n", (int)stateArray[0]);
 // convertByteArray2IntArray_fast_1b_args(nbBlocks, r, stateNBBytes, stateArray); //get the stateArray
@@ -1430,7 +1432,7 @@ void decompress_startup_better(float *newData, size_t nbEle, unsigned char* r,
 r += stateNBBytes;
- convert_out_to_block2_kernel<<<40,256>>>(r, nbBlocks, (uint64_t)num_sig, blk_idx, blk_vals, blk_subidx, blk_sig);
+ convert_out_to_block2_kernel<<<BLOCKS,THREADS_PER_BLOCK>>>(r, nbBlocks, (uint64_t)num_sig, blk_idx, blk_vals, blk_subidx, blk_sig);
 size_t to_add = nbBlocks*4 + num_sig*sizeof(float) + num_sig*sizeof(uint8_t) + nbBlocks*sizeof(uint8_t);
 r+= to_add;
@@ -1445,12 +1447,12 @@ void decompress_startup_better(float *newData, size_t nbEle, unsigned char* r,
 r += (nbEle%blockSize)*sizeof(float);
 //printf("r: %p\n", r);
 //printf("%d, %d, %d\n",nbEle, 256, nbEle%256);
- decomp_startup_kernel<<<40,256>>>(r, nbConstantBlocks,data, blockSize, mSize, ncBlocks, constantMedianArray, g_leng);
+ decomp_startup_kernel<<<BLOCKS,THREADS_PER_BLOCK>>>(r, nbConstantBlocks,data, blockSize, mSize, ncBlocks, constantMedianArray, g_leng);
 cudaDeviceSynchronize();
 thrust::exclusive_scan(thrust::device, g_leng, g_leng + ncBlocks, g_leng, 0);
- decompress_ncblk_kernel<<<40,256>>>(r, nbConstantBlocks, data, blockSize, mSize, ncBlocks, constantMedianArray, g_leng);
+ decompress_ncblk_kernel<<<BLOCKS,THREADS_PER_BLOCK>>>(r, nbConstantBlocks, data, blockSize, mSize, ncBlocks, constantMedianArray, g_leng);
 cudaDeviceSynchronize();
 cudaFree(g_leng);
 r += nbConstantBlocks*sizeof(float);
@@ -1662,7 +1664,7 @@ void decompress_post_proc_fast(unsigned char *data, float *newData, int blockSiz
 checkCudaErrors(cudaMalloc(&nb, sizeof(uint64_t)*nbBlocks));
 checkCudaErrors(cudaMalloc(&nc, sizeof(uint64_t)*nbBlocks));
- generateNbNc<<<40,256>>>(nbBlocks, ncBlocks, stateArray, nb,nc);
+ generateNbNc<<<BLOCKS,THREADS_PER_BLOCK>>>(nbBlocks, ncBlocks, stateArray, nb,nc);
 cudaDeviceSynchronize();
 thrust::exclusive_scan(thrust::device, nb, nb + nbBlocks, nb, 0);
 thrust::exclusive_scan(thrust::device, nc, nc + nbBlocks, nc, 0);

From a0e86301b0eaf791d4f0054fa7f4a09a4855c44f Mon Sep 17 00:00:00 2001
From: Dan Lykov
Date: Sun, 26 Mar 2023 18:50:25 +0000
Subject: [PATCH 065/126] add simple analysis file, add nvmem monitor

---
 .../analysis/compression_scaling_analysis.py |  2 ++
 .../performance_measurement_decorator.py     | 12 ++++++++++++
 2 files changed, 14 insertions(+)

diff --git a/bench/qc_simulation/analysis/compression_scaling_analysis.py b/bench/qc_simulation/analysis/compression_scaling_analysis.py
index a1d5d0e3..1e246c2b 100644
--- a/bench/qc_simulation/analysis/compression_scaling_analysis.py
+++ b/bench/qc_simulation/analysis/compression_scaling_analysis.py
@@ -30,6 +30,8 @@ def main():
 stats["result"] = (_res["Re"] , _res["Im"])
 stats["Time"] = fmt_unit(data["time"],'s')
 stats["Memory"] = str(data["memory"]/1024/1024) + " MB"
+ if data.get('nvmemory'):
+ stats["NVMemory"] = str(data["nvmemory"]/1024/1024) + " MB"
 print(file)
 _prefix = " "
 last = lambda x: x==len(stats.items())-1
diff --git a/qtensor/contraction_backends/performance_measurement_decorator.py b/qtensor/contraction_backends/performance_measurement_decorator.py
index 4ea2cff9..363ee847 100644
--- a/qtensor/contraction_backends/performance_measurement_decorator.py
+++ b/qtensor/contraction_backends/performance_measurement_decorator.py
@@ -17,10 +17,21 @@ def __init__(self, backend=NumpyBackend(), print=True):
 self.print = print
 self.max_mem = 0
+ import nvidia_smi
+ nvidia_smi.nvmlInit()
+ self.nvsmi_handle = 
nvidia_smi.nvmlDeviceGetHandleByIndex(0) + self.nvsmi_max_mem = 0 + def _print(self, *args, **kwargs): if self.print: print(*args, **kwargs) + def _update_nvsmi(self): + import nvidia_smi + info = nvidia_smi.nvmlDeviceGetMemoryInfo(self.nvsmi_handle) + mem = info.used + self.nvsmi_max_mem = max(mem, self.nvsmi_max_mem) + def check_store(self): import cupy mempool = cupy.get_default_memory_pool() @@ -71,6 +82,7 @@ def add_tensor(self, tensor): if tsize>1024: self._print("Added tensor with data size", tsize/1024, "KB") self.check_store() + self._update_nvsmi() def process_bucket(self, bucket, no_sum=False): res = self.backend.process_bucket(bucket, no_sum=no_sum) From a7e7818301ecf6850d13afe0a89697eae63e1886 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Tue, 28 Mar 2023 21:03:08 +0000 Subject: [PATCH 066/126] remove some old prints --- bench/qc_simulation/src/simulators/qtensor.py | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/bench/qc_simulation/src/simulators/qtensor.py b/bench/qc_simulation/src/simulators/qtensor.py index 9b1a65fa..a6d4a94b 100644 --- a/bench/qc_simulation/src/simulators/qtensor.py +++ b/bench/qc_simulation/src/simulators/qtensor.py @@ -116,15 +116,7 @@ def preprocess(in_file, out_file, O='greedy', S=None, M=30, after_slice='run-aga components = list(nx.connected_components(graph)) print(f"Sliced graph # nodes: {graph.number_of_nodes()} and #components: {len(components)} with sizes {[len(c) for c in components]}") print(f"peo size without par_vars and ignore_vars: {len(peo) - len(par_vars) - len(ignore_vars)}") - def inspect_node(g, n): - neighbors = sorted(list(g.neighbors(n))) - return f"{n} -> {len(neighbors)}({neighbors[0]}::{neighbors[-1]})" - # inspect first 10 nodes - graph, label_dict = qtree.graph_model.relabel_graph_nodes( - graph, dict(zip(opt.peo_ints, range(graph.number_of_nodes()))) - ) - for n in sorted(list(graph.nodes()))[127*2:127*4]: - print(inspect_node(graph, n), end='; ', flush=True) + print() # -- else: @@ -165,7 +157,12 @@ def estimate(in_file, out_file, C=100, M=30, F=1e12, T=1e9, **kwargs): write_json(C, out_file) return out_file -def simulate(in_file, out_file, backend='einsum', compress=None, M=29, **kwargs): +def simulate(in_file, out_file, + backend='einsum', + compress=None, + M=29, + r2r_error=1e-3, r2r_threshold=1e-3, + **kwargs): """ Args: in_file: file with preprocessed data @@ -173,6 +170,8 @@ def simulate(in_file, out_file, backend='einsum', compress=None, M=29, **kwargs) backend: backend to use compress: compression algorithm M: memory threshold for compression + r2r_error: relative error for compression + r2r_threshold: relative threshold for compression """ import time from qtensor.contraction_algos import bucket_elimination @@ -184,7 +183,8 @@ def simulate(in_file, out_file, backend='einsum', compress=None, M=29, **kwargs) backend = qtensor.contraction_backends.get_backend(backend) if compress is not None: if compress == 'szx': - compressor = qtensor.compression.CUSZCompressor(r2r_error=1e-3, r2r_threshold=1e-3) + print(f"{r2r_error=} {r2r_threshold=}") + compressor = qtensor.compression.CUSZCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) compressor = qtensor.compression.ProfileCompressor(compressor) else: raise ValueError(f"Unknown compression algorithm: {compress}") @@ -239,6 +239,7 @@ def simulate(in_file, out_file, backend='einsum', compress=None, M=29, **kwargs) C = {'time': 2**len(par_vars)*(end - start)} C['elapsed'] = (end - start) C['memory'] = backend.max_mem + 
C['nvmemory'] = backend.nvsmi_max_mem C['result'] = { "Re": np.real(sim_result).tolist(), "Im": np.imag(sim_result).tolist() From bd0564f11bc03ef04d926bc3147ad4177d3a2671 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Mon, 3 Apr 2023 06:56:39 +0000 Subject: [PATCH 067/126] improve compression cusz wrapper. add details in perf --- bench/qc_simulation/src/simulators/qtensor.py | 1 + qtensor/compression/szx/src/cuszx_wrapper.py | 10 +++++-- .../performance_measurement_decorator.py | 26 ++++++++++++++----- 3 files changed, 29 insertions(+), 8 deletions(-) diff --git a/bench/qc_simulation/src/simulators/qtensor.py b/bench/qc_simulation/src/simulators/qtensor.py index a6d4a94b..c64f728c 100644 --- a/bench/qc_simulation/src/simulators/qtensor.py +++ b/bench/qc_simulation/src/simulators/qtensor.py @@ -239,6 +239,7 @@ def simulate(in_file, out_file, C = {'time': 2**len(par_vars)*(end - start)} C['elapsed'] = (end - start) C['memory'] = backend.max_mem + C['memory_history'] = backend.mem_history C['nvmemory'] = backend.nvsmi_max_mem C['result'] = { "Re": np.real(sim_result).tolist(), diff --git a/qtensor/compression/szx/src/cuszx_wrapper.py b/qtensor/compression/szx/src/cuszx_wrapper.py index fd62e87d..d41703ec 100644 --- a/qtensor/compression/szx/src/cuszx_wrapper.py +++ b/qtensor/compression/szx/src/cuszx_wrapper.py @@ -67,13 +67,19 @@ def cuszx_host_decompress(nbEle, cmpBytes): newData = __cuszx_host_decompress(nbEle_p,cmpBytes) return newData + def cuszx_device_compress(oriData, absErrBound, nbEle, blockSize,threshold): __cuszx_device_compress = get_device_compress() variable = ctypes.c_size_t(0) outSize = ctypes.pointer(variable) - absErrBound = absErrBound*(cp.amax(oriData.get())-cp.amin(oriData.get())) - threshold = threshold*(cp.amax(oriData.get())-cp.amin(oriData.get())) + #absErrBound = absErrBound*(cp.amax(oriData.get())-cp.amin(oriData.get())) + #threshold = threshold*(cp.amax(oriData.get())-cp.amin(oriData.get())) + sample = oriData[::2] + d = cp.amax(sample) - cp.amin(sample) + d = d.get() + absErrBound = absErrBound*(d) + threshold = threshold*(d) oriData_p = ctypes.cast(oriData.data.ptr, ctypes.POINTER(c_float)) o_bytes = __cuszx_device_compress(oriData_p, outSize,np.float32(absErrBound), np.ulonglong(nbEle), np.int32(blockSize),np.float32(threshold)) diff --git a/qtensor/contraction_backends/performance_measurement_decorator.py b/qtensor/contraction_backends/performance_measurement_decorator.py index 363ee847..cedb3e45 100644 --- a/qtensor/contraction_backends/performance_measurement_decorator.py +++ b/qtensor/contraction_backends/performance_measurement_decorator.py @@ -15,22 +15,31 @@ def __init__(self, backend=NumpyBackend(), print=True): self.object_store = WeakValueDictionary() self.object_keys = [] self.print = print - self.max_mem = 0 + self.mem_history = [] import nvidia_smi nvidia_smi.nvmlInit() self.nvsmi_handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0) - self.nvsmi_max_mem = 0 def _print(self, *args, **kwargs): if self.print: print(*args, **kwargs) - def _update_nvsmi(self): + def _get_nvsmi_mem(self): import nvidia_smi info = nvidia_smi.nvmlDeviceGetMemoryInfo(self.nvsmi_handle) mem = info.used - self.nvsmi_max_mem = max(mem, self.nvsmi_max_mem) + return mem + + @property + def max_mem(self): + mems = [m['mem'] for m in self.mem_history] + return max(mems) + + @property + def nvsmi_max_mem(self): + mems = [m['nvmem'] for m in self.mem_history] + return max(mems) def check_store(self): import cupy @@ -51,12 +60,18 @@ def check_store(self): if total_mem>1024**2: 
self._print("Total memory usage", total_mem/1024/1024, "MB") + mempool.free_all_blocks() cupy_mem = mempool.used_bytes() # get maximum memory usage gpu_mem = cupy_mem if isinstance(self.backend, CompressionBackend): gpu_mem += 8*2**self.backend.max_tw - self.max_mem = max(self.max_mem, gpu_mem) + self.mem_history.append(dict( + mem=gpu_mem, + cupy_bufsize=mempool.total_bytes(), + nvmem = self._get_nvsmi_mem(), + tensors_sizes=[len(tensor.indices) for tensor in self.object_store.values()] + )) # -- if cupy_mem>1024**2: self._print("CuPy memory usage", cupy_mem/1024/1024, "MB. Total MB:", mempool.total_bytes()/1024**2) @@ -82,7 +97,6 @@ def add_tensor(self, tensor): if tsize>1024: self._print("Added tensor with data size", tsize/1024, "KB") self.check_store() - self._update_nvsmi() def process_bucket(self, bucket, no_sum=False): res = self.backend.process_bucket(bucket, no_sum=no_sum) From 8d5111e000e6cb5883fa84557e0cd379a0dff47f Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Tue, 4 Apr 2023 06:05:42 +0000 Subject: [PATCH 068/126] threshold real value --- qtensor/compression/szx/src/cuszx_wrapper.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/qtensor/compression/szx/src/cuszx_wrapper.py b/qtensor/compression/szx/src/cuszx_wrapper.py index d41703ec..cc38df89 100644 --- a/qtensor/compression/szx/src/cuszx_wrapper.py +++ b/qtensor/compression/szx/src/cuszx_wrapper.py @@ -78,6 +78,9 @@ def cuszx_device_compress(oriData, absErrBound, nbEle, blockSize,threshold): sample = oriData[::2] d = cp.amax(sample) - cp.amin(sample) d = d.get() + if d.dtype == np.complex64: + #d = min(d.real, d.imag) + d = d.real absErrBound = absErrBound*(d) threshold = threshold*(d) oriData_p = ctypes.cast(oriData.data.ptr, ctypes.POINTER(c_float)) From fdd3181f97b95122b2cf1bcad155556a43f70e1b Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Tue, 4 Apr 2023 06:06:44 +0000 Subject: [PATCH 069/126] small refactor in bench simulation --- bench/qc_simulation/src/simulators/qtensor.py | 90 +++++++++++-------- 1 file changed, 55 insertions(+), 35 deletions(-) diff --git a/bench/qc_simulation/src/simulators/qtensor.py b/bench/qc_simulation/src/simulators/qtensor.py index c64f728c..3b2bd1cf 100644 --- a/bench/qc_simulation/src/simulators/qtensor.py +++ b/bench/qc_simulation/src/simulators/qtensor.py @@ -162,6 +162,7 @@ def simulate(in_file, out_file, compress=None, M=29, r2r_error=1e-3, r2r_threshold=1e-3, + mpi=False, **kwargs): """ Args: @@ -174,12 +175,12 @@ def simulate(in_file, out_file, r2r_threshold: relative threshold for compression """ import time - from qtensor.contraction_algos import bucket_elimination import cupy cupy.cuda.profiler.start() prep_data = read_preps(in_file) peo, par_vars, tn = prep_data + # -- Prepare backend backend = qtensor.contraction_backends.get_backend(backend) if compress is not None: if compress == 'szx': @@ -192,46 +193,15 @@ def simulate(in_file, out_file, from qtensor.contraction_backends.performance_measurement_decorator import MemProfBackend backend = MemProfBackend(backend) - relabelid = {} - for tensor in tn.tensors: - for i in tensor.indices: - relabelid[int(i)] = i - - slice_ext = {relabelid[int(i)]: 0 for i in par_vars} if len(par_vars) > 0: print("Parvars", par_vars) print(f"Detected {len(par_vars)} slice variables") - sim = qtensor.QtreeSimulator(backend=backend) - sim.tn = tn - sim.tn.backend = backend - sim.peo = peo - sim._slice_relabel_buckets(slice_ext) - buckets = sim.tn.buckets - # --dbg - #ignore_vars = sim.tn.bra_vars + sim.tn.ket_vars - #graph = 
qtree.graph_model.importers.buckets2graph(buckets, ignore_vars) - #graph, label_dict = qtree.graph_model.relabel_graph_nodes( - #graph, dict(zip(graph.nodes, np.array(list(graph.nodes)) - 127*2)) - #) - #import networkx as nx - #components = list(nx.connected_components(graph)) - #print(f"Sliced graph # nodes: {graph.number_of_nodes()} and #components: {len(components)} with sizes {[len(c) for c in components]}") - #print(f"peo size without par_vars and ignore_vars: {len(peo) - len(ignore_vars)}") - # -- + # -- simulate start = time.time() - for i in range(2**0): - print(f"P {i}", end='', flush=True) - bcopy = [b[:] for b in buckets] - res = bucket_elimination( - bcopy, backend, - n_var_nosum=len(tn.free_vars) - ) - del bcopy - print("Result", res.data.flatten()[0]) - time.sleep(0.5) - sim_result = backend.get_result_data(res).flatten()[0] + sim_result = _simulate_wrapper(backend, tn, peo, par_vars, hpc=mpi) + print("Simulation result:", sim_result) end = time.time() print("Elapsed", end - start) @@ -252,3 +222,53 @@ def simulate(in_file, out_file, write_json(C, out_file) cupy.cuda.profiler.stop() return out_file + +def _simulate_wrapper(backend, tn, peo, par_vars, hpc=True): + from qtensor.contraction_algos import bucket_elimination + import cupy + """ + Backend is modified in the simulation + """ + + # -- Prepare buckets + relabelid = {} + for tensor in tn.tensors: + for i in tensor.indices: + relabelid[int(i)] = i + slice_ext = {relabelid[int(i)]: 0 for i in par_vars} + + sim = qtensor.QtreeSimulator(backend=backend) + sim.tn = tn + sim.tn.backend = backend + sim.peo = peo + sim._slice_relabel_buckets(slice_ext) + buckets = sim.tn.buckets + # -- + + # --dbg + #ignore_vars = sim.tn.bra_vars + sim.tn.ket_vars + #graph = qtree.graph_model.importers.buckets2graph(buckets, ignore_vars) + #graph, label_dict = qtree.graph_model.relabel_graph_nodes( + #graph, dict(zip(graph.nodes, np.array(list(graph.nodes)) - 127*2)) + #) + #import networkx as nx + #components = list(nx.connected_components(graph)) + #print(f"Sliced graph # nodes: {graph.number_of_nodes()} and #components: {len(components)} with sizes {[len(c) for c in components]}") + #print(f"peo size without par_vars and ignore_vars: {len(peo) - len(ignore_vars)}") + # -- + + if hpc: + res = _simulate_hpc(tn, backend) + else: + for i in range(2**0): + print(f"P {i}", end='', flush=True) + bcopy = [b[:] for b in buckets] + res = bucket_elimination( + bcopy, backend, + n_var_nosum=len(tn.free_vars) + ) + del bcopy + print("Result", res.data.flatten()[0]) + time.sleep(0.5) + sim_result = backend.get_result_data(res).flatten()[0] + return sim_result From 7f2d6d47edb37d58d418763aa0d5f6e9ba7d71a7 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Tue, 4 Apr 2023 07:42:46 +0000 Subject: [PATCH 070/126] reduce verbosity, mpi is functional --- bench/qc_simulation/scripts/mpi_debug.sh | 3 + bench/qc_simulation/src/simulators/qtensor.py | 98 ++++++++++++++----- qtensor/compression/szx/src/cuszx_entry.cu | 16 +-- qtensor/contraction_backends/compression.py | 6 +- 4 files changed, 86 insertions(+), 37 deletions(-) create mode 100755 bench/qc_simulation/scripts/mpi_debug.sh diff --git a/bench/qc_simulation/scripts/mpi_debug.sh b/bench/qc_simulation/scripts/mpi_debug.sh new file mode 100755 index 00000000..18684e8a --- /dev/null +++ b/bench/qc_simulation/scripts/mpi_debug.sh @@ -0,0 +1,3 @@ +#!/bin/bash +mpiexec -n 4 ./main.py simulate ./data/preprocess/mpi_debug/qaoa/3reg_N42_p4.jsonterms_Otamaki_8_M29 
./data/simulations/mpi_debug/{in_file}_cM{M}_rE{r2r_threshold}.sim --sim qtensor -M 27 --backend=cupy --compress=szx --r2r_error=5e-5 --r2r_threshold=5e-5 --mpi + diff --git a/bench/qc_simulation/src/simulators/qtensor.py b/bench/qc_simulation/src/simulators/qtensor.py index 3b2bd1cf..36c966f2 100644 --- a/bench/qc_simulation/src/simulators/qtensor.py +++ b/bench/qc_simulation/src/simulators/qtensor.py @@ -205,6 +205,8 @@ def simulate(in_file, out_file, print("Simulation result:", sim_result) end = time.time() print("Elapsed", end - start) + if mpi: + out_file += '_rank'+str(get_mpi_rank()) out_file += ".json" C = {'time': 2**len(par_vars)*(end - start)} C['elapsed'] = (end - start) @@ -223,26 +225,12 @@ def simulate(in_file, out_file, cupy.cuda.profiler.stop() return out_file -def _simulate_wrapper(backend, tn, peo, par_vars, hpc=True): - from qtensor.contraction_algos import bucket_elimination - import cupy +def _simulate_wrapper(backend, tn, peo, par_vars, hpc=False): """ Backend is modified in the simulation """ # -- Prepare buckets - relabelid = {} - for tensor in tn.tensors: - for i in tensor.indices: - relabelid[int(i)] = i - slice_ext = {relabelid[int(i)]: 0 for i in par_vars} - - sim = qtensor.QtreeSimulator(backend=backend) - sim.tn = tn - sim.tn.backend = backend - sim.peo = peo - sim._slice_relabel_buckets(slice_ext) - buckets = sim.tn.buckets # -- # --dbg @@ -256,19 +244,77 @@ def _simulate_wrapper(backend, tn, peo, par_vars, hpc=True): #print(f"Sliced graph # nodes: {graph.number_of_nodes()} and #components: {len(components)} with sizes {[len(c) for c in components]}") #print(f"peo size without par_vars and ignore_vars: {len(peo) - len(ignore_vars)}") # -- + def make_sim(): + import copy + sim = qtensor.QtreeSimulator(backend=backend) + sim.tn = copy.deepcopy(tn) + sim.tn.backend = backend + sim.peo = copy.deepcopy(peo) + return sim if hpc: - res = _simulate_hpc(tn, backend) + res = _simulate_hpc(make_sim, par_vars) else: - for i in range(2**0): - print(f"P {i}", end='', flush=True) - bcopy = [b[:] for b in buckets] - res = bucket_elimination( - bcopy, backend, - n_var_nosum=len(tn.free_vars) - ) - del bcopy - print("Result", res.data.flatten()[0]) - time.sleep(0.5) + res = simulate_slice(make_sim, [0]*len(par_vars), par_vars) + + return res + +def simulate_slice(make_sim, slice_values, par_vars): + from qtensor.contraction_algos import bucket_elimination + sim = make_sim() + tn = sim.tn + backend = sim.backend + if hasattr(backend, 'print'): + backend.print = False + relabelid = {} + for tensor in tn.tensors: + for i in tensor.indices: + relabelid[int(i)] = i + + slice_ext = {relabelid[int(i)]: int(v) for i,v in zip(par_vars, slice_values)} + print("Slice extents", slice_ext) + sim._slice_relabel_buckets(slice_ext) + buckets = sim.tn.buckets + print(f"P {i}", end='', flush=True) + bcopy = [b[:] for b in buckets] + res = bucket_elimination( + bcopy, backend, + n_var_nosum=len(tn.free_vars) + ) + del bcopy sim_result = backend.get_result_data(res).flatten()[0] + print("Result", sim_result) return sim_result + +def _get_mpi_unit(sim, par_vars): + def _mpi_unit(rank): + slice_values = np.unravel_index(rank, [2]*len(par_vars)) + res = simulate_slice(sim, slice_values, par_vars) + return res + return _mpi_unit + +def get_mpi_rank(): + from qtensor.tools.lazy_import import MPI + w = MPI.COMM_WORLD + comm = MPI.Comm + rank = comm.Get_rank(w) + return rank + +def _simulate_hpc(_sim, par_vars): + from qtensor.contraction_algos import bucket_elimination + import cupy + from 
qtensor.tools.lazy_import import MPI + from qtensor.tools.mpi.mpi_map import MPIParallel + mpi_unit = _get_mpi_unit(_sim, par_vars) + par = MPIParallel() + w = MPI.COMM_WORLD + comm = MPI.Comm + size = comm.Get_size(w) + rank = comm.Get_rank(w) + cupy.cuda.runtime.setDevice(rank%4) + if rank==0: + print(f'MPI::I:: There are {size} workers and {2**len(par_vars)} tasks over {par_vars}') + if len(par_vars)==0: + return + values = par.map(mpi_unit, range(2**len(par_vars))) + return np.sum(values) diff --git a/qtensor/compression/szx/src/cuszx_entry.cu b/qtensor/compression/szx/src/cuszx_entry.cu index ff961eec..0e351af8 100644 --- a/qtensor/compression/szx/src/cuszx_entry.cu +++ b/qtensor/compression/szx/src/cuszx_entry.cu @@ -355,8 +355,8 @@ unsigned char* cuSZx_fast_compress_args_unpredictable_blocked_float(float *oriDa const int sMemsize = blockSize * sizeof(float) + dimBlock.y * sizeof(int); compress_float<<>>(d_oriData, d_meta, d_offsets, d_midBytes, absErrBound, blockSize, nbBlocks, mSize, sparsity_level, d_blk_idx, d_blk_subidx,d_blk_vals, threshold, d_blk_sig); cudaError_t err = cudaGetLastError(); // Get error code - printf("CUDA Error: %s\n", cudaGetErrorString(err)); - printf("GPU compression timing: %f ms\n", timer_GPU.GetCounter()); + //printf("CUDA Error: %s\n", cudaGetErrorString(err)); + //printf("GPU compression timing: %f ms\n", timer_GPU.GetCounter()); cudaDeviceSynchronize(); get_numsig<<<1,1>>>(d_num_sig); cudaDeviceSynchronize(); @@ -508,8 +508,8 @@ void cuSZx_fast_decompress_args_unpredictable_blocked_float(float** newData, siz decompress_state2<<>>(d_newdata, d_stateArray,d_blk_idx, d_blk_vals, d_blk_subidx,blockSize, d_blk_sig); decompress_float<<>>(d_data, blockSize, ncBlocks, mSize); cudaError_t err = cudaGetLastError(); // Get error code - printf("CUDA Error: %s\n", cudaGetErrorString(err)); - printf("GPU decompression timing: %f ms\n", timer_GPU.GetCounter()); + //printf("CUDA Error: %s\n", cudaGetErrorString(err)); + //printf("GPU decompression timing: %f ms\n", timer_GPU.GetCounter()); cudaDeviceSynchronize(); checkCudaErrors(cudaMemcpy(data, d_data, ncBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); checkCudaErrors(cudaMemcpy(*newData, d_newdata, nbBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); @@ -1154,9 +1154,9 @@ unsigned char* device_ptr_cuSZx_compress_float(float *oriData, size_t *outSize, checkCudaErrors(cudaFree(d_offsets)); checkCudaErrors(cudaFree(d_midBytes)); // printf("completed compression\n"); - printf("Compression end timestamp: %f ms\n", timer_GPU.GetCounter()); + //printf("Compression end timestamp: %f ms\n", timer_GPU.GetCounter()); - printf("CUDA Error: %s\n", cudaGetErrorString(err)); + //printf("CUDA Error: %s\n", cudaGetErrorString(err)); return d_outBytes; } @@ -1870,8 +1870,8 @@ float* device_ptr_cuSZx_decompress_float(size_t nbEle, unsigned char* cmpBytes) nbBlocks_h, ncBlocks_h, stateArray, constantMedianArray); err = cudaGetLastError(); // Get error code - printf("CUDA Error: %s\n", cudaGetErrorString(err)); - printf("GPU decompression timing: %f ms\n", timer_GPU.GetCounter()); + //printf("CUDA Error: %s\n", cudaGetErrorString(err)); + //printf("GPU decompression timing: %f ms\n", timer_GPU.GetCounter()); // print_newdata<<<1,1>>>(newData, nbBlocks_h, bs); cudaFree(stateArray); cudaFree(constantMedianArray); diff --git a/qtensor/contraction_backends/compression.py b/qtensor/contraction_backends/compression.py index 4df06f3a..dc92d904 100644 --- a/qtensor/contraction_backends/compression.py +++ 
b/qtensor/contraction_backends/compression.py @@ -46,7 +46,7 @@ def process_bucket(self, bucket, no_sum=False): """ ctr_kw = dict(zip(['einsum', 'move_data'], self._get_backend_specific_fns(self.backend))) bucket.sort(key=lambda x: len(x.indices)) - print("Processing bucket", bucket) + #print("Processing bucket", bucket) accum = bucket[0] for t in bucket[1:-1]: accum = compressed_contract( @@ -75,7 +75,7 @@ def process_bucket(self, bucket, no_sum=False): # (effectively converting pointer to pointer to addr to pointer to int64) p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) decompressed_int = p_decompressed_int.contents - print("Freeing mem", decompressed_int.value) + #print("Freeing mem", decompressed_int.value) cupy.cuda.runtime.free(decompressed_int.value) t.compressor.compressor.free_decompressed() #raise ValueError("Done") @@ -102,7 +102,7 @@ def process_bucket(self, bucket, no_sum=False): # (effectively converting pointer to pointer to addr to pointer to int64) p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) decompressed_int = p_decompressed_int.contents - print("Freeing mem", decompressed_int.value) + #print("Freeing mem", decompressed_int.value) cupy.cuda.runtime.free(decompressed_int.value) accum.compressor.compressor.free_decompressed() return res From 95c0abf7756d90b1da1c075926fbc635ca996e17 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Tue, 4 Apr 2023 07:48:07 +0000 Subject: [PATCH 071/126] minor fix with mpi bench sim --- bench/qc_simulation/src/simulators/qtensor.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bench/qc_simulation/src/simulators/qtensor.py b/bench/qc_simulation/src/simulators/qtensor.py index 36c966f2..75ec0670 100644 --- a/bench/qc_simulation/src/simulators/qtensor.py +++ b/bench/qc_simulation/src/simulators/qtensor.py @@ -284,6 +284,10 @@ def simulate_slice(make_sim, slice_values, par_vars): del bcopy sim_result = backend.get_result_data(res).flatten()[0] print("Result", sim_result) + try: + sim_result = sim_result.get() + except: + pass return sim_result def _get_mpi_unit(sim, par_vars): From 98b6fa13c00ca2e58338ec4b4f42116eabff2775 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Tue, 4 Apr 2023 07:58:15 +0000 Subject: [PATCH 072/126] add polaris scripts --- bench/qc_simulation/scripts/large_run.py | 3 +++ bench/qc_simulation/scripts/polaris/entry.sh | 16 ++++++++++++++++ bench/qc_simulation/scripts/polaris/submit.sh | 18 ++++++++++++++++++ 3 files changed, 37 insertions(+) create mode 100755 bench/qc_simulation/scripts/large_run.py create mode 100755 bench/qc_simulation/scripts/polaris/entry.sh create mode 100755 bench/qc_simulation/scripts/polaris/submit.sh diff --git a/bench/qc_simulation/scripts/large_run.py b/bench/qc_simulation/scripts/large_run.py new file mode 100755 index 00000000..05f50bb2 --- /dev/null +++ b/bench/qc_simulation/scripts/large_run.py @@ -0,0 +1,3 @@ +#!/bin/bash +./main.py simulate 3reg_N72_p3.jsonterms_Otamaki_30_M30_M24.json ./data/simulations/sc23/large/{in_file}_cM{M}_rE{r2r_threshold}.sim --sim qtensor -M 27 --backend=cupy --compress=szx --r2r_error=5e-4 --r2r_threshold=5e-4 --mpi + diff --git a/bench/qc_simulation/scripts/polaris/entry.sh b/bench/qc_simulation/scripts/polaris/entry.sh new file mode 100755 index 00000000..cce730ae --- /dev/null +++ b/bench/qc_simulation/scripts/polaris/entry.sh @@ -0,0 +1,16 @@ +#!/bin/bash -l +# +echo "[entry.sh] JOB $PBS_JOBID Start. 
PARAM_P=$PARAM_P RANKS=$RANKS" +module load conda cray-mpich cudatoolkit-standalone +conda activate + +cd $PBS_O_WORKDIR +echo "[entry.sh] Current workdir $PWD" +echo "[entry.sh] Hostname: `hostname`" +echo "[entry.sh] Parameter p: $PARAM_P" +echo "[entry.sh] Ranks: $RANKS" +export CUDA_HOME=/soft/compilers/nvidia/Linux_x86_64/2022/cuda/11.0 +export PARAM_P + +time mpiexec -n $RANKS --ppn 4 ./scripts/large_run.py +echo "[entry.sh] JOB $PBS_JOBID Done." diff --git a/bench/qc_simulation/scripts/polaris/submit.sh b/bench/qc_simulation/scripts/polaris/submit.sh new file mode 100755 index 00000000..b86aa06a --- /dev/null +++ b/bench/qc_simulation/scripts/polaris/submit.sh @@ -0,0 +1,18 @@ +#!/bin/bash +# + +NODES=256 +RANKS=$(( NODES * 4 )) +QUEUE=prod +WALLTIME=420:00 + +qsub -l select=$NODES:system=polaris:ncpus=32:ngpus=4:gputype=A100,walltime=$WALLTIME,filesystems=home \ + -q $QUEUE -ACatalyst \ + -v RANKS=$RANKS,PARAM_P=$PARAM_P \ + -o job_out.output -e job_out.output \ + ./entry.sh + +echo -e "===========\nNew job with NODES=$NODES, PARAM_P=$PARAM_P submitted.\n" >> job_out.output +sleep 1.5 +tail -f job_out.output + From 5f1e786a5257b3ebf447ee1e278d232c9192179e Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Tue, 4 Apr 2023 08:10:46 +0000 Subject: [PATCH 073/126] upload preprocess file --- .../qaoa/3reg_N72_p3.jsonterms_Otamaki_30_M30 | Bin 0 -> 107456 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 bench/qc_simulation/data/preprocess/sc23/qaoa/3reg_N72_p3.jsonterms_Otamaki_30_M30 diff --git a/bench/qc_simulation/data/preprocess/sc23/qaoa/3reg_N72_p3.jsonterms_Otamaki_30_M30 b/bench/qc_simulation/data/preprocess/sc23/qaoa/3reg_N72_p3.jsonterms_Otamaki_30_M30 new file mode 100644 index 0000000000000000000000000000000000000000..82da261a907f7fad21327ff68fe4c0eb8d4b0f6f GIT binary patch literal 107456 zcmd5_2UJx@w+7LOz4sbLBZ?>&P^?%I+x23J4I4J}ICjAvMa3?5jJ?;Wv3H5sjiMN9 zY%%uUdqdxyJNr0$bCo}E=C1Yf);jMu`P|pU>HcVvW1leWDhr$xFCiyUnI@2P@1cL?qt+BtMUq-DKxM2esgD=N~Q z%=*8&x9t{eg?A1r9T|~2$ojW}g0!|FTI4v_1`jMAom@X==+jPf5puNeEE}A)l;&ib zEB_l_ky`3;u9<&tm~X^^!z~GHBZ8T1uE^5FBR5{A`t8Ly5sfo> z{tCI-`g#D7w-4h)I9lb6n{zDL_ZJCnKgNl8yz)r?%fhyAJ`vmjgfoeF1l?WdR;_r) zEd+NE<3v1C-Jdn!)rTsZ3GNWaiFn*8f9KWfM?+%>?l8uQcph>#4ciR)(pBN|N zajHtkZ0+ZbFGO%hFiyln8|=HX;G-kK1a}nUL_97ODZTQ>G|MxBi@`V%kLRC0hR^O) zZz;hY!#EL-mc5Q{Egly&hv1H5oQTK$@4D5Q`R=kS!Nnq+w}{8Ps9aIMK6(&Ka3?TM z#KW!htfrgyx1@gl3*$sQx}WLMCA509KaqD5<3v2xcX>SZ`-y|95Zoz@6Y+?tn(|nW zs%xoVPGg*iNBKUn>FdYN9Zck%!8j3*T|4?Ddp^+lcY-^MaUvdC;UjTX_lzt^aOW^i z#G}jKeQI@{-kgq?zcEh4V?~Z)ORg;N2_o{&Bb<+jN5fWAx~<7|K8)ZlV4R3Yi)!w< zuH-34^V&s>6Y+TaDLibUUoM(XFJYXBhpAY)X3bXZ{+IAx#yAm=A5Zn)c6eJ|iu(uS zL_CiD;lW1#^7J{8cLn1_JjNaTG&|(pS(;z|#W)doyRPQ>GYcjmq)4&JBZj1u@}MJ z#5fU;oi8H}Bx`+OD8b#rI1!KVYEv#HM%&nuYzu%OL$h(hmA|4@U`+uJK;bT35 zdw_8w9;u@9kMk>6pT3t55w47g$Jv$(w|8!ytTmDM2;)RN-hIw>#5wut?+NZP#))`* zSpBp6?spj{5!@4u6Y;3%T`>9N_K#@(dx~))9=$`O&Ru)?Ih@FQhH)Yu9pA={uedjk zE}EWWoQOxM8+}@5KXZuAGcPbs#N$+>BP&Dp?JrDtUt*ky$NBAUXWnJqO~=bCj1%$L z*QUml_9e4aCGuV)Tv-v18B-@TE4i*K)$a|)iFgF>dRVfHw*PM|&+MeHhI?4o&wMrP zJnn7ug`L|A5R@*=r3b8;?~Y<_)zaM{YXj@58dlalf*`Va%GBt!FvTe%%SEC@TwCva zUn$(jy9+@jmnad{3%y%bDE)r=L4rylQ6i=_OKaZI?HcSNsP807MAVYI`LYUY`%NaO zloBQ4`8lBXnfI;l%p@pRi4xKDO1X2+{;Va66I3dR60w}P?%y{D!hY&OP^kfx-0D@~ z{7OAF)0V;$sKe4An26-T=HBc3H_6n9NK1=g!u@`=x1K%j`QsA;ONU^>|C5@zhc5S+ zR*k^YBbW$4?146qM|t)o%S&?x1QQXMx1(%1N2!UlmFcF1gTaO%z$`eg8h&ii-!2&t;?-iR) z-2E>j(y}3#h{E@K^ZdBvm!athEIWdUD0JRAAx)|Q9whCXb0CSZD= zCxVG6INdjyOAUC~k-%~xn218(^AG*ox|@#@SZ)LpQJArL?E17t9*`x1IS+z~C^*-C 
z+xWZO!~Kc0ya*H6q7Jm48Q7DZ}75y-N})G*&V?| z6jmQTx2ob~7c-Go9Kl2sLL+W{uG;G+nR3k~5KKhj{Qc7-5B5Gwb{u981QSts(r4JD zDmS~)H>)Iqi6~^b`*Xgu9ZMY{x_Kg)h{CPQ5!KJ08L)xCN+Fnt!rV#AulVLW7D-^G z5llqEbMb(;b<35gNMK$FCZg~`z4A-BY>WPwPGG(WCZaGfS)1G^R&@T6z{(()h(d;o89Rk$Exez= z$|9JE!qi2Nw#J;E7eZiu2qvP?d@-BUV^$oUD9a(3h{EfGZ#Qo2*KHt?Rvy7b6r85V zrH!04HHg3}Aee~4-jZW;U1832kXMv2Sojw7E!<<*$+S^KS|tP%QE0_FW?S!4o@VmO z2qvOX@t;G*YHiC!mWyV81QStk4tY1}TK&DL2x%1r6H!<+u-B*Hho9OJSXBfQQ5X_l zaJk3adZP%e8iI)^be&rLQr{-IQxI5n1QSvCrN)CJ6~@n?mi~xfA_~62o6fLGN2uR^ zLNF19p~nX~J#Lnd><7&?Bn+0jb3V)o&FSBgYzNIX5llp3<`lnce}ylguX9r!);Gvc zm{7iH+uu!IUe}4GjS)=v?Zm$5k4+-qH6XB`5ljTDd8G~88qO&3D}hA-EJfcSFUYEy zrXI<-v3}(~1UDSvU=BPOeDq`K!ykVlxE=`SC5&BL-#_$u>x<0@uB}wBoG`X;@>EyL z)wzA2z`A2`Vi~$`V9owF_7tTPQa_9n5u0%3V}YMT-h3ePnn_we5&x6}l64R3mzQRU zaY&rEz-=GlU$JMUJ71^_O3-=MNWy<-cO5Bn$QFNcY^x(FuTok~8tGk=&rnNFzh7EoR=_mNr|!Tdi+@P7ouGYDXyFtDIv}B z_6vQOl4hNT66PmdJTzC`34SZ8%plSlAefMrac{d7-I@gsAg~|_D<|F_*XlLOyKLM^ zlT!zP@z9&9rb)OK()+Q;)o$rr?m@d)QFV zT?Y(6IFktOq7J79PN{S?JCQdSXZ^A_^P4tE_lwBZ(Sf@=*p zYkn^$-i%@qi#Cm|AGD3YED}~$SmE<|*TMEh0!#!JidenG(DPk9$@gmY0=)^YH^vFt zw-Xw>&zip{Ilyap-e0zD zKSbY*&IngpI5~ZA@rNEGtJWgC6A{iw$n%=9r%XOmEt*8z0FFCZcqQ!Go4^YBR?##S zhQyhKm;ZV0u$jMHmRnhLRJ0 zueu^GmQ3`PUl7pSPh`k(0c95Yk7(gSB+f%XA1`5Y_43`qXO)XsM}XrH&>P;j6Duxl z4gT}>Bm!K4fIcQ6G3=^Sz|fxV9SCp~0(!#?U2{%I)v{@})FZ&<2E-q8`WOOSi-E9Gzc_l%NRN?S!w4`00lmwLWK?8?&tmO*4wBp~ zkqGEhT8xR!{RamN+f}CyBftq52+RL9{R7)h2)IB$D2ygR zq2s1XjaSdNydldv%R~Yc<8foUi?OXUEtyTIJ7ZuOA@S3-T4!R0RU&6smKg}>Q(CJM5hcSwLG>6QH1; z9Ny2R$Ld{VePph3+ zT)tkaLn{KDLx94MpOY7wusGU{oU>VmBcL~|ePgm-jHx;-ge+(*a}m(TOXTgoi&{-D zH1N|@LcI(FVOD;YAvAX1gsLQ!S!QA&tQt%ag&T!03LQu!P9s3^ULU>pCi9hLAp-iqbl)r4cXM2pG5TKjL_qH{g8J}zmOuZf+Sv&8It28AWy^#* zQzsla)i;;`7hoVvMmZ*)t6YE7e)?WVAfPu)zW-Kyb#`vK>obYOp%@7BQQ1o^D}SGU zH<^ztV==It2;rOEzEh)q&PNtGmR}Lj2PW1cofn>7df+$m$<4AD17V|ZEyx@(ZdIjF zqT^Bwgs(K(_yWHcdK%?PfWr{b+fNLNxRrk%*q!PcmAD20ePDL&yR|{orN5KUBN7K7 zpf@Z5OE+%W=$damn(V_c&`-E0JZjg=CesG(BoapupcoXR&z0S~jpfNofRho>2e!!b zhd*Oa*l}5GnaX+_#PMWI_&+jB+`CI?$zp)ZdVT?u?GTrml3nV__zc0j}@=t zOMtTx&a+zAf?`*NaXcV=%Csu=;wrkyTwz zS8q#n)G!dv?RPZnw55CTd~}}aN`S)Qslrz8se8US**jT!5ulimT7GhS+qV5dvH~#2 z%4N6-vR;L*&rgPJZtxnbXg-c{A}(cCW(_LR)zzQi4oI9AB=D8@A{R~^|CVe|ERC>& zaKOAhS6s|Vm!?gzKudiLg#A;I^X@G>4_QLfL~9I$Pf#s}%{kbzUwQJu-r|aYrZU2b zep<~mfA-DrJ)t%upf}8K>GRax`ZjD-7XoaBfj+|91D6Jd)(p&0f&jB2pb7GFC(pap z`Zg|3!eYrnfWqIyN{q@BRr?fOJZ2|Ak(a}2{CIWX*v7RmwRsl`3?wMc68(PL$3Y}F zn)e_ajM)nQ4~piCp8p2Rvvfqn-oAqP&g3U<`9~*Tih<@QQlJTra*mgsb$xl2%uNXH zEW*LAZ0L`1Q=SzoNLNN)m>4o>VA<#;+p7)oA_C7~f#U5em3{xLxV&2)5ZqLW^YRf1 zqUY} >zX%)epkT?@8`~zM*y4`k7Epk9;@gWKd&-9#=s_=o4naR1ir7;G=8R4bo zJzA{2crGUCdBI|Ah56L_`OSyZ zH?ILQ#2Y@0?0ivq`=-LJHeg~)B@7g&11a;B&o(f1KeFyNzmmizAMrZ1IzOsR={$|y zh`e@Kp0CIagBd@kemp7fds-4>hTTjE5CSHj~shja$|w8G4Oo<@BCvEYSUp@90SXV z@%qDs+)eMt#HGN@77GG;dy9$h!>z)hHI`~I1n7@}a41o}%c)1D#?>I_sFunED5z7v z%J{lp$P60ipD+;i$i4eNZ+|v!WE`e8ZG7-y+Ph=qoAcWu;eeBMZoa4jt`&_^u8j;tJ9Z)(nAG#|zwW;k%myt4FjQ>rd^ zF|&Dv#Chq~5}*?TntVm(OmV2XN2a#PPGX>Wjb!$Mm44adtq^f0 z%yAND@)hH2ikt6<=I`R&bMZjL-mu3=@A-C;N6KgYF|ox30ZqPQVy@6>tNg)LXgELe86>Vqc)SOWuL zi!s(*XWjPQ>*%5WI?3z>AD-%a?JxLnp9f7ipCrx%8O&?VotExJUFd5cjEQ~4fIWGk z&5%uftB`lk{2B>_`LX=Y<>#g?bialzF+V^!Z;`;7y02UoQSzsQ7-z|kh)rqsDYrbx%;4>~(9T262m5Drd=?>?wH@5b8P7YXhT!ohOR)a`JQIdgND zBDk#*=LLy##wO2ZliqG?iE);+*bq1K?me)I^t zHWKI!hvHsyqL&@Ko2vp5xIhZ@f?=Keb9d&vF@VI_JQw4{3!3E**V$)FW-3DD-Iq9% zw@3}6eGXQh`l`=jjI;cJ6g0uXV#g-iK5R;JY7GXO7fXR&u;1%=dh+!i3p&#eZl5L2 zFeo(+d*zg6)gA>Jkhdck?dh3wBFrqBMg0OKs_u|PPJ zepYlyr*Um_(saE92`nStRZ^oF&_n?p-~ zOL0>hM^vnXfpB7Vpk%)^qtpDU 
zv1FxWDTjfuF>4WU=R?u5k3L{(^S{y-6MO{w_m;;Kzai=V#5hYjB+yhwOv)ahB9bpa~A4_E@Uc@G9Nz5CIm# zK=?qoxYo_QN2d)UzOncspf?avgMygo zAaN#mqfXV$_qh0*Ud^#QiyKnV1gAy)f;NXVT^UOjES6jt2qz9Z0v9~YKmV)=OEkYl z%syhfb^G)i=WaQiju2e5#CbuLV6W;cvNZ}k@HfHDLpWGBv>AJ)Qmg3GPY7-y!odmb zq+(b9+E95io#qQ;Lx8x^hg!3;=J%v&vor#F!`Ja=OPd#r{jlr`X10_hK#_W_5RRB4R0n4S{2-%~Q?|y!G7wAttso!$3$PqpQE~ANJ2`atLb4jDRLs z*woE8<5ZS@sp*SyOsZ&tAm>c8^;p*YzFvemD-kGGHv8udTa~?}hX(-$AfPua#Us0? zbqTvyjb@wX7zo?4S|1L-sMx3t-HCR>K-etI`s1G!%U(Xuf~n2_AY0(``TIE~S6*ct9X;^3(!L39%*uTy9?>FE|7k8RocOx83 z%A?i}j_WY%CCPH;qY`I=?`Ffd{I+2A(7Bfg?|O;zg36Eqi!$VEzI)Uwg1aPfCOAf0 zcezL98~N%of}19BUT`SU{`KoSD|cM)NpQ6Q$IHXTxz5?+E)fqa6xl{#e@k&D_*$Rw zVr=j9H-02%W9H2g=LJWglMl`aYy5ca3nK5S#F^kz&(5d{7yi0-l&ELPfX0gnDvw@O zIq{+W?S^E*XZaBUz2QS|_exXmCiC!aiYeT}vKX*@hk>xOjca;jN|D#2PhyGY4bo^Y_|h|e%b@ZDW-dBIaDPgi368Q}VNzcOQH-L;-L5T9}J|k+j?6ilzoi!2A8@?)}h!-3MkED#PrcQsDmFZQ%M`5bG$hXlgGd3|eH`GVh@tRjxS2bgI z|E#IIjwQIq7$*#gZnvqXTOPIX@%kM3&^p5ECqaibT5n^{~eB z?pUBD4FL)bL(YHtyfnOjAp*>Ufp8Gx?UZ^**J`)P4%kwF0ENWl{)^{5zI=?Vn$3r# zEne`IVn%~X*&mJFx(73x_hFoPlS|A#He^+1XL9CcDTayRQ+3W4ZM;+e@~$iqxEu+D zy<63v4h7`gbcqDd;)w)$L;9cccf5e|_L_%0mLEA|Kt z%1nfVr9keKtA}@J6hzZuc}xr+dLlj)YTeKM67|jRNFWq-_zt;O<>pP-R#-txJuDD3 zELXX5))ZNPtV)0>5zquDhaDf>=r%oL?0gI~ug1*6hrRbTYUFl#W;2Yl)I|clVPQW$ z@1h*b=X%qP!W<+JwwDv4*8iJj#J-zEU2u79N$al?U#NGaN)_?N_pOo?iK_JAhzo&pdW2g_9FI%OB_Y;}U;d;g4(lag#sp@W*}r zcw~J9N96An8r(gkM{j7f7t%JgbC2$xjcoreAUHHKB7Fnihfn^CFAexow(HZrOK@mN zq?Nf&+Yf$Iz9XhuU4vEyR%C39ngqBOABCYNHIT$g6$o7(XD!28dDQYI&Vh9pfj&*bp&_iBsNlRmRAi zt!+&a>CcE%q#b!nTpek0O4vib%yeTlNu9N1Bn6HbbN$3(1T1>PWLt(m+HizAwp(#_CA3Q_|*=)b8%JI?`N} zv;`s+=}6v{S4WzclD0&oA|uJmwCYF;P|{Y2RAeN1JyIQMAxhdBk&28Y@7t;)Eka3y z5UI#WvK4_k(qfdf4I&j8NnUzYM_PiCwnd~OBgtkT>PS5)X*)zJGLmdoppMjwlD0>r zA|uJ`@#;u@C}{^E{c5ucb*yD6YcOUNIY~B+P{&%HvL@IJLLF-*%GwFrDzcKiJgttk z3T4$Wt4K?-L54ck>XfxJW)*o!Ub9!nT7$AO%qkL-yzZ`!wKiq#f>}jol6@f5vDT%m zT`{XjO|qGVI@WrWwHsy?xk+~UQODYlvUW$T@X1y-a8k$Gn6mc3tRgqbE>!ARn^D%D zm{ly8WFrf8tSu;OFU%@(lWeo0jR9_y)`6H+u}0C1hb0VB-@IqV;xCZ!!WDJO|mDFI@U3ibtq;Pxk>gM zQpY-;vJS(nA~(qEfk(6}=W)-lkdS$W5{%nmX1clyxj- z6}d_FT2;rooU)F?tRgqbjlVs73A2jaB->{iWR3W)S$*qo5C%tzB7`qLKB$*Lu=>0< z!TMt>iLV*eHLK>zD-s^WHmz1^O8O7he~Swhp4{=6d1_nX8DA-l;D0Z^R!|thOv!{+ zvMd69#ylteWq-@t5n4g<1bf8eX7!aUoj_Wh#af9d%1#%K&}TFCEjj%v&9iwL7@f<1Ydo%bLuIql_N}y+ItI6%>WAf}oWw2|>O(fwdB= z0olyO5n4gv2zv(+lw~7Gt5~d+SQ*J~OpeeBic8pQ&`Oq;AgzvLt;7yi_I`4NR#1Jy z_JCHh_5^8l3~MD;%<}4uBea6@6gCRRl`Kv{TE$?kL~fVeJshDG)UU8Vz*n+*1!;8@ zYbA2KyqMw$t)Re#r2t>a@)o4k5v-NS?ef;OBea4F7`799CF@_1Rta9naD-M+BE!yt zRfL34qd4q#;T-o0ncZ60@g~MJz z1Z52l^3?&XmB{U~GoB-~f-)Ue6MQ9$bdXjFUMq5hR#3;o27*?yiU(vq}3(h*ugITOnSzLK>}$X7eDRwB2{zBP`} z3d)^WKG5o`KYX(TYbDn0vg?i`w1RRd77kj;S}5eJ?N}?ZZkHYU9HAAIOR?Obm8_LQ zT5ZExiQF#xMma(&D5qjwKr2~Gg|ym=wGz2qb|Q0xR#0xm27^|z)(UC01#2bN?Xr8Q zBea5YELI1!lC@Y!tIb#|aV8^gqdP(?DA!^?f>yFt3u(0pX=T@O#SvOTITyPRSya|? zA+0uItwe5@7fT(X6_k6i6QGr>^+H;0z*>piF0afxLMtc-V@E(MSqp}=T9367xm`AE zaD-M+F2*z%SF%<8xvt)SeD1%Xzw z)(mO&JJw3%c6qhe5n4ex8XF2)$yzj|)f%jo$nCNzsw1?5ay8Zxw34-INUPOYE0Npf zWn@Qa1?6n)DZGQSmJMmO3TqYrg1aNMf^s)j7p>dD2HRGz*n*s z4r#R#YbA2KY--~Ot)N_vl>%SMS~;ZE3al06_AlLx9ibJJ(=iKZC2Q%BR?D$gBDc%d z4vx?Y%I(-Ch@h;sLs~7vT8Z2)TlP6ZD=5cfuHY+Ki-)vYinS8CUG~y+gjP_l$C80o zvQ`giwFGM=a=UEL>Iki%oR8H9tz<19(rPi*O5}Fg{?-v%LAf9E0Ig)LAJS?O)=K1d z*`2`=T0uD=3k9uYEg;frA=XOdcG-E)5n4gHAR7x>$y!0A)dHlIU58~yXa(hjEInu? 
zYYCB7^RZSUx65|1j?fCq4cQ^cS6}^Y?|E1&k=tdDMMr1_<%sNO@Rh7ZM82AfwGz2q z_P%z6R#2|Ud_gN&tBACkgS8U5UG`>igjP__$oxPnS<8sDnvJy*xm~vSa)ee;?#Qme zxRSMwNUK>`E0Nn}OIk;01?7+|7Al<*-+9OpT0yxa>jx2(wUWqJGqA5jZkIif9ii1& zLCCH^1Z6EH(&|^NmB{U~39uuy8V6dP2d!kSCDLj-(#oz;og=gw4_Z})2+CSaq*Wx= zO5}Fg?bH!kMSxcO!B?549?7_|er4MGbsE-6iLb zYu?@iEp{lmsFbwVDN7n&Et-Glhf)77z1!HTDXtcEt`}O|d)mD7 zecBOy82?N0OGV+$Vwpf6`C75;R=Dm1Wqk1EjO_>0w$1}T@x;Fcl_wighBq;BeVp~j2p;H@aPWCmo!p8|#2F#RSCXib2POzcid;1r{ znA=$)9&nQ*!U&(R*(0z|_7^wKC&S29NB+8i!zl8t{4E=Ygb{Kno97~4M43pDTCnk8 zg8yAFe4tV7*!1~S`hP|Cg%*%%)(4ORneFpN;yvzBqCbe|`QPlh?3eHU?<#bhsA z*6Nf>-&UB&nFY+0vpDhvtk>8c2($fSElF&x5l&YGIFckFLLOV3f$U^x)si4**37%{!mt#TcO;ORe;ed+t)L$ zlx|~^_@rCo+4m22tJp)z9{|uP6}a_Rs|8FmYQ~q*$i^sz z)z3gBfAoQ!H|lmdtoB(&8J8!+$W=p5y^6yqw!5--zwR>QVYDS3M0*l^mj-+%fA~h{ zOj4MnnLNoMzH8V_an>MzlSijCZYE-cGh#LnER+Yt2n5I5>^6Yx&$g4~203~@Q8z#- z8e0eZ1p7KPgM7B_B`4K=nk0ELjPdlpj>9MpvgA*k>XgRA$ehwzUphYT>N1mL#+NXl zh@XRje-|Pke?{3wYKTv|jq8s{Z(z5HU9$W^be+<;+YlpsO=3U6lq5eTA<$m%#dh$8 z{p|o{Zigybb_=Lv2`wvb^_A`eWqj~On+1UV@^iLvJ{e}NI`ZdD9A>dgmcN~@n`k`D zth@EaU|AjUcEHDN_5$1{ziit`bsLftCh0b=KXThG>^8CMls^=wQz~#p{OWHwLxl@#1t9zKmL$1> z|0QuIQ=0!TrYvbNKdUI?^5p2|sv)P{!C?}+RQW63>cYh8a5J779c|Lh6}g+aP+i9) zg-yDJi$ZR>i`^o2sq$AJbxNh97s`p)6IkoXS|Zj^o!h4JFoY_Thdc%)iR>jzq2s^Q zBQj2^`^;dNxF|$=4~I$YGUYFU>6A*t1T~Iq6NE_?INIJ+oztM1{G~@`-p6K&U6}kq zd7V)1MF6@>uc)1a#=*l zpR7L@uFkD&x0RN)bcEXug4-5=+vE>=8|Ra5i=o6E;|US(rk$lGpIo^%V>4~?cL*ezmFf4uCh>&vTTZldzl zM~rZhfK`TxL*61lpv^GangGcD%ALW{=9-SUVFU1W0Y3W+Vg~uM@qlsN23I88pI9K*Sfq#4g)3 zF;1%c%wU+fC`9@Uhe>QQHhFV`WpPJu5rf-mfQ%NYwZYeL=d2k>=;B}2(=ElUSHjR==MFvxd>Csj_W`!q@N zWEkV={{n|mY^siLXkYoo!+JHP2};8Vmp7RUcud~hWG16afG=+c>uL2E`%ay_T3j9% zswfOx6dJZKaTvs=?@8uC4OTuqWc~ePold^i%en}# z2~`v(t{?KpdlV*LaZhVp=?vjJs$Z?FGE8hZUk6rH=a%c-&0MG=w{TI&Eg!I3#4hku z-F%OWzvv8J0SozF_SU+J_v8MG}bElO1zt$VD~&24pCfj{}1Xjq;cB3u+Q;xl%eSYw{*9x!b0ZTC`%c3T36xQH@g12olW*=Msl~||+bI@lDGpWl$kaC3NtJerwI+|5qD03Ij9b#1# z;Ms6^_FEM{Dz&3EPuNlF?BJr19m$9tV%e0nf_LDw;R~-TwS%AYOJ4IH7<}Vl@Yy$D zP11TojyuX{aUph!bybB_3wv1RO6p3{C>tT)M-4jil z&?l{8>8-lO630^??`vk0wbu1YgMqgIph%XlSI_NKa*XJk0HaB=}oum!($+w=1lB5<_;#aX4 z@zZLi`Ll0^?-T9Ugodq$HM^-i(Bg+Yj0-gs2k93s3i%}!@rzh(Z1o;wd0YBHa;1LR zXC3FpcXiSZE)Us}n%E&$6MHOGYj~AzcSxxnw)ARv8z$}GqL3YFh#g`z5pw?1=cVEO z3n{f@CZ8twp|+xN#YG`I(h@twYGMWpaC&t4ZzrX8*mn24=0=enTpqF`9kD~K7FMqa ztkU40(n(s4e`^}x!wS)+$Fya|YT;SYA)Usx&ABwu?n_|D#&^4jHUps*ll!YGCqA^l z-EetQXuq{(vz5TG;YMc{`4;h#8e~UCVuzT#U-ubNvt_3}LlbR>HQRqHYJA)w+DwF2 zOx`ok-(BdJbyBiK($NhK|xSrBc0LM!rU>vmhaEbX$Uy-HgCNdnOpAhaUwCHG%E@A2hh zcalQ;tv6Cqn&As5yAR*!7LqDU0hat+W&*fff^uzxGd_%RWKX;s?6mtcdXk@4BCi=hR#&E{}Wt$FJ6}G6^i3 zB&8WXv9SE$SowVgfuJuID-9s~rdY-T%@$kzz&1J}kos{L`v(Fk&%%ro%0P16rNb=W zlR%28dtr6ghFP{b*&Pup3M4eOVxz!L+0jbsWrkapY%~Ni`g%+48lELi@SF7$LF^8t5<&aDeQd!?;#-MViPIAp;%Hz(P3C<-}rXajlzqk4_t8x4Tyq zXlQH2>VTcHrxkJrw9R3EfZy$V<{0xkyG;UZhFj0$V+=ySra=Bm7=8&crh~Eem%~)~)uNw?v6jFk z55#O*=|(1rP-Z4-prr_Ltk@9TyAzgn=#D$Ll{z+iM(f;t*Jy_2#)}*Au1e%dzi`!1 z`Y1~LB6chVF2Akb=^ity^oyi4Lvt%O3*KSb*-C;e`qy*TLr>XupU#ssK%`^KTOErC z^$#9wEfCs&?J!1&xbzYS)(E&{eFNiaDS8FsS@;79wb&(gY_jdcrZlJQs+Nr87G$V? 
zZp=dAU6+T(5@m)qSu6rT_C2zcd0yW;vK7E3H;$~CQP0bmnxWAZI|^f4cDa%ui(clyjswcR#=+Q#L|l4dBXa`J%c@4U z%BbhR5^6Iv3}Ll^Pj(A2&L<jIEPDqEmR z6U@-|hb4o#RrdU`kt%vaxc-vVQj+*h>_iJbtZQC+;>``EemlnfR?!jT4O)V*K``EA zKM=-A75R;eLVoikeiJ**DStP7U--w}b~hx|tOm`{EsJ@Bow8My)XNNQW!Mh@vhOKl zEU~clJdXF3#gT2diZU)w<^`^r^oOMsiIdp4UMaI++108Ko_rbK(^oBVvi-xj9D_## zN4Y^g)j24NJDdwuG?2L{yB8>ks*V7Nwl9q405Unqv6`Pk;rS2YaI^Snk5O<3#p z2Of7$X;)4BU((TkDByt}T5JxuOg7L`xnf&vbs3v%)NfzjCpp9{dhsJ00Xa=pJ~Ga# zC{SD!Vl77kCH5XI0`7b$TK3Tg2fc961&dV#J7o(jq?dlFgDrw^+TZ9f_68s>y|#`m z1C3;19ouQt^Ir?Ke$#`M144Pt!#JUwbGYuv% zkwDqDl_~R;&o(f1Kf8-{p|%w~ zt6}uTkaqAL0kWeiu|sTQ(-k_-zPARB5)kVU^U!sc(%H+%0RJXd_@?g&M)n_pSe&) ze&M2!U#b(oh;3miulMtguj!jXsb6d><#Fo#VmsF`EKk1JTom%lkHjxx57oWW)Vs+% zyxS(lFW)+5`xBuR3;y2ypSM37H!?2KvS#!u^@(MresBW0l;;*A&d2*O=QOI#MiQ~j)j1-Y?oMP84exGQ^C2+ip zDD{ihY&fj>u0FeJ&ftU4#(P!T% zC`k(oX=pq`Gci=Gu+rzLyY+3@s4htvc;8CIylN8JS(ng?^}_hPi*hWV>z(*gq=deA zt>p7NA1sQ-I~RrQun;@M{Qluq;m{gOwV0&1?^~JOw#G)hdYcTE8y2AUJu(cA6TYZN zgaO1*F=0n`PwNtPubTBLt?pbt%>G+R&9}x0Kfpk=^$4w)u*+4hoHa$(AFC#sHi2=1 zDD_qmEDT;j*(ZWswK_$&HA(*Ef5~=P1rh2^BG^bElpP|B6UrB#>yB9K6VHk@Ns;sJ zEjtfc;+JU8CNQLt=+mQG6&Iw^r+YK~oxETZT2fBa?P;QOJ&;i5+4^v1eSttHb>++O;q+ za^}c#X0eVde$cIGTyaszjz-uH_z9`)x#D6@x-@O7G#<8{J}=8rWCxdr>}X8v5DSJ& z&3m+1d+}WAq}cJTiGU9)MB9YWiUmXV!;z=L>-RjJXxapJZhYU1XqysRF~3*vEHJrR zuGNVz{Y_y0v6aZMWY!_5C~Caq2HDY!*deCx(beDg5Bq2JfJD2`n(e<8H9qbTZ6Ki) z)AyT0OMgpoQyZ6P+JxN4Yd=-Gk4FO8(VW;JCd*nM4!@|_sEysvR~Sk{JNe``*BU!R z!{Ey*gXQN*jSDp-PbOzB3i+i4@r#%&kE|SAZ)(nAcD(}si(kGqtax}4ZA(Hc=C(2i zJ6$T!C+ND;VP$h4pBNOmkBdTfv?6whxozOmz|fk38A>GDjs)IHZU&-ljcMU`BU%ib zbFgK<^1BjEn?Nq*U%3!%5TO+_NA^oII?bONoA^cu3DEKdBBE_WXvNI2bxc5>`3G3W zq*(jSEt>5p+3;lm`L1zM$d0ze4lxz&4!k%nCgaFHiMAtw!NR{rBieR^R?HlQUoP8P z^jsn5MAIe^HSRq`+n&&hnWMmwGbx>qwn}_SKmxS<5(1*_Kxjpx-_Zma|EyT{@_Ck|IQd&|B%;(mh_ZulULwDSB2+<8j2(ePbNd!ru9W+H@*nz=eQ969ZUUF=UBa?wwVQE_6Y589>;||%p0^n$jBu_MDSB{l z)saUv9BTOKfgKGyZRuV-A8Jcm%=(wcxq7^-=AXUw`aj}1wZp^H9ozn;mqf;;T*W*Q8onR-H>B-kV%w4Wi2 z5hCOCW<0DNaLG2kuA3-hq3)993k#YLAhK&W|qcT8cay&%&<6vtsF-Ds1(h zy620d#ia3!$pWco^~+>zGgxbXlT4Xs^#d|yf-x&k$82gT@@zZ{yAjWd-9fHb85e|X z&stvTn4MwuEbDB27_x(VTg<6rmB^EMo2!O0NO$5Fv38vB5>_4U(87>7mY>|-wrzh9{S1TBF{}5}V&`C^C!1*@Pw35~*gf!sednkod4m5Xk@-JU z*Xxyt>^i7rUxsLq(6;~Ntcuc4JOg^;P{PjymD@S5#_g(8hbbMi&e^OkViO?-^6ZOoRzrmH9pIu6YY1_i*hd!`;j>t~o+GmAtx;F86Ttb|e5e=XJ~-32?~1($tBj1$UObKQ}dp(NH~AH8{g z#abi>2b4s5@p5uBA zRQZxH3wPWfj&PjbNrWwdAuij9*jN=gjvIg+*Oxd>>^bX?UgmbALria#jzg4s+bxz2 zMu_aWg;07&8kPZ2_U&klWf$hfL)k@H@?hnxqKwOvxq+*O9MX?OO6*}Hr@XH)ZJw!~ z%1ALYx8s?zAk zvQEo5p-h5YcjVbY#Is^U7**_8-qn+%(T#Ei3#4c(FKgup$LR%;Y&k?qRzn&ml#b)N zBgg%M9S6U)H!h&eLjMshQ0u(^?l@X`C29G_3(*cHv|{t(7K2o=IR&0SU-60?Qek@Ts?3?_P5dGEFBlmfFZ_h3kg~R2cD#Sm=jG?Rwc{+Mu0@9#9EfLqL>%K*>tMSn~YRPz#~- z8>%b_Q1;hT4UV`}*@-9gyYB2yAhf^eZjf-Eb#yb%YG`Q4P{uQ0C<&$5dX&A?vhw%o zcc*bsDD|#e>{pnyWXmm;3%9~npRr6v9rxv(LZeKtiDc8j09g>pIIE&aaZ!kM7>ShF zdc>{#^T6&@*Fqf>DZObHI|HLzcFaPG>6bm&DR77VO%G%50OHbX?%4OhB@6CY1*5M2 zTCDXOAgmiK6y)^}hm%-~?Nm(Gi!oJ)g*>4l~2 z0Jy`xs?=DlSySo>{jMgv0EG4zH4PH}%M&bTRrK1%Ga#IVQtY<|csk9lHQD@~gF>ly z=3;NaXR<99^ESFD`0|plURIy6lYzaA zowwL-5eV6K3+bhoy0bWlmVKSO!4W>1J2ud$V`boNXGvVBqCjy`$RQ(1pv3mF(x{d` z?&mg)aZsT2-dpTnh=FXp#l{*n^o!$=je1inb{Pm|M=Q2K86hia6=htWoTj*H$cj-o zQt)%AHRps>Et_VG+SAZw+zy4Q#wL#+Ju4c)TpsevXyO;K;oelK@#^`OH|XLXcdN1j zYW=D|D-6@OyytKH6O$YO{A(35bPO?6Y>=;)8(G!mbam7b$he`lbv!$4^|GRX@qGcZ zV=S>lY-r!?_MIB_b3R9XfT;$QECvE5J8vPy^cywoEUb6!uhSTu9Io+KtGZFoe#vp{ zq;)~SxKKrr;-Zj4#*s*g{pyJom$nA~`FfJd$$^FQWnm?wc1XY6u!^z?Rpb}0AM(q1 z;uo@8H$TSeu*G{5j&}WE515Ax7_uaNf|HS zI&$Tc6QZ3!XvM03P5;2Q69O)v7AXHQG1^_ylGgG$4AD*`v|^3BG2O-3)|r;fPPDZN 
zob)0}{f<8ihdd@P__I6eHU~%e>u6Y>bQ~9jte8X`C)S+d83XD~{oV=n-u#d8`fn$i z`Za9!7?yzY4z}@KnlzMe_>iHKk)b~DGko)hKXaKEU1Cq7M@RyOilrxynxcHiMIk$; z5Ie*YtKX4(wK}ES)kCQrK|F8&q0SC23fVE0*dbO>1*iBWpYox~Z%XapUjq=Y-bsk{ zgwbc;Mkq-O3pw!O5l$n9iY3<3Tg7h$q|KS6ONHM`#JqwM*%?V_#d=|L|G`0zocC5w zw0{%Gh|E2Qb)4|QqG-HxQOJ(z#11jPSKU?CZEH{W!-=*dq4$fKz7a`v|_>@zAbI5ms#5Uk!ack#tEX- z8%eNV;1!fzBv@PPXfiaAzAQWuq25A*g#e-KA7PwOzW7{s#5$9BR;)=(5rrFtFA7y# z6O6{;^Si?IqX$KdD?3hFB5G#t$$qP+b9PMUSYR4=-Q}DxX zMdON#LUzn0c8C>4-$kva7aI6!s!}^_JAGcFqsR^}57{w~*dZ1S)ysDapH(hmT~h4$ z)Ko?A-Xi7tt;tv|@hGG4WjG`lI$IX~X!<8XaCO z!@`YDFY+zo@{k=1i5+74?v?DjIWEgsk}gVoD{6e)A=*WRR!rX|C;VP@MO^HRL=Tn( zrf*v%s7m+oNFX~F6FbCYc`e8sF>X~Qr62iO1)hWyRvazh^0-h%$$*POepy2NA|}fp zMzrbW_3V0*z6pJ6Sn)z%M7tEz!cTH&;|u&+=xLOv(qUzDADn#9#<$$Z%|Nuv39Xol(giH-obO=))H8_ZlK=1>B!OJYKXW146@*sI z9AQ_T0*3Z^5>1;x)VTK$?P@|RW{%0> z{akvi-j(=H8VS(yD+!2p4WSi@zVFrsQJ4NsK2N2!eBF;|e8zM`%Ubn^0%!gafDg1{=3F&UzwZyC#en zHDF{dV2dsLW@-QExt;&Tajr4X52XsN@aeJj-{N*;a~qb=8##1PT%UG!c7BbpvrZV5 z%xZe-ajvqvlXg)joSD{X%tsft8zq`+Rm5SCpO zw1cWh^d<(ZEm$Hu7-;)ck?7a!*?O=<-mTZ7Rgvgd(b*~>k@wKGC{-l-9cWev+$AqS zYdcht==X(LV<3^2g|)4!Nc6k4tU8d$i?!M&RV4bgP<9VuBJYH1>s68HcP!aHAdwd= zwck~d=+^^Tc5s)x8>p>PMWSELV-LX+c`r{}p^8MmrN(XpiM*nwEmcLL-{@jffka;C z(iW*A(eH1ux)2k2c}ttGibTKb#GZg9@}iSAM-_>FafsytOXRH~ZKf&`{YndS1`>In zMVqdQM8A!}G6IRbilI$aMWSB=V1GkQ=Tg4S~abg zDiXaojco!FS(&DFS4Eb3}IO>r3I@Z z(d(I5Ss;?)ARDi6(~ zibSuPU`4=PvSdQ5ql!c?X<(IrMAkECHC2)5^#<$=xJworXg{hV(Tf4tMIe!t09sX5 zB>IJUa;_@}s;~K^2Mq6`p+n68Q;U^HW8leif9PfDz!LeDS94cI zqJM>D|9~a(6RcKL6^Z`Ml`R4i`N35ytcpbc!pi0ViTu2(6;wr{e@|t_ASUu-s+La` ziT<&Z9Ry3{mrgCWDiZzECi4VKOSp--jzffxFRgvhQB-siek>4Y=G^$AS50LCFkjSr(TJrx!(kk-*)BgoWr0UQ% zv~5s_&h0}Z2U|NqCI8OJDI#03r^w@vZ55lML+F5> z!Tj$fYws61c8q|yL^xNB31T0zm-b#|BrBU|{dZrZt@5UwbJllzp7@vfl&e%-{{Q38(KbSl z|KbNA%rW>?CG!zq|2r=S?TTGB)xSa2ndfT6J<BQJjx;EzK5QG`E=@ka^%@Z=9K{_x?C zviwn=KPvG@75=EsA2s-+HhGCt!5^*pqb+}Q;EzuH!T6&afAr*! z5dP@P9|QQ~7ybz2kKz0=l0U}q$9Vpj$RAVqBa%O6@W*Wan8zOr`C|!xEa#8k_+t%! 
ztmBW3{ISLQ2##>=5*#YMnp{hxrPnfPS+yKmZY`fyP%Eqz)!a1?t(0ccd^JCv3p zs{N?d)aqy!t-kiN)mTD`sRod^`dTo=oRokIOY0=s~?V$Fj7Nf;#C$%%$-`Yj(A8ShQ*gH5_ z*DoSvySDAS1b6Qc8BwTLXmIzC9=)xJy?bcyw(UbZ_vju3|24!DQhC|{p8Ko8- Date: Tue, 4 Apr 2023 08:21:22 +0000 Subject: [PATCH 074/126] minor fix in run script --- bench/qc_simulation/scripts/large_run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench/qc_simulation/scripts/large_run.py b/bench/qc_simulation/scripts/large_run.py index 05f50bb2..8f161e76 100755 --- a/bench/qc_simulation/scripts/large_run.py +++ b/bench/qc_simulation/scripts/large_run.py @@ -1,3 +1,3 @@ #!/bin/bash -./main.py simulate 3reg_N72_p3.jsonterms_Otamaki_30_M30_M24.json ./data/simulations/sc23/large/{in_file}_cM{M}_rE{r2r_threshold}.sim --sim qtensor -M 27 --backend=cupy --compress=szx --r2r_error=5e-4 --r2r_threshold=5e-4 --mpi +./main.py simulate ./data/preprocess/sc23/qaoa/3reg_N72_p3.jsonterms_Otamaki_30_M30_M24.json ./data/simulations/sc23/large/{in_file}_cM{M}_rE{r2r_threshold}.sim --sim qtensor -M 27 --backend=cupy --compress=szx --r2r_error=5e-4 --r2r_threshold=5e-4 --mpi From c8d262ccdde5c66f088acf699dddfe13575dbc12 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Tue, 4 Apr 2023 08:22:49 +0000 Subject: [PATCH 075/126] minor fix in mpi run script --- bench/qc_simulation/scripts/large_run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench/qc_simulation/scripts/large_run.py b/bench/qc_simulation/scripts/large_run.py index 8f161e76..a94a0b87 100755 --- a/bench/qc_simulation/scripts/large_run.py +++ b/bench/qc_simulation/scripts/large_run.py @@ -1,3 +1,3 @@ #!/bin/bash -./main.py simulate ./data/preprocess/sc23/qaoa/3reg_N72_p3.jsonterms_Otamaki_30_M30_M24.json ./data/simulations/sc23/large/{in_file}_cM{M}_rE{r2r_threshold}.sim --sim qtensor -M 27 --backend=cupy --compress=szx --r2r_error=5e-4 --r2r_threshold=5e-4 --mpi +./main.py simulate ./data/preprocess/sc23/qaoa/3reg_N72_p3.jsonterms_Otamaki_30_M30 ./data/simulations/sc23/large/{in_file}_cM{M}_rE{r2r_threshold}.sim --sim qtensor -M 27 --backend=cupy --compress=szx --r2r_error=5e-4 --r2r_threshold=5e-4 --mpi From d2a2d4dfeaa9f7b401a5e951bb0f1d6e29283f3f Mon Sep 17 00:00:00 2001 From: Milan Shah Date: Tue, 4 Apr 2023 22:03:39 -0400 Subject: [PATCH 076/126] Reduce output compressed buffer size --- qtensor/compression/szx/src/cuszx_entry.cu | 46 ++++++++++++++++++++-- 1 file changed, 43 insertions(+), 3 deletions(-) diff --git a/qtensor/compression/szx/src/cuszx_entry.cu b/qtensor/compression/szx/src/cuszx_entry.cu index ff961eec..41d4b2c9 100644 --- a/qtensor/compression/szx/src/cuszx_entry.cu +++ b/qtensor/compression/szx/src/cuszx_entry.cu @@ -7,6 +7,10 @@ #include #include #include +#include +#include +#include +#include #define SPARSITY_LEVEL 0.25 #define BLOCKS 40 @@ -1032,8 +1036,34 @@ unsigned char* device_ptr_cuSZx_compress_float(float *oriData, size_t *outSize, * unsigned char* outBytes * */ + // float *dmin,*dmax, *hmin, *hmax; + // void *d_temp_storage = NULL; + // size_t temp_storage_bytes = 0; timer_GPU.StartCounter(); - +// cudaMalloc(&dmin, sizeof(float)); +// cudaMalloc(&dmax, sizeof(float)); + +// // dmax = thrust::reduce(oriData, oriData+nbEle, -1, thrust::maximum()); +// // dmin = thrust::reduce(oriData, oriData+nbEle, 1, thrust::minimum()); +// cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, oriData, dmax, nbEle); +// cudaMalloc(&d_temp_storage, temp_storage_bytes); +// cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, oriData, dmax, nbEle); + +// 
cudaFree(d_temp_storage); +// cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, oriData, dmin, nbEle); +// cudaMalloc(&d_temp_storage, temp_storage_bytes); +// cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, oriData, dmin, nbEle); + +// cudaFree(d_temp_storage); +// // thrust::pair result = thrust::minmax_element(thrust::device, oriData,oriData+nbEle); +// //printf("here\n"); +// cudaMemcpy(hmin, dmin, sizeof(float), cudaMemcpyDeviceToHost); +// cudaMemcpy(hmax, dmax,sizeof(float), cudaMemcpyDeviceToHost); +// absErrBound = absErrBound*(hmax-hmin); +// threshold = threshold*(hmax-hmin); + // // printf("%f\n",absErrBound); + // cudaFree(dmin); + // cudaFree(dmax); float sparsity_level = SPARSITY_LEVEL; // Set the input data as the function parameter, this should be a device pointer @@ -1153,11 +1183,21 @@ unsigned char* device_ptr_cuSZx_compress_float(float *oriData, size_t *outSize, checkCudaErrors(cudaFree(d_meta)); checkCudaErrors(cudaFree(d_offsets)); checkCudaErrors(cudaFree(d_midBytes)); -// printf("completed compression\n"); + + unsigned char *d_newout; + if (*outSize%4==0) + { + *outSize += *outSize%4; + } + checkCudaErrors(cudaMalloc(&d_newout, *outSize)); + checkCudaErrors(cudaMemcpy(d_newout, d_outBytes, *outSize, cudaMemcpyDeviceToDevice)); + + + checkCudaErrors(cudaFree(d_outBytes)); printf("Compression end timestamp: %f ms\n", timer_GPU.GetCounter()); printf("CUDA Error: %s\n", cudaGetErrorString(err)); - return d_outBytes; + return d_newout; } __device__ inline long bytesToLong_bigEndian(unsigned char* b) { From 310e74a5db9f4bcaff5582abe6c03defa43136a7 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Wed, 5 Apr 2023 02:09:17 +0000 Subject: [PATCH 077/126] update submit script --- bench/qc_simulation/scripts/polaris/submit.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/bench/qc_simulation/scripts/polaris/submit.sh b/bench/qc_simulation/scripts/polaris/submit.sh index b86aa06a..550cebf4 100755 --- a/bench/qc_simulation/scripts/polaris/submit.sh +++ b/bench/qc_simulation/scripts/polaris/submit.sh @@ -1,16 +1,16 @@ #!/bin/bash # -NODES=256 +NODES=2 RANKS=$(( NODES * 4 )) -QUEUE=prod -WALLTIME=420:00 +QUEUE=debug-scaling +WALLTIME=40:00 qsub -l select=$NODES:system=polaris:ncpus=32:ngpus=4:gputype=A100,walltime=$WALLTIME,filesystems=home \ - -q $QUEUE -ACatalyst \ + -q $QUEUE -AQTensor \ -v RANKS=$RANKS,PARAM_P=$PARAM_P \ -o job_out.output -e job_out.output \ - ./entry.sh + ./scripts/polaris/entry.sh echo -e "===========\nNew job with NODES=$NODES, PARAM_P=$PARAM_P submitted.\n" >> job_out.output sleep 1.5 From 9664af8da3192539ab9c02865485a318fea701b6 Mon Sep 17 00:00:00 2001 From: Milan Shah Date: Wed, 5 Apr 2023 13:35:39 -0400 Subject: [PATCH 078/126] Updated outsize and compressed buffer to reflect accurate value --- qtensor/compression/szx/src/cuszx_entry.cu | 88 +++++++++++++++------- 1 file changed, 61 insertions(+), 27 deletions(-) diff --git a/qtensor/compression/szx/src/cuszx_entry.cu b/qtensor/compression/szx/src/cuszx_entry.cu index 41d4b2c9..b5761079 100644 --- a/qtensor/compression/szx/src/cuszx_entry.cu +++ b/qtensor/compression/szx/src/cuszx_entry.cu @@ -391,7 +391,7 @@ unsigned char* cuSZx_fast_compress_args_unpredictable_blocked_float(float *oriDa *outSize = _post_proc(oriData, meta, offsets, midBytes, outBytes, nbEle, blockSize, *num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); // printf("Beginning free\n"); - printf("outsize %p \n", outBytes); + // printf("outsize %p \n", outBytes); free(blk_idx); 
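    // blk_idx, blk_subidx, blk_vals and blk_sig hold the per-block bookkeeping
    // for significant values; _post_proc has already folded them into outBytes
    // and *outSize above, so the host-side copies can be released here.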
free(blk_subidx); free(blk_vals); @@ -443,7 +443,7 @@ void cuSZx_fast_decompress_args_unpredictable_blocked_float(float** newData, siz blk_subidx = (uint8_t *)malloc((num_sig)*sizeof(uint8_t)); blk_sig = (uint8_t *)malloc(nbBlocks*sizeof(uint8_t)); - printf("Converting state array\n"); + // printf("Converting state array\n"); convert_out_to_state(nbBlocks, r, stateArray); // convertByteArray2IntArray_fast_1b_args(nbBlocks, r, stateNBBytes, stateArray); //get the stateArray for (size_t i = 0; i < nbBlocks; i++) @@ -459,7 +459,7 @@ void cuSZx_fast_decompress_args_unpredictable_blocked_float(float** newData, siz r += stateNBBytes; unsigned char* data = (unsigned char*)malloc(ncBlocks*blockSize*sizeof(float)); memset(data, 0, ncBlocks*blockSize*sizeof(float)); - printf("converting block vals\n"); + // printf("converting block vals\n"); size_t to_add = convert_out_to_block2(r, nbBlocks, (uint64_t)num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); r+= to_add; // checkCudaErrors(cudaMalloc((void **)&d_num_sig, sizeof(uint64_t))); @@ -650,7 +650,7 @@ __global__ void nccopy_kernel2(unsigned char * c, unsigned char* o, unsigned cha __global__ void nccopy_kernel(unsigned char * c, unsigned char* o, unsigned char *nc, unsigned char* midBytes, unsigned char* meta, - size_t nbBlocks, int blockSize, short *offsets, size_t mSize, uint64_t *cBlk_indices, uint64_t *ncBlk_indices, uint64_t* offset_indices){ + size_t nbBlocks, int blockSize, short *offsets, size_t mSize, uint64_t *cBlk_indices, uint64_t *ncBlk_indices, uint64_t* offset_indices, size_t *final_nc){ // printf("blockdim %d blockidx %d threadidx %d griddim %d\n", blockDim.x, blockIdx.x, threadIdx.x, gridDim.x); int i; // if(threadIdx.x==0){ @@ -689,8 +689,8 @@ __global__ void nccopy_kernel(unsigned char * c, unsigned char* o, unsigned char } if (i==nbBlocks-1) { - nc = nc+(((mSize*ncBlk_indices[i])+mSize + offset_indices[i]))+offsets[i]; - } + *final_nc = (size_t) (((mSize*ncBlk_indices[i])+mSize + offset_indices[i]))+offsets[i]; + } } @@ -741,7 +741,7 @@ __global__ void set_nc(unsigned char *nc, short *offsets, uint64_t *offset_indic } void ncblkCopy_fast(unsigned char * c, unsigned char* o, unsigned char *nc, unsigned char* midBytes, unsigned char* meta, - size_t nbBlocks, int blockSize, short *offsets, size_t mSize){ + size_t nbBlocks, int blockSize, short *offsets, size_t mSize, size_t *final_nc){ uint64_t *cBlk_indices, *ncBlk_indices; uint64_t *offset_indices; TimingGPU timer2; @@ -759,10 +759,12 @@ void ncblkCopy_fast(unsigned char * c, unsigned char* o, unsigned char *nc, unsi thrust::exclusive_scan(thrust::device, ncBlk_indices, ncBlk_indices + nbBlocks, ncBlk_indices, 0); thrust::exclusive_scan(thrust::device, offset_indices, offset_indices + nbBlocks, offset_indices, 0); - nccopy_kernel<<>>(c, o, nc, midBytes, meta, nbBlocks, blockSize, offsets, mSize, cBlk_indices,ncBlk_indices,offset_indices); + nccopy_kernel<<>>(c, o, nc, midBytes, meta, nbBlocks, blockSize, offsets, mSize, cBlk_indices,ncBlk_indices,offset_indices,final_nc); // nccopy_kernel2<<<1,1>>>(c, o, nc, midBytes, meta, nbBlocks, blockSize, offsets, mSize, cBlk_indices,ncBlk_indices,offset_indices); cudaDeviceSynchronize(); + + //printf("nc: %p\n", nc); // printf("%s\n", cudaGetErrorString(cudaGetLastError())); // set_nc<<<1,1>>>(nc, offsets, offset_indices, ncBlk_indices, mSize, nbBlocks); // cudaDeviceSynchronize(); @@ -863,7 +865,7 @@ size_t better_post_proc(size_t *outSize, float *oriData, unsigned char *meta, int out_size_h = 0; int *out_size_d; int tmp_outsize = 0; - + 
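    // nc_diff (declared just below) is a single device-resident size_t: the final
    // non-constant byte count is now produced inside nccopy_kernel, which writes it
    // to its final_nc argument, so it can no longer be derived from a host pointer
    // difference. The pattern used here is, in sketch form:
    //
    //     size_t *d_count, h_count;
    //     cudaMalloc(&d_count, sizeof(size_t));
    //     kernel<<<grid, block>>>(..., d_count);   // thread handling the last block writes *d_count
    //     cudaMemcpy(&h_count, d_count, sizeof(size_t), cudaMemcpyDeviceToHost);
    //
    // d_count/h_count are illustrative names; this file uses nc_diff and h_nc_diff.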
size_t *nc_diff; size_t nbConstantBlocks = 0; size_t nbBlocks = nbEle/blockSize; size_t ncBytes = blockSize/4; @@ -873,9 +875,8 @@ size_t better_post_proc(size_t *outSize, float *oriData, unsigned char *meta, out_size_h += nbBlocks/8; else out_size_h += nbBlocks/8+1; - + cudaMalloc(&nc_diff, sizeof(size_t)); int *nonconstant_d, nonconstant_h; - checkCudaErrors(cudaMalloc((void **)&nonconstant_d, sizeof(int))); checkCudaErrors(cudaMalloc((void **)&out_size_d, sizeof(int))); @@ -931,9 +932,11 @@ size_t better_post_proc(size_t *outSize, float *oriData, unsigned char *meta, // ncblkCopy<<<1,1>>>(c, o, nc, midBytes, meta,nbBlocks, blockSize, offsets, mSize); // ncblkCopy_h(c, o, nc, midBytes, meta,nbBlocks, blockSize, offsets, mSize); - ncblkCopy_fast(c, o, nc, midBytes, meta,nbBlocks, blockSize, offsets, mSize); + ncblkCopy_fast(c, o, nc, midBytes, meta,nbBlocks, blockSize, offsets, mSize, nc_diff); // cudaDeviceSynchronize(); - return (size_t) (nc-r_old); + size_t h_nc_diff; + cudaMemcpy(&h_nc_diff,nc_diff, sizeof(size_t),cudaMemcpyDeviceToHost); + return (size_t) (nc+h_nc_diff-r_old); // checkCudaErrors(cudaMemcpy(outSize, (size_t)(nc-r_old), sizeof(size_t))); // *outSize = (size_t) (nc-r_old); // return outBytes; @@ -1027,6 +1030,14 @@ __global__ void device_post_proc(size_t *outSize, float *oriData, unsigned char // return (uint32_t) (nc-r_old); } +__global__ void fin_copy(unsigned char* in, unsigned char *out, size_t n){ + + for(size_t i = threadIdx.x+blockIdx.x*gridDim.x; i < n; i+=blockDim.x*gridDim.x){ + out[i]=in[i]; + } + +} + unsigned char* device_ptr_cuSZx_compress_float(float *oriData, size_t *outSize, float absErrBound, size_t nbEle, int blockSize, float threshold) { /** @@ -1185,19 +1196,22 @@ unsigned char* device_ptr_cuSZx_compress_float(float *oriData, size_t *outSize, checkCudaErrors(cudaFree(d_midBytes)); unsigned char *d_newout; - if (*outSize%4==0) - { - *outSize += *outSize%4; - } - checkCudaErrors(cudaMalloc(&d_newout, *outSize)); - checkCudaErrors(cudaMemcpy(d_newout, d_outBytes, *outSize, cudaMemcpyDeviceToDevice)); + *outSize = *outSize; + size_t os = *outSize; + + checkCudaErrors(cudaMalloc(&d_newout, os)); + //fin_copy<<<40,256>>>(d_outBytes, d_newout,os); + checkCudaErrors(cudaMemcpy(d_newout, d_outBytes, os, cudaMemcpyDeviceToDevice)); + cudaDeviceSynchronize(); checkCudaErrors(cudaFree(d_outBytes)); printf("Compression end timestamp: %f ms\n", timer_GPU.GetCounter()); - + + err = cudaGetLastError(); // Get error code printf("CUDA Error: %s\n", cudaGetErrorString(err)); return d_newout; + //return d_outBytes; } __device__ inline long bytesToLong_bigEndian(unsigned char* b) { @@ -1259,6 +1273,7 @@ __global__ void decompress_get_stats(float *newData, size_t nbEle, unsigned char size_t *mSizeptr, unsigned char *newCmpBytes ){ unsigned char* r = cmpBytes; + size_t num_sig; r += 4; int blockSize = (int) r[0]; //get block size @@ -1268,7 +1283,7 @@ __global__ void decompress_get_stats(float *newData, size_t nbEle, unsigned char size_t nbConstantBlocks = bytesToLong_bigEndian(r); //get number of constant blocks r += sizeof(size_t); num_sig = bytesToSize(r); - + r += sizeof(size_t); size_t nbBlocks = nbEle/blockSize; size_t ncBlocks = 0; @@ -1285,7 +1300,6 @@ __global__ void decompress_get_stats(float *newData, size_t nbEle, unsigned char *numSigValues = num_sig; *bs = blockSize; newCmpBytes = r; - // printf("nb blocks: %d\n", nbBlocks); } @@ -1429,7 +1443,9 @@ unsigned char *data, int blockSize, size_t mSize, size_t ncBlocks, float *consta for(int i = 
blockDim.x*blockIdx.x + threadIdx.x;i < ncBlocks;i+=blockDim.x*gridDim.x){ fr = basefr+(sizeof(short)*i); int leng = (int)bytesToShort(fr)+mSize; - // g_leng[i] = leng; + + + // g_leng[i] = leng; // // fr += sizeof(short); // if (leng > blockSize*sizeof(float)) // { @@ -1440,7 +1456,7 @@ unsigned char *data, int blockSize, size_t mSize, size_t ncBlocks, float *consta p = basep + g_leng[i]; memcpy(data+i*blockSize*sizeof(float), p, leng); - + // p += leng; } } @@ -1494,7 +1510,14 @@ void decompress_startup_better(float *newData, size_t nbEle, unsigned char* r, decompress_ncblk_kernel<<>>(r, nbConstantBlocks, data, blockSize, mSize, ncBlocks, constantMedianArray, g_leng); cudaDeviceSynchronize(); + + // cudaError_t err = cudaGetLastError(); // Get error code + + // printf("CUDA Error: %s\n", cudaGetErrorString(err)); cudaFree(g_leng); + + // err = cudaGetLastError(); // Get error code + // printf("CUDA Error: %s\n", cudaGetErrorString(err)); r += nbConstantBlocks*sizeof(float); newCmpBytes = r; @@ -1841,6 +1864,9 @@ float* device_ptr_cuSZx_decompress_float(size_t nbEle, unsigned char* cmpBytes) // checkCudaErrors(cudaMemcpy(&ncBlocks_h, ncBlocks, sizeof(size_t), cudaMemcpyDeviceToHost)); checkCudaErrors(cudaMalloc((void**)&data, ncBlocks_h*bs*sizeof(float))); + + // err = cudaGetLastError(); // Get error code + // printf("CUDA start Error: %s\n", cudaGetErrorString(err)); // cmpBytes = newCmpBytes; // data = (unsigned char*)malloc(ncBlocks*blockSize*sizeof(float)); // memset(data, 0, ncBlocks*blockSize*sizeof(float)); @@ -1869,6 +1895,8 @@ float* device_ptr_cuSZx_decompress_float(size_t nbEle, unsigned char* cmpBytes) mSize_h, newR); + // err = cudaGetLastError(); // Get error code + // printf("CUDA start Error: %s\n", cudaGetErrorString(err)); //decompress_startup<<<1,1>>>(newData, nbEle, cmpBytes, // blk_idx, blk_subidx, blk_sig, // blk_vals, num_sig_h, bs, @@ -1883,20 +1911,26 @@ float* device_ptr_cuSZx_decompress_float(size_t nbEle, unsigned char* cmpBytes) float *d_newdata; // checkCudaErrors(cudaMalloc((void**)&d_data, ncBlocks*blockSize*sizeof(float))); // checkCudaErrors(cudaMemcpy(d_data, data, ncBlocks*blockSize*sizeof(float), cudaMemcpyHostToDevice)); - //printf("nblocks: %d bs: %d\n", nbBlocks_h, bs); + // printf("nblocks: %d bs: %d ncblock %d\n", nbBlocks_h, bs, ncBlocks_h); checkCudaErrors(cudaMalloc(&d_newdata, nbBlocks_h*bs*sizeof(float))); + // err = cudaGetLastError(); // Get error code + // printf("CUDA dec main Error: %s\n", cudaGetErrorString(err)); dim3 dimBlock(32, bs/32); dim3 dimGrid(65536, 1); const int sMemsize = bs * sizeof(float) + dimBlock.y * sizeof(int); decompress_state2<<>>(d_newdata, stateArray,blk_idx, blk_vals, blk_subidx, bs, blk_sig); + cudaDeviceSynchronize(); + + // err = cudaGetLastError(); // Get error code + // printf("CUDA dec main Error: %s\n", cudaGetErrorString(err)); decompress_float<<>>(data, bs, ncBlocks_h, mSize_h); - //err = cudaGetLastError(); // Get error code - //printf("CUDA Error: %s\n", cudaGetErrorString(err)); //printf("GPU decompression timing: %f ms\n", timer_GPU.GetCounter()); cudaDeviceSynchronize(); + // err = cudaGetLastError(); // Get error code + // printf("CUDA dec main Error: %s\n", cudaGetErrorString(err)); // checkCudaErrors(cudaMemcpy(data, d_data, ncBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); checkCudaErrors(cudaMemcpy(newData, d_newdata, nbBlocks_h*bs*sizeof(float), cudaMemcpyDeviceToDevice)); From 820f8a3ae340913477b454dafa977bf321afb60c Mon Sep 17 00:00:00 2001 From: Milan Shah Date: Thu, 20 Apr 2023 
13:59:06 -0400 Subject: [PATCH 079/126] Modifiable data block size --- qtensor/compression/szx/src/cuszx_entry.cu | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/qtensor/compression/szx/src/cuszx_entry.cu b/qtensor/compression/szx/src/cuszx_entry.cu index b5761079..4720bc1d 100644 --- a/qtensor/compression/szx/src/cuszx_entry.cu +++ b/qtensor/compression/szx/src/cuszx_entry.cu @@ -1310,7 +1310,7 @@ __global__ void decompress_get_stats(float *newData, size_t nbEle, unsigned char ){ //printf("ma\n"); - blockSize = 256; + // blockSize = 256; r += 4; r++; r += sizeof(size_t); @@ -1350,7 +1350,7 @@ __global__ void setup_data_stateArray(float *newData, size_t nbEle, unsigned cha size_t nbConstantBlocks, size_t nbBlocks, size_t *ncBlks, unsigned char *stateArray, unsigned char *newR ){ - blockSize = 256; + // blockSize = 256; r += 4; r++; r += sizeof(size_t); @@ -1468,8 +1468,8 @@ void decompress_startup_better(float *newData, size_t nbEle, unsigned char* r, unsigned char *stateArray, float* constantMedianArray, unsigned char *data, size_t mSize, unsigned char *newCmpBytes ){ - blockSize = 256; - size_t nb_tmp = (int) nbEle/256; + // blockSize = 256; + size_t nb_tmp = (int) nbEle/blockSize; uint64_t* g_leng; /** * Structures to return: @@ -1531,8 +1531,8 @@ __global__ void decompress_startup(float *newData, size_t nbEle, unsigned char* unsigned char *stateArray, float* constantMedianArray, unsigned char *data, size_t mSize, unsigned char *newCmpBytes ){ - blockSize = 256; - size_t nb_tmp = (int) nbEle/256; + // blockSize = 256; + size_t nb_tmp = (int) nbEle/blockSize; /** * Structures to return: * blk_idx, blk_subidx, blk_sig, blk_vals, numSigValues (pointer) @@ -1732,7 +1732,7 @@ void decompress_post_proc_fast(unsigned char *data, float *newData, int blockSiz thrust::exclusive_scan(thrust::device, nb, nb + nbBlocks, nb, 0); thrust::exclusive_scan(thrust::device, nc, nc + nbBlocks, nc, 0); - decompress_final_set<<>>(data, newData, blockSize,nbBlocks, ncBlocks, stateArray,constantMedianArray, nb, nc); + decompress_final_set<<>>(data, newData, blockSize,nbBlocks, ncBlocks, stateArray,constantMedianArray, nb, nc); cudaDeviceSynchronize(); cudaFree(nb); cudaFree(nc); From 145810dca0737bb2c26f984ad5cd9f1119fadb47 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Fri, 5 May 2023 15:49:55 -0500 Subject: [PATCH 080/126] adjust slice count in qtensor bench estimation --- bench/qc_simulation/src/simulators/qtensor.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/bench/qc_simulation/src/simulators/qtensor.py b/bench/qc_simulation/src/simulators/qtensor.py index 75ec0670..fe93ca53 100644 --- a/bench/qc_simulation/src/simulators/qtensor.py +++ b/bench/qc_simulation/src/simulators/qtensor.py @@ -128,7 +128,7 @@ def preprocess(in_file, out_file, O='greedy', S=None, M=30, after_slice='run-aga write_preps(prep_data, out_file) -def estimate(in_file, out_file, C=100, M=30, F=1e12, T=1e9, **kwargs): +def estimate(in_file, out_file, C=100, M=30, F=1e12, T=1e9, S=0, **kwargs): """ Arguments: in_file: file with preprocessed data @@ -137,12 +137,17 @@ def estimate(in_file, out_file, C=100, M=30, F=1e12, T=1e9, **kwargs): M: Memory limit in log2(b/16) F: assumed FLOPS T: Throughput of compression + S: Offset of slice variables. If S=0, full slicing is used. 
If S=n last + n par_vars are omitted """ from qtensor.compression.cost_estimation import compressed_contraction_cost, Cost from dataclasses import asdict import json prep_data = read_preps(in_file) peo, par_vars, tn = prep_data + if S > 0: + par_vars = par_vars[:-S] + print("Offset par_vars", par_vars) tn.slice({i: slice(0, 1) for i in par_vars}) peo = peo[:len(peo) - len(par_vars)] From 714fa67b8d0a37491e8e47a6fdffd8db19c0d647 Mon Sep 17 00:00:00 2001 From: Milan Kartik Shah Date: Mon, 8 May 2023 16:39:07 -0400 Subject: [PATCH 081/126] Added threshold and grouping code outside CUDA kernel --- qtensor/compression/Compressor.py | 399 ++++++++++--------- qtensor/compression/szx/src/cuszx_wrapper.py | 126 ++++-- qtensor/contraction_backends/compression.py | 4 +- 3 files changed, 302 insertions(+), 227 deletions(-) diff --git a/qtensor/compression/Compressor.py b/qtensor/compression/Compressor.py index 8669cd5e..c6cfdc51 100644 --- a/qtensor/compression/Compressor.py +++ b/qtensor/compression/Compressor.py @@ -1,198 +1,201 @@ -import io -import sys -import numpy as np -from pathlib import Path -print(Path(__file__).parent/'szx/src/') -sys.path.append(str(Path(__file__).parent/'szx/src/')) -sys.path.append('./szx/src') - -try: - from cuszx_wrapper import cuszx_host_compress, cuszx_host_decompress, cuszx_device_compress, cuszx_device_decompress -except: - # Silently fail on missing build of cuszx - pass - -CUSZX_BLOCKSIZE = 256 - -class Compressor(): - def compress(self, data): - raise NotImplementedError - - def decompress(self, ptr): - raise NotImplementedError - - def compress_size(self, ptr): - return ptr.nbytes - -# -- Debugging and profiling - -import time -from dataclasses import dataclass, asdict -@dataclass -class CompressMeasure: - time: float = 0 - size_in: int = 0 - size_out: int = 0 - label: str = '' - - def __str__(self): - compress_ratio = self.size_in / self.size_out - return (f'Measure: {self.time:.3f}s, ' - f'{self.size_in/1024**2:.2f}MB -> {self.size_out/1024**2:.2f}MB ({compress_ratio:.3f} in/out ratio)' - ) - -class ProfileCompressor(Compressor): - def __init__(self, compressor:Compressor, trace=True): - self.trace = trace - self.compressor = compressor - self.profile_data = {'compress': [], 'decompress': []} - - def compress(self, data): - start = time.time() - ptr = self.compressor.compress(data) - end = time.time() - out_size = self.compressor.compress_size(ptr) - cmeasure = CompressMeasure(end-start, data.nbytes, out_size) - self.profile_data['compress'].append(cmeasure) - if self.trace: - print(f'Compress: {cmeasure}') - return ptr - - def decompress(self, ptr): - start = time.time() - data = self.compressor.decompress(ptr) - end = time.time() - in_size = self.compressor.compress_size(ptr) - dmeasure = CompressMeasure(end-start, in_size, data.nbytes) - self.profile_data['decompress'].append(dmeasure) - if self.trace: - print(f'Decompress: {dmeasure}') - return data - - def get_profile_data(self): - return self.profile_data['compress'], self.profile_data['decompress'] - - def get_profile_data_json(self): - compress, decompress = self.get_profile_data() - return { - 'compress': [asdict(c) for c in compress], - 'decompress': [asdict(c) for c in decompress], - } - - def get_profile_stats(self): - compress, decompress = self.get_profile_data() - compress_time = sum([x.time for x in compress]) - decompress_time = sum([x.time for x in decompress]) - compress_ratios = np.mean([x.size_in/x.size_out for x in compress]) - compress_size = sum([x.size_out for x in compress]) - 
return compress_time, decompress_time, compress_size, compress_ratios -# -- - -class NumpyCompressor(Compressor): - def compress(self, data): - comp = io.BytesIO() - np.savez_compressed(comp, data) - return comp - - def compress_size(self, ptr): - return ptr.getbuffer().nbytes - - def decompress(self, ptr): - ptr.seek(0) - return np.load(ptr)['arr_0'] - -class CUSZCompressor(Compressor): - def __init__(self, r2r_error=1e-3, r2r_threshold=1e-3): - self.r2r_error = r2r_error - self.r2r_threshold = r2r_threshold - self.decompressed_own = [] - - def free_decompressed(self): - import cupy - print("Cleanup", len(self.decompressed_own)) - for x in self.decompressed_own: - print("CUDA Free", x) - cupy.cuda.runtime.free(x) - self.decompressed_own = [] - - def free_compressed(self, ptr): - import ctypes, cupy - cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = ptr - p_decompressed_ptr = ctypes.addressof(cmp_bytes) - # cast to int64 pointer - # (effectively converting pointer to pointer to addr to pointer to int64) - p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) - decompressed_int = p_decompressed_int.contents - cupy.cuda.runtime.free(decompressed_int.value) - - def compress(self, data): - import cupy - if isinstance(data, cupy.ndarray): - isCuPy = True - else: - isCuPy = False - num_elements = data.size - # Adapt numele depending on itemsize - itemsize = data.dtype.itemsize - num_elements_eff = int(num_elements*itemsize/4) - - dtype = data.dtype - cmp_bytes, outSize_ptr = self.cuszx_compress(isCuPy, data, num_elements_eff, self.r2r_error, self.r2r_threshold) - return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr.contents.value) - - def compress_size(self, ptr): - return ptr[5] - - def decompress(self, obj): - import cupy - import ctypes - cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = obj - decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, num_elements_eff) - # -- Workaround to convert GPU pointer to int - p_decompressed_ptr = ctypes.addressof(decompressed_ptr) - # cast to int64 pointer - # (effectively converting pointer to pointer to addr to pointer to int64) - p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) - decompressed_int = p_decompressed_int.contents - # -- - self.decompressed_own.append(decompressed_int.value) - mem = cupy.cuda.UnownedMemory(decompressed_int.value, num_elements_eff, self, device_id=0) - mem_ptr = cupy.cuda.memory.MemoryPointer(mem, 0) - arr = cupy.ndarray(shape, dtype=dtype, memptr=mem_ptr) - return arr - - ### Compression API with cuSZx ### - # Parameters: - # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array - # - data = Numpy or Cupy ndarray, assumed to be 1-D, np.float32 type - # - num_elements = Number of floating point elements in data - # - r2r_error = relative-to-value-range error bound for lossy compression - # - r2r_threshold = relative-to-value-range threshold to floor values to zero - # Returns: - # - cmp_bytes = Unsigned char pointer to compressed bytes - # - outSize_ptr = Pointer to size_t representing length in bytes of cmp_bytes - def cuszx_compress(self, isCuPy, data, num_elements, r2r_error, r2r_threshold): - - if not isCuPy: - cmp_bytes, outSize_ptr = cuszx_host_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) - else: - cmp_bytes, outSize_ptr = cuszx_device_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) - return cmp_bytes, outSize_ptr - - ### Decompression API with 
cuSZx ### - # Parameters: - # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array - # - cmp_bytes = Unsigned char pointer to compressed bytes - # - num_elements = Number of floating point elements in original data - # Returns: - # - decompressed_data = Float32 pointer to decompressed data - # - # Notes: Use ctypes to cast decompressed data to Numpy or CuPy type - - def cuszx_decompress(self, isCuPy, cmp_bytes, num_elements): - if not isCuPy: - decompressed_data = cuszx_host_decompress(num_elements, cmp_bytes) - else: - decompressed_data = cuszx_device_decompress(num_elements, cmp_bytes) - - return decompressed_data +import io +import sys +import numpy as np +from pathlib import Path +print(Path(__file__).parent/'szx/src/') +sys.path.append(str(Path(__file__).parent/'szx/src/')) +sys.path.append('./szx/src') + +try: + from cuszx_wrapper import cuszx_host_compress, cuszx_host_decompress, cuszx_device_compress, cuszx_device_decompress +except: + # Silently fail on missing build of cuszx + pass + +CUSZX_BLOCKSIZE = 256 + +class Compressor(): + def compress(self, data): + raise NotImplementedError + + def decompress(self, ptr): + raise NotImplementedError + + def compress_size(self, ptr): + return ptr.nbytes + +# -- Debugging and profiling + +import time +from dataclasses import dataclass, asdict +@dataclass +class CompressMeasure: + time: float = 0 + size_in: int = 0 + size_out: int = 0 + label: str = '' + + def __str__(self): + compress_ratio = self.size_in / self.size_out + return (f'Measure: {self.time:.3f}s, ' + f'{self.size_in/1024**2:.2f}MB -> {self.size_out/1024**2:.2f}MB ({compress_ratio:.3f} in/out ratio)' + ) + +class ProfileCompressor(Compressor): + def __init__(self, compressor:Compressor, trace=True): + self.trace = trace + self.compressor = compressor + self.profile_data = {'compress': [], 'decompress': []} + + def compress(self, data): + start = time.time() + ptr = self.compressor.compress(data) + end = time.time() + out_size = self.compressor.compress_size(ptr) + cmeasure = CompressMeasure(end-start, data.nbytes, out_size) + self.profile_data['compress'].append(cmeasure) + if self.trace: + print(f'Compress: {cmeasure}') + return ptr + + def decompress(self, ptr): + start = time.time() + data = self.compressor.decompress(ptr) + end = time.time() + in_size = self.compressor.compress_size(ptr) + dmeasure = CompressMeasure(end-start, in_size, data.nbytes) + self.profile_data['decompress'].append(dmeasure) + if self.trace: + print(f'Decompress: {dmeasure}') + return data + + def get_profile_data(self): + return self.profile_data['compress'], self.profile_data['decompress'] + + def get_profile_data_json(self): + compress, decompress = self.get_profile_data() + return { + 'compress': [asdict(c) for c in compress], + 'decompress': [asdict(c) for c in decompress], + } + + def get_profile_stats(self): + compress, decompress = self.get_profile_data() + compress_time = sum([x.time for x in compress]) + decompress_time = sum([x.time for x in decompress]) + compress_ratios = np.mean([x.size_in/x.size_out for x in compress]) + compress_size = sum([x.size_out for x in compress]) + return compress_time, decompress_time, compress_size, compress_ratios +# -- + +class NumpyCompressor(Compressor): + def compress(self, data): + comp = io.BytesIO() + np.savez_compressed(comp, data) + return comp + + def compress_size(self, ptr): + return ptr.getbuffer().nbytes + + def decompress(self, ptr): + ptr.seek(0) + return np.load(ptr)['arr_0'] + +class CUSZCompressor(Compressor): + def 
__init__(self, r2r_error=1e-3, r2r_threshold=1e-3): + self.r2r_error = r2r_error + self.r2r_threshold = r2r_threshold + self.decompressed_own = [] + + def free_decompressed(self): + import cupy + print("Cleanup", len(self.decompressed_own)) + for x in self.decompressed_own: + print("CUDA Free", x) + cupy.cuda.runtime.free(x) + self.decompressed_own = [] + + def free_compressed(self, ptr): + import ctypes, cupy + cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = ptr + p_decompressed_ptr = ctypes.addressof(cmp_bytes[0]) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + decompressed_int = p_decompressed_int.contents + cupy.cuda.runtime.free(decompressed_int.value) + + def compress(self, data): + import cupy + if isinstance(data, cupy.ndarray): + isCuPy = True + else: + isCuPy = False + num_elements = data.size + # Adapt numele depending on itemsize + itemsize = data.dtype.itemsize + num_elements_eff = int(num_elements*itemsize/4) + + dtype = data.dtype + cmp_bytes, outSize_ptr = self.cuszx_compress(isCuPy, data, num_elements_eff, self.r2r_error, self.r2r_threshold) + return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr.contents.value) + + def compress_size(self, ptr): + return ptr[5] + + def decompress(self, obj): + import cupy + import ctypes + cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = obj + decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, num_elements_eff, self, dtype) + arr_cp = decompressed_ptr[0] + self.decompressed_own.append(decompressed_ptr[1]) + # -- Workaround to convert GPU pointer to int + # p_decompressed_ptr = ctypes.addressof(decompressed_ptr) + # # cast to int64 pointer + # # (effectively converting pointer to pointer to addr to pointer to int64) + # p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + # decompressed_int = p_decompressed_int.contents + # # -- + # self.decompressed_own.append(decompressed_int.value) + # mem = cupy.cuda.UnownedMemory(decompressed_int.value, num_elements_eff, self, device_id=0) + # mem_ptr = cupy.cuda.memory.MemoryPointer(mem, 0) + arr = cupy.reshape(arr_cp, shape) + # arr = cupy.ndarray(shape, dtype=dtype, memptr=mem_ptr) + return arr + + ### Compression API with cuSZx ### + # Parameters: + # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array + # - data = Numpy or Cupy ndarray, assumed to be 1-D, np.float32 type + # - num_elements = Number of floating point elements in data + # - r2r_error = relative-to-value-range error bound for lossy compression + # - r2r_threshold = relative-to-value-range threshold to floor values to zero + # Returns: + # - cmp_bytes = Unsigned char pointer to compressed bytes + # - outSize_ptr = Pointer to size_t representing length in bytes of cmp_bytes + def cuszx_compress(self, isCuPy, data, num_elements, r2r_error, r2r_threshold): + + if not isCuPy: + cmp_bytes, outSize_ptr = cuszx_host_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) + else: + cmp_bytes, outSize_ptr = cuszx_device_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) + return cmp_bytes, outSize_ptr + + ### Decompression API with cuSZx ### + # Parameters: + # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array + # - cmp_bytes = Unsigned char pointer to compressed bytes + # - num_elements = Number of floating point elements in original data + # 
Returns: + # - decompressed_data = Float32 pointer to decompressed data + # + # Notes: Use ctypes to cast decompressed data to Numpy or CuPy type + + def cuszx_decompress(self, isCuPy, cmp_bytes, num_elements, owner, dtype): + if not isCuPy: + decompressed_data = cuszx_host_decompress(num_elements, cmp_bytes) + else: + decompressed_data = cuszx_device_decompress(num_elements, cmp_bytes, owner,dtype) + + return decompressed_data diff --git a/qtensor/compression/szx/src/cuszx_wrapper.py b/qtensor/compression/szx/src/cuszx_wrapper.py index cc38df89..388bd7ab 100644 --- a/qtensor/compression/szx/src/cuszx_wrapper.py +++ b/qtensor/compression/szx/src/cuszx_wrapper.py @@ -3,6 +3,8 @@ from ctypes import * import random from qtensor.tools.lazy_import import cupy as cp +import time +import torch from pathlib import Path LIB_PATH = str(Path(__file__).parent/'libcuszx_wrapper.so') @@ -54,7 +56,7 @@ def cuszx_host_compress(oriData, absErrBound, nbEle, blockSize,threshold): variable = ctypes.c_size_t(0) outSize = ctypes.pointer(variable) - oriData_p = ctypes.cast(oriData.data.ptr, ctypes.POINTER(c_float)) + oriData_p = ctypes.cast(oriD.data.ptr, ctypes.POINTER(c_float)) o_bytes = __cuszx_host_compress(oriData_p, outSize,np.float32(absErrBound), np.ulonglong(nbEle), np.int32(blockSize),np.float32(threshold)) @@ -71,33 +73,93 @@ def cuszx_host_decompress(nbEle, cmpBytes): def cuszx_device_compress(oriData, absErrBound, nbEle, blockSize,threshold): __cuszx_device_compress = get_device_compress() + ori_nbEle = nbEle variable = ctypes.c_size_t(0) outSize = ctypes.pointer(variable) #absErrBound = absErrBound*(cp.amax(oriData.get())-cp.amin(oriData.get())) #threshold = threshold*(cp.amax(oriData.get())-cp.amin(oriData.get())) + oriData = oriData.flatten() + ori_real = oriData.real + ori_imag = oriData.imag + oriData = cp.concatenate((ori_real, ori_imag)) + # print(oriData.dtype) sample = oriData[::2] - d = cp.amax(sample) - cp.amin(sample) + + #torch_tensor = torch.as_tensor(sample, device='cuda') + #d = torch.max(torch_tensor).item() - torch.min(torch_tensor).item() + #s_sample = cp.sort(sample) + #d = s_sample[-1] - s_sample[0] + #v_time = time.time() + #print(type(oriData)) + d = cp.amax(oriData) - cp.amin(oriData) + #print("max min time (s): " +str(time.time()-v_time)) d = d.get() if d.dtype == np.complex64: #d = min(d.real, d.imag) d = d.real absErrBound = absErrBound*(d) threshold = threshold*(d) - oriData_p = ctypes.cast(oriData.data.ptr, ctypes.POINTER(c_float)) + s_1 = time.time() + #print(cp.get_array_module(oriData)) + truth_values = abs(oriData)<=threshold + oriData[truth_values] = 0.0 + truth_values = cp.invert(truth_values) + oriData = oriData[truth_values] + bitmap = truth_values + nbEle = oriData.shape[0] + + oriData_p = ctypes.cast(oriData.data.ptr, ctypes.POINTER(c_float)) + #print("starting") o_bytes = __cuszx_device_compress(oriData_p, outSize,np.float32(absErrBound), np.ulonglong(nbEle), np.int32(blockSize),np.float32(threshold)) - - return o_bytes, outSize + + #print("tg and max time (s): "+str(time.time()-s_1)) + #print("bitmap shape: "+str(bitmap.shape[0])) + #print("percent nonzero bytes: "+str(bitmap[cp.nonzero(bitmap)].shape[0]/bitmap.shape[0])) + #print("CR") + print((ori_nbEle*4)/(outSize[0] + bitmap.shape[0]/8)) + return (o_bytes,bitmap), outSize -def cuszx_device_decompress(nbEle, cmpBytes): +def cuszx_device_decompress(nbEle, cmpBytes, owner, dtype): __cuszx_device_decompress=get_device_decompress() - - nbEle_p = ctypes.c_size_t(nbEle) + (cmpBytes, bitmap) = cmpBytes + 
#print("bitmap len:" +str(len(bitmap))) + #print(nbEle) + tmp_nbEle = cp.count_nonzero(bitmap).item() + #print(tmp_nbEle) + nbEle_p = ctypes.c_size_t(tmp_nbEle) newData = __cuszx_device_decompress(nbEle_p,cmpBytes) - return newData + + # decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, num_elements_eff) + # -- Workaround to convert GPU pointer to int + p_decompressed_ptr = ctypes.addressof(newData) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + decompressed_int = p_decompressed_int.contents + # -- + pointer_for_free = decompressed_int.value + # self.decompressed_own.append(decompressed_int.value) + mem = cp.cuda.UnownedMemory(decompressed_int.value, tmp_nbEle, owner, device_id=0) + mem_ptr = cp.cuda.memory.MemoryPointer(mem, 0) + #print("mem ptr") + #print(mem_ptr) + arr = cp.ndarray(shape=(tmp_nbEle,), dtype=np.float32, memptr=mem_ptr) + + res = cp.zeros((nbEle,)) + ## need to convert newData to cupy + cp.place(res,bitmap,arr) + + c_res = cp.zeros(int(nbEle/2), np.complex64) + c_res.real = res[0:int(nbEle/2)] + c_res.imag = res[int(nbEle/2):] + return (c_res, pointer_for_free) ### Example of device compress/decompress wrapper usage +class Comp(): + def __init__(self): + self.name = "dummy" if __name__ == "__main__": @@ -105,34 +167,44 @@ def cuszx_device_decompress(nbEle, cmpBytes): MAX_D = 10.0 MIN_D = -10.0 RANGE = MAX_D - MIN_D - r2r_threshold = 0.01 - r2r_error = 0.01 + r2r_threshold = 0.002 + r2r_error = 0.0001 - #in_vector = np.fromfile("real_tensor_d26.f32", dtype=np.float32) + in_vector = np.fromfile("real_sample.bin", dtype=np.float32) #print(np.max(in_vector)) + DATA_SIZE = len(in_vector) #range_vr = np.max(in_vector)-np.min(in_vector) #r2r_threshold = r2r_threshold*range_vr #r2r_error = r2r_error*range_vr - in_vector = np.zeros((DATA_SIZE,)) - for i in range(0,int(DATA_SIZE/4)): - in_vector[i] = 0.0 - for i in range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): - in_vector[i] = 5.0 - for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): - in_vector[i] = random.uniform(MIN_D, MAX_D) - for i in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): - in_vector[i] = -7.0 - for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): - in_vector[i] = 0.001 - - + #in_vector = np.zeros((DATA_SIZE,)) + #for i in range(0,int(DATA_SIZE/4)): + # in_vector[i] = 0.0 + #for i in range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): + # in_vector[i] = 5.0 + #for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): + # in_vector[i] = random.uniform(MIN_D, MAX_D) + #for i in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): + # in_vector[i] = -7.0 + #for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): + # in_vector[i] = 0.001 + + print(DATA_SIZE) in_vector = in_vector.astype('float32') in_vector_gpu = cp.asarray(in_vector) # variable = ctypes.c_size_t(0) # outSize = ctypes.pointer(variable) - + s_time = time.time() o_bytes, outSize = cuszx_device_compress(in_vector_gpu, r2r_error, DATA_SIZE, 256, r2r_threshold) + print("Time python: "+str(time.time()-s_time)) + print(outSize[0]) print("Compress Success...starting decompress ") - d_bytes = cuszx_device_decompress(DATA_SIZE, o_bytes) + comp = Comp() + + s_time = time.time() + (d_bytes,ptr )= cuszx_device_decompress(DATA_SIZE, o_bytes, comp, in_vector_gpu.dtype) + + print("Time python: "+str(time.time()-s_time)) + #for i in d_bytes: + # print(i) print("Decompress Success") diff --git a/qtensor/contraction_backends/compression.py 
b/qtensor/contraction_backends/compression.py index 4df06f3a..6bc09558 100644 --- a/qtensor/contraction_backends/compression.py +++ b/qtensor/contraction_backends/compression.py @@ -70,7 +70,7 @@ def process_bucket(self, bucket, no_sum=False): for c in t.data: cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = c import ctypes - p_decompressed_ptr = ctypes.addressof(cmp_bytes) + p_decompressed_ptr = ctypes.addressof(cmp_bytes[0]) # cast to int64 pointer # (effectively converting pointer to pointer to addr to pointer to int64) p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) @@ -97,7 +97,7 @@ def process_bucket(self, bucket, no_sum=False): for c in accum.data: cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = c import ctypes - p_decompressed_ptr = ctypes.addressof(cmp_bytes) + p_decompressed_ptr = ctypes.addressof(cmp_bytes[0]) # cast to int64 pointer # (effectively converting pointer to pointer to addr to pointer to int64) p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) From f53e6ed4aa03d3046ff3080b335ee1711b7f7a59 Mon Sep 17 00:00:00 2001 From: Milan Kartik Shah Date: Fri, 19 May 2023 10:48:18 -0400 Subject: [PATCH 082/126] Added cuSZp as compressor --- qtensor/compression/Compressor.py | 17 +- qtensor/compression/szp/include/cuSZp.h | 12 + qtensor/compression/szp/include/cuSZp_entry.h | 12 + qtensor/compression/szp/include/cuSZp_timer.h | 31 ++ .../compression/szp/include/cuSZp_utility.h | 14 + qtensor/compression/szp/src/cuSZp.cu | 393 ++++++++++++++ qtensor/compression/szp/src/cuSZp_entry.cu | 147 ++++++ qtensor/compression/szp/src/cuSZp_timer.cu | 31 ++ qtensor/compression/szp/src/cuSZp_utility.cu | 493 ++++++++++++++++++ qtensor/compression/szp/src/cuSZp_wrapper.cu | 36 ++ qtensor/compression/szp/src/cuSZp_wrapper.py | 189 +++++++ 11 files changed, 1369 insertions(+), 6 deletions(-) create mode 100644 qtensor/compression/szp/include/cuSZp.h create mode 100644 qtensor/compression/szp/include/cuSZp_entry.h create mode 100644 qtensor/compression/szp/include/cuSZp_timer.h create mode 100644 qtensor/compression/szp/include/cuSZp_utility.h create mode 100644 qtensor/compression/szp/src/cuSZp.cu create mode 100644 qtensor/compression/szp/src/cuSZp_entry.cu create mode 100644 qtensor/compression/szp/src/cuSZp_timer.cu create mode 100644 qtensor/compression/szp/src/cuSZp_utility.cu create mode 100644 qtensor/compression/szp/src/cuSZp_wrapper.cu create mode 100644 qtensor/compression/szp/src/cuSZp_wrapper.py diff --git a/qtensor/compression/Compressor.py b/qtensor/compression/Compressor.py index c6cfdc51..5f0123e3 100644 --- a/qtensor/compression/Compressor.py +++ b/qtensor/compression/Compressor.py @@ -5,10 +5,15 @@ print(Path(__file__).parent/'szx/src/') sys.path.append(str(Path(__file__).parent/'szx/src/')) sys.path.append('./szx/src') +sys.path.append(str(Path(__file__).parent/'szp/src/')) +sys.path.append('./szp/src') +#sys.path.append('/home/mkshah5/QTensor/qtensor/compression/szp/src') try: - from cuszx_wrapper import cuszx_host_compress, cuszx_host_decompress, cuszx_device_compress, cuszx_device_decompress + #from cuszx_wrapper import cuszx_host_compress, cuszx_host_decompress, cuszx_device_compress, cuszx_device_decompress + from cuSZp_wrapper import cuszp_device_compress, cuszp_device_decompress except: + print("import failed") # Silently fail on missing build of cuszx pass @@ -146,8 +151,8 @@ def compress_size(self, ptr): def decompress(self, obj): import cupy import ctypes - cmp_bytes, 
num_elements_eff, isCuPy, shape, dtype, _ = obj - decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, num_elements_eff, self, dtype) + cmp_bytes, num_elements_eff, isCuPy, shape, dtype, cmpsize = obj + decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, cmpsize, num_elements_eff, self, dtype) arr_cp = decompressed_ptr[0] self.decompressed_own.append(decompressed_ptr[1]) # -- Workaround to convert GPU pointer to int @@ -179,7 +184,7 @@ def cuszx_compress(self, isCuPy, data, num_elements, r2r_error, r2r_threshold): if not isCuPy: cmp_bytes, outSize_ptr = cuszx_host_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) else: - cmp_bytes, outSize_ptr = cuszx_device_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) + cmp_bytes, outSize_ptr = cuszp_device_compress(data, r2r_error, num_elements, r2r_threshold) return cmp_bytes, outSize_ptr ### Decompression API with cuSZx ### @@ -192,10 +197,10 @@ def cuszx_compress(self, isCuPy, data, num_elements, r2r_error, r2r_threshold): # # Notes: Use ctypes to cast decompressed data to Numpy or CuPy type - def cuszx_decompress(self, isCuPy, cmp_bytes, num_elements, owner, dtype): + def cuszx_decompress(self, isCuPy, cmp_bytes, cmpsize, num_elements, owner, dtype): if not isCuPy: decompressed_data = cuszx_host_decompress(num_elements, cmp_bytes) else: - decompressed_data = cuszx_device_decompress(num_elements, cmp_bytes, owner,dtype) + decompressed_data = cuszp_device_decompress(num_elements, cmp_bytes, cmpsize, owner,dtype) return decompressed_data diff --git a/qtensor/compression/szp/include/cuSZp.h b/qtensor/compression/szp/include/cuSZp.h new file mode 100644 index 00000000..0a168f34 --- /dev/null +++ b/qtensor/compression/szp/include/cuSZp.h @@ -0,0 +1,12 @@ +#ifndef CUSZP_INCLUDE_CUSZP_H +#define CUSZP_INCLUDE_CUSZP_H + +static const int cmp_tblock_size = 32; // 32 should be the best, not need to modify. +static const int dec_tblock_size = 32; // 32 should be the best, not need to modify. 
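Stepping back to the Compressor.py hunk above: cuSZx's decompress entry point only needs the element count, while the cuSZp entry points also take the compressed byte count, so the tuple slot that was previously discarded as "_" is now unpacked as cmpsize and threaded through cuszx_decompress. A minimal sketch of the updated flow, assuming the compressed object is the same 6-tuple that decompress() unpacks (the injected decompressor stands in for cuszp_device_decompress):

    def decompress(obj, owner, device_decompress):
        # obj layout mirrors the diff: (cmp_bytes, num_elements_eff, isCuPy, shape, dtype, cmpsize)
        cmp_bytes, num_elements_eff, is_cupy, shape, dtype, cmpsize = obj
        # cmpsize is forwarded so the backend knows how many compressed bytes to read
        return device_decompress(num_elements_eff, cmp_bytes, cmpsize, owner, dtype)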
+static const int cmp_chunk = 8192; +static const int dec_chunk = 8192; + +__global__ void SZp_compress_kernel(const float* const __restrict__ oriData, unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const float eb, const size_t nbEle); +__global__ void SZp_decompress_kernel(float* const __restrict__ decData, const unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const float eb, const size_t nbEle); + +#endif // CUSZP_INCLUDE_CUSZP_H \ No newline at end of file diff --git a/qtensor/compression/szp/include/cuSZp_entry.h b/qtensor/compression/szp/include/cuSZp_entry.h new file mode 100644 index 00000000..5acd97a5 --- /dev/null +++ b/qtensor/compression/szp/include/cuSZp_entry.h @@ -0,0 +1,12 @@ +#ifndef CUSZP_INCLUDE_CUSZP_ENTRY_H +#define CUSZP_INCLUDE_CUSZP_ENTRY_H + +#include + +void SZp_compress_hostptr(float* oriData, unsigned char* cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound); +void SZp_decompress_hostptr(float* decData, unsigned char* cmpBytes, size_t nbEle, size_t cmpSize, float errorBound); +extern "C" void SZp_compress_deviceptr(float* d_oriData, unsigned char* d_cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound, cudaStream_t stream = 0); +void SZp_dev_new(float* d_oriData, unsigned char* d_cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound, cudaStream_t stream = 0); +extern "C" void SZp_decompress_deviceptr(float* d_decData, unsigned char* d_cmpBytes, size_t nbEle, size_t cmpSize, float errorBound, cudaStream_t stream = 0); + +#endif // CUSZP_INCLUDE_CUSZP_ENTRY_H \ No newline at end of file diff --git a/qtensor/compression/szp/include/cuSZp_timer.h b/qtensor/compression/szp/include/cuSZp_timer.h new file mode 100644 index 00000000..faca61c3 --- /dev/null +++ b/qtensor/compression/szp/include/cuSZp_timer.h @@ -0,0 +1,31 @@ +#ifndef CUSZP_INCLUDE_CUSZP_TIMER_H +#define CUSZP_INCLUDE_CUSZP_TIMER_H + +#include +#include + +struct PrivateTimingGPU { + cudaEvent_t start; + cudaEvent_t stop; +}; + +class TimingGPU +{ + private: + PrivateTimingGPU *privateTimingGPU; + + public: + + TimingGPU(); + + ~TimingGPU(); + + void StartCounter(); + + void StartCounterFlags(); + + float GetCounter(); + +}; + +#endif // CUSZP_INCLUDE_CUSZP_TIMER_H \ No newline at end of file diff --git a/qtensor/compression/szp/include/cuSZp_utility.h b/qtensor/compression/szp/include/cuSZp_utility.h new file mode 100644 index 00000000..e698633f --- /dev/null +++ b/qtensor/compression/szp/include/cuSZp_utility.h @@ -0,0 +1,14 @@ +#ifndef CUSZP_INCLUDE_CUSZP_UTILITY_H +#define CUSZP_INCLUDE_CUSZP_UTILITY_H + +void symTransForm_4Bytes(unsigned char data[4]); +unsigned char *readByteData_Yafan(char *srcFilePath, size_t *byteLength, int *status); +float *readFloatData_systemEndian_Yafan(char *srcFilePath, size_t *nbEle, int *status); +float *readFloatData_Yafan(char *srcFilePath, size_t *nbEle, int *status); +void writeByteData_Yafan(unsigned char *bytes, size_t byteLength, char *tgtFilePath, int *status); +void writeFloatData_inBytes_Yafan(float *data, size_t nbEle, char* tgtFilePath, int *status); +double SSIM_3d_calcWindow_float(float* data, float* other, size_t size1, size_t size0, int offset0, int offset1, int offset2, int windowSize0, int windowSize1, int windowSize2); +double computeSSIM(float* oriData, float* decData, size_t size2, size_t size1, size_t size0); +double *computePSNR(size_t nbEle, float *ori_data, 
float *data); + +#endif // CUSZP_INCLUDE_CUSZP_UTILITY_H \ No newline at end of file diff --git a/qtensor/compression/szp/src/cuSZp.cu b/qtensor/compression/szp/src/cuSZp.cu new file mode 100644 index 00000000..c58cf21f --- /dev/null +++ b/qtensor/compression/szp/src/cuSZp.cu @@ -0,0 +1,393 @@ +#include "cuSZp.h" + +__device__ inline int quantization(float data, float recipPrecision) +{ + float dataRecip = data*recipPrecision; + int s = dataRecip>=-0.5f?0:1; + return (int)(dataRecip+0.5f) - s; +} + + +__device__ inline int get_bit_num(unsigned int x) +{ + return (sizeof(unsigned int)*8) - __clz(x); +} + + +__global__ void SZp_compress_kernel(const float* const __restrict__ oriData, unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const float eb, const size_t nbEle) +{ + __shared__ unsigned int base_idx; + + const int tid = threadIdx.x; + const int bid = blockIdx.x; + const int idx = bid * blockDim.x + tid; + const int lane = idx & 31; + const int warp = idx >> 5; + const int block_num = cmp_chunk/32; + const int rate_ofs = (nbEle+31)/32; + const float recipPrecision = 0.5f/eb; + + int base_start_idx; + int base_block_start_idx, base_block_end_idx; + int quant_chunk_idx; + int block_idx; + int currQuant, lorenQuant, prevQuant, maxQuant; + int absQuant[cmp_chunk]; + unsigned int sign_flag[block_num]; + int sign_ofs; + int fixed_rate[block_num]; + unsigned int thread_ofs = 0; + + // Prequantization + Lorenzo Prediction + Fixed-length encoding + store fixed-length to global memory. + base_start_idx = warp * cmp_chunk * 32; + for(int j=0; j absQuant[quant_chunk_idx] ? maxQuant : absQuant[quant_chunk_idx]; + } + + // Record block info. + fixed_rate[j] = get_bit_num(maxQuant); + thread_ofs += (fixed_rate[j]) ? (32+fixed_rate[j]*32) : 0; + // Write block fixed rate to compressed data. + if(block_idx= i) thread_ofs += tmp; + } + __syncthreads(); + + // Write warp(i.e. thread-block)-level prefix-sum to global-memory. + if(lane==31) + { + cmpOffset[warp+1] = (thread_ofs+7)/8; + if(warp==0) + flag[1] = 2; + else + flag[warp+1] = 1; + } + __syncthreads(); + + // Global-level prefix-sum (exclusive). + if(warp>0) + { + if(!lane) + { + int temp_flag = 1; + while(temp_flag!=2) temp_flag = flag[warp]; + __threadfence(); + cmpOffset[warp] += cmpOffset[warp-1]; + __threadfence(); + flag[warp+1] = 2; + } + } + else + { + if(!lane) cmpOffset[0] = 0; + } + __syncthreads(); + + // Assigning compression bytes by given prefix-sum results. + if(!lane) base_idx = cmpOffset[warp] + rate_ofs; + __syncthreads(); + + // Bit shuffle for each index, also storing data to global memory. + unsigned int base_cmp_byte_ofs = base_idx; + unsigned int cmp_byte_ofs; + unsigned int tmp_byte_ofs = 0; + unsigned int cur_byte_ofs = 0; + for(int j=0; j= i) tmp_byte_ofs += tmp; + } + unsigned int prev_thread = __shfl_up_sync(0xffffffff, tmp_byte_ofs, 1); + if(!lane) cmp_byte_ofs = base_cmp_byte_ofs + cur_byte_ofs; + else cmp_byte_ofs = base_cmp_byte_ofs + cur_byte_ofs + prev_thread; + + // Operation for each block, if zero block then do nothing. + if(fixed_rate[j]) + { + // Assign sign information for one block. + cmpData[cmp_byte_ofs++] = 0xff & (sign_flag[j] >> 24); + cmpData[cmp_byte_ofs++] = 0xff & (sign_flag[j] >> 16); + cmpData[cmp_byte_ofs++] = 0xff & (sign_flag[j] >> 8); + cmpData[cmp_byte_ofs++] = 0xff & sign_flag[j]; + + // Assign quant bit information for one block by bit-shuffle. 
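The front half of SZp_compress_kernel above prequantizes each value with the error bound, applies a one-dimensional Lorenzo (previous-value) predictor, and encodes every 32-value block with a fixed bit width taken from the largest absolute quantized delta: a zero block contributes nothing, any other block costs a 32-bit sign word plus fixed_rate bit planes of 32 bits. A rough NumPy sketch of that size accounting (the function is illustrative, not part of the patch):

    import numpy as np

    def block_cost_bits(block, eb):
        # block: 32 consecutive floats; prequantize (roughly round(x / (2*eb))),
        # then Lorenzo-predict as the delta against the previous quantized value.
        q = np.rint(np.asarray(block) / (2.0 * eb)).astype(np.int64)
        delta = np.diff(q, prepend=0)
        fixed_rate = int(np.abs(delta).max()).bit_length()     # get_bit_num(maxQuant)
        return 0 if fixed_rate == 0 else 32 + 32 * fixed_rate  # sign word + bit planes

A block whose values all sit below the error bound quantizes to zeros, gets fixed_rate 0, and occupies only its one-byte entry in the fixed-rate header.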
+ unsigned char tmp_char0, tmp_char1, tmp_char2, tmp_char3; + int mask = 1; + for(int i=0; i> i) << 7) | + (((absQuant[chunk_idx_start+1] & mask) >> i) << 6) | + (((absQuant[chunk_idx_start+2] & mask) >> i) << 5) | + (((absQuant[chunk_idx_start+3] & mask) >> i) << 4) | + (((absQuant[chunk_idx_start+4] & mask) >> i) << 3) | + (((absQuant[chunk_idx_start+5] & mask) >> i) << 2) | + (((absQuant[chunk_idx_start+6] & mask) >> i) << 1) | + (((absQuant[chunk_idx_start+7] & mask) >> i) << 0); + + // Get ith bit in 8~15 quant, and store to tmp_char1. + tmp_char1 = (((absQuant[chunk_idx_start+8] & mask) >> i) << 7) | + (((absQuant[chunk_idx_start+9] & mask) >> i) << 6) | + (((absQuant[chunk_idx_start+10] & mask) >> i) << 5) | + (((absQuant[chunk_idx_start+11] & mask) >> i) << 4) | + (((absQuant[chunk_idx_start+12] & mask) >> i) << 3) | + (((absQuant[chunk_idx_start+13] & mask) >> i) << 2) | + (((absQuant[chunk_idx_start+14] & mask) >> i) << 1) | + (((absQuant[chunk_idx_start+15] & mask) >> i) << 0); + + // Get ith bit in 16~23 quant, and store to tmp_char2. + tmp_char2 = (((absQuant[chunk_idx_start+16] & mask) >> i) << 7) | + (((absQuant[chunk_idx_start+17] & mask) >> i) << 6) | + (((absQuant[chunk_idx_start+18] & mask) >> i) << 5) | + (((absQuant[chunk_idx_start+19] & mask) >> i) << 4) | + (((absQuant[chunk_idx_start+20] & mask) >> i) << 3) | + (((absQuant[chunk_idx_start+21] & mask) >> i) << 2) | + (((absQuant[chunk_idx_start+22] & mask) >> i) << 1) | + (((absQuant[chunk_idx_start+23] & mask) >> i) << 0); + + // Get ith bit in 24-31 quant, and store to tmp_char3. + tmp_char3 = (((absQuant[chunk_idx_start+24] & mask) >> i) << 7) | + (((absQuant[chunk_idx_start+25] & mask) >> i) << 6) | + (((absQuant[chunk_idx_start+26] & mask) >> i) << 5) | + (((absQuant[chunk_idx_start+27] & mask) >> i) << 4) | + (((absQuant[chunk_idx_start+28] & mask) >> i) << 3) | + (((absQuant[chunk_idx_start+29] & mask) >> i) << 2) | + (((absQuant[chunk_idx_start+30] & mask) >> i) << 1) | + (((absQuant[chunk_idx_start+31] & mask) >> i) << 0); + + // Move data to global memory. + cmpData[cmp_byte_ofs++] = tmp_char0; + cmpData[cmp_byte_ofs++] = tmp_char1; + cmpData[cmp_byte_ofs++] = tmp_char2; + cmpData[cmp_byte_ofs++] = tmp_char3; + mask <<= 1; + } + } + + // Index updating across different iterations. + cur_byte_ofs += __shfl_sync(0xffffffff, tmp_byte_ofs, 31); + } +} + + + +__global__ void SZp_decompress_kernel(float* const __restrict__ decData, const unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const float eb, const size_t nbEle) +{ + __shared__ unsigned int base_idx; + + const int tid = threadIdx.x; + const int bid = blockIdx.x; + const int idx = bid * blockDim.x + tid; + const int lane = idx & 31; + const int warp = idx >> 5; + const int block_num = dec_chunk/32; + const int rate_ofs = (nbEle+31)/32; + + int base_start_idx; + int base_block_start_idx; + int block_idx; + int absQuant[32]; + int currQuant, lorenQuant, prevQuant; + int sign_ofs; + int fixed_rate[block_num]; + unsigned int thread_ofs = 0; + + // Obtain fixed rate information for each block. + for(int j=0; j= i) thread_ofs += tmp; + } + __syncthreads(); + + // Write warp(i.e. thread-block)-level prefix-sum to global-memory. + if(lane==31) + { + cmpOffset[warp+1] = (thread_ofs+7)/8; + if(warp==0) + flag[1] = 2; + else + flag[warp+1] = 1; + } + __syncthreads(); + + // Global-level prefix-sum (exclusive). 
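The bit-shuffle written out above stores one 4-byte plane per bit of the fixed rate: plane i holds bit i of each of the 32 quantized magnitudes, with value 0 landing in the most significant bit of the first byte. The same packing can be expressed compactly in NumPy (illustrative sketch only):

    import numpy as np

    def pack_block(abs_quant, fixed_rate):
        # abs_quant: 32 non-negative ints; emit fixed_rate planes of 32 bits (4 bytes) each
        out = bytearray()
        for i in range(fixed_rate):
            plane = (np.asarray(abs_quant, dtype=np.uint32) >> i) & 1   # bit i of every value
            out += np.packbits(plane.astype(np.uint8)).tobytes()        # first value -> MSB
        return bytes(out)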
+ if(warp>0) + { + if(!lane) + { + int temp_flag = 1; + while(temp_flag!=2) temp_flag = flag[warp]; + __threadfence(); + cmpOffset[warp] += cmpOffset[warp-1]; + __threadfence(); + flag[warp+1] = 2; + } + } + else + { + if(!lane) cmpOffset[0] = 0; + } + __syncthreads(); + + // Retrieving compression bytes and reconstruct decompression data. + if(!lane) base_idx = cmpOffset[warp] + rate_ofs; + __syncthreads(); + + // Restore bit-shuffle for each block. + unsigned int base_cmp_byte_ofs = base_idx; + unsigned int cmp_byte_ofs; + unsigned int tmp_byte_ofs = 0; + unsigned int cur_byte_ofs = 0; + base_start_idx = warp * dec_chunk * 32; + for(int j=0; j= i) tmp_byte_ofs += tmp; + } + unsigned int prev_thread = __shfl_up_sync(0xffffffff, tmp_byte_ofs, 1); + if(!lane) cmp_byte_ofs = base_cmp_byte_ofs + cur_byte_ofs; + else cmp_byte_ofs = base_cmp_byte_ofs + cur_byte_ofs + prev_thread; + + // Operation for each block, if zero block then do nothing. + if(fixed_rate[j]) + { + // Retrieve sign information for one block. + sign_flag = (0xff000000 & (cmpData[cmp_byte_ofs++] << 24)) | + (0x00ff0000 & (cmpData[cmp_byte_ofs++] << 16)) | + (0x0000ff00 & (cmpData[cmp_byte_ofs++] << 8)) | + (0x000000ff & cmpData[cmp_byte_ofs++]); + + // Retrieve quant data for one block. + unsigned char tmp_char0, tmp_char1, tmp_char2, tmp_char3; + for(int i=0; i<32; i++) absQuant[i] = 0; + for(int i=0; i> 7) & 0x00000001) << i; + absQuant[1] |= ((tmp_char0 >> 6) & 0x00000001) << i; + absQuant[2] |= ((tmp_char0 >> 5) & 0x00000001) << i; + absQuant[3] |= ((tmp_char0 >> 4) & 0x00000001) << i; + absQuant[4] |= ((tmp_char0 >> 3) & 0x00000001) << i; + absQuant[5] |= ((tmp_char0 >> 2) & 0x00000001) << i; + absQuant[6] |= ((tmp_char0 >> 1) & 0x00000001) << i; + absQuant[7] |= ((tmp_char0 >> 0) & 0x00000001) << i; + + // Get ith bit in 8~15 abs quant from global memory. + absQuant[8] |= ((tmp_char1 >> 7) & 0x00000001) << i; + absQuant[9] |= ((tmp_char1 >> 6) & 0x00000001) << i; + absQuant[10] |= ((tmp_char1 >> 5) & 0x00000001) << i; + absQuant[11] |= ((tmp_char1 >> 4) & 0x00000001) << i; + absQuant[12] |= ((tmp_char1 >> 3) & 0x00000001) << i; + absQuant[13] |= ((tmp_char1 >> 2) & 0x00000001) << i; + absQuant[14] |= ((tmp_char1 >> 1) & 0x00000001) << i; + absQuant[15] |= ((tmp_char1 >> 0) & 0x00000001) << i; + + // Get ith bit in 16-23 abs quant from global memory. + absQuant[16] |= ((tmp_char2 >> 7) & 0x00000001) << i; + absQuant[17] |= ((tmp_char2 >> 6) & 0x00000001) << i; + absQuant[18] |= ((tmp_char2 >> 5) & 0x00000001) << i; + absQuant[19] |= ((tmp_char2 >> 4) & 0x00000001) << i; + absQuant[20] |= ((tmp_char2 >> 3) & 0x00000001) << i; + absQuant[21] |= ((tmp_char2 >> 2) & 0x00000001) << i; + absQuant[22] |= ((tmp_char2 >> 1) & 0x00000001) << i; + absQuant[23] |= ((tmp_char2 >> 0) & 0x00000001) << i; + + // // Get ith bit in 24-31 abs quant from global memory. + absQuant[24] |= ((tmp_char3 >> 7) & 0x00000001) << i; + absQuant[25] |= ((tmp_char3 >> 6) & 0x00000001) << i; + absQuant[26] |= ((tmp_char3 >> 5) & 0x00000001) << i; + absQuant[27] |= ((tmp_char3 >> 4) & 0x00000001) << i; + absQuant[28] |= ((tmp_char3 >> 3) & 0x00000001) << i; + absQuant[29] |= ((tmp_char3 >> 2) & 0x00000001) << i; + absQuant[30] |= ((tmp_char3 >> 1) & 0x00000001) << i; + absQuant[31] |= ((tmp_char3 >> 0) & 0x00000001) << i; + } + + // Delorenzo and store data back to decompression data. 
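The decompress kernel mirrors this: after the sign word and the bit planes of a block are read back, the loop that follows re-applies the signs, inverts the Lorenzo predictor with a running sum, and scales by the quantization step 2*eb. In NumPy terms (illustrative sketch only):

    import numpy as np

    def decode_block(abs_quant, sign_flag, eb):
        # Bit (31 - i) of the 32-bit sign word belongs to value i of the block
        signs = np.where((sign_flag >> (31 - np.arange(32))) & 1, -1, 1)
        loren = signs * np.asarray(abs_quant, dtype=np.int64)
        quant = np.cumsum(loren)            # undo the previous-value predictor
        return quant * eb * 2.0             # decData[i] = currQuant * eb * 2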
+ prevQuant = 0; + for(int i=0; i<32; i++) + { + sign_ofs = i % 32; + if(sign_flag & (1 << (31 - sign_ofs))) + lorenQuant = absQuant[i] * -1; + else + lorenQuant = absQuant[i]; + currQuant = lorenQuant + prevQuant; + decData[base_block_start_idx+i] = currQuant * eb * 2; + prevQuant = currQuant; + } + } + + // Index updating across different iterations. + cur_byte_ofs += __shfl_sync(0xffffffff, tmp_byte_ofs, 31); + } +} \ No newline at end of file diff --git a/qtensor/compression/szp/src/cuSZp_entry.cu b/qtensor/compression/szp/src/cuSZp_entry.cu new file mode 100644 index 00000000..b92d4e41 --- /dev/null +++ b/qtensor/compression/szp/src/cuSZp_entry.cu @@ -0,0 +1,147 @@ +#include "cuSZp_entry.h" +#include "cuSZp.h" + +void SZp_compress_hostptr(float* oriData, unsigned char* cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound) +{ + // Data blocking. + int bsize = cmp_tblock_size; + int gsize = (nbEle + bsize * cmp_chunk - 1) / (bsize * cmp_chunk); + int cmpOffSize = gsize + 1; + int pad_nbEle = gsize * bsize * cmp_chunk; + + // Initializing global memory for GPU compression. + float* d_oriData; + unsigned char* d_cmpData; + unsigned int* d_cmpOffset; + int* d_flag; + unsigned int glob_sync; + cudaMalloc((void**)&d_oriData, sizeof(float)*pad_nbEle); + cudaMemcpy(d_oriData, oriData, sizeof(float)*pad_nbEle, cudaMemcpyHostToDevice); + cudaMalloc((void**)&d_cmpData, sizeof(float)*pad_nbEle); + cudaMalloc((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); + cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); + cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); + cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); + + // Initializing CUDA Stream. + cudaStream_t stream; + cudaStreamCreate(&stream); + + // cuSZp GPU compression. + dim3 blockSize(bsize); + dim3 gridSize(gsize); + SZp_compress_kernel<<>>(d_oriData, d_cmpData, d_cmpOffset, d_flag, errorBound, nbEle); + + // Obtain compression ratio and move data back to CPU. + cudaMemcpy(&glob_sync, d_cmpOffset+cmpOffSize-2, sizeof(unsigned int), cudaMemcpyDeviceToHost); + *cmpSize = (size_t)glob_sync + (nbEle+31)/32; + cudaMemcpy(cmpBytes, d_cmpData, *cmpSize*sizeof(unsigned char), cudaMemcpyDeviceToHost); + + // Free memory that is used. + cudaFree(d_oriData); + cudaFree(d_cmpData); + cudaFree(d_cmpOffset); + cudaFree(d_flag); + cudaStreamDestroy(stream); +} + + +void SZp_decompress_hostptr(float* decData, unsigned char* cmpBytes, size_t nbEle, size_t cmpSize, float errorBound) +{ + // Data blocking. + int bsize = dec_tblock_size; + int gsize = (nbEle + bsize * dec_chunk - 1) / (bsize * dec_chunk); + int cmpOffSize = gsize + 1; + int pad_nbEle = gsize * bsize * dec_chunk; + + // Initializing global memory for GPU compression. + float* d_decData; + unsigned char* d_cmpData; + unsigned int* d_cmpOffset; + int* d_flag; + cudaMalloc((void**)&d_decData, sizeof(float)*pad_nbEle); + cudaMemset(d_decData, 0, sizeof(float)*pad_nbEle); + cudaMalloc((void**)&d_cmpData, sizeof(float)*pad_nbEle); + cudaMemcpy(d_cmpData, cmpBytes, sizeof(unsigned char)*cmpSize, cudaMemcpyHostToDevice); + cudaMalloc((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); + cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); + cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); + cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); + + // Initializing CUDA Stream. + cudaStream_t stream; + cudaStreamCreate(&stream); + + // cuSZp GPU compression. 
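On the host side, SZp_compress_hostptr above derives everything from nbEle: 32-thread blocks, 8192 elements handled per thread, a padded element count, and a final compressed size equal to the payload bytes reported by the offset prefix sum plus one fixed-rate byte per 32-element block. The same arithmetic in Python (illustrative):

    def cuszp_geometry(nbEle, bsize=32, chunk=8192):
        gsize = (nbEle + bsize * chunk - 1) // (bsize * chunk)   # thread blocks to launch
        pad_nbEle = gsize * bsize * chunk                        # element count after padding
        rate_header_bytes = (nbEle + 31) // 32                   # one fixed-rate byte per block
        return gsize, pad_nbEle, rate_header_bytes

    # cmpSize = glob_sync (payload bytes from d_cmpOffset) + rate_header_bytes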
+ dim3 blockSize(bsize); + dim3 gridSize(gsize); + SZp_decompress_kernel<<>>(d_decData, d_cmpData, d_cmpOffset, d_flag, errorBound, nbEle); + + // Move data back to CPU. + cudaMemcpy(decData, d_decData, sizeof(float)*nbEle, cudaMemcpyDeviceToHost); + + // Free memoy that is used. + cudaFree(d_decData); + cudaFree(d_cmpData); + cudaFree(d_cmpOffset); + cudaFree(d_flag); + cudaStreamDestroy(stream); +} + + +void SZp_compress_deviceptr(float* d_oriData, unsigned char* d_cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound, cudaStream_t stream) +{ + // Data blocking. + int bsize = cmp_tblock_size; + int gsize = (nbEle + bsize * cmp_chunk - 1) / (bsize * cmp_chunk); + int cmpOffSize = gsize + 1; + + // Initializing global memory for GPU compression. + unsigned int* d_cmpOffset; + int* d_flag; + unsigned int glob_sync; + cudaMalloc((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); + cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); + cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); + cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); + + // cuSZp GPU compression. + dim3 blockSize(bsize); + dim3 gridSize(gsize); + SZp_compress_kernel<<>>(d_oriData, d_cmpBytes, d_cmpOffset, d_flag, errorBound, nbEle); + + // Obtain compression ratio and move data back to CPU. + cudaMemcpy(&glob_sync, d_cmpOffset+cmpOffSize-2, sizeof(unsigned int), cudaMemcpyDeviceToHost); + *cmpSize = (size_t)glob_sync + (nbEle+31)/32; + + // Free memory that is used. + cudaFree(d_cmpOffset); + cudaFree(d_flag); +} + + +void SZp_decompress_deviceptr(float* d_decData, unsigned char* d_cmpBytes, size_t nbEle, size_t cmpSize, float errorBound, cudaStream_t stream) +{ + // Data blocking. + int bsize = dec_tblock_size; + int gsize = (nbEle + bsize * dec_chunk - 1) / (bsize * dec_chunk); + int cmpOffSize = gsize + 1; + + // Initializing global memory for GPU compression. + unsigned int* d_cmpOffset; + int* d_flag; + cudaMalloc((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); + cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); + cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); + cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); + cudaMemset(d_decData, 0, sizeof(float)*nbEle); + + // cuSZp GPU compression. + dim3 blockSize(bsize); + dim3 gridSize(gsize); + SZp_decompress_kernel<<>>(d_decData, d_cmpBytes, d_cmpOffset, d_flag, errorBound, nbEle); + + // Free memoy that is used. 
+ cudaFree(d_cmpOffset); + cudaFree(d_flag); +} \ No newline at end of file diff --git a/qtensor/compression/szp/src/cuSZp_timer.cu b/qtensor/compression/szp/src/cuSZp_timer.cu new file mode 100644 index 00000000..74c81c30 --- /dev/null +++ b/qtensor/compression/szp/src/cuSZp_timer.cu @@ -0,0 +1,31 @@ +#include "cuSZp_timer.h" + +TimingGPU::TimingGPU() { privateTimingGPU = new PrivateTimingGPU; } + +TimingGPU::~TimingGPU() { } + +void TimingGPU::StartCounter() +{ + cudaEventCreate(&((*privateTimingGPU).start)); + cudaEventCreate(&((*privateTimingGPU).stop)); + cudaEventRecord((*privateTimingGPU).start,0); +} + +void TimingGPU::StartCounterFlags() +{ + int eventflags = cudaEventBlockingSync; + + cudaEventCreateWithFlags(&((*privateTimingGPU).start),eventflags); + cudaEventCreateWithFlags(&((*privateTimingGPU).stop),eventflags); + cudaEventRecord((*privateTimingGPU).start,0); +} + +// Gets the counter in ms +float TimingGPU::GetCounter() +{ + float time; + cudaEventRecord((*privateTimingGPU).stop, 0); + cudaEventSynchronize((*privateTimingGPU).stop); + cudaEventElapsedTime(&time,(*privateTimingGPU).start,(*privateTimingGPU).stop); + return time; +} diff --git a/qtensor/compression/szp/src/cuSZp_utility.cu b/qtensor/compression/szp/src/cuSZp_utility.cu new file mode 100644 index 00000000..784d378a --- /dev/null +++ b/qtensor/compression/szp/src/cuSZp_utility.cu @@ -0,0 +1,493 @@ +// +// Created by Yafan Huang on 5/31/22. +// Copied from SZx. +// +#include +#include +#include +#include +#include +#include "cuSZp_utility.h" + +/*Macro Definition for Processing Data*/ +// #define SZ_SCES 0 //successful +#define RW_SCES 0 +#define RW_FERR 1 +#define RW_TERR 2 +#define LITTLE_ENDIAN_SYSTEM 0 +#define QCAT_BUFS 64 + +/*Global Varaibles for Processing Data*/ +int dataEndianType_Yafan = 0; +int sysEndianType_Yafan = 0; //0 means little endian, 1 means big endian + +typedef union lint32 +{ + int ivalue; + unsigned int uivalue; + unsigned char byte[4]; +} lint32; + +typedef union llfloat +{ + float value; + unsigned int ivalue; + unsigned char byte[4]; +} llfloat; + +/** ************************************************************************ + * @brief Reverse 4-bit-length unsigned char array. + * + * @param data[4] 4-bit-length unsigned char array. + * *********************************************************************** */ +void symTransForm_4Bytes(unsigned char data[4]) +{ + unsigned char tmp = data[0]; + data[0] = data[3]; + data[3] = tmp; + + tmp = data[1]; + data[1] = data[2]; + data[2] = tmp; +} + +/** ************************************************************************ + * @brief Read byte data from path to source binary format file. + * Usually used for decompressing data from input file. + * Variables byteLength and status can be obtained through this function. + * + * @param srcFilePath input source file path + * @param byteLength the length of byte array + * @param status data processing states (macro definitions) + * + * @return byteBuf unsigned char array with length byteLength + * *********************************************************************** */ +unsigned char *readByteData_Yafan(char *srcFilePath, size_t *byteLength, int *status) +{ + FILE *pFile = fopen(srcFilePath, "rb"); + if (pFile == NULL) + { + printf("Failed to open input file. 
1\n"); + *status = RW_FERR; + return 0; + } + fseek(pFile, 0, SEEK_END); + *byteLength = ftell(pFile); + fclose(pFile); + + unsigned char *byteBuf = ( unsigned char *)malloc((*byteLength)*sizeof(unsigned char)); //sizeof(char)==1 + + pFile = fopen(srcFilePath, "rb"); + if (pFile == NULL) + { + printf("Failed to open input file. 2\n"); + *status = RW_FERR; + return 0; + } + fread(byteBuf, 1, *byteLength, pFile); + fclose(pFile); + *status = RW_SCES; + return byteBuf; +} + +/** ************************************************************************ + * @brief Read float data from path to source binary format file in endian systems. + * Usually used for compressing data from input file. + * Variables nbEle and status can be obtained through this function. + * + * @param srcFilePath input source file path + * @param nbEle the length of float array + * @param status data processing states (macro definitions) + * + * @return daBuf float array with length nbEle + * *********************************************************************** */ +float *readFloatData_systemEndian_Yafan(char *srcFilePath, size_t *nbEle, int *status) +{ + size_t inSize; + FILE *pFile = fopen(srcFilePath, "rb"); + if (pFile == NULL) + { + printf("Failed to open input file. 1\n"); + *status = RW_FERR; + return NULL; + } + fseek(pFile, 0, SEEK_END); + inSize = ftell(pFile); + *nbEle = inSize/4; + fclose(pFile); + + if(inSize<=0) + { + printf("Error: input file is wrong!\n"); + *status = RW_FERR; + } + + float *daBuf = (float *)malloc(inSize); + + pFile = fopen(srcFilePath, "rb"); + if (pFile == NULL) + { + printf("Failed to open input file. 2\n"); + *status = RW_FERR; + return NULL; + } + fread(daBuf, 4, *nbEle, pFile); + fclose(pFile); + *status = RW_SCES; + return daBuf; +} + +/** ************************************************************************ + * @brief Read float data from path to source binary format file. + * Usually used for compressing data from input file. + * Variables nbEle and status can be obtained through this function. + * + * @param srcFilePath input source file path + * @param nbEle the length of float array + * @param status data processing states (macro definitions) + * + * @return daBuf float array with length nbEle + * *********************************************************************** */ +float *readFloatData_Yafan(char *srcFilePath, size_t *nbEle, int *status) +{ + int state = RW_SCES; + if(dataEndianType_Yafan==sysEndianType_Yafan) + { + float *daBuf = readFloatData_systemEndian_Yafan(srcFilePath, nbEle, &state); + *status = state; + return daBuf; + } + else + { + size_t i,j; + + size_t byteLength; + unsigned char* bytes = readByteData_Yafan(srcFilePath, &byteLength, &state); + if(state == RW_FERR) + { + *status = RW_FERR; + return NULL; + } + float *daBuf = (float *)malloc(byteLength); + *nbEle = byteLength/4; + + llfloat buf; + for(i = 0;i<*nbEle;i++) + { + j = i*4; + memcpy(buf.byte, bytes+j, 4); + symTransForm_4Bytes(buf.byte); + daBuf[i] = buf.value; + } + free(bytes); + return daBuf; + } +} + +/** ************************************************************************ + * @brief Write byte data to binary format file. + * Usually used for writing compressed data. + * Variable status can be obtained/switched through this function. 
+ * + * @param bytes unsigned char array (compressed data) + * @param byteLength the length of unsigned char array + * @param tgtFilePath output file path + * @param status data processing states (macro definitions) + * *********************************************************************** */ +void writeByteData_Yafan(unsigned char *bytes, size_t byteLength, char *tgtFilePath, int *status) +{ + FILE *pFile = fopen(tgtFilePath, "wb"); + if (pFile == NULL) + { + printf("Failed to open input file. 3\n"); + *status = RW_FERR; + return; + } + + fwrite(bytes, 1, byteLength, pFile); //write outSize bytes + fclose(pFile); + *status = RW_SCES; +} + +/** ************************************************************************ + * @brief Write float data to binary format file. + * Usually used for writing decompressed (reconstructed) data. + * Variable status can be obtained/switched through this function. + * + * @param bytes unsigned char array (compressed data) + * @param nbEle the length of float array + * @param tgtFilePath output file path + * @param status data processing states (macro definitions) + * *********************************************************************** */ +void writeFloatData_inBytes_Yafan(float *data, size_t nbEle, char* tgtFilePath, int *status) +{ + size_t i = 0; + int state = RW_SCES; + llfloat buf; + unsigned char* bytes = (unsigned char*)malloc(nbEle*sizeof(float)); + for(i=0;idata[index]) + xMin=data[index]; + if(xMaxother[index]) + yMin=other[index]; + if(yMaxsize0) { + printf("ERROR: windowSize0 = %d > %zu\n", windowSize0, size0); + } + if(windowSize1>size1) { + printf("ERROR: windowSize1 = %d > %zu\n", windowSize1, size1); + } + if(windowSize2>size2) { + printf("ERROR: windowSize2 = %d > %zu\n", windowSize2, size2); + } + //offsetInc0=windowSize0/2; + //offsetInc1=windowSize1/2; + //offsetInc2=windowSize2/2; + offsetInc0=windowShift0; + offsetInc1=windowShift1; + offsetInc2=windowShift2; + for(offset2=0; offset2+windowSize2<=size2; offset2+=offsetInc2) { //MOVING WINDOW + for(offset1=0; offset1+windowSize1<=size1; offset1+=offsetInc1) { //MOVING WINDOW + for(offset0=0; offset0+windowSize0<=size0; offset0+=offsetInc0) { //MOVING WINDOW + nw++; + ssimSum+=SSIM_3d_calcWindow_float(oriData, decData, size1, size0, offset0, offset1, offset2, windowSize0, windowSize1, windowSize2); + } + } + } + return ssimSum/nw; +} + + +/** ************************************************************************ + * @brief Calculate PSNR between 3D original and decompressed (reconstructed) data. + * API for computing PSNR. + * + * @param nbEle the length of float array + * @param ori_data original float array + * @param dec_data decompressed (reconstructed) float array + * + * @return result 6-length double array, which contains: + * 0. *Mean Square Error (MSE)* + * 1. *Value Range (Max-Min)* + * 2. *Peak Signal-to-noise Ratio (PSNR)* + * 3. Squared Error + * 4. Normalized Squared Error + * 5. Normalized Squared MSE + * *********************************************************************** */ +double *computePSNR(size_t nbEle, float *ori_data, float *data) { + size_t i = 0; + double Max = 0, Min = 0, diffMax = 0; + Max = ori_data[0]; + Min = ori_data[0]; + diffMax = data[0] > ori_data[0] ? 
data[0] - ori_data[0] : ori_data[0] - data[0]; + + //diffMax = fabs(data[0] - ori_data[0]); + double sum1 = 0, sum2 = 0, sum22 = 0; + + for (i = 0; i < nbEle; i++) { + sum1 += ori_data[i]; + sum2 += data[i]; + sum22 += data[i] * data[i]; + } + double mean1 = sum1 / nbEle; + double mean2 = sum2 / nbEle; + + double sum3 = 0, sum4 = 0; + double sum = 0, prodSum = 0, relerr = 0; + + double maxpw_relerr = 0; + for (i = 0; i < nbEle; i++) { + if (Max < ori_data[i]) Max = ori_data[i]; + if (Min > ori_data[i]) Min = ori_data[i]; + + float err = fabs(data[i] - ori_data[i]); + if (ori_data[i] != 0) { + relerr = err / fabs(ori_data[i]); + if (maxpw_relerr < relerr) + maxpw_relerr = relerr; + } + + if (diffMax < err) + diffMax = err; + prodSum += (ori_data[i] - mean1) * (data[i] - mean2); + sum3 += (ori_data[i] - mean1) * (ori_data[i] - mean1); + sum4 += (data[i] - mean2) * (data[i] - mean2); + sum += err * err; + } + double std1 = sqrt(sum3 / nbEle); + double std2 = sqrt(sum4 / nbEle); + double ee = prodSum / nbEle; + double acEff = ee / std1 / std2; + + double mse = sum / nbEle; + double range = Max - Min; + double psnr = 20 * log10(range) - 10 * log10(mse); + double normErr = sqrt(sum); + double normErr_norm = normErr / sqrt(sum22); + double nrmse = sqrt(mse) / range; + double *result = (double *) malloc(sizeof(double) * 6); + result[0] = mse; + result[1] = range; + result[2] = psnr; + result[3] = normErr; + result[4] = normErr_norm; + result[5] = nrmse; + + return result; +} \ No newline at end of file diff --git a/qtensor/compression/szp/src/cuSZp_wrapper.cu b/qtensor/compression/szp/src/cuSZp_wrapper.cu new file mode 100644 index 00000000..b71bda71 --- /dev/null +++ b/qtensor/compression/szp/src/cuSZp_wrapper.cu @@ -0,0 +1,36 @@ +#include "cuSZp_entry.h" +#include "cuSZp_timer.h" +#include "cuSZp_utility.h" +#include "cuSZp.h" + + +extern "C"{ + /** Before entering SZp_compress, must allocate on device: + * - d_cmpBytes + */ + unsigned char* cuSZp_device_compress(float *oriData, size_t *outSize, float absErrBound, size_t nbEle){ + unsigned char *d_cmpBytes, *d_finalCmpBytes; + cudaStream_t stream; + cudaStreamCreate(&stream); + cudaMalloc((void**)&d_cmpBytes, sizeof(float)*nbEle); + SZp_compress_deviceptr(oriData, d_cmpBytes, nbEle, outSize, absErrBound, stream); + cudaMalloc((void**)&d_finalCmpBytes, *outSize); + cudaMemcpy(d_finalCmpBytes, d_cmpBytes, *outSize, cudaMemcpyDeviceToDevice); + cudaFree(d_cmpBytes); + + return d_finalCmpBytes; + } + + /** Before entering SZp_decompress, must allocate on device: + * - d_decData + */ + float* cuSZp_device_decompress(size_t nbEle, unsigned char* cmpBytes, size_t cmpSize, float errorBound){ + float *d_decData; + cudaStream_t stream; + cudaStreamCreate(&stream); + cudaMalloc((void**)&d_decData, sizeof(float)*nbEle); + SZp_decompress_deviceptr(d_decData, cmpBytes, nbEle, cmpSize, errorBound, stream); + return d_decData; + } + +} diff --git a/qtensor/compression/szp/src/cuSZp_wrapper.py b/qtensor/compression/szp/src/cuSZp_wrapper.py new file mode 100644 index 00000000..ef2d3272 --- /dev/null +++ b/qtensor/compression/szp/src/cuSZp_wrapper.py @@ -0,0 +1,189 @@ +import numpy as np +import ctypes +from ctypes import * +import random +from qtensor.tools.lazy_import import cupy as cp +import time +import torch + +from pathlib import Path +LIB_PATH = str(Path(__file__).parent/'libcuszp_wrapper.so') + +# unsigned char* cuSZp_device_compress(float *oriData, size_t *outSize, float absErrBound, size_t nbEle){ + +def get_device_compress(): + dll = 
ctypes.CDLL(LIB_PATH, mode=ctypes.RTLD_GLOBAL) + func = dll.cuSZp_device_compress + # Returns: unsigned char *bytes + # Needs: float *oriData, size_t *outSize, float absErrBound, size_t nbEle, int blockSize, float threshold + func.argtypes = [POINTER(c_float), POINTER(c_size_t), c_float, c_size_t] + func.restype = POINTER(c_ubyte) + return func + +# float* cuSZp_device_decompress(size_t nbEle, unsigned char* cmpBytes, size_t cmpSize, float errorBound){ + +def get_device_decompress(): + dll = ctypes.CDLL(LIB_PATH, mode=ctypes.RTLD_GLOBAL) + func = dll.cuSZp_device_decompress + # Returns: float *newData + # Needs: size_t nbEle, unsigned char *cmpBytes + func.argtypes = [c_size_t, POINTER(c_ubyte), c_size_t, c_float] + func.restype = POINTER(c_float) + return func + + + +def cuszp_device_compress(oriData, absErrBound, nbEle,threshold): + __cuszp_device_compress = get_device_compress() + + ori_nbEle = nbEle + variable = ctypes.c_size_t(0) + outSize = ctypes.pointer(variable) + + oriData = oriData.flatten() + ori_real = oriData.real + ori_imag = oriData.imag + oriData = cp.concatenate((ori_real, ori_imag)) + sample = oriData[::2] + + + d = cp.amax(oriData) - cp.amin(oriData) + #print("max min time (s): " +str(time.time()-v_time)) + d = d.get() + if d.dtype == np.complex64: + #d = min(d.real, d.imag) + d = d.real + absErrBound = absErrBound*(d) + threshold = threshold*(d) + s_1 = time.time() + #print(cp.get_array_module(oriData)) + truth_values = cp.absolute(oriData)<=threshold + oriData[truth_values] = 0.0 + truth_values = cp.invert(truth_values) + oriData = oriData[truth_values] + bitmap = truth_values + nbEle = oriData.shape[0] + + + oriData_p = ctypes.cast(oriData.data.ptr, ctypes.POINTER(c_float)) + #print("starting") + # float *oriData, size_t *outSize, float absErrBound, size_t nbEle + o_bytes = __cuszp_device_compress(oriData_p, outSize,np.float32(absErrBound), np.ulonglong(nbEle)) + + mempool = cp.get_default_memory_pool() + pinned_mempool = cp.get_default_pinned_memory_pool() + del oriData + + #print("tg and max time (s): "+str(time.time()-s_1)) + #print("bitmap shape: "+str(bitmap.shape[0])) + #print("percent nonzero bytes: "+str(bitmap[cp.nonzero(bitmap)].shape[0]/bitmap.shape[0])) + #print("CR") + print((ori_nbEle*4)/(outSize[0] + bitmap.shape[0]/8)) + return (o_bytes,bitmap, absErrBound), outSize + + +def cuszp_device_decompress(nbEle, cmpBytes, cmpSize, owner, dtype): + __cuszp_device_decompress=get_device_decompress() + (cmpBytes, bitmap, absErrBound) = cmpBytes + #print("bitmap len:" +str(len(bitmap))) + #print(nbEle) + #tmp_nbEle = nbEle + tmp_nbEle = cp.count_nonzero(bitmap).item() +# print(tmp_nbEle) + nbEle_p = ctypes.c_size_t(tmp_nbEle) + # size_t nbEle, unsigned char* cmpBytes, size_t cmpSize, float errorBound + newData = __cuszp_device_decompress(nbEle_p,cmpBytes, np.ulonglong(cmpSize), np.float32(absErrBound)) + + # decompressed_ptr = self.cuszp_decompress(isCuPy, cmp_bytes, num_elements_eff) + # -- Workaround to convert GPU pointer to int + p_decompressed_ptr = ctypes.addressof(newData) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + decompressed_int = p_decompressed_int.contents + # -- + pointer_for_free = decompressed_int.value + # self.decompressed_own.append(decompressed_int.value) + mem = cp.cuda.UnownedMemory(decompressed_int.value, tmp_nbEle, owner, device_id=0) + mem_ptr = cp.cuda.memory.MemoryPointer(mem, 0) + 
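The pointer handling above, ctypes pointer to integer address to UnownedMemory to MemoryPointer to ndarray, is what lets CuPy adopt the buffer that the CUDA wrapper allocated. A self-contained form of that workaround (the helper name is ours; note that UnownedMemory takes its size in bytes):

    import ctypes
    import numpy as np
    import cupy as cp

    def wrap_device_floats(c_float_ptr, n_floats, owner):
        # Read the device address stored inside the ctypes pointer object
        addr = ctypes.cast(ctypes.addressof(c_float_ptr),
                           ctypes.POINTER(ctypes.c_uint64)).contents.value
        mem = cp.cuda.UnownedMemory(addr, n_floats * 4, owner, device_id=0)
        arr = cp.ndarray((n_floats,), dtype=np.float32,
                         memptr=cp.cuda.memory.MemoryPointer(mem, 0))
        return arr, addr   # keep addr so the buffer can later be freed explicitly

Returning the raw address alongside the array is the same idea as pointer_for_free in the wrappers: CuPy does not own the allocation, so it has to be released by hand once the decompressed data has been consumed.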
#print("mem ptr") + #print(mem_ptr) + arr = cp.ndarray(shape=tmp_nbEle, dtype=cp.float32, memptr=mem_ptr) +# print("attempt alloc") + res = cp.zeros(nbEle,dtype=cp.float32) +# print("alloc passed") + ## need to convert newData to cupy + cp.putmask(res,bitmap,arr) + mempool = cp.get_default_memory_pool() + pinned_mempool = cp.get_default_pinned_memory_pool() + #del arr + + #print(res[0]) + #print(res[int(nbEle/2)]) + #reshaped_data = arr.reshape(-1,2) + reshaped_data = res.reshape(-1,2) + + c_res = reshaped_data.view(dtype=cp.complex64) + #print(c_res[0]) + #c_res = cp.zeros(int(nbEle/2), np.complex64) + #c_res.real = res[0:int(nbEle/2)] + #c_res.imag = res[int(nbEle/2):] + #del res + del bitmap + mempool.free_all_blocks() + pinned_mempool.free_all_blocks() + + return (c_res, pointer_for_free) + +### Example of device compress/decompress wrapper usage +class Comp(): + def __init__(self): + self.name = "dummy" + +if __name__ == "__main__": + + DATA_SIZE = int(1024) + MAX_D = 10.0 + MIN_D = -10.0 + RANGE = MAX_D - MIN_D + r2r_threshold = 0.002 + r2r_error = 0.0001 + + in_vector = np.fromfile("real_sample.bin", dtype=np.float32) + #print(np.max(in_vector)) + DATA_SIZE = len(in_vector) + #range_vr = np.max(in_vector)-np.min(in_vector) + #r2r_threshold = r2r_threshold*range_vr + #r2r_error = r2r_error*range_vr + #in_vector = np.zeros((DATA_SIZE,)) + #for i in range(0,int(DATA_SIZE/4)): + # in_vector[i] = 0.0 + #for i in range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): + # in_vector[i] = 5.0 + #for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): + # in_vector[i] = random.uniform(MIN_D, MAX_D) + #for i in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): + # in_vector[i] = -7.0 + #for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): + # in_vector[i] = 0.001 + + print(DATA_SIZE) + in_vector = in_vector.astype('float32') + in_vector_gpu = cp.asarray(in_vector) + + # variable = ctypes.c_size_t(0) + # outSize = ctypes.pointer(variable) + s_time = time.time() + o_bytes, outSize = cuszp_device_compress(in_vector_gpu, r2r_error, DATA_SIZE,r2r_threshold) + print("Time python: "+str(time.time()-s_time)) + print(outSize[0]) + print("Compress Success...starting decompress ") + comp = Comp() + + s_time = time.time() + (d_bytes,ptr )= cuszp_device_decompress(DATA_SIZE, o_bytes,outSize[0], comp, in_vector_gpu.dtype) + + print("Time python: "+str(time.time()-s_time)) + #for i in d_bytes: + # print(i) + print("Decompress Success") From f2a4305dd980113b8b0b98631cb3e1073eacf430 Mon Sep 17 00:00:00 2001 From: Milan Kartik Shah Date: Mon, 22 May 2023 17:12:11 -0400 Subject: [PATCH 083/126] Revert to SZx compression --- qtensor/compression/Compressor.py | 16 +++++--- qtensor/compression/szx/src/cuszx_wrapper.py | 42 ++++++++++++-------- 2 files changed, 36 insertions(+), 22 deletions(-) diff --git a/qtensor/compression/Compressor.py b/qtensor/compression/Compressor.py index 5f0123e3..bec75b0b 100644 --- a/qtensor/compression/Compressor.py +++ b/qtensor/compression/Compressor.py @@ -7,10 +7,10 @@ sys.path.append('./szx/src') sys.path.append(str(Path(__file__).parent/'szp/src/')) sys.path.append('./szp/src') -#sys.path.append('/home/mkshah5/QTensor/qtensor/compression/szp/src') - +sys.path.append('/home/mkshah5/QTensor/qtensor/compression/szp/src') +sys.path.append('/home/mkshah5/QTensor/qtensor/compression/szx/src') try: - #from cuszx_wrapper import cuszx_host_compress, cuszx_host_decompress, cuszx_device_compress, cuszx_device_decompress + from cuszx_wrapper import cuszx_host_compress, cuszx_host_decompress, 
cuszx_device_compress, cuszx_device_decompress from cuSZp_wrapper import cuszp_device_compress, cuszp_device_decompress except: print("import failed") @@ -154,7 +154,8 @@ def decompress(self, obj): cmp_bytes, num_elements_eff, isCuPy, shape, dtype, cmpsize = obj decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, cmpsize, num_elements_eff, self, dtype) arr_cp = decompressed_ptr[0] - self.decompressed_own.append(decompressed_ptr[1]) + #self.decompressed_own.append(decompressed_ptr[1]) + # -- Workaround to convert GPU pointer to int # p_decompressed_ptr = ctypes.addressof(decompressed_ptr) # # cast to int64 pointer @@ -184,7 +185,9 @@ def cuszx_compress(self, isCuPy, data, num_elements, r2r_error, r2r_threshold): if not isCuPy: cmp_bytes, outSize_ptr = cuszx_host_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) else: - cmp_bytes, outSize_ptr = cuszp_device_compress(data, r2r_error, num_elements, r2r_threshold) + #cmp_bytes, outSize_ptr = cuszp_device_compress(data, r2r_error, num_elements, r2r_threshold) + + cmp_bytes, outSize_ptr = cuszx_device_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) return cmp_bytes, outSize_ptr ### Decompression API with cuSZx ### @@ -201,6 +204,7 @@ def cuszx_decompress(self, isCuPy, cmp_bytes, cmpsize, num_elements, owner, dtyp if not isCuPy: decompressed_data = cuszx_host_decompress(num_elements, cmp_bytes) else: - decompressed_data = cuszp_device_decompress(num_elements, cmp_bytes, cmpsize, owner,dtype) + #decompressed_data = cuszp_device_decompress(num_elements, cmp_bytes, cmpsize, owner,dtype) + decompressed_data = cuszx_device_decompress(num_elements, cmp_bytes, owner,dtype) return decompressed_data diff --git a/qtensor/compression/szx/src/cuszx_wrapper.py b/qtensor/compression/szx/src/cuszx_wrapper.py index 388bd7ab..28cae69d 100644 --- a/qtensor/compression/szx/src/cuszx_wrapper.py +++ b/qtensor/compression/szx/src/cuszx_wrapper.py @@ -7,8 +7,8 @@ import torch from pathlib import Path -LIB_PATH = str(Path(__file__).parent/'libcuszx_wrapper.so') - +#LIB_PATH = str(Path(__file__).parent/'libcuszx_wrapper.so') +LIB_PATH='/home/mkshah5/QTensor/qtensor/compression/szx/src/libcuszx_wrapper.so' # unsigned char* cuSZx_integrated_compress(float *data, float r2r_threshold, float r2r_err, size_t nbEle, int blockSize, size_t *outSize) def get_host_compress(): @@ -161,6 +161,13 @@ class Comp(): def __init__(self): self.name = "dummy" +def free_compressed(ptr): + p_ptr = ctypes.addressof(ptr) + p_int = ctypes.cast(p_ptr, ctypes.POINTER(ctypes.c_uint64)) + decomp_int = p_int.contents + cp.cuda.runtime.free(decomp_int.value) + + if __name__ == "__main__": DATA_SIZE = int(1024) @@ -170,7 +177,7 @@ def __init__(self): r2r_threshold = 0.002 r2r_error = 0.0001 - in_vector = np.fromfile("real_sample.bin", dtype=np.float32) + in_vector = np.fromfile("all_sample.bin", dtype=np.complex64) #print(np.max(in_vector)) DATA_SIZE = len(in_vector) #range_vr = np.max(in_vector)-np.min(in_vector) @@ -189,22 +196,25 @@ def __init__(self): # in_vector[i] = 0.001 print(DATA_SIZE) - in_vector = in_vector.astype('float32') + #in_vector = in_vector.astype('float32') in_vector_gpu = cp.asarray(in_vector) # variable = ctypes.c_size_t(0) # outSize = ctypes.pointer(variable) - s_time = time.time() - o_bytes, outSize = cuszx_device_compress(in_vector_gpu, r2r_error, DATA_SIZE, 256, r2r_threshold) - print("Time python: "+str(time.time()-s_time)) - print(outSize[0]) - print("Compress Success...starting decompress ") - comp = Comp() - - s_time = 
time.time() - (d_bytes,ptr )= cuszx_device_decompress(DATA_SIZE, o_bytes, comp, in_vector_gpu.dtype) - - print("Time python: "+str(time.time()-s_time)) + for i in range(200): + s_time = time.time() + o_bytes, outSize = cuszx_device_compress(in_vector_gpu, r2r_error, DATA_SIZE, 256, r2r_threshold) + print("Time python: "+str(time.time()-s_time)) + print(outSize[0]) + print("Compress Success...starting decompress ") + comp = Comp() + + s_time = time.time() + (d_bytes,ptr )= cuszx_device_decompress(DATA_SIZE*2, o_bytes, comp, in_vector_gpu.dtype) + + free_compressed(o_bytes[0]) + cp.cuda.runtime.free(ptr) + print("Time python: "+str(time.time()-s_time)) #for i in d_bytes: # print(i) - print("Decompress Success") + print("Decompress Success") From 1252c2be71dfc15004c1e8a8cf0927ec1653d472 Mon Sep 17 00:00:00 2001 From: Milan Shah Date: Mon, 22 May 2023 17:25:08 -0400 Subject: [PATCH 084/126] Fix lib paths for Compressor --- qtensor/compression/Compressor.py | 4 ++-- qtensor/compression/szx/src/cuszx_wrapper.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/qtensor/compression/Compressor.py b/qtensor/compression/Compressor.py index bec75b0b..a7a73e93 100644 --- a/qtensor/compression/Compressor.py +++ b/qtensor/compression/Compressor.py @@ -7,8 +7,8 @@ sys.path.append('./szx/src') sys.path.append(str(Path(__file__).parent/'szp/src/')) sys.path.append('./szp/src') -sys.path.append('/home/mkshah5/QTensor/qtensor/compression/szp/src') -sys.path.append('/home/mkshah5/QTensor/qtensor/compression/szx/src') +# sys.path.append('/home/mkshah5/QTensor/qtensor/compression/szp/src') +# sys.path.append('/home/mkshah5/QTensor/qtensor/compression/szx/src') try: from cuszx_wrapper import cuszx_host_compress, cuszx_host_decompress, cuszx_device_compress, cuszx_device_decompress from cuSZp_wrapper import cuszp_device_compress, cuszp_device_decompress diff --git a/qtensor/compression/szx/src/cuszx_wrapper.py b/qtensor/compression/szx/src/cuszx_wrapper.py index 28cae69d..11e81223 100644 --- a/qtensor/compression/szx/src/cuszx_wrapper.py +++ b/qtensor/compression/szx/src/cuszx_wrapper.py @@ -7,8 +7,8 @@ import torch from pathlib import Path -#LIB_PATH = str(Path(__file__).parent/'libcuszx_wrapper.so') -LIB_PATH='/home/mkshah5/QTensor/qtensor/compression/szx/src/libcuszx_wrapper.so' +LIB_PATH = str(Path(__file__).parent/'libcuszx_wrapper.so') +# LIB_PATH='/home/mkshah5/QTensor/qtensor/compression/szx/src/libcuszx_wrapper.so' # unsigned char* cuSZx_integrated_compress(float *data, float r2r_threshold, float r2r_err, size_t nbEle, int blockSize, size_t *outSize) def get_host_compress(): From 3894220d35704cfe1d677238b4c1121a290d7cd3 Mon Sep 17 00:00:00 2001 From: Milan Kartik Shah Date: Tue, 6 Jun 2023 10:39:37 -0400 Subject: [PATCH 085/126] Added cuSZ base compressor with only threshold --- qtensor/compression/Compressor.py | 14 +- .../compression/cusz/include/cli/analyzer.hh | 278 ++++ .../compression/cusz/include/cli/document.hh | 272 ++++ .../cusz/include/cli/quality_viewer.hh | 163 +++ qtensor/compression/cusz/include/cli/query.hh | 71 + .../compression/cusz/include/cli/query_dev.hh | 69 + .../cusz/include/cli/timerecord_viewer.hh | 109 ++ .../compression/cusz/include/cli/verify.hh | 87 ++ qtensor/compression/cusz/include/common.hh | 19 + .../cusz/include/common/capsule.hh | 402 ++++++ .../cusz/include/common/configs.hh | 354 +++++ .../cusz/include/common/definition.hh | 66 + .../cusz/include/common/type_traits.hh | 108 ++ .../compression/cusz/include/compaction.hh | 18 + 
qtensor/compression/cusz/include/component.hh | 19 + .../cusz/include/component/glue.cuh | 120 ++ .../component/pred_boilerplate_deprecated.hh | 210 +++ .../cusz/include/component/prediction.inl | 193 +++ .../cusz/include/component/spcodec.inl | 218 +++ .../compression/cusz/include/compressor.hh | 165 +++ qtensor/compression/cusz/include/context.hh | 251 ++++ qtensor/compression/cusz/include/cusz.h | 60 + .../compression/cusz/include/cusz/custom.h | 26 + qtensor/compression/cusz/include/cusz/it.hh | 78 ++ qtensor/compression/cusz/include/cusz/nd.h | 15 + qtensor/compression/cusz/include/cusz/pn.hh | 49 + .../compression/cusz/include/cusz/record.h | 38 + qtensor/compression/cusz/include/cusz/type.h | 219 +++ qtensor/compression/cusz/include/framework.hh | 62 + qtensor/compression/cusz/include/header.h | 111 ++ qtensor/compression/cusz/include/hf/hf.hh | 170 +++ .../compression/cusz/include/hf/hf_bookg.hh | 45 + .../compression/cusz/include/hf/hf_codecg.hh | 82 ++ .../compression/cusz/include/hf/hf_struct.h | 53 + .../cusz/include/kernel/claunch_cuda.h | 49 + .../cusz/include/kernel/cpplaunch_cuda.hh | 51 + .../cusz/include/kernel/dryrun.cuh | 47 + .../cusz/include/kernel/launch_prototype.cuh | 0 .../cusz/include/kernel/launch_spm.cuh | 348 +++++ .../cusz/include/kernel/lorenzo_all.h | 44 + .../cusz/include/kernel/lorenzo_all.hh | 96 ++ .../compression/cusz/include/kernel/spv_gpu.h | 42 + .../cusz/include/kernel/spv_gpu.hh | 33 + .../cusz/include/kernel/v2_lorenzo.hh | 32 + .../cusz/include/pipeline/compaction_g.inl | 73 + .../cusz/include/pipeline/v2_compressor.hh | 146 ++ .../compression/cusz/include/stat/compare.h | 57 + .../cusz/include/stat/compare_cpu.hh | 62 + .../cusz/include/stat/compare_gpu.hh | 33 + qtensor/compression/cusz/include/stat/stat.h | 29 + qtensor/compression/cusz/include/stat/stat.hh | 15 + .../compression/cusz/include/stat/stat_g.hh | 44 + qtensor/compression/cusz/include/utils.hh | 21 + .../cusz/include/utils/cuda_err.cuh | 185 +++ .../cusz/include/utils/cuda_mem.cuh | 100 ++ .../cusz/include/utils/cusparse_err.cuh | 60 + .../compression/cusz/include/utils/format.hh | 57 + qtensor/compression/cusz/include/utils/io.hh | 59 + .../cusz/include/utils/print_gpu.h | 45 + .../cusz/include/utils/print_gpu.hh | 21 + .../cusz/include/utils/strhelper.hh | 144 ++ .../compression/cusz/include/utils/timer.h | 92 ++ .../compression/cusz/include/utils/timer.hh | 153 ++ qtensor/compression/cusz/src/cli/cli.cu | 14 + qtensor/compression/cusz/src/cli/cli.cuh | 195 +++ .../compression/cusz/src/cli/dryrun_part.cu | 17 + .../compression/cusz/src/cli/dryrun_part.cuh | 196 +++ qtensor/compression/cusz/src/cli_bin.cu | 27 + qtensor/compression/cusz/src/compressor.cc | 149 ++ qtensor/compression/cusz/src/context.cc | 493 +++++++ qtensor/compression/cusz/src/cusz/custom.cc | 34 + qtensor/compression/cusz/src/cusz_lib.cc | 115 ++ .../compression/cusz/src/cusz_version.h.in | 3 + qtensor/compression/cusz/src/cusz_wrapper.cu | 154 ++ qtensor/compression/cusz/src/cusz_wrapper.py | 173 +++ .../cusz/src/detail/compare_cpu.inl | 109 ++ .../cusz/src/detail/compare_gpu.inl | 193 +++ .../cusz/src/detail/compressor_impl.cu | 18 + .../cusz/src/detail/compressor_impl.inl | 479 +++++++ qtensor/compression/cusz/src/detail/spmat.cu | 14 + .../compression/cusz/src/detail/spv_gpu.inl | 77 + qtensor/compression/cusz/src/detail/spvec.cu | 18 + .../cusz/src/experimental/Makefile | 7 + .../src/experimental/dpcpp_demo_lorenzo.cu | 120 ++ .../cusz/src/hf/detail/hf_bookg.inl | 742 ++++++++++ 
.../cusz/src/hf/detail/hf_codecg.inl | 296 ++++ .../cusz/src/hf/detail/hf_pimpl.inl | 364 +++++ .../cusz/src/hf/detail/par_merge.inl | 445 ++++++ qtensor/compression/cusz/src/hf/hf.cc | 109 ++ qtensor/compression/cusz/src/hf/hf_bookg.cu | 33 + qtensor/compression/cusz/src/hf/hf_codecg.cu | 269 ++++ qtensor/compression/cusz/src/hf/hf_pimpl.cu | 31 + .../cusz/src/kernel/claunch_cuda.cu | 76 + .../cusz/src/kernel/detail/hist.inl | 100 ++ .../cusz/src/kernel/detail/lorenzo.inl | 816 +++++++++++ .../cusz/src/kernel/detail/lorenzo23.inl | 1237 +++++++++++++++++ .../cusz/src/kernel/detail/lorenzo_proto.inl | 214 +++ .../cusz/src/kernel/detail/lorenzo_serial.inl | 326 +++++ .../cusz/src/kernel/detail/lorenzo_var.inl | 530 +++++++ .../cusz/src/kernel/detail/spline3.inl | 746 ++++++++++ .../cusz/src/kernel/detail/subroutine.inl | 1074 ++++++++++++++ .../cusz/src/kernel/detail/subsub.inl | 92 ++ .../compression/cusz/src/kernel/lorenzo.cu | 209 +++ .../cusz/src/kernel/lorenzo_proto.cu | 176 +++ .../cusz/src/kernel/lorenzo_serial.cc | 118 ++ .../cusz/src/kernel/lorenzo_var.cu | 206 +++ .../cusz/src/kernel/preprocess.cuh | 65 + qtensor/compression/cusz/src/kernel/rle.cuh | 74 + .../compression/cusz/src/kernel/spv_gpu.cu | 60 + .../compression/cusz/src/kernel/v2_lorenzo.cu | 118 ++ .../cusz/src/pipeline/v2_compressor.cc | 112 ++ .../cusz/src/pipeline/v2_compressor_impl.cu | 15 + .../cusz/src/pipeline/v2_compressor_impl.inl | 239 ++++ qtensor/compression/cusz/src/stat/cmpg1_1.cu | 30 + qtensor/compression/cusz/src/stat/cmpg1_2.cu | 30 + qtensor/compression/cusz/src/stat/cmpg1_3.cu | 30 + qtensor/compression/cusz/src/stat/cmpg1_4.cu | 30 + qtensor/compression/cusz/src/stat/cmpg1_5.cu | 30 + qtensor/compression/cusz/src/stat/cmpg2.cu | 34 + qtensor/compression/cusz/src/stat/cmpg3.cu | 32 + qtensor/compression/cusz/src/stat/cmpg4_1.cu | 24 + qtensor/compression/cusz/src/stat/cmpg4_2.cu | 25 + qtensor/compression/cusz/src/stat/cmpg4_3.cu | 24 + qtensor/compression/cusz/src/stat/cmpg4_4.cu | 25 + .../compression/cusz/src/stat/compare_cpu.cc | 43 + qtensor/compression/cusz/src/stat/stat.cc | 0 qtensor/compression/cusz/src/stat/stat_g.cu | 96 ++ .../compression/cusz/src/utils/dbg_print.cuh | 132 ++ .../compression/cusz/src/utils/print_gpu.cu | 121 ++ .../compression/cusz/src/utils/timer_cpu.cc | 30 + .../compression/cusz/src/utils/timer_gpu.cu | 82 ++ .../compression/cusz/src/utils/vis_stat.hh | 137 ++ qtensor/compression/szp/src/cuSZp_entry.cu | 8 +- qtensor/compression/szp/src/cuSZp_wrapper.cu | 5 +- qtensor/compression/szp/src/cuSZp_wrapper.py | 69 +- 135 files changed, 18841 insertions(+), 45 deletions(-) create mode 100644 qtensor/compression/cusz/include/cli/analyzer.hh create mode 100644 qtensor/compression/cusz/include/cli/document.hh create mode 100644 qtensor/compression/cusz/include/cli/quality_viewer.hh create mode 100644 qtensor/compression/cusz/include/cli/query.hh create mode 100644 qtensor/compression/cusz/include/cli/query_dev.hh create mode 100644 qtensor/compression/cusz/include/cli/timerecord_viewer.hh create mode 100644 qtensor/compression/cusz/include/cli/verify.hh create mode 100644 qtensor/compression/cusz/include/common.hh create mode 100644 qtensor/compression/cusz/include/common/capsule.hh create mode 100644 qtensor/compression/cusz/include/common/configs.hh create mode 100644 qtensor/compression/cusz/include/common/definition.hh create mode 100644 qtensor/compression/cusz/include/common/type_traits.hh create mode 100644 qtensor/compression/cusz/include/compaction.hh create mode 100644 
qtensor/compression/cusz/include/component.hh create mode 100644 qtensor/compression/cusz/include/component/glue.cuh create mode 100644 qtensor/compression/cusz/include/component/pred_boilerplate_deprecated.hh create mode 100644 qtensor/compression/cusz/include/component/prediction.inl create mode 100644 qtensor/compression/cusz/include/component/spcodec.inl create mode 100644 qtensor/compression/cusz/include/compressor.hh create mode 100644 qtensor/compression/cusz/include/context.hh create mode 100644 qtensor/compression/cusz/include/cusz.h create mode 100644 qtensor/compression/cusz/include/cusz/custom.h create mode 100644 qtensor/compression/cusz/include/cusz/it.hh create mode 100644 qtensor/compression/cusz/include/cusz/nd.h create mode 100644 qtensor/compression/cusz/include/cusz/pn.hh create mode 100644 qtensor/compression/cusz/include/cusz/record.h create mode 100644 qtensor/compression/cusz/include/cusz/type.h create mode 100644 qtensor/compression/cusz/include/framework.hh create mode 100644 qtensor/compression/cusz/include/header.h create mode 100644 qtensor/compression/cusz/include/hf/hf.hh create mode 100644 qtensor/compression/cusz/include/hf/hf_bookg.hh create mode 100644 qtensor/compression/cusz/include/hf/hf_codecg.hh create mode 100644 qtensor/compression/cusz/include/hf/hf_struct.h create mode 100644 qtensor/compression/cusz/include/kernel/claunch_cuda.h create mode 100644 qtensor/compression/cusz/include/kernel/cpplaunch_cuda.hh create mode 100644 qtensor/compression/cusz/include/kernel/dryrun.cuh create mode 100644 qtensor/compression/cusz/include/kernel/launch_prototype.cuh create mode 100644 qtensor/compression/cusz/include/kernel/launch_spm.cuh create mode 100644 qtensor/compression/cusz/include/kernel/lorenzo_all.h create mode 100644 qtensor/compression/cusz/include/kernel/lorenzo_all.hh create mode 100644 qtensor/compression/cusz/include/kernel/spv_gpu.h create mode 100644 qtensor/compression/cusz/include/kernel/spv_gpu.hh create mode 100644 qtensor/compression/cusz/include/kernel/v2_lorenzo.hh create mode 100644 qtensor/compression/cusz/include/pipeline/compaction_g.inl create mode 100644 qtensor/compression/cusz/include/pipeline/v2_compressor.hh create mode 100644 qtensor/compression/cusz/include/stat/compare.h create mode 100644 qtensor/compression/cusz/include/stat/compare_cpu.hh create mode 100644 qtensor/compression/cusz/include/stat/compare_gpu.hh create mode 100644 qtensor/compression/cusz/include/stat/stat.h create mode 100644 qtensor/compression/cusz/include/stat/stat.hh create mode 100644 qtensor/compression/cusz/include/stat/stat_g.hh create mode 100644 qtensor/compression/cusz/include/utils.hh create mode 100644 qtensor/compression/cusz/include/utils/cuda_err.cuh create mode 100644 qtensor/compression/cusz/include/utils/cuda_mem.cuh create mode 100644 qtensor/compression/cusz/include/utils/cusparse_err.cuh create mode 100644 qtensor/compression/cusz/include/utils/format.hh create mode 100644 qtensor/compression/cusz/include/utils/io.hh create mode 100644 qtensor/compression/cusz/include/utils/print_gpu.h create mode 100644 qtensor/compression/cusz/include/utils/print_gpu.hh create mode 100644 qtensor/compression/cusz/include/utils/strhelper.hh create mode 100644 qtensor/compression/cusz/include/utils/timer.h create mode 100644 qtensor/compression/cusz/include/utils/timer.hh create mode 100644 qtensor/compression/cusz/src/cli/cli.cu create mode 100644 qtensor/compression/cusz/src/cli/cli.cuh create mode 100644 
qtensor/compression/cusz/src/cli/dryrun_part.cu create mode 100644 qtensor/compression/cusz/src/cli/dryrun_part.cuh create mode 100644 qtensor/compression/cusz/src/cli_bin.cu create mode 100644 qtensor/compression/cusz/src/compressor.cc create mode 100644 qtensor/compression/cusz/src/context.cc create mode 100644 qtensor/compression/cusz/src/cusz/custom.cc create mode 100644 qtensor/compression/cusz/src/cusz_lib.cc create mode 100644 qtensor/compression/cusz/src/cusz_version.h.in create mode 100644 qtensor/compression/cusz/src/cusz_wrapper.cu create mode 100644 qtensor/compression/cusz/src/cusz_wrapper.py create mode 100644 qtensor/compression/cusz/src/detail/compare_cpu.inl create mode 100644 qtensor/compression/cusz/src/detail/compare_gpu.inl create mode 100644 qtensor/compression/cusz/src/detail/compressor_impl.cu create mode 100644 qtensor/compression/cusz/src/detail/compressor_impl.inl create mode 100644 qtensor/compression/cusz/src/detail/spmat.cu create mode 100644 qtensor/compression/cusz/src/detail/spv_gpu.inl create mode 100644 qtensor/compression/cusz/src/detail/spvec.cu create mode 100644 qtensor/compression/cusz/src/experimental/Makefile create mode 100644 qtensor/compression/cusz/src/experimental/dpcpp_demo_lorenzo.cu create mode 100644 qtensor/compression/cusz/src/hf/detail/hf_bookg.inl create mode 100644 qtensor/compression/cusz/src/hf/detail/hf_codecg.inl create mode 100644 qtensor/compression/cusz/src/hf/detail/hf_pimpl.inl create mode 100644 qtensor/compression/cusz/src/hf/detail/par_merge.inl create mode 100644 qtensor/compression/cusz/src/hf/hf.cc create mode 100644 qtensor/compression/cusz/src/hf/hf_bookg.cu create mode 100644 qtensor/compression/cusz/src/hf/hf_codecg.cu create mode 100644 qtensor/compression/cusz/src/hf/hf_pimpl.cu create mode 100644 qtensor/compression/cusz/src/kernel/claunch_cuda.cu create mode 100644 qtensor/compression/cusz/src/kernel/detail/hist.inl create mode 100644 qtensor/compression/cusz/src/kernel/detail/lorenzo.inl create mode 100644 qtensor/compression/cusz/src/kernel/detail/lorenzo23.inl create mode 100644 qtensor/compression/cusz/src/kernel/detail/lorenzo_proto.inl create mode 100644 qtensor/compression/cusz/src/kernel/detail/lorenzo_serial.inl create mode 100644 qtensor/compression/cusz/src/kernel/detail/lorenzo_var.inl create mode 100644 qtensor/compression/cusz/src/kernel/detail/spline3.inl create mode 100644 qtensor/compression/cusz/src/kernel/detail/subroutine.inl create mode 100644 qtensor/compression/cusz/src/kernel/detail/subsub.inl create mode 100644 qtensor/compression/cusz/src/kernel/lorenzo.cu create mode 100644 qtensor/compression/cusz/src/kernel/lorenzo_proto.cu create mode 100644 qtensor/compression/cusz/src/kernel/lorenzo_serial.cc create mode 100644 qtensor/compression/cusz/src/kernel/lorenzo_var.cu create mode 100644 qtensor/compression/cusz/src/kernel/preprocess.cuh create mode 100644 qtensor/compression/cusz/src/kernel/rle.cuh create mode 100644 qtensor/compression/cusz/src/kernel/spv_gpu.cu create mode 100644 qtensor/compression/cusz/src/kernel/v2_lorenzo.cu create mode 100644 qtensor/compression/cusz/src/pipeline/v2_compressor.cc create mode 100644 qtensor/compression/cusz/src/pipeline/v2_compressor_impl.cu create mode 100644 qtensor/compression/cusz/src/pipeline/v2_compressor_impl.inl create mode 100644 qtensor/compression/cusz/src/stat/cmpg1_1.cu create mode 100644 qtensor/compression/cusz/src/stat/cmpg1_2.cu create mode 100644 qtensor/compression/cusz/src/stat/cmpg1_3.cu create mode 100644 
qtensor/compression/cusz/src/stat/cmpg1_4.cu create mode 100644 qtensor/compression/cusz/src/stat/cmpg1_5.cu create mode 100644 qtensor/compression/cusz/src/stat/cmpg2.cu create mode 100644 qtensor/compression/cusz/src/stat/cmpg3.cu create mode 100644 qtensor/compression/cusz/src/stat/cmpg4_1.cu create mode 100644 qtensor/compression/cusz/src/stat/cmpg4_2.cu create mode 100644 qtensor/compression/cusz/src/stat/cmpg4_3.cu create mode 100644 qtensor/compression/cusz/src/stat/cmpg4_4.cu create mode 100644 qtensor/compression/cusz/src/stat/compare_cpu.cc create mode 100644 qtensor/compression/cusz/src/stat/stat.cc create mode 100644 qtensor/compression/cusz/src/stat/stat_g.cu create mode 100644 qtensor/compression/cusz/src/utils/dbg_print.cuh create mode 100644 qtensor/compression/cusz/src/utils/print_gpu.cu create mode 100644 qtensor/compression/cusz/src/utils/timer_cpu.cc create mode 100644 qtensor/compression/cusz/src/utils/timer_gpu.cu create mode 100644 qtensor/compression/cusz/src/utils/vis_stat.hh diff --git a/qtensor/compression/Compressor.py b/qtensor/compression/Compressor.py index bec75b0b..6163902a 100644 --- a/qtensor/compression/Compressor.py +++ b/qtensor/compression/Compressor.py @@ -7,11 +7,15 @@ sys.path.append('./szx/src') sys.path.append(str(Path(__file__).parent/'szp/src/')) sys.path.append('./szp/src') -sys.path.append('/home/mkshah5/QTensor/qtensor/compression/szp/src') -sys.path.append('/home/mkshah5/QTensor/qtensor/compression/szx/src') +sys.path.append(str(Path(__file__).parent/'cusz/src')) +sys.path.append('./cusz/src') +#sys.path.append('/home/mkshah5/QTensor/qtensor/compression/szp/src') +#sys.path.append('/home/mkshah5/QTensor/qtensor/compression/szx/src') +#sys.path.append('/home/mkshah5/QTensor/qtensor/compression/cusz/src') try: from cuszx_wrapper import cuszx_host_compress, cuszx_host_decompress, cuszx_device_compress, cuszx_device_decompress from cuSZp_wrapper import cuszp_device_compress, cuszp_device_decompress + from cusz_wrapper import cusz_device_compress, cusz_device_decompress except: print("import failed") # Silently fail on missing build of cuszx @@ -187,7 +191,7 @@ def cuszx_compress(self, isCuPy, data, num_elements, r2r_error, r2r_threshold): else: #cmp_bytes, outSize_ptr = cuszp_device_compress(data, r2r_error, num_elements, r2r_threshold) - cmp_bytes, outSize_ptr = cuszx_device_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) + cmp_bytes, outSize_ptr = cusz_device_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) return cmp_bytes, outSize_ptr ### Decompression API with cuSZx ### @@ -205,6 +209,6 @@ def cuszx_decompress(self, isCuPy, cmp_bytes, cmpsize, num_elements, owner, dtyp decompressed_data = cuszx_host_decompress(num_elements, cmp_bytes) else: #decompressed_data = cuszp_device_decompress(num_elements, cmp_bytes, cmpsize, owner,dtype) - - decompressed_data = cuszx_device_decompress(num_elements, cmp_bytes, owner,dtype) +# oriData, absErrBound, nbEle, blockSize,threshold + decompressed_data = cusz_device_decompress(num_elements, cmp_bytes, owner,dtype) return decompressed_data diff --git a/qtensor/compression/cusz/include/cli/analyzer.hh b/qtensor/compression/cusz/include/cli/analyzer.hh new file mode 100644 index 00000000..7ff4b37d --- /dev/null +++ b/qtensor/compression/cusz/include/cli/analyzer.hh @@ -0,0 +1,278 @@ +/** + * @file analyzer.hh + * @author Jiannan Tian + * @brief + * @version 0.2 + * @date 2021-03-26 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + 
*/ + +#ifndef ANALYSIS_ANALYZER_HH +#define ANALYSIS_ANALYZER_HH + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include "../hf/hf_bookg.hh" +#include "../hf/hf_codecg.hh" +#include "../kernel/cpplaunch_cuda.hh" +#include "../utils/timer.hh" + +using std::cout; + +#if __cplusplus >= 201703L +#define CONSTEXPR constexpr +#else +#define CONSTEXPR +#endif + +enum class ExecutionPolicy { host, cuda_device }; +enum class AnalyzerMethod { thrust, cuda_native, stl }; + +class Analyzer { + typedef struct ExtremaResult { + double max_val, min_val, rng; + double seconds; + } extrema_result_t; + + typedef struct Compressibility { + size_t len; + struct { + double entropy; + unsigned int top1_freq; + double top1_prob; + double dropout_equiv_bitlen_2x() const { return 64 * (1 - top1_prob); } + double dropout_equiv_bitlen_1_5x() const { return 48 * (1 - top1_prob); } + } hist; + struct { + double r_lowerbound; + double avgb_lowerbound; + double r_upperbound; + double avgb_upperbound; + } huffman_theory; + struct { + double min_bitlen; + double avgb; + } huffman_stat; + } theory_t; + + theory_t theory; + + public: + Analyzer() = default; + ~Analyzer() = default; + + // TODO execution policy + template + static std::vector percentile100(T* in, size_t len) + { + std::vector res; + auto step = int(ceil(len / 100)); + + if CONSTEXPR (policy == ExecutionPolicy::cuda_device) { + // caveat: no residence check + thrust::sort(thrust::device, in, in + len); + T* htmp; + cudaMallocHost(&htmp, sizeof(T) * len); + cudaMemcpy(htmp, in, sizeof(T) * len, cudaMemcpyDeviceToHost); + for (auto i = 0; i < len; i += step) { // + res.push_back(htmp[i]); + } + res.push_back(htmp[len - 1]); + cudaFreeHost(htmp); + } + else { // fallback + std::sort(in, in + len); + for (auto i = 0; i < len; i += step) { // + res.push_back(in[i]); + } + res.push_back(in[len - 1]); + } + + return res; + } + + template + static extrema_result_t get_maxmin_rng(Data* d_data, size_t len) + { + if CONSTEXPR (policy == ExecutionPolicy::cuda_device and method == AnalyzerMethod::thrust) { + auto t0 = hires::now(); + // ------------------------------------------------------------ + thrust::device_ptr g_ptr = thrust::device_pointer_cast(d_data); + + auto max_el_loc = thrust::max_element(g_ptr, g_ptr + len); // excluding padded + auto min_el_loc = thrust::min_element(g_ptr, g_ptr + len); // excluding padded + + double max_val = *max_el_loc; + double min_val = *min_el_loc; + double rng = max_val - min_val; + // ------------------------------------------------------------ + auto t1 = hires::now(); + + return extrema_result_t{max_val, min_val, rng, static_cast(t1 - t0).count()}; + } + else { + throw std::runtime_error("Analyzer::get_maxmin_rng() Other policy and method not implemented."); + } + } + + template + static void get_histogram(UInt* data, size_t data_len, unsigned int* freq, size_t num_bins) + { + // TODO static check UInt + if CONSTEXPR (policy == ExecutionPolicy::cuda_device and method == AnalyzerMethod::cuda_native) { + float dummy; + launch_histogram(data, data_len, freq, num_bins, dummy); + } + else { + // TODO static check + throw std::runtime_error("Analyzer::get_histogram() using other policy or method not implemented."); + } + } + + Analyzer& estimate_compressibility_from_histogram(unsigned int* h_freq, size_t dict_size) + { + auto len = std::accumulate(h_freq, h_freq + dict_size, 0u); // excluding outlier + auto top1_freq = *std::max_element(h_freq, h_freq + dict_size); + double 
top1_prob = (1.0 * top1_freq) / (1.0 * len); + double entropy = 0.0; + for (auto i = 0; i < dict_size; i++) { + double p = h_freq[i] / (1.0 * len); + if (p != 0) entropy += -std::log2(p) * p; + } + double r_lowerbound = 1 - (-std::log2(top1_prob) * top1_prob - std::log2(1 - top1_prob) * (1 - top1_prob)); + double r_upperbound = top1_prob + 0.086; // [Gallager 78] + double avgb_lowerbound = entropy + r_lowerbound; + double avgb_upperbound = entropy + r_upperbound; + + // dropout + // auto equiv_bitlen_dropout_2x = 64 * (1 - top1_prob); + // auto equiv_bitlen_dropout_1_5x = 48 * (1 - top1_prob); + + // record + theory.len = len; + theory.hist.entropy = entropy; + theory.hist.top1_freq = top1_freq; + theory.hist.top1_prob = top1_prob; + theory.huffman_theory.r_lowerbound = r_lowerbound; + theory.huffman_theory.r_upperbound = r_upperbound; + theory.huffman_theory.avgb_lowerbound = avgb_lowerbound; + theory.huffman_theory.avgb_upperbound = avgb_upperbound; + + return *this; + }; + + template + Analyzer& + get_stat_from_huffman_book(const unsigned int* h_freq, const Huff* h_codebook, size_t len, size_t num_bins) + { + // real-bitlen, for reference only, not part of workflow + std::vector v_canon_cb(h_codebook, h_codebook + num_bins); + std::vector v_freq(h_freq, h_freq + num_bins); + + // TODO somewhere explicitly state that null codeword is of length 0xff + std::sort(v_canon_cb.begin(), v_canon_cb.end(), [](Huff& a, Huff& b) { + auto a_bits = reinterpret_cast*>(&a)->bits; + auto b_bits = reinterpret_cast*>(&b)->bits; + return a_bits < b_bits; + }); + std::sort(v_freq.begin(), v_freq.end(), std::greater()); + + double real_avgb = 0.0; + for (auto i = 0; i < num_bins; i++) { + if (v_freq[i] != 0) { + auto bits = reinterpret_cast*>(&v_canon_cb[i])->bits; + real_avgb += v_freq[i] * bits; + } + } + real_avgb /= len; + + theory.huffman_stat.avgb = real_avgb; + theory.huffman_stat.min_bitlen = + reinterpret_cast*>(&v_canon_cb.at(0))->bits; + + return *this; + } + + Analyzer& + print_compressibility(bool print_huffman_stat = false, bool print_dropout = false, double equiv_origin_bitlen = 32) + { + cout << "\n\e[31m"; // extra linebreak on start + + cout << "* Derived from histogram:" << '\n'; + cout << " - len (freq sum):\t" << theory.len << '\n'; + cout << " - entropy H(X):\t" << theory.hist.entropy << '\n'; + cout << " - most likely freq:\t" << theory.hist.top1_freq << '\n'; + cout << " - most likely prob (p1):\t" << theory.hist.top1_prob << '\n'; + cout << '\n'; + + if (theory.hist.top1_prob < 0.4) { + cout << "* The probability of the most likely symbol < 0.4, go recoding (Huffman)." << '\n'; + cout << "* Compressibility lower bound is for reference only." << '\n'; + cout << " - est. redundancy upper bound (arbitrary p1):\t" << theory.huffman_theory.r_upperbound << '\n'; + cout << " - est. avg.bitlen upper bound (arbitrary p1):\t" << theory.huffman_theory.avgb_upperbound + << '\n'; + cout << " - est. CR lower bound (arbitrary p1):\t" + << equiv_origin_bitlen / theory.huffman_theory.avgb_upperbound << '\n'; + cout << '\n'; + } + else { + cout << "* Compressibility upper bound is determined by the lower bound of average bitlength." << '\n'; + cout << " - est. redundancy lower bound (p1 > 0.4):\t" << theory.huffman_theory.r_lowerbound << '\n'; + cout << " - est. avg.bitlen lower bound (p1 > 0.4):\t" << theory.huffman_theory.avgb_lowerbound << '\n'; + cout << " - est. 
CR upper bound (arbitrary p1):\t" + << equiv_origin_bitlen / theory.huffman_theory.avgb_lowerbound << '\n'; + cout << '\n'; + + cout << "* Compressibility lower bound is for reference only." << '\n'; + cout << " - est. redundancy upper bound (arbitrary p1):\t" << theory.huffman_theory.r_upperbound << '\n'; + cout << " - est. avg.bitlen upper bound (arbitrary p1):\t" << theory.huffman_theory.avgb_upperbound + << '\n'; + cout << " - est. CR lower bound (arbitrary p1):\t" + << equiv_origin_bitlen / theory.huffman_theory.avgb_upperbound << '\n'; + cout << '\n'; + + if (print_dropout) { + auto dropout_equiv_bitlen_2x = theory.hist.dropout_equiv_bitlen_2x(); + auto dropout_equiv_bitlen_1_5x = theory.hist.dropout_equiv_bitlen_1_5x(); + // TODO determine path, print log + cout << "* Considering dropout:" << '\n'; + cout << " - dropout at 1.0x metadata overhead" << '\n'; + cout << " | equiv.bitlen:\t" << dropout_equiv_bitlen_2x << '\n'; + cout << " | reduction rate:\t" << (equiv_origin_bitlen / dropout_equiv_bitlen_2x) << '\n'; + cout << " | bitlen_dropout <= bitlen_enc?\t" + << (dropout_equiv_bitlen_2x <= theory.huffman_theory.avgb_lowerbound) << '\n'; + cout << " - dropout at 0.5x metadata overhead" << '\n'; + cout << " | equiv.bitlen:\t" << dropout_equiv_bitlen_1_5x << '\n'; + cout << " | reduction rate (fp32):\t" << (equiv_origin_bitlen / dropout_equiv_bitlen_1_5x) << '\n'; + cout << " | bitlen_dropout <= bitlen_enc?\t" + << (dropout_equiv_bitlen_1_5x <= theory.huffman_theory.avgb_lowerbound) << '\n'; + cout << '\n'; + } + } + + if (print_huffman_stat) { + cout << "* From Huffman codebook:" << '\n'; + cout << " - avg. bitlen:\t" << theory.huffman_stat.avgb << '\n'; + cout << " - shortest bitlen:\t" << theory.huffman_stat.min_bitlen << '\n'; + cout << '\n'; + } + cout << "\e[0m"; + + return *this; + } +}; + +#endif diff --git a/qtensor/compression/cusz/include/cli/document.hh b/qtensor/compression/cusz/include/cli/document.hh new file mode 100644 index 00000000..240de036 --- /dev/null +++ b/qtensor/compression/cusz/include/cli/document.hh @@ -0,0 +1,272 @@ +/** + * @file document.hh + * @author Jiannan Tian + * @brief + * @version 0.1.1 + * @date 2020-09-22 + * + * @copyright (C) 2020 by Washington State University, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef ARGUMENT_PARSER_DOCUMENT_HH +#define ARGUMENT_PARSER_DOCUMENT_HH + +#include +#include + + +const std::string fmt_b("\e[1m"); +const std::string fmt_0("\e[0m"); + +const std::regex bful("@(.*?)@"); +const std::string bful_text("\e[1m\e[4m$1\e[0m"); +const std::regex bf("\\*(.*?)\\*"); +const std::string bf_text("\e[1m$1\e[0m"); +const std::regex ul(R"(_((\w|-|\d|\.)+?)_)"); +const std::string ul_text("\e[4m$1\e[0m"); +const std::regex red(R"(\^\^(.*?)\^\^)"); +const std::string red_text("\e[31m$1\e[0m"); + +std::string // +Format(const std::string& s) +{ + auto a = std::regex_replace(s, bful, bful_text); + auto b = std::regex_replace(a, bf, bf_text); + auto c = std::regex_replace(b, ul, ul_text); + auto d = std::regex_replace(c, red, red_text); + return d; +} + +static const char cusz_short_doc[] = + // "cusz, version [placeholder]\n" + "\n" + "usage: cusz [-zxrh] [-i file] [-t dtype] [-m mode] [-e eb] [-l x,y,z] " + "...\n" + "\n" + " z : zip/compress\n" + " x : unzip/decompress\n" + " r : dryrun\n" + " h : print full-length help document\n" + "\n" + " i file : path to input datum\n" + " t dtype : f32 or fp4 (to be updated)\n" + " m mode : compression mode; abs, r2r\n" + " e eb : error bound; 
default 1e-4\n" + " l size : \"-l x\" for 1D; \"-l [X]x[Y]\" for 2D; \"-l [X]x[Y]x[Z]\" for 3D\n" + // " p pred : select predictor from \"lorenzo\" and \"spline3d\"\n" + "\n" + " config list:\n" + " syntax: opt=v, \"kw1=val1,kw1=val2[,...]\"\n" + " + eb error bound\n" + " + radius The number of quant-codes is 2x radius.\n" + " + demo load predefined lengths for demo datasets\n" + " - skipping \"-l x[,y[,z]]\"\n" + " - (1D) hacc hacc1b (2D) cesm exafel\n" + " - (3D) hurricane nyx-s nyx-m qmc qmcpre rtm parihaka\n" + " + anchor (on|off)\n" + // " + pipeline auto, binary, radius\n" + " example: \"--config demo=cesm,radius=512\"\n" + " report list: \n" + " syntax: opt[=v], \"kw1[=(on|off)],kw2[=(on|off)]\n" + " keyworkds: time, quality\n" + " example: \"--report time\", \"--report time=off\"\n" + "\n" + "example:\n" + " CESM=./data/cesm-CLDHGH-3600x1800\n" + " cusz -t f32 -m r2r -e 1e-4 -i ${CESM} -l 3600x1800 -z --report time\n" + " cusz -i ${CESM}.cusza -x --report time --compare ${CESM}\n" + "\n" + "\"cusz -h\" for details.\n"; + +static const char cusz_full_doc[] = + "*NAME*\n" + " cuSZ: CUDA-Based Error-Bounded Lossy Compressor for Scientific Data\n" + " Lowercased \"*cusz*\" is the command." + "\n" + "*SYNOPSIS*\n" + " The basic use is listed below,\n" + " *cusz* *-t* f32 *-m* r2r *-e* 1.0e-4.0 *-i* ./data/cesm-CLDHGH-3600x1800 *-l* 3600,1800 *-z* *--report* " + "time\n" + // cusz -t f32 -m r2r -e 1.0e-4.0 -i ./data/cesm-CLDHGH-3600x1800 -l 3600x1800 -z --report time\n + " ^^------ ------ ----------- ------------------------------- ------------ | ^^\n" + " ^^ dtype mode error bound input file low-to-high zip ^^\n" + "\n" + " *cusz* *-i* ./data/cesm-CLDHGH-3600x1800.cusza *-x* *--compare* ./data/cesm-CLDHGH-3600x1800 *--report* " + "time\n" + // cusz -i ./data/cesm-CLDHGH-3600x1800.cusza -x --compare ./data/cesm-CLDHGH-3600x1800 --report + // time\n" + " ^^------------------------------------- | ^^\n" + " ^^ compressed file unzip ^^\n" + "\n" + " *cusz* *-t* f32|64 *-m* [eb mode] *-e* [eb] *-i* [datum file] *-l* [x[,y[,z]]] *-z*\n" + " *cusz* *-i* [basename].cusza *-x*\n" + "\n" + "*OPTIONS*\n" + " *Mandatory* (zip and dryrun)\n" + " *-z* or *--compress* or *--*@z@*ip*\n" + " *-r* or *--dry-*@r@*un*\n" + " No lossless Huffman codec. Only to get data quality summary.\n" + " In addition, quant. rep. and dict. size are retained\n" + "\n" + " *-m* or *--*@m@*ode* \n" + " Specify error-controlling mode. Supported modes include:\n" + " _abs_: absolute mode, eb = input eb\n" + " _r2r_: relative-to-value-range mode, eb = input eb x value range\n" + "\n" + " *-e* or *--eb* or *--error-bound* [num]\n" + " Specify error bound. 
e.g., _1.23_, _1e-4_, _1.23e-4.56_\n" + "\n" + " *-i* or *--*@i@*nput* [file]\n" + "\n" + " *-d* or *--dict-size* [256|512|1024|...]\n" + " Specify dictionary size/quantization bin number.\n" + " Should be a power-of-2.\n" + "\n" + " *-l* [x[,y[,z]]] Specify (1|2|3)D data size, with dimensions from low to high.\n" + "\n" + " *Mandatory* (unzip)\n" + " *-x* or *--e*@x@*tract* or *--decompress* or *--unzip*\n" + "\n" + " *-i* or *--*@i@*nput* [corresponding datum basename (w/o extension)]\n" + "\n" + " *Additional*\n" + " *-p* or *--*@p@*redictor*\n" + " Select predictor from \"lorenzo\" (default) or \"spline3d\" (3D only).\n" + " *--origin* or *--compare* /path/to/origin-datum\n" + " For verification & get data quality evaluation.\n" + " *--opath* /path/to\n" + " Specify alternative output path.\n" + "\n" + " *Modules*\n" + " *--skip* _module-1_,_module-2_,...,_module-n_,\n" + " Disable functionality modules. Supported module(s) include:\n" + " _huffman_ Huffman codec after prediction+quantization (p+q) and before reversed p+q.\n" + " _write2disk_ Skip write decompression data.\n" + // "\n" + // " *-p* or *--pre* _method-1_,_method-2_,...,_method-n_\n" + // " Enable preprocessing. Supported preprocessing method(s) include:\n" + // " _binning_ Downsampling datum by 2x2 to 1.\n" + "\n" + " *Print Report to stdout*\n" + " *--report* (option=on/off)-list\n" + " Syntax: opt[=v], \"kw1[=(on|off)],kw2=[=(on|off)]\n" + " Keyworkds: time quality compressibility\n" + " Example: \"--report time\", \"--report time=off\"\n" + "\n" + " *Demonstration*\n" + " *-h* or *--help*\n" + " Get help documentation.\n" + "\n" + // " *-V* or *--verbose*\n" + // " Print host and device information for diagnostics.\n" + // "\n" + // " *-M* or *--meta*\n" + // " Get archive metadata. (TODO)\n" + "\n" + " *Advanced Runtime Configuration*\n" + " *--demo* [demo-dataset]\n" + " Use demo dataset, will omit given dimension(s). Supported datasets include:\n" + " 1D: _hacc_ _hacc1b_ 2D: _cesm_ _exafel_\n" + " 3D: _hurricane_ _nyx-s_ _nyx-m_ _qmc_ _qmcpre_ _rtm_ _parihaka_\n" + "\n" + " *-c* or *--config* (option=value)-list\n" + " Syntax: opt=v, \"kw1=val1,kw1=val2[,...]\"\n" + " + *eb*= error bound\n" + " + *cap*= capacity, number of quant-codes\n" + " + *demo*= skip length input (\"-l x[,y[,z]]\"), alternative to \"--demo dataset\"\n" + "\n" + " Other internal parameters:\n" + " + *quantbyte*=<1|2>\n" + " Specify quantization code representation.\n" + " Options _1_, _2_ are for *1-* and *2-*byte, respectively. (default: 2)\n" + " ^^Manually specifying this may not result in optimal memory footprint.^^\n" + " + *huffbyte*=<4|8>\n" + " Specify Huffman codeword representation.\n" + " Options _4_, _8_ are for *4-* and *8-*byte, respectively. 
(default: 4)\n" + " ^^Manually specifying this may not result in optimal memory footprint.^^\n" + " + *huffchunk*=[256|512|1024|...]\n" + " Manually specify chunk size for Huffman codec, overriding autotuning.\n" + " Should be a power-of-2 that is sufficiently large.\n" + " ^^This affects Huffman decoding performance significantly.^^\n" + "\n" + "*EXAMPLES*\n" + " *Demo Datasets*\n" + " Set a *shell variable*:\n" + " export PATH=$(pwd)/bin:$PATH\n" + " CESM=./data/cesm-CLDHGH-3600x1800\n" + " HURR=./data/hurr-CLOUDf48-500x500x100\n" + "\n" + " *CESM* example:\n" + " cusz -t f32 -m r2r -e 1e-4 -i ${CESM} -l 3600x1800 -z --report time\n" + " cusz -t f32 -m r2r -e 1e-4 -i ${CESM} -l 3600x1800 -r\n" + " cusz -i ${CESM}.cusza -x --report time --compare ${CESM} --skip write2disk\n" + "\n" + " *CESM* example with specified output path:\n" + " mkdir data2 data3\n" + " ^^# zip, output to `data2`^^\n" + " cusz -t f32 -m r2r -e 1e-4 -i ${CESM} -l 3600x1800 -z --opath data2\n" + " ^^# unzip, in situ^^\n" + " cusz -i ${CESM}.cusza -x && ls data2\n" + " ^^# unzip, output to `data3`^^\n" + " cusz -i ${CESM}.cusza -x --opath data3 && ls data3\n" + " ^^# unzip, output to `data3`, compare to the original datum^^\n" + " cusz -i ${CESM}.cusza -x --opath data3 --compare ${CESM} && ls data3\n" + "\n" + " *Hurricane Isabel* example:\n" + " cusz -t f32 -m r2r -e 1e-4 -i ${HURR} -l 500x500x100 -z\n" + " cusz -t f32 -m r2r -e 1e-4 -i ${HURR} -l 500x500x100 -r\n" + " cusz -i ${HURR}.cusza -x\n" + "\n"; + +// TODO +// " *EXAFEL* example:\n" +// " cusz -t f32 -m r2r -e 1e-4 -i ./data/exafel-59200x388 --demo exafeldemo -z -x --pre binning\n" +// " cusz -t f32 -m r2r -e 1e-4 -i ./data/exafel-59200x388 --demo exafeldemo -z -x --pre binning " +// "--skip huffman\n" +// " cusz -i ./data/exafel-59200x388.BN.cusza -x\n"; + +static const char huff_re_short_doc[] = + "\n" + "OVERVIEW: Huffman submodule as standalone program\n" // TODO from this line on + "\n" + "USAGE:\n" + " The basic use with demo datum is listed below,\n" + " ./huff --encode --decode --verify --input ./baryon_density.dat.b16 \\\n" + " -3 512 512 512 --input-rep 16 --huffman-rep 32 --huffman-chunk 2048 --dict-size 1024\n" + " or shorter\n" + " ./huff -e -d -V -i ./baryon_density.dat.b16 -3 512 512 512 -R 16 -H 32 -C 2048 -c 1024\n" + " ^ ^ ^ --------------------------- -------------- ----- ----- ------- -------\n" + " | | | input datum file dimension input Huff. Huff. codebook\n" + " enc dec verify rep. rep. 
chunk size\n" + "\n" + "EXAMPLES\n" + " Essential:\n" + " ./bin/huff -e -d -i ./baryon_density.dat.b16 -3 512 512 512 -R 16 -c 1024\n" + " have to input dimension, and higher dimension for a multiplication of each dim.,\n" + " as default values input-rep=16 (bits), huff-rep=32 (bits), codebook-size=1024 (symbols)\n" + "\n"; + +static const char doc_dim_order[] = + "\n" + " Input dimension follows low-to-high (e.g., x-y-z) order.\n" + " Taking 2D CESM-ATM as an example, \n" + "\n" + " |<------------------------- x 3600 --------------------------->| \n" + " +--------------------------------------------------------------+ - \n" + " | | ^ \n" + " | | | \n" + " | CESM-ATM: 1800x3600 (y-x order) | | \n" + " | datum name: _1800_3600 | y \n" + " | | 1800 \n" + " | input: -l 3600,1800 | | \n" + " | input order: -l [x,y] | | \n" + " | | | \n" + " | | v \n" + " +--------------------------------------------------------------+ - \n" + "\n" + " Taking 3D Hurricane as another example, whose dimensions are\n" + " 100x500x500, the input is \"-l 500,500,100\".\n"; + +#endif diff --git a/qtensor/compression/cusz/include/cli/quality_viewer.hh b/qtensor/compression/cusz/include/cli/quality_viewer.hh new file mode 100644 index 00000000..0a5e9eed --- /dev/null +++ b/qtensor/compression/cusz/include/cli/quality_viewer.hh @@ -0,0 +1,163 @@ +/** + * @file quality_viewer.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-09 + * @deprecated 0.3.2 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef QUALITY_VIEWER_HH +#define QUALITY_VIEWER_HH + +// 22-11-20 would fail in cxxapi.cu if deleted +#include + +#include "../common/capsule.hh" +#include "../common/definition.hh" +#include "../header.h" +#include "../stat/compare_gpu.hh" +#include "verify.hh" + +namespace cusz { + +const static auto HOST = cusz::LOC::HOST; +const static auto DEVICE = cusz::LOC::DEVICE; +const static auto HOST_DEVICE = cusz::LOC::HOST_DEVICE; + +struct QualityViewer { + template + static void print_metrics_cross(cusz_stats* s, size_t compressed_bytes = 0, bool gpu_checker = false) + { + auto checker = (not gpu_checker) ? string("(using CPU checker)") : string("(using GPU checker)"); + auto bytes = (s->len * sizeof(Data) * 1.0); + + auto println = [](const char* s, double n1, double n2, double n3, double n4) { + printf(" %-10s %16.8g %16.8g %16.8g %16.8g\n", s, n1, n2, n3, n4); + }; + auto printhead = [](const char* s1, const char* s2, const char* s3, const char* s4, const char* s5) { + printf(" \e[1m\e[31m%-10s %16s %16s %16s %16s\e[0m\n", s1, s2, s3, s4, s5); + }; + + auto is_fp = std::is_same::value or std::is_same::value ? 
const_cast("yes") + : const_cast("no"); + printf("\nquality metrics %s:\n", checker.c_str()); + + printhead("", "data-len", "data-byte", "fp-type?", ""); + printf(" %-10s %16zu %16lu %16s\n", "", s->len, sizeof(Data), is_fp); + + printhead("", "min", "max", "rng", "std"); + println("origin", s->odata.min, s->odata.max, s->odata.rng, s->odata.std); + println("eb-lossy", s->xdata.min, s->xdata.max, s->xdata.rng, s->xdata.std); + + printhead("", "abs-val", "abs-idx", "pw-rel", "VS-RNG"); + println("max-error", s->max_err.abs, s->max_err.idx, s->max_err.pwrrel, s->max_err.rel); + + printhead("", "CR", "NRMSE", "cross-cor", "PSNR"); + println("metrics", bytes / compressed_bytes, s->reduced.NRMSE, s->reduced.coeff, s->reduced.PSNR); + + // printf("\n"); + }; + + static void print_metrics_auto(double* lag1_cor, double* lag2_cor) + { + auto printhead = [](const char* s1, const char* s2, const char* s3, const char* s4, const char* s5) { + printf(" \e[1m\e[31m%-10s %16s %16s %16s %16s\e[0m\n", s1, s2, s3, s4, s5); + }; + + printhead("", "lag1-cor", "lag2-cor", "", ""); + printf(" %-10s %16lf %16lf\n", "auto", *lag1_cor, *lag2_cor); + printf("\n"); + }; + + template + static void echo_metric_gpu(T* reconstructed, T* origin, size_t len, size_t compressed_bytes = 0) + { + // cross + auto stat_x = new cusz_stats; + psz::thrustgpu_assess_quality(stat_x, reconstructed, origin, len); + print_metrics_cross(stat_x, compressed_bytes, true); + + auto stat_auto_lag1 = new cusz_stats; + psz::thrustgpu_assess_quality(stat_auto_lag1, origin, origin + 1, len - 1); + auto stat_auto_lag2 = new cusz_stats; + psz::thrustgpu_assess_quality(stat_auto_lag2, origin, origin + 2, len - 2); + + print_metrics_auto(&stat_auto_lag1->reduced.coeff, &stat_auto_lag2->reduced.coeff); + } + + template + static void echo_metric_cpu(T* _d1, T* _d2, size_t len, size_t compressed_bytes = 0, bool from_device = true) + { + auto stat = new cusz_stats; + T* reconstructed; + T* origin; + if (not from_device) { + reconstructed = _d1; + origin = _d2; + } + else { + printf("allocating tmp space for CPU verification\n"); + auto bytes = sizeof(T) * len; + cudaMallocHost(&reconstructed, bytes); + cudaMallocHost(&origin, bytes); + cudaMemcpy(reconstructed, _d1, bytes, cudaMemcpyDeviceToHost); + cudaMemcpy(origin, _d2, bytes, cudaMemcpyDeviceToHost); + } + cusz::verify_data(stat, reconstructed, origin, len); + print_metrics_cross(stat, compressed_bytes, false); + + auto stat_auto_lag1 = new cusz_stats; + verify_data(stat_auto_lag1, origin, origin + 1, len - 1); + auto stat_auto_lag2 = new cusz_stats; + verify_data(stat_auto_lag2, origin, origin + 2, len - 2); + + print_metrics_auto(&stat_auto_lag1->reduced.coeff, &stat_auto_lag2->reduced.coeff); + + if (from_device) { + if (reconstructed) cudaFreeHost(reconstructed); + if (origin) cudaFreeHost(origin); + } + } + + template + static void load_origin(string const& fname, Capsule& origin) + { + origin.mallochost().malloc().fromfile(fname); + } + + template + static void view(header_t header, Capsule& xdata, Capsule& cmp, string const& compare) + { + auto len = ConfigHelper::get_uncompressed_len(header); + auto compressd_bytes = ConfigHelper::get_filesize(header); + + auto compare_on_gpu = [&]() { + cmp.mallochost().malloc().fromfile(compare).host2device(); + echo_metric_gpu(xdata.dptr(), cmp.dptr(), len, compressd_bytes); + cmp.freehost().free(); + }; + + auto compare_on_cpu = [&]() { + cmp.mallochost().fromfile(compare); + xdata.device2host(); + echo_metric_cpu(xdata.hptr(), cmp.hptr(), len, 
compressd_bytes); + cmp.freehost(); + }; + + if (compare != "") { + auto gb = 1.0 * sizeof(T) * len / 1e9; + if (gb < 0.8) + compare_on_gpu(); + else + compare_on_cpu(); + } + } +}; + +} // namespace cusz + +#endif diff --git a/qtensor/compression/cusz/include/cli/query.hh b/qtensor/compression/cusz/include/cli/query.hh new file mode 100644 index 00000000..91fcf65d --- /dev/null +++ b/qtensor/compression/cusz/include/cli/query.hh @@ -0,0 +1,71 @@ +/** + * @file query.hh + * @author Jiannan Tian + * @brief query machine information + * @version 0.1.3 + * @date 2020-10-05 + * + * @copyright (C) 2020 by Washington State University, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef QUERY_HH +#define QUERY_HH + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "query_dev.hh" + +struct Diagnostics { + static std::string ExecShellCommand(const char* cmd) + { + std::array buffer; + std::string result; + std::unique_ptr pipe(popen(cmd, "r"), pclose); + if (!pipe) { throw std::runtime_error("popen() failed!"); } + while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) { result += buffer.data(); } + return result; + } + + static void GetMachineProperties() + { + std::vector v; + std::cout << "host information: " << std::endl; + + auto cpuinfo = ExecShellCommand( // + std::string("cat /proc/cpuinfo " + "| grep \"model name\" " + "| head -n 1 " + "| awk -F': ' '{print $NF}'") + .c_str()); + std::cout << " cpu model\t" << cpuinfo; + + auto meminfo = ExecShellCommand( // + std::string("cat /proc/meminfo" + "| grep \"MemTotal\" " + "| awk -F' ' '{print $2\" \"$3}'") + .c_str()); + + std::cout << " memory size\t" << meminfo; + + auto endianness = ExecShellCommand( // + std::string("lscpu " + "| grep Endian " + "| awk -F' ' '{print $NF}'") + .c_str()); + + std::cout << " byte order\t" << endianness; + printf("\n"); + } +}; + +#endif diff --git a/qtensor/compression/cusz/include/cli/query_dev.hh b/qtensor/compression/cusz/include/cli/query_dev.hh new file mode 100644 index 00000000..c2eb37aa --- /dev/null +++ b/qtensor/compression/cusz/include/cli/query_dev.hh @@ -0,0 +1,69 @@ +/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* This sample queries the properties of the CUDA devices present in the system + * via CUDA Runtime API. */ + +/** + * @brief Get the Device Property object + * modified from `cuda-samples/Samples/deviceQuery/deviceQuery.cpp` + */ + +struct GpuDiagnostics { + static void GetDeviceProperty() + { + int num_dev = 0; + cudaError_t error_id = cudaGetDeviceCount(&num_dev); + + if (error_id != cudaSuccess) { + printf("cudaGetDeviceCount returned %d\n-> %s\n", static_cast(error_id), cudaGetErrorString(error_id)); + exit(EXIT_FAILURE); + } + if (num_dev == 0) { printf("NO CUDA device detected.\n"); } + int dev, driver_ver = 0, runtime_ver = 0; + + for (dev = 0; dev < num_dev; ++dev) { + cudaSetDevice(dev); + cudaDeviceProp dev_prop; + cudaGetDeviceProperties(&dev_prop, dev); + printf("device #%d, %s: \n", dev, dev_prop.name); + + cudaDriverGetVersion(&driver_ver); + cudaRuntimeGetVersion(&runtime_ver); + printf( + " driver/runtime\t%d.%d/%d.%d\n", driver_ver / 1000, (driver_ver % 100) / 10, runtime_ver / 1000, + (runtime_ver % 100) / 10); + printf(" compute capability:\t%d.%d\n", dev_prop.major, dev_prop.minor); + printf(" global memory:\t%.0f MiB\n", static_cast(dev_prop.totalGlobalMem / 1048576.0f)); + printf(" constant memory:\t%zu bytes\n", dev_prop.totalConstMem); + printf(" shared mem per block:\t%zu bytes\n", dev_prop.sharedMemPerBlock); + printf(" shared mem per SM:\t%zu bytes\n", dev_prop.sharedMemPerMultiprocessor); + printf(" registers per block:\t%d\n", dev_prop.regsPerBlock); + } + printf("\n"); + } +}; \ No newline at end of file diff --git a/qtensor/compression/cusz/include/cli/timerecord_viewer.hh b/qtensor/compression/cusz/include/cli/timerecord_viewer.hh new file mode 100644 index 00000000..9e245073 --- /dev/null +++ b/qtensor/compression/cusz/include/cli/timerecord_viewer.hh @@ -0,0 +1,109 @@ +/** + * @file timerecord_viewer.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-09 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CLI_TIMERECORD_VIEWER_HH +#define CLI_TIMERECORD_VIEWER_HH + +#include +#include "../common/definition.hh" + +namespace cusz { + +struct TimeRecordViewer { + static float get_throughput(float milliseconds, size_t bytes) + { + auto GiB = 1.0 * 1024 * 1024 * 1024; + auto seconds = milliseconds * 1e-3; + return bytes / GiB / seconds; + } + + static void println_throughput(const char* s, float timer, size_t bytes) + { + if (timer == 0.0) return; + + auto t = get_throughput(timer, bytes); + printf(" %-12s %'12f %'10.2f\n", s, timer, t); + }; + + static void println_throughput_tablehead() + { + printf( + "\n \e[1m\e[31m%-12s %12s %10s\e[0m\n", // + const_cast("kernel"), // + const_cast("time, ms"), // + const_cast("GiB/s") // + ); + } + + static double get_total_time(timerecord_t r) + { + double total = 0.0; + std::for_each(r->begin(), r->end(), [&](TimeRecordTuple t) { return total += std::get<1>(t); }); + return total; + } + static void 
view_compression(timerecord_t r, size_t bytes, size_t compressed_bytes = 0) + { + auto report_cr = [&]() { + auto cr = 1.0 * bytes / compressed_bytes; + if (compressed_bytes != 0) printf(" %-*s %.2f\n", 20, "compression ratio", cr); + }; + + TimeRecord reflow; + + { // reflow + TimeRecordTuple book_tuple; + + auto total_time = get_total_time(r); + auto subtotal_time = total_time; + + for (auto& i : *r) { + auto item = std::string(std::get<0>(i)); + if (item == "book") { + book_tuple = i; + subtotal_time -= std::get<1>(i); + } + else { + reflow.push_back(i); + } + } + reflow.push_back({const_cast("(subtotal)"), subtotal_time}); + printf("\e[2m"); + reflow.push_back(book_tuple); + reflow.push_back({const_cast("(total)"), total_time}); + printf("\e[0m"); + } + + printf("\n(c) COMPRESSION REPORT\n"); + report_cr(); + + ReportHelper::println_throughput_tablehead(); + for (auto& i : reflow) ReportHelper::println_throughput(std::get<0>(i), std::get<1>(i), bytes); + + printf("\n"); + } + + static void view_decompression(timerecord_t r, size_t bytes) + { + printf("\n(d) deCOMPRESSION REPORT\n"); + + auto total_time = get_total_time(r); + (*r).push_back({const_cast("(total)"), total_time}); + + ReportHelper::println_throughput_tablehead(); + for (auto& i : *r) ReportHelper::println_throughput(std::get<0>(i), std::get<1>(i), bytes); + + printf("\n"); + } +}; + +} // namespace cusz + +#endif diff --git a/qtensor/compression/cusz/include/cli/verify.hh b/qtensor/compression/cusz/include/cli/verify.hh new file mode 100644 index 00000000..1e856021 --- /dev/null +++ b/qtensor/compression/cusz/include/cli/verify.hh @@ -0,0 +1,87 @@ +#ifndef ANALYSIS_VERIFY_HH +#define ANALYSIS_VERIFY_HH + +/** + * @file verify.hh + * @author Jiannan Tian + * @brief Verification of decompressed data. + * @version 0.2 + * @date 2020-09-20 + * Created on: 2019-09-30 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include "../common.hh" +#include "../cusz/type.h" + +using namespace std; + +namespace cusz { + +template +void verify_data(cusz_stats* s, T* xdata, T* odata, size_t len) +{ + double max_odata = odata[0], min_odata = odata[0]; + double max_xdata = xdata[0], min_xdata = xdata[0]; + double max_abserr = max_abserr = fabs(xdata[0] - odata[0]); + + double sum_0 = 0, sum_x = 0; + for (size_t i = 0; i < len; i++) sum_0 += odata[i], sum_x += xdata[i]; + + double mean_odata = sum_0 / len, mean_xdata = sum_x / len; + double sum_var_odata = 0, sum_var_xdata = 0, sum_err2 = 0, sum_corr = 0, rel_abserr = 0; + + double max_pwrrel_abserr = 0; + size_t max_abserr_index = 0; + for (size_t i = 0; i < len; i++) { + max_odata = max_odata < odata[i] ? odata[i] : max_odata; + min_odata = min_odata > odata[i] ? odata[i] : min_odata; + + max_xdata = max_xdata < odata[i] ? odata[i] : max_xdata; + min_xdata = min_xdata > xdata[i] ? xdata[i] : min_xdata; + + float abserr = fabs(xdata[i] - odata[i]); + if (odata[i] != 0) { + rel_abserr = abserr / fabs(odata[i]); + max_pwrrel_abserr = max_pwrrel_abserr < rel_abserr ? rel_abserr : max_pwrrel_abserr; + } + max_abserr_index = max_abserr < abserr ? i : max_abserr_index; + max_abserr = max_abserr < abserr ? 
abserr : max_abserr; + sum_corr += (odata[i] - mean_odata) * (xdata[i] - mean_xdata); + sum_var_odata += (odata[i] - mean_odata) * (odata[i] - mean_odata); + sum_var_xdata += (xdata[i] - mean_xdata) * (xdata[i] - mean_xdata); + sum_err2 += abserr * abserr; + } + double std_odata = sqrt(sum_var_odata / len); + double std_xdata = sqrt(sum_var_xdata / len); + double ee = sum_corr / len; + + s->len = len; + + s->odata.max = max_odata; + s->odata.min = min_odata; + s->odata.rng = max_odata - min_odata; + s->odata.std = std_odata; + + s->xdata.max = max_xdata; + s->xdata.min = min_xdata; + s->xdata.rng = max_xdata - min_xdata; + s->xdata.std = std_xdata; + + s->max_err.idx = max_abserr_index; + s->max_err.abs = max_abserr; + s->max_err.rel = max_abserr / s->odata.rng; + s->max_err.pwrrel = max_pwrrel_abserr; + + s->reduced.coeff = ee / std_odata / std_xdata; + s->reduced.MSE = sum_err2 / len; + s->reduced.NRMSE = sqrt(s->reduced.MSE) / s->odata.rng; + s->reduced.PSNR = 20 * log10(s->odata.rng) - 10 * log10(s->reduced.MSE); +} + +} // namespace cusz + +#endif diff --git a/qtensor/compression/cusz/include/common.hh b/qtensor/compression/cusz/include/common.hh new file mode 100644 index 00000000..5d2bf33e --- /dev/null +++ b/qtensor/compression/cusz/include/common.hh @@ -0,0 +1,19 @@ +/** + * @file common.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2021-09-26 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_COMMON_HH +#define CUSZ_COMMON_HH + +#include "common/configs.hh" +#include "common/definition.hh" +#include "common/type_traits.hh" + +#endif \ No newline at end of file diff --git a/qtensor/compression/cusz/include/common/capsule.hh b/qtensor/compression/cusz/include/common/capsule.hh new file mode 100644 index 00000000..05d8ebf6 --- /dev/null +++ b/qtensor/compression/cusz/include/common/capsule.hh @@ -0,0 +1,402 @@ +/** + * @file capsule.hh + * @author Jiannan Tian + * @brief Simple data analysis (header) + * @version 0.2.3 + * @date 2020-11-03 + * (create) 2020-11-03 (rev1) 2021-03-24 (rev2) 2021-09-08 + * @deprecated 0.3.2 + * + * @copyright (C) 2020 by Washington State University, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef CAPSULE_HH +#define CAPSULE_HH + +#if __cplusplus >= 201703L +#define CONSTEXPR constexpr +#else +#define CONSTEXPR +#endif + +#include +#include + +#include +#include +#include +#include +#include + +#include "../stat/compare_gpu.hh" +// #include "../utils/io.hh" +#include "../utils/timer.hh" +#include "definition.hh" + +template +class Capsule { + private: + // variables + struct { + bool hptr{false}, dptr{false}, uniptr{false}; + } alloc_status; + + T *_dptr{nullptr}, *_hptr{nullptr}, *_uniptr{nullptr}; + + uint32_t _len{0}; + dim3 _len3{1, 1, 1}, _stride3{1, 1, 1}; + + std::string name; + + // logging setup; standalone + const std::string LOG_NULL = " "; + const std::string LOG_INFO = " :: "; + const std::string LOG_ERR = " ERR "; + const std::string LOG_WARN = "WARN "; + const std::string LOG_DBG = " dbg "; + const std::string LOG_EXCEPTION = " !! "; + + // https://stackoverflow.com/a/26080768/8740097 CC BY-SA 3.0 + template + void build_string(std::ostream& o, S t) + { + o << t << " "; + } + + template + void build_string(std::ostream& o, S t, Args... args) // recursive variadic function + { + build_string(o, t); + build_string(o, args...); + } + + template + void LOGGING(const std::string& log_head, Args... 
args) + { + std::ostringstream oss; + oss << log_head; + build_string(oss, args...); + + oss.seekp(0, std::ios::end); + std::stringstream::pos_type offset = oss.tellp(); + if (log_head == LOG_DBG) { std::cout << "\e[2m"; } // dbg + std::cout << oss.str() << std::endl; // print content + if (log_head == LOG_DBG) std::cout << "\e[0m"; // finish printing dbg + } + + // IO + int fs2mem(const char* fname, void* array, size_t num_els) + { + auto bytes = sizeof(T) * num_els; + + std::ifstream ifs(fname, std::ios::binary | std::ios::in); + if (not ifs.is_open()) { + std::cerr << "fail to open " << fname << std::endl; + return -1; + } + ifs.read(reinterpret_cast(array), std::streamsize(bytes)); + ifs.close(); + + return 0; + } + + int mem2fs(const char* fname, void* array, size_t num_els) + { + auto bytes = sizeof(type) * num_els; + + std::ofstream ofs(fname, std::ios::binary | std::ios::out); + if (not ofs.is_open()) { + std::cerr << "fail to open " << fname << std::endl; + return -1; + } + + ofs.write(reinterpret_cast(array), std::streamsize(bytes)); + ofs.close(); + + return 0; + } + + std::string ERRSTR_BUILDER(std::string func, std::string msg) + { + return "[Capsule(\"" + name + "\")::" + func + "] " + msg; + } + + void check_len(std::string funcname) + { + if (_len == 0) throw std::runtime_error("[Capsule(\"" + name + "\")::" + funcname + "] " + "len == 0"); + } + + std::string ERROR_UNDEFINED_BEHAVIOR(std::string func, std::string msg = "undefined behavior") + { // + return ERRSTR_BUILDER(func, "undefined behavior"); + } + + public: + using type = T; + + // TODO rule of n + // constructor + Capsule() = default; + Capsule(const std::string _str) : name(_str){}; + Capsule(uint32_t len, const std::string _str = std::string("")) : _len(len), name(_str) {} + Capsule(uint32_t x, uint32_t y, uint32_t z, const std::string _str = std::string("")) : name(_str) + { + _len3 = dim3(x, y, z); + _len = x * y * z; + } + + ~Capsule() + { + // Becasue _hptr can be obtained externally, and could be non-pinned, cudaFreeHost may not work properly. 
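+        // Illustrative lifecycle, pieced together from the call sites in QualityViewer::view()
+        // (a sketch only; the variable names here are hypothetical):
+        //     Capsule<float> cmp("cmp");
+        //     cmp.set_len(len).mallochost().malloc();   // pinned host buffer + device buffer
+        //     cmp.fromfile(path).host2device();         // read into _hptr, then copy to _dptr
+        //     cmp.freehost().free();                    // callers release memory explicitly,
+        // precisely because this destructor leaves _hptr alone (see the disabled call below).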
+ // if (alloc_status.hptr) cudaFreeHost(_hptr); + + if (alloc_status.dptr) cudaFree(_dptr); + if (alloc_status.uniptr) cudaFree(_uniptr); + } + + // getter start -------------------- + T*& dptr() { return _dptr; } + T*& hptr() { return _hptr; } + T*& uniptr() { return _uniptr; } + + uint32_t len() const { return _len; } + dim3 len3() const { return _len3; } + dim3 stride3() const { return _stride3; } + // 1D + T& dptr(uint32_t i) { return _dptr[i]; } + T& hptr(uint32_t i) { return _hptr[i]; } + T& uniptr(uint32_t i) { return _uniptr[i]; } + // 2D + T& dptr(uint32_t x, uint32_t y) { return _dptr[x + y * _stride3.y]; } + T& hptr(uint32_t x, uint32_t y) { return _hptr[x + y * _stride3.y]; } + T& uniptr(uint32_t x, uint32_t y) { return _uniptr[x + y * _stride3.y]; } + // 3D + T& dptr(uint32_t x, uint32_t y, uint32_t z) { return _dptr[x + y * _stride3.y + z * _stride3.z]; } + T& hptr(uint32_t x, uint32_t y, uint32_t z) { return _hptr[x + y * _stride3.y + z * _stride3.z]; } + T& uniptr(uint32_t x, uint32_t y, uint32_t z) { return _uniptr[x + y * _stride3.y + z * _stride3.z]; } + // getter end ----------------------- + + // setter start --------------------- + Capsule& set_hptr(T* ptr) + { + _hptr = ptr, alloc_status.hptr = true; + return *this; + } + Capsule& set_dptr(T* ptr) + { + _dptr = ptr, alloc_status.dptr = true; + return *this; + } + Capsule& set_uniptr(T* ptr) + { + _uniptr = ptr, alloc_status.uniptr = true; + return *this; + } + + // variable len + Capsule& set_len(uint32_t len) + { + if (len <= 0) throw std::runtime_error("length must be greater than 0"); + _len = len; + return *this; + } + + Capsule& set_len3(uint32_t x, uint32_t y = 1, uint32_t z = 1) + { + if (x == 1) throw std::runtime_error("x must be > 1."); + if (x * y * z == 0) throw std::runtime_error("x, y, z must be non-zero."); + + _len3 = dim3(x, y, z); + _stride3 = dim3(1, x, x * y); + _len = x * y * z; + + return *this; + } + // setter end ---------------------- + + // debug + void debug() + { + printf("Capsule debugging information\n"); + printf(" name : %s\n", name.c_str()); + printf(" len : %u\n", len()); + printf(" hptr : %s\n", alloc_status.hptr ? "set" : "not set"); + printf(" dptr : %s\n", alloc_status.dptr ? "set" : "not set"); + printf(" uniptr : %s\n", alloc_status.uniptr ? 
"set" : "not set"); + } + + // for debugging + Capsule& set_name(std::string _str) + { + name = _str; + return *this; + } + + // IO + Capsule& fromfile(std::string fname, double* time = nullptr) + { + if (not _hptr) throw std::runtime_error(ERRSTR_BUILDER("fromfile", "_hptr not set")); + if (_len == 0) throw std::runtime_error(ERRSTR_BUILDER("fromfile", "len == 0")); + + auto a = hires::now(); + fs2mem(fname.c_str(), _hptr, _len); + auto z = hires::now(); + + if (time) *time = static_cast(z - a).count(); + + return *this; + } + + Capsule& tofile(std::string fname, double* time = nullptr) + { + if (not _hptr) { throw std::runtime_error(ERRSTR_BUILDER("tofile", "_hptr not set")); } + if (_len == 0) throw std::runtime_error(ERRSTR_BUILDER("tofile", "len == 0")); + + auto a = hires::now(); + mem2fs(fname.c_str(), _hptr, _len); + auto z = hires::now(); + + if (time) *time = static_cast(z - a).count(); + + return *this; + } + + uint32_t nbyte() const { return _len * sizeof(T); } + + // memcpy h2d, synchronous + Capsule& host2device() + { + check_len("host2device"); + + cudaMemcpy(_dptr, _hptr, nbyte(), cudaMemcpyHostToDevice); + return *this; + } + // memcpy d2h, synchronous + Capsule& device2host() + { + check_len("device2host"); + + cudaMemcpy(_hptr, _dptr, nbyte(), cudaMemcpyDeviceToHost); + return *this; + } + // memcpy h2d, asynchronous + Capsule& host2device_async(cudaStream_t stream) + { + check_len("host2device_async"); + + cudaMemcpyAsync(_dptr, _hptr, nbyte(), cudaMemcpyHostToDevice, stream); + return *this; + } + // memcpy d2h, asynchronous + Capsule& device2host_async(cudaStream_t stream) + { + check_len("device2host_async"); + + cudaMemcpyAsync(_hptr, _dptr, nbyte(), cudaMemcpyDeviceToHost, stream); + return *this; + } + // shorthand + Capsule& h2d() { return host2device(); } + Capsule& d2h() { return device2host(); } + Capsule& async_h2d(cudaStream_t stream) { return host2device_async(stream); } + Capsule& async_d2h(cudaStream_t stream) { return device2host_async(stream); } + + // cudaMalloc wrapper + Capsule& malloc(bool do_memset = true, uint8_t memset_val = 0) + { + check_len("malloc"); + + if (alloc_status.dptr) + LOGGING(LOG_WARN, "already allocated on device"); + else { + cudaMalloc(&_dptr, nbyte()); + cudaMemset(_dptr, memset_val, nbyte()); + alloc_status.dptr = true; + } + return *this; + } + // cudaMallocHost wrapper, pinned + Capsule& mallochost(bool do_memset = true, uint8_t memset_val = 0) + { + check_len("mallochost"); + + if (alloc_status.hptr) + LOGGING(LOG_WARN, "already allocated on host"); + else { + cudaMallocHost(&_hptr, nbyte()); + memset(_hptr, memset_val, nbyte()); + alloc_status.hptr = true; + } + return *this; + } + // cudaMallocManaged wrapper + Capsule& mallocmanaged(bool do_memset = true, uint8_t memset_val = 0) + { + check_len("mallocmanaged"); + + if (alloc_status.uniptr) + LOGGING(LOG_WARN, "already allocated as unified"); + else { + cudaMallocManaged(&_uniptr, nbyte()); + cudaMemset(_uniptr, memset_val, nbyte()); + alloc_status.uniptr = true; + } + return *this; + } + // cudaFree wrapper + Capsule& free() + { + if (not _dptr) throw std::runtime_error(ERRSTR_BUILDER("free", "_dptr is null")); + cudaFree(_dptr); + alloc_status.dptr = false; + return *this; + } + // cudaFreeHost wrapper + Capsule& freehost() + { + if (not _hptr) throw std::runtime_error(ERRSTR_BUILDER("free", "_hptr is null")); + cudaFreeHost(_hptr); + alloc_status.hptr = false; + return *this; + } + // cudaFree wrapper, but for unified memory + Capsule& freemanaged() + { + if (not 
_uniptr) throw std::runtime_error(ERRSTR_BUILDER("free", "_uniptr is null")); + cudaFree(_uniptr); + alloc_status.uniptr = false; + return *this; + } + + private: + double maxval, minval, rng; + + public: + double get_maxval() { return maxval; } + double get_minval() { return minval; } + double get_rng() { return rng; } + + // data scan + Capsule& prescan(double& max_value, double& min_value, double& rng) + { + // may not work for _uniptr + T result[4]; + psz::thrustgpu_get_extrema_rawptr(_dptr, _len, result); + + min_value = result[0]; + max_value = result[1]; + rng = max_value - min_value; + + return *this; + } + // data scan + Capsule& prescan() + { + prescan(maxval, minval, rng); + return *this; + } +}; + +#endif diff --git a/qtensor/compression/cusz/include/common/configs.hh b/qtensor/compression/cusz/include/common/configs.hh new file mode 100644 index 00000000..7c1e0654 --- /dev/null +++ b/qtensor/compression/cusz/include/common/configs.hh @@ -0,0 +1,354 @@ +/** + * @file configs.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2021-09-26 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_COMMON_CONFIGS_HH +#define CUSZ_COMMON_CONFIGS_HH + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../header.h" +#include "definition.hh" + +#if __cplusplus >= 201703L +#define CONSTEXPR constexpr +#else +#define CONSTEXPR +#endif + +struct Reinterpret1DTo2D { + template + static T get_square_size(T len) + { + return static_cast(ceil(sqrt(len))); + } +}; + +struct Align { + template + static size_t get_aligned_datalen(size_t len) + { + if CONSTEXPR (ad == cusz::ALIGNDATA::NONE) return len; + if CONSTEXPR (ad == cusz::ALIGNDATA::SQUARE_MATRIX) { + auto m = Reinterpret1DTo2D::get_square_size(len); + return m * m; + } + } + + static const int DEFAULT_ALIGN_NBYTE = 128; + + template + static inline bool is_aligned_at(const void* ptr) + { // + return reinterpret_cast(ptr) % NUM == 0; + }; + + template + static size_t get_aligned_nbyte(size_t len) + { + return ((sizeof(T) * len - 1) / NUM + 1) * NUM; + } +}; + +// sparsity rate is less that 5% +struct SparseMethodSetup { + // "Density" denotes the degree of non-zeros (nz). 
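+    // An illustrative sizing (numbers are for orientation only, not from the cuSZ docs):
+    // with T = float, M = uint32_t, len = 1024*1024 and the default 25% density,
+    // get_csr_nbyte() below gives m = 1024 and nnz = 262144, hence
+    //     4*(1024+1) + 4*262144 + 4*262144 = 2,101,252 bytes (~2.0 MiB)
+    // against 4,194,304 bytes (~4.0 MiB) for the dense field, i.e. roughly a 2x reduction.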
+ static constexpr float default_density = 0.25; // ratio of nonzeros (R_nz) + static constexpr float default_sparsity = 1 - default_density; // ratio of zeros, 1 - R_nz + + static constexpr int default_density_factor = 4; // ratio of nonzeros (R_nz) + + template + static uint32_t get_csr_nbyte(uint32_t len, uint32_t nnz) + { + auto m = Reinterpret1DTo2D::get_square_size(len); + auto nbyte = sizeof(M) * (m + 1) + sizeof(M) * nnz + sizeof(T) * nnz; + return nbyte; + } +}; + +struct HuffmanHelper { + // deprecated + // template + // static uint32_t get_revbook_nbyte(int dict_size) + // { + // constexpr auto TYPE_BITCOUNT = sizeof(BOOK) * 8; + // return sizeof(BOOK) * (2 * TYPE_BITCOUNT) + sizeof(SYM) * dict_size; + // } + + static const int BLOCK_DIM_ENCODE = 256; + static const int BLOCK_DIM_DEFLATE = 256; + + static const int ENC_SEQUENTIALITY = 4; // empirical + static const int DEFLATE_CONSTANT = 4; // TODO -> deflate_chunk_constant +}; + +struct StringHelper { + static std::string nnz_percentage(uint32_t nnz, uint32_t data_len) + { + return "(" + std::to_string(nnz / 1.0 / data_len * 100) + "%)"; + } +}; + +struct ConfigHelper { + static uint32_t predictor_lookup(std::string name) + { + const std::unordered_map lut = { + {"lorenzo", 0}, {"lorenzoii", 1}, {"spline3", 2} // + }; + if (lut.find(name) != lut.end()) throw std::runtime_error("no such predictor as " + name); + return lut.at(name); + } + + static uint32_t codec_lookup(std::string name) + { + const std::unordered_map lut = { + {"huffman-coarse", 0} // + }; + if (lut.find(name) != lut.end()) throw std::runtime_error("no such codec as " + name); + return lut.at(name); + } + + static uint32_t spcodec_lookup(std::string name) + { + const std::unordered_map lut = { + {"spmat", 0}, {"spvec", 1} // + }; + if (lut.find(name) != lut.end()) throw std::runtime_error("no such codec as " + name); + return lut.at(name); + } + + static std::string get_default_predictor() { return "lorenzo"; } + static std::string get_default_spcodec() { return "csr11"; } + static std::string get_default_codec() { return "huffman-coarse"; } + static std::string get_default_cuszmode() { return "r2r"; } + static std::string get_default_dtype() { return "f32"; } + + static bool check_predictor(const std::string& val, bool fatal = false) + { + auto legal = (val == "lorenzo") or (val == "spline3"); + if (not legal) { + if (fatal) + throw std::runtime_error("`predictor` must be \"lorenzo\" or \"spline3\"."); + else + printf("fallback to the default \"%s\".", get_default_predictor().c_str()); + } + return legal; + } + + static bool check_codec(const std::string& val, bool fatal = false) + { + auto legal = (val == "huffman-coarse"); + if (not legal) { + if (fatal) + throw std::runtime_error("`codec` must be \"huffman-coarse\"."); + else + printf("fallback to the default \"%s\".", get_default_codec().c_str()); + } + return legal; + } + + static bool check_spcodec(const std::string& val, bool fatal = false) + { + auto legal = (val == "csr11") or (val == "rle"); + if (not legal) { + if (fatal) + throw std::runtime_error("`codec` must be \"csr11\" or \"rle\"."); + else + printf("fallback to the default \"%s\".", get_default_codec().c_str()); + } + return legal; + } + + static bool check_cuszmode(const std::string& val, bool fatal = false) + { + auto legal = (val == "r2r") or (val == "abs"); + if (not legal) { + if (fatal) + throw std::runtime_error("`mode` must be \"r2r\" or \"abs\"."); + else + printf("fallback to the default \"%s\".", get_default_cuszmode().c_str()); + } 
+ return legal; + } + + static bool check_dtype(const std::string& val, bool fatal = false) + { + auto legal = (val == "f32"); + // auto legal = (val == "f32") or (val == "f64"); + if (not legal) { + if (fatal) + throw std::runtime_error("`dtype` must be \"f32\"."); + else + printf("fallback to the default \"%s\".", get_default_dtype().c_str()); + } + return legal; + } + + static bool check_opt_in_list(std::string const& opt, std::vector vs) + { + for (auto& i : vs) { + if (opt == i) return true; + } + return false; + } + + static void parse_length_literal(const char* str, std::vector& dims) + { + std::stringstream data_len_ss(str); + auto data_len_literal = data_len_ss.str(); + char delimiter = 'x'; + + while (data_len_ss.good()) { + std::string substr; + std::getline(data_len_ss, substr, delimiter); + dims.push_back(substr); + } + } + + static size_t get_filesize(std::string fname) + { + std::ifstream in(fname.c_str(), std::ifstream::ate | std::ifstream::binary); + return in.tellg(); + } + + static size_t get_filesize(cusz_header* h) + { + auto END = sizeof(h->entry) / sizeof(h->entry[0]); + return h->entry[END - 1]; + } + + static size_t get_uncompressed_len(cusz_header* h) { return h->x * h->y * h->z; } + + template + static size_t get_npart(T1 size, T2 subsize) + { + static_assert( + std::numeric_limits::is_integer and std::numeric_limits::is_integer, + "[get_npart] must be plain interger types."); + + return (size + subsize - 1) / subsize; + } + + // #ifdef __CUDACC__ + static int get_ndim(dim3 len3) + { + auto ndim = 3; + if (len3.z == 1) ndim = 2; + if (len3.z == 1 and len3.y == 1) ndim = 1; + return ndim; + } + + static dim3 get_pardeg3(dim3 len3, dim3 sublen3) + { + return dim3( + get_npart(len3.x, sublen3.x), // + get_npart(len3.y, sublen3.y), // + get_npart(len3.z, sublen3.z)); + } + + template + static dim3 get_pardeg3(dim3 len3, T sublen3[3]) + { + return dim3( + get_npart(len3.x, sublen3[0]), // + get_npart(len3.y, sublen3[1]), // + get_npart(len3.z, sublen3[2])); + } + + template + static dim3 multiply_dim3(dim3 a, T b[3]) + { + return dim3(a.x * b[0], a.y * b[1], a.z * b[2]); + } + + static dim3 multiply_dim3(dim3 a, dim3 b) + { // + return dim3(a.x * b.x, a.y * b.y, a.z * b.z); + } + + static size_t get_serialized_len(dim3 a) { return a.x * a.y * a.z; } + + static dim3 get_leap(dim3 len3) { return dim3(1, len3.x, len3.x * len3.y); } + + // #endif + + template + static size_t get_serialized_len(T a[3]) + { // + return a[0] * a[1] * a[2]; + } +}; + +struct CompareHelper { + template + static bool eq(TRIO a, TRIO b) + { + return (a.x == b.x) and (a.y == b.y) and (a.z == b.z); + }; +}; + +struct ReportHelper { + static float get_throughput(float milliseconds, size_t nbyte) + { + auto GiB = 1.0 * 1024 * 1024 * 1024; + auto seconds = milliseconds * 1e-3; + return nbyte / GiB / seconds; + } + + static void println_throughput(const char* s, float timer, size_t _nbyte) + { + if (timer == 0.0) return; + auto t = get_throughput(timer, _nbyte); + printf(" %-12s %'12f %'10.2f\n", s, timer, t); + }; + + static void println_throughput_tablehead() + { + printf( + "\n \e[1m\e[31m%-12s %12s %10s\e[0m\n", // + const_cast("kernel"), // + const_cast("time, ms"), // + const_cast("GiB/s") // + ); + } + + static void print_datasegment_tablehead() + { + printf( + "\ndata segments:\n \e[1m\e[31m%-18s\t%12s\t%15s\t%15s\e[0m\n", // + const_cast("name"), // + const_cast("nbyte"), // + const_cast("start"), // + const_cast("end")); + } + + static std::string demangle(const char* name) + { + int status 
= -4; + char* res = abi::__cxa_demangle(name, nullptr, nullptr, &status); + + const char* const demangled_name = (status == 0) ? res : name; + std::string ret_val(demangled_name); + free(res); + return ret_val; + }; +}; + +#endif diff --git a/qtensor/compression/cusz/include/common/definition.hh b/qtensor/compression/cusz/include/common/definition.hh new file mode 100644 index 00000000..c7c328ef --- /dev/null +++ b/qtensor/compression/cusz/include/common/definition.hh @@ -0,0 +1,66 @@ +/** + * @file definition.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2021-09-20 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_COMMON_DEFINITION_HH +#define CUSZ_COMMON_DEFINITION_HH + +#include +#include +#include + +namespace cusz { + +enum class TASK { COMPRESS, DECOMPRESS, EXPERIMENT, COMPRESS_DRYRUN }; +enum class DEV { TEST, DEV, RELEASE }; +enum class LOC { HOST, DEVICE, HOST_DEVICE, UNIFIED, FS, NONE, __BUFFER }; +enum class WHEN { COMPRESS, DECOMPRESS, EXPERIMENT, COMPRESS_DRYRUN }; +enum class ALIGNDATA { NONE, SQUARE_MATRIX, POWEROF2, NEXT_EVEN }; +enum class ALIGNMEM { NONE, WARP32B, WARP64B, WARP128B }; + +// TODO when to use ADDR8? +// TODO change to `enum class` +enum class SEG { HEADER, BOOK, QUANT, REVBOOK, ANCHOR, SPFMT, HUFF_META, HUFF_DATA }; + +enum class execution { cuda, serial }; +enum class method { native, thrust }; + +struct OK { + template + static void ALLOC() + { + static_assert( + m == cusz::DEV::TEST or m == cusz::DEV::DEV, // + "muse be cusz::DEV::TEST or cusz::DEV::DEV; use with caution"); + } + + template + static void FREE() + { + static_assert( + m == cusz::DEV::TEST or m == cusz::DEV::DEV, // + "muse be cusz::DEV::TEST or cusz::DEV::DEV; use with caution"); + } +}; + +using ADDR4 = uint32_t; +using ADDR8 = size_t; + +using FREQ = uint32_t; + +using TimeRecordTuple = std::tuple; +using TimeRecord = std::vector; +using timerecord_t = TimeRecord*; + +using BYTE = uint8_t; + +}; // namespace cusz + +#endif diff --git a/qtensor/compression/cusz/include/common/type_traits.hh b/qtensor/compression/cusz/include/common/type_traits.hh new file mode 100644 index 00000000..a77c2738 --- /dev/null +++ b/qtensor/compression/cusz/include/common/type_traits.hh @@ -0,0 +1,108 @@ +/** + * @file type_traits.hh + * @author Jiannan Tian + * @brief + * @version 0.1.1 + * @date 2020-09-23 + * (create) 2020-09-23, (rev) 2021-09-17 + * + * @copyright (C) 2020 by Washington State University, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef TYPE_TRAITS_HH +#define TYPE_TRAITS_HH + +#include +#include + +#include "cusz/type.h" +#include "definition.hh" + +template +cusz_datatype cusz_typeof() +{ + if (std::is_same::value) + return FP32; + else if (std::is_same::value) + return FP64; + else + throw std::runtime_error("Type not supported."); +} + +// clang-format off + +/** + * @brief CUDA API does not accept uint64_t (understandable by literal), but instead, + * `unsigned long long`, which is ambiguous anyway. 
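[editor's note: illustration only, not cuSZ code. The stand-ins DemoCompat/DemoHuff below are hypothetical, but they exercise the same idea as cuszCOMPAT/HuffTrait: pick the codeword container by byte width, routing the 8-byte case through the CUDA-friendly `unsigned long long` alias.]

#include <cstdint>
#include <type_traits>

template <typename T> struct DemoCompat;                        // stand-in for cuszCOMPAT
template <> struct DemoCompat<uint32_t> { using type = uint32_t; };
template <> struct DemoCompat<uint64_t> { using type = unsigned long long; };

template <int BYTEWIDTH> struct DemoHuff;                       // stand-in for HuffTrait
template <> struct DemoHuff<4> { using type = DemoCompat<uint32_t>::type; };
template <> struct DemoHuff<8> { using type = DemoCompat<uint64_t>::type; };

static_assert(sizeof(DemoHuff<8>::type) == 8, "8-byte Huffman codeword container");
static_assert(std::is_unsigned<DemoHuff<8>::type>::value, "codewords stay unsigned");

int main() { return 0; }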
+ */ +template struct cuszCOMPAT; +template <> struct cuszCOMPAT { using type = uint32_t; }; +template <> struct cuszCOMPAT { using type = unsigned long long; }; + +template struct DataTrait; +template <> struct DataTrait<4, true> { typedef float type; }; +template <> struct DataTrait<8, true> { typedef double type; }; +template <> struct DataTrait<1, false> { typedef int8_t type; }; // future use +template <> struct DataTrait<2, false> { typedef int16_t type; }; // future use +template <> struct DataTrait<4, false> { typedef int32_t type; }; // future use +template <> struct DataTrait<8, false> { typedef int64_t type; }; // future use + +template struct ChunkingTrait; +template <> struct ChunkingTrait<1> { static const int BLOCK = 256; static const int SEQ = 8; }; +template <> struct ChunkingTrait<0x101> { static const int BLOCK = 128; }; +template <> struct ChunkingTrait<0x201> { static const int BLOCK = 64; }; +template <> struct ChunkingTrait<2> { static const int BLOCK = 16; static const int YSEQ = 8; }; +template <> struct ChunkingTrait<3> { static const int BLOCK = 8; static const int YSEQ = 8; }; + +// template struct QuantTrait; +// template <> struct QuantTrait<1> { typedef uint8_t type; }; +// template <> struct QuantTrait<2> { typedef uint16_t type; }; +// template <> struct QuantTrait<4> { typedef uint32_t type; }; + +template struct ErrCtrlTrait; +template <> struct ErrCtrlTrait<1, false> { typedef uint8_t type; }; +template <> struct ErrCtrlTrait<2, false> { typedef uint16_t type; }; +template <> struct ErrCtrlTrait<4, false> { typedef uint32_t type; }; +template <> struct ErrCtrlTrait<4, true> { typedef float type; }; +template <> struct ErrCtrlTrait<8, true> { typedef double type; }; + +template struct HuffTrait; +template <> struct HuffTrait<4> { typedef cuszCOMPAT::type type; }; +template <> struct HuffTrait<8> { typedef cuszCOMPAT::type type; }; + +template struct ReducerTrait; +template <> struct ReducerTrait<4> { typedef uint32_t type; }; +template <> struct ReducerTrait<8> { typedef uint64_t type; }; + +template struct MetadataTrait; +template <> struct MetadataTrait<4> { typedef uint32_t type; }; +template <> struct MetadataTrait<8> { typedef uint64_t type; }; // size_t is problematic; do not use + +template struct LargeInputTrait; +template <> struct LargeInputTrait { using type = MetadataTrait<4>::type; }; +template <> struct LargeInputTrait { using type = MetadataTrait<8>::type; }; + +template struct FastLowPrecisionTrait; +template <> struct FastLowPrecisionTrait { typedef float type; }; +template <> struct FastLowPrecisionTrait { typedef double type; }; + +// template struct cuszCUSPARSE; +// template <> struct cuszCUSPARSE { const static cudaDataType type = CUDA_R_32F; }; +// template <> struct cuszCUSPARSE { const static cudaDataType type = CUDA_R_64F; }; + +#ifdef __CUDACC__ +#include + +template struct CopyDirection; +template <> struct CopyDirection { static const cudaMemcpyKind direction = cudaMemcpyHostToHost; }; +template <> struct CopyDirection { static const cudaMemcpyKind direction = cudaMemcpyHostToDevice; }; +template <> struct CopyDirection { static const cudaMemcpyKind direction = cudaMemcpyDeviceToHost; }; +template <> struct CopyDirection { static const cudaMemcpyKind direction = cudaMemcpyDeviceToDevice; }; + +#endif + +// clang-format on + +#endif diff --git a/qtensor/compression/cusz/include/compaction.hh b/qtensor/compression/cusz/include/compaction.hh new file mode 100644 index 00000000..bd2a27eb --- /dev/null +++ 
b/qtensor/compression/cusz/include/compaction.hh @@ -0,0 +1,18 @@ +/** + * @file compaction.hh + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-01-23 + * + * (C) 2023 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef DAB40B13_9236_42A9_8047_49CD896671C9 +#define DAB40B13_9236_42A9_8047_49CD896671C9 + +template +struct CompactionDRAM; + +#endif /* DAB40B13_9236_42A9_8047_49CD896671C9 */ diff --git a/qtensor/compression/cusz/include/component.hh b/qtensor/compression/cusz/include/component.hh new file mode 100644 index 00000000..ec5c08a6 --- /dev/null +++ b/qtensor/compression/cusz/include/component.hh @@ -0,0 +1,19 @@ +/** + * @file componment.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2021-10-06 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_COMPONENT_HH +#define CUSZ_COMPONENT_HH + +#include "component/prediction.inl" +#include "component/spcodec.inl" +#include "hf/hf.hh" + +#endif \ No newline at end of file diff --git a/qtensor/compression/cusz/include/component/glue.cuh b/qtensor/compression/cusz/include/component/glue.cuh new file mode 100644 index 00000000..c4d69141 --- /dev/null +++ b/qtensor/compression/cusz/include/component/glue.cuh @@ -0,0 +1,120 @@ +/** + * @file glue.cuh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-03-01 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef WRAPPER_GLUE_CUH +#define WRAPPER_GLUE_CUH + +#include +#include +#include +#include "spcodec.hh" + +// when using nvcc, functors must be defined outside a (__host__) function +template +struct cleanup : public thrust::unary_function { + int radius; + cleanup(int radius) : radius(radius) {} + __host__ __device__ E operator()(const E e) const { return e; } +}; + +template +void split_by_radius( + E* in_errctrl, + size_t in_len, + int const radius, + IDX* out_idx, + E* out_val, + int& out_nnz, + cudaStream_t stream = nullptr, + Policy policy = thrust::device) +{ + using thrust::placeholders::_1; + + thrust::cuda::par.on(stream); + thrust::counting_iterator zero(0); + + // find out the indices + out_nnz = thrust::copy_if(policy, zero, zero + in_len, in_errctrl, out_idx, _1 >= 2 * radius or _1 <= 0) - out_idx; + + // fetch corresponding values + thrust::copy( + policy, thrust::make_permutation_iterator(in_errctrl, out_idx), + thrust::make_permutation_iterator(in_errctrl + out_nnz, out_idx + out_nnz), out_val); + + // clear up + cleanup functor(radius); + thrust::transform( + policy, // + thrust::make_permutation_iterator(in_errctrl, out_idx), // + thrust::make_permutation_iterator(in_errctrl + out_nnz, out_idx + out_nnz), // + thrust::make_permutation_iterator(in_errctrl, out_idx), // + functor); +} + +template +void split_by_binary_twopass( + E* in_errctrl, + size_t in_len, + int const radius, + IDX* out_idx, + E* out_val, + int& out_nnz, + cudaStream_t stream = nullptr, + Policy policy = thrust::device) +{ + using thrust::placeholders::_1; + + thrust::cuda::par.on(stream); + thrust::counting_iterator zero(0); + + // find out the indices + out_nnz = thrust::copy_if(policy, zero, zero + in_len, in_errctrl, out_idx, _1 != radius) - out_idx; + + // fetch corresponding values + thrust::copy( + policy, thrust::make_permutation_iterator(in_errctrl, out_idx), + thrust::make_permutation_iterator(in_errctrl + out_nnz, out_idx + out_nnz), out_val); +} + +// when using nvcc, functors must be defined outside a (__host__) function +template 
+struct is_outlier { + int radius; + is_outlier(int radius) : radius(radius) {} + __host__ __device__ bool operator()(const Tuple t) const { return thrust::get<1>(t) != radius; } +}; + +template +void split_by_binary_onepass( + E* in_errctrl, + size_t in_len, + int const radius, + IDX* out_idx, + E* out_val, + int& out_nnz, + cudaStream_t stream = nullptr, + Policy policy = thrust::device) +{ + thrust::cuda::par.on(stream); + using Tuple = thrust::tuple; + thrust::counting_iterator zero(0); + + auto in = thrust::make_zip_iterator(thrust::make_tuple(zero, in_errctrl)); + auto in_last = thrust::make_zip_iterator(thrust::make_tuple(zero + in_len, in_errctrl + in_len)); + auto out = thrust::make_zip_iterator(thrust::make_tuple(out_idx, out_val)); + + is_outlier functor(radius); + out_nnz = thrust::copy_if(policy, in, in_last, out, functor) - out; +} + +enum class GlueMethod { SPLIT_BY_RADIUS, SPLIT_01_ONEPASS, SPLIT_01_TWOPASS }; + +#endif diff --git a/qtensor/compression/cusz/include/component/pred_boilerplate_deprecated.hh b/qtensor/compression/cusz/include/component/pred_boilerplate_deprecated.hh new file mode 100644 index 00000000..bb7a0584 --- /dev/null +++ b/qtensor/compression/cusz/include/component/pred_boilerplate_deprecated.hh @@ -0,0 +1,210 @@ +/** + * @file predictor_boilerplate.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2021-09-15 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_INCLUDE_PREDICTOR_HH +#define CUSZ_INCLUDE_PREDICTOR_HH + +#include +#include +#include + +#include "../common/configs.hh" +#include "../cusz/type.h" + +namespace cusz { + +class PredictorBoilerplate { + protected: + struct DerivedLengths { + struct Interpretion3D { + dim3 len3, leap; + size_t serialized; + + void set_leap() { leap = ConfigHelper::get_leap(len3); } + void set_serialized() { serialized = ConfigHelper::get_serialized_len(len3); } + }; + + struct Interpretion3D base, anchor, aligned; + + dim3 nblock; + int ndim; + + struct { + size_t data, quant, outlier, anchor; + } assigned; + + dim3 get_len3() const { return base.len3; } + dim3 get_leap() const { return base.leap; } + }; + + template + void __derive_len(dim3 base, DERIVED& derived) + { + int sublen[3] = {1, 1, 1}; + int anchor_step[3] = {1, 1, 1}; + __derive_len(base, derived, sublen, anchor_step, false); + } + + template + void + __derive_len(dim3 base, DERIVED& derived, int const sublen3[3], int const anchor_step3[3], bool use_anchor = false) + { + derived.base.len3 = base; + derived.base.set_leap(); + derived.base.set_serialized(); + derived.ndim = ConfigHelper::get_ndim(base); + + if (not use_anchor) { + derived.assigned.data = derived.base.serialized; + derived.assigned.quant = derived.base.serialized; + derived.assigned.outlier = derived.base.serialized; + derived.assigned.anchor = 0; + } + else { + derived.nblock = ConfigHelper::get_pardeg3(base, sublen3); + + derived.aligned.len3 = ConfigHelper::multiply_dim3(derived.nblock, sublen3); + derived.aligned.set_leap(); + derived.aligned.set_serialized(); + + derived.anchor.len3 = ConfigHelper::get_pardeg3(base, anchor_step3); + derived.anchor.set_leap(); + derived.anchor.set_serialized(); + + derived.assigned.data = derived.base.serialized; + derived.assigned.quant = derived.aligned.serialized; + derived.assigned.outlier = std::max(derived.base.serialized, derived.aligned.serialized); // TODO + derived.assigned.anchor = derived.anchor.serialized; + } + } + + template + void __debug_list_derived(DERIVED const& 
derived, bool use_anchor = false) + { + auto base = derived.base; + auto aligned = derived.aligned; + auto anchor = derived.anchor; + auto nblock = derived.nblock; + + printf("%-*s: (%u, %u, %u)\n", 16, "sizeof.{T,E,FP}", (int)sizeof(T), (int)sizeof(E), (int)sizeof(FP)); + printf("%-*s: (%u, %u, %u)\n", 16, "base.len3", base.len3.x, base.len3.y, base.len3.z); + printf("%-*s: (%u, %u, %u)\n", 16, "base.leap", base.leap.x, base.leap.y, base.leap.z); + printf("%-*s: %'zu\n", 16, "base.serial", base.serialized); + + if (use_anchor) { + printf("%-*s: (%u, %u, %u)\n", 16, "nblock", nblock.x, nblock.y, nblock.z); + + printf("%-*s: (%u, %u, %u)\n", 16, "aligned.len3", aligned.len3.x, aligned.len3.y, aligned.len3.z); + printf("%-*s: (%u, %u, %u)\n", 16, "aligned.leap", aligned.leap.x, aligned.leap.y, aligned.leap.z); + printf("%-*s: %'zu\n", 16, "aligned.serial", aligned.serialized); + + printf("%-*s: (%u, %u, %u)\n", 16, "anchor.len3", anchor.len3.x, anchor.len3.y, anchor.len3.z); + printf("%-*s: (%u, %u, %u)\n", 16, "anchor.leap", anchor.leap.x, anchor.leap.y, anchor.leap.z); + printf("%-*s: %'zu\n", 16, "anchor.serial", anchor.serialized); + } + + printf("%-*s: %'zu\n", 16, "len.data", derived.assigned.data); + printf("%-*s: %'zu\n", 16, "len.quant", derived.assigned.quant); + printf("%-*s: %'zu\n", 16, "len.outlier", derived.assigned.outlier); + printf("%-*s: %'zu\n", 16, "len.anchor", derived.assigned.anchor); + } + + void check_rtlen() + { + auto rtlen3 = rtlen.get_len3(); + auto alloclen3 = alloclen.get_len3(); + + if (rtlen3.x > alloclen3.x or rtlen3.y > alloclen3.y or rtlen3.z > alloclen3.z or + rtlen.base.serialized > alloclen.base.serialized) + throw std::runtime_error("Predictor: the runtime lengths cannot be greater than the allocation lengths."); + } + + template + void debug_list_alloclen(bool use_anchor = false) + { + printf("\ndebugging, listing allocation lengths:\n"); + __debug_list_derived(alloclen, use_anchor); + } + + template + void debug_list_rtlen(bool use_anchor = false) + { + printf("\ndebugging, listing runtime lengths:\n"); + __debug_list_derived(rtlen, use_anchor); + } + + protected: + struct DerivedLengths alloclen, rtlen; + + float time_elapsed; + + // ----------------------------------------------------------------------------- + // accessor + // ----------------------------------------------------------------------------- + public: + // helper + size_t get_alloclen_data() const { return alloclen.assigned.data; } + size_t get_alloclen_anchor() const { return alloclen.assigned.anchor; } + size_t get_alloclen_quant() const { return alloclen.assigned.quant; } + size_t get_alloclen_outlier() const { return alloclen.assigned.outlier; } + + dim3 get_len3() const { return rtlen.base.len3; } + dim3 get_leap3() const { return rtlen.base.leap; } + size_t get_len_data() const { return rtlen.assigned.data; } + size_t get_len_anchor() const { return rtlen.assigned.anchor; } + size_t get_len_quant() const { return rtlen.assigned.quant; } + size_t get_len_outlier() const { return rtlen.assigned.outlier; } + + float get_time_elapsed() const { return time_elapsed; } + + size_t get_x() const { return this->rtlen.get_len3().x; } + size_t get_y() const { return this->rtlen.get_len3().y; } + size_t get_z() const { return this->rtlen.get_len3().z; } + + dim3 get_leap() const { return this->rtlen.get_leap(); } + int get_ndim() const { return this->rtlen.ndim; } + + void derive_alloclen(cusz_predictortype predictor, dim3 base) + { + if (predictor == LorenzoI) { + // normal + 
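[editor's note: a host-only sketch, illustration rather than the class itself, of the leap / serialized-length / block-count arithmetic that ConfigHelper supplies to __derive_len, with one 2-D size worked out by hand.]

#include <cassert>
#include <cstddef>

struct Len3 { size_t x, y, z; };

size_t npart(size_t size, size_t subsize) { return (size + subsize - 1) / subsize; }  // ceiling division
Len3   leap_of(Len3 l) { return {1, l.x, l.x * l.y}; }                                // row-major strides
size_t serialized_of(Len3 l) { return l.x * l.y * l.z; }

int main()
{
    Len3 data{3600, 1800, 1};    // a 2-D field
    Len3 block{16, 16, 1};
    assert(serialized_of(data) == 3600u * 1800u);
    assert(leap_of(data).z == 3600u * 1800u);
    assert(npart(data.x, block.x) == 225 && npart(data.y, block.y) == 113);  // 1800/16 rounds up
    return 0;
}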
this->__derive_len(base, this->alloclen); + } + + else if (predictor == Spline3) { + // maximum possible + int sublen[3] = {32, 8, 8}; + int anchor_step[3] = {8, 8, 8}; + this->__derive_len(base, this->alloclen, sublen, anchor_step, true); + } + } + + void derive_rtlen(cusz_predictortype predictor, dim3 base) + { + if (predictor == LorenzoI) { + // normal + this->__derive_len(base, this->rtlen); + } + else if (predictor == Spline3) { + // maximum possible + int sublen[3] = {32, 8, 8}; + int anchor_step[3] = {8, 8, 8}; + this->__derive_len(base, this->rtlen, sublen, anchor_step, true); + } + } + + // "real" methods + virtual ~PredictorBoilerplate() = default; +}; + +} // namespace cusz + +#endif diff --git a/qtensor/compression/cusz/include/component/prediction.inl b/qtensor/compression/cusz/include/component/prediction.inl new file mode 100644 index 00000000..941f2592 --- /dev/null +++ b/qtensor/compression/cusz/include/component/prediction.inl @@ -0,0 +1,193 @@ +/** + * @file prediction.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-23 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef FB315D3E_6B96_4F5D_9975_F35702205BC1 +#define FB315D3E_6B96_4F5D_9975_F35702205BC1 + +#include +#include +#include +#include "../common.hh" +#include "../kernel/cpplaunch_cuda.hh" +#include "../kernel/lorenzo_all.hh" +#include "../utils.hh" + +#include "cusz/type.h" +#include "pred_boilerplate_deprecated.hh" + +#define DEFINE_ARRAY(VAR, TYPE) TYPE* d_##VAR{nullptr}; + +#define ALLOCDEV(VAR, SYM, NBYTE) \ + if (NBYTE != 0) { \ + CHECK_CUDA(cudaMalloc(&d_##VAR, NBYTE)); \ + CHECK_CUDA(cudaMemset(d_##VAR, 0x0, NBYTE)); \ + } + +#define ALLOCDEV2(VAR, TYPE, LEN) \ + if (LEN != 0) { \ + CHECK_CUDA(cudaMalloc(&d_##VAR, sizeof(TYPE) * LEN)); \ + CHECK_CUDA(cudaMemset(d_##VAR, 0x0, sizeof(TYPE) * LEN)); \ + } + +#define FREE_DEV_ARRAY(VAR) \ + if (d_##VAR) { \ + CHECK_CUDA(cudaFree(d_##VAR)); \ + d_##VAR = nullptr; \ + } + +namespace cusz { + +template +class PredictionUnified : public PredictorBoilerplate { + public: + using Origin = T; + using Anchor = T; + using ErrCtrl = E; + using Precision = FP; + + public: + ~PredictionUnified() + { // dtor + FREE_DEV_ARRAY(anchor); + FREE_DEV_ARRAY(errctrl); + FREE_DEV_ARRAY(outlier); + } + PredictionUnified() {} // ctor + PredictionUnified(const PredictionUnified&); // copy ctor + PredictionUnified& operator=(const PredictionUnified&); // copy assign + PredictionUnified(PredictionUnified&&); // move ctor + PredictionUnified& operator=(PredictionUnified&&); // move assign + + void init(cusz_predictortype predictor, size_t x, size_t y, size_t z, bool dbg_print = false) + { + auto len3 = dim3(x, y, z); + init(predictor, len3, dbg_print); + } + void init(cusz_predictortype predictor, dim3 xyz, bool dbg_print = false) + { + this->derive_alloclen(predictor, xyz); + + // allocate + ALLOCDEV2(anchor, T, this->alloclen.assigned.anchor); + ALLOCDEV2(errctrl, E, this->alloclen.assigned.quant); + ALLOCDEV2(outlier, T, this->alloclen.assigned.outlier); + + if (dbg_print) this->debug_list_alloclen(); + } + + void construct( + cusz_predictortype predictor, + dim3 const len3, + T* data, + T** ptr_anchor, + E** ptr_errctrl, + T** ptr_outlier, + double const eb, + int const radius, + cudaStream_t stream) + { + *ptr_anchor = d_anchor; + *ptr_errctrl = d_errctrl; + *ptr_outlier = d_outlier; + + if (predictor == LorenzoI) { + derive_rtlen(LorenzoI, len3); + this->check_rtlen(); + + // ad hoc placeholder + // auto 
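[editor's note: construct() above mainly wires buffers and dispatches to the launch wrappers; the numerical idea behind a Lorenzo-type predictor is easier to see in a scalar host sketch. The code below is our own illustration, not the CUDA kernel: predict each value from the previously reconstructed one, quantize the residual on a 2*eb grid, and divert out-of-range residuals to the outlier stream. The outlier marker value is a detail of the real kernels; 0 is used here only for the sketch.]

#include <cmath>
#include <cstdlib>
#include <vector>

void lorenzo1d_quantize(const std::vector<float>& in, double eb, int radius,
                        std::vector<int>& quant, std::vector<float>& outlier)
{
    quant.assign(in.size(), 0);
    outlier.assign(in.size(), 0.0f);
    double prev_rec = 0.0;                                   // previously *reconstructed* value
    for (size_t i = 0; i < in.size(); i++) {
        double delta = in[i] - prev_rec;                     // Lorenzo-I prediction error
        int    code  = (int)std::lround(delta / (2 * eb));   // residual on a 2*eb grid
        if (std::abs(code) < radius) {
            quant[i] = code + radius;                        // in range: goes to the entropy coder
            prev_rec = prev_rec + code * 2 * eb;             // decoder-visible value, |err| <= eb
        }
        else {
            outlier[i] = in[i];                              // out of range: kept in the sparse stream
            prev_rec   = in[i];
        }
    }
}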
anchor_len3 = dim3(0, 0, 0); + // auto errctrl_len3 = dim3(0, 0, 0); + uint32_t* outlier_idx = nullptr; + + compress_predict_lorenzo_i( + data, len3, eb, radius, // + d_errctrl, d_outlier, outlier_idx, nullptr, // + &time_elapsed, stream); + } + else if (predictor == Spline3) { + this->derive_rtlen(Spline3, len3); + this->check_rtlen(); + + cusz::cpplaunch_construct_Spline3( + true, // + data, len3, d_anchor, this->rtlen.anchor.len3, d_errctrl, this->rtlen.aligned.len3, eb, radius, + &time_elapsed, stream); + } + } + + void reconstruct( + cusz_predictortype predictor, + dim3 len3, + T* outlier_xdata, + T* anchor, + E* errctrl, + double const eb, + int const radius, + cudaStream_t stream) + { + if (predictor == LorenzoI) { + this->derive_rtlen(LorenzoI, len3); + this->check_rtlen(); + + // ad hoc placeholder + // auto anchor_len3 = dim3(0, 0, 0); + // auto errctrl_len3 = dim3(0, 0, 0); + auto xdata = outlier_xdata; + auto outlier = outlier_xdata; + uint32_t* outlier_idx = nullptr; + + auto xdata_len3 = len3; + + decompress_predict_lorenzo_i( + errctrl, xdata_len3, outlier, outlier_idx, 0, eb, radius, // + xdata, // + &time_elapsed, stream); + } + else if (predictor == Spline3) { + this->derive_rtlen(Spline3, len3); + this->check_rtlen(); + // this->debug_list_rtlen(true); + + // launch_reconstruct_Spline3( + cusz::cpplaunch_reconstruct_Spline3( + outlier_xdata, len3, anchor, this->rtlen.anchor.len3, errctrl, this->rtlen.aligned.len3, eb, radius, + &time_elapsed, stream); + } + } + + void clear_buffer() { cudaMemset(d_errctrl, 0x0, sizeof(E) * this->rtlen.assigned.quant); } + + float get_time_elapsed() const { return time_elapsed; } + // size_t get_alloclen_data() const; + // size_t get_alloclen_quant() const; + // size_t get_len_data() const; + // size_t get_len_quant() const; + // size_t get_len_anchor() const; + + E* expose_quant() const { return d_errctrl; } + E* expose_errctrl() const { return d_errctrl; } + T* expose_anchor() const { return d_anchor; } + T* expose_outlier() const { return d_outlier; } + + public: + // data + DEFINE_ARRAY(anchor, T); + DEFINE_ARRAY(errctrl, E); + DEFINE_ARRAY(outlier, T); +}; + +} // namespace cusz + +#undef ALLOCDEV +#undef FREE_DEV_ARRAY +#undef DEFINE_ARRAY + +#endif /* FB315D3E_6B96_4F5D_9975_F35702205BC1 */ diff --git a/qtensor/compression/cusz/include/component/spcodec.inl b/qtensor/compression/cusz/include/component/spcodec.inl new file mode 100644 index 00000000..32c91ab0 --- /dev/null +++ b/qtensor/compression/cusz/include/component/spcodec.inl @@ -0,0 +1,218 @@ +/** + * @file spcodec_vec.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-08-22 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef CF358238_3946_4FFC_B5E6_45C12F0C0B44 +#define CF358238_3946_4FFC_B5E6_45C12F0C0B44 + +#include +#include +#include + +#include +#include +#include + +#include "../common.hh" +#include "../kernel/spv_gpu.hh" +#include "utils/cuda_err.cuh" + +#define DEFINE_ARRAY(VAR, TYPE) TYPE* d_##VAR{nullptr}; + +#define SPVEC_ALLOCDEV(VAR, SYM) \ + CHECK_CUDA(cudaMalloc(&d_##VAR, rte.nbyte[RTE::SYM])); \ + CHECK_CUDA(cudaMemset(d_##VAR, 0x0, rte.nbyte[RTE::SYM])); + +#define SPVEC_FREEDEV(VAR) \ + if (d_##VAR) { \ + CHECK_CUDA(cudaFree(d_##VAR)); \ + d_##VAR = nullptr; \ + } + +#define SPVEC_D2DCPY(VAR, FIELD) \ + { \ + auto dst = d_spfmt + header.entry[Header::FIELD]; \ + auto src = reinterpret_cast(d_##VAR); \ + CHECK_CUDA(cudaMemcpyAsync(dst, src, nbyte[Header::FIELD], cudaMemcpyDeviceToDevice, stream)); \ + 
} + +namespace cusz { + +/******************************************************************************* + * sparsity-aware coder/decoder, vector + *******************************************************************************/ + +template +class SpcodecVec { + public: + using Origin = T; + using BYTE = uint8_t; + using MetadataT = M; + + struct alignas(128) Header { + static const int HEADER = 0; + static const int IDX = 1; + static const int VAL = 2; + static const int END = 3; + + int self_bytes : 16; + size_t uncompressed_len; + int nnz; + MetadataT entry[END + 1]; + + MetadataT subfile_size() const { return entry[END]; } + }; + + struct runtime_encode_helper { + static const int SPFMT = 0; + static const int IDX = 1; + static const int VAL = 2; + static const int END = 3; + + uint32_t nbyte[END]; + int nnz{0}; + }; + + private: + DEFINE_ARRAY(spfmt, BYTE); + DEFINE_ARRAY(idx, M); + DEFINE_ARRAY(val, T); + + using RTE = runtime_encode_helper; + + float milliseconds{0.0}; + + RTE rte; + + public: + ~SpcodecVec() + { + SPVEC_FREEDEV(spfmt); + SPVEC_FREEDEV(idx); + SPVEC_FREEDEV(val); + } // dtor + SpcodecVec() {} // ctor + SpcodecVec(const SpcodecVec&); // copy ctor + SpcodecVec& operator=(const SpcodecVec&); // copy assign + SpcodecVec(SpcodecVec&&); // move ctor + SpcodecVec& operator=(SpcodecVec&&); // move assign + + void init(size_t const len, int density_factor = 4, bool dbg_print = false) + { + auto max_bytes = [&]() { return len / density_factor * sizeof(T); }; + auto init_nnz = [&]() { return len / density_factor; }; + + memset(rte.nbyte, 0, sizeof(uint32_t) * RTE::END); + rte.nnz = init_nnz(); + + rte.nbyte[RTE::SPFMT] = max_bytes(); + rte.nbyte[RTE::IDX] = rte.nnz * sizeof(int); + rte.nbyte[RTE::VAL] = rte.nnz * sizeof(T); + + SPVEC_ALLOCDEV(spfmt, SPFMT); + SPVEC_ALLOCDEV(idx, IDX); + SPVEC_ALLOCDEV(val, VAL); + + // if (dbg_print) debug(); + } + + void encode( + T* in, + size_t const in_len, + BYTE*& out, + size_t& out_len, + cudaStream_t stream = nullptr, + bool dbg_print = false) + { + Header header; + + psz::spv_gather(in, in_len, this->d_val, this->d_idx, &rte.nnz, &milliseconds, stream); + + subfile_collect(header, in_len, stream, dbg_print); + out = d_spfmt; + out_len = header.subfile_size(); + } + + void decode(BYTE* coded, T* decoded, cudaStream_t stream = nullptr) + { + Header header; + CHECK_CUDA(cudaMemcpyAsync(&header, coded, sizeof(header), cudaMemcpyDeviceToHost, stream)); + +#define ACCESSOR(SYM, TYPE) reinterpret_cast(coded + header.entry[Header::SYM]) + auto d_idx = ACCESSOR(IDX, uint32_t); + auto d_val = ACCESSOR(VAL, T); +#undef ACCESSOR + + psz::spv_scatter(d_val, d_idx, header.nnz, decoded, &milliseconds, stream); + } + + void clear_buffer() + { + cudaMemset(d_spfmt, 0x0, rte.nbyte[RTE::SPFMT]); + cudaMemset(d_idx, 0x0, rte.nbyte[RTE::IDX]); + cudaMemset(d_val, 0x0, rte.nbyte[RTE::VAL]); + } + + float get_time_elapsed() const { return milliseconds; } + + void subfile_collect(Header& header, size_t len, cudaStream_t stream, bool dbg_print) + { + header.self_bytes = sizeof(Header); + header.uncompressed_len = len; + header.nnz = rte.nnz; + + // update (redundant here) + rte.nbyte[RTE::IDX] = sizeof(int) * rte.nnz; + rte.nbyte[RTE::VAL] = sizeof(T) * rte.nnz; + + MetadataT nbyte[Header::END]; + nbyte[Header::HEADER] = 128; + nbyte[Header::IDX] = rte.nbyte[RTE::IDX]; + nbyte[Header::VAL] = rte.nbyte[RTE::VAL]; + + header.entry[0] = 0; + // *.END + 1; need to knwo the ending position + for (auto i = 1; i < Header::END + 1; i++) { header.entry[i] = nbyte[i - 
1]; } + for (auto i = 1; i < Header::END + 1; i++) { header.entry[i] += header.entry[i - 1]; } + + auto debug_header_entry = [&]() { + printf("\nCSR11::subfile_collect() debugging:\n"); + printf("%-*s: %'10ld\n", 16, "final.nnz", rte.nnz); + printf(" ENTRIES\n"); + +#define PRINT_ENTRY(VAR) printf("%d %-*s: %'10u\n", (int)Header::VAR, 14, #VAR, header.entry[Header::VAR]); + PRINT_ENTRY(HEADER); + PRINT_ENTRY(IDX); + PRINT_ENTRY(VAL); + PRINT_ENTRY(END); + printf("\n"); +#undef PRINT_ENTRY + }; + if (dbg_print) debug_header_entry(); + + CHECK_CUDA(cudaMemcpyAsync(d_spfmt, &header, sizeof(header), cudaMemcpyHostToDevice, stream)); + + /* debug */ CHECK_CUDA(cudaStreamSynchronize(stream)); + + SPVEC_D2DCPY(idx, IDX) + SPVEC_D2DCPY(val, VAL) + + /* debug */ CHECK_CUDA(cudaStreamSynchronize(stream)); + } +}; + +} // namespace cusz + +#undef DEFINE_ARRAY +#undef SPVEC_ALLOCDEV +#undef SPVEC_FREEDEV +#undef SPVEC_D2DCPY + +#endif /* CF358238_3946_4FFC_B5E6_45C12F0C0B44 */ diff --git a/qtensor/compression/cusz/include/compressor.hh b/qtensor/compression/cusz/include/compressor.hh new file mode 100644 index 00000000..7ea8c0ab --- /dev/null +++ b/qtensor/compression/cusz/include/compressor.hh @@ -0,0 +1,165 @@ +/** + * @file compressor.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-23 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_COMPRESSOR_HH +#define CUSZ_COMPRESSOR_HH + +#include +#include + +#include "common/type_traits.hh" +#include "compaction.hh" +#include "component.hh" +#include "context.hh" +#include "header.h" + +#define PUBLIC_TYPES \ + using Predictor = typename BINDING::Predictor; \ + using Spcodec = typename BINDING::Spcodec; \ + using Codec = typename BINDING::Codec; \ + using FallbackCodec = typename BINDING::FallbackCodec; \ + using BYTE = uint8_t; \ + \ + using T = typename BINDING::DATA; \ + using FP = typename BINDING::FP; \ + using E = typename BINDING::ERRCTRL; \ + using H = typename Codec::Encoded; \ + using M = typename Codec::MetadataT; \ + using H_FB = typename FallbackCodec::Encoded; \ + \ + using TimeRecord = std::vector>; \ + using timerecord_t = TimeRecord*; + +namespace cusz { + +// extra helper +struct CompressorHelper { + static int autotune_coarse_parvle(Context* ctx); +}; + +template +class Compressor { + public: + using Predictor = typename BINDING::Predictor; + using Spcodec = typename BINDING::Spcodec; + using Codec = typename BINDING::Codec; + using FallbackCodec = typename BINDING::FallbackCodec; + using BYTE = uint8_t; + + using T = typename Predictor::Origin; + using FP = typename Predictor::Precision; + using E = typename Predictor::ErrCtrl; + using H = typename Codec::Encoded; + using M = typename Codec::MetadataT; + using H_FB = typename FallbackCodec::Encoded; + + using TimeRecord = std::vector>; + using timerecord_t = TimeRecord*; + + private: + class impl; + std::unique_ptr pimpl; + + public: + ~Compressor(); + Compressor(); + Compressor(const Compressor&); + Compressor& operator=(const Compressor&); + Compressor(Compressor&&); + Compressor& operator=(Compressor&&); + + // methods + void init(Context*, bool dbg_print = false); + void init(Header*, bool dbg_print = false); + void destroy(); + void compress(Context*, T*, BYTE*&, size_t&, cudaStream_t = nullptr, bool = false); + void decompress(Header*, BYTE*, T*, cudaStream_t = nullptr, bool = true); + void clear_buffer(); + // getter + void export_header(Header&); + void export_header(Header*); + void 
export_timerecord(TimeRecord*); +}; + +template +class Compressor::impl { + public: + using Predictor = typename BINDING::Predictor; + using Spcodec = typename BINDING::Spcodec; + using Codec = typename BINDING::Codec; + using FallbackCodec = typename BINDING::FallbackCodec; + using BYTE = uint8_t; + + using T = typename Predictor::Origin; + using FP = typename Predictor::Precision; + using E = typename Predictor::ErrCtrl; + using H = typename Codec::Encoded; + using M = typename Codec::MetadataT; + using H_FB = typename FallbackCodec::Encoded; + + using TimeRecord = std::vector>; + using timerecord_t = TimeRecord*; + + private: + // state + bool use_fallback_codec{false}; + bool fallback_codec_allocated{false}; + BYTE* d_reserved_compressed{nullptr}; + // profiling + TimeRecord timerecord; + // header + Header header; + // components + + Predictor* predictor; + Spcodec* spcodec; + Codec* codec; + FallbackCodec* fb_codec; + // variables + uint32_t* d_freq; + float time_hist; + dim3 data_len3; + + public: + ~impl(); + impl(); + + // public methods + void init(Context* config, bool dbg_print = false); + void init(Header* config, bool dbg_print = false); + void compress(Context*, T*, BYTE*&, size_t&, cudaStream_t = nullptr, bool = false); + void decompress(Header*, BYTE*, T*, cudaStream_t = nullptr, bool = true); + void clear_buffer(); + + // getter + void export_header(Header&); + void export_header(Header*); + void export_timerecord(TimeRecord*); + uint32_t get_len_data(); + + private: + // helper + template + void init_detail(CONFIG*, bool); + void init_codec(size_t, unsigned int, int, int, bool); + void collect_compress_timerecord(); + void collect_decompress_timerecord(); + void encode_with_exception(E*, size_t, uint32_t*, int, int, int, bool, BYTE*&, size_t&, cudaStream_t, bool); + void subfile_collect(T*, size_t, BYTE*, size_t, BYTE*, size_t, cudaStream_t, bool); + void destroy(); + // getter +}; + +} // namespace cusz + +#undef PUBLIC_TYPES + +#endif diff --git a/qtensor/compression/cusz/include/context.hh b/qtensor/compression/cusz/include/context.hh new file mode 100644 index 00000000..36cbae57 --- /dev/null +++ b/qtensor/compression/cusz/include/context.hh @@ -0,0 +1,251 @@ +#ifndef ARGPARSE_HH +#define ARGPARSE_HH + +/** + * @file argparse.hh + * @author Jiannan Tian + * @brief Argument parser (header). 
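[editor's note: a configuration sketch using the chainable setters declared below (set_len / set_eb / set_radius); the include path is illustrative, and real call sites elsewhere in the codebase may construct the context differently.]

#include "context.hh"   // provides cuszCTX / cusz::Context

void configure_demo()
{
    cusz::Context ctx;
    ctx.set_len(3600, 1800)   // x, y (z = w = 1, so ndim resolves to 2)
       .set_eb(1e-4)          // error bound
       .set_radius(512);      // quant-code radius; dict_size follows as 2 * radius
}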
+ * @version 0.1 + * @date 2020-09-20 + * Created on: 20-04-24 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include +#include +#include +#include + +#include "common/configs.hh" +#include "common/definition.hh" +#include "utils/format.hh" +#include "utils/strhelper.hh" + +namespace cusz { + +extern const char* VERSION_TEXT; +extern const int version; +extern const int compatibility; + +} // namespace cusz + +struct cuszCTX { + public: + // on-off's + struct { + bool construct{false}, reconstruct{false}, dryrun{false}; + bool experiment{false}; + bool gtest{false}; + } cli_task; + + struct { + bool binning{false}, logtransform{false}, prescan{false}; + } preprocess; + struct { + bool gpu_nvcomp_cascade{false}, cpu_gzip{false}; + } postcompress; + + struct { + bool predefined_demo{false}, release_input{false}; + bool anchor{false}, autotune_vle_pardeg{true}, gpu_verify{false}; + } use; + + struct { + bool book{false}, quant{false}; + } export_raw; + + struct { + bool write2disk{false}, huffman{false}; + } skip; + struct { + bool time{false}, cr{false}, compressibility{false}; + } report; + + // filenames + struct { + std::string fname, origin_cmp, path_basename, basename, compress_output; + } fname; + + bool verbose{false}; + + // Stat stat; + + int read_args_status{0}; + + std::string opath; + + std::string demo_dataset; + std::string dtype = ConfigHelper::get_default_dtype(); // "f32" + std::string mode = ConfigHelper::get_default_cuszmode(); // "r2r" + std::string predictor = ConfigHelper::get_default_predictor(); // "lorenzo" + std::string codec = ConfigHelper::get_default_codec(); // "huffman-coarse" + std::string spcodec = ConfigHelper::get_default_spcodec(); // "cusparse-csr" + std::string pipeline = "auto"; + + // sparsity related: init_nnz when setting up Spcodec + float nz_density{SparseMethodSetup::default_density}; + float nz_density_factor{SparseMethodSetup::default_density_factor}; + + uint32_t codecs_in_use{0b01}; + + uint32_t quant_bytewidth{2}, huff_bytewidth{4}; + + bool codec_force_fallback() const { return huff_bytewidth == 8; } + + size_t huffman_num_uints, huffman_num_bits; + int vle_sublen{512}, vle_pardeg{-1}; + + unsigned int x{1}, y{1}, z{1}, w{1}; + + struct { + // size_t x, y, z, w; + size_t len; + } alloclen; + + size_t data_len{1}, quant_len{1}, anchor_len{1}; + int ndim{-1}; + + size_t get_len() const { return data_len; } + + double eb{0.0}; + int dict_size{1024}, radius{512}; + + void load_demo_sizes(); + + /******************************************************************************* + * another configuration method, alternative to + *******************************************************************************/ + public: + // for configuration + cuszCTX& set_eb(double _) + { + eb = _; + return *this; + } + + cuszCTX& set_radius(int _) + { + radius = _; + dict_size = radius * 2; + return *this; + } + + cuszCTX& set_huffbyte(int _) + { + huff_bytewidth = _; + codecs_in_use = codec_force_fallback() ? 0b11 /*use both*/ : 0b01 /*use 4-byte*/; + return *this; + } + + cuszCTX& set_huffchunk(int _) + { + vle_sublen = _; + use.autotune_vle_pardeg = false; + return *this; + } + + cuszCTX& set_spcodec_densityfactor(int _) + { + if (_ <= 1) + throw std::runtime_error( + "Density factor for Spcodec must be >1. 
For example, setting the factor as 4 indicates the density " + "(the portion of nonzeros) is 25% in an array."); + nz_density_factor = _; + nz_density = 1.0 / _; + return *this; + } + + cuszCTX& enable_anchor(bool _) + { + use.anchor = true; + return *this; + } + cuszCTX& enable_input_nondestructive(bool _) + { + // placeholder + return *this; + } + + cuszCTX& enable_failfast(bool _) + { + // placeholder + return *this; + } + + cuszCTX& set_alloclen(size_t _) + { + alloclen.len = _; + return *this; + } + + cuszCTX& set_control_string(const char* in_str); + + cuszCTX& use_anchor(size_t _) + { + use.anchor = true; + return *this; + } + + // set x, y, z, w, ndim, data_len + cuszCTX& set_len(size_t _x, size_t _y = 1, size_t _z = 1, size_t _w = 1) + { + x = _x, y = _y, z = _z, w = _w; + + ndim = 4; + if (w == 1) ndim = 3; + if (z == 1) ndim = 2; + if (y == 1) ndim = 1; + + data_len = x * y * z * w; + + if (data_len == 1) throw std::runtime_error("Input data length cannot be 1 (in 1-D view)."); + if (data_len == 0) throw std::runtime_error("Input data length cannot be 0 (in 1-D view)."); + + return *this; + } + + private: + void derive_fnames(); + + void validate(); + + public: + void trap(int _status); + + static void print_doc(bool full = false); + + public: + static void parse_input_length(const char* lenstr, cuszCTX* ctx) + { + std::vector dims; + ConfigHelper::parse_length_literal(lenstr, dims); + ctx->ndim = dims.size(); + ctx->y = ctx->z = ctx->w = 1; + ctx->x = StrHelper::str2int(dims[0]); + if (ctx->ndim >= 2) ctx->y = StrHelper::str2int(dims[1]); + if (ctx->ndim >= 3) ctx->z = StrHelper::str2int(dims[2]); + if (ctx->ndim >= 4) ctx->w = StrHelper::str2int(dims[3]); + ctx->data_len = ctx->x * ctx->y * ctx->z * ctx->w; + } + + public: + cuszCTX() = default; + + cuszCTX(int argc, char** argv); + + cuszCTX(const char*, bool dbg_print = false); +}; + +typedef struct cuszCTX cusz_context; + +namespace cusz { + +using Context = cusz_context; +using context_t = cusz_context*; + +} // namespace cusz + +#endif // ARGPARSE_HH diff --git a/qtensor/compression/cusz/include/cusz.h b/qtensor/compression/cusz/include/cusz.h new file mode 100644 index 00000000..694d315c --- /dev/null +++ b/qtensor/compression/cusz/include/cusz.h @@ -0,0 +1,60 @@ +/** + * @file cusz.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-29 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#include +//#define __cplusplus +//#ifdef __cplusplus +extern "C" { +//#endif + +#ifndef CUSZ_H +#define CUSZ_H + +#include + +#include "cusz/custom.h" +#include "cusz/record.h" +#include "cusz/type.h" +#include "header.h" + +#pragma link C++ all function +#pragma link C++ all class + +cusz_compressor* cusz_create(cusz_framework* framework, cusz_datatype const type); + +cusz_error_status cusz_release(cusz_compressor* comp); + +cusz_error_status cusz_compress( + cusz_compressor* comp, + cusz_config* config, + void* uncompressed, + cusz_len const uncomp_len, + uint8_t** compressed, + size_t* comp_bytes, + cusz_header* header, + void* record, + cudaStream_t stream); + +cusz_error_status cusz_decompress( + cusz_compressor* comp, + cusz_header* header, + uint8_t* compressed, + size_t const comp_len, + void* decompressed, + cusz_len const decomp_len, + void* record, + cudaStream_t stream); + +#endif + +//#ifdef __cplusplus +} +//#endif diff --git a/qtensor/compression/cusz/include/cusz/custom.h b/qtensor/compression/cusz/include/cusz/custom.h new file mode 100644 index 
00000000..c44682be --- /dev/null +++ b/qtensor/compression/cusz/include/cusz/custom.h @@ -0,0 +1,26 @@ +/** + * @file compress.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-30 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "type.h" + +cusz_custom_predictor cusz_default_predictor(); +cusz_custom_codec cusz_default_codec(); +cusz_custom_huffman_codec cusz_default_huffman_codec(); +cusz_custom_spcodec cusz_default_spcodec(); +cusz_custom_framework* cusz_default_framework(); + +#ifdef __cplusplus +} +#endif diff --git a/qtensor/compression/cusz/include/cusz/it.hh b/qtensor/compression/cusz/include/cusz/it.hh new file mode 100644 index 00000000..1e8daa34 --- /dev/null +++ b/qtensor/compression/cusz/include/cusz/it.hh @@ -0,0 +1,78 @@ +/** + * @file it.hh + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-03-13 + * + * (C) 2023 by Indiana University, Argonne National Laboratory + * + */ + +#include +#include +#include +#include +#include + +template +struct psz_buf { + private: + T* _buf; + size_t _len{1}; + static const int stridey{BLOCK}; + static const int stridez{BLOCK * BLOCK}; + + public: + psz_buf(bool do_memset = true) + { + if (DIM == 1) _len = BLOCK; + if (DIM == 2) _len = BLOCK * BLOCK; + if (DIM == 3) _len = BLOCK * BLOCK * BLOCK; + _buf = new T[_len]; + if (do_memset) memset(_buf, 0x0, sizeof(T) * _len); + } + + ~psz_buf() { delete[] _buf; } + + T*& buf() { return _buf; } + + T& operator()(int x) { return _buf[x]; } + T& operator()(int x, int y) { return _buf[x + y * stridey]; } + T& operator()(int x, int y, int z) { return _buf[x + y * stridey + z * stridez]; } +}; + +template +struct psz_outlier_serial { + private: + T* _data; + IDX* _idx; + uint32_t _count{0}; + uint32_t _cap; + + public: + psz_outlier_serial(size_t cap) : _cap(cap) + { + _data = new T[cap + 1]; + _idx = new IDX[cap + 1]; + memset(_data, 0x0, sizeof(T) * cap); + } + + ~psz_outlier_serial() + { + delete[] _data; + delete[] _idx; + } + + T*& val() { return _data; } + IDX*& idx() { return _idx; } + uint32_t const count() { return _count; } + + void record(T data, IDX idx) + { + if (_count > _cap) throw std::runtime_error("Outlier overflows."); + _data[_count] = data; + _idx[_count] = idx; + ++_count; + } +}; \ No newline at end of file diff --git a/qtensor/compression/cusz/include/cusz/nd.h b/qtensor/compression/cusz/include/cusz/nd.h new file mode 100644 index 00000000..007dfd7d --- /dev/null +++ b/qtensor/compression/cusz/include/cusz/nd.h @@ -0,0 +1,15 @@ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +typedef struct psz_dim3 { + uint32_t x, y, z; +} psz_dim3; + +#ifdef __cplusplus +} +#endif \ No newline at end of file diff --git a/qtensor/compression/cusz/include/cusz/pn.hh b/qtensor/compression/cusz/include/cusz/pn.hh new file mode 100644 index 00000000..1c1bb472 --- /dev/null +++ b/qtensor/compression/cusz/include/cusz/pn.hh @@ -0,0 +1,49 @@ +/** + * @file pn.hh + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-01-05 + * + * (C) 2023 by Indiana University, Argonne National Laboratory + * + */ + +#include +#include + +// TODO typing should be more applicable + +namespace psz { +namespace typing { + +// clang-format off +template struct Int; +template <> struct Int<1> { typedef int8_t T; }; +template <> struct Int<2> { typedef int16_t T; }; +template <> struct Int<4> { typedef int32_t T; }; +template <> struct Int<8> { typedef int64_t 
T; }; + +template struct UInt; +template <> struct UInt<1> { typedef uint8_t T; }; +template <> struct UInt<2> { typedef uint16_t T; }; +template <> struct UInt<4> { typedef uint32_t T; }; +template <> struct UInt<8> { typedef uint64_t T; }; +// clang-format on + +} // namespace typing +} // namespace psz + +// TODO forward definition in another file +template +struct PN { + using UI = typename psz::typing::UInt::T; + using I = typename psz::typing::Int::T; + + // reference: https://lemire.me/blog/2022/11/25/making-all-your-integers-positive-with-zigzag-encoding/ + + static UI encode(I* x) { return (2 * (*x)) ^ ((*x) >> (BYTEWIDTH * 8 - 1)); } + static UI encode(I x) { return (2 * x) ^ (x >> (BYTEWIDTH * 8 - 1)); } + static I decode(UI* x) { return ((*x) >> 1) ^ (-((*x) & 1)); } + static I decode(UI x) { return (x >> 1) ^ (-(x & 1)); } +}; diff --git a/qtensor/compression/cusz/include/cusz/record.h b/qtensor/compression/cusz/include/cusz/record.h new file mode 100644 index 00000000..d285f1b1 --- /dev/null +++ b/qtensor/compression/cusz/include/cusz/record.h @@ -0,0 +1,38 @@ +/** + * @file record.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-30 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_RECORD_H +#define CUSZ_RECORD_H + +#ifdef __cplusplus +extern "C" { +#endif + +struct cusz_record_entry; + +struct cusz_record_entry { + const char* name; + double time; + + struct cusz_record_entry* next; +}; + +typedef struct cusz_record { + int n; + + struct cusz_record_entry* head; +} cusz_record; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/qtensor/compression/cusz/include/cusz/type.h b/qtensor/compression/cusz/include/cusz/type.h new file mode 100644 index 00000000..b5f2d750 --- /dev/null +++ b/qtensor/compression/cusz/include/cusz/type.h @@ -0,0 +1,219 @@ +/** + * @file type.h + * @author Jiannan Tian + * @brief C-complient type definitions; no methods in this header. 
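[editor's note: the forward/inverse forms of PN above are easy to mistype, so here is a tiny self-check, illustration only, of the same ZigZag mapping spelled out for 32-bit values. Small-magnitude signed residuals map to small unsigned codes.]

#include <cassert>
#include <cstdint>

uint32_t zigzag_enc(int32_t x) { return (2 * x) ^ (x >> 31); }
int32_t  zigzag_dec(uint32_t u) { return (u >> 1) ^ (-(u & 1)); }

int main()
{
    assert(zigzag_enc(0) == 0 && zigzag_enc(-1) == 1 && zigzag_enc(1) == 2 && zigzag_enc(-2) == 3);
    for (int32_t v = -4096; v <= 4096; ++v) assert(zigzag_dec(zigzag_enc(v)) == v);  // round trip
    return 0;
}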
+ * @version 0.3 + * @date 2022-04-29 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef CUSZ_TYPE_H +#define CUSZ_TYPE_H + +#include "stddef.h" + +enum cusz_execution_policy { CPU, CUDA }; +typedef enum cusz_execution_policy cusz_execution_policy; +typedef enum cusz_execution_policy cusz_policy; +typedef enum cusz_execution_policy asz_policy; + +//////// state enumeration + +typedef enum cusz_error_status { // + CUSZ_SUCCESS = 0x00, + CUSZ_FAIL_ONDISK_FILE_ERROR = 0x01, + CUSZ_FAIL_DATA_NOT_READY = 0x02, + // specify error when calling CUDA API + CUSZ_FAIL_GPU_MALLOC, + CUSZ_FAIL_GPU_MEMCPY, + CUSZ_FAIL_GPU_ILLEGAL_ACCESS, + // specify error related to our own memory manager + CUSZ_FAIL_GPU_OUT_OF_MEMORY, + // when compression is useless + CUSZ_FAIL_INCOMPRESSIABLE, + // TODO component related error + CUSZ_FAIL_UNSUPPORTED_DATATYPE, + CUSZ_FAIL_UNSUPPORTED_QUANTTYPE, + CUSZ_FAIL_UNSUPPORTED_PRECISION, + CUSZ_FAIL_UNSUPPORTED_PIPELINE, + // not-implemented error + CUSZ_NOT_IMPLEMENTED = 0x0100, +} cusz_error_status; + +typedef struct cusz_fixedlen_internal { /* all nullable */ + void* encoding; +} cusz_fixedlen_internal; +typedef struct cusz_varlen_internal { /* all nullable */ + void* huffman; + void* outlier; +} cusz_varlen_internal; + +typedef enum cusz_datatype // +{ FP32 = 0, + FP64 = 1, + UINT8 = 10, + UINT16 = 11, + UINT32 = 12, + UINT64 = 13 } cusz_datatype; + +typedef enum cusz_executiontype // +{ Device = 0, + Host = 1, + None = 2 } cusz_executiontype; + +typedef enum cusz_mode // +{ Abs = 0, + Rel = 1 } cusz_mode; + +typedef enum cusz_pipelinetype // +{ Auto = 0, + Dense = 1, + Sparse = 2 } cusz_pipelinetype; + +typedef enum cusz_predictortype // +{ Lorenzo0 = 0, + LorenzoI = 1, + LorenzoII = 2, + Spline3 = 3 } cusz_predictortype; + +typedef enum cusz_preprocessingtype // +{ FP64toFP32 = 0, + LogTransform, + ShiftedLogTransform, + Binning2x2, + Binning2x1, + Binning1x2, +} cusz_preprocessingtype; + +typedef enum cusz_codectype // +{ Huffman = 0, + RunLength, + NvcompCascade, + NvcompLz4, + NvcompSnappy, +} cusz_codectype; + +typedef enum cusz_spcodectype // +{ SparseMat = 0, + SparseVec = 1 } cusz_spcodectype; + +typedef enum cusz_huffman_booktype // +{ Tree = 0, + Canonical = 1 } cusz_huffman_booktype; + +typedef enum cusz_huffman_codingtype // +{ Coarse = 0, + Fine = 1 } cusz_huffman_codingtype; + +//////// configuration template +typedef struct cusz_custom_len { + // clang-format off + union { size_t x0, x; }; + union { size_t x1, y; }; + union { size_t x2, z; }; + union { size_t x3, w; }; + // double factor; + // clang-format on +} cusz_custom_len; +typedef cusz_custom_len cusz_len; + +typedef struct cusz_custom_preprocessing { + cusz_custom_len before; + cusz_custom_len after; + cusz_preprocessingtype* list; + int nstep; + +} cusz_custom_preprocessing; + +typedef struct cusz_custom_predictor { + cusz_predictortype type; + + bool anchor; + bool nondestructive; +} cusz_custom_predictor; + +typedef struct cusz_custom_quantization { + int radius; + bool delayed; +} cusz_custom_quantization; + +typedef struct cusz_custom_codec { + cusz_codectype type; + + bool variable_length; + float presumed_density; +} cusz_custom_codec; + +typedef struct cusz_custom_huffman_codec { + cusz_huffman_booktype book; + cusz_executiontype book_policy; + cusz_huffman_codingtype coding; + + int booklen; + int coarse_pardeg; +} cusz_custom_huffman_codec; + +typedef struct cusz_custom_spcodec { + cusz_spcodectype 
type; + float presumed_density; +} cusz_custom_spcodec; + +////// wrap-up + +/** + * @deprecated The framework could be simplifed & unified. + */ +typedef struct cusz_custom_framework { + cusz_datatype datatype; + cusz_pipelinetype pipeline; + + cusz_custom_predictor predictor; + cusz_custom_quantization quantization; + cusz_custom_codec codec; + // cusz_custom_spcodec spcodec; + + cusz_custom_huffman_codec huffman; +} cusz_custom_framework; + +typedef cusz_custom_framework cusz_framework; + +typedef struct cusz_compressor_redundancy_compat_purpose { + void* compressor; + cusz_framework* framework; + cusz_datatype type; +} cusz_compressor_compat; + +typedef cusz_compressor_compat cusz_compressor; + +typedef struct cusz_runtime_config { + double eb; + cusz_mode mode; +} cusz_runtime_config; +typedef cusz_runtime_config cusz_config; + +typedef struct Res { + double min, max, rng, std; +} Res; + +typedef struct cusz_stats { + // clang-format off + Res odata, xdata; + struct { double PSNR, MSE, NRMSE, coeff; } reduced; + struct { double abs, rel, pwrrel; size_t idx; } max_err; + struct { double lag_one, lag_two; } autocor; + double user_eb; + size_t len; + // clang-format on +} cusz_stats; + +#endif + +#ifdef __cplusplus +} +#endif diff --git a/qtensor/compression/cusz/include/framework.hh b/qtensor/compression/cusz/include/framework.hh new file mode 100644 index 00000000..b0e99960 --- /dev/null +++ b/qtensor/compression/cusz/include/framework.hh @@ -0,0 +1,62 @@ +/** + * @file framework.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-23 + * (create) 2021-10-06 (rev) 2022-04-23 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_FRAMEWORK +#define CUSZ_FRAMEWORK + +#include "component.hh" +#include "compressor.hh" + +namespace cusz { + +template +struct Framework { + public: + /** + * + * Predictor + * | | ^ + * v | | + * Spcodec | +---- default "fast-lowlowprecision" + * v + * Encoder + */ + + using DATA = InputDataType; + using ERRCTRL = ErrCtrlTrait<4, false>::type; // predefined for mem. 
overlapping + using FP = typename FastLowPrecisionTrait::type; + using Huff4 = HuffTrait<4>::type; + using Huff8 = HuffTrait<8>::type; + using Meta4 = MetadataTrait<4>::type; + + template + struct CompressorTemplate; + + /* Predictor */ + using CompatPurposePredictor = typename cusz::PredictionUnified; + using Predictor = CompatPurposePredictor; + + using CompatPurposeSpcodec = typename cusz::SpcodecVec; + using Spcodec = CompatPurposeSpcodec; + + /* Lossless Codec*/ + using CodecHuffman32 = cusz::LosslessCodec; + using CodecHuffman64 = cusz::LosslessCodec; + using Codec = CodecHuffman32; + using FallbackCodec = CodecHuffman64; +}; + +using CompressorFP32 = cusz::Compressor>; + +} // namespace cusz + +#endif diff --git a/qtensor/compression/cusz/include/header.h b/qtensor/compression/cusz/include/header.h new file mode 100644 index 00000000..c0fd67d8 --- /dev/null +++ b/qtensor/compression/cusz/include/header.h @@ -0,0 +1,111 @@ +#ifndef CUSZ_HEADER_H +#define CUSZ_HEADER_H + +/** + * @file header.h + * @author Jiannan Tian + * @brief + * @version 0.2 + * @date 2021-01-22 + * (created) 2020-09-25, (rev.1) 2021-01-22 (rev.2) 2021-09-08 (rev.3) 2022-02-26 + * + * @copyright (C) 2020 by Washington State University, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +typedef struct alignas(128) cusz_header { + static const int HEADER = 0; + static const int ANCHOR = 1; + static const int VLE = 2; + static const int SPFMT = 3; + + static const int END = 4; + + uint32_t self_bytes : 16; + uint32_t fp : 1; + uint32_t byte_vle : 4; // 4, 8 + uint32_t nz_density_factor : 8; + uint32_t codecs_in_use : 2; + uint32_t vle_pardeg; + uint32_t x, y, z, w; + double eb; + uint32_t radius : 16; + + uint32_t entry[END + 1]; + + // uint32_t byte_uncompressed : 4; // T; 1, 2, 4, 8 + // uint32_t byte_errctrl : 3; // 1, 2, 4 + // uint32_t byte_meta : 4; // 4, 8 + // uint32_t ndim : 3; // 1,2,3,4 + // size_t data_len; + // size_t errctrl_len; + +} cusz_header; + +typedef cusz_header cuszHEADER; + +typedef struct alignas(128) v2_cusz_header { + // data segments + static const int HEADER = 0; + static const int ANCHOR = 1; + static const int SP_IDX = 2; + static const int SP_VAL = 3; + static const int HF = 4; + static const int END = 5; + uint32_t entry[END + 1]; + + struct { + uint32_t precision : 1; + } data; + + uint32_t x, y, z, w; + + // struct { + // uint32_t codecs_in_use : 2; + double eb; + uint32_t radius : 16; + // } config; + + struct { + uint32_t factor : 8; // density = 1/factor + uint32_t count; + } sp; + + struct { + uint32_t rep_bytes : 4; // 4, 8 + uint32_t sublen : 28; + uint32_t pardeg; + } hf; + + // TODO replace the following with hf.VAR + uint32_t vle_pardeg; + +} psz_header; + +#ifdef __cplusplus +} +#endif + +namespace cusz { + +using Header = cusz_header; +using header_t = cusz_header*; + +} // namespace cusz + +namespace psz { + +using v2_header = v2_cusz_header; + +} + +#endif diff --git a/qtensor/compression/cusz/include/hf/hf.hh b/qtensor/compression/cusz/include/hf/hf.hh new file mode 100644 index 00000000..692d0ea0 --- /dev/null +++ b/qtensor/compression/cusz/include/hf/hf.hh @@ -0,0 +1,170 @@ +/** + * @file codec.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-23 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_COMPONENT_CODECS_HH +#define CUSZ_COMPONENT_CODECS_HH + +#include +#include +#include + 
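+
+/*
+ * Illustrative call sequence (a sketch, not taken from the cuSZ sources). The
+ * coarse-grained Huffman codec declared below is normally driven as
+ * init -> build_codebook -> encode on the compression path, and decode on the
+ * decompression path. The template arguments (assumed to be the Origin/Encoded/
+ * Metadata types, here uint16_t/uint32_t/uint32_t) and the codebook length and
+ * chunking degree passed to init() are assumptions made for this example only.
+ *
+ *   cusz::LosslessCodec<uint16_t, uint32_t, uint32_t> codec;
+ *   codec.init(len, 1024, 512);                  // data length, booklen, pardeg (assumed order)
+ *   codec.build_codebook(d_freq, 1024, stream);  // d_freq: device histogram of the input symbols
+ *   uint8_t* d_compressed; size_t compressed_len;
+ *   codec.encode(d_quant, len, d_compressed, compressed_len, stream);
+ *   codec.decode(d_compressed, d_quant_out, stream);
+ */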
+#include "hf/hf_struct.h" + +#define DEFINE_ARRAY(VAR, TYPE) \ + TYPE* d_##VAR{nullptr}; \ + TYPE* h_##VAR{nullptr}; + +namespace cusz { + +template +class LosslessCodec +// : CodecInterface +{ + public: + using Origin = T; + using Encoded = H; + using MetadataT = M; + using FreqT = uint32_t; + using BYTE = uint8_t; + + private: + class impl; + std::unique_ptr pimpl; + + public: + ~LosslessCodec(); // dtor + LosslessCodec(); // ctor + LosslessCodec(const LosslessCodec&); // copy ctor + LosslessCodec& operator=(const LosslessCodec&); // copy assign + LosslessCodec(LosslessCodec&&); // move ctor + LosslessCodec& operator=(LosslessCodec&&); // move assign + + void init(size_t const, int const, int const, bool dbg_print = false); + void build_codebook(uint32_t*, int const, cudaStream_t = nullptr); + void encode(T*, size_t const, BYTE*&, size_t&, cudaStream_t = nullptr); + void decode(BYTE*, T*, cudaStream_t = nullptr, bool = true); + void clear_buffer(); + + float get_time_elapsed() const; + float get_time_book() const; + float get_time_lossless() const; +}; + +template +class LosslessCodec::impl { + public: + using Origin = T; + using Encoded = H; + using MetadataT = M; + using FreqT = uint32_t; + using BYTE = uint8_t; + + private: + using BOOK = H; + using SYM = T; + + // TODO shared header + struct alignas(128) Header { + static const int HEADER = 0; + static const int REVBOOK = 1; + static const int PAR_NBIT = 2; + static const int PAR_ENTRY = 3; + static const int BITSTREAM = 4; + static const int END = 5; + + int self_bytes : 16; + int booklen : 16; + int sublen; + int pardeg; + size_t uncompressed_len; + size_t total_nbit; + size_t total_ncell; // TODO change to uint32_t + MetadataT entry[END + 1]; + + MetadataT subfile_size() const { return entry[END]; } + }; + + struct runtime_encode_helper { + static const int TMP = 0; + static const int FREQ = 1; + static const int BOOK = 2; + static const int REVBOOK = 3; + static const int PAR_NBIT = 4; + static const int PAR_NCELL = 5; + static const int PAR_ENTRY = 6; + static const int BITSTREAM = 7; + static const int END = 8; + + uint32_t nbyte[END]; + }; + + using RTE = runtime_encode_helper; + using Header = struct Header; + + private: + // array + DEFINE_ARRAY(tmp, H); + DEFINE_ARRAY(compressed, BYTE); // alias in address + DEFINE_ARRAY(book, H); + DEFINE_ARRAY(revbook, BYTE); + + DEFINE_ARRAY(par_metadata, M); + DEFINE_ARRAY(par_nbit, M); + DEFINE_ARRAY(par_ncell, M); + DEFINE_ARRAY(par_entry, M); + + DEFINE_ARRAY(bitstream, H); + // helper + RTE rte; + // memory + static const int CELL_BITWIDTH = sizeof(H) * 8; + // timer + float milliseconds{0.0}; + float time_hist{0.0}, time_book{0.0}, time_lossless{0.0}; + + hf_book* book_desc; + hf_chunk* chunk_desc_d; + hf_chunk* chunk_desc_h; + hf_bitstream* bitstream_desc; + + public: + ~impl(); // dtor + impl(); // ctor + + // getter + float get_time_elapsed() const; + float get_time_book() const; + float get_time_lossless() const; + size_t get_workspace_nbyte(size_t) const; + size_t get_max_output_nbyte(size_t len) const; + static size_t get_revbook_nbyte(int); + // getter for internal array + H* expose_book() const; + BYTE* expose_revbook() const; + // compile-time + constexpr bool can_overlap_input_and_firstphase_encode(); + // public methods + void init(size_t const, int const, int const, bool dbg_print = false); + void build_codebook(uint32_t*, int const, cudaStream_t = nullptr); + void encode(T*, size_t const, BYTE*&, size_t&, cudaStream_t = nullptr); + void decode(BYTE*, T*, cudaStream_t 
= nullptr, bool = true); + void clear_buffer(); + + private: + void subfile_collect(Header&, size_t const, int const, int const, int const, cudaStream_t stream = nullptr); + void dbg_println(const std::string, void*, int); +}; + +} // namespace cusz + +#undef DEFINE_ARRAY + +#endif diff --git a/qtensor/compression/cusz/include/hf/hf_bookg.hh b/qtensor/compression/cusz/include/hf/hf_bookg.hh new file mode 100644 index 00000000..3d406f0f --- /dev/null +++ b/qtensor/compression/cusz/include/hf/hf_bookg.hh @@ -0,0 +1,45 @@ +/** + * @file huffman_parbook.cuh + * @author Cody Rivera (cjrivera1@crimson.ua.edu) + * @brief Parallel Huffman Construction to generates canonical forward codebook (header). + * Based on [Ostadzadeh et al. 2007] (https://dblp.org/rec/conf/pdpta/OstadzadehEZMB07.bib) + * "A Two-phase Practical Parallel Algorithm for Construction of Huffman Codes". + * @version 0.1 + * @date 2020-09-20 + * Created on: 2020-06 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef PAR_HUFFMAN_H +#define PAR_HUFFMAN_H + +// Parallel huffman global memory and kernels +namespace asz { + +/** + * @brief get codebook and reverse codebook in parallel + * + * @tparam T input type + * @tparam H codebook type + * @param freq input device array; frequency + * @param codebook output device array; codebook for encoding + * @param dict_size dictionary size; len of freq or codebook + * @param reverse_codebook output device array; reverse codebook for decoding + * @param time_book the returned time + */ +template +void hf_buildbook_g( + uint32_t* freq, + int const booksize, + H* codebook, + uint8_t* reverse_codebook, + int const revbook_nbyte, + float* time_book, + cudaStream_t = nullptr); + +} // namespace asz + +#endif diff --git a/qtensor/compression/cusz/include/hf/hf_codecg.hh b/qtensor/compression/cusz/include/hf/hf_codecg.hh new file mode 100644 index 00000000..10cb1570 --- /dev/null +++ b/qtensor/compression/cusz/include/hf/hf_codecg.hh @@ -0,0 +1,82 @@ +/** + * @file launch_lossless.cuh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-06-13 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef ABAACE49_2C9E_4E3C_AEFF_B016276142E1 +#define ABAACE49_2C9E_4E3C_AEFF_B016276142E1 + +#include +#include + +#include "hf_struct.h" + +template +struct PackedWordByWidth; + +template <> +struct PackedWordByWidth<4> { + uint32_t word : 24; + uint32_t bits : 8; +}; + +template <> +struct PackedWordByWidth<8> { + uint64_t word : 56; + uint64_t bits : 8; +}; + +namespace asz { + +template +void hf_encode_coarse( + T* uncompressed, + H* d_internal_coded, + size_t const len, + uint32_t* d_freq, + H* d_book, + int const booklen, + H* d_bitstream, + M* d_par_metadata, + M* h_par_metadata, + int const sublen, + int const pardeg, + int numSMs, + uint8_t*& out_compressed, + size_t& out_compressed_len, + float& time_lossless, + cudaStream_t stream); + +template +void hf_encode_coarse_rev1( + T* uncompressed, + size_t const len, + hf_book* book_desc, + hf_bitstream* bitstream_desc, + uint8_t*& out_compressed, // 22-10-12 buggy + size_t& out_compressed_len, // 22-10-12 buggy + float& time_lossless, + cudaStream_t stream); + +template +void hf_decode_coarse( + H* d_bitstream, + uint8_t* d_revbook, + int const revbook_nbyte, + M* d_par_nbit, + M* d_par_entry, + int const sublen, + int const pardeg, + T* out_decompressed, + float& time_lossless, + 
cudaStream_t stream); + +} // namespace asz + +#endif /* ABAACE49_2C9E_4E3C_AEFF_B016276142E1 */ diff --git a/qtensor/compression/cusz/include/hf/hf_struct.h b/qtensor/compression/cusz/include/hf/hf_struct.h new file mode 100644 index 00000000..c289a795 --- /dev/null +++ b/qtensor/compression/cusz/include/hf/hf_struct.h @@ -0,0 +1,53 @@ +/** + * @file hf_struct.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-09-14 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef DA6883A3_A70F_4690_A4FA_56644987725A +#define DA6883A3_A70F_4690_A4FA_56644987725A + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +// raw pointer array; regardless of being on host or device +typedef struct hf_book { + uint32_t* freq; + // undertermined on definition; could be uint32_t* and uint64_t* + void* book; + int booklen; +} hf_book; + +// typedef struct hf_revbook { +// } hf_revbook; + +typedef struct hf_chunk { + void* bits; // how many bits each chunk + void* cells; // how many cells each chunk + void* entries; // jump to the chunk +} hf_chunk; + +typedef struct hf_bitstream { + void* buffer; + void* bitstream; + hf_chunk* d_metadata; + hf_chunk* h_metadata; + int sublen; // data chunksize + int pardeg; // runtime paralleism degree + int numSMs; // number of streaming multiprocessor +} hf_bitstream; + +#ifdef __cplusplus +} +#endif + +#endif /* DA6883A3_A70F_4690_A4FA_56644987725A */ diff --git a/qtensor/compression/cusz/include/kernel/claunch_cuda.h b/qtensor/compression/cusz/include/kernel/claunch_cuda.h new file mode 100644 index 00000000..f19943c1 --- /dev/null +++ b/qtensor/compression/cusz/include/kernel/claunch_cuda.h @@ -0,0 +1,49 @@ +/** + * @file claunch_cuda.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-07-24 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef KERNEL_CUDA_H +#define KERNEL_CUDA_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +#include "../cusz/type.h" +// #include "../hf/hf_struct.h" + +#define C_SPLINE3(Tliteral, Eliteral, FPliteral, T, E, FP) \ + cusz_error_status claunch_construct_Spline3_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + bool NO_R_SEPARATE, T* data, dim3 const len3, T* anchor, dim3 const an_len3, E* errctrl, dim3 const ec_len3, \ + double const eb, int const radius, float* time_elapsed, cudaStream_t stream); \ + \ + cusz_error_status claunch_reconstruct_Spline3_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + T* xdata, dim3 const len3, T* anchor, dim3 const an_len3, E* errctrl, dim3 const ec_len3, double const eb, \ + int const radius, float* time_elapsed, cudaStream_t stream); + +C_SPLINE3(fp32, ui8, fp32, float, uint8_t, float); +C_SPLINE3(fp32, ui16, fp32, float, uint16_t, float); +C_SPLINE3(fp32, ui32, fp32, float, uint32_t, float); +C_SPLINE3(fp32, fp32, fp32, float, float, float); + +#undef C_SPLINE3 + +#undef C_COARSE_HUFFMAN_DECODE + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/qtensor/compression/cusz/include/kernel/cpplaunch_cuda.hh b/qtensor/compression/cusz/include/kernel/cpplaunch_cuda.hh new file mode 100644 index 00000000..5c8ee08d --- /dev/null +++ b/qtensor/compression/cusz/include/kernel/cpplaunch_cuda.hh @@ -0,0 +1,51 @@ +/** + * @file cpplaunch_cuda.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-07-27 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef COMPONENT_CALL_KERNEL_HH +#define 
COMPONENT_CALL_KERNEL_HH + +#include "../cusz/type.h" +#include "../hf/hf_struct.h" + +namespace cusz { + +// 22-10-27 revise later +template +cusz_error_status cpplaunch_construct_Spline3( + bool NO_R_SEPARATE, + T* data, + dim3 const len3, + T* anchor, + dim3 const an_len3, + E* eq, + dim3 const ec_len3, + double const eb, + int const radius, + float* time_elapsed, + cudaStream_t stream); + +// 22-10-27 revise later +template +cusz_error_status cpplaunch_reconstruct_Spline3( + T* xdata, + dim3 const len3, + T* anchor, + dim3 const an_len3, + E* eq, + dim3 const ec_len3, + double const eb, + int const radius, + float* time_elapsed, + cudaStream_t stream); + +} // namespace cusz + +#endif diff --git a/qtensor/compression/cusz/include/kernel/dryrun.cuh b/qtensor/compression/cusz/include/kernel/dryrun.cuh new file mode 100644 index 00000000..e96b3b96 --- /dev/null +++ b/qtensor/compression/cusz/include/kernel/dryrun.cuh @@ -0,0 +1,47 @@ +/** + * @file dryrun.cuh + * @author Jiannan Tian + * @brief cuSZ dryrun mode, checking data quality from lossy compression. + * @version 0.3 + * @date 2020-09-20 + * (create) 2020-05-14, (release) 2020-09-20, (rev1) 2021-01-25, (rev2) 2021-06-21 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef CUSZ_KERNEL_DRYRUN_CUH +#define CUSZ_KERNEL_DRYRUN_CUH + +namespace cusz { + +template +// template +__global__ void dualquant_dryrun_kernel(Data* in_data, Data* out_xdata, size_t len, FP ebx2_r, FP ebx2) +{ + { + constexpr auto NTHREAD = BLOCK / SEQ; + __shared__ Data shmem[BLOCK]; + auto id_base = blockIdx.x * BLOCK; + +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto id = id_base + threadIdx.x + i * NTHREAD; + if (id < len) { + shmem[threadIdx.x + i * NTHREAD] = round(in_data[id] * ebx2_r) * ebx2; + out_xdata[id] = shmem[threadIdx.x + i * NTHREAD]; + } + } + } + + // simplistic + // { + // auto id = blockIdx.x * blockDim.x + threadIdx.x; + // if (id < len) out_xdata[id] = round(in_data[id] * ebx2_r) * ebx2; + // } +} + +} // namespace cusz + +#endif \ No newline at end of file diff --git a/qtensor/compression/cusz/include/kernel/launch_prototype.cuh b/qtensor/compression/cusz/include/kernel/launch_prototype.cuh new file mode 100644 index 00000000..e69de29b diff --git a/qtensor/compression/cusz/include/kernel/launch_spm.cuh b/qtensor/compression/cusz/include/kernel/launch_spm.cuh new file mode 100644 index 00000000..fe4cfaae --- /dev/null +++ b/qtensor/compression/cusz/include/kernel/launch_spm.cuh @@ -0,0 +1,348 @@ +/** + * @file launch_sparse_method.cuh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-06-13 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_LAUNCH_SPARSE_METHOD_CUH +#define CUSZ_LAUNCH_SPARSE_METHOD_CUH + +#include +#include + +#include "../common.hh" +#include "../utils.hh" +#include "../utils/cusparse_err.cuh" + +// #if CUDART_VERSION >= 11020 + +template +void launch_cusparse_gather_cuda11200_onward( + cusparseHandle_t handle, + T* in_dense, + uint32_t const num_rows, + uint32_t const num_cols, + cusparseDnMatDescr_t dnmat, + cusparseSpMatDescr_t spmat, + void* d_buffer, + size_t& d_buffer_size, + M* d_rowptr, + M* d_colidx, + T* d_val, + int64_t& nnz, + float& milliseconds, + cudaStream_t stream) +{ + auto ld = num_rows; + + auto gather11_init_mat = [&]() { + // create dense matrix wrapper + CHECK_CUSPARSE( + cusparseCreateDnMat(&dnmat, 
num_rows, num_cols, ld, in_dense, cuszCUSPARSE::type, CUSPARSE_ORDER_ROW)); + + // create CSR wrapper + CHECK_CUSPARSE(cusparseCreateCsr( + &spmat, num_rows, num_cols, 0, d_rowptr, nullptr, nullptr, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, cuszCUSPARSE::type)); + }; + + auto gather11_init_buffer = [&]() { + { // allocate an external buffer if needed + cuda_timer_t t; + t.timer_start(stream); + + CHECK_CUSPARSE(cusparseDenseToSparse_bufferSize( + handle, dnmat, spmat, CUSPARSE_DENSETOSPARSE_ALG_DEFAULT, &d_buffer_size)); + + t.timer_end(stream); + milliseconds += t.get_time_elapsed(); + + CHECK_CUDA(cudaMalloc(&d_buffer, d_buffer_size)); + } + }; + + auto gather11_analysis = [&]() { + cuda_timer_t t; + t.timer_start(stream); + + CHECK_CUSPARSE( + cusparseDenseToSparse_analysis(handle, dnmat, spmat, CUSPARSE_DENSETOSPARSE_ALG_DEFAULT, d_buffer)); + + t.timer_end(stream); + milliseconds += t.get_time_elapsed(); + }; + + int64_t num_rows_tmp, num_cols_tmp; + + auto gather11_get_nnz = [&]() { + // get number of non-zero elements + CHECK_CUSPARSE(cusparseSpMatGetSize(spmat, &num_rows_tmp, &num_cols_tmp, &nnz)); + }; + + auto gather11_get_rowptr = [&]() { + // reset offsets, column indices, and values pointers + CHECK_CUSPARSE(cusparseCsrSetPointers(spmat, d_rowptr, d_colidx, d_val)); + }; + + auto gather11_dn2csr = [&]() { + cuda_timer_t t; + t.timer_start(stream); + + CHECK_CUSPARSE( + cusparseDenseToSparse_convert(handle, dnmat, spmat, CUSPARSE_DENSETOSPARSE_ALG_DEFAULT, d_buffer)); + + t.timer_end(stream); + milliseconds += t.get_time_elapsed(); + }; + + /********************************************************************************/ + milliseconds = 0; + + CHECK_CUSPARSE(cusparseCreate(&handle)); + if (stream) CHECK_CUSPARSE(cusparseSetStream(handle, stream)); // TODO move out + + gather11_init_mat(); + gather11_init_buffer(); + gather11_analysis(); + gather11_get_nnz(); + gather11_get_rowptr(); + gather11_dn2csr(); + + // destroy matrix/vector descriptors + CHECK_CUSPARSE(cusparseDestroyDnMat(dnmat)); + CHECK_CUSPARSE(cusparseDestroySpMat(spmat)); + CHECK_CUSPARSE(cusparseDestroy(handle)); +} + +// void SpcodecCSR::impl::scatter_CUDA_11020(BYTE* in_csr, T* out_dense, cudaStream_t stream, bool +// header_on_device) + +template +void launch_cusparse_scatter_cuda11200_onward( + cusparseHandle_t handle, + int* d_rowptr, + int* d_colidx, + T* d_val, + int const num_rows, + int const num_cols, + int const nnz, + cusparseDnMatDescr_t dnmat, + cusparseSpMatDescr_t spmat, + void* d_buffer, + size_t& d_buffer_size, + T* out_dense, + float& milliseconds, + cudaStream_t stream) +{ + auto ld = num_rows; + + auto scatter11_init_mat = [&]() { + CHECK_CUSPARSE(cusparseCreateCsr( + &spmat, num_rows, num_cols, nnz, d_rowptr, d_colidx, d_val, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, cuszCUSPARSE::type)); + + CHECK_CUSPARSE( + cusparseCreateDnMat(&dnmat, num_rows, num_cols, ld, out_dense, cuszCUSPARSE::type, CUSPARSE_ORDER_ROW)); + }; + + auto scatter11_init_buffer = [&]() { + cuda_timer_t t; + t.timer_start(stream); + + // allocate an external buffer if needed + CHECK_CUSPARSE( + cusparseSparseToDense_bufferSize(handle, spmat, dnmat, CUSPARSE_SPARSETODENSE_ALG_DEFAULT, &d_buffer_size)); + + t.timer_end(stream); + milliseconds += t.get_time_elapsed(); + + CHECK_CUDA(cudaMalloc(&d_buffer, d_buffer_size)); + }; + + auto scatter11_csr2dn = [&]() { + cuda_timer_t t; + t.timer_start(stream); + + CHECK_CUSPARSE(cusparseSparseToDense(handle, spmat, dnmat, 
CUSPARSE_SPARSETODENSE_ALG_DEFAULT, d_buffer)); + + t.timer_end(stream); + milliseconds += t.get_time_elapsed(); + }; + + /******************************************************************************/ + milliseconds = 0; + + CHECK_CUSPARSE(cusparseCreate(&handle)); + if (stream) CHECK_CUSPARSE(cusparseSetStream(handle, stream)); + + scatter11_init_mat(); + scatter11_init_buffer(); + scatter11_csr2dn(); + + // destroy matrix/vector descriptors + CHECK_CUSPARSE(cusparseDestroySpMat(spmat)); + CHECK_CUSPARSE(cusparseDestroyDnMat(dnmat)); + CHECK_CUSPARSE(cusparseDestroy(handle)); +} + +// #elif CUDART_VERSION >= 10000 + +template +void launch_cusparse_gather_before_cuda11200( + cusparseHandle_t handle, + T* in_dense, + uint32_t const num_rows, + uint32_t const num_cols, + cusparseMatDescr_t mat_desc, + void* d_work, + size_t& lwork_in_bytes, + M* d_rowptr, + M* d_colidx, + T* d_val, + int& nnz, // int is for compatibility; cuSPARSE of CUDA 11 changed data type + float& milliseconds, + cudaStream_t stream) +{ + auto ld = num_rows; + + float threshold{0}; + auto has_ext_stream{false}; + + /******************************************************************************/ + + auto gather10_init_and_probe = [&]() { + { // init + + CHECK_CUSPARSE(cusparseCreateMatDescr(&mat_desc)); // 4. create rte.mat_desc + CHECK_CUSPARSE(cusparseSetMatIndexBase(mat_desc, CUSPARSE_INDEX_BASE_ZERO)); // zero based + CHECK_CUSPARSE(cusparseSetMatType(mat_desc, CUSPARSE_MATRIX_TYPE_GENERAL)); // type + } + + { // probe + cuda_timer_t t; + t.timer_start(stream); + + CHECK_CUSPARSE(cusparseSpruneDense2csr_bufferSizeExt( + handle, num_rows, num_cols, in_dense, ld, &threshold, mat_desc, d_val, d_rowptr, d_colidx, + &lwork_in_bytes)); + + t.timer_end(stream); + milliseconds += t.get_time_elapsed(); + } + + if (nullptr != d_work) cudaFree(d_work); + CHECK_CUDA(cudaMalloc((void**)&d_work, lwork_in_bytes)); // TODO where to release d_work? + }; + + auto gather10_compute_rowptr_and_nnz = [&]() { // step 4 + cuda_timer_t t; + t.timer_start(stream); + + CHECK_CUSPARSE(cusparseSpruneDense2csrNnz( + handle, num_rows, num_cols, in_dense, ld, &threshold, mat_desc, d_rowptr, &nnz, d_work)); + + t.timer_end(stream); + milliseconds += t.get_time_elapsed(); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + }; + + auto gather10_compute_colidx_and_val = [&]() { // step 5 + cuda_timer_t t; + t.timer_start(stream); + + CHECK_CUSPARSE(cusparseSpruneDense2csr( // + handle, num_rows, num_cols, in_dense, ld, &threshold, mat_desc, d_val, d_rowptr, d_colidx, d_work)); + + t.timer_end(stream); + milliseconds += t.get_time_elapsed(); + CHECK_CUDA(cudaStreamSynchronize(stream)); + }; + + /********************************************************************************/ + milliseconds = 0; + + if (stream) + has_ext_stream = true; + else + CHECK_CUDA(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); // 1. create stream + CHECK_CUSPARSE(cusparseCreate(&handle)); // 2. create handle + CHECK_CUSPARSE(cusparseSetStream(handle, stream)); // 3. bind stream + + gather10_init_and_probe(); + gather10_compute_rowptr_and_nnz(); + if (nnz == 0) { return; } + gather10_compute_colidx_and_val(); + + // TODO no need to destroy? 
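+    // Tear-down: release the cuSPARSE handle and matrix descriptor created inside
+    // this launcher; the stream is destroyed below only when it was created here
+    // rather than supplied by the caller.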
+ if (handle) cusparseDestroy(handle); + if (mat_desc) cusparseDestroyMatDescr(mat_desc); + if ((not has_ext_stream) and stream) cudaStreamDestroy(stream); + /********************************************************************************/ +} + +// #endif + +template +void launch_cusparse_scatter_before_cuda11200( + cusparseHandle_t handle, + int* d_rowptr, + int* d_colidx, + T* d_val, + int const num_rows, + int const num_cols, + int const nnz, + cusparseMatDescr_t mat_desc, + void* d_buffer, + size_t& d_buffer_size, + T* out_dense, + float& milliseconds, + cudaStream_t stream) +{ + auto ld = num_rows; + + auto has_external_stream = false; + + /******************************************************************************/ + + auto scatter10_init = [&]() { + CHECK_CUSPARSE(cusparseCreateMatDescr(&mat_desc)); // 4. create descr + CHECK_CUSPARSE(cusparseSetMatIndexBase(mat_desc, CUSPARSE_INDEX_BASE_ZERO)); // zero based + CHECK_CUSPARSE(cusparseSetMatType(mat_desc, CUSPARSE_MATRIX_TYPE_GENERAL)); // type + }; + + auto scatter10_sparse2dense = [&]() { + cuda_timer_t t; + t.timer_start(stream); + + CHECK_CUSPARSE( + cusparseScsr2dense(handle, num_rows, num_cols, mat_desc, d_val, d_rowptr, d_colidx, out_dense, ld)); + + t.timer_end(); + milliseconds += t.get_time_elapsed(); + CHECK_CUDA(cudaStreamSynchronize(stream)); + }; + + /******************************************************************************/ + if (stream) + has_external_stream = true; + else + CHECK_CUDA(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + CHECK_CUSPARSE(cusparseCreate(&handle)); + CHECK_CUSPARSE(cusparseSetStream(handle, stream)); + + scatter10_init(); + scatter10_sparse2dense(); + + if (handle) cusparseDestroy(handle); + if (mat_desc) cusparseDestroyMatDescr(mat_desc); + if ((not has_external_stream) and stream) cudaStreamDestroy(stream); + /******************************************************************************/ +} + +#endif diff --git a/qtensor/compression/cusz/include/kernel/lorenzo_all.h b/qtensor/compression/cusz/include/kernel/lorenzo_all.h new file mode 100644 index 00000000..89f6f38f --- /dev/null +++ b/qtensor/compression/cusz/include/kernel/lorenzo_all.h @@ -0,0 +1,44 @@ +/** + * @file kernel_cuda.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-02 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef BD8A19DE_E881_4A26_9464_C51DAC6B14E1 +#define BD8A19DE_E881_4A26_9464_C51DAC6B14E1 + +#ifdef __cplusplus +extern "C" { +#endif + +#include "cusz/type.h" + +#define C_LORENZOI(Tliteral, Eliteral, FPliteral, T, E, FP) \ + cusz_error_status compress_predict_lorenzo_ivar_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + T* const data, dim3 const len3, double const eb, E* delta, bool* signum, float* time_elapsed, \ + cudaStream_t stream); \ + cusz_error_status decompress_predict_lorenzo_ivar_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + E* delta, bool* signum, dim3 const len3, double const eb, T* xdata, float* time_elapsed, cudaStream_t stream); + +C_LORENZOI(fp32, ui8, fp32, float, uint8_t, float); +C_LORENZOI(fp32, ui16, fp32, float, uint16_t, float); +C_LORENZOI(fp32, ui32, fp32, float, uint32_t, float); +C_LORENZOI(fp32, fp32, fp32, float, float, float); + +C_LORENZOI(fp64, ui8, fp64, double, uint8_t, double); +C_LORENZOI(fp64, ui16, fp64, double, uint16_t, double); +C_LORENZOI(fp64, ui32, fp64, double, uint32_t, double); +C_LORENZOI(fp64, fp32, fp64, double, float, double); + +#undef C_LORENZOI + +#ifdef __cplusplus +} +#endif + 
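+
+/*
+ * For reference, one expansion of C_LORENZOI above, C_LORENZOI(fp32, ui16, fp32,
+ * float, uint16_t, float), declares this pair of C entry points:
+ *
+ *   cusz_error_status compress_predict_lorenzo_ivar_Tfp32_Eui16_FPfp32(
+ *       float* const data, dim3 const len3, double const eb, uint16_t* delta,
+ *       bool* signum, float* time_elapsed, cudaStream_t stream);
+ *
+ *   cusz_error_status decompress_predict_lorenzo_ivar_Tfp32_Eui16_FPfp32(
+ *       uint16_t* delta, bool* signum, dim3 const len3, double const eb,
+ *       float* xdata, float* time_elapsed, cudaStream_t stream);
+ */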
+#endif /* BD8A19DE_E881_4A26_9464_C51DAC6B14E1 */ diff --git a/qtensor/compression/cusz/include/kernel/lorenzo_all.hh b/qtensor/compression/cusz/include/kernel/lorenzo_all.hh new file mode 100644 index 00000000..f7308fe1 --- /dev/null +++ b/qtensor/compression/cusz/include/kernel/lorenzo_all.hh @@ -0,0 +1,96 @@ +/** + * @file kernel_cuda.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-01 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef C8C37773_7EF2_439B_B0EF_14D0058DC714 +#define C8C37773_7EF2_439B_B0EF_14D0058DC714 + +#include +#include "cusz/type.h" + +template +cusz_error_status compress_predict_lorenzo_i( + T* const data, // input + dim3 const len3, // + double const eb, // input (config) + int const radius, // + EQ* const eq, // output + T* outlier, // + uint32_t* outlier_idx, // + uint32_t* num_outliers, // + float* time_elapsed, // optional + cudaStream_t stream); // + +template +cusz_error_status decompress_predict_lorenzo_i( + EQ* eq, // input + dim3 const len3, // + T* outlier, // + uint32_t* outlier_idx, // + uint32_t const num_outliers, // + double const eb, // input (config) + int const radius, // + T* xdata, // output + float* time_elapsed, // optional + cudaStream_t stream); + +namespace asz { +namespace experimental { + +template +cusz_error_status compress_predict_lorenzo_ivar( + T* data, + dim3 const len3, + double const eb, + DeltaT* delta, + bool* signum, + float* time_elapsed, + cudaStream_t stream); + +template +cusz_error_status decompress_predict_lorenzo_ivar( + DeltaT* delta, + bool* signum, + dim3 const len3, + double const eb, + T* xdata, + float* time_elapsed, + cudaStream_t stream); + +} // namespace experimental +} // namespace asz + +template +cusz_error_status compress_predict_lorenzo_iproto( + T* const data, // input + dim3 const len3, // + double const eb, // input (config) + int const radius, // + EQ* const eq, // output + T* outlier, // + uint32_t* outlier_idx, // + uint32_t* num_outliers, // + float* time_elapsed, // optional + cudaStream_t stream); // + +template +cusz_error_status decompress_predict_lorenzo_iproto( + EQ* eq, // input + dim3 const len3, // + T* outlier, // + uint32_t* outlier_idx, // + uint32_t const num_outliers, // + double const eb, // input (config) + int const radius, // + T* xdata, // output + float* time_elapsed, // optional + cudaStream_t stream); + +#endif /* C8C37773_7EF2_439B_B0EF_14D0058DC714 */ diff --git a/qtensor/compression/cusz/include/kernel/spv_gpu.h b/qtensor/compression/cusz/include/kernel/spv_gpu.h new file mode 100644 index 00000000..fb50119c --- /dev/null +++ b/qtensor/compression/cusz/include/kernel/spv_gpu.h @@ -0,0 +1,42 @@ +/** + * @file spv_gpu.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-29 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef B1B21251_C3C3_4BC1_B4E0_75D9D86EE7F3 +#define B1B21251_C3C3_4BC1_B4E0_75D9D86EE7F3 + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +#define SPV(Tliteral, Mliteral, T, M) \ + void spv_gather_T##Tliteral##_M##Mliteral( \ + T* in, size_t const in_len, T* d_val, uint32_t* d_idx, int* nnz, float* milliseconds, cudaStream_t stream); \ + \ + void spv_scatter_T##Tliteral##_M##Mliteral( \ + T* d_val, uint32_t* d_idx, int const nnz, T* decoded, float* milliseconds, cudaStream_t stream); + +SPV(ui8, ui32, uint8_t, uint32_t) +SPV(ui16, ui32, uint16_t, uint32_t) +SPV(ui32, ui32, uint32_t, uint32_t) +SPV(ui64, ui32, uint64_t, 
uint32_t) +SPV(fp32, ui32, float, uint32_t) +SPV(fp64, ui32, double, uint32_t) + +#undef SPV + +#ifdef __cplusplus +} +#endif + +#endif /* B1B21251_C3C3_4BC1_B4E0_75D9D86EE7F3 */ diff --git a/qtensor/compression/cusz/include/kernel/spv_gpu.hh b/qtensor/compression/cusz/include/kernel/spv_gpu.hh new file mode 100644 index 00000000..6b978abc --- /dev/null +++ b/qtensor/compression/cusz/include/kernel/spv_gpu.hh @@ -0,0 +1,33 @@ +/** + * @file spv_gpu.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-29 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef A54D2009_1D4F_4113_9E26_9695A3669224 +#define A54D2009_1D4F_4113_9E26_9695A3669224 +#include + +namespace psz { + +template +void spv_gather( + T* in, + size_t const in_len, + T* d_val, + uint32_t* d_idx, + int* nnz, + float* milliseconds, + cudaStream_t stream); + +template +void spv_scatter(T* d_val, uint32_t* d_idx, int const nnz, T* decoded, float* milliseconds, cudaStream_t stream); + +} // namespace psz + +#endif /* A54D2009_1D4F_4113_9E26_9695A3669224 */ diff --git a/qtensor/compression/cusz/include/kernel/v2_lorenzo.hh b/qtensor/compression/cusz/include/kernel/v2_lorenzo.hh new file mode 100644 index 00000000..861a2e2c --- /dev/null +++ b/qtensor/compression/cusz/include/kernel/v2_lorenzo.hh @@ -0,0 +1,32 @@ +/** + * @file v2_lorenzo.hh + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-01-23 + * + * (C) 2023 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef CD52BDA6_9376_43FF_BFDA_693204FA8762 +#define CD52BDA6_9376_43FF_BFDA_693204FA8762 + +#include "compaction.hh" +#include "cusz/type.h" + +template +cusz_error_status v2_compress_predict_lorenzo_i( + T* const data, // input + dim3 const data_len3, // + double const eb, // input (config) + int const radius, // + E* const eq, // output + dim3 const eq_len3, // + T* const anchor, // + dim3 const anchor_len3, // + CompactionDRAM outlier, // + float* time_elapsed, // optional + cudaStream_t stream); // + +#endif /* CD52BDA6_9376_43FF_BFDA_693204FA8762 */ diff --git a/qtensor/compression/cusz/include/pipeline/compaction_g.inl b/qtensor/compression/cusz/include/pipeline/compaction_g.inl new file mode 100644 index 00000000..fd312c82 --- /dev/null +++ b/qtensor/compression/cusz/include/pipeline/compaction_g.inl @@ -0,0 +1,73 @@ +/** + * @file compaction_g.inl + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2022-12-22 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef F712F74C_7488_4445_83EE_EE7F88A64BBA +#define F712F74C_7488_4445_83EE_EE7F88A64BBA + +#include +#include +#include "compaction.hh" + +#include +#include + +// TODO filename -> `compaction` +template +struct CompactionDRAM { + using type = T; + T* val; + uint32_t* idx; + uint32_t* count; + uint32_t* h_count; + + void allocate(size_t len, bool device = true) + { + if (device) { + cudaMalloc(&idx, sizeof(uint32_t) * len); + cudaMalloc(&val, sizeof(T) * len); + cudaMalloc(&count, sizeof(T) * 1); + cudaMallocHost(&h_count, sizeof(T) * 1); + } + else { + cudaMallocHost(&idx, sizeof(uint32_t) * len); + cudaMallocHost(&val, sizeof(T) * len); + cudaMallocHost(&count, sizeof(T) * 1); + + memset(count, 0x0, sizeof(T) * 1); + } + } + + void make_count_host_accessible(cudaStream_t stream) + { + cudaMemcpyAsync(h_count, count, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream); + } + + uint32_t access_count_on_host() { return *h_count; } + + void allocate_managed(size_t len) + { + 
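+        // Managed-memory variant of allocate(): the same val/idx/count buffers, but
+        // placed in unified memory so host and device can both access them without
+        // explicit copies; the device-side count is zeroed below.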
cudaMallocManaged(&idx, sizeof(uint32_t) * len); + cudaMallocManaged(&val, sizeof(T) * len); + cudaMallocManaged(&count, sizeof(T) * 1); + + cudaMemset(count, 0x0, sizeof(T) * 1); + } + + void destroy() + { + if (h_count) cudaFreeHost(h_count); + cudaFree(idx); + cudaFree(val); + cudaFree(count); + } +}; + +#endif /* F712F74C_7488_4445_83EE_EE7F88A64BBA */ diff --git a/qtensor/compression/cusz/include/pipeline/v2_compressor.hh b/qtensor/compression/cusz/include/pipeline/v2_compressor.hh new file mode 100644 index 00000000..fa843f5f --- /dev/null +++ b/qtensor/compression/cusz/include/pipeline/v2_compressor.hh @@ -0,0 +1,146 @@ +/** + * @file v2_compressor.hh + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-01-29 + * + * (C) 2023 by Indiana University, Argonne National Laboratory + * + */ + +#include +#include + +#include "common/type_traits.hh" +#include "compaction.hh" +#include "component.hh" +#include "context.hh" +#include "header.h" + +// TODO move outward +#include "compaction_g.inl" + +using Context = cusz::Context; + +namespace psz { + +template +class v2_Compressor { + public: + using BYTE = uint8_t; + + using T = typename CONFIG::Predictor::Origin; + using FP = typename CONFIG::Predictor::Precision; + using E = typename CONFIG::Predictor::ErrCtrl; + using H = typename CONFIG::Codec::Encoded; + using M = typename CONFIG::Codec::MetadataT; + using H_FB = typename CONFIG::FallbackCodec::Encoded; + + using TimeRecord = std::vector>; + using timerecord_t = TimeRecord*; + + private: + class impl; + std::unique_ptr pimpl; + + public: + ~v2_Compressor(); + v2_Compressor(); + v2_Compressor(const v2_Compressor&); + v2_Compressor& operator=(const v2_Compressor&); + v2_Compressor(v2_Compressor&&); + v2_Compressor& operator=(v2_Compressor&&); + + // methods + void init(Context*); + void init(v2_header*); + void destroy(); + void compress(Context*, T*, BYTE*&, size_t&, cudaStream_t = nullptr, bool = false); + void decompress(v2_header*, BYTE*, T*, cudaStream_t = nullptr, bool = true); + void clear_buffer(); + // getter + void export_header(v2_header&); + void export_header(v2_header*); + void export_timerecord(TimeRecord*); +}; + +template +class v2_Compressor::impl { + public: + using Codec = typename CONFIG::Codec; + using BYTE = uint8_t; + using T = typename CONFIG::Predictor::Origin; + using FP = typename CONFIG::Predictor::Precision; + using EQ = uint32_t; + using H = typename CONFIG::Codec::Encoded; + using M = uint32_t; + using IDX = uint32_t; + using H_FB = typename CONFIG::FallbackCodec::Encoded; + + using TimeRecord = std::vector>; + using timerecord_t = TimeRecord*; + + private: + // state + // bool use_fallback_codec{false}; + // bool fallback_codec_allocated{false}; + + BYTE* d_reserved_for_archive{nullptr}; + + // profiling + // TimeRecord timerecord; + // header + v2_header header; + // components + + Codec* codec; + + // arrays + T* d_anchor; + uint32_t* d_errctrl; + uint32_t* d_freq; + CompactionDRAM outlier; + + int sp_factor{20}; + + struct { + float construct, hist, encode; + } comp_time; + + struct { + float scatter, decode, reconstruct; + } decomp_time; + + dim3 data_len3; + size_t data_len; + + public: + ~impl(); + impl(); + + // public methods + void init(Context* config); + void init(v2_header* config); + + void compress(Context*, T*, BYTE*&, size_t&, cudaStream_t = nullptr, bool = false); + void decompress(v2_header*, BYTE*, T*, cudaStream_t = nullptr, bool = true); + + // getter + void export_header(v2_header&); + void export_header(v2_header*); + 
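+    // (pimpl) These mirror the public v2_Compressor<CONFIG> interface above; the
+    // wrapper forwards init/compress/decompress/export_header calls to this class.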
// void export_timerecord(TimeRecord*); + BYTE* var_archive() { return d_reserved_for_archive; }; + + private: + // helper + template + void __init(ContextOrHeader*); + + // void collect_compress_timerecord(); + // void collect_decompress_timerecord(); + void destroy(); + // getter +}; + +} // namespace psz diff --git a/qtensor/compression/cusz/include/stat/compare.h b/qtensor/compression/cusz/include/stat/compare.h new file mode 100644 index 00000000..9575d72a --- /dev/null +++ b/qtensor/compression/cusz/include/stat/compare.h @@ -0,0 +1,57 @@ +/** + * @file compare.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-09 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef CE05A256_23CB_4243_8839_B1FDA9C540D2 +#define CE05A256_23CB_4243_8839_B1FDA9C540D2 + +#ifdef __cplus_plus +extern "C" { +#endif + +#include +#include +#include "../cusz/type.h" + +#define DESCRIPTION(Tliteral, T) void thrustgpu_get_extrema_rawptr_T##Tliteral(T* d_ptr, size_t len, T res[4]); + +#define COMPARE_LOSSLESS(Tliteral, T) \ + bool cppstd_identical_T##Tliteral(T* d1, T* d2, size_t const len); \ + bool thrustgpu_identical_T##Tliteral(T* d1, T* d2, size_t const len); + +#define COMPARE_LOSSY(Tliteral, T) \ + bool cppstd_error_bounded_T##Tliteral(T* a, T* b, size_t const len, double const eb, size_t* first_faulty_idx); \ + void cppstd_assess_quality_T##Tliteral(cusz_stats* s, T* xdata, T* odata, size_t const len); \ + \ + bool thrustgpu_error_bounded_T##Tliteral(T* a, T* b, size_t const len, double const eb, size_t* first_faulty_idx); \ + void thrustgpu_assess_quality_T##Tliteral(cusz_stats* s, T* xdata, T* odata, size_t const len); + +DESCRIPTION(ui8, uint8_t) +DESCRIPTION(ui16, uint16_t) +DESCRIPTION(ui32, uint32_t) +DESCRIPTION(fp32, float) +DESCRIPTION(fp64, double) + +COMPARE_LOSSLESS(fp32, float) +COMPARE_LOSSLESS(fp64, double) +COMPARE_LOSSLESS(ui8, uint8_t) +COMPARE_LOSSLESS(ui16, uint16_t) +COMPARE_LOSSLESS(ui32, uint32_t) + +COMPARE_LOSSY(fp32, float) +COMPARE_LOSSY(fp64, double) + +#undef CPPSTD_COMPARE + +#ifdef __cplus_plus +} +#endif + +#endif /* CE05A256_23CB_4243_8839_B1FDA9C540D2 */ diff --git a/qtensor/compression/cusz/include/stat/compare_cpu.hh b/qtensor/compression/cusz/include/stat/compare_cpu.hh new file mode 100644 index 00000000..19846adc --- /dev/null +++ b/qtensor/compression/cusz/include/stat/compare_cpu.hh @@ -0,0 +1,62 @@ +/** + * @file compare_cpu.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-09 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef C93C3857_8821_4988_B6F0_4E885060F642 +#define C93C3857_8821_4988_B6F0_4E885060F642 + +#include "compare.h" + +namespace psz { + +template +bool cppstd_identical(T* d1, T* d2, size_t const len); + +template +bool cppstd_error_bounded(T* a, T* b, size_t const len, double const eb, size_t* first_faulty_idx); + +template +void cppstd_assess_quality(cusz_stats* s, T* xdata, T* odata, size_t const len); + +} // namespace psz + +#define CPPSTD_COMPARE_LOSSLESS(Tliteral, T) \ + template <> \ + bool psz::cppstd_identical(T * d1, T * d2, size_t const len) \ + { \ + return cppstd_identical_T##Tliteral(d1, d2, len); \ + } + +#define CPPSTD_COMPARE_LOSSY(Tliteral, T) \ + template <> \ + bool psz::cppstd_error_bounded(T * a, T * b, size_t const len, double const eb, size_t* first_faulty_idx) \ + { \ + return cppstd_error_bounded_T##Tliteral(a, b, len, eb, first_faulty_idx); \ + } \ + \ + template <> \ + void 
psz::cppstd_assess_quality(cusz_stats * s, T * xdata, T * odata, size_t const len) \ + { \ + cppstd_assess_quality_T##Tliteral(s, xdata, odata, len); \ + } + +CPPSTD_COMPARE_LOSSLESS(fp32, float) +CPPSTD_COMPARE_LOSSLESS(fp64, double) +CPPSTD_COMPARE_LOSSLESS(ui8, uint8_t) +CPPSTD_COMPARE_LOSSLESS(ui16, uint16_t) +CPPSTD_COMPARE_LOSSLESS(ui32, uint32_t) + +CPPSTD_COMPARE_LOSSY(fp32, float); +CPPSTD_COMPARE_LOSSY(fp64, double); + +#undef CPPSTD_COMPARE_LOSSLESS +#undef CPPSTD_COMPARE_LOSSY + +#endif /* C93C3857_8821_4988_B6F0_4E885060F642 */ diff --git a/qtensor/compression/cusz/include/stat/compare_gpu.hh b/qtensor/compression/cusz/include/stat/compare_gpu.hh new file mode 100644 index 00000000..482c2fab --- /dev/null +++ b/qtensor/compression/cusz/include/stat/compare_gpu.hh @@ -0,0 +1,33 @@ +/** + * @file compare_gpu.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-09 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef B0EE0E82_B3AA_4946_A589_A3A6A83DD862 +#define B0EE0E82_B3AA_4946_A589_A3A6A83DD862 + +#include "compare.h" + +namespace psz { + +template +void thrustgpu_get_extrema_rawptr(T* d_ptr, size_t len, T res[4]); + +template +bool thrustgpu_identical(T* d1, T* d2, size_t const len); + +template +bool thrustgpu_error_bounded(T* a, T* b, size_t const len, double const eb, size_t* first_faulty_idx); + +template +void thrustgpu_assess_quality(cusz_stats* s, T* xdata, T* odata, size_t const len); + +} // namespace psz + +#endif /* B0EE0E82_B3AA_4946_A589_A3A6A83DD862 */ diff --git a/qtensor/compression/cusz/include/stat/stat.h b/qtensor/compression/cusz/include/stat/stat.h new file mode 100644 index 00000000..971d94bc --- /dev/null +++ b/qtensor/compression/cusz/include/stat/stat.h @@ -0,0 +1,29 @@ +/** + * @file stat.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-02 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef BBBB5712_FF60_4262_B927_85B113FD26BA +#define BBBB5712_FF60_4262_B927_85B113FD26BA + +#include "cusz/type.h" + +#define HIST_C(Tname, T) \ + cusz_error_status histogram_T##Tname( \ + T* in_data, size_t const in_len, uint32_t* out_freq, int const num_buckets, float* milliseconds, \ + cudaStream_t stream); + +HIST_C(ui8, uint8_t) +HIST_C(ui16, uint16_t) +HIST_C(ui32, uint32_t) +HIST_C(ui64, uint64_t) + +#undef HIST_C + +#endif /* BBBB5712_FF60_4262_B927_85B113FD26BA */ diff --git a/qtensor/compression/cusz/include/stat/stat.hh b/qtensor/compression/cusz/include/stat/stat.hh new file mode 100644 index 00000000..636192a4 --- /dev/null +++ b/qtensor/compression/cusz/include/stat/stat.hh @@ -0,0 +1,15 @@ +/** + * @file stat.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-02 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef B005D07B_D92D_4DF0_90D0_87A7B7C310C9 +#define B005D07B_D92D_4DF0_90D0_87A7B7C310C9 + +#endif /* B005D07B_D92D_4DF0_90D0_87A7B7C310C9 */ diff --git a/qtensor/compression/cusz/include/stat/stat_g.hh b/qtensor/compression/cusz/include/stat/stat_g.hh new file mode 100644 index 00000000..a76ea6f9 --- /dev/null +++ b/qtensor/compression/cusz/include/stat/stat_g.hh @@ -0,0 +1,44 @@ +/** + * @file stat_g.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-02 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef D8B68EB9_A86B_4AEA_AD4C_3DF22827E7C3 +#define D8B68EB9_A86B_4AEA_AD4C_3DF22827E7C3 + +#include +#include "cusz/type.h" 
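+
+// Illustrative usage (a sketch, not taken from the cuSZ sources): building a
+// frequency table over 16-bit quantization codes with 1024 bins. d_eq and d_freq
+// are assumed to be valid device buffers and `stream` an existing CUDA stream;
+// the explicit <uint16_t> template argument is likewise an assumption.
+//
+//   float ms = 0;
+//   asz::stat::histogram<uint16_t>(d_eq, eq_len, d_freq, 1024, &ms, stream);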
+ +namespace asz { +namespace stat { + +/** + * @brief Get frequency: a kernel wrapper + * + * @tparam T input type + * @param in_data input device array + * @param in_len input host var; len of in_data + * @param out_freq output device array + * @param nbin input host var; len of out_freq + * @param milliseconds output time elapsed + * @param stream optional stream + */ +template +cusz_error_status histogram( + T* in_data, + size_t const in_len, + uint32_t* out_freq, + int const nbin, + float* milliseconds, + cudaStream_t stream = nullptr); + +} // namespace stat +} // namespace asz + +#endif /* D8B68EB9_A86B_4AEA_AD4C_3DF22827E7C3 */ diff --git a/qtensor/compression/cusz/include/utils.hh b/qtensor/compression/cusz/include/utils.hh new file mode 100644 index 00000000..68ec1d2b --- /dev/null +++ b/qtensor/compression/cusz/include/utils.hh @@ -0,0 +1,21 @@ +/** + * @file utils.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2021-07-12 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef UTILS_HH +#define UTILS_HH + +#include "utils/cuda_err.cuh" +#include "utils/cuda_mem.cuh" +#include "utils/format.hh" +#include "utils/io.hh" +#include "utils/strhelper.hh" + +#endif \ No newline at end of file diff --git a/qtensor/compression/cusz/include/utils/cuda_err.cuh b/qtensor/compression/cusz/include/utils/cuda_err.cuh new file mode 100644 index 00000000..0812c60e --- /dev/null +++ b/qtensor/compression/cusz/include/utils/cuda_err.cuh @@ -0,0 +1,185 @@ +#ifndef CUDA_ERR_CUH +#define CUDA_ERR_CUH + +/** + * @file cuda_err.cuh + * @author Jiannan Tian + * @brief CUDA runtime error handling macros. + * @version 0.2 + * @date 2020-09-20 + * Created on: 2019-10-08 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include +#include +#include +#include +#include + +struct cusz_cuda_exception : public std::exception { + cusz_cuda_exception(const char* err, int err_code, const char* file, int line) { + std::stringstream ss; + ss << "CUDA API failed at \e[31m\e[1m" << file << ':' << line << "\e[0m with error: " << err << '(' << err_code << ')'; + err_msg = ss.str(); + } + const char* what() const noexcept { + return err_msg.c_str(); + } + std::string err_msg; +}; + +// back compatibility start +static void HandleError(cudaError_t err, const char* file, int line) +{ + if (err != cudaSuccess) { + throw cusz_cuda_exception(cudaGetErrorString(err), err, file, line); + } +} +#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__)) +// back compatibility end + +static void check_cuda_error(cudaError_t status, const char* file, int line) +{ + if (cudaSuccess != status) { + /* + printf("\nCUDA error/status reference (as of CUDA 11):\n"); + printf("cudaSuccess -> %d\n", cudaSuccess); + printf("cudaErrorInvalidValue -> %d\n", cudaErrorInvalidValue); + printf("cudaErrorMemoryAllocation -> %d\n", cudaErrorMemoryAllocation); + printf("cudaErrorInitializationError -> %d\n", cudaErrorInitializationError); + printf("cudaErrorCudartUnloading -> %d\n", cudaErrorCudartUnloading); + printf("cudaErrorProfilerDisabled -> %d\n", cudaErrorProfilerDisabled); + printf("cudaErrorProfilerNotInitialized (Deprecated)-> %d\n", cudaErrorProfilerNotInitialized); + printf("cudaErrorProfilerAlreadyStarted (Deprecated)-> %d\n", cudaErrorProfilerAlreadyStarted); + printf("cudaErrorProfilerAlreadyStopped (Deprecated)-> %d\n", cudaErrorProfilerAlreadyStopped); + 
printf("cudaErrorInvalidConfiguration -> %d\n", cudaErrorInvalidConfiguration); + printf("cudaErrorInvalidPitchValue -> %d\n", cudaErrorInvalidPitchValue); + printf("cudaErrorInvalidSymbol -> %d\n", cudaErrorInvalidSymbol); + printf("cudaErrorInvalidHostPointer (Deprecated)-> %d\n", cudaErrorInvalidHostPointer); + printf("cudaErrorInvalidDevicePointer (Deprecated)-> %d\n", cudaErrorInvalidDevicePointer); + printf("cudaErrorInvalidTexture -> %d\n", cudaErrorInvalidTexture); + printf("cudaErrorInvalidTextureBinding -> %d\n", cudaErrorInvalidTextureBinding); + printf("cudaErrorInvalidChannelDescriptor -> %d\n", cudaErrorInvalidChannelDescriptor); + printf("cudaErrorInvalidMemcpyDirection -> %d\n", cudaErrorInvalidMemcpyDirection); + printf("cudaErrorAddressOfConstant (Deprecated)-> %d\n", cudaErrorAddressOfConstant); + printf("cudaErrorTextureFetchFailed (Deprecated)-> %d\n", cudaErrorTextureFetchFailed); + printf("cudaErrorTextureNotBound (Deprecated)-> %d\n", cudaErrorTextureNotBound); + printf("cudaErrorSynchronizationError (Deprecated)-> %d\n", cudaErrorSynchronizationError); + printf("cudaErrorInvalidFilterSetting -> %d\n", cudaErrorInvalidFilterSetting); + printf("cudaErrorInvalidNormSetting -> %d\n", cudaErrorInvalidNormSetting); + printf("cudaErrorMixedDeviceExecution (Deprecated)-> %d\n", cudaErrorMixedDeviceExecution); + printf("cudaErrorNotYetImplemented (Deprecated)-> %d\n", cudaErrorNotYetImplemented); + printf("cudaErrorMemoryValueTooLarge (Deprecated)-> %d\n", cudaErrorMemoryValueTooLarge); + printf("cudaErrorInsufficientDriver -> %d\n", cudaErrorInsufficientDriver); + printf("cudaErrorInvalidSurface -> %d\n", cudaErrorInvalidSurface); + printf("cudaErrorDuplicateVariableName -> %d\n", cudaErrorDuplicateVariableName); + printf("cudaErrorDuplicateTextureName -> %d\n", cudaErrorDuplicateTextureName); + printf("cudaErrorDuplicateSurfaceName -> %d\n", cudaErrorDuplicateSurfaceName); + printf("cudaErrorDevicesUnavailable -> %d\n", cudaErrorDevicesUnavailable); + printf("cudaErrorIncompatibleDriverContext -> %d\n", cudaErrorIncompatibleDriverContext); + printf("cudaErrorMissingConfiguration -> %d\n", cudaErrorMissingConfiguration); + printf("cudaErrorPriorLaunchFailure (Deprecated)-> %d\n", cudaErrorPriorLaunchFailure); + printf("cudaErrorLaunchMaxDepthExceeded -> %d\n", cudaErrorLaunchMaxDepthExceeded); + printf("cudaErrorLaunchFileScopedTex -> %d\n", cudaErrorLaunchFileScopedTex); + printf("cudaErrorLaunchFileScopedSurf -> %d\n", cudaErrorLaunchFileScopedSurf); + printf("cudaErrorSyncDepthExceeded -> %d\n", cudaErrorSyncDepthExceeded); + printf("cudaErrorLaunchPendingCountExceeded -> %d\n", cudaErrorLaunchPendingCountExceeded); + printf("cudaErrorInvalidDeviceFunction -> %d\n", cudaErrorInvalidDeviceFunction); + printf("cudaErrorNoDevice -> %d\n", cudaErrorNoDevice); + printf("cudaErrorInvalidDevice -> %d\n", cudaErrorInvalidDevice); + printf("cudaErrorStartupFailure -> %d\n", cudaErrorStartupFailure); + printf("cudaErrorInvalidKernelImage -> %d\n", cudaErrorInvalidKernelImage); + #if (CUDART_VERSION == 1100) + printf("cudaErrorDeviceUninitialized -> %d\n", cudaErrorDeviceUninitialized); + #endif + printf("cudaErrorMapBufferObjectFailed -> %d\n", cudaErrorMapBufferObjectFailed); + printf("cudaErrorUnmapBufferObjectFailed -> %d\n", cudaErrorUnmapBufferObjectFailed); + #if (CUDART_VERSION == 1010) + printf("cudaErrorArrayIsMapped -> %d\n", cudaErrorArrayIsMapped); + printf("cudaErrorAlreadyMapped -> %d\n", cudaErrorAlreadyMapped); + #endif + printf("cudaErrorNoKernelImageForDevice -> 
%d\n", cudaErrorNoKernelImageForDevice); + #if (CUDART_VERSION == 1010) + printf("cudaErrorAlreadyAcquired -> %d\n", cudaErrorAlreadyAcquired); + printf("cudaErrorNotMapped -> %d\n", cudaErrorNotMapped); + printf("cudaErrorNotMappedAsArray -> %d\n", cudaErrorNotMappedAsArray); + printf("cudaErrorNotMappedAsPointer -> %d\n", cudaErrorNotMappedAsPointer); + #endif + printf("cudaErrorECCUncorrectable -> %d\n", cudaErrorECCUncorrectable); + printf("cudaErrorUnsupportedLimit -> %d\n", cudaErrorUnsupportedLimit); + printf("cudaErrorDeviceAlreadyInUse -> %d\n", cudaErrorDeviceAlreadyInUse); + printf("cudaErrorPeerAccessUnsupported -> %d\n", cudaErrorPeerAccessUnsupported); + printf("cudaErrorInvalidPtx -> %d\n", cudaErrorInvalidPtx); + printf("cudaErrorInvalidGraphicsContext -> %d\n", cudaErrorInvalidGraphicsContext); + printf("cudaErrorNvlinkUncorrectable -> %d\n", cudaErrorNvlinkUncorrectable); + printf("cudaErrorJitCompilerNotFound -> %d\n", cudaErrorJitCompilerNotFound); + #if (CUDART_VERSION == 1010) + printf("cudaErrorInvalidSource -> %d\n", cudaErrorInvalidSource); + printf("cudaErrorFileNotFound -> %d\n", cudaErrorFileNotFound); + #endif + printf("cudaErrorSharedObjectSymbolNotFound -> %d\n", cudaErrorSharedObjectSymbolNotFound); + printf("cudaErrorSharedObjectInitFailed -> %d\n", cudaErrorSharedObjectInitFailed); + printf("cudaErrorOperatingSystem -> %d\n", cudaErrorOperatingSystem); + printf("cudaErrorInvalidResourceHandle -> %d\n", cudaErrorInvalidResourceHandle); + #if (CUDART_VERSION == 1010) + printf("cudaErrorIllegalState -> %d\n", cudaErrorIllegalState); + printf("cudaErrorSymbolNotFound -> %d\n", cudaErrorSymbolNotFound); + #endif + printf("cudaErrorNotReady -> %d\n", cudaErrorNotReady); + printf("cudaErrorIllegalAddress -> %d\n", cudaErrorIllegalAddress); + printf("cudaErrorLaunchOutOfResources -> %d\n", cudaErrorLaunchOutOfResources); + printf("cudaErrorLaunchTimeout -> %d\n", cudaErrorLaunchTimeout); + #if (CUDART_VERSION == 1010) + printf("cudaErrorLaunchIncompatibleTexturing-> %d\n", cudaErrorLaunchIncompatibleTexturing); + #endif + printf("cudaErrorPeerAccessAlreadyEnabled -> %d\n", cudaErrorPeerAccessAlreadyEnabled); + printf("cudaErrorPeerAccessNotEnabled -> %d\n", cudaErrorPeerAccessNotEnabled); + printf("cudaErrorSetOnActiveProcess -> %d\n", cudaErrorSetOnActiveProcess); + #if (CUDART_VERSION == 1010) + printf("cudaErrorContextIsDestroyed -> %d\n", cudaErrorContextIsDestroyed); + #endif + printf("cudaErrorAssert -> %d\n", cudaErrorAssert); + printf("cudaErrorTooManyPeers -> %d\n", cudaErrorTooManyPeers); + printf("cudaErrorHostMemoryAlreadyRegistered-> %d\n", cudaErrorHostMemoryAlreadyRegistered); + printf("cudaErrorHostMemoryNotRegistered -> %d\n", cudaErrorHostMemoryNotRegistered); + printf("cudaErrorHardwareStackError -> %d\n", cudaErrorHardwareStackError); + printf("cudaErrorIllegalInstruction -> %d\n", cudaErrorIllegalInstruction); + printf("cudaErrorMisalignedAddress -> %d\n", cudaErrorMisalignedAddress); + printf("cudaErrorInvalidAddressSpace -> %d\n", cudaErrorInvalidAddressSpace); + printf("cudaErrorInvalidPc -> %d\n", cudaErrorInvalidPc); + printf("cudaErrorLaunchFailure -> %d\n", cudaErrorLaunchFailure); + printf("cudaErrorCooperativeLaunchTooLarge -> %d\n", cudaErrorCooperativeLaunchTooLarge); + printf("cudaErrorNotPermitted -> %d\n", cudaErrorNotPermitted); + printf("cudaErrorNotSupported -> %d\n", cudaErrorNotSupported); + #if (CUDART_VERSION == 1010) + printf("cudaErrorSystemNotReady -> %d\n", cudaErrorSystemNotReady); + 
printf("cudaErrorSystemDriverMismatch -> %d\n", cudaErrorSystemDriverMismatch); + printf("cudaErrorCompatNotSupportedOnDevice -> %d\n", cudaErrorCompatNotSupportedOnDevice); + printf("cudaErrorStreamCaptureUnsupported -> %d\n", cudaErrorStreamCaptureUnsupported); + printf("cudaErrorStreamCaptureInvalidated -> %d\n", cudaErrorStreamCaptureInvalidated); + printf("cudaErrorStreamCaptureMerge -> %d\n", cudaErrorStreamCaptureMerge); + printf("cudaErrorStreamCaptureUnmatched -> %d\n", cudaErrorStreamCaptureUnmatched); + printf("cudaErrorStreamCaptureUnjoined -> %d\n", cudaErrorStreamCaptureUnjoined); + printf("cudaErrorStreamCaptureIsolation -> %d\n", cudaErrorStreamCaptureIsolation); + printf("cudaErrorStreamCaptureImplicit -> %d\n", cudaErrorStreamCaptureImplicit); + printf("cudaErrorCapturedEvent -> %d\n", cudaErrorCapturedEvent); + printf("cudaErrorStreamCaptureWrongThread -> %d\n", cudaErrorStreamCaptureWrongThread); + #endif + #if (CUDART_VERSION == 1100) + printf("cudaErrorTimeout -> %d\n", cudaErrorTimeout); + printf("cudaErrorGraphExecUpdateFailure -> %d\n", cudaErrorGraphExecUpdateFailure); + #endif + printf("cudaErrorUnknown -> %d\n", cudaErrorUnknown); + printf("cudaErrorApiFailureBase (Deprecated)-> %d\n", cudaErrorApiFailureBase); + */ + throw cusz_cuda_exception(cudaGetErrorString(status), status, file, line); + } +} + +#define CHECK_CUDA(err) (check_cuda_error(err, __FILE__, __LINE__)) + +#endif diff --git a/qtensor/compression/cusz/include/utils/cuda_mem.cuh b/qtensor/compression/cusz/include/utils/cuda_mem.cuh new file mode 100644 index 00000000..723028ab --- /dev/null +++ b/qtensor/compression/cusz/include/utils/cuda_mem.cuh @@ -0,0 +1,100 @@ +#ifndef UTILS_CUDA_MEM_CUH +#define UTILS_CUDA_MEM_CUH + +/** + * @file cuda_mem.cuh + * @author Jiannan Tian + * @brief CUDA memory operation wrappers. 
+ * @version 0.2 + * @date 2020-09-20 + * Created on 2020-04-30 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include +#include +#include +#include +#include +#include + +template +static inline bool __is_aligned_at(const void* ptr) +{ // + return reinterpret_cast(ptr) % NUM == 0; +}; + +template +static size_t __cusz_get_alignable_len(size_t len) +{ + return ((sizeof(T) * len - 1) / NUM + 1) * NUM; +} + +static const int CUSZ_ALIGN_NUM = 128; + +/** + * @brief when using memory pool, alignment at 128 is necessary + * + * @tparam SRC + * @tparam DST + * @param src + * @return DST* + */ +template +DST* designate(SRC* src) +{ + // TODO check alignment + auto aligned = __is_aligned_at(src); + if (not aligned) throw std::runtime_error("not aligned at " + std::to_string(CUSZ_ALIGN_NUM) + " bytes"); + + return reinterpret_cast(src); +} + +template +DST* free_repurpose(SRC* src) +{ + // aligning at 4 byte; does not raise misalignment + // may not result in optimal performance considering coalescing + auto aligned = __is_aligned_at<4>(src); + if (not aligned) throw std::runtime_error("not aligned at 4 bytes"); + + return reinterpret_cast(src); +} + +namespace mem { + +enum MemcpyDirection { h2d, d2h }; + +template +inline T* create_CUDA_space(size_t len, uint8_t filling_val = 0x00) +{ + T* d_var; + cudaMalloc(&d_var, len * sizeof(T)); + cudaMemset(d_var, filling_val, len * sizeof(T)); + return d_var; +} + +template +inline T* create_devspace_memcpy_h2d(T* var, size_t l) +{ + T* d_var; + cudaMalloc(&d_var, l * sizeof(T)); + cudaMemcpy(d_var, var, l * sizeof(T), cudaMemcpyHostToDevice); + return d_var; +} +template +inline T* create_devspace_memcpy_d2h(T* d_var, size_t l) +{ + // auto var = new T[l]; + T* var; + cudaMallocHost(&var, l * sizeof(T)); + cudaMemcpy(var, d_var, l * sizeof(T), cudaMemcpyDeviceToHost); + return var; +} + +} // namespace mem + +#endif diff --git a/qtensor/compression/cusz/include/utils/cusparse_err.cuh b/qtensor/compression/cusz/include/utils/cusparse_err.cuh new file mode 100644 index 00000000..2086ca44 --- /dev/null +++ b/qtensor/compression/cusz/include/utils/cusparse_err.cuh @@ -0,0 +1,60 @@ +#ifndef UTILS_CUSPARSE_ERR_CUH +#define UTILS_CUSPARSE_ERR_CUH + +/** + * @file cuda_err.cuh + * @author Jiannan Tian + * @brief CUDA runtime error handling macros. 
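+ *
+ * Synopsis for the CHECK_CUSPARSE macro defined below (sketch; the handle is a placeholder):
+ *   cusparseHandle_t handle;
+ *   CHECK_CUSPARSE(cusparseCreate(&handle));
+ *   // ... further cuSPARSE calls wrapped the same way ...
+ *   CHECK_CUSPARSE(cusparseDestroy(handle));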
+ * @version 0.2 + * @date 2020-09-20 + * Created on: 2019-10-08 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include +#include +#include + +// block cusparse for generic testing +#ifndef noCUSPARSE + +static void check_cusparse_error(cusparseStatus_t status, const char* file, int line) +{ + if (CUSPARSE_STATUS_SUCCESS != status) { + printf("\nCUSPARSE status reference (as of CUDA 11):\n"); + printf("CUSPARSE_STATUS_SUCCESS -> %d\n", CUSPARSE_STATUS_SUCCESS); + printf("CUSPARSE_STATUS_NOT_INITIALIZED -> %d\n", CUSPARSE_STATUS_NOT_INITIALIZED); + printf("CUSPARSE_STATUS_ALLOC_FAILED -> %d\n", CUSPARSE_STATUS_ALLOC_FAILED); + printf("CUSPARSE_STATUS_INVALID_VALUE -> %d\n", CUSPARSE_STATUS_INVALID_VALUE); + printf("CUSPARSE_STATUS_ARCH_MISMATCH -> %d\n", CUSPARSE_STATUS_ARCH_MISMATCH); + printf("CUSPARSE_STATUS_EXECUTION_FAILED -> %d\n", CUSPARSE_STATUS_EXECUTION_FAILED); + printf("CUSPARSE_STATUS_INTERNAL_ERROR -> %d\n", CUSPARSE_STATUS_INTERNAL_ERROR); + printf("CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED -> %d\n", CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED); +#if (CUDART_VERSION == 1010) + printf("CUSPARSE_STATUS_NOT_SUPPORTED -> %d\n", CUSPARSE_STATUS_NOT_SUPPORTED); +#endif +#if (CUDART_VERSION == 1100) + printf("CUSPARSE_STATUS_INSUFFICIENT_RESOURCES -> %d\n", CUSPARSE_STATUS_INSUFFICIENT_RESOURCES); +#endif +#if (CUDART_VERSION == 1100) + printf("CUSPARSE_STATUS_INSUFFICIENT_RESOURCES -> %d\n", CUSPARSE_STATUS_INSUFFICIENT_RESOURCES); +#endif + printf("\n"); + +#if (CUDART_VERSION >= 1010) + printf( + "CUSPARSE API failed at \e[31m\e[1m%s:%d\e[0m with error: %s (%d)\n", file, line, + cusparseGetErrorString(status), status); +#endif + exit(EXIT_FAILURE); + } +} + +#define CHECK_CUSPARSE(err) (check_cusparse_error(err, __FILE__, __LINE__)) + +#endif + +#endif diff --git a/qtensor/compression/cusz/include/utils/format.hh b/qtensor/compression/cusz/include/utils/format.hh new file mode 100644 index 00000000..196f7248 --- /dev/null +++ b/qtensor/compression/cusz/include/utils/format.hh @@ -0,0 +1,57 @@ +#ifndef UTILS_FORMAT_HH +#define UTILS_FORMAT_HH + +/** + * @file format.hh + * @author Jiannan Tian + * @brief Formatting for log print (header). + * @version 0.2 + * @date 2020-09-20 + * Created on 2020-04-27 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include +#include +#include + + +const std::string LOG_NULL = " "; +const std::string LOG_INFO = " :: "; +const std::string LOG_ERR = " ERR "; +const std::string LOG_WARN = "WARN "; +const std::string LOG_DBG = " dbg "; +const std::string LOG_EXCEPTION = " !! "; + +// https://stackoverflow.com/a/26080768/8740097 CC BY-SA 3.0 +template +void build(std::ostream& o, T t) +{ + o << t << " "; +} + +template +void build(std::ostream& o, T t, Args... args) // recursive variadic function +{ + build(o, t); + build(o, args...); +} + +template +void LOGGING(const std::string& log_head, Args... 
args) +{ + std::ostringstream oss; + oss << log_head; + build(oss, args...); + + oss.seekp(0, std::ios::end); + std::stringstream::pos_type offset = oss.tellp(); + if (log_head == LOG_DBG) { std::cout << "\e[2m"; } // dbg + std::cout << oss.str() << std::endl; // print content + if (log_head == LOG_DBG) std::cout << "\e[0m"; // finish printing dbg +} + +#endif // FORMAT_HH diff --git a/qtensor/compression/cusz/include/utils/io.hh b/qtensor/compression/cusz/include/utils/io.hh new file mode 100644 index 00000000..de71334d --- /dev/null +++ b/qtensor/compression/cusz/include/utils/io.hh @@ -0,0 +1,59 @@ +#ifndef UTILS_IO_HH +#define UTILS_IO_HH + +/** + * @file io.hh + * @author Jiannan Tian + * @brief Read and write binary. + * @version 0.2 + * @date 2020-09-20 + * Created on 2019-08-27 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include +#include + +namespace io { + +template +T* read_binary_to_new_array(const std::string& fname, size_t dtype_len) +{ + std::ifstream ifs(fname.c_str(), std::ios::binary | std::ios::in); + if (not ifs.is_open()) { + std::cerr << "fail to open " << fname << std::endl; + exit(1); + } + auto _a = new T[dtype_len](); + ifs.read(reinterpret_cast(_a), std::streamsize(dtype_len * sizeof(T))); + ifs.close(); + return _a; +} + +template +void read_binary_to_array(const std::string& fname, T* _a, size_t dtype_len) +{ + std::ifstream ifs(fname.c_str(), std::ios::binary | std::ios::in); + if (not ifs.is_open()) { + std::cerr << "fail to open " << fname << std::endl; + exit(1); + } + ifs.read(reinterpret_cast(_a), std::streamsize(dtype_len * sizeof(T))); + ifs.close(); +} + +template +void write_array_to_binary(const std::string& fname, T* const _a, size_t const dtype_len) +{ + std::ofstream ofs(fname.c_str(), std::ios::binary | std::ios::out); + if (not ofs.is_open()) return; + ofs.write(reinterpret_cast(_a), std::streamsize(dtype_len * sizeof(T))); + ofs.close(); +} + +} // namespace io + +#endif // IO_HH diff --git a/qtensor/compression/cusz/include/utils/print_gpu.h b/qtensor/compression/cusz/include/utils/print_gpu.h new file mode 100644 index 00000000..67dcc30a --- /dev/null +++ b/qtensor/compression/cusz/include/utils/print_gpu.h @@ -0,0 +1,45 @@ +/** + * @file print.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-28 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef E02AE628_9C8A_4100_8C73_A3B74B7128F6 +#define E02AE628_9C8A_4100_8C73_A3B74B7128F6 + +#ifdef __cplusplus +extern "C" { +#endif + +#define PRINT_INT_LESS_THAN_64(Tliteral, T) void peek_device_data_T##Tliteral(T* d_arr, size_t num, size_t offset); + +PRINT_INT_LESS_THAN_64(i8, int8_t) +PRINT_INT_LESS_THAN_64(i16, int16_t) +PRINT_INT_LESS_THAN_64(i32, int32_t) + +void peek_device_data_Ti64(int64_t* d_arr, size_t num, size_t offset); + +#define PRINT_UINT_LESS_THAN_64(Tliteral, T) void peek_device_data_T##Tliteral(T* d_arr, size_t num, size_t offset); + +PRINT_UINT_LESS_THAN_64(ui8, uint8_t) +PRINT_UINT_LESS_THAN_64(ui16, uint16_t) +PRINT_UINT_LESS_THAN_64(ui32, uint32_t) + +void peek_device_data_Tui64(uint64_t* d_arr, size_t num, size_t offset); + +void peek_device_data_Tfp32(float* d_arr, size_t num, size_t offset); +void peek_device_data_Tfp64(double* d_arr, size_t num, size_t offset); + +#undef PRINT_INT_LESS_THAN_64 +#undef PRINT_UINT_LESS_THAN_64 + +#ifdef __cplusplus +} +#endif + +#endif /* 
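+ usage sketch of the peek helpers declared above (`d_arr`, 16 and 0 are illustrative values):
+   peek_device_data_Tfp32(d_arr, 16, 0);   // print the first 16 floats of a device array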
E02AE628_9C8A_4100_8C73_A3B74B7128F6 */ diff --git a/qtensor/compression/cusz/include/utils/print_gpu.hh b/qtensor/compression/cusz/include/utils/print_gpu.hh new file mode 100644 index 00000000..cffcbf22 --- /dev/null +++ b/qtensor/compression/cusz/include/utils/print_gpu.hh @@ -0,0 +1,21 @@ +/** + * @file print_gpu.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-29 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "print_gpu.h" + +namespace psz { + +template +void peek_device_data(T* d_arr, size_t num, size_t offset = 0); + +} // namespace psz + +#undef PEEK_DEVICE_DATA diff --git a/qtensor/compression/cusz/include/utils/strhelper.hh b/qtensor/compression/cusz/include/utils/strhelper.hh new file mode 100644 index 00000000..6768edeb --- /dev/null +++ b/qtensor/compression/cusz/include/utils/strhelper.hh @@ -0,0 +1,144 @@ +/** + * @file strhelper.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2021-09-19 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_UTILS_STRHELPER_HH +#define CUSZ_UTILS_STRHELPER_HH + +#include +#include +#include +#include +#include +#include +#include "format.hh" + +using std::cerr; +using std::endl; + +using ss_t = std::stringstream; +using map_t = std::unordered_map; +using str_list = std::vector; + +struct StrHelper { + static unsigned int str2int(const char* s) + { + char* end; + auto res = std::strtol(s, &end, 10); + if (*end) { + const char* notif = "invalid option value, non-convertible part: "; + cerr << LOG_ERR << notif << "\e[1m" << s << "\e[0m" << endl; + } + return res; + } + + static unsigned int str2int(std::string s) { return str2int(s.c_str()); } + + static double str2fp(const char* s) + { + char* end; + auto res = std::strtod(s, &end); + if (*end) { + const char* notif = "invalid option value, non-convertible part: "; + cerr << LOG_ERR << notif << "\e[1m" << end << "\e[0m" << endl; + } + return res; + } + + static double str2fp(std::string s) { return str2fp(s.c_str()); } + + static bool is_kv_pair(std::string s) { return s.find("=") != std::string::npos; } + + static std::pair separate_kv(std::string& s) + { + std::string delimiter = "="; + + if (s.find(delimiter) == std::string::npos) + throw std::runtime_error("\e[1mnot a correct key-value syntax, must be \"opt=value\"\e[0m"); + + std::string k = s.substr(0, s.find(delimiter)); + std::string v = s.substr(s.find(delimiter) + delimiter.length(), std::string::npos); + + return std::make_pair(k, v); + } + + static void parse_strlist_as_kv(const char* in_str, map_t& kv_list) + { + ss_t ss(in_str); + while (ss.good()) { + std::string tmp; + std::getline(ss, tmp, ','); + kv_list.insert(separate_kv(tmp)); + } + } + + static void parse_strlist(const char* in_str, str_list& list) + { + ss_t ss(in_str); + while (ss.good()) { + std::string tmp; + std::getline(ss, tmp, ','); + list.push_back(tmp); + } + } + + static std::pair parse_kv_onoff(std::string in_str) + { + auto kv_literal = "(.*?)=(on|ON|off|OFF)"; + std::regex kv_pattern(kv_literal); + std::regex onoff_pattern("on|ON|off|OFF"); + + bool onoff = false; + std::string k, v; + + std::smatch kv_match; + if (std::regex_match(in_str, kv_match, kv_pattern)) { + // the 1st match: whole string + // the 2nd: k, the 3rd: v + if (kv_match.size() == 3) { + k = kv_match[1].str(), v = kv_match[2].str(); + + std::smatch v_match; + if (std::regex_match(v, v_match, onoff_pattern)) { // + onoff = (v == "on") or (v == "ON"); + } + else 
{ + throw std::runtime_error("not legal (k=v)-syntax"); + } + } + } + return std::make_pair(k, onoff); + } + + static std::string doc_format(const std::string& s) + { + std::regex gray("%(.*?)%"); + std::string gray_text("\e[37m$1\e[0m"); + + std::regex bful("@(.*?)@"); + std::string bful_text("\e[1m\e[4m$1\e[0m"); + std::regex bf("\\*(.*?)\\*"); + std::string bf_text("\e[1m$1\e[0m"); + std::regex ul(R"(_((\w|-|\d|\.)+?)_)"); + std::string ul_text("\e[4m$1\e[0m"); + std::regex red(R"(\^\^(.*?)\^\^)"); + std::string red_text("\e[31m$1\e[0m"); + + auto a = std::regex_replace(s, bful, bful_text); + auto b = std::regex_replace(a, bf, bf_text); + auto c = std::regex_replace(b, ul, ul_text); + auto d = std::regex_replace(c, red, red_text); + auto e = std::regex_replace(d, gray, gray_text); + + return e; + } +}; + +#endif diff --git a/qtensor/compression/cusz/include/utils/timer.h b/qtensor/compression/cusz/include/utils/timer.h new file mode 100644 index 00000000..c38cb0dd --- /dev/null +++ b/qtensor/compression/cusz/include/utils/timer.h @@ -0,0 +1,92 @@ +/** + * @file timer.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-31 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef B36B7228_E9EC_4E61_A1DC_19A4352C4EB3 +#define B36B7228_E9EC_4E61_A1DC_19A4352C4EB3 + +#ifdef __cplusplus +extern "C" { +#endif + +#include "../cusz/type.h" + +struct asz_timer; +typedef struct asz_timer asz_timer; +typedef struct asz_timer asz_cputimer; + +struct asz_cudatimer; +typedef struct asz_cudatimer asz_cudatimer; + +// top-level/dispatcher +// asz_timer* asz_timer_create(asz_policy const p, void* stream); +// void asz_timer_destroy(asz_timer* t); +// void asz_timer_start(asz_timer* t); +// void asz_timer_end(asz_timer* t); +// double asz_time_elapsed(asz_timer* t); + +asz_timer* asz_cputimer_create(); +void asz_cputimer_destroy(asz_timer* t); +void asz_cputimer_start(asz_timer* t); +void asz_cputimer_end(asz_timer* t); +double asz_cputime_elapsed(asz_timer* t); + +// 22-11-01 adding wrapper incurs unexpeted overhead in timing +asz_cudatimer* asz_cudatimer_create(); +void asz_cudatimer_destroy(asz_cudatimer* t); +void asz_cudatimer_start(asz_cudatimer* t); +void asz_cudatimer_end(asz_cudatimer* t); +double asz_cudatime_elapsed(asz_cudatimer* t); + +asz_cudatimer* asz_cudastreamtimer_create(void* stream); +void asz_cudastreamtimer_destroy(asz_cudatimer* t); +void asz_cudastreamtimer_start(asz_cudatimer* t); +void asz_cudastreamtimer_end(asz_cudatimer* t); +double asz_cudastreamtime_elapsed(asz_cudatimer* t); + +// 22-11-01 CUDA timing snippet instead +#define CREATE_CUDAEVENT_PAIR \ + cudaEvent_t a, b; \ + cudaEventCreate(&a); \ + cudaEventCreate(&b); + +#define DESTROY_CUDAEVENT_PAIR \ + cudaEventDestroy(a); \ + cudaEventDestroy(b); + +#define START_CUDAEVENT_RECORDING(STREAM) cudaEventRecord(a, STREAM); +#define STOP_CUDAEVENT_RECORDING(STREAM) \ + cudaEventRecord(b, STREAM); \ + cudaEventSynchronize(b); + +#define TIME_ELAPSED_CUDAEVENT(PTR_MILLISEC) cudaEventElapsedTime(PTR_MILLISEC, a, b); + +// 22-11-01 HIP timing snippet instead +#define CREATE_HIPEVENT_PAIR \ + hipEvent_t a, b; \ + hipEventCreate(&a); \ + hipEventCreate(&b); + +#define DESTROY_HIPEVENT_PAIR \ + hipEventDestroy(a); \ + hipEventDestroy(b); + +#define START_HIPEVENT_RECORDING(STREAM) hipEventRecord(a, STREAM); +#define STOP_HIPEVENT_RECORDING(STREAM) \ + hipEventRecord(b, STREAM); \ + hipEventSynchronize(b); + +#define TIME_ELAPSED_HIPEVENT(PTR_MILLISEC) 
hipEventElapsedTime(PTR_MILLISEC, a, b); + +#ifdef __cplusplus +} +#endif + +#endif /* B36B7228_E9EC_4E61_A1DC_19A4352C4EB3 */ diff --git a/qtensor/compression/cusz/include/utils/timer.hh b/qtensor/compression/cusz/include/utils/timer.hh new file mode 100644 index 00000000..6ba7d35b --- /dev/null +++ b/qtensor/compression/cusz/include/utils/timer.hh @@ -0,0 +1,153 @@ +/** + * @file timer.hh + * @author Jiannan Tian + * @brief High-resolution timer wrapper from and util functions for timing both CPU and CUDA function + * @version 0.2 + * @date 2021-01-05 + * (created) 2019-08-26 (rev) 2021-12-23 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef UTILS_TIMER_HH +#define UTILS_TIMER_HH + +#include +#include + +using hires = std::chrono::high_resolution_clock; +using duration_t = std::chrono::duration; +using hires_clock_t = std::chrono::time_point; + +typedef struct Timer { + hires_clock_t start, end; + + void timer_start() { start = hires::now(); } + void timer_end() { end = hires::now(); } + double get_time_elapsed() { return static_cast(end - start).count(); } + +} host_timer_t; + +#ifdef __CUDACC__ + +/** + * @brief CUDA event based timer. Synopsis: + * cuda_timer_t t; + * t.timer_start(); + * kernel<<>>(...); + * t.timer_end(); + * cudaStreamSynchronize(stream); + * auto ms = t.get_time_elapsed(); + * + */ +typedef struct CUDATimer { + cudaEvent_t start, stop; + float milliseconds; + + // stream not involved + void timer_start() + { + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); + } + + void timer_end() + { + cudaEventRecord(stop); + cudaEventSynchronize(stop); + } + + // stream involved + void timer_start(cudaStream_t stream) + { + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start, stream); // set event as not occurred + } + + void timer_end(cudaStream_t stream) + { + cudaEventRecord(stop, stream); + cudaEventSynchronize(stop); // block host until `stream` meets `stop` + } + + // get time + float get_time_elapsed() + { + cudaEventElapsedTime(&milliseconds, start, stop); + return milliseconds; + } + +} cuda_timer_t; + +#endif + +// TODO handle return; testing +/** + * @brief A timer wrapper for arbitrary function (no handling return for now); + * Adapted from https://stackoverflow.com/a/33900479/8740097 (CC BY-SA 3.0) + * + * @tparam F auto function type + * @tparam Args variadic function argument type + * @param func non-return function to be timed + * @param args variadic function arguments + * @return double time in seconds + */ +template +double TimeThisRoutine(F func, Args&&... args) +{ + auto t0 = hires::now(); + func(std::forward(args)...); + return static_cast(hires::now() - t0).count(); +} + +#ifdef __CUDACC__ +typedef struct CUDAKernelConfig { + dim3 dim_grid; + dim3 dim_block; + size_t shmem_nbyte{0}; + cudaStream_t stream; + +} kernelcfg; + +// TODO use cudaEvent +/** + * @brief A timer wrapper for arbitrary CUDA function + * + * @tparam F auto function type + * @tparam Args variadic function argument type + * @param func CUDA kernel function to be time + * @param cfg CUDA kernel config + * @param args variadic function arguments + * @return double time in seconds + */ +template +float TimeThisCUDARoutine(F func, kernelcfg cfg, Args&&... args) +{ + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start); + func<<>>( // + args... 
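+        /* usage sketch (illustrative names; `scale` is a hypothetical __global__ kernel):
+             kernelcfg cfg{dim3(nblocks), dim3(nthreads), 0, stream};
+             float ms = TimeThisCUDARoutine(scale, cfg, d_buf, 2.0f, n);
+        */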
+ // std::forward(args)... // also works + ); + cudaEventRecord(stop); + cudaEventSynchronize(stop); + + cudaStreamSynchronize(cfg.stream); + + float milliseconds; + cudaEventElapsedTime(&milliseconds, start, stop); + + return milliseconds; +} + +#endif + +#endif // UTILS_TIMER_HH diff --git a/qtensor/compression/cusz/src/cli/cli.cu b/qtensor/compression/cusz/src/cli/cli.cu new file mode 100644 index 00000000..01c61565 --- /dev/null +++ b/qtensor/compression/cusz/src/cli/cli.cu @@ -0,0 +1,14 @@ +/** + * @file cli.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-03-07 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#include "cli.cuh" + +template class cusz::CLI; diff --git a/qtensor/compression/cusz/src/cli/cli.cuh b/qtensor/compression/cusz/src/cli/cli.cuh new file mode 100644 index 00000000..da94a347 --- /dev/null +++ b/qtensor/compression/cusz/src/cli/cli.cuh @@ -0,0 +1,195 @@ +/** + * @file cli.cuh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-02-20 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CLI_CUH +#define CLI_CUH + +#include +#include + +#include "cli/analyzer.hh" +#include "cli/dryrun_part.cuh" +#include "cli/query.hh" +#include "cli/timerecord_viewer.hh" +#include "cusz.h" +#include "framework.hh" + +namespace cusz { + +template +class CLI { + private: + using Header = cuszHEADER; + using T = Data; + + const static auto HOST = cusz::LOC::HOST; + const static auto DEVICE = cusz::LOC::DEVICE; + const static auto HOST_DEVICE = cusz::LOC::HOST_DEVICE; + + using context_t = cuszCTX*; + using header_t = cuszHEADER*; + + public: + CLI() = default; + + template + static void cli_dryrun(context_t ctx, bool dualquant = true) + { + BaseCompressor analysis; + + uint3 xyz{ctx->x, ctx->y, ctx->z}; + cudaStream_t stream; + cudaStreamCreate(&stream); + + if (not dualquant) { + analysis.init_dualquant_dryrun(xyz); + analysis.dualquant_dryrun(ctx->fname.fname, ctx->eb, ctx->mode == "r2r", stream); + analysis.destroy_dualquant_dryrun(); + } + else { + analysis.init_generic_dryrun(xyz); + analysis.generic_dryrun(ctx->fname.fname, ctx->eb, 512, ctx->mode == "r2r", stream); + analysis.destroy_generic_dryrun(); + } + cudaStreamDestroy(stream); + } + + private: + void write_compressed_to_disk(std::string compressed_name, BYTE* compressed, size_t compressed_len) + { + Capsule file("cusza"); + file.set_len(compressed_len) + .set_dptr(compressed) + .mallochost() + .device2host() + .tofile(compressed_name) + .freehost() + .free(); + } + + void try_write_decompressed_to_disk(Capsule& xdata, std::string basename, bool skip_write) + { + if (not skip_write) xdata.device2host().tofile(basename + ".cuszx"); + } + + // template + void cli_construct(context_t ctx, cusz_compressor* compressor, cudaStream_t stream) + { + Capsule input("uncompressed"); + BYTE* compressed; + size_t compressed_len; + Header header; + auto len = ctx->get_len(); + auto basename = ctx->fname.fname; + + auto load_uncompressed = [&](std::string fname) { + input + .set_len(len) // + .mallochost() + .malloc() + .fromfile(fname) + .host2device(); + }; + + auto adjust_eb = [&]() { + if (ctx->mode == "r2r") ctx->eb *= input.prescan().get_rng(); + }; + + /******************************************************************************/ + + load_uncompressed(basename); + adjust_eb(); + + TimeRecord timerecord; + + cusz_config* config = new cusz_config{.eb = ctx->eb, .mode = Rel}; + cusz_len uncomp_len = 
cusz_len{ctx->x, ctx->y, ctx->z, 1}; + + cusz_compress( + compressor, config, input.dptr(), uncomp_len, &compressed, &compressed_len, &header, (void*)&timerecord, + stream); + + if (ctx->report.time) TimeRecordViewer::view_compression(&timerecord, input.nbyte(), compressed_len); + write_compressed_to_disk(basename + ".cusza", compressed, compressed_len); + } + + // template + void cli_reconstruct(context_t ctx, cusz_compressor* compressor, cudaStream_t stream) + { + Capsule compressed("compressed"); + Capsule decompressed("decompressed"), original("cmp"); + auto header = new Header; + auto basename = (*ctx).fname.fname; + + auto load_compressed = [&](std::string compressed_name) { + auto compressed_len = ConfigHelper::get_filesize(compressed_name); + compressed + .set_len(compressed_len) // + .mallochost() + .malloc() + .fromfile(compressed_name) + .host2device(); + }; + + /******************************************************************************/ + + load_compressed(basename + ".cusza"); + memcpy(header, compressed.hptr(), sizeof(Header)); + auto len = ConfigHelper::get_uncompressed_len(header); + + decompressed // + .set_len(len) + .mallochost() + .malloc(); + original.set_len(len); + + TimeRecord timerecord; + + cusz_len decomp_len = cusz_len{header->x, header->y, header->z, 1}; + + cusz_decompress( + compressor, header, compressed.dptr(), ConfigHelper::get_filesize(header), decompressed.dptr(), decomp_len, + (void*)&timerecord, stream); + + if (ctx->report.time) TimeRecordViewer::view_decompression(&timerecord, decompressed.nbyte()); + QualityViewer::view(header, decompressed, original, (*ctx).fname.origin_cmp); + try_write_decompressed_to_disk(decompressed, basename, (*ctx).skip.write2disk); + + decompressed.freehost().free(); + } + + public: + // TODO determine dtype & predictor in here + void dispatch(context_t ctx) + { + // TODO disable predictor selection; to specify in another way + // auto predictor = (*ctx).predictor; + + cusz_framework* framework = cusz_default_framework(); + cusz_compressor* compressor = cusz_create(framework, FP32); + + cudaStream_t stream; + CHECK_CUDA(cudaStreamCreate(&stream)); + + // TODO hardcoded predictor type + if ((*ctx).cli_task.dryrun) cli_dryrun::Predictor>(ctx); + + if ((*ctx).cli_task.construct) cli_construct(ctx, compressor, stream); + + if ((*ctx).cli_task.reconstruct) cli_reconstruct(ctx, compressor, stream); + + if (stream) cudaStreamDestroy(stream); + } +}; + +} // namespace cusz + +#endif diff --git a/qtensor/compression/cusz/src/cli/dryrun_part.cu b/qtensor/compression/cusz/src/cli/dryrun_part.cu new file mode 100644 index 00000000..41311b6b --- /dev/null +++ b/qtensor/compression/cusz/src/cli/dryrun_part.cu @@ -0,0 +1,17 @@ +/** + * @file base_compressor.cu + * @author Jiannan Tian + * @brief Predictor-only Base Compressor; can also be used for dryrun. + * @version 0.3 + * @date 2021-10-05 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#include "dryrun_part.cuh" + +template class cusz::BaseCompressor::type, + ErrCtrlTrait<2>::type, + FastLowPrecisionTrait::type>>; diff --git a/qtensor/compression/cusz/src/cli/dryrun_part.cuh b/qtensor/compression/cusz/src/cli/dryrun_part.cuh new file mode 100644 index 00000000..e6fd4579 --- /dev/null +++ b/qtensor/compression/cusz/src/cli/dryrun_part.cuh @@ -0,0 +1,196 @@ +/** + * @file base_compressor.cuh + * @author Jiannan Tian + * @brief Predictor-only Base Compressor; can also be used for dryrun. 
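+ *
+ * Dry-run synopsis (a sketch mirroring the CLI driver; fname, eb and xyz are placeholders):
+ *   cusz::BaseCompressor<Predictor> analysis;
+ *   analysis.init_dualquant_dryrun(xyz);
+ *   analysis.dualquant_dryrun(fname, eb, true, stream);   // true: error bound is relative (r2r)
+ *   analysis.destroy_dualquant_dryrun();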
+ * @version 0.3 + * @date 2021-10-05 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef BASE_COMPRESSOR_CUH +#define BASE_COMPRESSOR_CUH + +#include "cli/analyzer.hh" +#include "cli/quality_viewer.hh" +#include "cli/verify.hh" +#include "common.hh" +#include "component.hh" +#include "context.hh" +#include "kernel/dryrun.cuh" +#include "stat/compare_gpu.hh" +#include "utils.hh" + +/** + * @brief bare metal, can run predictor to check data quality and compressibility + * + * @tparam T for data type + * @tparam E for error control type + */ + +namespace cusz { + +template +class BaseCompressor { + public: + using BYTE = uint8_t; + using T = typename Predictor::Origin; + using FP = typename Predictor::Precision; + using E = typename Predictor::ErrCtrl; + + private: + struct NonCritical { + Predictor* p; + Capsule original; + Capsule errctrl; // TODO change to 4-byte + Capsule outlier; + Capsule anchor; + Capsule reconst; + + NonCritical(dim3 size) { p = new Predictor; } + }; + + struct NonCritical* nc; + + protected: + cuszCTX* ctx; + + int dict_size; + double eb; + + dim3 xyz; + + public: + /** + * @brief Generic dryrun; performing predictor.construct() and .reconstruct() + * + * @param fname filename + * @param eb (host variable) error bound; future: absolute error bound only + * @param radius (host variable) limiting radius + * @param r2r if relative-to-value-range + * @param stream CUDA stream + * @return BaseCompressor& this object instance + */ + BaseCompressor& generic_dryrun(const std::string fname, double eb, int radius, bool r2r, cudaStream_t stream) + { + if (not nc) throw std::runtime_error("NonCritical struct has no instance."); + + // LOGGING(LOG_INFO, "invoke dry-run"); + + nc->original.fromfile(fname).host2device_async(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + if (r2r) { + double max, min, rng; + nc->original.prescan(max, min, rng); + eb *= rng; + } + + auto xyz = dim3(ctx->x, ctx->y, ctx->z); + + // nc->p->construct( + // LorenzoI, xyz, nc->original.dptr, nc->anchor.dptr, nc->errctrl.dptr, nc->outlier.dptr, eb, radius, + // stream); + // nc->p->reconstruct( + // LorenzoI, xyz, nc->outlier.dptr, nc->anchor.dptr, nc->errctrl.dptr, nc->reconst.dptr, eb, radius, + // stream); + + nc->reconst.device2host_async(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + cusz_stats stat; + psz::thrustgpu_assess_quality(&stat, nc->reconst.hptr(), nc->original.hptr(), nc->p->get_len_data()); + cusz::QualityViewer::print_metrics_cross(&stat, 0, true); + + return *this; + } + + /** + * @brief Dual-quant dryrun; performing integerization & its reverse procedure + * + * @param eb (host variable) error bound; future: absolute error bound only + * @param r2r if relative-to-value-range + * @param stream CUDA stream + * @return BaseCompressor& this object instance + */ + BaseCompressor& dualquant_dryrun(const std::string fname, double eb, bool r2r, cudaStream_t stream) + { + auto len = nc->original.len(); + + nc->original.fromfile(fname).host2device_async(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + if (r2r) { + double max, min, rng; + nc->original.prescan(max, min, rng); + eb *= rng; + } + + auto ebx2_r = 1 / (eb * 2); + auto ebx2 = eb * 2; + + cusz::dualquant_dryrun_kernel // + <<>> // + (nc->original.dptr(), nc->reconst.dptr(), len, ebx2_r, ebx2); + + nc->reconst.device2host_async(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + cusz_stats stat; + psz::thrustgpu_assess_quality(&stat, nc->reconst.hptr(), 
nc->original.hptr(), len); + cusz::QualityViewer::print_metrics_cross(&stat, 0, true); + + return *this; + } + + public: + BaseCompressor() = default; + + ~BaseCompressor() {} + + public: + // dry run + void init_generic_dryrun(dim3 size) + { // + auto len = size.x * size.y * size.z; + nc = new struct NonCritical(size); + + nc->original.set_len(len).mallochost().malloc(); + nc->outlier.set_len(len).mallochost().malloc(); + nc->errctrl.set_len(len).mallochost().malloc(); + nc->anchor.set_len(nc->p->get_len_anchor()).mallochost().malloc(); + nc->reconst.set_len(len).mallochost().malloc(); + } + + void destroy_generic_dryrun() + { + delete nc->p; + nc->original.freehost().free(); + nc->outlier.freehost().free(); + nc->errctrl.freehost().free(); + nc->anchor.freehost().free(); + nc->reconst.freehost().free(); + delete nc; + } + + void init_dualquant_dryrun(dim3 size) + { + auto len = size.x * size.y * size.z; + nc = new struct NonCritical(size); + nc->original.set_len(len).mallochost().malloc(); + nc->reconst.set_len(len).mallochost().malloc(); + } + + void destroy_dualquant_dryrun() + { + nc->original.freehost().free(); + nc->reconst.freehost().free(); + + delete nc; + } +}; + +} // namespace cusz + +#endif diff --git a/qtensor/compression/cusz/src/cli_bin.cu b/qtensor/compression/cusz/src/cli_bin.cu new file mode 100644 index 00000000..f3e50d64 --- /dev/null +++ b/qtensor/compression/cusz/src/cli_bin.cu @@ -0,0 +1,27 @@ +/** + * @file cusz-cli.cu + * @author Jiannan Tian + * @brief Driver program of cuSZ. + * @version 0.1 + * @date 2020-09-20 + * (created) 2019-12-30 (rev) 2022-02-20 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include "cli/cli.cuh" + +int main(int argc, char** argv) +{ + auto ctx = new cuszCTX(argc, argv); + + if (ctx->verbose) { + Diagnostics::GetMachineProperties(); + GpuDiagnostics::GetDeviceProperty(); + } + + cusz::CLI cusz_cli; + cusz_cli.dispatch(ctx); +} diff --git a/qtensor/compression/cusz/src/compressor.cc b/qtensor/compression/cusz/src/compressor.cc new file mode 100644 index 00000000..7b62db5a --- /dev/null +++ b/qtensor/compression/cusz/src/compressor.cc @@ -0,0 +1,149 @@ +/** + * @file compressor.cc + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-23 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#include "compressor.hh" +#include "common/configs.hh" +#include "framework.hh" + +namespace cusz { + +template +Compressor::~Compressor() +{ + pimpl.reset(); +} + +template +Compressor::Compressor() : pimpl{std::make_unique()} +{ +} + +template +Compressor::Compressor(const Compressor& old) : pimpl{std::make_unique(*old.pimpl)} +{ +} + +template +Compressor& Compressor::operator=(const Compressor& old) +{ + *pimpl = *old.pimpl; + return *this; +} + +template +Compressor::Compressor(Compressor&&) = default; + +template +Compressor& Compressor::operator=(Compressor&&) = default; + +//------------------------------------------------------------------------------ + +template +void Compressor::init(Context* config, bool dbg_print) +{ + pimpl->init(config, dbg_print); +} + +template +void Compressor::init(Header* config, bool dbg_print) +{ + pimpl->init(config, dbg_print); +} + +template +void Compressor::compress( + Context* config, + Compressor::T* uncompressed, + BYTE*& compressed, + size_t& compressed_len, + cudaStream_t stream, + bool dbg_print) +{ + pimpl->compress(config, 
uncompressed, compressed, compressed_len, stream, dbg_print); +} + +template +void Compressor::decompress( + Header* config, + BYTE* compressed, + Compressor::T* decompressed, + cudaStream_t stream, + bool dbg_print) +{ + pimpl->decompress(config, compressed, decompressed, stream, dbg_print); +} + +template +void Compressor::clear_buffer() +{ + pimpl->clear_buffer(); +} + +// getter + +template +void Compressor::export_header(Header& header) +{ + pimpl->export_header(header); +} + +template +void Compressor::export_header(Header* header) +{ + pimpl->export_header(header); +} + +template +void Compressor::export_timerecord(TimeRecord* ext_timerecord) +{ + pimpl->export_timerecord(ext_timerecord); +} + +} // namespace cusz + +// extra helper +namespace cusz { + +int CompressorHelper::autotune_coarse_parvle(Context* ctx) +{ + auto tune_coarse_huffman_sublen = [](size_t len) { + int current_dev = 0; + cudaSetDevice(current_dev); + cudaDeviceProp dev_prop{}; + cudaGetDeviceProperties(&dev_prop, current_dev); + + auto nSM = dev_prop.multiProcessorCount; + auto allowed_block_dim = dev_prop.maxThreadsPerBlock; + auto deflate_nthread = allowed_block_dim * nSM / HuffmanHelper::DEFLATE_CONSTANT; + auto optimal_sublen = ConfigHelper::get_npart(len, deflate_nthread); + optimal_sublen = ConfigHelper::get_npart(optimal_sublen, HuffmanHelper::BLOCK_DIM_DEFLATE) * + HuffmanHelper::BLOCK_DIM_DEFLATE; + + return optimal_sublen; + }; + + auto get_coarse_pardeg = [&](size_t len, int& sublen, int& pardeg) { + sublen = tune_coarse_huffman_sublen(len); + pardeg = ConfigHelper::get_npart(len, sublen); + }; + + // TODO should be move to somewhere else, e.g., cusz::par_optmizer + if (ctx->use.autotune_vle_pardeg) + get_coarse_pardeg(ctx->data_len, ctx->vle_sublen, ctx->vle_pardeg); + else + ctx->vle_pardeg = ConfigHelper::get_npart(ctx->data_len, ctx->vle_sublen); + + return ctx->vle_pardeg; +} + +} // namespace cusz + +template class cusz::Compressor>; diff --git a/qtensor/compression/cusz/src/context.cc b/qtensor/compression/cusz/src/context.cc new file mode 100644 index 00000000..c85f3d24 --- /dev/null +++ b/qtensor/compression/cusz/src/context.cc @@ -0,0 +1,493 @@ +/** + * @file argparse.cc + * @author Jiannan Tian + * @brief Argument parser. 
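+ *
+ * Example command lines handled by this parser (sketch; file name and sizes are placeholders):
+ *   cusz -t f32 -m r2r -e 1e-4 -i ./data.f32 -l 3600x1800 -z --report time
+ *   cusz -i ./data.f32.cusza -x --compare ./data.f32 --report time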
+ * @version 0.1 + * @date 2020-09-20 + * Created on: 20-04-24 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include +#include +#include +#include +#include +#include + +#include "cli/document.hh" +#include "context.hh" + +namespace cusz { +const char* VERSION_TEXT = "2023-01-23 (unstable; pre-0.4)"; +const int VERSION = 20230123; +const int COMPATIBILITY = 0; +} // namespace cusz + +namespace { + +void set_preprocess(cusz::context_t ctx, const char* in_str) +{ + str_list opts; + StrHelper::parse_strlist(in_str, opts); + + for (auto k : opts) { + // TODO + } +} + +void set_report(cusz::context_t ctx, const char* in_str) +{ + str_list opts; + StrHelper::parse_strlist(in_str, opts); + + for (auto o : opts) { + if (StrHelper::is_kv_pair(o)) { + auto kv = StrHelper::parse_kv_onoff(o); + + if (kv.first == "cr") + ctx->report.cr = kv.second; + else if (kv.first == "compressibility") + ctx->report.compressibility = kv.second; + else if (kv.first == "time") + ctx->report.time = kv.second; + } + else { + if (o == "cr") + ctx->report.cr = true; + else if (o == "compressibility") + ctx->report.compressibility = true; + else if (o == "time") + ctx->report.time = true; + } + } +} + +void set_config(cusz::context_t ctx, const char* in_str, bool dbg_print = false) +{ + map_t opts; + StrHelper::parse_strlist_as_kv(in_str, opts); + + if (dbg_print) { + for (auto kv : opts) printf("%-*s %-s\n", 10, kv.first.c_str(), kv.second.c_str()); + std::cout << "\n"; + } + + std::string k, v; + char* end; + + auto optmatch = [&](std::vector vs) -> bool { return ConfigHelper::check_opt_in_list(k, vs); }; + auto is_enabled = [&](auto& v) -> bool { return v == "on" or v == "ON"; }; + + for (auto kv : opts) { + k = kv.first; + v = kv.second; + + if (optmatch({"type", "dtype"})) { + ConfigHelper::check_dtype(v, false); + ctx->dtype = v; + } + else if (optmatch({"eb", "errorbound"})) { + ctx->eb = StrHelper::str2fp(v); + } + else if (optmatch({"mode"})) { + ConfigHelper::check_cuszmode(v, true); + ctx->mode = v; + } + else if (optmatch({"len", "length"})) { + cuszCTX::parse_input_length(v.c_str(), ctx); + } + else if (optmatch({"alloclen"})) { + ctx->alloclen.len = StrHelper::str2int(v); + } + else if (optmatch({"demo"})) { + ctx->use.predefined_demo = true; + ctx->demo_dataset = std::string(v); + ctx->load_demo_sizes(); + } + else if (optmatch({"cap", "booklen", "dictsize"})) { + ctx->dict_size = StrHelper::str2int(v); + ctx->radius = ctx->dict_size / 2; + } + else if (optmatch({"radius"})) { + ctx->radius = StrHelper::str2int(v); + ctx->dict_size = ctx->radius * 2; + } + else if (optmatch({"huffbyte"})) { + ctx->huff_bytewidth = StrHelper::str2int(v); + ctx->codecs_in_use = ctx->codec_force_fallback() ? 
0b11 /*use both*/ : 0b01 /*use 4-byte*/; + } + else if (optmatch({"huffchunk"})) { + ctx->vle_sublen = StrHelper::str2int(v); + ctx->use.autotune_vle_pardeg = false; + } + else if (optmatch({"predictor"})) { + ctx->predictor = std::string(v); + } + else if (optmatch({"codec"})) { + // placeholder + } + else if (optmatch({"spcodec"})) { + // placeholder + } + else if (optmatch({"anchor"}) and is_enabled(v)) { + ctx->use.anchor = true; + } + else if (optmatch({"nondestructive"}) and is_enabled(v)) { + // placeholder + } + else if (optmatch({"failfast"}) and is_enabled(v)) { + // placeholder + } + else if (optmatch({"releaseinput"}) and is_enabled(v)) { + ctx->use.release_input = true; + } + else if (optmatch({"pipeline"})) { + ctx->pipeline = v; + } + else if (optmatch({"density"})) { // refer to `SparseMethodSetup` in `config.hh` + ctx->nz_density = StrHelper::str2fp(v); + ctx->nz_density_factor = 1 / ctx->nz_density; + } + else if (optmatch({"densityfactor"})) { // refer to `SparseMethodSetup` in `config.hh` + ctx->nz_density_factor = StrHelper::str2fp(v); + ctx->nz_density = 1 / ctx->nz_density_factor; + } + else if (optmatch({"gpuverify"}) and is_enabled(v)) { + ctx->use.gpu_verify = true; + } + + // when to enable anchor + if (ctx->predictor == "spline3") { + // unconditionally use anchor when it is spline3 + ctx->use.anchor = true; + } + } +} + +void set_from_cli_input(cusz::context_t ctx, int const argc, char** const argv) +{ + int i = 1; + + auto check_next = [&]() { + if (i + 1 >= argc) throw std::runtime_error("out-of-range at" + std::string(argv[i])); + }; + + std::string opt; + auto optmatch = [&](std::vector vs) -> bool { return ConfigHelper::check_opt_in_list(opt, vs); }; + + while (i < argc) { + if (argv[i][0] == '-') { + opt = std::string(argv[i]); + + if (optmatch({"-c", "--config"})) { + check_next(); + set_config(ctx, argv[++i]); + } + else if (optmatch({"-R", "--report"})) { + check_next(); + set_report(ctx, argv[++i]); + } + else if (optmatch({"-h", "--help"})) { + cusz::Context::print_doc(true); + exit(0); + } + else if (optmatch({"-v", "--version"})) { + std::cout << ">>>> cusz build: " << cusz::VERSION_TEXT << "\n"; + exit(0); + } + else if (optmatch({"-m", "--mode"})) { + check_next(); + ctx->mode = std::string(argv[++i]); + if (ctx->mode == "r2r") ctx->preprocess.prescan = true; + } + else if (optmatch({"-e", "--eb", "--error-bound"})) { + check_next(); + char* end; + ctx->eb = std::strtod(argv[++i], &end); + } + else if (optmatch({"-p", "--predictor"})) { + check_next(); + ctx->predictor = std::string(argv[++i]); + } + else if (optmatch({"-c", "--codec"})) { + check_next(); + // placeholder + } + else if (optmatch({"-s", "--spcodec"})) { + check_next(); + // placeholder + } + else if (optmatch({"-t", "--type", "--dtype"})) { + check_next(); + std::string s = std::string(std::string(argv[++i])); + if (s == "f32" or s == "fp4") + ctx->dtype = "f32"; + else if (s == "f64" or s == "fp8") + ctx->dtype = "f64"; + } + else if (optmatch({"-i", "--input"})) { + check_next(); + ctx->fname.fname = std::string(argv[++i]); + } + else if (optmatch({"-l", "--len"})) { + check_next(); + cusz::Context::parse_input_length(argv[++i], ctx); + } + else if (optmatch({"-L", "--allocation-len"})) { + check_next(); + // placeholder + } + else if (optmatch({"-z", "--zip", "--compress"})) { + ctx->cli_task.construct = true; + } + else if (optmatch({"-x", "--unzip", "--decompress"})) { + ctx->cli_task.reconstruct = true; + } + else if (optmatch({"-r", "--dry-run"})) { + ctx->cli_task.dryrun = 
true; + } + else if (optmatch({"--anchor"})) { + ctx->use.anchor = true; + } + else if (optmatch({"--nondestructive", "--input-nondestructive"})) { + // placeholder + } + else if (optmatch({"--failfast"})) { + // placeholder + } + else if (optmatch({"-P", "--pre", "--preprocess"})) { + check_next(); + std::string pre(argv[++i]); + if (pre.find("binning") != std::string::npos) { ctx->preprocess.binning = true; } + } + else if (optmatch({"-T", "--post", "--postprocess"})) { + check_next(); + std::string post(argv[++i]); + if (post.find("gzip") != std::string::npos) { ctx->postcompress.cpu_gzip = true; } + if (post.find("nvcomp") != std::string::npos) { ctx->postcompress.gpu_nvcomp_cascade = true; } + } + else if (optmatch({"-V", "--verbose"})) { + ctx->verbose = true; + } + else if (optmatch({"--pipeline"})) { + check_next(); + ctx->pipeline = std::string(argv[++i]); + } + else if (optmatch({"--demo"})) { + check_next(); + ctx->use.predefined_demo = true; + ctx->demo_dataset = std::string(argv[++i]); + ctx->load_demo_sizes(); + } + else if (optmatch({"-S", "-X", "--skip", "--exclude"})) { + check_next(); + std::string exclude(argv[++i]); + if (exclude.find("huffman") != std::string::npos) { ctx->skip.huffman = true; } + if (exclude.find("write2disk") != std::string::npos) { ctx->skip.write2disk = true; } + } + else if (optmatch({"--opath"})) { + check_next(); + ctx->opath = std::string(argv[++i]); + } + else if (optmatch({"--origin", "--compare"})) { + check_next(); + ctx->fname.origin_cmp = std::string(argv[++i]); + } + else { + const char* notif_prefix = "invalid option value at position "; + char* notif; + int size = asprintf(¬if, "%d: %s", i, argv[i]); + cerr << LOG_ERR << notif_prefix << "\e[1m" << notif << "\e[0m" + << "\n"; + cerr << std::string(LOG_NULL.length() + strlen(notif_prefix), ' '); + cerr << "\e[1m"; + cerr << std::string(strlen(notif), '~'); + cerr << "\e[0m\n"; + + ctx->trap(-1); + } + } + else { + const char* notif_prefix = "invalid option at position "; + char* notif; + int size = asprintf(¬if, "%d: %s", i, argv[i]); + cerr << LOG_ERR << notif_prefix << "\e[1m" << notif + << "\e[0m" + "\n" + << std::string(LOG_NULL.length() + strlen(notif_prefix), ' ') // + << "\e[1m" // + << std::string(strlen(notif), '~') // + << "\e[0m\n"; + + ctx->trap(-1); + } + i++; + } +} + +} // namespace + +cuszCTX& cuszCTX::set_control_string(const char* in_str) +{ + set_config(this, in_str); + return *this; +} + +void cuszCTX::load_demo_sizes() +{ + const std::unordered_map> dataset_entries = { + {std::string("hacc"), {280953867, 1, 1, 1, 1}}, {std::string("hacc1b"), {1073726487, 1, 1, 1, 1}}, + {std::string("cesm"), {3600, 1800, 1, 1, 2}}, {std::string("hurricane"), {500, 500, 100, 1, 3}}, + {std::string("nyx-s"), {512, 512, 512, 1, 3}}, {std::string("nyx-m"), {1024, 1024, 1024, 1, 3}}, + {std::string("qmc"), {288, 69, 7935, 1, 3}}, {std::string("qmcpre"), {69, 69, 33120, 1, 3}}, + {std::string("exafel"), {388, 59200, 1, 1, 2}}, {std::string("rtm"), {235, 849, 849, 1, 3}}, + {std::string("parihaka"), {1168, 1126, 922, 1, 3}}}; + + if (not demo_dataset.empty()) { + auto f = dataset_entries.find(demo_dataset); + if (f == dataset_entries.end()) throw std::runtime_error("no such dataset as" + demo_dataset); + auto demo_xyzw = f->second; + + x = demo_xyzw[0], y = demo_xyzw[1], z = demo_xyzw[2], w = demo_xyzw[3]; + ndim = demo_xyzw[4]; + } + data_len = x * y * z * w; +} + +void cuszCTX::trap(int _status) { this->read_args_status = _status; } + +void cuszCTX::validate() +{ + bool to_abort = false; 
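+    // The checks below require an input file, a plausible data length, a known dtype, and at
+    // least one task among compress (-z), decompress (-x) and dry-run (-r); conflicting
+    // dry-run combinations are reduced to dry-run only before aborting on hard errors.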
+ if (fname.fname.empty()) { + cerr << LOG_ERR << "must specify input file" << endl; + to_abort = true; + } + + if (data_len == 1 and not use.predefined_demo) { + if (cli_task.construct or cli_task.dryrun) { + cerr << LOG_ERR << "wrong input size" << endl; + to_abort = true; + } + } + if (not cli_task.construct and not cli_task.reconstruct and not cli_task.dryrun) { + cerr << LOG_ERR << "select compress (-z), decompress (-x) or dry-run (-r)" << endl; + to_abort = true; + } + if (false == ConfigHelper::check_dtype(dtype, false)) { + if (cli_task.construct or cli_task.dryrun) { + std::cout << dtype << endl; + cerr << LOG_ERR << "must specify data type" << endl; + to_abort = true; + } + } + + if (quant_bytewidth == 1) + assert(dict_size <= 256); + else if (quant_bytewidth == 2) + assert(dict_size <= 65536); + + if (cli_task.dryrun and cli_task.construct and cli_task.reconstruct) { + cerr << LOG_WARN << "no need to dry-run, compress and decompress at the same time" << endl; + cerr << LOG_WARN << "dryrun only" << endl << endl; + cli_task.construct = false; + cli_task.reconstruct = false; + } + else if (cli_task.dryrun and cli_task.construct) { + cerr << LOG_WARN << "no need to dry-run and compress at the same time" << endl; + cerr << LOG_WARN << "dryrun only" << endl << endl; + cli_task.construct = false; + } + else if (cli_task.dryrun and cli_task.reconstruct) { + cerr << LOG_WARN << "no need to dry-run and decompress at the same time" << endl; + cerr << LOG_WARN << "will dryrun only" << endl << endl; + cli_task.reconstruct = false; + } + + if (to_abort) { + print_doc(); + exit(-1); + } +} + +cuszCTX::cuszCTX(int argc, char** const argv) +{ + std::string opt; + auto optmatch = [&](std::vector vs) -> bool { return ConfigHelper::check_opt_in_list(opt, vs); }; + + if (argc == 1) { + print_doc(); + exit(0); + } + + /******************************************************************************/ + /* phase 0: parse */ + set_from_cli_input(this, argc, argv); + + // special treatment + if (predictor == "spline3") { + // unconditionally use anchor when it is spline3 + use.anchor = true; + } + + /******************************************************************************/ + /* phase 1: check syntax */ + if (read_args_status != 0) { + std::cout << LOG_INFO << "Exiting..." 
<< endl; + // after printing ALL argument errors + exit(-1); + } + + /******************************************************************************/ + /* phase 2: check if legal */ + validate(); + + /******************************************************************************/ + /* phase 3: sort out filenames */ + derive_fnames(); +} + +cuszCTX::cuszCTX(const char* in_str, bool dbg_print) +{ + /** + ** >>> syntax + ** comma-separated key-pairs + ** "key1=val1,key2=val2[,...]" + ** + ** >>> example + ** "predictor=lorenzo,size=3600x1800" + ** + **/ + + set_config(this, in_str, dbg_print); +} + +void cuszCTX::print_doc(bool full) +{ + std::cout << "\n>>>> cusz build: " << cusz::VERSION_TEXT << "\n"; + + if (full) + std::cout << StrHelper::doc_format(cusz_full_doc) << std::endl; + else + std::cout << cusz_short_doc << std::endl; +} + +void cuszCTX::derive_fnames() +{ + // (1) "fname" -> "", "fname" + // (2) "./fname" -> "./" "fname" + // (3) "/path/to/fname" -> "/path/to", "fname" + auto input_path = fname.fname.substr(0, fname.fname.rfind('/') + 1); + if (not cli_task.construct and cli_task.reconstruct) fname.fname = fname.fname.substr(0, fname.fname.rfind('.')); + fname.basename = fname.fname.substr(fname.fname.rfind('/') + 1); + + if (opath.empty()) opath = input_path.empty() ? opath = "" : opath = input_path; + opath += "/"; + + fname.path_basename = opath + fname.basename; + fname.compress_output = fname.path_basename + ".cusza"; +} diff --git a/qtensor/compression/cusz/src/cusz/custom.cc b/qtensor/compression/cusz/src/cusz/custom.cc new file mode 100644 index 00000000..ad9eff89 --- /dev/null +++ b/qtensor/compression/cusz/src/cusz/custom.cc @@ -0,0 +1,34 @@ +/** + * @file custom.cc + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-30 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#include "cusz/custom.h" + +extern "C" { + +cusz_custom_predictor cusz_default_predictor() { return {LorenzoI, false, false}; } +cusz_custom_quantization cusz_default_quantization() { return {512, false}; } +cusz_custom_codec cusz_default_codec() { return {Huffman, true, 0.5}; } +cusz_custom_huffman_codec cusz_default_huffman_codec() { return {Canonical, Device, Coarse, 1024, 768}; } +cusz_custom_spcodec cusz_default_spcodec() { return {SparseMat, 0.2}; } +cusz_custom_framework* cusz_default_framework() +{ + return new cusz_custom_framework{ + FP32, // placeholder; set in another function call + Auto, cusz_default_predictor(), cusz_default_quantization(), cusz_default_codec(), + // cusz_default_spcodec(), + cusz_default_huffman_codec()}; +} + +void cusz_set_datatype(cusz_custom_framework* config, cusz_datatype datatype) { config->datatype = datatype; } +void cusz_set_pipelinetype(cusz_custom_framework* config, cusz_pipelinetype pipeline) { config->pipeline = pipeline; } + +// end of extern C +} diff --git a/qtensor/compression/cusz/src/cusz_lib.cc b/qtensor/compression/cusz/src/cusz_lib.cc new file mode 100644 index 00000000..723b80b1 --- /dev/null +++ b/qtensor/compression/cusz/src/cusz_lib.cc @@ -0,0 +1,115 @@ +/** + * @file cusz_lib.cc + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-05-01 + * (rev.1) 2023-01-29 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#include + +#include +#include + +#include "component.hh" +#include "compressor.hh" +#include "context.hh" +#include "cusz.h" +#include "cusz/custom.h" +#include "cusz/type.h" +#include "framework.hh" + +cusz_compressor* 
cusz_create(cusz_framework* _framework, cusz_datatype _type) +{ + auto comp = new cusz_compressor{.framework = _framework, .type = _type}; + + if (comp->type == FP32) { + using DATA = float; + using Compressor = cusz::CompressorFP32; + + comp->compressor = new Compressor(); + } + else { + throw std::runtime_error("Type is not supported."); + } + + return comp; +} + +cusz_error_status cusz_release(cusz_compressor* comp) +{ + delete comp; + return CUSZ_SUCCESS; +} + +cusz_error_status cusz_compress( + cusz_compressor* comp, + cusz_config* config, + void* uncompressed, + cusz_len const uncomp_len, + uint8_t** compressed, + size_t* comp_bytes, + cusz_header* header, + void* record, + cudaStream_t stream) +{ + // cusz::TimeRecord cpp_record; + + auto context = new cusz_context(); + (*context) + .set_len(uncomp_len.x, uncomp_len.y, uncomp_len.z, uncomp_len.w) + .set_eb(config->eb) + .set_control_string(config->eb == Rel ? "mode=r2r" : "mode=abs"); + + // Be cautious of autotuning! The default value of pardeg is not robust. + cusz::CompressorHelper::autotune_coarse_parvle(static_cast(context)); + + if (comp->type == FP32) { + using DATA = float; + using Compressor = cusz::CompressorFP32; + + // TODO add memlen & datalen comparison + static_cast(comp->compressor)->init(context); + static_cast(comp->compressor) + ->compress(context, static_cast(uncompressed), *compressed, *comp_bytes, stream); + static_cast(comp->compressor)->export_header(*header); + static_cast(comp->compressor)->export_timerecord((cusz::TimeRecord*)record); + } + else { + throw std::runtime_error(std::string(__FUNCTION__) + ": Type is not supported."); + } + + return CUSZ_SUCCESS; +} + +cusz_error_status cusz_decompress( + cusz_compressor* comp, + cusz_header* header, + uint8_t* compressed, + size_t const comp_len, + void* decompressed, + cusz_len const decomp_len, + void* record, + cudaStream_t stream) +{ + // cusz::TimeRecord cpp_record; + + if (comp->type == FP32) { + using DATA = float; + using Compressor = cusz::CompressorFP32; + + static_cast(comp->compressor)->init(header); + static_cast(comp->compressor) + ->decompress(header, compressed, static_cast(decompressed), stream); + static_cast(comp->compressor)->export_timerecord((cusz::TimeRecord*)record); + } + else { + throw std::runtime_error(std::string(__FUNCTION__) + ": Type is not supported."); + } + + return CUSZ_SUCCESS; +} \ No newline at end of file diff --git a/qtensor/compression/cusz/src/cusz_version.h.in b/qtensor/compression/cusz/src/cusz_version.h.in new file mode 100644 index 00000000..1bd3344f --- /dev/null +++ b/qtensor/compression/cusz/src/cusz_version.h.in @@ -0,0 +1,3 @@ +#define CUSZ_MAJOR_VERSION @PROJECT_VERSION_MAJOR@ +#define CUSZ_MINOR_VERSION @PROJECT_VERSION_MINOR@ +#define CUSZ_PATCH_VERSION @PROJECT_VERSION_PATCH@ diff --git a/qtensor/compression/cusz/src/cusz_wrapper.cu b/qtensor/compression/cusz/src/cusz_wrapper.cu new file mode 100644 index 00000000..2827123d --- /dev/null +++ b/qtensor/compression/cusz/src/cusz_wrapper.cu @@ -0,0 +1,154 @@ +//#include "cuszx_entry.h" +//#include "szx_defines.h" +//#include "szx_BytesToolkit.h" +//#include "szx_TypeManager.h" +//#include "timingGPU.h" + +#include "cusz.h" +#include "cli/quality_viewer.hh" +#include "cli/timerecord_viewer.hh" +#include "utils/io.hh" +#include "utils/print_gpu.hh" + +// template +extern "C"{ +unsigned char* cusz_device_compress(float *data, float r2r_error,size_t len,size_t *outSize) +{ + /* For demo, we use 3600x1800 CESM data. 
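+       In this qtensor wrapper the input is actually treated as a flat 1-D buffer of `len` floats
+       (uncomp_len = {len, 1, 1, 1} below); the CESM note above is a leftover from the upstream
+       cuSZ demo.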
*/ + + cusz_header header; + uint8_t* exposed_compressed; + uint8_t* compressed; + size_t compressed_len; + + float *d_uncompressed, *h_uncompressed; + float *d_decompressed, *h_decompressed; + + d_uncompressed = data; + + cudaStream_t stream; + cudaStreamCreate(&stream); + + // using default + // cusz_framework* framework = cusz_default_framework(); + // alternatively + cusz_framework fw = cusz_framework{ + .pipeline = Auto, + .predictor = cusz_custom_predictor{.type = LorenzoI}, + .quantization = cusz_custom_quantization{.radius = 512}, + .codec = cusz_custom_codec{.type = Huffman}}; + cusz_framework* framework = &fw; + + // Brace initializing a struct pointer is not supported by all host compilers + // when nvcc forwards. + // cusz_framework* framework = new cusz_framework{ + // .pipeline = Auto, + // .predictor = cusz_custom_predictor{.type = LorenzoI}, + // .quantization = cusz_custom_quantization{.radius = 512}, + // .codec = cusz_custom_codec{.type = Huffman}}; + + + cusz_compressor* comp = cusz_create(framework, FP32); + cusz_config* config = new cusz_config{.eb = r2r_error, .mode = Rel}; + cusz_len uncomp_len = cusz_len{len, 1, 1, 1}; // x, y, z, w + cusz_len decomp_len = uncomp_len; + + cusz::TimeRecord compress_timerecord; + + + { + cusz_compress( + comp, config, d_uncompressed, uncomp_len, &exposed_compressed, &compressed_len, &header, + (void*)&compress_timerecord, stream); + + /* User can interpret the collected time information in other ways. */ + cusz::TimeRecordViewer::view_compression(&compress_timerecord, len * sizeof(float), compressed_len); + + /* verify header */ + printf("header.%-*s : %x\n", 12, "(addr)", &header); + printf("header.%-*s : %lu, %lu, %lu\n", 12, "{x,y,z}", header.x, header.y, header.z); + printf("header.%-*s : %lu\n", 12, "filesize", ConfigHelper::get_filesize(&header)); + } + + /* If needed, User should perform a memcopy to transfer `exposed_compressed` before `compressor` is destroyed. 
*/ + cudaMalloc(&compressed, compressed_len); + cudaMemcpy(compressed, exposed_compressed, compressed_len, cudaMemcpyDeviceToDevice); + cudaFree(exposed_compressed); + cudaStreamDestroy(stream); + *outSize = compressed_len; + return compressed; +} + +float* cusz_device_decompress(uint8_t* cmpbytes, size_t len, size_t compressed_len, float r2r_error){ + cusz::TimeRecord decompress_timerecord; + cudaStream_t stream; + cusz_header header; + float* d_decompressed; + cudaMalloc(&d_decompressed, sizeof(float) * len); + + cusz_framework fw = cusz_framework{ + .pipeline = Auto, + .predictor = cusz_custom_predictor{.type = LorenzoI}, + .quantization = cusz_custom_quantization{.radius = 512}, + .codec = cusz_custom_codec{.type = Huffman}}; + cusz_framework* framework = &fw; + + cusz_compressor* comp = cusz_create(framework, FP32); + cusz_config* config = new cusz_config{.eb = r2r_error, .mode = Rel}; + cusz_len uncomp_len = cusz_len{len, 1, 1, 1}; // x, y, z, w + cusz_len decomp_len = uncomp_len; + + + cudaStreamCreate(&stream); + { + cusz_decompress( + comp, &header, cmpbytes, compressed_len, d_decompressed, decomp_len, + (void*)&decompress_timerecord, stream); + + cusz::TimeRecordViewer::view_decompression(&decompress_timerecord, len * sizeof(float)); + } + + + cusz_release(comp); + + // cudaFree(cmpbytes); + cudaStreamDestroy(stream); + return d_decompressed; +} + + + // unsigned char* cuSZx_integrated_compress(float *data, float r2r_threshold, float r2r_err, size_t nbEle, int blockSize, size_t *outSize){ + // float max,min; + // unsigned char* bytes; + // max = data[0]; + // min = data[0]; + // for (size_t i = 0; i < nbEle; i++) + // { + // if(data[i] > max) max = data[i]; + // if(data[i] < min) min = data[i]; + // } + + // float threshold = r2r_threshold*(max-min); + // float errBound = r2r_err*(max-min); + // bytes = cuSZx_fast_compress_args_unpredictable_blocked_float(data, outSize, errBound, nbEle, blockSize, threshold); + // // printf("outSize %p\n", bytes); + // return bytes; + // } + + // float* cuSZx_integrated_decompress(unsigned char *bytes, size_t nbEle){ + // // printf("test\n"); + // float**data; + // cuSZx_fast_decompress_args_unpredictable_blocked_float(data, nbEle, bytes); + // return *data; + // } + + // unsigned char* cuSZx_device_compress(float *oriData, size_t *outSize, float absErrBound, size_t nbEle, int blockSize, float threshold){ + // return device_ptr_cuSZx_compress_float(oriData, outSize, absErrBound, nbEle, blockSize, threshold); + // } + + // float* cuSZx_device_decompress(size_t nbEle, unsigned char* cmpBytes){ + // return device_ptr_cuSZx_decompress_float(nbEle, cmpBytes); + // } + + +} diff --git a/qtensor/compression/cusz/src/cusz_wrapper.py b/qtensor/compression/cusz/src/cusz_wrapper.py new file mode 100644 index 00000000..e588c492 --- /dev/null +++ b/qtensor/compression/cusz/src/cusz_wrapper.py @@ -0,0 +1,173 @@ +import numpy as np +import ctypes +from ctypes import * +import random +from qtensor.tools.lazy_import import cupy as cp +import time +import torch + +from pathlib import Path +LIB_PATH = str(Path(__file__).parent/'libcusz_wrapper.so') +CUSZ_PATH = str(Path(__file__).parent/'libcusz.so') +# unsigned char* cuSZx_integrated_compress(float *data, float r2r_threshold, float r2r_err, size_t nbEle, int blockSize, size_t *outSize) + +# unsigned char* cusz_device_compress(float *data, float r2r_error,size_t len,size_t *outSize) + +def get_device_compress(): + dll_base = ctypes.CDLL(CUSZ_PATH, mode=ctypes.RTLD_GLOBAL) + dll = ctypes.CDLL(LIB_PATH, 
mode=ctypes.RTLD_GLOBAL) + func = dll.cusz_device_compress + # Returns: unsigned char *bytes + # Needs: float *data, float r2r_error,size_t len,size_t *outSize + func.argtypes = [POINTER(c_float), c_float, c_size_t, POINTER(c_size_t)] + func.restype = POINTER(c_ubyte) + return func + +# float* cusz_device_decompress(uint8_t* cmpbytes, size_t len, size_t compressed_len, float r2r_error){ + +def get_device_decompress(): + + dll_base = ctypes.CDLL(CUSZ_PATH, mode=ctypes.RTLD_GLOBAL) + dll = ctypes.CDLL(LIB_PATH, mode=ctypes.RTLD_GLOBAL) + func = dll.cusz_device_decompress + # Returns: float *newData + # Needs: size_t nbEle, unsigned char *cmpBytes + func.argtypes = [POINTER(c_ubyte), c_size_t, c_size_t, c_float] + func.restype = POINTER(c_float) + return func + + +def cusz_device_compress(oriData, absErrBound, nbEle, blockSize,threshold): + __cuszx_device_compress = get_device_compress() + #print(nbEle) + ori_nbEle = nbEle + variable = ctypes.c_size_t(0) + outSize = ctypes.pointer(variable) + + oriData = oriData.flatten() + ori_real = oriData.real + ori_imag = oriData.imag + oriData = cp.concatenate((ori_real, ori_imag)) + #nbEle = len(oriData) + sample = oriData[::2] + #print(nbEle) + d = cp.amax(oriData) - cp.amin(oriData) + #print("max min time (s): " +str(time.time()-v_time)) + d = d.get() + if d.dtype == np.complex64: + #d = min(d.real, d.imag) + d = d.real + # absErrBound = absErrBound*(d) + threshold = threshold*(d) + s_1 = time.time() + #print(cp.get_array_module(oriData)) + truth_values = abs(oriData)<=threshold + oriData[truth_values] = 0.0 + + nbEle = oriData.shape[0] + + + oriData_p = ctypes.cast(oriData.data.ptr, ctypes.POINTER(c_float)) + #print("starting") + # float *data, float r2r_error,size_t len,size_t *outSize + o_bytes = __cuszx_device_compress(oriData_p,np.float32(absErrBound), np.ulonglong(nbEle), outSize) + + + return (o_bytes,outSize.contents.value, absErrBound), outSize + + +def cusz_device_decompress(nbEle, cmpBytes, owner, dtype): + __cuszx_device_decompress=get_device_decompress() + (cmpBytes, cmpsize, err_bound) = cmpBytes + + nbEle_p = ctypes.c_size_t(nbEle) + # uint8_t* cmpbytes, size_t len, size_t compressed_len, float r2r_error + newData = __cuszx_device_decompress(cmpBytes,nbEle_p, ctypes.c_size_t(cmpsize), np.float32(err_bound)) + + # decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, num_elements_eff) + # -- Workaround to convert GPU pointer to int + p_decompressed_ptr = ctypes.addressof(newData) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + decompressed_int = p_decompressed_int.contents + # -- + pointer_for_free = decompressed_int.value + # self.decompressed_own.append(decompressed_int.value) + mem = cp.cuda.UnownedMemory(decompressed_int.value, nbEle*4, owner, device_id=0) + mem_ptr = cp.cuda.memory.MemoryPointer(mem, 0) + #print("mem ptr") + #print(mem_ptr) + arr = cp.ndarray(shape=(nbEle,), dtype=np.float32, memptr=mem_ptr) + + # res = cp.zeros((nbEle,)) + # ## need to convert newData to cupy + # cp.place(res,bitmap,arr) + + c_res = cp.zeros(int(nbEle/2), np.complex64) + c_res.real = arr[0:int(nbEle/2)] + c_res.imag = arr[int(nbEle/2):] + return (c_res, pointer_for_free) + +### Example of device compress/decompress wrapper usage +class Comp(): + def __init__(self): + self.name = "dummy" + +def free_compressed(ptr): + p_ptr = ctypes.addressof(ptr) + p_int = ctypes.cast(p_ptr, 
ctypes.POINTER(ctypes.c_uint64)) + decomp_int = p_int.contents + cp.cuda.runtime.free(decomp_int.value) + + +if __name__ == "__main__": + + DATA_SIZE = int(1024) + MAX_D = 10.0 + MIN_D = -10.0 + RANGE = MAX_D - MIN_D + r2r_threshold = 0.002 + r2r_error = 0.0001 + + in_vector = np.fromfile("all_sample.bin", dtype=np.complex64) + #print(np.max(in_vector)) + DATA_SIZE = len(in_vector) + #range_vr = np.max(in_vector)-np.min(in_vector) + #r2r_threshold = r2r_threshold*range_vr + #r2r_error = r2r_error*range_vr + #in_vector = np.zeros((DATA_SIZE,)) + #for i in range(0,int(DATA_SIZE/4)): + # in_vector[i] = 0.0 + #for i in range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): + # in_vector[i] = 5.0 + #for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): + # in_vector[i] = random.uniform(MIN_D, MAX_D) + #for i in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): + # in_vector[i] = -7.0 + #for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): + # in_vector[i] = 0.001 + + print(DATA_SIZE) + #in_vector = in_vector.astype('float32') + in_vector_gpu = cp.asarray(in_vector) + + # variable = ctypes.c_size_t(0) + # outSize = ctypes.pointer(variable) + for i in range(200): + s_time = time.time() + o_bytes, outSize = cusz_device_compress(in_vector_gpu, r2r_error, DATA_SIZE, 256, r2r_threshold) + print("Time python: "+str(time.time()-s_time)) + print(outSize[0]) + print("Compress Success...starting decompress ") + comp = Comp() + + s_time = time.time() + (d_bytes,ptr )= cusz_device_decompress(DATA_SIZE*2, o_bytes, comp, in_vector_gpu.dtype) + + free_compressed(o_bytes[0]) + cp.cuda.runtime.free(ptr) + print("Time python: "+str(time.time()-s_time)) + #for i in d_bytes: + # print(i) + print("Decompress Success") diff --git a/qtensor/compression/cusz/src/detail/compare_cpu.inl b/qtensor/compression/cusz/src/detail/compare_cpu.inl new file mode 100644 index 00000000..1617fc38 --- /dev/null +++ b/qtensor/compression/cusz/src/detail/compare_cpu.inl @@ -0,0 +1,109 @@ +/** + * @file _compare.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-08 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef C0E747B4_066F_4B04_A3D2_00E1A3B7D682 +#define C0E747B4_066F_4B04_A3D2_00E1A3B7D682 + +#include +#include +#include +#include +#include "cusz/type.h" + +namespace psz { +namespace detail { + +template +bool cppstd_identical(T* d1, T* d2, size_t const len) +{ + return std::equal(d1, d1 + len, d2); +} + +template +bool cppstd_error_bounded(T* a, T* b, size_t const len, double const eb, size_t* first_faulty_idx = nullptr) +{ + // debugging + + bool eb_ed = true; + for (size_t i = 0; i < len; i++) { + if (fabs(a[i] - b[i]) > 1.001 * eb) { + if (first_faulty_idx) *first_faulty_idx = i; + return false; + } + } + return true; +} + +template +void cppstd_assess_quality(cusz_stats* s, T* xdata, T* odata, size_t const len) +{ + double max_odata = odata[0], min_odata = odata[0]; + double max_xdata = xdata[0], min_xdata = xdata[0]; + double max_abserr = max_abserr = fabs(xdata[0] - odata[0]); + + double sum_0 = 0, sum_x = 0; + for (size_t i = 0; i < len; i++) sum_0 += odata[i], sum_x += xdata[i]; + + double mean_odata = sum_0 / len, mean_xdata = sum_x / len; + double sum_var_odata = 0, sum_var_xdata = 0, sum_err2 = 0, sum_corr = 0, rel_abserr = 0; + + double max_pwrrel_abserr = 0; + size_t max_abserr_index = 0; + for (size_t i = 0; i < len; i++) { + max_odata = max_odata < odata[i] ? odata[i] : max_odata; + min_odata = min_odata > odata[i] ? 
odata[i] : min_odata; + + max_xdata = max_xdata < odata[i] ? odata[i] : max_xdata; + min_xdata = min_xdata > xdata[i] ? xdata[i] : min_xdata; + + float abserr = fabs(xdata[i] - odata[i]); + if (odata[i] != 0) { + rel_abserr = abserr / fabs(odata[i]); + max_pwrrel_abserr = max_pwrrel_abserr < rel_abserr ? rel_abserr : max_pwrrel_abserr; + } + max_abserr_index = max_abserr < abserr ? i : max_abserr_index; + max_abserr = max_abserr < abserr ? abserr : max_abserr; + sum_corr += (odata[i] - mean_odata) * (xdata[i] - mean_xdata); + sum_var_odata += (odata[i] - mean_odata) * (odata[i] - mean_odata); + sum_var_xdata += (xdata[i] - mean_xdata) * (xdata[i] - mean_xdata); + sum_err2 += abserr * abserr; + } + double std_odata = sqrt(sum_var_odata / len); + double std_xdata = sqrt(sum_var_xdata / len); + double ee = sum_corr / len; + + s->len = len; + + s->odata.max = max_odata; + s->odata.min = min_odata; + s->odata.rng = max_odata - min_odata; + s->odata.std = std_odata; + + s->xdata.max = max_xdata; + s->xdata.min = min_xdata; + s->xdata.rng = max_xdata - min_xdata; + s->xdata.std = std_xdata; + + s->max_err.idx = max_abserr_index; + s->max_err.abs = max_abserr; + s->max_err.rel = max_abserr / s->odata.rng; + s->max_err.pwrrel = max_pwrrel_abserr; + + s->reduced.coeff = ee / std_odata / std_xdata; + s->reduced.MSE = sum_err2 / len; + s->reduced.NRMSE = sqrt(s->reduced.MSE) / s->odata.rng; + s->reduced.PSNR = 20 * log10(s->odata.rng) - 10 * log10(s->reduced.MSE); +} + +} // namespace detail +} // namespace psz + +#endif /* C0E747B4_066F_4B04_A3D2_00E1A3B7D682 */ diff --git a/qtensor/compression/cusz/src/detail/compare_gpu.inl b/qtensor/compression/cusz/src/detail/compare_gpu.inl new file mode 100644 index 00000000..12ec3475 --- /dev/null +++ b/qtensor/compression/cusz/src/detail/compare_gpu.inl @@ -0,0 +1,193 @@ +/** + * @file _compare.cuh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-08 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef F7DF2FE5_571E_48C1_965D_0B19D1CC14D4 +#define F7DF2FE5_571E_48C1_965D_0B19D1CC14D4 + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cusz/type.h" + +namespace psz { +namespace detail { + +static const int MINVAL = 0; +static const int MAXVAL = 1; +static const int AVGVAL = 2; +static const int RNG = 3; + +template +bool thrustgpu_identical(T* d1, T* d2, size_t const len) +{ + return thrust::equal(thrust::device, d1, d1 + len, d2); +} + +template +bool thrustgpu_error_bounded(T* a, T* b, size_t const len, double eb, size_t* first_faulty_idx = nullptr) +{ + thrust::device_ptr a_ = thrust::device_pointer_cast(a); + thrust::device_ptr b_ = thrust::device_pointer_cast(b); + thrust::constant_iterator eb_(eb); + using tup = thrust::tuple; + + auto ab_begin = thrust::make_zip_iterator(thrust::make_tuple(a_, b_, eb_)); + auto ab_end = thrust::make_zip_iterator(thrust::make_tuple(a_ + len, b_ + len, eb_)); + + // Let compiler figure out the type. 
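+    // The find_if below scans the zipped (a, b, eb) stream for the first pair
+    // whose absolute difference exceeds the bound; the 1.001 factor leaves a
+    // small slack for floating-point rounding, matching cppstd_error_bounded
+    // on the CPU side. Conceptually it is the device-side equivalent of
+    //     for (size_t i = 0; i < len; i++)
+    //         if (fabs(a[i] - b[i]) > 1.001 * eb) return false;
+    //     return true;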
+ auto iter = thrust::find_if(thrust::device, ab_begin, ab_end, [] __device__(tup t) { + // debug use + // if (fabs(thrust::get<1>(t) - thrust::get<0>(t)) > thrust::get<2>(t)) + // printf("a: %f\tb: %f\teb: %lf\n", (float)thrust::get<1>(t), (float)thrust::get<0>(t), thrust::get<2>(t)); + + return fabs(thrust::get<1>(t) - thrust::get<0>(t)) > 1.001 * thrust::get<2>(t); + }); + + if (iter == ab_end) { return true; } + else { + // *first_faulty_idx = iter - ab_begin; + return false; + } +} + +template +void thrustgpu_get_extrema_rawptr(T* d_ptr, size_t len, T res[4]) +{ + thrust::device_ptr g_ptr = thrust::device_pointer_cast(d_ptr); + + auto minel = thrust::min_element(g_ptr, g_ptr + len) - g_ptr; + auto maxel = thrust::max_element(g_ptr, g_ptr + len) - g_ptr; + res[MINVAL] = *(g_ptr + minel); + res[MAXVAL] = *(g_ptr + maxel); + res[RNG] = res[MAXVAL] - res[MINVAL]; + + auto sum = thrust::reduce(g_ptr, g_ptr + len, (T)0.0, thrust::plus()); + res[AVGVAL] = sum / len; +} + +template +void thrustgpu_get_extrema(thrust::device_ptr g_ptr, size_t len, T res[4]) +{ + auto minel = thrust::min_element(g_ptr, g_ptr + len) - g_ptr; + auto maxel = thrust::max_element(g_ptr, g_ptr + len) - g_ptr; + res[MINVAL] = *(g_ptr + minel); + res[MAXVAL] = *(g_ptr + maxel); + res[RNG] = res[MAXVAL] - res[MINVAL]; + + auto sum = thrust::reduce(g_ptr, g_ptr + len, (T)0.0, thrust::plus()); + res[AVGVAL] = sum / len; +} + +template +void thrustgpu_get_maxerr( + T* reconstructed, // in + T* original, // in + size_t len, // in + T& maximum_val, // out + size_t& maximum_loc, // out + bool destructive = false) +{ + T* diff; + + if (destructive) { + diff = original; // aliasing + } + else { + cudaMalloc(&diff, sizeof(T) * len); + } + + auto expr = [=] __device__(T rel, T oel) { return rel - oel; }; + + // typesafe (also with exec-policy binding) + thrust::device_ptr r(reconstructed); + thrust::device_ptr o(original); + thrust::device_ptr d(diff); + + thrust::transform(r, r + len, o, d, expr); + + auto maximum_ptr = thrust::max_element(d, d + len); + maximum_val = *maximum_ptr; + maximum_loc = maximum_ptr - d; + + if (not destructive) { cudaFree(diff); } +} + +template +void thrustgpu_assess_quality(cusz_stats* s, T* xdata, T* odata, size_t len) +{ + using tup = thrust::tuple; + + thrust::device_ptr p_odata = thrust::device_pointer_cast(odata); // origin + thrust::device_ptr p_xdata = thrust::device_pointer_cast(xdata); + + T odata_res[4], xdata_res[4]; + + thrustgpu_get_extrema(p_odata, len, odata_res); + thrustgpu_get_extrema(p_xdata, len, xdata_res); + + auto begin = thrust::make_zip_iterator(thrust::make_tuple(p_odata, p_xdata)); + auto end = thrust::make_zip_iterator(thrust::make_tuple(p_odata + len, p_xdata + len)); + + // clang-format off + auto corr = [=] __host__ __device__(tup t) { return (thrust::get<0>(t) - odata[AVGVAL]) * (thrust::get<1>(t) - xdata[AVGVAL]); }; + auto err2 = [] __host__ __device__(tup t) { T f = thrust::get<0>(t) - thrust::get<1>(t); return f * f; }; + auto var_odata = [=] __host__ __device__(T a) { T f = a - odata[AVGVAL]; return f * f; }; + auto var_xdata = [=] __host__ __device__(T a) { T f = a - xdata[AVGVAL]; return f * f; }; + + auto sum_err2 = thrust::transform_reduce(begin, end, err2, 0.0f, thrust::plus()); + auto sum_corr = thrust::transform_reduce(begin, end, corr, 0.0f, thrust::plus()); + auto sum_var_odata = thrust::transform_reduce(p_odata, p_odata + len, var_odata, 0.0f, thrust::plus()); + auto sum_var_xdata = thrust::transform_reduce(p_xdata, p_xdata + len, var_xdata, 0.0f, 
thrust::plus()); + // clang-format on + + double std_odata = sqrt(sum_var_odata / len); + double std_xdata = sqrt(sum_var_xdata / len); + double ee = sum_corr / len; + + // ----------------------------------------------------------------------------- + T max_abserr{0}; + size_t max_abserr_index{0}; + thrustgpu_get_maxerr(xdata, odata, len, max_abserr, max_abserr_index, false); + // ----------------------------------------------------------------------------- + + s->len = len; + + s->odata.max = odata_res[MAXVAL]; + s->odata.min = odata_res[MINVAL]; + s->odata.rng = odata_res[MAXVAL] - odata_res[MINVAL]; + s->odata.std = std_odata; + + s->xdata.max = xdata_res[MAXVAL]; + s->xdata.min = xdata_res[MINVAL]; + s->xdata.rng = xdata_res[MAXVAL] - xdata_res[MINVAL]; + s->xdata.std = std_xdata; + + s->max_err.idx = max_abserr_index; + s->max_err.abs = max_abserr; + s->max_err.rel = max_abserr / s->odata.rng; + s->max_err.pwrrel = NAN; + + s->reduced.coeff = ee / std_odata / std_xdata; + s->reduced.MSE = sum_err2 / len; + s->reduced.NRMSE = sqrt(s->reduced.MSE) / s->odata.rng; + s->reduced.PSNR = 20 * log10(s->odata.rng) - 10 * log10(s->reduced.MSE); +} + +} // namespace detail +} // namespace psz + +#endif /* F7DF2FE5_571E_48C1_965D_0B19D1CC14D4 */ diff --git a/qtensor/compression/cusz/src/detail/compressor_impl.cu b/qtensor/compression/cusz/src/detail/compressor_impl.cu new file mode 100644 index 00000000..83b819ae --- /dev/null +++ b/qtensor/compression/cusz/src/detail/compressor_impl.cu @@ -0,0 +1,18 @@ +/** + * @file compressor.cu + * @author Jiannan Tian + * @brief cuSZ compressor of the default path + * @version 0.3 + * @date 2021-10-05 + * (create) 2020-02-12; (release) 2020-09-20; + * (rev.1) 2021-01-16; (rev.2) 2021-07-12; (rev.3) 2021-09-06; (rev.4) 2021-10-05 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include "compressor_impl.inl" +#include "framework.hh" + +template class cusz::Compressor>::impl; diff --git a/qtensor/compression/cusz/src/detail/compressor_impl.inl b/qtensor/compression/cusz/src/detail/compressor_impl.inl new file mode 100644 index 00000000..a36f339a --- /dev/null +++ b/qtensor/compression/cusz/src/detail/compressor_impl.inl @@ -0,0 +1,479 @@ +/** + * @file compressor_impl.cuh + * @author Jiannan Tian + * @brief cuSZ compressor of the default path + * @version 0.3 + * @date 2021-10-05 + * (create) 2020-02-12; (release) 2020-09-20; + * (rev.1) 2021-01-16; (rev.2) 2021-07-12; (rev.3) 2021-09-06; (rev.4) 2021-10-05 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef CUSZ_DEFAULT_PATH_CUH +#define CUSZ_DEFAULT_PATH_CUH + +#include +#include +#include +#include + +#include "component.hh" +#include "compressor.hh" +#include "header.h" +#include "kernel/cpplaunch_cuda.hh" +#include "stat/stat_g.hh" +#include "utils/cuda_err.cuh" + +#define DEFINE_DEV(VAR, TYPE) TYPE* d_##VAR{nullptr}; +#define DEFINE_HOST(VAR, TYPE) TYPE* h_##VAR{nullptr}; +#define FREEDEV(VAR) CHECK_CUDA(cudaFree(d_##VAR)); +#define FREEHOST(VAR) CHECK_CUDA(cudaFreeHost(h_##VAR)); + +#define PRINT_ENTRY(VAR) printf("%d %-*s: %'10u\n", (int)Header::VAR, 14, #VAR, header.entry[Header::VAR]); + +#define DEVICE2DEVICE_COPY(VAR, FIELD) \ + if (nbyte[Header::FIELD] != 0 and VAR != nullptr) { \ + auto dst = d_reserved_compressed + header.entry[Header::FIELD]; \ + auto src = 
reinterpret_cast(VAR); \ + CHECK_CUDA(cudaMemcpyAsync(dst, src, nbyte[Header::FIELD], cudaMemcpyDeviceToDevice, stream)); \ + } + +#define ACCESSOR(SYM, TYPE) reinterpret_cast(in_compressed + header->entry[Header::SYM]) + +namespace cusz { + +constexpr auto kHOST = cusz::LOC::HOST; +constexpr auto kDEVICE = cusz::LOC::DEVICE; +constexpr auto kHOST_DEVICE = cusz::LOC::HOST_DEVICE; + +#define TEMPLATE_TYPE template +#define IMPL Compressor::impl + +TEMPLATE_TYPE +uint32_t IMPL::get_len_data() { return data_len3.x * data_len3.y * data_len3.z; } + +TEMPLATE_TYPE +IMPL::impl() +{ + predictor = new Predictor; + + spcodec = new Spcodec; + codec = new Codec; + fb_codec = new FallbackCodec; +} + +TEMPLATE_TYPE +void IMPL::destroy() +{ + if (spcodec) delete spcodec; + if (codec) delete codec; + if (fb_codec) delete codec; + if (predictor) delete predictor; +} + +TEMPLATE_TYPE +IMPL::~impl() { destroy(); } + +//------------------------------------------------------------------------------ + +// TODO +TEMPLATE_TYPE +void IMPL::init(Context* config, bool dbg_print) { init_detail(config, dbg_print); } + +TEMPLATE_TYPE +void IMPL::init(Header* config, bool dbg_print) { init_detail(config, dbg_print); } + +template +void peek_devdata(T* d_arr, size_t num = 20) +{ + thrust::for_each(thrust::device, d_arr, d_arr + num, [=] __device__ __host__(const T i) { printf("%u\t", i); }); + printf("\n"); +} + +TEMPLATE_TYPE +void IMPL::compress( + Context* config, + T* uncompressed, + BYTE*& compressed, + size_t& compressed_len, + cudaStream_t stream, + bool dbg_print) +{ + auto const eb = config->eb; + auto const radius = config->radius; + auto const pardeg = config->vle_pardeg; + auto const codecs_in_use = config->codecs_in_use; + auto const nz_density_factor = config->nz_density_factor; + + if (dbg_print) { + std::cout << "eb\t" << eb << endl; + std::cout << "radius\t" << radius << endl; + std::cout << "pardeg\t" << pardeg << endl; + std::cout << "codecs_in_use\t" << codecs_in_use << endl; + std::cout << "nz_density_factor\t" << nz_density_factor << endl; + } + + data_len3 = dim3(config->x, config->y, config->z); + auto codec_force_fallback = config->codec_force_fallback(); + + header.codecs_in_use = codecs_in_use; + header.nz_density_factor = nz_density_factor; + + T* d_anchor{nullptr}; // predictor out1 + E* d_errctrl{nullptr}; // predictor out2 + T* d_outlier{nullptr}; // predictor out3 + BYTE* d_spfmt{nullptr}; + size_t spfmt_outlen{0}; + + BYTE* d_codec_out{nullptr}; + size_t codec_outlen{0}; + + size_t data_len, errctrl_len, sublen, spcodec_inlen; + auto booklen = radius * 2; + + auto derive_lengths_after_prediction = [&]() { + data_len = predictor->get_len_data(); + errctrl_len = data_len; + spcodec_inlen = data_len; + sublen = ConfigHelper::get_npart(data_len, pardeg); + + // std::cout << "datalen\t" << data_len << '\n'; + // std::cout << "errctrl_len\t" << errctrl_len << '\n'; + // std::cout << "spcodec_inlen\t" << spcodec_inlen << '\n'; + // std::cout << "sublen\t" << sublen << '\n'; + }; + + auto update_header = [&]() { + header.x = data_len3.x; + header.y = data_len3.y; + header.z = data_len3.z; + header.w = 1; // placeholder + header.radius = radius; + header.vle_pardeg = pardeg; + header.eb = eb; + header.byte_vle = use_fallback_codec ? 8 : 4; + }; + + /******************************************************************************/ + + // Prediction is the dependency of the rest procedures. 
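+    // The steps below run in a fixed order on `stream`:
+    //   1. Lorenzo prediction: produces anchors, error-control codes, outliers
+    //   2. histogram: symbol frequencies over the error-control codes
+    //   3. encode_with_exception: builds the Huffman codebook and encodes,
+    //      switching to the 8-byte fallback codec if the 4-byte codec fails
+    //   4. spcodec: encodes the sparse outliers
+    //   5. subfile_collect: packs header + ANCHOR + VLE + SPFMT into the
+    //      reserved device buffer, with offsets recorded in header.entry
+    // Rough archive layout: [Header | anchors | Huffman bitstream | outliers]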
+ predictor->construct(LorenzoI, data_len3, uncompressed, &d_anchor, &d_errctrl, &d_outlier, eb, radius, stream); + // peek_devdata(d_errctrl); + + derive_lengths_after_prediction(); + /******************************************************************************/ + + asz::stat::histogram(d_errctrl, errctrl_len, d_freq, booklen, &time_hist, stream); + + /* debug */ CHECK_CUDA(cudaStreamSynchronize(stream)); + + // TODO remove duplicate get_frequency inside encode_with_exception() + encode_with_exception( + d_errctrl, errctrl_len, // input + d_freq, booklen, sublen, pardeg, codec_force_fallback, // config + d_codec_out, codec_outlen, // output + stream, dbg_print); + + (*spcodec).encode(d_outlier, spcodec_inlen, d_spfmt, spfmt_outlen, stream, dbg_print); + + /* debug */ CHECK_CUDA(cudaStreamSynchronize(stream)); + + /******************************************************************************/ + + update_header(); + subfile_collect( + d_anchor, (*predictor).get_len_anchor(), // + d_codec_out, codec_outlen, // + d_spfmt, spfmt_outlen, // + stream, dbg_print); + + // output + compressed_len = ConfigHelper::get_filesize(&header); + compressed = d_reserved_compressed; + + collect_compress_timerecord(); + + // considering that codec can be consecutively in use, and can compress data of different huff-byte + use_fallback_codec = false; +} + +TEMPLATE_TYPE +void IMPL::clear_buffer() +{ // + (*predictor).clear_buffer(); + (*codec).clear_buffer(); + (*spcodec).clear_buffer(); +} + +TEMPLATE_TYPE +void IMPL::decompress(Header* header, BYTE* in_compressed, T* out_decompressed, cudaStream_t stream, bool dbg_print) +{ + // TODO host having copy of header when compressing + if (not header) { + header = new Header; + CHECK_CUDA(cudaMemcpyAsync(header, in_compressed, sizeof(Header), cudaMemcpyDeviceToHost, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + } + + data_len3 = dim3(header->x, header->y, header->z); + + use_fallback_codec = header->byte_vle == 8; + double const eb = header->eb; + int const radius = header->radius; + auto const vle_pardeg = header->vle_pardeg; + + // The inputs of components are from `compressed`. 
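+    // ACCESSOR(SYM, TYPE) yields a typed view into the packed archive by
+    // offsetting `in_compressed` with header->entry[SYM], so the anchor,
+    // Huffman (VLE), and sparse (SPFMT) pointers below alias the single
+    // compressed buffer rather than owning separate allocations.
+    // Decompression then mirrors compression in reverse order: scatter the
+    // sparse outliers, Huffman-decode the error-control codes (using the
+    // 8-byte fallback codec when header->byte_vle == 8), and reconstruct the
+    // data with the Lorenzo predictor.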
+ auto d_anchor = ACCESSOR(ANCHOR, T); + auto d_vle = ACCESSOR(VLE, BYTE); + auto d_sp = ACCESSOR(SPFMT, BYTE); + + // wire the workspace + auto d_errctrl = (*predictor).expose_quant(); // reuse space + + // wire and aliasing + auto d_outlier = out_decompressed; + auto d_outlier_xdata = out_decompressed; + + auto spcodec_do = [&]() { (*spcodec).decode(d_sp, d_outlier, stream); }; + auto decode_with_exception = [&]() { + if (not use_fallback_codec) { // + (*codec).decode(d_vle, d_errctrl); + } + else { + if (not fallback_codec_allocated) { + (*fb_codec).init((*predictor).get_len_quant(), radius * 2, vle_pardeg, /*dbg print*/ false); + fallback_codec_allocated = true; + } + (*fb_codec).decode(d_vle, d_errctrl); + } + }; + auto predictor_do = [&]() { + (*predictor).reconstruct(LorenzoI, data_len3, d_outlier_xdata, d_anchor, d_errctrl, eb, radius, stream); + }; + + // process + spcodec_do(), decode_with_exception(), predictor_do(); + + collect_decompress_timerecord(); + + // clear state for the next decompression after reporting + use_fallback_codec = false; +} + +// public getter +TEMPLATE_TYPE +void IMPL::export_header(Header& ext_header) { ext_header = header; } + +TEMPLATE_TYPE +void IMPL::export_header(Header* ext_header) { *ext_header = header; } + +TEMPLATE_TYPE +void IMPL::export_timerecord(TimeRecord* ext_timerecord) +{ + if (ext_timerecord) *ext_timerecord = timerecord; +} + +// helper +TEMPLATE_TYPE +void IMPL::init_codec(size_t codec_in_len, unsigned int codec_config, int max_booklen, int pardeg, bool dbg_print) +{ + if (codec_config == 0b00) throw std::runtime_error("Argument codec_config must have set bit(s)."); + if (codec_config bitand 0b01) { + if (dbg_print) LOGGING(LOG_INFO, "allocated 4-byte codec"); + (*codec).init(codec_in_len, max_booklen, pardeg, dbg_print); + } + if (codec_config bitand 0b10) { + if (dbg_print) LOGGING(LOG_INFO, "allocated 8-byte (fallback) codec"); + (*fb_codec).init(codec_in_len, max_booklen, pardeg, dbg_print); + fallback_codec_allocated = true; + } +}; + +TEMPLATE_TYPE +template +void IMPL::init_detail(CONFIG* config, bool dbg_print) +{ + const auto cfg_radius = config->radius; + const auto cfg_pardeg = config->vle_pardeg; + const auto density_factor = config->nz_density_factor; + const auto codec_config = config->codecs_in_use; + const auto cfg_max_booklen = cfg_radius * 2; + const auto x = config->x; + const auto y = config->y; + const auto z = config->z; + + size_t spcodec_in_len, codec_in_len; + + (*predictor).init(LorenzoI, x, y, z, dbg_print); + + spcodec_in_len = (*predictor).get_alloclen_data(); + codec_in_len = (*predictor).get_alloclen_quant(); + + (*spcodec).init(spcodec_in_len, density_factor, dbg_print); + + { + auto bytes = sizeof(cusz::FREQ) * cfg_max_booklen; + cudaMalloc(&d_freq, bytes); + cudaMemset(d_freq, 0x0, bytes); + + // cudaMalloc(&d_freq_another, bytes); + // cudaMemset(d_freq_another, 0x0, bytes); + } + + init_codec(codec_in_len, codec_config, cfg_max_booklen, cfg_pardeg, dbg_print); + + CHECK_CUDA(cudaMalloc(&d_reserved_compressed, (*predictor).get_alloclen_data() * sizeof(T) / 2)); +} + +TEMPLATE_TYPE +void IMPL::collect_compress_timerecord() +{ +#define COLLECT_TIME(NAME, TIME) timerecord.push_back({const_cast(NAME), TIME}); + + if (not timerecord.empty()) timerecord.clear(); + + COLLECT_TIME("predict", (*predictor).get_time_elapsed()); + COLLECT_TIME("histogram", time_hist); + + if (not use_fallback_codec) { + COLLECT_TIME("book", (*codec).get_time_book()); + COLLECT_TIME("huff-enc", (*codec).get_time_lossless()); + } 
+ else { + COLLECT_TIME("book", (*fb_codec).get_time_book()); + COLLECT_TIME("huff-enc", (*fb_codec).get_time_lossless()); + } + + COLLECT_TIME("outlier", (*spcodec).get_time_elapsed()); +} + +TEMPLATE_TYPE +void IMPL::collect_decompress_timerecord() +{ + if (not timerecord.empty()) timerecord.clear(); + + COLLECT_TIME("outlier", (*spcodec).get_time_elapsed()); + + if (not use_fallback_codec) { // + COLLECT_TIME("huff-dec", (*codec).get_time_lossless()); + } + else { // + COLLECT_TIME("huff-dec", (*fb_codec).get_time_lossless()); + } + + COLLECT_TIME("predict", (*predictor).get_time_elapsed()); +} + +TEMPLATE_TYPE +void IMPL::encode_with_exception( + E* d_in, + size_t inlen, + cusz::FREQ* d_freq, + int booklen, + int sublen, + int pardeg, + bool codec_force_fallback, + BYTE*& d_out, + size_t& outlen, + cudaStream_t stream, + bool dbg_print) +{ + auto build_codebook_using = [&](auto encoder) { encoder->build_codebook(d_freq, booklen, stream); }; + auto encode_with = [&](auto encoder) { encoder->encode(d_in, inlen, d_out, outlen, stream); }; + + auto try_fallback_alloc = [&]() { + use_fallback_codec = true; + if (not fallback_codec_allocated) { + LOGGING(LOG_EXCEPTION, "online allocate fallback (8-byte) codec"); + fb_codec->init(inlen, booklen, pardeg, dbg_print); + fallback_codec_allocated = true; + } + }; + + /******************************************************************************/ + if (not codec_force_fallback) { + try { + build_codebook_using(codec); + encode_with(codec); + } + catch (const std::runtime_error& e) { + LOGGING(LOG_EXCEPTION, "switch to fallback codec"); + try_fallback_alloc(); + + build_codebook_using(fb_codec); + encode_with(fb_codec); + } + } + else { + LOGGING(LOG_INFO, "force switch to fallback codec"); + try_fallback_alloc(); + + build_codebook_using(fb_codec); + encode_with(fb_codec); + } +} + +TEMPLATE_TYPE +void IMPL::subfile_collect( + T* d_anchor, + size_t anchor_len, + BYTE* d_codec_out, + size_t codec_outlen, + BYTE* d_spfmt_out, + size_t spfmt_outlen, + cudaStream_t stream, + bool dbg_print) +{ + header.self_bytes = sizeof(Header); + uint32_t nbyte[Header::END]; + nbyte[Header::HEADER] = sizeof(Header); + nbyte[Header::ANCHOR] = sizeof(T) * anchor_len; + nbyte[Header::VLE] = sizeof(BYTE) * codec_outlen; + nbyte[Header::SPFMT] = sizeof(BYTE) * spfmt_outlen; + + header.entry[0] = 0; + // *.END + 1; need to know the ending position + for (auto i = 1; i < Header::END + 1; i++) { header.entry[i] = nbyte[i - 1]; } + for (auto i = 1; i < Header::END + 1; i++) { header.entry[i] += header.entry[i - 1]; } + + auto debug_header_entry = [&]() { + printf("\nsubfile collect in compressor:\n"); + printf(" ENTRIES\n"); + + PRINT_ENTRY(HEADER); + PRINT_ENTRY(ANCHOR); + PRINT_ENTRY(VLE); + PRINT_ENTRY(SPFMT); + PRINT_ENTRY(END); + printf("\n"); + }; + + if (dbg_print) debug_header_entry(); + + CHECK_CUDA(cudaMemcpyAsync(d_reserved_compressed, &header, sizeof(header), cudaMemcpyHostToDevice, stream)); + + DEVICE2DEVICE_COPY(d_anchor, ANCHOR) + DEVICE2DEVICE_COPY(d_codec_out, VLE) + DEVICE2DEVICE_COPY(d_spfmt_out, SPFMT) + + /* debug */ CHECK_CUDA(cudaStreamSynchronize(stream)); +} + +} // namespace cusz + +#undef FREEDEV +#undef FREEHOST +#undef DEFINE_DEV +#undef DEFINE_HOST +#undef DEVICE2DEVICE_COPY +#undef PRINT_ENTRY +#undef ACCESSOR +#undef COLLECT_TIME + +#undef TEMPLATE_TYPE +#undef IMPL + +#endif diff --git a/qtensor/compression/cusz/src/detail/spmat.cu b/qtensor/compression/cusz/src/detail/spmat.cu new file mode 100644 index 00000000..141d2acb --- /dev/null +++ 
b/qtensor/compression/cusz/src/detail/spmat.cu @@ -0,0 +1,14 @@ +/** + * @file spmat.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2021-09-28 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#include "detail/spmat.cuh" + +template struct cusz::SpcodecCSR::impl; diff --git a/qtensor/compression/cusz/src/detail/spv_gpu.inl b/qtensor/compression/cusz/src/detail/spv_gpu.inl new file mode 100644 index 00000000..4775926e --- /dev/null +++ b/qtensor/compression/cusz/src/detail/spv_gpu.inl @@ -0,0 +1,77 @@ +/** + * @file spv_gpu.inl + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-08-22 + * (update) 2022-10-29 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef F88E11A6_6B61_4C6F_8B2E_30EEAAB4D204 +#define F88E11A6_6B61_4C6F_8B2E_30EEAAB4D204 + +#include +#include +#include +#include +#include +#include + +#include "utils/timer.h" + +namespace psz { +namespace detail { + +template +void spv_gather( + T* in, + size_t const in_len, + T* d_val, + uint32_t* d_idx, + int* nnz, + float* milliseconds, + cudaStream_t stream) +{ + using thrust::placeholders::_1; + + thrust::cuda::par.on(stream); + thrust::counting_iterator zero(0); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + // find out the indices + *nnz = thrust::copy_if(thrust::device, zero, zero + in_len, in, d_idx, _1 != 0) - d_idx; + + // fetch corresponding values + thrust::copy( + thrust::device, thrust::make_permutation_iterator(in, d_idx), + thrust::make_permutation_iterator(in + *nnz, d_idx + *nnz), d_val); + + STOP_CUDAEVENT_RECORDING(stream); + TIME_ELAPSED_CUDAEVENT(milliseconds); + DESTROY_CUDAEVENT_PAIR; +} + +template +void spv_scatter(T* d_val, uint32_t* d_idx, int const nnz, T* decoded, float* milliseconds, cudaStream_t stream) +{ + thrust::cuda::par.on(stream); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + thrust::scatter(thrust::device, d_val, d_val + nnz, d_idx, decoded); + + STOP_CUDAEVENT_RECORDING(stream); + TIME_ELAPSED_CUDAEVENT(milliseconds); + DESTROY_CUDAEVENT_PAIR; +} + +} // namespace detail +} // namespace psz + +#endif /* F88E11A6_6B61_4C6F_8B2E_30EEAAB4D204 */ diff --git a/qtensor/compression/cusz/src/detail/spvec.cu b/qtensor/compression/cusz/src/detail/spvec.cu new file mode 100644 index 00000000..e9b9ab6f --- /dev/null +++ b/qtensor/compression/cusz/src/detail/spvec.cu @@ -0,0 +1,18 @@ +/** + * @file spvec.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-03-01 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#include "detail/spvec.cuh" + +template struct cusz::SpcodecVec::impl; +template struct cusz::SpcodecVec::impl; +template struct cusz::SpcodecVec::impl; +template struct cusz::SpcodecVec::impl; +// template struct cusz::SpcodecVec::impl; diff --git a/qtensor/compression/cusz/src/experimental/Makefile b/qtensor/compression/cusz/src/experimental/Makefile new file mode 100644 index 00000000..cecce6f5 --- /dev/null +++ b/qtensor/compression/cusz/src/experimental/Makefile @@ -0,0 +1,7 @@ +altlorenzo: + nvcc -lineinfo -std=c++17 \ + --extended-lambda \ + -DDPCPP_SHOWCASE \ + ../wrapper/extrap_lorenzo.cu \ + dpcpp_demo_lorenzo.cu \ + -o dpcpp_demo_lorenzo diff --git a/qtensor/compression/cusz/src/experimental/dpcpp_demo_lorenzo.cu b/qtensor/compression/cusz/src/experimental/dpcpp_demo_lorenzo.cu new file mode 100644 index 00000000..375d648d --- /dev/null +++ 
b/qtensor/compression/cusz/src/experimental/dpcpp_demo_lorenzo.cu @@ -0,0 +1,120 @@ +/** + * @file withwrapper_lorenzo.cu + * @author Jiannan Tian + * @brief A temporary test case using high-level component/API. + * @version 0.3 + * @date 2021-06-21 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#include +#include +#include +#include +#include +#include +#include "../utils/io.hh" +#include "../utils/verify.hh" + +#pragma message "--extended-lambda causes migration error (nvcc is incapable to be a wellrounded compiler)." +// #include "../utils/verify_gpu.cuh" +#include "../component/extrap_lorenzo.h" + +using std::cout; +using std::endl; + +using Data = float; +using Quant = uint16_t; +using FP = float; + +Data eb; +Data maxval, minval; + +// dim3 stride3; +size_t len1; +int radius = 512; + +namespace { + +#ifndef __CUDACC__ +struct __dim3_compat { + unsigned int x, y, z; + __dim3_compat(unsigned int _x, unsigned int _y, unsigned int _z){}; +}; + +using dim3 = __dim3_compat; +#endif + +auto get_npart = [](auto size, auto subsize) { + static_assert( + std::numeric_limits::is_integer and std::numeric_limits::is_integer, + "[get_npart] must be plain interger types."); + return (size + subsize - 1) / subsize; +}; +auto get_len_from_dim3 = [](dim3 size) { return size.x * size.y * size.z; }; +auto get_stride3 = [](dim3 size) -> dim3 { return dim3(1, size.x, size.x * size.y); }; + +} // namespace + +void test_lorenzo(std::string fname, int ndim, dim3 size3) +{ + cout << "filename: " << fname << '\n'; + + Data* h_data{nullptr}; + Data* d_data{nullptr}; + Data* h2_data{nullptr}; + Quant* d_quant{nullptr}; + + auto len1 = get_len_from_dim3(size3); + cout << "len1 from dim3: " << len1 << endl; + + cudaMallocHost(&h_data, len1 * sizeof(Data)); + io::read_binary_to_array(fname, h_data, len1); + cudaMallocHost(&h2_data, len1 * sizeof(Data)); + memcpy(h2_data, h_data, len1 * sizeof(Data)); + + cudaMalloc(&d_data, len1 * sizeof(Data)); + cudaMemcpy(d_data, h_data, len1 * sizeof(Data), cudaMemcpyHostToDevice); + cudaMalloc(&d_quant, len1 * sizeof(Quant)); + + auto maxval = *std::max_element(h_data, h_data + len1); + auto minval = *std::min_element(h_data, h_data + len1); + eb = 1e-3 * (maxval - minval); + + compress_lorenzo_construct(d_data, d_quant, size3, ndim, eb, radius); + decompress_lorenzo_reconstruct(d_data, d_quant, size3, ndim, eb, radius); + + cudaMemcpy(h_data, d_data, len1 * sizeof(Data), cudaMemcpyDeviceToHost); + + // TODO GPU verification does not print + // { + // Stat stat_gpu; + // verify_data_GPU(&stat_gpu, h_data, h2_data, len1); + // cusz::QualityViewer::print_metrics_cross(&stat_gpu, false, eb, 0, 1, false, true); + // } + { + Stat stat; + cusz::verify_data(&stat, h_data, h2_data, len1); + cusz::QualityViewer::print_metrics_cross(&stat, false, eb, 0, 1, false, false); + } + + // clear up + cudaFree(d_data); + cudaFree(d_quant); + cudaFreeHost(h_data); + cudaFreeHost(h2_data); +} + +int main() +{ + struct passwd* pw = getpwuid(getuid()); + const char* homedir = pw->pw_dir; + + test_lorenzo(std::string(homedir) + "/datafields/vx", 1, dim3(280953867, 1, 1)); + test_lorenzo(std::string(homedir) + "/datafields/CLDHGH", 2, dim3(3600, 1800, 1)); + test_lorenzo(std::string(homedir) + "/datafields/CLOUDf48", 3, dim3(500, 500, 100)); + + return 0; +} diff --git a/qtensor/compression/cusz/src/hf/detail/hf_bookg.inl b/qtensor/compression/cusz/src/hf/detail/hf_bookg.inl new file mode 100644 index 00000000..3fb9ef82 --- /dev/null +++ 
b/qtensor/compression/cusz/src/hf/detail/hf_bookg.inl @@ -0,0 +1,742 @@ +/** + * @file huffman_parbook.cu + * @author Cody Rivera (cjrivera1@crimson.ua.edu) + * @brief Parallel Huffman Construction to generates canonical forward codebook. + * Based on [Ostadzadeh et al. 2007] (https://dblp.org/rec/conf/pdpta/OstadzadehEZMB07.bib) + * "A Two-phase Practical Parallel Algorithm for Construction of Huffman Codes". + * @version 0.1 + * @date 2020-10-24 + * (created) 2020-05 (rev) 2021-06-21 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef C883A574_4491_40E8_A083_1B6E8FB56670 +#define C883A574_4491_40E8_A083_1B6E8FB56670 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.hh" +#include "hf/hf_bookg.hh" +#include "par_merge.inl" +#include "utils.hh" +#include "utils/timer.h" + +using std::cout; +using std::endl; +namespace cg = cooperative_groups; + +// GenerateCL Locals +__device__ int iNodesFront = 0; +__device__ int iNodesRear = 0; +__device__ int lNodesCur = 0; + +__device__ int iNodesSize = 0; +__device__ int curLeavesNum; + +__device__ int minFreq; + +__device__ int tempLength; + +__device__ int mergeFront; +__device__ int mergeRear; + +__device__ int lNodesIndex; + +// GenerateCW Locals +__device__ int CCL; +__device__ int CDPI; +__device__ int newCDPI; + +// Profiling +__device__ long long int s[10]; +__device__ long long int st[10]; + +// Mathematically correct mod +#define MOD(a, b) ((((a) % (b)) + (b)) % (b)) + +namespace par_huffman { +namespace detail { + +// clang-format off +template __global__ void GPU_FillArraySequence(T*, unsigned int); +template __global__ void GPU_GetFirstNonzeroIndex(T*, unsigned int, unsigned int*); +template __global__ void GPU_ReverseArray(T*, unsigned int); +template __global__ void GPU_ReorderByIndex(H*, T*, unsigned int); +// clang-format on + +} // namespace detail +} // namespace par_huffman + +namespace par_huffman { + +// Codeword length +template +__global__ void GPU_GenerateCL(F*, F*, int, F*, int*, F*, int*, F*, int*, int*, F*, int*, int*, uint32_t*, int, int); + +// Forward Codebook +template +__global__ void GPU_GenerateCW(F* CL, H* CW, H* first, H* entry, int size); + +} // namespace par_huffman + +// Parallel huffman code generation +// clang-format off +template +__global__ void par_huffman::GPU_GenerateCL( + F* histogram, F* CL, int size, + /* Global Arrays */ + F* lNodesFreq, int* lNodesLeader, + F* iNodesFreq, int* iNodesLeader, + F* tempFreq, int* tempIsLeaf, int* tempIndex, + F* copyFreq, int* copyIsLeaf, int* copyIndex, + uint32_t* diagonal_path_intersections, int mblocks, int mthreads) +{ + // clang-format on + + extern __shared__ int32_t shmem[]; + // Shared variables + int32_t& x_top = shmem[0]; + int32_t& y_top = shmem[1]; + int32_t& x_bottom = shmem[2]; + int32_t& y_bottom = shmem[3]; + int32_t& found = shmem[4]; + int32_t* oneorzero = &shmem[5]; + + unsigned int thread = (blockIdx.x * blockDim.x) + threadIdx.x; + const unsigned int i = thread; // Adaptation for easier porting + auto current_grid = cg::this_grid(); + + /* Initialization */ + if (thread < size) { + lNodesLeader[i] = -1; + CL[i] = 0; + } + + if (thread == 0) { + iNodesFront = 0; + iNodesRear = 0; + lNodesCur = 0; + + iNodesSize = 0; + } + current_grid.sync(); + + /* While there is not exactly one internal node */ + while (lNodesCur < size || 
iNodesSize > 1) { + /* Combine two most frequent nodes on same level */ + if (thread == 0) { + F midFreq[4]; + int midIsLeaf[4]; + for (int i = 0; i < 4; ++i) midFreq[i] = UINT_MAX; + + if (lNodesCur < size) { + midFreq[0] = lNodesFreq[lNodesCur]; + midIsLeaf[0] = 1; + } + if (lNodesCur < size - 1) { + midFreq[1] = lNodesFreq[lNodesCur + 1]; + midIsLeaf[1] = 1; + } + if (iNodesSize >= 1) { + midFreq[2] = iNodesFreq[iNodesFront]; + midIsLeaf[2] = 0; + } + if (iNodesSize >= 2) { + midFreq[3] = iNodesFreq[MOD(iNodesFront + 1, size)]; + midIsLeaf[3] = 0; + } + + /* Select the minimum of minimums - 4elt sorting network */ + /* TODO There's likely a good 1-warp faster way to do this */ + { + F tempFreq; + int tempIsLeaf; + if (midFreq[1] > midFreq[3]) { + tempFreq = midFreq[1]; + midFreq[1] = midFreq[3]; + midFreq[3] = tempFreq; + tempIsLeaf = midIsLeaf[1]; + midIsLeaf[1] = midIsLeaf[3]; + midIsLeaf[3] = tempIsLeaf; + } + if (midFreq[0] > midFreq[2]) { + tempFreq = midFreq[0]; + midFreq[0] = midFreq[2]; + midFreq[2] = tempFreq; + tempIsLeaf = midIsLeaf[0]; + midIsLeaf[0] = midIsLeaf[2]; + midIsLeaf[2] = tempIsLeaf; + } + if (midFreq[0] > midFreq[1]) { + tempFreq = midFreq[0]; + midFreq[0] = midFreq[1]; + midFreq[1] = tempFreq; + tempIsLeaf = midIsLeaf[0]; + midIsLeaf[0] = midIsLeaf[1]; + midIsLeaf[1] = tempIsLeaf; + } + if (midFreq[2] > midFreq[3]) { + tempFreq = midFreq[2]; + midFreq[2] = midFreq[3]; + midFreq[3] = tempFreq; + tempIsLeaf = midIsLeaf[2]; + midIsLeaf[2] = midIsLeaf[3]; + midIsLeaf[3] = tempIsLeaf; + } + if (midFreq[1] > midFreq[2]) { + tempFreq = midFreq[1]; + midFreq[1] = midFreq[2]; + midFreq[2] = tempFreq; + tempIsLeaf = midIsLeaf[1]; + midIsLeaf[1] = midIsLeaf[2]; + midIsLeaf[2] = tempIsLeaf; + } + } + + minFreq = midFreq[0]; + if (midFreq[1] < UINT_MAX) { minFreq += midFreq[1]; } + iNodesFreq[iNodesRear] = minFreq; + iNodesLeader[iNodesRear] = -1; + + /* If is leaf */ + if (midIsLeaf[0]) { + lNodesLeader[lNodesCur] = iNodesRear; + ++CL[lNodesCur], ++lNodesCur; + } + else { + iNodesLeader[iNodesFront] = iNodesRear; + iNodesFront = MOD(iNodesFront + 1, size); + } + if (midIsLeaf[1]) { + lNodesLeader[lNodesCur] = iNodesRear; + ++CL[lNodesCur], ++lNodesCur; + } + else { + iNodesLeader[iNodesFront] = iNodesRear; + iNodesFront = MOD(iNodesFront + 1, size); /* ? 
*/ + } + + // iNodesRear = MOD(iNodesRear + 1, size); + + iNodesSize = MOD(iNodesRear - iNodesFront, size); + } + + // int curLeavesNum; + /* Select elements to copy -- parallelized */ + curLeavesNum = 0; + current_grid.sync(); + if (i >= lNodesCur && i < size) { + // Parallel component + int threadCurLeavesNum; + if (lNodesFreq[i] <= minFreq) { + threadCurLeavesNum = i - lNodesCur + 1; + // Atomic max -- Largest valid index + atomicMax(&curLeavesNum, threadCurLeavesNum); + } + + if (i - lNodesCur < curLeavesNum) { + copyFreq[i - lNodesCur] = lNodesFreq[i]; + copyIndex[i - lNodesCur] = i; + copyIsLeaf[i - lNodesCur] = 1; + } + } + + current_grid.sync(); + + /* Updates Iterators */ + if (thread == 0) { + mergeRear = iNodesRear; + mergeFront = iNodesFront; + + if ((curLeavesNum + iNodesSize) % 2 == 0) { iNodesFront = iNodesRear; } + /* Odd number of nodes to merge - leave out one*/ + else if ( + (iNodesSize != 0) // + and (curLeavesNum == 0 // + or (histogram[lNodesCur + curLeavesNum] <= iNodesFreq[MOD(iNodesRear - 1, size)])) // + ) { + mergeRear = MOD(mergeRear - 1, size); + iNodesFront = MOD(iNodesRear - 1, size); + } + else { + iNodesFront = iNodesRear; + --curLeavesNum; + } + + lNodesCur = lNodesCur + curLeavesNum; + iNodesRear = MOD(iNodesRear + 1, size); + } + current_grid.sync(); + + /* Parallelized Merging Phase */ + + /*if (thread == 0) { + merge(copyFreq, copyIndex, copyIsLeaf, 0, curLeavesNum, + iNodesFreq, mergeFront, mergeRear, size, + tempFreq, tempIndex, tempIsLeaf, tempLength); + }*/ + + parMerge( + copyFreq, copyIndex, copyIsLeaf, 0, curLeavesNum, // + iNodesFreq, mergeFront, mergeRear, size, // + tempFreq, tempIndex, tempIsLeaf, tempLength, // + diagonal_path_intersections, mblocks, mthreads, // + x_top, y_top, x_bottom, y_bottom, found, oneorzero); + current_grid.sync(); + + /* Melding phase -- New */ + if (thread < tempLength / 2) { + int ind = MOD(iNodesRear + i, size); + iNodesFreq[ind] = tempFreq[(2 * i)] + tempFreq[(2 * i) + 1]; + iNodesLeader[ind] = -1; + + if (tempIsLeaf[(2 * i)]) { + lNodesLeader[tempIndex[(2 * i)]] = ind; + ++CL[tempIndex[(2 * i)]]; + } + else { + iNodesLeader[tempIndex[(2 * i)]] = ind; + } + if (tempIsLeaf[(2 * i) + 1]) { + lNodesLeader[tempIndex[(2 * i) + 1]] = ind; + ++CL[tempIndex[(2 * i) + 1]]; + } + else { + iNodesLeader[tempIndex[(2 * i) + 1]] = ind; + } + } + current_grid.sync(); + + if (thread == 0) { iNodesRear = MOD(iNodesRear + (tempLength / 2), size); } + current_grid.sync(); + + /* Update leaders */ + if (thread < size) { + if (lNodesLeader[i] != -1) { + if (iNodesLeader[lNodesLeader[i]] != -1) { + lNodesLeader[i] = iNodesLeader[lNodesLeader[i]]; + ++CL[i]; + } + } + } + current_grid.sync(); + + if (thread == 0) { iNodesSize = MOD(iNodesRear - iNodesFront, size); } + current_grid.sync(); + } +} + +// Parallelized with atomic writes, but could replace with Jiannan's similar code +template +__global__ void par_huffman::GPU_GenerateCW(F* CL, H* CW, H* first, H* entry, int size) +{ + unsigned int thread = (blockIdx.x * blockDim.x) + threadIdx.x; + const unsigned int i = thread; // Porting convenience + auto current_grid = cg::this_grid(); + auto type_bw = sizeof(H) * 8; + + /* Reverse in place - Probably a more CUDA-appropriate way */ + if (thread < size / 2) { + F temp = CL[i]; + CL[i] = CL[size - i - 1]; + CL[size - i - 1] = temp; + } + current_grid.sync(); + + if (thread == 0) { + CCL = CL[0]; + CDPI = 0; + newCDPI = size - 1; + entry[CCL] = 0; + + // Edge case -- only one input symbol + CW[CDPI] = 0; + first[CCL] = CW[CDPI] ^ (((H)1 
<< (H)CL[CDPI]) - 1); + entry[CCL + 1] = 1; + } + current_grid.sync(); + + // Initialize first and entry arrays + if (thread < CCL) { + // Initialization of first to Max ensures that unused code + // lengths are skipped over in decoding. + first[i] = std::numeric_limits::max(); + entry[i] = 0; + } + // Initialize first element of entry + current_grid.sync(); + + while (CDPI < size - 1) { + // CDPI update + if (i < size - 1 && CL[i + 1] > CCL) { atomicMin(&newCDPI, i); } + current_grid.sync(); + + // Last element to update + const int updateEnd = (newCDPI >= size - 1) ? type_bw : CL[newCDPI + 1]; + // Fill base + const int curEntryVal = entry[CCL]; + // Number of elements of length CCL + const int numCCL = (newCDPI - CDPI + 1); + + // Get first codeword + if (i == 0) { + if (CDPI == 0) { CW[newCDPI] = 0; } + else { + CW[newCDPI] = CW[CDPI]; // Pre-stored + } + } + current_grid.sync(); + + if (i < size) { + // Parallel canonical codeword generation + if (i >= CDPI && i < newCDPI) { CW[i] = CW[newCDPI] + (newCDPI - i); } + } + + // Update entry and first arrays in O(1) time + if (thread > CCL && thread < updateEnd) { entry[i] = curEntryVal + numCCL; } + // Add number of entries to next CCL + if (thread == 0) { + if (updateEnd < type_bw) { entry[updateEnd] = curEntryVal + numCCL; } + } + current_grid.sync(); + + // Update first array in O(1) time + if (thread == CCL) { + // Flip least significant CL[CDPI] bits + first[CCL] = CW[CDPI] ^ (((H)1 << (H)CL[CDPI]) - 1); + } + if (thread > CCL && thread < updateEnd) { first[i] = std::numeric_limits::max(); } + current_grid.sync(); + + if (thread == 0) { + if (newCDPI < size - 1) { + int CLDiff = CL[newCDPI + 1] - CL[newCDPI]; + // Add and shift -- Next canonical code + CW[newCDPI + 1] = ((CW[CDPI] + 1) << CLDiff); + CCL = CL[newCDPI + 1]; + + ++newCDPI; + } + + // Update CDPI to newCDPI after codeword length increase + CDPI = newCDPI; + newCDPI = size - 1; + } + current_grid.sync(); + } + + if (thread < size) { + /* Make encoded codeword compatible with CUSZ */ + CW[i] = (CW[i] | (((H)CL[i] & (H)0xffu) << ((sizeof(H) * 8) - 8))) ^ (((H)1 << (H)CL[i]) - 1); + } + current_grid.sync(); + + /* Reverse partial codebook */ + if (thread < size / 2) { + H temp = CW[i]; + CW[i] = CW[size - i - 1]; + CW[size - i - 1] = temp; + } +} + +// TODO forceinilne? +// Helper implementations +template +__global__ void par_huffman::detail::GPU_FillArraySequence(T* array, unsigned int size) +{ + unsigned int thread = (blockIdx.x * blockDim.x) + threadIdx.x; + if (thread < size) { array[thread] = thread; } +} + +// Precondition -- Result is preset to be equal to size +template +__global__ void par_huffman::detail::GPU_GetFirstNonzeroIndex(T* array, unsigned int size, unsigned int* result) +{ + unsigned int thread = (blockIdx.x * blockDim.x) + threadIdx.x; + if (array[thread] != 0) { atomicMin(result, thread); } +} + +namespace par_huffman { +namespace detail { +__global__ void GPU_GetMaxCWLength(unsigned int* CL, unsigned int size, unsigned int* result) +{ + (void)size; + unsigned int thread = (blockIdx.x * blockDim.x) + threadIdx.x; + if (thread == 0) { *result = CL[0]; } +} + +} // namespace detail +} // namespace par_huffman + +/** + * @brief Reorders given a set of indices. 
Programmer must ensure that all index[i] + * are unique or else race conditions may occur + * + * @tparam T + * @tparam Q + * @param array e.g., codebook + * @param index e.g., input data + * @param size + * @return __global__ + */ +template +__global__ void par_huffman::detail::GPU_ReorderByIndex(H* array, T* index, unsigned int size) +{ + unsigned int thread = (blockIdx.x * blockDim.x) + threadIdx.x; + H temp; + T newIndex; + if (thread < size) { + temp = array[thread]; + newIndex = index[thread]; + array[(int)newIndex] = temp; + } +} + +// Reverses a given array. +template +__global__ void par_huffman::detail::GPU_ReverseArray(T* array, unsigned int size) +{ + unsigned int thread = (blockIdx.x * blockDim.x) + threadIdx.x; + if (thread < size / 2) { + T temp = array[thread]; + array[thread] = array[size - thread - 1]; + array[size - thread - 1] = temp; + } +} + +// Parallel codebook generation wrapper +template +void asz::hf_buildbook_g( + uint32_t* freq, + int const dict_size, + H* codebook, + uint8_t* reverse_codebook, + int const revbook_nbyte, + float* time_book, + cudaStream_t stream) +{ + // Metadata + auto type_bw = sizeof(H) * 8; + auto _d_first = reinterpret_cast(reverse_codebook); + auto _d_entry = reinterpret_cast(reverse_codebook + (sizeof(H) * type_bw)); + auto _d_qcode = reinterpret_cast(reverse_codebook + (sizeof(H) * 2 * type_bw)); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + // Sort Qcodes by frequency + int nblocks = (dict_size / 1024) + 1; + par_huffman::detail::GPU_FillArraySequence<<>>(_d_qcode, (unsigned int)dict_size); + cudaStreamSynchronize(stream); + + /** + * Originally from par_huffman_sortbyfreq.cu by Cody Rivera (cjrivera1@crimson.ua.edu) + * Sorts quantization codes by frequency, using a key-value sort. This functionality is placed in a separate + * compilation unit as thrust calls fail in par_huffman.cu. 
+ * + * Resolved by + * 1) inlining function + * 2) using `thrust::device_pointer_cast(var)` instead of `thrust::device_pointer(var)` + */ + auto lambda_sort_by_freq = [] __host__(auto freq, auto len, auto qcode) { + thrust::sort_by_key( + thrust::device_pointer_cast(freq), thrust::device_pointer_cast(freq + len), + thrust::device_pointer_cast(qcode)); + }; + + lambda_sort_by_freq(freq, dict_size, _d_qcode); + cudaStreamSynchronize(stream); + + unsigned int* d_first_nonzero_index; + unsigned int first_nonzero_index = dict_size; + cudaMalloc(&d_first_nonzero_index, sizeof(unsigned int)); + cudaMemcpy(d_first_nonzero_index, &first_nonzero_index, sizeof(unsigned int), cudaMemcpyHostToDevice); + par_huffman::detail::GPU_GetFirstNonzeroIndex + <<>>(freq, dict_size, d_first_nonzero_index); + cudaStreamSynchronize(stream); + cudaMemcpy(&first_nonzero_index, d_first_nonzero_index, sizeof(unsigned int), cudaMemcpyDeviceToHost); + cudaFree(d_first_nonzero_index); + + int nz_dict_size = dict_size - first_nonzero_index; + unsigned int* _nz_d_freq = freq + first_nonzero_index; + H* _nz_d_codebook = codebook + first_nonzero_index; + int nz_nblocks = (nz_dict_size / 1024) + 1; + + // Memory Allocation -- Perhaps put in another wrapper + // clang-format off + unsigned int *CL = nullptr; + /*unsigned int* lNodesFreq*/ int *lNodesLeader = nullptr; + unsigned int *iNodesFreq = nullptr; int *iNodesLeader = nullptr; + unsigned int *tempFreq = nullptr; int *tempIsLeaf = nullptr; int *tempIndex = nullptr; + unsigned int *copyFreq = nullptr; int *copyIsLeaf = nullptr; int *copyIndex = nullptr; + cudaMalloc(&CL, nz_dict_size * sizeof(unsigned int) ); + cudaMalloc(&lNodesLeader, nz_dict_size * sizeof(int) ); + cudaMalloc(&iNodesFreq, nz_dict_size * sizeof(unsigned int) ); + cudaMalloc(&iNodesLeader, nz_dict_size * sizeof(int) ); + cudaMalloc(&tempFreq, nz_dict_size * sizeof(unsigned int) ); + cudaMalloc(&tempIsLeaf, nz_dict_size * sizeof(int) ); + cudaMalloc(&tempIndex, nz_dict_size * sizeof(int) ); + cudaMalloc(©Freq, nz_dict_size * sizeof(unsigned int) ); + cudaMalloc(©IsLeaf, nz_dict_size * sizeof(int) ); + cudaMalloc(©Index, nz_dict_size * sizeof(int) ); + cudaMemset(CL, 0, nz_dict_size * sizeof(int) ); + // clang-format on + + // Grid configuration for CL -- based on Cooperative Groups + int cg_mblocks; + int cg_blocks_sm; + int device_id; + int mthreads = 32; // 1 warp + cudaDeviceProp deviceProp; + cudaGetDevice(&device_id); + cudaGetDeviceProperties(&deviceProp, device_id); + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &cg_blocks_sm, par_huffman::GPU_GenerateCL, mthreads, 5 * sizeof(int32_t) + 32 * sizeof(int32_t)); + cg_mblocks = deviceProp.multiProcessorCount * cg_blocks_sm; + + int ELTS_PER_SEQ_MERGE = 16; + int mblocks = std::min(cg_mblocks, (nz_dict_size / ELTS_PER_SEQ_MERGE) + 1); + + // Exit if not enough exposed parallelism -- TODO modify kernels so this is unneeded + int tthreads = mthreads * mblocks; + if (tthreads < nz_dict_size) { + cout << LOG_ERR << "Insufficient on-device parallelism to construct a " << nz_dict_size + << " non-zero item codebook" << endl; + cout << LOG_ERR << "Provided parallelism: " << mblocks << " blocks, " << mthreads << " threads, " << tthreads + << " total" << endl + << endl; + // cout << LOG_ERR << "Exiting cuSZ ..." 
<< endl; + throw std::system_error(); + // exit(1); + } + + uint32_t* diagonal_path_intersections; + cudaMalloc(&diagonal_path_intersections, (2 * (mblocks + 1)) * sizeof(uint32_t)); + + // Codebook already init'ed + cudaStreamSynchronize(stream); + + // Call first kernel + // Collect arguments + void* CL_Args[] = {(void*)&_nz_d_freq, (void*)&CL, + (void*)&nz_dict_size, (void*)&_nz_d_freq, + (void*)&lNodesLeader, (void*)&iNodesFreq, + (void*)&iNodesLeader, (void*)&tempFreq, + (void*)&tempIsLeaf, (void*)&tempIndex, + (void*)©Freq, (void*)©IsLeaf, + (void*)©Index, (void*)&diagonal_path_intersections, + (void*)&mblocks, (void*)&mthreads}; + // Cooperative Launch + cudaLaunchCooperativeKernel( + (void*)par_huffman::GPU_GenerateCL, mblocks, mthreads, CL_Args, + 5 * sizeof(int32_t) + 32 * sizeof(int32_t)); + cudaStreamSynchronize(stream); + + // Exits if the highest codeword length is greater than what + // the adaptive representation can handle + // TODO do proper cleanup + + unsigned int* d_max_CL; + unsigned int max_CL; + cudaMalloc(&d_max_CL, sizeof(unsigned int)); + par_huffman::detail::GPU_GetMaxCWLength<<<1, 1>>>(CL, nz_dict_size, d_max_CL); + cudaStreamSynchronize(stream); + cudaMemcpy(&max_CL, d_max_CL, sizeof(unsigned int), cudaMemcpyDeviceToHost); + cudaFree(d_max_CL); + + int max_CW_bits = (sizeof(H) * 8) - 8; + if (max_CL > max_CW_bits) { + cout << LOG_ERR << "Cannot store all Huffman codewords in " << max_CW_bits + 8 << "-bit representation" << endl; + cout << LOG_ERR << "Huffman codeword representation requires at least " << max_CL + 8 + << " bits (longest codeword: " << max_CL << " bits)" << endl; + // cout << LOG_ERR << "(Consider running with -H 8 for 8-byte representation)" << endl << endl; + // cout << LOG_ERR << "Exiting cuSZ ..." << endl; + // exit(1); + throw std::runtime_error("Falling back to 8-byte Codec."); + } + + // Configure CW for 1024 threads/block + int cg_cw_mblocks = (cg_mblocks * mthreads) / 1024; + int cw_mblocks = std::min(cg_cw_mblocks, nz_nblocks); + + // Exit if not enough exposed parallelism -- TODO modify kernels so this is unneeded + int cw_tthreads = cw_mblocks * 1024; + if (cw_tthreads < nz_dict_size) { + cout << LOG_ERR << "Insufficient on-device parallelism to construct a " << nz_dict_size + << " non-zero item codebook" << endl; + cout << LOG_ERR << "Provided parallelism: " << cw_mblocks << " blocks, " << 1024 << " threads, " << cw_tthreads + << " total" << endl + << endl; + // cout << LOG_ERR << "Exiting cuSZ ..." 
<< endl; + // exit(1); + throw std::system_error(); + } + + void* CW_Args[] = { + (void*)&CL, // + (void*)&_nz_d_codebook, // + (void*)&_d_first, // + (void*)&_d_entry, // + (void*)&nz_dict_size}; + + // Call second kernel + cudaLaunchCooperativeKernel( + (void*)par_huffman::GPU_GenerateCW, // + cw_mblocks, // + 1024, // + CW_Args); + cudaStreamSynchronize(stream); + +#ifdef D_DEBUG_PRINT + print_codebook<<<1, 32>>>(codebook, dict_size); // PASS + cudaStreamSynchronize(stream); +#endif + + // Reverse _d_qcode and codebook + par_huffman::detail::GPU_ReverseArray<<>>(codebook, (unsigned int)dict_size); + par_huffman::detail::GPU_ReverseArray<<>>(_d_qcode, (unsigned int)dict_size); + cudaStreamSynchronize(stream); + + par_huffman::detail::GPU_ReorderByIndex<<>>(codebook, _d_qcode, (unsigned int)dict_size); + cudaStreamSynchronize(stream); + + STOP_CUDAEVENT_RECORDING(stream); + TIME_ELAPSED_CUDAEVENT(time_book); + DESTROY_CUDAEVENT_PAIR; + + // Cleanup + cudaFree(CL); + cudaFree(lNodesLeader); + cudaFree(iNodesFreq); + cudaFree(iNodesLeader); + cudaFree(tempFreq); + cudaFree(tempIsLeaf); + cudaFree(tempIndex); + cudaFree(copyFreq); + cudaFree(copyIsLeaf); + cudaFree(copyIndex); + cudaFree(diagonal_path_intersections); + cudaStreamSynchronize(stream); + +#ifdef D_DEBUG_PRINT + print_codebook<<<1, 32>>>(codebook, dict_size); // PASS + cudaStreamSynchronize(stream); +#endif +} + +#endif /* C883A574_4491_40E8_A083_1B6E8FB56670 */ diff --git a/qtensor/compression/cusz/src/hf/detail/hf_codecg.inl b/qtensor/compression/cusz/src/hf/detail/hf_codecg.inl new file mode 100644 index 00000000..04c8883b --- /dev/null +++ b/qtensor/compression/cusz/src/hf/detail/hf_codecg.inl @@ -0,0 +1,296 @@ +/** + * @file codec_huffman.cuh + * @author Jiannan Tian + * @brief Huffman kernel definitions + * @version 0.2 + * @date 2020-02-13 + * (created) 2020-02-02, (rev1) 2021-02-13, (rev2) 2021-12-29 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef CUSZ_KERNEL_CODEC_HUFFMAN_CUH +#define CUSZ_KERNEL_CODEC_HUFFMAN_CUH + +#include +#include +#include +#include +#include +#include + +#include "common.hh" +#include "hf/hf_bookg.hh" +#include "hf/hf_codecg.hh" +#include "hf/hf_struct.h" +#include "utils/cuda_err.cuh" +#include "utils/timer.h" + +#define TIX threadIdx.x +#define BIX blockIdx.x +#define BDX blockDim.x + +#if __has_include() +// #pragma message __FILE__ ": (CUDA 11 onward), cub from system path" +#include +#else +// #pragma message __FILE__ ": (CUDA 10 or earlier), cub from git submodule" +#include "../../third_party/cub/cub/cub.cuh" +#endif + +using BYTE = uint8_t; + +extern __shared__ char __codec_huffman_uninitialized[]; + +struct __helper { + __device__ __forceinline__ static unsigned int local_tid_1() { return threadIdx.x; } + __device__ __forceinline__ static unsigned int global_tid_1() { return blockIdx.x * blockDim.x + threadIdx.x; } + __device__ __forceinline__ static unsigned int block_stride_1() { return blockDim.x; } + __device__ __forceinline__ static unsigned int grid_stride_1() { return blockDim.x * gridDim.x; } + template + __device__ __forceinline__ static unsigned int global_tid() + { + return blockIdx.x * blockDim.x * SEQ + threadIdx.x; + } + template + __device__ __forceinline__ static unsigned int grid_stride() + { + return blockDim.x * gridDim.x * SEQ; + } +}; + +template +__global__ void hf_decode_kernel( + COMPRESSED* compressed, + uint8_t* revbook, + MetadataT* 
par_nbit, + MetadataT* par_entry, + int const revbook_nbyte, + int const sublen, + int const pardeg, + UNCOMPRESSED* out_uncompressed); + +namespace asz { +namespace detail { + +template +__global__ void hf_encode_phase1_fill( + UNCOMPRESSED* in_uncompressed, + size_t const in_uncompressed_len, + ENCODED* in_book, + int const in_booklen, + ENCODED* out_encoded); + +template +__global__ void hf_encode_phase2_deflate( + COMPRESSED* inout_inplace, + size_t const len, + MetadataT* par_nbit, + MetadataT* par_ncell, + int const sublen, + int const pardeg); + +template +__global__ void +hf_encode_phase4_concatenate(Huff* gapped, Meta* par_entry, Meta* par_ncell, int const cfg_sublen, Huff* non_gapped); + +// TODO change size_t to unsigned int +template +__device__ void +hf_decode_single_thread_inflate(COMPRESSED* input, UNCOMPRESSED* out, int const total_bw, BYTE* revbook); + +} // namespace detail +} // namespace asz + +// TODO change size_t to unsigned int +template +__device__ void +asz::detail::hf_decode_single_thread_inflate(COMPRESSED* input, UNCOMPRESSED* out, int const total_bw, BYTE* revbook) +{ + static const auto DTYPE_WIDTH = sizeof(COMPRESSED) * 8; + + int next_bit; + auto idx_bit = 0; + auto idx_byte = 0; + auto idx_out = 0; + + COMPRESSED bufr = input[idx_byte]; + + auto first = reinterpret_cast(revbook); + auto entry = first + DTYPE_WIDTH; + auto keys = reinterpret_cast(revbook + sizeof(COMPRESSED) * (2 * DTYPE_WIDTH)); + COMPRESSED v = (bufr >> (DTYPE_WIDTH - 1)) & 0x1; // get the first bit + auto l = 1; + auto i = 0; + + while (i < total_bw) { + while (v < first[l]) { // append next i_cb bit + ++i; + idx_byte = i / DTYPE_WIDTH; // [1:exclusive] + idx_bit = i % DTYPE_WIDTH; + if (idx_bit == 0) { + // idx_byte += 1; // [1:exclusive] + bufr = input[idx_byte]; + } + + next_bit = ((bufr >> (DTYPE_WIDTH - 1 - idx_bit)) & 0x1); + v = (v << 1) | next_bit; + ++l; + } + out[idx_out++] = keys[entry[l] + v - first[l]]; + { + ++i; + idx_byte = i / DTYPE_WIDTH; // [2:exclusive] + idx_bit = i % DTYPE_WIDTH; + if (idx_bit == 0) { + // idx_byte += 1; // [2:exclusive] + bufr = input[idx_byte]; + } + + next_bit = ((bufr >> (DTYPE_WIDTH - 1 - idx_bit)) & 0x1); + v = 0x0 | next_bit; + } + l = 1; + } +} + +template +__global__ void asz::detail::hf_encode_phase1_fill( + UNCOMPRESSED* in_uncompressed, + size_t const in_uncompressed_len, + ENCODED* in_book, + int const in_booklen, + ENCODED* out_encoded) +{ + auto shmem_cb = reinterpret_cast(__codec_huffman_uninitialized); + + // load from global memory + for (auto idx = __helper::local_tid_1(); // + idx < in_booklen; // + idx += __helper::block_stride_1()) + shmem_cb[idx] = in_book[idx]; + + __syncthreads(); + + for (auto idx = __helper::global_tid_1(); // + idx < in_uncompressed_len; // + idx += __helper::grid_stride_1() // + ) + out_encoded[idx] = shmem_cb[(int)in_uncompressed[idx]]; +} + +template +__global__ void asz::detail::hf_encode_phase2_deflate( + COMPRESSED* inout_inplace, + size_t const len, + MetadataT* par_nbit, + MetadataT* par_ncell, + int const sublen, + int const pardeg) +{ + constexpr int CELL_BITWIDTH = sizeof(COMPRESSED) * 8; + + auto tid = BIX * BDX + TIX; + + if (tid * sublen < len) { + int residue_bits = CELL_BITWIDTH; + int total_bits = 0; + COMPRESSED* ptr = inout_inplace + tid * sublen; + COMPRESSED bufr; + uint8_t word_width; + + auto did = tid * sublen; + for (auto i = 0; i < sublen; i++, did++) { + if (did == len) break; + + COMPRESSED packed_word = inout_inplace[tid * sublen + i]; + auto word_ptr = 
reinterpret_cast*>(&packed_word); + word_width = word_ptr->bits; + word_ptr->bits = (uint8_t)0x0; + + if (residue_bits == CELL_BITWIDTH) { // a new unit of compact format + bufr = 0x0; + } + //////////////////////////////////////////////////////////////// + + if (word_width <= residue_bits) { + residue_bits -= word_width; + bufr |= packed_word << residue_bits; + + if (residue_bits == 0) { + residue_bits = CELL_BITWIDTH; + *(ptr++) = bufr; + } + } + else { + // example: we have 5-bit code 11111 but 3 bits available in (*ptr) + // 11111 for the residue 3 bits in (*ptr); 11111 for 2 bits of (*(++ptr)), starting with MSB + // ^^^ ^^ + auto l_bits = word_width - residue_bits; + auto r_bits = CELL_BITWIDTH - l_bits; + + bufr |= packed_word >> l_bits; + *(ptr++) = bufr; + bufr = packed_word << r_bits; + + residue_bits = r_bits; + } + total_bits += word_width; + } + *ptr = bufr; // manage the last unit + + par_nbit[tid] = total_bits; + par_ncell[tid] = (total_bits + CELL_BITWIDTH - 1) / CELL_BITWIDTH; + } +} + +template +__global__ void asz::detail::hf_encode_phase4_concatenate( + Huff* gapped, + Meta* par_entry, + Meta* par_ncell, + int const cfg_sublen, + Huff* non_gapped) +{ + auto n = par_ncell[blockIdx.x]; + auto src = gapped + cfg_sublen * blockIdx.x; + auto dst = non_gapped + par_entry[blockIdx.x]; + + for (auto i = threadIdx.x; i < n; i += blockDim.x) { // block-stride + dst[i] = src[i]; + } +} + +template +__global__ void hf_decode_kernel( + COMPRESSED* compressed, + uint8_t* revbook, + MetadataT* par_nbit, + MetadataT* par_entry, + int const revbook_nbyte, + int const sublen, + int const pardeg, + UNCOMPRESSED* out_uncompressed) +{ + extern __shared__ uint8_t shmem[]; + constexpr auto block_dim = HuffmanHelper::BLOCK_DIM_DEFLATE; + + auto R = (revbook_nbyte - 1 + block_dim) / block_dim; + + for (auto i = 0; i < R; i++) { + if (TIX + i * block_dim < revbook_nbyte) shmem[TIX + i * block_dim] = revbook[TIX + i * block_dim]; + } + __syncthreads(); + + auto gid = BIX * BDX + TIX; + + if (gid < pardeg) { + asz::detail::hf_decode_single_thread_inflate( + compressed + par_entry[gid], out_uncompressed + sublen * gid, par_nbit[gid], shmem); + __syncthreads(); + } +} + +#endif diff --git a/qtensor/compression/cusz/src/hf/detail/hf_pimpl.inl b/qtensor/compression/cusz/src/hf/detail/hf_pimpl.inl new file mode 100644 index 00000000..7a330ba6 --- /dev/null +++ b/qtensor/compression/cusz/src/hf/detail/hf_pimpl.inl @@ -0,0 +1,364 @@ +/** + * @file huffman_coarse.cuh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2021-12-17 + * (created) 2020-04-24 (rev1) 2021-09-05 (rev2) 2021-12-29 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * @copyright (C) 2021 by Washington State University, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef CUSZ_COMPONENT_HUFFMAN_COARSE_CUH +#define CUSZ_COMPONENT_HUFFMAN_COARSE_CUH + +#include +// #include +// #include +// #include +// #include +#include +#include +// #include + +using std::cout; + +#include "common/definition.hh" +#include "common/type_traits.hh" +#include "utils.hh" + +#include "hf/hf.hh" +#include "hf/hf_bookg.hh" +#include "hf/hf_codecg.hh" + +/****************************************************************************** + macros for shorthand writing + ******************************************************************************/ + +#define EXPORT_NBYTE(FIELD) nbyte[Header::FIELD] = rte.nbyte[RTE::FIELD]; + +#define 
DEVICE2DEVICE_COPY(VAR, FIELD) \ + { \ + constexpr auto D2D = cudaMemcpyDeviceToDevice; \ + auto dst = d_compressed + header.entry[Header::FIELD]; \ + auto src = reinterpret_cast(d_##VAR); \ + CHECK_CUDA(cudaMemcpyAsync(dst, src, nbyte[Header::FIELD], D2D, stream)); \ + } + +#define ACCESSOR(SYM, TYPE) reinterpret_cast(in_compressed + header.entry[Header::SYM]) + +#define HC_ALLOCHOST(VAR, SYM) \ + cudaMallocHost(&h_##VAR, rte.nbyte[RTE::SYM]); \ + memset(h_##VAR, 0x0, rte.nbyte[RTE::SYM]); + +#define HC_ALLOCDEV(VAR, SYM) \ + cudaMalloc(&d_##VAR, rte.nbyte[RTE::SYM]); \ + cudaMemset(d_##VAR, 0x0, rte.nbyte[RTE::SYM]); + +#define HC_FREEHOST(VAR) \ + if (h_##VAR) { \ + cudaFreeHost(h_##VAR); \ + h_##VAR = nullptr; \ + } + +#define HC_FREEDEV(VAR) \ + if (d_##VAR) { \ + cudaFree(d_##VAR); \ + d_##VAR = nullptr; \ + } + +/****************************************************************************** + class definition + ******************************************************************************/ + +#define TEMPLATE_TYPE template +#define IMPL LosslessCodec::impl + +namespace cusz { + +TEMPLATE_TYPE +IMPL::~impl() +{ + HC_FREEDEV(tmp); + HC_FREEDEV(book); + HC_FREEDEV(revbook); + HC_FREEDEV(par_nbit); + HC_FREEDEV(par_ncell); + HC_FREEDEV(par_entry); + HC_FREEDEV(bitstream); + + HC_FREEHOST(book); + HC_FREEHOST(revbook); + HC_FREEHOST(par_nbit); + HC_FREEHOST(par_ncell); + HC_FREEHOST(par_entry); +} + +TEMPLATE_TYPE +IMPL::impl() = default; + +//------------------------------------------------------------------------------ + +TEMPLATE_TYPE +void IMPL::init(size_t const in_uncompressed_len, int const booklen, int const pardeg, bool dbg_print) +{ + auto max_compressed_bytes = [&]() { return in_uncompressed_len / 2 * sizeof(H); }; + + auto debug = [&]() { + setlocale(LC_NUMERIC, ""); + printf("\nHuffmanCoarse::init() debugging:\n"); + printf("CUdeviceptr nbyte: %d\n", (int)sizeof(CUdeviceptr)); + dbg_println("TMP", d_tmp, RTE::TMP); + dbg_println("BOOK", d_book, RTE::BOOK); + dbg_println("REVBOOK", d_revbook, RTE::REVBOOK); + dbg_println("PAR_NBIT", d_par_nbit, RTE::PAR_NBIT); + dbg_println("PAR_NCELL", d_par_ncell, RTE::PAR_NCELL); + dbg_println("BITSTREAM", d_bitstream, RTE::BITSTREAM); + printf("\n"); + }; + + memset(rte.nbyte, 0, sizeof(uint32_t) * RTE::END); + // memset(rte.entry, 0, sizeof(uint32_t) * (RTE::END + 1)); + + rte.nbyte[RTE::TMP] = sizeof(H) * in_uncompressed_len; + rte.nbyte[RTE::BOOK] = sizeof(H) * booklen; + rte.nbyte[RTE::REVBOOK] = get_revbook_nbyte(booklen); + rte.nbyte[RTE::PAR_NBIT] = sizeof(M) * pardeg; + rte.nbyte[RTE::PAR_NCELL] = sizeof(M) * pardeg; + rte.nbyte[RTE::PAR_ENTRY] = sizeof(M) * pardeg; + rte.nbyte[RTE::BITSTREAM] = max_compressed_bytes(); + + HC_ALLOCDEV(tmp, TMP); + + { + auto total_bytes = rte.nbyte[RTE::BOOK] + rte.nbyte[RTE::REVBOOK]; + cudaMalloc(&d_book, total_bytes); + cudaMemset(d_book, 0x0, total_bytes); + + d_revbook = reinterpret_cast(d_book + booklen); + } + + { + cudaMalloc(&d_par_metadata, rte.nbyte[RTE::PAR_NBIT] * 3); + cudaMemset(d_par_metadata, 0x0, rte.nbyte[RTE::PAR_NBIT] * 3); + + d_par_nbit = d_par_metadata; + d_par_ncell = d_par_metadata + pardeg; + d_par_entry = d_par_metadata + pardeg * 2; + } + + HC_ALLOCDEV(bitstream, BITSTREAM); + + // standalone definition for output + d_compressed = reinterpret_cast(d_tmp); + + HC_ALLOCHOST(book, BOOK); + HC_ALLOCHOST(revbook, REVBOOK); + + { + cudaMallocHost(&h_par_metadata, rte.nbyte[RTE::PAR_NBIT] * 3); + // cudaMemset(h_par_nbit, 0x0, rte.nbyte[RTE::PAR_NBIT] * 3); + + h_par_nbit = 
h_par_metadata; + h_par_ncell = h_par_metadata + pardeg; + h_par_entry = h_par_metadata + pardeg * 2; + } + + int numSMs; + cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, 0); + + int sublen = (in_uncompressed_len - 1) / pardeg + 1; + + book_desc = new hf_book{nullptr, d_book, booklen}; + chunk_desc_d = new hf_chunk{d_par_nbit, d_par_ncell, d_par_entry}; + chunk_desc_h = new hf_chunk{h_par_nbit, h_par_ncell, h_par_entry}; + bitstream_desc = new hf_bitstream{d_tmp, d_bitstream, chunk_desc_d, chunk_desc_h, sublen, pardeg, numSMs}; + + if (dbg_print) debug(); +} + +TEMPLATE_TYPE +void IMPL::build_codebook(cusz::FREQ* freq, int const booklen, cudaStream_t stream) +{ + book_desc->freq = freq; + asz::hf_buildbook_g(freq, booklen, d_book, d_revbook, get_revbook_nbyte(booklen), &time_book, stream); +} + +TEMPLATE_TYPE +void IMPL::encode( + T* in_uncompressed, + size_t const in_uncompressed_len, + BYTE*& out_compressed, + size_t& out_compressed_len, + cudaStream_t stream) +{ + time_lossless = 0; + + struct Header header; + + asz::hf_encode_coarse_rev1( + in_uncompressed, in_uncompressed_len, // + book_desc, bitstream_desc, // + out_compressed, out_compressed_len, time_lossless, stream); + + header.total_nbit = + std::accumulate((M*)chunk_desc_h->bits, (M*)chunk_desc_h->bits + bitstream_desc->pardeg, (size_t)0); + header.total_ncell = + std::accumulate((M*)chunk_desc_h->cells, (M*)chunk_desc_h->cells + bitstream_desc->pardeg, (size_t)0); + // update with the precise BITSTREAM nbyte + rte.nbyte[RTE::BITSTREAM] = sizeof(H) * header.total_ncell; + + // d_revbook and revbook_nbyte is hidden; need to improve here + subfile_collect( + header, in_uncompressed_len, book_desc->booklen, bitstream_desc->sublen, bitstream_desc->pardeg, stream); + + out_compressed = d_compressed; + out_compressed_len = header.subfile_size(); +} + +TEMPLATE_TYPE +void IMPL::decode(BYTE* in_compressed, T* out_decompressed, cudaStream_t stream, bool header_on_device) +{ + Header header; + if (header_on_device) + CHECK_CUDA(cudaMemcpyAsync(&header, in_compressed, sizeof(header), cudaMemcpyDeviceToHost, stream)); + + auto d_revbook = ACCESSOR(REVBOOK, BYTE); + auto d_par_nbit = ACCESSOR(PAR_NBIT, M); + auto d_par_entry = ACCESSOR(PAR_ENTRY, M); + auto d_bitstream = ACCESSOR(BITSTREAM, H); + + auto const revbook_nbyte = get_revbook_nbyte(header.booklen); + + // launch_coarse_grained_Huffman_decoding( + asz::hf_decode_coarse( + d_bitstream, d_revbook, revbook_nbyte, d_par_nbit, d_par_entry, header.sublen, header.pardeg, out_decompressed, + time_lossless, stream); +} + +TEMPLATE_TYPE +void IMPL::clear_buffer() +{ + cudaMemset(d_tmp, 0x0, rte.nbyte[RTE::TMP]); + cudaMemset(d_book, 0x0, rte.nbyte[RTE::BOOK]); + cudaMemset(d_revbook, 0x0, rte.nbyte[RTE::REVBOOK]); + cudaMemset(d_par_nbit, 0x0, rte.nbyte[RTE::PAR_NBIT]); + cudaMemset(d_par_ncell, 0x0, rte.nbyte[RTE::PAR_NCELL]); + cudaMemset(d_par_entry, 0x0, rte.nbyte[RTE::PAR_ENTRY]); + cudaMemset(d_bitstream, 0x0, rte.nbyte[RTE::BITSTREAM]); +} + +// private helper +TEMPLATE_TYPE +void IMPL::subfile_collect( + Header& header, + size_t const in_uncompressed_len, + int const booklen, + int const sublen, + int const pardeg, + cudaStream_t stream) +{ + auto BARRIER = [&]() { + if (stream) + CHECK_CUDA(cudaStreamSynchronize(stream)); + else + CHECK_CUDA(cudaDeviceSynchronize()); + }; + + header.self_bytes = sizeof(Header); + header.booklen = booklen; + header.sublen = sublen; + header.pardeg = pardeg; + header.uncompressed_len = in_uncompressed_len; + + MetadataT 
nbyte[Header::END]; + nbyte[Header::HEADER] = sizeof(Header); + + EXPORT_NBYTE(REVBOOK) + EXPORT_NBYTE(PAR_NBIT) + EXPORT_NBYTE(PAR_ENTRY) + EXPORT_NBYTE(BITSTREAM) + + header.entry[0] = 0; + // *.END + 1: need to know the ending position + for (auto i = 1; i < Header::END + 1; i++) { header.entry[i] = nbyte[i - 1]; } + for (auto i = 1; i < Header::END + 1; i++) { header.entry[i] += header.entry[i - 1]; } + + // auto debug_header_entry = [&]() { + // for (auto i = 0; i < Header::END + 1; i++) printf("%d, header entry: %d\n", i, header.entry[i]); + // }; + // debug_header_entry(); + + CHECK_CUDA(cudaMemcpyAsync(d_compressed, &header, sizeof(header), cudaMemcpyHostToDevice, stream)); + + /* debug */ BARRIER(); + + DEVICE2DEVICE_COPY(revbook, REVBOOK) + DEVICE2DEVICE_COPY(par_nbit, PAR_NBIT) + DEVICE2DEVICE_COPY(par_entry, PAR_ENTRY) + DEVICE2DEVICE_COPY(bitstream, BITSTREAM) +} + +// getter +TEMPLATE_TYPE +float IMPL::get_time_elapsed() const { return milliseconds; } + +TEMPLATE_TYPE +float IMPL::get_time_book() const { return time_book; } +TEMPLATE_TYPE +float IMPL::get_time_lossless() const { return time_lossless; } + +TEMPLATE_TYPE +H* IMPL::expose_book() const { return d_book; } + +TEMPLATE_TYPE +BYTE* IMPL::expose_revbook() const { return d_revbook; } + +// TODO this kind of space will be overlapping with quant-codes +TEMPLATE_TYPE +size_t IMPL::get_workspace_nbyte(size_t len) const { return sizeof(H) * len; } + +TEMPLATE_TYPE +size_t IMPL::get_max_output_nbyte(size_t len) const { return sizeof(H) * len / 2; } + +TEMPLATE_TYPE +size_t IMPL::get_revbook_nbyte(int dict_size) { return sizeof(BOOK) * (2 * CELL_BITWIDTH) + sizeof(SYM) * dict_size; } + +TEMPLATE_TYPE +constexpr bool IMPL::can_overlap_input_and_firstphase_encode() { return sizeof(T) == sizeof(H); } + +// auxiliary +TEMPLATE_TYPE +void IMPL::dbg_println(const std::string SYM_name, void* VAR, int SYM) +{ + CUdeviceptr pbase0{0}; + size_t psize0{0}; + + cuMemGetAddressRange(&pbase0, &psize0, (CUdeviceptr)VAR); + printf( + "%s:\n" + "\t(supposed) pointer : %p\n" + "\t(supposed) bytes : %'9lu\n" + "\t(queried) pbase0 : %p\n" + "\t(queried) psize0 : %'9lu\n", + SYM_name.c_str(), (void*)VAR, (size_t)rte.nbyte[SYM], (void*)&pbase0, psize0); + pbase0 = 0, psize0 = 0; +} + +} // namespace cusz + +#undef HC_ALLOCDEV +#undef HC_ALLOCHOST +#undef HC_FREEDEV +#undef HC_FREEHOST +#undef EXPORT_NBYTE +#undef ACCESSOR +#undef DEVICE2DEVICE_COPY + +#undef TEMPLATE_TYPE +#undef IMPL + +#endif diff --git a/qtensor/compression/cusz/src/hf/detail/par_merge.inl b/qtensor/compression/cusz/src/hf/detail/par_merge.inl new file mode 100644 index 00000000..6e934a08 --- /dev/null +++ b/qtensor/compression/cusz/src/hf/detail/par_merge.inl @@ -0,0 +1,445 @@ +/* + * Authors: + * Oded Green (ogreen@gatech.edu), Rob McColl (robert.c.mccoll@gmail.com) + * High Performance Computing Lab, Georgia Tech + * + * Future Publication: + * GPU MergePath: A GPU Merging Algorithm + * ACM International Conference on Supercomputing 2012 + * June 25-29 2012, San Servolo, Venice, Italy + * + * (C) 2012 Georgia Institute of Technology + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * - Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * - Neither the name of the Georgia Institute of Technology nor the names of + * its contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * @file par_merge.h + * @author Oded Green (ogreen@gatech.edu), Rob McColl (robert.c.mccoll@gmail.com)) + * @brief Modified and adapted by Cody Rivera + * @version 0.3 + * @date 2020-10-24 + * (created) 2020-06 (rev) 2021-06-21 + * + */ + +#ifndef CUSZ_KERNEL_PAR_MERGE_CUH +#define CUSZ_KERNEL_PAR_MERGE_CUH + +#include +#include +#include +#include +#include +#include + +#include +namespace cg = cooperative_groups; + +#define MAX(X, Y) (((X) > (Y)) ? (X) : (Y)) +#define MIN(X, Y) (((X) < (Y)) ? (X) : (Y)) +// Mathematically correct modulo +#define MOD(a, b) ((((a) % (b)) + (b)) % (b)) + +/* MERGETYPE + * Performs merges of two sorted pseudorandom arrays of length + * Times the runs and reports on the average time + * Checks the output of each merge for correctness + */ +#define PADDING 1024 + +/******************************************************************************** + * signature + ********************************************************************************/ + +// Partition array +template +__device__ void cudaWorkloadDiagonals( + F* copyFreq, + int* copyIndex, + int* copyIsLeaf, + int cStart, + int cEnd, + F* iNodesFreq, + int iStart, + int iEnd, + int iNodesCap, + uint32_t* diagonal_path_intersections, + /* Shared Memory */ + int32_t& x_top, + int32_t& y_top, + int32_t& x_bottom, + int32_t& y_bottom, + int32_t& found, + int32_t* oneorzero); + +// Merge partitions +template +__device__ void cudaMergeSinglePath( + F* copyFreq, + int* copyIndex, + int* copyIsLeaf, + int cStart, + int cEnd, + F* iNodesFreq, + int iStart, + int iEnd, + int iNodesCap, + uint32_t* diagonal_path_intersections, + F* tempFreq, + int* tempIndex, + int* tempIsLeaf, + int tempLength); + +template +__device__ void parMerge( + F* copyFreq, + int* copyIndex, + int* copyIsLeaf, + int cStart, + int cEnd, + F* iNodesFreq, + int iStart, + int iEnd, + int iNodesCap, + F* tempFreq, + int* tempIndex, + int* tempIsLeaf, + int& tempLength, + uint32_t* diagonal_path_intersections, + int blocks, + int threads, + /* Shared Memory */ + int32_t& x_top, + int32_t& y_top, + int32_t& x_bottom, + int32_t& y_bottom, + int32_t& found, + int32_t* oneorzero); + +template +__device__ void merge( + F* copyFreq, + int* copyIndex, + int* copyIsLeaf, + int cStart, + int cEnd, + F* iNodesFreq, + int iStart, 
+ int iEnd, + int iNodesCap, + F* tempFreq, + int* tempIndex, + int* tempIsLeaf, + int& tempLength); + +/******************************************************************************** + * definition + ********************************************************************************/ + +// clang-format off +template +__device__ void parMerge( + F* copyFreq, int* copyIndex, int* copyIsLeaf, int cStart, int cEnd, + F* iNodesFreq, int iStart, int iEnd, int iNodesCap, + F* tempFreq, int* tempIndex, int* tempIsLeaf, int& tempLength, + uint32_t* diagonal_path_intersections, int blocks, int threads, + /* Shared Memory */ + int32_t& x_top, int32_t& y_top, int32_t& x_bottom, int32_t& y_bottom, + int32_t& found, int32_t* oneorzero) + { + // clang-format on + auto current_grid = cg::this_grid(); + current_grid.sync(); + tempLength = (cEnd - cStart) + MOD(iEnd - iStart, iNodesCap); + + if (tempLength == 0) return; + + // Perform the global diagonal intersection serach to divide work among SMs + cudaWorkloadDiagonals( + copyFreq, copyIndex, copyIsLeaf, cStart, cEnd, // + iNodesFreq, iStart, iEnd, iNodesCap, // + diagonal_path_intersections, // + x_top, y_top, x_bottom, y_bottom, found, oneorzero); + current_grid.sync(); + + // Merge between global diagonals independently on each block + cudaMergeSinglePath( + copyFreq, copyIndex, copyIsLeaf, cStart, cEnd, // + iNodesFreq, iStart, iEnd, iNodesCap, // + diagonal_path_intersections, // + tempFreq, tempIndex, tempIsLeaf, tempLength); + current_grid.sync(); +} + +/* CUDAWORKLOADDIAGONALS + * Performs a 32-wide binary search on one glboal diagonal per block to find the intersection with the path. + * This divides the workload into independent merges for the next step + */ +// clang-format off +template +__device__ void cudaWorkloadDiagonals( + F* copyFreq, int* copyIndex, int* copyIsLeaf, + int cStart, int cEnd, + F* iNodesFreq, + int iStart, int iEnd, int iNodesCap, + uint32_t* diagonal_path_intersections, + /* Shared Memory */ + int32_t& x_top, int32_t& y_top, int32_t& x_bottom, int32_t& y_bottom, + int32_t& found, int32_t* oneorzero) +{ + // clang-format on + uint32_t A_length = cEnd - cStart; + uint32_t B_length = MOD(iEnd - iStart, iNodesCap); + // Calculate combined index around the MergePath "matrix" + int32_t combinedIndex = ((uint64_t)blockIdx.x * ((uint64_t)A_length + (uint64_t)B_length)) / (uint64_t)gridDim.x; + /* + __shared__ int32_t x_top, y_top, x_bottom, y_bottom, found; + __shared__ int32_t oneorzero[32]; + */ + int threadOffset = threadIdx.x - 16; + + if (threadIdx.x < 32) { + // Figure out the coordinates of our diagonal + if (A_length >= B_length) { + x_top = MIN(combinedIndex, A_length); + y_top = combinedIndex > A_length ? combinedIndex - (A_length) : 0; + x_bottom = y_top; + y_bottom = x_top; + } + else { + y_bottom = MIN(combinedIndex, B_length); + x_bottom = combinedIndex > B_length ? 
combinedIndex - (B_length) : 0; + y_top = x_bottom; + x_top = y_bottom; + } + } + + // if (threadIdx.x == 0) { + // printf("Diagonal block %d: (%d, %d) to (%d, %d)\n", blockIdx.x, x_top, y_top, x_bottom, y_bottom); + //} + + found = 0; + + // Search the diagonal + while (!found) { + // Update our coordinates within the 32-wide section of the diagonal + int32_t current_x = x_top - ((x_top - x_bottom) >> 1) - threadOffset; + int32_t current_y = y_top + ((y_bottom - y_top) >> 1) + threadOffset; + int32_t getfrom_x = current_x + cStart - 1; + // Below statement is a more efficient, divmodless version of the following + // int32_t getfrom_y = MOD(iStart + current_y, iNodesCap); + int32_t getfrom_y = iStart + current_y; + + if (threadIdx.x < 32) { + if (getfrom_y >= iNodesCap) getfrom_y -= iNodesCap; + + // Are we a '1' or '0' with respect to A[x] <= B[x] + if (current_x > (int32_t)A_length or current_y < 0) { oneorzero[threadIdx.x] = 0; } + else if (current_y >= (int32_t)B_length || current_x < 1) { + oneorzero[threadIdx.x] = 1; + } + else { + oneorzero[threadIdx.x] = (copyFreq[getfrom_x] <= iNodesFreq[getfrom_y]) ? 1 : 0; + } + } + + __syncthreads(); + + // If we find the meeting of the '1's and '0's, we found the + // intersection of the path and diagonal + if (threadIdx.x > 0 and // + threadIdx.x < 32 and // + (oneorzero[threadIdx.x] != oneorzero[threadIdx.x - 1]) // + ) { + found = 1; + + diagonal_path_intersections[blockIdx.x] = current_x; + diagonal_path_intersections[blockIdx.x + gridDim.x + 1] = current_y; + } + + __syncthreads(); + + // Adjust the search window on the diagonal + if (threadIdx.x == 16) { + if (oneorzero[31] != 0) { + x_bottom = current_x; + y_bottom = current_y; + } + else { + x_top = current_x; + y_top = current_y; + } + } + __syncthreads(); + } + + // Set the boundary diagonals (through 0,0 and A_length,B_length) + if (threadIdx.x == 0 && blockIdx.x == 0) { + diagonal_path_intersections[0] = 0; + diagonal_path_intersections[gridDim.x + 1] = 0; + diagonal_path_intersections[gridDim.x] = A_length; + diagonal_path_intersections[gridDim.x + gridDim.x + 1] = B_length; + } +} + +// Serial merge +// clang-format off +template +__device__ void merge( + F* copyFreq, int* copyIndex, int* copyIsLeaf, int cStart, int cEnd, + F* iNodesFreq, int iStart, int iEnd, int iNodesCap, + F* tempFreq, int* tempIndex, int* tempIsLeaf, int& tempLength) +{ + // clang-format on + int len = 0; + int iterCopy = cStart, iterINodes = iStart; + + while (iterCopy < cEnd && MOD(iEnd - iterINodes, iNodesCap) > 0) { + if (copyFreq[iterCopy] <= iNodesFreq[iterINodes]) { + tempFreq[len] = copyFreq[iterCopy]; + tempIndex[len] = copyIndex[iterCopy]; + tempIsLeaf[len] = copyIsLeaf[iterCopy]; + ++iterCopy; + } + else { + tempFreq[len] = iNodesFreq[iterINodes]; + tempIndex[len] = iterINodes; + tempIsLeaf[len] = 0; + iterINodes = MOD(iterINodes + 1, iNodesCap); + } + ++len; + } + + while (iterCopy < cEnd) { + tempFreq[len] = copyFreq[iterCopy]; + tempIndex[len] = copyIndex[iterCopy]; + tempIsLeaf[len] = copyIsLeaf[iterCopy]; + ++iterCopy; + ++len; + } + while (MOD(iEnd - iterINodes, iNodesCap) > 0) { + tempFreq[len] = iNodesFreq[iterINodes]; + tempIndex[len] = iterINodes; + tempIsLeaf[len] = 0; + iterINodes = MOD(iterINodes + 1, iNodesCap); + ++len; + } + + tempLength = len; +} + +/* CUDAMERGESINGLEPATH + * Performs merge windows within a thread block from that block's global diagonal + * intersection to the next + */ +#define K 512 +#define PAD_SIZE 0 + +// clang-format off +template +__device__ void 
cudaMergeSinglePath( + F* copyFreq, int* copyIndex, int* copyIsLeaf, + int cStart, int cEnd, + F* iNodesFreq, + int iStart, int iEnd, int iNodesCap, + uint32_t* diagonal_path_intersections, + F* tempFreq, int* tempIndex, int* tempIsLeaf, + int tempLength) +{ + // clang-format on + // Temporary Code -- Serial Merge Per Block + if (threadIdx.x == 0) { + // Boundaries + int x_block_top = diagonal_path_intersections[blockIdx.x]; + int y_block_top = diagonal_path_intersections[blockIdx.x + gridDim.x + 1]; + int x_block_stop = diagonal_path_intersections[blockIdx.x + 1]; + int y_block_stop = diagonal_path_intersections[blockIdx.x + gridDim.x + 2]; + + // Actual indexes + int x_start = x_block_top + cStart; + int x_end = x_block_stop + cStart; + int y_start = MOD(iStart + y_block_top, iNodesCap); + int y_end = MOD(iStart + y_block_stop, iNodesCap); + + int offset = x_block_top + y_block_top; + + int dummy; // Unused result + // TODO optimize serial merging of each partition + merge( + copyFreq, copyIndex, copyIsLeaf, x_start, x_end, // + iNodesFreq, y_start, y_end, iNodesCap, // + tempFreq + offset, tempIndex + offset, tempIsLeaf + offset, dummy); + if (0) { + printf( + "block: %d x: %d %d, y: %d %d, contrib: %d\n", blockIdx.x, x_block_top, x_block_stop, y_block_top, + y_block_stop, dummy); + } + } +} + +// `unsigned int` instantiations +template __device__ void parMerge( + unsigned int* copyFreq, + int* copyIndex, + int* copyIsLeaf, + int cStart, + int cEnd, + unsigned int* iNodesFreq, + int iStart, + int iEnd, + int iNodesCap, + unsigned int* tempFreq, + int* tempIndex, + int* tempIsLeaf, + int& tempLength, + uint32_t* diagonal_path_intersections, + int blocks, + int threads, + /* Shared Memory */ + int32_t& x_top, + int32_t& y_top, + int32_t& x_bottom, + int32_t& y_bottom, + int32_t& found, + int32_t* oneorzero); + +template __device__ void merge( + unsigned int* copyFreq, + int* copyIndex, + int* copyIsLeaf, + int cStart, + int cEnd, + unsigned int* iNodesFreq, + int iStart, + int iEnd, + int iNodesCap, + unsigned int* tempFreq, + int* tempIndex, + int* tempIsLeaf, + int& tempLength); + +#endif \ No newline at end of file diff --git a/qtensor/compression/cusz/src/hf/hf.cc b/qtensor/compression/cusz/src/hf/hf.cc new file mode 100644 index 00000000..19387263 --- /dev/null +++ b/qtensor/compression/cusz/src/hf/hf.cc @@ -0,0 +1,109 @@ +/** + * @file codec.cc + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-23 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#include "common/type_traits.hh" + +#include "hf/hf.hh" +#include "hf/hf_bookg.hh" +#include "hf/hf_codecg.hh" + +namespace cusz { + +#define TEMPLATE_TYPE template +#define HUFFMAN_COARSE LosslessCodec + +TEMPLATE_TYPE +HUFFMAN_COARSE::~LosslessCodec() { pimpl.reset(); } + +TEMPLATE_TYPE +HUFFMAN_COARSE::LosslessCodec() : pimpl{std::make_unique()} {} + +TEMPLATE_TYPE +HUFFMAN_COARSE::LosslessCodec(const HUFFMAN_COARSE& old) : pimpl{std::make_unique(*old.pimpl)} +{ + // TODO allocation/deep copy +} + +TEMPLATE_TYPE +HUFFMAN_COARSE& HUFFMAN_COARSE::operator=(const HUFFMAN_COARSE& old) +{ + *pimpl = *old.pimpl; + // TODO allocation/deep copy + return *this; +} + +TEMPLATE_TYPE +HUFFMAN_COARSE::LosslessCodec(HUFFMAN_COARSE&&) = default; + +TEMPLATE_TYPE +HUFFMAN_COARSE& HUFFMAN_COARSE::operator=(HUFFMAN_COARSE&&) = default; + +//------------------------------------------------------------------------------ + +TEMPLATE_TYPE +void HUFFMAN_COARSE::init(size_t const in_uncompressed_len, 
int const booklen, int const pardeg, bool dbg_print) +{ + pimpl->init(in_uncompressed_len, booklen, pardeg, dbg_print); +} + +TEMPLATE_TYPE +void HUFFMAN_COARSE::build_codebook(uint32_t* freq, int const booklen, cudaStream_t stream) +{ + pimpl->build_codebook(freq, booklen, stream); +} + +TEMPLATE_TYPE +void HUFFMAN_COARSE::encode( + T* in_uncompressed, + size_t const in_uncompressed_len, + BYTE*& out_compressed, + size_t& out_compressed_len, + cudaStream_t stream) +{ + pimpl->encode(in_uncompressed, in_uncompressed_len, out_compressed, out_compressed_len, stream); +} + +TEMPLATE_TYPE +void HUFFMAN_COARSE::decode(BYTE* in_compressed, T* out_decompressed, cudaStream_t stream, bool header_on_device) +{ + pimpl->decode(in_compressed, out_decompressed, stream, header_on_device); +} + +TEMPLATE_TYPE +void HUFFMAN_COARSE::clear_buffer() { pimpl->clear_buffer(); } + +TEMPLATE_TYPE +float HUFFMAN_COARSE::get_time_elapsed() const { return pimpl->get_time_elapsed(); } + +TEMPLATE_TYPE +float HUFFMAN_COARSE::get_time_book() const { return pimpl->get_time_book(); } +TEMPLATE_TYPE +float HUFFMAN_COARSE::get_time_lossless() const { return pimpl->get_time_lossless(); } + +#undef TEMPLATE_TYPE +#undef HUFFMAN_COARSE + +} // namespace cusz + +#define HUFFCOARSE_CC(E, ETF, H, M) \ + template class cusz::LosslessCodec::type, HuffTrait::type, MetadataTrait::type>; + +HUFFCOARSE_CC(1, false, 4, 4) // uint +HUFFCOARSE_CC(1, false, 8, 4) // +HUFFCOARSE_CC(2, false, 4, 4) // +HUFFCOARSE_CC(2, false, 8, 4) // +HUFFCOARSE_CC(4, false, 4, 4) // +HUFFCOARSE_CC(4, false, 8, 4) // + +HUFFCOARSE_CC(4, true, 4, 4) // float +HUFFCOARSE_CC(4, true, 8, 4) // + +#undef HUFFCOARSE_CC diff --git a/qtensor/compression/cusz/src/hf/hf_bookg.cu b/qtensor/compression/cusz/src/hf/hf_bookg.cu new file mode 100644 index 00000000..fc6d3ac9 --- /dev/null +++ b/qtensor/compression/cusz/src/hf/hf_bookg.cu @@ -0,0 +1,33 @@ +/** + * @file hf_bookg.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "detail/hf_bookg.inl" +#include "hf/hf_bookg.hh" + +#define PAR_BOOK(T, H) \ + template void asz::hf_buildbook_g(uint32_t*, int const, H*, uint8_t*, int const, float*, cudaStream_t); + +PAR_BOOK(uint8_t, uint32_t); +PAR_BOOK(uint16_t, uint32_t); +PAR_BOOK(uint32_t, uint32_t); +PAR_BOOK(float, uint32_t); + +PAR_BOOK(uint8_t, uint64_t); +PAR_BOOK(uint16_t, uint64_t); +PAR_BOOK(uint32_t, uint64_t); +PAR_BOOK(float, uint64_t); + +PAR_BOOK(uint8_t, unsigned long long); +PAR_BOOK(uint16_t, unsigned long long); +PAR_BOOK(uint32_t, unsigned long long); +PAR_BOOK(float, unsigned long long); + +#undef PAR_BOOK diff --git a/qtensor/compression/cusz/src/hf/hf_codecg.cu b/qtensor/compression/cusz/src/hf/hf_codecg.cu new file mode 100644 index 00000000..9b7d9f0b --- /dev/null +++ b/qtensor/compression/cusz/src/hf/hf_codecg.cu @@ -0,0 +1,269 @@ +/** + * @file hf_codecg.cu + * @author Jiannan Tian + * @brief kernel wrappers; launching Huffman kernels + * @version 0.3 + * @date 2022-11-02 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include +#include +#include "detail/hf_codecg.inl" +#include "hf/hf_bookg.hh" +#include "hf/hf_codecg.hh" + +template +void asz::hf_encode_coarse( + T* uncompressed, + H* d_internal_coded, + size_t const len, + uint32_t* d_freq, + H* d_book, + int const booklen, + H* d_bitstream, + M* d_par_metadata, + M* h_par_metadata, + int const sublen, + int const pardeg, + int numSMs, + uint8_t*& 
out_compressed, + size_t& out_compressed_len, + float& time_lossless, + cudaStream_t stream) +{ + auto d_par_nbit = d_par_metadata; + auto d_par_ncell = d_par_metadata + pardeg; + auto d_par_entry = d_par_metadata + pardeg * 2; + + auto h_par_nbit = h_par_metadata; + auto h_par_ncell = h_par_metadata + pardeg; + auto h_par_entry = h_par_metadata + pardeg * 2; + + CREATE_CUDAEVENT_PAIR; + + /* phase 1 */ + { + auto block_dim = HuffmanHelper::BLOCK_DIM_ENCODE; + auto grid_dim = ConfigHelper::get_npart(len, block_dim); + + START_CUDAEVENT_RECORDING(stream); + + asz::detail::hf_encode_phase1_fill // + <<<8 * numSMs, 256, sizeof(H) * booklen, stream>>> // + (uncompressed, len, d_book, booklen, d_internal_coded); + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + float stage_time; + TIME_ELAPSED_CUDAEVENT(&stage_time); + time_lossless += stage_time; + } + + /* phase 2 */ + { + auto block_dim = HuffmanHelper::BLOCK_DIM_DEFLATE; + auto grid_dim = ConfigHelper::get_npart(pardeg, block_dim); + + START_CUDAEVENT_RECORDING(stream); + + asz::detail::hf_encode_phase2_deflate // + <<>> // + (d_internal_coded, len, d_par_nbit, d_par_ncell, sublen, pardeg); + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + float stage_time; + TIME_ELAPSED_CUDAEVENT(&stage_time); + time_lossless += stage_time; + } + + /* phase 3 */ + { + CHECK_CUDA(cudaMemcpyAsync(h_par_nbit, d_par_nbit, pardeg * sizeof(M), cudaMemcpyDeviceToHost, stream)); + CHECK_CUDA(cudaMemcpyAsync(h_par_ncell, d_par_ncell, pardeg * sizeof(M), cudaMemcpyDeviceToHost, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + memcpy(h_par_entry + 1, h_par_ncell, (pardeg - 1) * sizeof(M)); + for (auto i = 1; i < pardeg; i++) h_par_entry[i] += h_par_entry[i - 1]; // inclusive scan + + CHECK_CUDA(cudaMemcpyAsync(d_par_entry, h_par_entry, pardeg * sizeof(M), cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + } + + /* phase 4 */ + { + START_CUDAEVENT_RECORDING(stream); + + asz::detail::hf_encode_phase4_concatenate<<>> // + (d_internal_coded, d_par_entry, d_par_ncell, sublen, d_bitstream); + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + float stage_time; + TIME_ELAPSED_CUDAEVENT(&stage_time); + time_lossless += stage_time; + } + + DESTROY_CUDAEVENT_PAIR; +} + +template +void asz::hf_encode_coarse_rev1( + T* uncompressed, + size_t const len, + hf_book* book_desc, + hf_bitstream* bitstream_desc, + uint8_t*& out_compressed, // 22-10-12 buggy + size_t& out_compressed_len, // 22-10-12 buggy + float& time_lossless, + cudaStream_t stream) +{ + CREATE_CUDAEVENT_PAIR; + + H* d_buffer = (H*)bitstream_desc->buffer; + H* d_bitstream = (H*)bitstream_desc->bitstream; + H* d_book = (H*)book_desc->book; + int const booklen = book_desc->booklen; + int const sublen = bitstream_desc->sublen; + int const pardeg = bitstream_desc->pardeg; + int const numSMs = bitstream_desc->numSMs; + // uint32_t* d_freq = book_desc->freq; + + auto d_par_nbit = (M*)bitstream_desc->d_metadata->bits; + auto d_par_ncell = (M*)bitstream_desc->d_metadata->cells; + auto d_par_entry = (M*)bitstream_desc->d_metadata->entries; + + auto h_par_nbit = (M*)bitstream_desc->h_metadata->bits; + auto h_par_ncell = (M*)bitstream_desc->h_metadata->cells; + auto h_par_entry = (M*)bitstream_desc->h_metadata->entries; + + /* phase 1 */ + { + auto block_dim = HuffmanHelper::BLOCK_DIM_ENCODE; + auto grid_dim = ConfigHelper::get_npart(len, block_dim); + + 
START_CUDAEVENT_RECORDING(stream); + + asz::detail::hf_encode_phase1_fill // + <<<8 * numSMs, 256, sizeof(H) * booklen, stream>>> // + (uncompressed, len, d_book, booklen, d_buffer); + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + float stage_time; + TIME_ELAPSED_CUDAEVENT(&stage_time); + time_lossless += stage_time; + } + + /* phase 2 */ + { + auto block_dim = HuffmanHelper::BLOCK_DIM_DEFLATE; + auto grid_dim = ConfigHelper::get_npart(pardeg, block_dim); + + START_CUDAEVENT_RECORDING(stream); + + asz::detail::hf_encode_phase2_deflate // + <<>> // + (d_buffer, len, d_par_nbit, d_par_ncell, sublen, pardeg); + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + float stage_time; + TIME_ELAPSED_CUDAEVENT(&stage_time); + time_lossless += stage_time; + } + + /* phase 3 */ + { + CHECK_CUDA(cudaMemcpyAsync(h_par_nbit, d_par_nbit, pardeg * sizeof(M), cudaMemcpyDeviceToHost, stream)); + CHECK_CUDA(cudaMemcpyAsync(h_par_ncell, d_par_ncell, pardeg * sizeof(M), cudaMemcpyDeviceToHost, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + memcpy(h_par_entry + 1, h_par_ncell, (pardeg - 1) * sizeof(M)); + for (auto i = 1; i < pardeg; i++) h_par_entry[i] += h_par_entry[i - 1]; // inclusive scan + + CHECK_CUDA(cudaMemcpyAsync(d_par_entry, h_par_entry, pardeg * sizeof(M), cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + } + + /* phase 4 */ + { + START_CUDAEVENT_RECORDING(stream); + + asz::detail::hf_encode_phase4_concatenate<<>> // + (d_buffer, d_par_entry, d_par_ncell, sublen, d_bitstream); + + STOP_CUDAEVENT_RECORDING(stream); + + CHECK_CUDA(cudaStreamSynchronize(stream)); + + float stage_time; + TIME_ELAPSED_CUDAEVENT(&stage_time); + time_lossless += stage_time; + } +} + +template +void asz::hf_decode_coarse( + H* d_bitstream, + uint8_t* d_revbook, + int const revbook_nbyte, + M* d_par_nbit, + M* d_par_entry, + int const sublen, + int const pardeg, + T* out_decompressed, + float& time_lossless, + cudaStream_t stream) +{ + auto const block_dim = HuffmanHelper::BLOCK_DIM_DEFLATE; // = deflating + auto const grid_dim = ConfigHelper::get_npart(pardeg, block_dim); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream) + + hf_decode_kernel // + <<>> // + (d_bitstream, d_revbook, d_par_nbit, d_par_entry, revbook_nbyte, sublen, pardeg, out_decompressed); + + STOP_CUDAEVENT_RECORDING(stream) + cudaStreamSynchronize(stream); + + TIME_ELAPSED_CUDAEVENT(&time_lossless); + DESTROY_CUDAEVENT_PAIR; +} + +#define HF_CODEC_INIT(T, H, M) \ + template void asz::hf_encode_coarse( \ + T*, H*, size_t const, uint32_t*, H*, int const, H*, M*, M*, int const, int const, int, uint8_t*&, size_t&, \ + float&, cudaStream_t); \ + \ + template void asz::hf_encode_coarse_rev1( \ + T*, size_t const, hf_book*, hf_bitstream*, uint8_t*&, size_t&, float&, cudaStream_t); \ + \ + template void asz::hf_decode_coarse( \ + H*, uint8_t*, int const, M*, M*, int const, int const, T*, float&, cudaStream_t); + +HF_CODEC_INIT(uint8_t, uint32_t, uint32_t); +HF_CODEC_INIT(uint16_t, uint32_t, uint32_t); +HF_CODEC_INIT(uint32_t, uint32_t, uint32_t); +HF_CODEC_INIT(float, uint32_t, uint32_t); +HF_CODEC_INIT(uint8_t, uint64_t, uint32_t); +HF_CODEC_INIT(uint16_t, uint64_t, uint32_t); +HF_CODEC_INIT(uint32_t, uint64_t, uint32_t); +HF_CODEC_INIT(float, uint64_t, uint32_t); +HF_CODEC_INIT(uint8_t, unsigned long long, uint32_t); +HF_CODEC_INIT(uint16_t, unsigned long long, uint32_t); +HF_CODEC_INIT(uint32_t, unsigned long long, uint32_t); 
+HF_CODEC_INIT(float, unsigned long long, uint32_t); + +#undef HFBOOK_INIT +#undef HF_CODEC_INIT diff --git a/qtensor/compression/cusz/src/hf/hf_pimpl.cu b/qtensor/compression/cusz/src/hf/hf_pimpl.cu new file mode 100644 index 00000000..595ccea4 --- /dev/null +++ b/qtensor/compression/cusz/src/hf/hf_pimpl.cu @@ -0,0 +1,31 @@ +/** + * @file huffman_coarse.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2021-12-17 + * (created) 2020-04-24 (rev1) 2021-09-05 (rev2) 2021-12-29 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * @copyright (C) 2021 by Washington State University, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include "detail/hf_pimpl.inl" +#include "hf/hf.hh" + +#define HUFFCOARSE(E, ETF, H, M) \ + template class cusz::LosslessCodec::type, HuffTrait::type, MetadataTrait::type>::impl; + +HUFFCOARSE(1, false, 4, 4) // uint +HUFFCOARSE(1, false, 8, 4) // +HUFFCOARSE(2, false, 4, 4) // +HUFFCOARSE(2, false, 8, 4) // +HUFFCOARSE(4, false, 4, 4) // +HUFFCOARSE(4, false, 8, 4) // + +HUFFCOARSE(4, true, 4, 4) // float +HUFFCOARSE(4, true, 8, 4) // + +#undef HUFFCOARSE diff --git a/qtensor/compression/cusz/src/kernel/claunch_cuda.cu b/qtensor/compression/cusz/src/kernel/claunch_cuda.cu new file mode 100644 index 00000000..5433d7d8 --- /dev/null +++ b/qtensor/compression/cusz/src/kernel/claunch_cuda.cu @@ -0,0 +1,76 @@ +/** + * @file kernel_cuda.cc + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-07-24 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#include "detail/hist.inl" +#include "detail/spline3.inl" +// #include "hf/hf_codecg.hh" +// #include "hf/hf_struct.h" +#include "kernel/claunch_cuda.h" +#include "kernel/cpplaunch_cuda.hh" +#include "utils/cuda_err.cuh" + +#define C_SPLINE3(Tliteral, Eliteral, FPliteral, T, E, FP) \ + cusz_error_status claunch_construct_Spline3_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + bool NO_R_SEPARATE, T* data, dim3 const len3, T* anchor, dim3 const an_len3, E* errctrl, dim3 const ec_len3, \ + double const eb, int const radius, float* time_elapsed, cudaStream_t stream) \ + { \ + if (NO_R_SEPARATE) \ + launch_construct_Spline3( \ + data, len3, anchor, an_len3, errctrl, ec_len3, eb, radius, *time_elapsed, stream); \ + else \ + launch_construct_Spline3( \ + data, len3, anchor, an_len3, errctrl, ec_len3, eb, radius, *time_elapsed, stream); \ + return CUSZ_SUCCESS; \ + } \ + cusz_error_status claunch_reconstruct_Spline3_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + T* xdata, dim3 const len3, T* anchor, dim3 const an_len3, E* errctrl, dim3 const ec_len3, double const eb, \ + int const radius, float* time_elapsed, cudaStream_t stream) \ + { \ + launch_reconstruct_Spline3( \ + xdata, len3, anchor, an_len3, errctrl, ec_len3, eb, radius, *time_elapsed, stream); \ + return CUSZ_SUCCESS; \ + } + +C_SPLINE3(fp32, ui8, fp32, float, uint8_t, float); +C_SPLINE3(fp32, ui16, fp32, float, uint16_t, float); +C_SPLINE3(fp32, ui32, fp32, float, uint32_t, float); +C_SPLINE3(fp32, fp32, fp32, float, float, float); + +#undef C_SPLINE3 + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#define CPP_SPLINE3(Tliteral, Eliteral, FPliteral, T, E, FP) \ + template <> \ + cusz_error_status 
cusz::cpplaunch_construct_Spline3( \ + bool NO_R_SEPARATE, T* data, dim3 const len3, T* anchor, dim3 const an_len3, E* eq, dim3 const ec_len3, \ + double const eb, int const radius, float* time_elapsed, cudaStream_t stream) \ + { \ + return claunch_construct_Spline3_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + NO_R_SEPARATE, data, len3, anchor, an_len3, eq, ec_len3, eb, radius, time_elapsed, stream); \ + } \ + \ + template <> \ + cusz_error_status cusz::cpplaunch_reconstruct_Spline3( \ + T * xdata, dim3 const len3, T* anchor, dim3 const an_len3, E* eq, dim3 const ec_len3, double const eb, \ + int const radius, float* time_elapsed, cudaStream_t stream) \ + { \ + return claunch_reconstruct_Spline3_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + xdata, len3, anchor, an_len3, eq, ec_len3, eb, radius, time_elapsed, stream); \ + } + +CPP_SPLINE3(fp32, ui8, fp32, float, uint8_t, float); +CPP_SPLINE3(fp32, ui16, fp32, float, uint16_t, float); +CPP_SPLINE3(fp32, ui32, fp32, float, uint32_t, float); +CPP_SPLINE3(fp32, fp32, fp32, float, float, float); + +#undef CPP_SPLINE3 diff --git a/qtensor/compression/cusz/src/kernel/detail/hist.inl b/qtensor/compression/cusz/src/kernel/detail/hist.inl new file mode 100644 index 00000000..a3781eb6 --- /dev/null +++ b/qtensor/compression/cusz/src/kernel/detail/hist.inl @@ -0,0 +1,100 @@ +/** + * @file hist.inl + * @author Cody Rivera (cjrivera1@crimson.ua.edu), Megan Hickman Fulp (mlhickm@g.clemson.edu) + * @brief Fast histogramming from [Gómez-Luna et al. 2013] + * @version 0.1 + * @date 2020-09-20 + * Created on 2020-02-16 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef CUSZ_KERNEL_HIST_CUH +#define CUSZ_KERNEL_HIST_CUH + +#include +#include +#include + +#include "common.hh" +#include "utils/timer.h" + +#define MIN(a, b) ((a) < (b)) ? (a) : (b) +const static unsigned int WARP_SIZE = 32; + +#define tix threadIdx.x +#define tiy threadIdx.y +#define tiz threadIdx.z +#define bix blockIdx.x +#define biy blockIdx.y +#define biz blockIdx.z +#define bdx blockDim.x +#define bdy blockDim.y +#define bdz blockDim.z + +namespace kernel { + +template +__global__ void NaiveHistogram(Input in_data[], int out_freq[], int N, int symbols_per_thread); + +/* Copied from J. 
Gomez-Luna et al */ +template +__global__ void p2013Histogram(T*, FREQ*, size_t, int, int); + +} // namespace kernel + +template +__global__ void kernel::NaiveHistogram(T in_data[], int out_freq[], int N, int symbols_per_thread) +{ + unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; + unsigned int j; + if (i * symbols_per_thread < N) { // if there is a symbol to count, + for (j = i * symbols_per_thread; j < (i + 1) * symbols_per_thread; j++) { + if (j < N) { + unsigned int item = in_data[j]; // Symbol to count + atomicAdd(&out_freq[item], 1); // update bin count by 1 + } + } + } +} + +template +__global__ void kernel::p2013Histogram(T* in_data, FREQ* out_freq, size_t N, int nbin, int R) +{ + // static_assert( + // std::numeric_limits::is_integer and (not std::numeric_limits::is_signed), + // "T must be `unsigned integer` type of {1,2,4} bytes"); + + extern __shared__ int Hs[/*(nbin + 1) * R*/]; + + const unsigned int warp_id = (int)(tix / WARP_SIZE); + const unsigned int lane = tix % WARP_SIZE; + const unsigned int warps_block = bdx / WARP_SIZE; + const unsigned int off_rep = (nbin + 1) * (tix % R); + const unsigned int begin = (N / warps_block) * warp_id + WARP_SIZE * blockIdx.x + lane; + unsigned int end = (N / warps_block) * (warp_id + 1); + const unsigned int step = WARP_SIZE * gridDim.x; + + // final warp handles data outside of the warps_block partitions + if (warp_id >= warps_block - 1) end = N; + + for (unsigned int pos = tix; pos < (nbin + 1) * R; pos += bdx) Hs[pos] = 0; + __syncthreads(); + + for (unsigned int i = begin; i < end; i += step) { + int d = in_data[i]; + d = d <= 0 and d >= nbin ? nbin / 2 : d; + atomicAdd(&Hs[off_rep + d], 1); + } + __syncthreads(); + + for (unsigned int pos = tix; pos < nbin; pos += bdx) { + int sum = 0; + for (int base = 0; base < (nbin + 1) * R; base += nbin + 1) { sum += Hs[base + pos]; } + atomicAdd(out_freq + pos, sum); + } +} + +#endif diff --git a/qtensor/compression/cusz/src/kernel/detail/lorenzo.inl b/qtensor/compression/cusz/src/kernel/detail/lorenzo.inl new file mode 100644 index 00000000..0e1f9acd --- /dev/null +++ b/qtensor/compression/cusz/src/kernel/detail/lorenzo.inl @@ -0,0 +1,816 @@ +/** + * @file lorenzo.inl + * @author Jiannan Tian + * @brief Dual-ErrCtrl Lorenzo method. 
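+ *
+ * In short: values are first pre-quantized as round(data * ebx2_r), each point is
+ * predicted from its already-processed neighbors (the 1D/2D/3D stencils below), and
+ * the prediction error `delta` is split by `radius`: when fabs(delta) < radius the
+ * biased code (delta + radius) goes into the quant array, otherwise that biased value
+ * is written to the outlier array instead (e.g. with radius = 512, delta = 3 gives
+ * quant-code 515, while delta = 700 is routed to the outlier array).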
+ * @version 0.2 + * @date 2021-01-16 + * (create) 2019-09-23; (release) 2020-09-20; (rev1) 2021-01-16; (rev2) 2021-02-20; (rev3) 2021-04-11 + * (rev4) 2021-04-30 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef CUSZ_KERNEL_LORENZO_CUH +#define CUSZ_KERNEL_LORENZO_CUH + +#include +// #include "utils/cuda_err.cuh" +// #include "utils/timer.h" + +#if __has_include() +// #pragma message __FILE__ ": (CUDA 11 onward), cub from system path" +#include +#else +// #pragma message __FILE__ ": (CUDA 10 or earlier), cub from git submodule" +#include "../../third_party/cub/cub/cub.cuh" +#endif + +#if __cplusplus >= 201703L +#define CONSTEXPR constexpr +#else +#define CONSTEXPR +#endif + +#define TIX threadIdx.x +#define TIY threadIdx.y +#define TIZ threadIdx.z +#define BIX blockIdx.x +#define BIY blockIdx.y +#define BIZ blockIdx.z +#define BDX blockDim.x +#define BDY blockDim.y +#define BDZ blockDim.z + +using DIM = unsigned int; +using STRIDE = unsigned int; + +namespace cusz { + +/** + * @brief compress-time 1D Lorenzo pred-quant kernel + * + * @tparam Data type of input data + * @tparam ErrCtrl type of error-control code + * @tparam FP type for internal floating-point processing + * @tparam BLOCK block size + * @tparam SEQ degree of sequentiality + * @param data input + * @param errctrl output 1 + * @param outlier output 2 + * @param len3 data length in 3D + * @param stride3 data stride in 3D + * @param radius quant-code radius + * @param ebx2_r precalculated reciprocal of eb*2 + */ +template +__global__ void +c_lorenzo_1d1l(Data* data, ErrCtrl* errctrl, Data* outlier, dim3 len3, dim3 stride3, int radius, FP ebx2_r); + +/** + * @brief compress-time 2D Lorenzo pred-quant kernel + * + * @tparam Data type of input data + * @tparam ErrCtrl type of error-control code + * @tparam FP type for internal floating-point processing + * @tparam BLOCK block size + * @tparam SEQ degree of sequentiality + * @param data input + * @param errctrl output 1 + * @param outlier output 2 + * @param len3 data length in 3D + * @param stride3 data stride in 3D + * @param radius quant-code radius + * @param ebx2_r precalculated reciprocal of eb*2 + */ +template +__global__ void c_lorenzo_2d1l_16x16data_mapto16x2( + Data* data, + ErrCtrl* errctrl, + Data* outlier, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r); + +/** + * @brief compress-time 3D Lorenzo pred-quant kernel + * + * @tparam Data type of input data + * @tparam ErrCtrl type of error-control code + * @tparam FP type for internal floating-point processing + * @tparam BLOCK block size + * @tparam SEQ degree of sequentiality + * @param data input + * @param errctrl output 1 + * @param outlier output 2 + * @param len3 data length in 3D + * @param stride3 data stride in 3D + * @param radius quant-code radius + * @param ebx2_r precalculated reciprocal of eb*2 + */ +template +__global__ void c_lorenzo_3d1l_32x8x8data_mapto32x1x8( + Data* data, + ErrCtrl* errctrl, + Data* outlier, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r); + +/** + * @brief decompress-time 1D Lorenzo pred-quant kernel + * + * @tparam Data type of input data + * @tparam ErrCtrl type of error-control code + * @tparam FP type for internal floating-point processing + * @tparam BLOCK block size + * @tparam SEQ degree of sequentiality + * @param outlier input 1 + * @param quant input 2 + * @param xdata output + * @param len3 data length in 3D + * @param stride3 data 
stride in 3D + * @param radius quant-code radius + * @param ebx2 precalculated eb*2 + */ +template < + typename Data, + typename ErrCtrl, + typename FP = float, + int BLOCK = 256, + int SEQ = 8> +__global__ void x_lorenzo_1d1l( + Data* outlier, // + ErrCtrl* quant, + Data* xdata, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2); + +/** + * @brief decompress-time 2D Lorenzo pred-quant kernel + * + * @tparam Data type of input data + * @tparam ErrCtrl type of error-control code + * @tparam FP type for internal floating-point processing + * @tparam BLOCK block size + * @tparam SEQ degree of sequentiality + * @param outlier input 1 + * @param quant input 2 + * @param xdata output + * @param len3 data length in 3D + * @param stride3 data stride in 3D + * @param radius quant-code radius + * @param ebx2 precalculated eb*2 + */ +template +__global__ void x_lorenzo_2d1l_16x16data_mapto16x2( + Data* outlier, + ErrCtrl* quant, + Data* xdata, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2); + +/** + * @brief decompress-time 3D Lorenzo pred-quant kernel + * + * @tparam Data type of input data + * @tparam ErrCtrl type of error-control code + * @tparam FP type for internal floating-point processing + * @tparam BLOCK block size + * @tparam SEQ degree of sequentiality + * @param outlier input 1 + * @param quant input 2 + * @param xdata output + * @param len3 data length in 3D + * @param stride3 data stride in 3D + * @param radius quant-code radius + * @param ebx2 precalculated eb*2 + */ +template +__global__ void x_lorenzo_3d1l_32x8x8data_mapto32x1x8( + Data* outlier, + ErrCtrl* quant, + Data* xdata, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2); + +/** + * @brief decompress-time 3D Lorenzo pred-quant kernel (variant) + * + * @tparam Data type of input data + * @tparam ErrCtrl type of error-control code + * @tparam FP type for internal floating-point processing + * @tparam BLOCK block size + * @tparam SEQ degree of sequentiality + * @param outlier input 1 + * @param quant input 2 + * @param xdata output + * @param len3 data length in 3D + * @param stride3 data stride in 3D + * @param radius quant-code radius + * @param ebx2 precalculated eb*2 + */ +template +__global__ void x_lorenzo_3d1lvar_32x8x8data_mapto32x1x8( + Data* outlier, + ErrCtrl* quant, + Data* xdata, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2); + +} // namespace cusz + +namespace { + +/** + * @brief (Original SZ/cuSZ design) 1D: separate delta by radius in to quant-code and outlier + */ +template +__forceinline__ __device__ void pred1d_radius_separate( + Data thread_scope[SEQ], + volatile Data* shmem_data, + volatile ErrCtrl* shmem_quant, + int radius, + Data from_last_stripe = 0) +{ + if CONSTEXPR (FIRST_POINT) { // i == 0 + Data delta = thread_scope[0] - from_last_stripe; + bool quantizable = fabs(delta) < radius; + Data candidate = delta + radius; + shmem_data[0 + TIX * SEQ] = (1 - quantizable) * candidate; // output; reuse data for outlier + shmem_quant[0 + TIX * SEQ] = quantizable * static_cast(candidate); + } + else { +#pragma unroll + for (auto i = 1; i < SEQ; i++) { + Data delta = thread_scope[i] - thread_scope[i - 1]; + bool quantizable = fabs(delta) < radius; + Data candidate = delta + radius; + shmem_data[i + TIX * SEQ] = (1 - quantizable) * candidate; // output; reuse data for outlier + shmem_quant[i + TIX * SEQ] = quantizable * static_cast(candidate); + } + __syncthreads(); + } +} + +template +__forceinline__ __device__ void load1d( + Data* data, + unsigned int dimx, + unsigned int id_base, + volatile 
Data* shmem_data, + Data thread_scope[SEQ], + Data& from_last_stripe, + FP ebx2_r) +{ +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto id = id_base + TIX + i * NTHREAD; + if (id < dimx) { shmem_data[TIX + i * NTHREAD] = round(data[id] * ebx2_r); } + } + __syncthreads(); + + for (auto i = 0; i < SEQ; i++) thread_scope[i] = shmem_data[TIX * SEQ + i]; + + if (TIX > 0) from_last_stripe = shmem_data[TIX * SEQ - 1]; + __syncthreads(); +} + +template +__forceinline__ __device__ void write1d( + volatile Data* shmem_data, + Data* data, + unsigned int dimx, + unsigned int id_base, + volatile ErrCtrl* shmem_quant = nullptr, + ErrCtrl* quant = nullptr) +{ +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto id = id_base + TIX + i * NTHREAD; + if (id < dimx) { + if CONSTEXPR (NO_R_SEPARATE) { // TODO no-radius-separate uses shmem_data + quant[id] = shmem_data[TIX + i * NTHREAD]; + } + else { + data[id] = shmem_data[TIX + i * NTHREAD]; + quant[id] = shmem_quant[TIX + i * NTHREAD]; + } + } + } +} + +template +__forceinline__ __device__ void load2d_prequant( + Data* data, + Data center[YSEQ + 1], + unsigned int dimx, + unsigned int dimy, + unsigned int stridey, + unsigned int gix, + unsigned int giy_base, + FP ebx2_r) +{ + auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; + +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + if (gix < dimx and giy_base + i < dimy) center[i + 1] = round(data[get_gid(i)] * ebx2_r); + } + auto tmp = __shfl_up_sync(0xffffffff, center[YSEQ], 16); // same-warp, next-16 + if (TIY == 1) center[0] = tmp; +} + +template +__forceinline__ __device__ void pred2d(Data center[YSEQ + 1]) +{ + /* prediction + original form: Data delta = center[i] - center[i - 1] + west[i] - west[i - 1]; + short form: Data delta = center[i] - west[i]; + */ +#pragma unroll + for (auto i = YSEQ; i > 0; i--) { + center[i] -= center[i - 1]; + auto west = __shfl_up_sync(0xffffffff, center[i], 1, 16); + if (TIX > 0) center[i] -= west; + } + __syncthreads(); +} + +template +__forceinline__ __device__ void postquant_write2d( + Data center[YSEQ + 1], + ErrCtrl* quant, + Data* outlier, + unsigned int dimx, + unsigned int dimy, + unsigned int stridey, + int radius, + unsigned int gix, + unsigned int giy_base) +{ + auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; + +#pragma unroll + for (auto i = 1; i < YSEQ + 1; i++) { + auto gid = get_gid(i - 1); + + if (gix < dimx and giy_base + i - 1 < dimy) { + bool quantizable = fabs(center[i]) < radius; + Data candidate = center[i] + radius; + outlier[gid] = (1 - quantizable) * candidate; // output; reuse data for outlier + quant[gid] = quantizable * static_cast(candidate); + } + } +} + +} // namespace + +template < + typename Data, + typename ErrCtrl, + typename FP, + int BLOCK, + int SEQ> +__global__ void cusz::c_lorenzo_1d1l( // + Data* data, + ErrCtrl* quant, + Data* outlier, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r) +{ + constexpr auto NTHREAD = BLOCK / SEQ; + + __shared__ struct { + union { + uint8_t uninitialized[BLOCK * sizeof(Data) + BLOCK * sizeof(ErrCtrl)]; + Data data[BLOCK]; + } space; + } shmem; + + auto id_base = BIX * BLOCK; + + Data thread_scope[SEQ]; + Data from_last_stripe{0}; + + /******************************************************************************** + * load from DRAM using striped layout, perform prequant + ********************************************************************************/ + load1d(data, len3.x, id_base, shmem.space.data, thread_scope, from_last_stripe, 
ebx2_r); + + // the original SZ/cuSZ design + auto shmem_quant = reinterpret_cast(shmem.space.uninitialized + sizeof(Data) * BLOCK); + pred1d_radius_separate( + thread_scope, shmem.space.data, shmem_quant, radius, from_last_stripe); + pred1d_radius_separate(thread_scope, shmem.space.data, shmem_quant, radius); + write1d(shmem.space.data, outlier, len3.x, id_base, shmem_quant, quant); +} + +template +__global__ void cusz::c_lorenzo_2d1l_16x16data_mapto16x2( + Data* data, + ErrCtrl* quant, + Data* outlier, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r) +{ + constexpr auto BLOCK = 16; + constexpr auto YSEQ = 8; + + Data center[YSEQ + 1] = {0}; // nw n + // w center + + auto gix = BIX * BDX + TIX; // BDX == 16 + auto giy_base = BIY * BLOCK + TIY * YSEQ; // BDY * YSEQ = BLOCK == 16 + + load2d_prequant(data, center, len3.x, len3.y, stride3.y, gix, giy_base, ebx2_r); + pred2d(center); + postquant_write2d(center, quant, outlier, len3.x, len3.y, stride3.y, radius, gix, giy_base); +} + +template +__global__ void cusz::c_lorenzo_3d1l_32x8x8data_mapto32x1x8( + Data* data, + ErrCtrl* quant, + Data* outlier, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r) +{ + constexpr auto BLOCK = 8; + __shared__ Data shmem[8][8][32]; + + auto z = TIZ; + + auto gix = BIX * (BLOCK * 4) + TIX; + auto giy_base = BIY * BLOCK; + auto giz = BIZ * BLOCK + z; + auto base_id = gix + giy_base * stride3.y + giz * stride3.z; + + /******************************************************************************** + * load from DRAM, perform prequant + ********************************************************************************/ + if (gix < len3.x and giz < len3.z) { + for (auto y = 0; y < BLOCK; y++) { + if (giy_base + y < len3.y) { + shmem[z][y][TIX] = round(data[base_id + y * stride3.y] * ebx2_r); // prequant (fp presence) + } + } + } + __syncthreads(); // necessary to ensure correctness + + auto x = TIX % 8; + + for (auto y = 0; y < BLOCK; y++) { + Data delta; + + /******************************************************************************** + * prediction + ********************************************************************************/ + delta = shmem[z][y][TIX] - ((z > 0 and y > 0 and x > 0 ? shmem[z - 1][y - 1][TIX - 1] : 0) // dist=3 + - (y > 0 and x > 0 ? shmem[z][y - 1][TIX - 1] : 0) // dist=2 + - (z > 0 and x > 0 ? shmem[z - 1][y][TIX - 1] : 0) // + - (z > 0 and y > 0 ? shmem[z - 1][y - 1][TIX] : 0) // + + (x > 0 ? shmem[z][y][TIX - 1] : 0) // dist=1 + + (y > 0 ? shmem[z][y - 1][TIX] : 0) // + + (z > 0 ? 
shmem[z - 1][y][TIX] : 0)); // + + auto id = base_id + (y * stride3.y); + + bool quantizable = fabs(delta) < radius; + Data candidate = delta + radius; + if (gix < len3.x and (giy_base + y) < len3.y and giz < len3.z) { + outlier[id] = (1 - quantizable) * candidate; // output; reuse data for outlier + quant[id] = quantizable * static_cast(candidate); + } + } + /* EOF */ +} + +template +__global__ void cusz::x_lorenzo_1d1l( // + Data* outlier, + ErrCtrl* quant, + Data* xdata, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2) +{ + constexpr auto block_dim = BLOCK / SEQ; // dividable + + // coalesce-load (warp-striped) and transpose in shmem (similar for store) + typedef cub::BlockLoad BlockLoadT_outlier; + typedef cub::BlockLoad BlockLoadT_quant; + typedef cub::BlockStore BlockStoreT_xdata; + typedef cub::BlockScan + BlockScanT_xdata; // TODO autoselect algorithm + + __shared__ union TempStorage { // overlap shared memory space + typename BlockLoadT_outlier::TempStorage load_outlier; + typename BlockLoadT_quant::TempStorage load_quant; + typename BlockStoreT_xdata::TempStorage store_xdata; + typename BlockScanT_xdata::TempStorage scan_xdata; + } temp_storage; + + // thread-scope tiled data + union ThreadData { + Data xdata[SEQ]; + Data outlier[SEQ]; + } thread_scope; + ErrCtrl thread_scope_quant[SEQ]; + + /******************************************************************************** + * load to thread-private array (fuse at the same time) + * (BIX * BDX * SEQ) denotes the start of the data chunk that belongs to this thread block + ********************************************************************************/ + BlockLoadT_quant(temp_storage.load_quant).Load(quant + (BIX * BDX) * SEQ, thread_scope_quant); + __syncthreads(); // barrier for shmem reuse + BlockLoadT_outlier(temp_storage.load_outlier).Load(outlier + (BIX * BDX) * SEQ, thread_scope.outlier); + __syncthreads(); // barrier for shmem reuse + +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto id = (BIX * BDX + TIX) * SEQ + i; + thread_scope.xdata[i] = + id < len3.x ? thread_scope.outlier[i] + static_cast(thread_scope_quant[i]) - radius : 0; + } + __syncthreads(); + + /******************************************************************************** + * perform partial-sum using cub::InclusiveSum + ********************************************************************************/ + BlockScanT_xdata(temp_storage.scan_xdata).InclusiveSum(thread_scope.xdata, thread_scope.xdata); + __syncthreads(); // barrier for shmem reuse + + /******************************************************************************** + * scale by ebx2 and write to DRAM + ********************************************************************************/ +#pragma unroll + for (auto i = 0; i < SEQ; i++) thread_scope.xdata[i] *= ebx2; + __syncthreads(); // barrier for shmem reuse + + BlockStoreT_xdata(temp_storage.store_xdata).Store(xdata + (BIX * BDX) * SEQ, thread_scope.xdata); +} + +template +__global__ void cusz::x_lorenzo_2d1l_16x16data_mapto16x2( + Data* outlier, + ErrCtrl* quant, + Data* xdata, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2) +{ + constexpr auto BLOCK = 16; + constexpr auto YSEQ = BLOCK / 2; // sequentiality in y direction + static_assert(BLOCK == 16, "In one case, we need BLOCK for 2D == 16"); + + __shared__ Data intermediate[BLOCK]; // TODO use warp shuffle to eliminate this + Data thread_scope[YSEQ]; + /* + . ------> gix (x) + | t00 t01 t02 t03 ... 
t0f + | ts00_0 ts00_0 ts00_0 ts00_0 + giy ts00_1 ts00_1 ts00_1 ts00_1 + (y) | | | | + ts00_7 ts00_7 ts00_7 ts00_7 + + | t10 t11 t12 t13 ... t1f + | ts00_0 ts00_0 ts00_0 ts00_0 + giy ts00_1 ts00_1 ts00_1 ts00_1 + (y) | | | | + ts00_7 ts00_7 ts00_7 ts00_7 + */ + + auto gix = BIX * BLOCK + TIX; + auto giy_base = BIY * BLOCK + TIY * YSEQ; // BDY * YSEQ = BLOCK == 16 + auto get_gid = [&](auto i) { return (giy_base + i) * stride3.y + gix; }; + + /******************************************************************************** + * load to thread-private array (fuse at the same time) + ********************************************************************************/ +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + auto gid = get_gid(i); + // even if we hit the else branch, all threads in a warp hit the y-boundary simultaneously + if (gix < len3.x and giy_base + i < len3.y) + thread_scope[i] = outlier[gid] + static_cast(quant[gid]) - radius; // fuse + else + thread_scope[i] = 0; // TODO set as init state? + } + + /******************************************************************************** + * partial-sum along y-axis, sequantially + ********************************************************************************/ + for (auto i = 1; i < YSEQ; i++) thread_scope[i] += thread_scope[i - 1]; + // two-pass: store for cross-threadscope update + if (TIY == 0) intermediate[TIX] = thread_scope[YSEQ - 1]; + __syncthreads(); + // two-pass: load and update + if (TIY == 1) { + auto tmp = intermediate[TIX]; +#pragma unroll + for (auto& i : thread_scope) i += tmp; + } + + /******************************************************************************** + * in-warp partial-sum along x-axis + ********************************************************************************/ +#pragma unroll + for (auto& i : thread_scope) { + for (auto d = 1; d < BLOCK; d *= 2) { + Data n = __shfl_up_sync(0xffffffff, i, d, 16); + if (TIX >= d) i += n; + } + i *= ebx2; + } + + /******************************************************************************** + * write to DRAM + ********************************************************************************/ +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + auto gid = get_gid(i); + if (gix < len3.x and giy_base + i < len3.y) xdata[gid] = thread_scope[i]; + } +} + +template +__global__ void cusz::x_lorenzo_3d1l_32x8x8data_mapto32x1x8( + Data* outlier, + ErrCtrl* quant, + Data* xdata, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2) +{ + constexpr auto BLOCK = 8; + constexpr auto YSEQ = BLOCK; + static_assert(BLOCK == 8, "In one case, we need BLOCK for 3D == 8"); + + __shared__ Data intermediate[BLOCK][4][8]; + Data thread_scope[YSEQ]; + + auto seg_id = TIX / 8; + auto seg_tix = TIX % 8; + + auto gix = BIX * (4 * BLOCK) + TIX, giy_base = BIY * BLOCK, giz = BIZ * BLOCK + TIZ; + auto get_gid = [&](auto y) { return giz * stride3.z + (giy_base + y) * stride3.y + gix; }; + + /******************************************************************************** + * load to thread-private array (fuse at the same time) + ********************************************************************************/ +#pragma unroll + for (auto y = 0; y < YSEQ; y++) { + auto gid = get_gid(y); + if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) + thread_scope[y] = outlier[gid] + static_cast(quant[gid]) - static_cast(radius); // fuse + else + thread_scope[y] = 0; + } + + /******************************************************************************** + * partial-sum along y-axis, 
sequantially + ********************************************************************************/ + for (auto y = 1; y < YSEQ; y++) thread_scope[y] += thread_scope[y - 1]; + + /******************************************************************************** + * ND partial-sums along x- and z-axis + * in-warp shuffle used: in order to perform, it's transposed after X-partial sum + ********************************************************************************/ + auto dist = 1; + Data addend; + +#pragma unroll + for (auto i = 0; i < BLOCK; i++) { + Data val = thread_scope[i]; + + for (dist = 1; dist < BLOCK; dist *= 2) { + addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + // x-z transpose + intermediate[TIZ][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][TIZ]; + __syncthreads(); + + for (dist = 1; dist < BLOCK; dist *= 2) { + addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + intermediate[TIZ][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][TIZ]; + __syncthreads(); + + thread_scope[i] = val; + } + + /******************************************************************************** + * write to DRAM + ********************************************************************************/ +#pragma unroll + for (auto y = 0; y < YSEQ; y++) { + if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) { xdata[get_gid(y)] = thread_scope[y] * ebx2; } + } + /* EOF */ +} + +/******************************************************************************** + * experimental prototype toward further optmization + ********************************************************************************/ +template +__global__ void cusz::x_lorenzo_3d1lvar_32x8x8data_mapto32x1x8( + Data* outlier, + ErrCtrl* quant, + Data* xdata, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2) +{ + constexpr auto BLOCK = 8; + constexpr auto YSEQ = BLOCK; + static_assert(BLOCK == 8, "In one case, we need BLOCK for 3D == 8"); + + __shared__ Data intermediate[BLOCK][4][8]; + Data thread_scope = 0; + + auto seg_id = TIX / 8; + auto seg_tix = TIX % 8; + + auto gix = BIX * (4 * BLOCK) + TIX, giy_base = BIY * BLOCK, giz = BIZ * BLOCK + TIZ; + auto get_gid = [&](auto y) { return giz * stride3.z + (giy_base + y) * stride3.y + gix; }; + + auto y = 0; + + // even if we hit the else branch, all threads in a warp hit the y-boundary simultaneously +#pragma unroll + for (y = 0; y < YSEQ; y++) { + auto gid = get_gid(y); + if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) + thread_scope += outlier[gid] + static_cast(quant[gid]) - static_cast(radius); // fuse + + Data val = thread_scope; + + // shuffle, ND partial-sums + for (auto dist = 1; dist < BLOCK; dist *= 2) { + Data addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + // x-z transpose + intermediate[TIZ][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][TIZ]; + __syncthreads(); + + for (auto dist = 1; dist < BLOCK; dist *= 2) { + Data addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + intermediate[TIZ][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][TIZ]; + __syncthreads(); + + // thread_scope += val; + + if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) { xdata[get_gid(y)] = val * ebx2; } + } +} + +#undef TIX +#undef TIY +#undef TIZ +#undef BIX +#undef BIY 
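+
+// Illustrative sketch (not part of cuSZ): the in-warp inclusive prefix sum that the
+// x_lorenzo_* kernels above build out of __shfl_up_sync, factored into a standalone
+// helper. The name and the WIDTH parameter are assumptions for illustration; the
+// kernels hard-code segment widths of 8 or 16.
+template <typename T, int WIDTH>
+__device__ __forceinline__ T example_segmented_inclusive_scan(T val)
+{
+    const int lane = threadIdx.x % WIDTH;  // position within the WIDTH-wide segment
+    for (int dist = 1; dist < WIDTH; dist *= 2) {
+        // fetch the partial sum from the lane `dist` positions below in the segment
+        T addend = __shfl_up_sync(0xffffffff, val, dist, WIDTH);
+        if (lane >= dist) val += addend;  // lanes below `dist` keep their value
+    }
+    return val;  // inclusive prefix sum over the segment
+}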
+#undef BIZ +#undef BDX +#undef BDY +#undef BDZ + +#endif diff --git a/qtensor/compression/cusz/src/kernel/detail/lorenzo23.inl b/qtensor/compression/cusz/src/kernel/detail/lorenzo23.inl new file mode 100644 index 00000000..764f44ec --- /dev/null +++ b/qtensor/compression/cusz/src/kernel/detail/lorenzo23.inl @@ -0,0 +1,1237 @@ +/** + * @file lorenzo23.inl + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2022-12-22 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "subroutine.inl" + +namespace subr = psz::cuda::__device; + +namespace psz { +namespace cuda { +namespace __kernel { + +//////////////////////////////////////////////////////////////////////////////// +// 1D + +namespace v0 { + +template +__global__ void c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, T* outlier); + +template +__global__ void x_lorenzo_1d1l(EQ* quant, T* outlier, dim3 len3, dim3 stride3, int radius, FP ebx2, T* xdata); + +namespace compaction { + +template > +__global__ void c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, Compaction outlier); + +} + +namespace delta_only { + +template +__global__ void c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* delta); + +template +__global__ void x_lorenzo_1d1l(EQ* delta, dim3 len3, dim3 stride3, FP ebx2, T* xdata); + +} // namespace delta_only + +} // namespace v0 + +namespace v1_pn { + +namespace compaction { + +template > +__global__ void c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, Compaction outlier); + +} // namespace compaction + +template +__global__ void x_lorenzo_1d1l(EQ* quant, T* outlier, dim3 len3, dim3 stride3, FP ebx2, T* xdata); + +namespace delta_only { + +template +__global__ void c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* delta); + +template +__global__ void x_lorenzo_1d1l(EQ* delta, dim3 len3, dim3 stride3, FP ebx2, T* xdata); + +} // namespace delta_only + +} // namespace v1_pn + +//////////////////////////////////////////////////////////////////////////////// +// 2D + +namespace v0 { + +template +__global__ void c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, T* outlier); + +template +__global__ void x_lorenzo_2d1l(EQ* quant, T* outlier, dim3 len3, dim3 stride3, int radius, FP ebx2, T* xdata); + +namespace delta_only { + +template +__global__ void c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* delta); + +template +__global__ void x_lorenzo_2d1l(EQ* delta, dim3 len3, dim3 stride3, FP ebx2, T* xdata); + +} // namespace delta_only + +namespace compaction { + +template > +__global__ void c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, Compaction outlier); + +} // namespace compaction + +} // namespace v0 + +namespace v1_pn { + +namespace compaction { + +template > +__global__ void c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, Compaction outlier); + +} // namespace compaction + +template +__global__ void x_lorenzo_2d1l(EQ* quant, T* outlier, dim3 len3, dim3 stride3, FP ebx2, T* xdata); + +namespace delta_only { + +template +__global__ void c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* delta); + +template +__global__ void x_lorenzo_2d1l(EQ* delta, dim3 len3, dim3 stride3, FP ebx2, T* xdata); + +} // namespace delta_only + +} // namespace v1_pn + +//////////////////////////////////////////////////////////////////////////////// 
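+
+// Note on the kernel families declared in this header, as inferred from the
+// definitions further below:
+//   v0              - radius-based split: quant-code (delta + radius) plus a dense outlier array
+//   v0::delta_only  - raw deltas only, no outlier handling
+//   v0::compaction  - outliers gathered into a sparse (idx, val) list via atomicAdd
+//   v1_pn           - quant codes pass through the PN::encode / PN::decode mapping
+//
+// Minimal sketch (assumed layout and names, not the actual cuSZ Compaction type) of
+// the record that the compaction kernels append outliers to:
+template <typename T>
+struct ExampleCompactionView {
+    unsigned int* count;  // device-side running counter
+    unsigned int* idx;    // flat indices of non-quantizable points
+    T*            val;    // their stored values
+
+    __device__ void append(unsigned int gid, T v)
+    {
+        auto cur = atomicAdd(count, 1u);  // reserve one slot
+        idx[cur] = gid;
+        val[cur] = v;
+    }
+};
+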
+// 3D + +namespace v0 { + +// TODO -> `legacy` +namespace legacy { +template +__global__ void c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, T* outlier); + +} + +template +__global__ void c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, T* outlier); + +template +__global__ void x_lorenzo_3d1l(EQ* quant, T* outlier, dim3 len3, dim3 stride3, int radius, FP ebx2, T* xdata); + +namespace delta_only { + +template +__global__ void c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* quant); + +template +__global__ void x_lorenzo_3d1l(EQ* quant, dim3 len3, dim3 stride3, FP ebx2, T* xdata); + +} // namespace delta_only + +namespace compaction { + +template > +__global__ void c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, Compaction outlier); + +} + +} // namespace v0 + +namespace v1_pn { + +namespace compaction { + +template > +__global__ void c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, Compaction outlier); + +} + +template +__global__ void x_lorenzo_3d1l(EQ* quant, T* outlier, dim3 len3, dim3 stride3, FP ebx2, T* xdata); + +namespace delta_only { + +template +__global__ void c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* quant); + +template +__global__ void x_lorenzo_3d1l(EQ* quant, dim3 len3, dim3 stride3, FP ebx2, T* xdata); + +} // namespace delta_only + +} // namespace v1_pn + +} // namespace __kernel +} // namespace cuda +} // namespace psz + +//////////////////////////////////////////////////////////////////////////////// +// 1D definition + +template +__global__ void +psz::cuda::__kernel::v0::c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, T* outlier) +{ + namespace subr_v0 = psz::cuda::__device::v0; + + constexpr auto NTHREAD = BLOCK / SEQ; + + __shared__ struct { + union { + T data[BLOCK]; + T outlier[BLOCK]; + }; + EQ quant[BLOCK]; + } s; + + T prev{0}; + T thp_data[SEQ]; + + auto id_base = blockIdx.x * BLOCK; + + subr_v0::load_prequant_1d(data, len3.x, id_base, s.data, thp_data, prev, ebx2_r); + subr_v0::predict_quantize_1d(thp_data, s.quant, s.outlier, radius, prev); + subr_v0::predict_quantize_1d(thp_data, s.quant, s.outlier, radius); + subr_v0::write_1d(s.quant, s.outlier, len3.x, id_base, quant, outlier); +} + +template +__global__ void +psz::cuda::__kernel::v0::delta_only::c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* quant) +{ + namespace subr_v0 = psz::cuda::__device::v0; + + constexpr auto NTHREAD = BLOCK / SEQ; + + __shared__ struct { + union { + T data[BLOCK]; + T outlier[BLOCK]; + }; + EQ quant[BLOCK]; + } s; + + T prev{0}; + T thp_data[SEQ]; + + auto id_base = blockIdx.x * BLOCK; + + subr_v0::load_prequant_1d(data, len3.x, id_base, s.data, thp_data, prev, ebx2_r); + subr_v0::predict_quantize__no_outlier_1d(thp_data, s.quant, prev); + subr_v0::predict_quantize__no_outlier_1d(thp_data, s.quant); + subr_v0::write_1d(s.quant, nullptr, len3.x, id_base, quant, nullptr); +} + +template +__global__ void psz::cuda::__kernel::v0::compaction::c_lorenzo_1d1l( + T* data, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r, + EQ* quant, + Compaction outlier_desc) +{ + namespace subr_v0 = psz::cuda::__device::v0; + namespace subr_v0c = psz::cuda::__device::v0::compaction; + + constexpr auto NTHREAD = BLOCK / SEQ; + + __shared__ struct { + union { + T data[BLOCK]; + T outlier[BLOCK]; + }; + EQ quant[BLOCK]; + } s; + + T prev{0}; + T thp_data[SEQ]; + + auto id_base 
= blockIdx.x * BLOCK; + + subr_v0::load_prequant_1d(data, len3.x, id_base, s.data, thp_data, prev, ebx2_r); + subr_v0c::predict_quantize_1d(thp_data, s.quant, len3.x, radius, id_base, outlier_desc, prev); + subr_v0c::predict_quantize_1d(thp_data, s.quant, len3.x, radius, id_base, outlier_desc); + subr_v0::write_1d(s.quant, nullptr, len3.x, id_base, quant, nullptr); +} + +template +__global__ void psz::cuda::__kernel::v1_pn::compaction::c_lorenzo_1d1l( // + T* data, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r, + EQ* quant, + Compaction outlier) +{ + namespace subr_v0 = psz::cuda::__device::v0; + namespace subr_v1c = psz::cuda::__device::v1_pn::compaction; + + constexpr auto NTHREAD = BLOCK / SEQ; + + __shared__ struct { + union { + T data[BLOCK]; + T outlier[BLOCK]; + }; + EQ quant[BLOCK]; + } s; + + T prev{0}; + T thp_data[SEQ]; + + auto id_base = blockIdx.x * BLOCK; + + subr_v0::load_prequant_1d(data, len3.x, id_base, s.data, thp_data, prev, ebx2_r); + subr_v1c::predict_quantize_1d(thp_data, s.quant, s.outlier, radius, prev); + subr_v1c::predict_quantize_1d(thp_data, s.quant, s.outlier, radius); + subr_v0::write_1d(s.quant, s.outlier, len3.x, id_base, quant, outlier); +} + +template +__global__ void psz::cuda::__kernel::v0::x_lorenzo_1d1l( // + EQ* quant, + T* outlier, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2, + T* xdata) +{ + namespace subr_v0 = psz::cuda::__device::v0; + namespace wave32 = psz::cuda::__device::wave32; + + constexpr auto NTHREAD = BLOCK / SEQ; // equiv. to blockDim.x + + __shared__ struct { + union { + T outlier[BLOCK]; + T xdata[BLOCK]; + }; + // even if it's wave64, "/32" works + T exchange_in[NTHREAD / 32]; + T exchange_out[NTHREAD / 32]; + } s; + + T thp_data[SEQ]; + + auto id_base = blockIdx.x * BLOCK; + + subr_v0::load_fuse_1d(quant, outlier, len3.x, id_base, radius, s.xdata, thp_data); + subr_v0::block_scan_1d(thp_data, ebx2, s.exchange_in, s.exchange_out, s.xdata); + subr_v0::write_1d(s.xdata, nullptr, len3.x, id_base, xdata, nullptr); +} + +template +__global__ void psz::cuda::__kernel::v0::delta_only::x_lorenzo_1d1l( // + EQ* quant, + dim3 len3, + dim3 stride3, + FP ebx2, + T* xdata) +{ + namespace subr_v0 = psz::cuda::__device::v0; + + constexpr auto NTHREAD = BLOCK / SEQ; // equiv. 
to blockDim.x + + __shared__ struct { + T xdata[BLOCK]; + // even if it's wave64, "/32" works + T exchange_in[NTHREAD / 32]; + T exchange_out[NTHREAD / 32]; + } s; + + T thp_data[SEQ]; + + auto id_base = blockIdx.x * BLOCK; + + subr_v0::delta_only::load_1d(quant, len3.x, id_base, s.xdata, thp_data); + subr_v0::block_scan_1d(thp_data, ebx2, s.exchange_in, s.exchange_out, s.xdata); + subr_v0::write_1d(s.xdata, nullptr, len3.x, id_base, xdata, nullptr); +} + +//////////////////////////////////////////////////////////////////////////////// +// 2D definition + +template +__global__ void +psz::cuda::__kernel::v0::c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, T* outlier) +{ + namespace subr_v0 = psz::cuda::__device::v0; + + constexpr auto BLOCK = 16; + constexpr auto YSEQ = 8; + + T center[YSEQ + 1] = {0}; // NW N first element <- 0 + // W center + + auto gix = blockIdx.x * BLOCK + threadIdx.x; // BDX == BLOCK == 16 + auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 + + subr_v0::load_prequant_2d(data, len3.x, gix, len3.y, giy_base, stride3.y, ebx2_r, center); + subr_v0::predict_2d(center); + subr_v0::quantize_write_2d(center, len3.x, gix, len3.y, giy_base, stride3.y, radius, quant, outlier); +} + +template +__global__ void +psz::cuda::__kernel::v0::delta_only::c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* quant) +{ + namespace subr_v0 = psz::cuda::__device::v0; + + constexpr auto BLOCK = 16; + constexpr auto YSEQ = 8; + + T center[YSEQ + 1] = {0}; // NW N first element <- 0 + // W center + + auto gix = blockIdx.x * BLOCK + threadIdx.x; // BDX == BLOCK == 16 + auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 + + subr_v0::load_prequant_2d(data, len3.x, gix, len3.y, giy_base, stride3.y, ebx2_r, center); + subr_v0::predict_2d(center); + subr_v0::delta_only::quantize_write_2d(center, len3.x, gix, len3.y, giy_base, stride3.y, quant); +} + +template +__global__ void +psz::cuda::__kernel::v1_pn::delta_only::c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* quant) +{ + namespace subr_v0 = psz::cuda::__device::v0; + namespace subr_v1d = psz::cuda::__device::v1_pn::delta_only; + + constexpr auto BLOCK = 16; + constexpr auto YSEQ = 8; + + T center[YSEQ + 1] = {0}; // NW N first element <- 0 + // W center + + auto gix = blockIdx.x * BLOCK + threadIdx.x; // BDX == BLOCK == 16 + auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 + + subr_v0::load_prequant_2d(data, len3.x, gix, len3.y, giy_base, stride3.y, ebx2_r, center); + subr_v0::predict_2d(center); + subr_v1d::quantize_write_2d(center, len3.x, gix, len3.y, giy_base, stride3.y, quant); +} + +template +__global__ void psz::cuda::__kernel::v0::compaction::c_lorenzo_2d1l( + T* data, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r, + EQ* quant, + Compaction outlier) +{ + namespace subr_v0 = psz::cuda::__device::v0; + + constexpr auto BLOCK = 16; + constexpr auto YSEQ = 8; + + T center[YSEQ + 1] = {0}; // NW N first element <- 0 + // W center + + auto gix = blockIdx.x * BLOCK + threadIdx.x; // BDX == BLOCK == 16 + auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 + + subr_v0::load_prequant_2d(data, len3.x, gix, len3.y, giy_base, stride3.y, ebx2_r, center); + subr_v0::predict_2d(center); + subr_v0::compaction::quantize_write_2d( + center, len3.x, gix, len3.y, giy_base, stride3.y, radius, quant, outlier); +} + +// 16x16 data block maps to 16x2 (one 
warp) thread block +template +__global__ void psz::cuda::__kernel::v0::x_lorenzo_2d1l( // + EQ* quant, + T* outlier, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2, + T* xdata) +{ + namespace subr_v0 = psz::cuda::__device::v0; + + constexpr auto BLOCK = 16; + constexpr auto YSEQ = BLOCK / 2; // sequentiality in y direction + static_assert(BLOCK == 16, "In one case, we need BLOCK for 2D == 16"); + + __shared__ T intermediate[BLOCK]; // TODO use warp shuffle to eliminate this + T thread_private[YSEQ]; + + auto gix = blockIdx.x * BLOCK + threadIdx.x; + auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 + + auto get_gid = [&](auto i) { return (giy_base + i) * stride3.y + gix; }; + + subr_v0::load_fuse_2d( + quant, outlier, len3.x, gix, len3.y, giy_base, stride3.y, radius, thread_private); + subr_v0::block_scan_2d(thread_private, intermediate, ebx2); + subr_v0::decomp_write_2d(thread_private, len3.x, gix, len3.y, giy_base, stride3.y, xdata); +} + +// 16x16 data block maps to 16x2 (one warp) thread block +template +__global__ void psz::cuda::__kernel::v1_pn::x_lorenzo_2d1l( // + EQ* quant, + T* outlier, + dim3 len3, + dim3 stride3, + FP ebx2, + T* xdata) +{ + namespace subr_v0 = psz::cuda::__device::v0; + namespace subr_v1_pn = psz::cuda::__device::v1_pn; + + constexpr auto BLOCK = 16; + constexpr auto YSEQ = BLOCK / 2; // sequentiality in y direction + static_assert(BLOCK == 16, "In one case, we need BLOCK for 2D == 16"); + + __shared__ T intermediate[BLOCK]; // TODO use warp shuffle to eliminate this + T thread_private[YSEQ]; + + auto gix = blockIdx.x * BLOCK + threadIdx.x; + auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 + + auto get_gid = [&](auto i) { return (giy_base + i) * stride3.y + gix; }; + + subr_v1_pn::load_fuse_2d(quant, outlier, len3.x, gix, len3.y, giy_base, stride3.y, thread_private); + subr_v0::block_scan_2d(thread_private, intermediate, ebx2); + subr_v0::decomp_write_2d(thread_private, len3.x, gix, len3.y, giy_base, stride3.y, xdata); +} + +// 16x16 data block maps to 16x2 (one warp) thread block +template +__global__ void psz::cuda::__kernel::v0::delta_only::x_lorenzo_2d1l( // + EQ* quant, + dim3 len3, + dim3 stride3, + FP ebx2, + T* xdata) +{ + namespace subr_v0 = psz::cuda::__device::v0; + + constexpr auto BLOCK = 16; + constexpr auto YSEQ = BLOCK / 2; // sequentiality in y direction + static_assert(BLOCK == 16, "In one case, we need BLOCK for 2D == 16"); + + __shared__ T intermediate[BLOCK]; // TODO use warp shuffle to eliminate this + T thread_private[YSEQ]; + + auto gix = blockIdx.x * BLOCK + threadIdx.x; + auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 + + auto get_gid = [&](auto i) { return (giy_base + i) * stride3.y + gix; }; + + subr_v0::delta_only::load_2d(quant, len3.x, gix, len3.y, giy_base, stride3.y, thread_private); + subr_v0::block_scan_2d(thread_private, intermediate, ebx2); + subr_v0::decomp_write_2d(thread_private, len3.x, gix, len3.y, giy_base, stride3.y, xdata); +} + +// 16x16 data block maps to 16x2 (one warp) thread block +template +__global__ void psz::cuda::__kernel::v1_pn::delta_only::x_lorenzo_2d1l( // + EQ* quant, + dim3 len3, + dim3 stride3, + FP ebx2, + T* xdata) +{ + namespace subr_v0 = psz::cuda::__device::v0; + namespace subr_v1_pn = psz::cuda::__device::v1_pn; + + constexpr auto BLOCK = 16; + constexpr auto YSEQ = BLOCK / 2; // sequentiality in y direction + static_assert(BLOCK == 16, "In one case, we need BLOCK for 2D == 16"); 
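+
+    // Layout note: each 16x16 data tile maps to a 16x2 thread block, so every thread
+    // reconstructs YSEQ = 8 values along y. block_scan_2d (subroutine.inl) is expected
+    // to mirror the explicit cusz::x_lorenzo_2d1l in lorenzo.inl: a sequential partial
+    // sum over the 8 private values, a hand-off of the lower half's last value through
+    // `intermediate`, an in-warp __shfl_up_sync scan along x, then a scale by ebx2.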
+ + __shared__ T intermediate[BLOCK]; // TODO use warp shuffle to eliminate this + T thread_private[YSEQ]; + + auto gix = blockIdx.x * BLOCK + threadIdx.x; + auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 + + auto get_gid = [&](auto i) { return (giy_base + i) * stride3.y + gix; }; + + subr_v1_pn::delta_only::load_2d(quant, len3.x, gix, len3.y, giy_base, stride3.y, thread_private); + subr_v0::block_scan_2d(thread_private, intermediate, ebx2); + subr_v0::decomp_write_2d(thread_private, len3.x, gix, len3.y, giy_base, stride3.y, xdata); +} + +template +__global__ void psz::cuda::__kernel::v0::legacy::c_lorenzo_3d1l( + T* data, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r, + EQ* quant, + T* outlier) +{ + constexpr auto BLOCK = 8; + __shared__ T s[8][8][32]; + + auto z = threadIdx.z; + + auto gix = blockIdx.x * (BLOCK * 4) + threadIdx.x; + auto giy_base = blockIdx.y * BLOCK; + auto giz = blockIdx.z * BLOCK + z; + auto base_id = gix + giy_base * stride3.y + giz * stride3.z; + + auto giy = [&](auto y) { return giy_base + y; }; + auto gid = [&](auto y) { return base_id + y * stride3.y; }; + + auto load_prequant_3d = [&]() { + if (gix < len3.x and giz < len3.z) { + for (auto y = 0; y < BLOCK; y++) + if (giy(y) < len3.y) s[z][y][threadIdx.x] = round(data[gid(y)] * ebx2_r); // prequant (fp presence) + } + __syncthreads(); + }; + + auto quantize_write = [&](T delta, auto x, auto y, auto z, auto gid) { + bool quantizable = fabs(delta) < radius; + T candidate = delta + radius; + if (x < len3.x and y < len3.y and z < len3.z) { + quant[gid] = quantizable * static_cast(candidate); + outlier[gid] = (not quantizable) * candidate; + } + }; + + auto x = threadIdx.x % 8; + + auto predict_3d = [&](auto y) { + T delta = s[z][y][threadIdx.x] - // + ((z > 0 and y > 0 and x > 0 ? s[z - 1][y - 1][threadIdx.x - 1] : 0) // dist=3 + - (y > 0 and x > 0 ? s[z][y - 1][threadIdx.x - 1] : 0) // dist=2 + - (z > 0 and x > 0 ? s[z - 1][y][threadIdx.x - 1] : 0) // + - (z > 0 and y > 0 ? s[z - 1][y - 1][threadIdx.x] : 0) // + + (x > 0 ? s[z][y][threadIdx.x - 1] : 0) // dist=1 + + (y > 0 ? s[z][y - 1][threadIdx.x] : 0) // + + (z > 0 ? 
s[z - 1][y][threadIdx.x] : 0)); // + return delta; + }; + + //////////////////////////////////////////////////////////////////////////// + + load_prequant_3d(); + for (auto y = 0; y < BLOCK; y++) { + auto delta = predict_3d(y); + quantize_write(delta, gix, giy(y), giz, gid(y)); + } +} + +template +__global__ void +psz::cuda::__kernel::v0::c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, T* outlier) +{ + constexpr auto BLOCK = 8; + __shared__ T s[9][33]; + T delta[BLOCK + 1] = {0}; // first el = 0 + + const auto gix = blockIdx.x * (BLOCK * 4) + threadIdx.x; + const auto giy = blockIdx.y * BLOCK + threadIdx.y; + const auto giz_base = blockIdx.z * BLOCK; + const auto base_id = gix + giy * stride3.y + giz_base * stride3.z; + + auto giz = [&](auto z) { return giz_base + z; }; + auto gid = [&](auto z) { return base_id + z * stride3.z; }; + + auto load_prequant_3d = [&]() { + if (gix < len3.x and giy < len3.y) { + for (auto z = 0; z < BLOCK; z++) + if (giz(z) < len3.z) delta[z + 1] = round(data[gid(z)] * ebx2_r); // prequant (fp presence) + } + __syncthreads(); + }; + + auto quantize_write = [&](T delta, auto x, auto y, auto z, auto gid) { + bool quantizable = fabs(delta) < radius; + T candidate = delta + radius; + if (x < len3.x and y < len3.y and z < len3.z) { + quant[gid] = quantizable * static_cast(candidate); + outlier[gid] = (not quantizable) * candidate; + } + }; + + //////////////////////////////////////////////////////////////////////////// + + /* z-direction, sequential in private buffer + delta = + (s[z][y][x] - s[z-1][y][x]) + - (s[z][y][x-1] - s[z-1][y][x-1]) + + (s[z][y-1][x-1] - s[z-1][y-1][x-1]) + - (s[z][y-1][x] - s[z-1][y-1][x]) + + x-direction, shuffle + delta = + (s[z][y][x] - s[z][y][x-1]) + - (s[z][y-1][x] - s[z][y-1][x-1]) + + y-direction, shmem + delta = s[z][y][x] - s[z][y-1][x] + */ + + load_prequant_3d(); + + for (auto z = BLOCK; z > 0; z--) { + // z-direction + delta[z] -= delta[z - 1]; + + // x-direction + auto prev_x = __shfl_up_sync(0xffffffff, delta[z], 1, 8); + if (threadIdx.x % BLOCK > 0) delta[z] -= prev_x; + + // y-direction, exchange via shmem + // ghost padding along y + s[threadIdx.y + 1][threadIdx.x] = delta[z]; + __syncthreads(); + + delta[z] -= (threadIdx.y > 0) * s[threadIdx.y][threadIdx.x]; + + // now delta[z] is delta + quantize_write(delta[z], gix, giy, giz(z - 1), gid(z - 1)); + } +} + +template +__global__ void psz::cuda::__kernel::v0::delta_only::c_lorenzo_3d1l( // + T* data, + dim3 len3, + dim3 stride3, + FP ebx2_r, + EQ* quant) +{ + constexpr auto BLOCK = 8; + __shared__ T s[9][33]; + T delta[BLOCK + 1] = {0}; // first el = 0 + + const auto gix = blockIdx.x * (BLOCK * 4) + threadIdx.x; + const auto giy = blockIdx.y * BLOCK + threadIdx.y; + const auto giz_base = blockIdx.z * BLOCK; + const auto base_id = gix + giy * stride3.y + giz_base * stride3.z; + + auto giz = [&](auto z) { return giz_base + z; }; + auto gid = [&](auto z) { return base_id + z * stride3.z; }; + + auto load_prequant_3d = [&]() { + if (gix < len3.x and giy < len3.y) { + for (auto z = 0; z < BLOCK; z++) + if (giz(z) < len3.z) delta[z + 1] = round(data[gid(z)] * ebx2_r); // prequant (fp presence) + } + __syncthreads(); + }; + + auto quantize_write = [&](T delta, auto x, auto y, auto z, auto gid) { + if (x < len3.x and y < len3.y and z < len3.z) quant[gid] = static_cast(delta); + }; + + //////////////////////////////////////////////////////////////////////////// + + load_prequant_3d(); + + for (auto z = BLOCK; z > 0; z--) { + // z-direction + 
delta[z] -= delta[z - 1]; + + // x-direction + auto prev_x = __shfl_up_sync(0xffffffff, delta[z], 1, 8); + if (threadIdx.x % BLOCK > 0) delta[z] -= prev_x; + + // y-direction, exchange via shmem + // ghost padding along y + s[threadIdx.y + 1][threadIdx.x] = delta[z]; + __syncthreads(); + + delta[z] -= (threadIdx.y > 0) * s[threadIdx.y][threadIdx.x]; + + // now delta[z] is delta + quantize_write(delta[z], gix, giy, giz(z - 1), gid(z - 1)); + } +} + +template +__global__ void psz::cuda::__kernel::v1_pn::delta_only::c_lorenzo_3d1l( // + T* data, + dim3 len3, + dim3 stride3, + FP ebx2_r, + EQ* quant) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename psz::typing::Int::T; + + constexpr auto BLOCK = 8; + __shared__ T s[9][33]; + T delta[BLOCK + 1] = {0}; // first el = 0 + + const auto gix = blockIdx.x * (BLOCK * 4) + threadIdx.x; + const auto giy = blockIdx.y * BLOCK + threadIdx.y; + const auto giz_base = blockIdx.z * BLOCK; + const auto base_id = gix + giy * stride3.y + giz_base * stride3.z; + + auto giz = [&](auto z) { return giz_base + z; }; + auto gid = [&](auto z) { return base_id + z * stride3.z; }; + + auto load_prequant_3d = [&]() { + if (gix < len3.x and giy < len3.y) { + for (auto z = 0; z < BLOCK; z++) + if (giz(z) < len3.z) delta[z + 1] = round(data[gid(z)] * ebx2_r); // prequant (fp presence) + } + __syncthreads(); + }; + + auto quantize_write = [&](T delta, auto x, auto y, auto z, auto gid) { + if (x < len3.x and y < len3.y and z < len3.z) quant[gid] = PN::encode(static_cast(delta)); + }; + + //////////////////////////////////////////////////////////////////////////// + + load_prequant_3d(); + + for (auto z = BLOCK; z > 0; z--) { + // z-direction + delta[z] -= delta[z - 1]; + + // x-direction + auto prev_x = __shfl_up_sync(0xffffffff, delta[z], 1, 8); + if (threadIdx.x % BLOCK > 0) delta[z] -= prev_x; + + // y-direction, exchange via shmem + // ghost padding along y + s[threadIdx.y + 1][threadIdx.x] = delta[z]; + __syncthreads(); + + delta[z] -= (threadIdx.y > 0) * s[threadIdx.y][threadIdx.x]; + + // now delta[z] is delta + quantize_write(delta[z], gix, giy, giz(z - 1), gid(z - 1)); + } +} + +template +__global__ void psz::cuda::__kernel::v0::compaction::c_lorenzo_3d1l( + T* data, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r, + EQ* quant, + Compaction outlier) +{ + constexpr auto BLOCK = 8; + __shared__ T s[9][33]; + T delta[BLOCK + 1] = {0}; // first el = 0 + + const auto gix = blockIdx.x * (BLOCK * 4) + threadIdx.x; + const auto giy = blockIdx.y * BLOCK + threadIdx.y; + const auto giz_base = blockIdx.z * BLOCK; + const auto base_id = gix + giy * stride3.y + giz_base * stride3.z; + + auto giz = [&](auto z) { return giz_base + z; }; + auto gid = [&](auto z) { return base_id + z * stride3.z; }; + + auto load_prequant_3d = [&]() { + if (gix < len3.x and giy < len3.y) { + for (auto z = 0; z < BLOCK; z++) + if (giz(z) < len3.z) delta[z + 1] = round(data[gid(z)] * ebx2_r); // prequant (fp presence) + } + __syncthreads(); + }; + + auto quantize_compact_write = [&](T delta, auto x, auto y, auto z, auto gid) { + bool quantizable = fabs(delta) < radius; + T candidate = delta + radius; + if (x < len3.x and y < len3.y and z < len3.z) { + quant[gid] = quantizable * static_cast(candidate); + if (not quantizable) { + auto cur_idx = atomicAdd(outlier.count, 1); + outlier.idx[cur_idx] = gid; + outlier.val[cur_idx] = candidate; + } + } + }; + + //////////////////////////////////////////////////////////////////////////// + + load_prequant_3d(); + + for 
(auto z = BLOCK; z > 0; z--) { + // z-direction + delta[z] -= delta[z - 1]; + + // x-direction + auto prev_x = __shfl_up_sync(0xffffffff, delta[z], 1, 8); + if (threadIdx.x % BLOCK > 0) delta[z] -= prev_x; + + // y-direction, exchange via shmem + // ghost padding along y + s[threadIdx.y + 1][threadIdx.x] = delta[z]; + __syncthreads(); + + delta[z] -= (threadIdx.y > 0) * s[threadIdx.y][threadIdx.x]; + + // now delta[z] is delta + quantize_compact_write(delta[z], gix, giy, giz(z - 1), gid(z - 1)); + } +} + +template +__global__ void psz::cuda::__kernel::v1_pn::compaction::c_lorenzo_3d1l( + T* data, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r, + EQ* quant, + Compaction outlier) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename psz::typing::Int::T; + + constexpr auto BLOCK = 8; + __shared__ T s[9][33]; + T delta[BLOCK + 1] = {0}; // first el = 0 + + const auto gix = blockIdx.x * (BLOCK * 4) + threadIdx.x; + const auto giy = blockIdx.y * BLOCK + threadIdx.y; + const auto giz_base = blockIdx.z * BLOCK; + const auto base_id = gix + giy * stride3.y + giz_base * stride3.z; + + auto giz = [&](auto z) { return giz_base + z; }; + auto gid = [&](auto z) { return base_id + z * stride3.z; }; + + // TODO move to subroutine.inl + auto load_prequant_3d = [&]() { + if (gix < len3.x and giy < len3.y) { + for (auto z = 0; z < BLOCK; z++) + if (giz(z) < len3.z) delta[z + 1] = round(data[gid(z)] * ebx2_r); // prequant (fp presence) + } + __syncthreads(); + }; + + auto quantize_compact_write = [&](T delta, auto x, auto y, auto z, auto gid) { + bool quantizable = fabs(delta) < radius; + UI UI_delta = PN::encode(static_cast(delta)); + + T candidate = delta + radius; + if (x < len3.x and y < len3.y and z < len3.z) { + quant[gid] = quantizable * UI_delta; + if (not quantizable) { + auto cur_idx = atomicAdd(outlier.count, 1); + outlier.idx[cur_idx] = gid; + outlier.val[cur_idx] = UI_delta; + } + } + }; + + //////////////////////////////////////////////////////////////////////////// + + load_prequant_3d(); + + for (auto z = BLOCK; z > 0; z--) { + // z-direction + delta[z] -= delta[z - 1]; + + // x-direction + auto prev_x = __shfl_up_sync(0xffffffff, delta[z], 1, 8); + if (threadIdx.x % BLOCK > 0) delta[z] -= prev_x; + + // y-direction, exchange via shmem + // ghost padding along y + s[threadIdx.y + 1][threadIdx.x] = delta[z]; + __syncthreads(); + + delta[z] -= (threadIdx.y > 0) * s[threadIdx.y][threadIdx.x]; + + // now delta[z] is delta + quantize_compact_write(delta[z], gix, giy, giz(z - 1), gid(z - 1)); + } +} + +// 32x8x8 data block maps to 32x1x8 thread block +template +__global__ void psz::cuda::__kernel::v0::x_lorenzo_3d1l( // + EQ* quant, + T* outlier, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2, + T* xdata) +{ + constexpr auto BLOCK = 8; + constexpr auto YSEQ = BLOCK; + static_assert(BLOCK == 8, "In one case, we need BLOCK for 3D == 8"); + + __shared__ T intermediate[BLOCK][4][8]; + T thread_private[YSEQ]; + + auto seg_id = threadIdx.x / 8; + auto seg_tix = threadIdx.x % 8; + + auto gix = blockIdx.x * (4 * BLOCK) + threadIdx.x; + auto giy_base = blockIdx.y * BLOCK; + auto giy = [&](auto y) { return giy_base + y; }; + auto giz = blockIdx.z * BLOCK + threadIdx.z; + auto gid = [&](auto y) { return giz * stride3.z + (giy_base + y) * stride3.y + gix; }; + + auto load_fuse_3d = [&]() { + // load to thread-private array (fuse at the same time) +#pragma unroll + for (auto y = 0; y < YSEQ; y++) { + if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) + 
thread_private[y] = outlier[gid(y)] + static_cast(quant[gid(y)]) - radius; // fuse + else + thread_private[y] = 0; + } + }; + + auto block_scan_3d = [&]() { + // partial-sum along y-axis, sequentially + for (auto y = 1; y < YSEQ; y++) thread_private[y] += thread_private[y - 1]; + +#pragma unroll + for (auto i = 0; i < BLOCK; i++) { + // ND partial-sums along x- and z-axis + // in-warp shuffle used: in order to perform, it's transposed after X-partial sum + T val = thread_private[i]; + + for (auto dist = 1; dist < BLOCK; dist *= 2) { + auto addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + // x-z transpose + intermediate[threadIdx.z][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][threadIdx.z]; + __syncthreads(); + + for (auto dist = 1; dist < BLOCK; dist *= 2) { + auto addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + intermediate[threadIdx.z][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][threadIdx.z]; + __syncthreads(); + + thread_private[i] = val; + } + }; + + auto decomp_write_3d = [&]() { +#pragma unroll + for (auto y = 0; y < YSEQ; y++) + if (gix < len3.x and giy(y) < len3.y and giz < len3.z) xdata[gid(y)] = thread_private[y] * ebx2; + }; + + //////////////////////////////////////////////////////////////////////////// + load_fuse_3d(); + block_scan_3d(); + decomp_write_3d(); +} + +// 32x8x8 data block maps to 32x1x8 thread block +template +__global__ void psz::cuda::__kernel::v1_pn::x_lorenzo_3d1l( // + EQ* quant, + T* outlier, + dim3 len3, + dim3 stride3, + FP ebx2, + T* xdata) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename psz::typing::Int::T; + + constexpr auto BLOCK = 8; + constexpr auto YSEQ = BLOCK; + static_assert(BLOCK == 8, "In one case, we need BLOCK for 3D == 8"); + + __shared__ T intermediate[BLOCK][4][8]; + T thread_private[YSEQ]; + + auto seg_id = threadIdx.x / 8; + auto seg_tix = threadIdx.x % 8; + + auto gix = blockIdx.x * (4 * BLOCK) + threadIdx.x; + auto giy_base = blockIdx.y * BLOCK; + auto giy = [&](auto y) { return giy_base + y; }; + auto giz = blockIdx.z * BLOCK + threadIdx.z; + auto gid = [&](auto y) { return giz * stride3.z + (giy_base + y) * stride3.y + gix; }; + + auto load_fuse_3d = [&]() { + // load to thread-private array (fuse at the same time) +#pragma unroll + for (auto y = 0; y < YSEQ; y++) { + if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) + thread_private[y] = outlier[gid(y)] + PN::decode(quant[gid(y)]); // fuse + else + thread_private[y] = 0; + } + }; + + auto block_scan_3d = [&]() { + // partial-sum along y-axis, sequentially + for (auto y = 1; y < YSEQ; y++) thread_private[y] += thread_private[y - 1]; + +#pragma unroll + for (auto i = 0; i < BLOCK; i++) { + // ND partial-sums along x- and z-axis + // in-warp shuffle used: in order to perform, it's transposed after X-partial sum + T val = thread_private[i]; + + for (auto dist = 1; dist < BLOCK; dist *= 2) { + auto addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + // x-z transpose + intermediate[threadIdx.z][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][threadIdx.z]; + __syncthreads(); + + for (auto dist = 1; dist < BLOCK; dist *= 2) { + auto addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + intermediate[threadIdx.z][seg_id][seg_tix] = val; + __syncthreads(); + 
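+            // reading the transposed slot a second time undoes the earlier x-z
+            // transpose, so each thread again holds its own element, now scanned
+            // along both x and z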
val = intermediate[seg_tix][seg_id][threadIdx.z]; + __syncthreads(); + + thread_private[i] = val; + } + }; + + auto decomp_write_3d = [&]() { +#pragma unroll + for (auto y = 0; y < YSEQ; y++) + if (gix < len3.x and giy(y) < len3.y and giz < len3.z) xdata[gid(y)] = thread_private[y] * ebx2; + }; + + //////////////////////////////////////////////////////////////////////////// + load_fuse_3d(); + block_scan_3d(); + decomp_write_3d(); +} + +// 32x8x8 data block maps to 32x1x8 thread block +template +__global__ void psz::cuda::__kernel::v0::delta_only::x_lorenzo_3d1l( // + EQ* quant, + dim3 len3, + dim3 stride3, + FP ebx2, + T* xdata) +{ + constexpr auto BLOCK = 8; + constexpr auto YSEQ = BLOCK; + static_assert(BLOCK == 8, "In one case, we need BLOCK for 3D == 8"); + + __shared__ T intermediate[BLOCK][4][8]; + T thread_private[YSEQ]; + + auto seg_id = threadIdx.x / 8; + auto seg_tix = threadIdx.x % 8; + + auto gix = blockIdx.x * (4 * BLOCK) + threadIdx.x; + auto giy_base = blockIdx.y * BLOCK; + auto giy = [&](auto y) { return giy_base + y; }; + auto giz = blockIdx.z * BLOCK + threadIdx.z; + auto gid = [&](auto y) { return giz * stride3.z + (giy_base + y) * stride3.y + gix; }; + + auto load_3d = [&]() { + // load to thread-private array (fuse at the same time) +#pragma unroll + for (auto y = 0; y < YSEQ; y++) { + if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) + thread_private[y] = static_cast(quant[gid(y)]); // fuse + else + thread_private[y] = 0; + } + }; + + auto block_scan_3d = [&]() { + // partial-sum along y-axis, sequentially + for (auto y = 1; y < YSEQ; y++) thread_private[y] += thread_private[y - 1]; + +#pragma unroll + for (auto i = 0; i < BLOCK; i++) { + // ND partial-sums along x- and z-axis + // in-warp shuffle used: in order to perform, it's transposed after X-partial sum + T val = thread_private[i]; + + for (auto dist = 1; dist < BLOCK; dist *= 2) { + auto addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + // x-z transpose + intermediate[threadIdx.z][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][threadIdx.z]; + __syncthreads(); + + for (auto dist = 1; dist < BLOCK; dist *= 2) { + auto addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + intermediate[threadIdx.z][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][threadIdx.z]; + __syncthreads(); + + thread_private[i] = val; + } + }; + + auto decomp_write_3d = [&]() { +#pragma unroll + for (auto y = 0; y < YSEQ; y++) + if (gix < len3.x and giy(y) < len3.y and giz < len3.z) xdata[gid(y)] = thread_private[y] * ebx2; + }; + + //////////////////////////////////////////////////////////////////////////// + load_3d(); + block_scan_3d(); + decomp_write_3d(); +} diff --git a/qtensor/compression/cusz/src/kernel/detail/lorenzo_proto.inl b/qtensor/compression/cusz/src/kernel/detail/lorenzo_proto.inl new file mode 100644 index 00000000..2ed25984 --- /dev/null +++ b/qtensor/compression/cusz/src/kernel/detail/lorenzo_proto.inl @@ -0,0 +1,214 @@ +/** + * @file lorenzo_proto.inl + * @author Jiannan Tian + * @brief (prototype) Dual-EQ Lorenzo method. 
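+ * @note (added) Reference kernels ("easy algorithmic description"): each block stages
+ *       its tile in shared memory and applies the Lorenzo stencil directly, without the
+ *       warp-shuffle / thread-private-sequence optimizations used by the tuned kernels.
+ *       Illustrative 1D example: prequantized data {3, 5, 6} gives deltas {3, 2, 1};
+ *       the inclusive prefix sum of the deltas recovers {3, 5, 6}.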
+ * @version 0.2 + * @date 2021-01-16 + * (create) 2019-09-23; (release) 2020-09-20; (rev1) 2021-01-16; (rev2) 2021-02-20; (rev3) 2021-04-11 + * (rev4) 2021-04-30 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef CUSZ_KERNEL_LORENZO_PROTOTYPE_CUH +#define CUSZ_KERNEL_LORENZO_PROTOTYPE_CUH + +#include +#include + +#include "utils/cuda_err.cuh" +#include "utils/timer.h" + +namespace psz { + +namespace cuda { +namespace __kernel { + +namespace prototype { // easy algorithmic description + +template +__global__ void c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* eq, T* outlier) +{ + __shared__ T buf[BLK]; + + auto id = blockIdx.x * blockDim.x + threadIdx.x; + if (id < len3.x) { + buf[threadIdx.x] = round(data[id] * ebx2_r); // prequant (fp presence) + } + __syncthreads(); // necessary to ensure correctness + + T delta = buf[threadIdx.x] - (threadIdx.x == 0 ? 0 : buf[threadIdx.x - 1]); + + bool quantizable = fabs(delta) < radius; + T candidate = delta + radius; + if (id < len3.x) { // postquant + data[id] = (1 - quantizable) * candidate; // output; reuse data for outlier + eq[id] = quantizable * static_cast(candidate); + } +} + +template +__global__ void c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* eq, T* outlier) +{ + __shared__ T buf[BLK][BLK + 1]; + + auto y = threadIdx.y, x = threadIdx.x; + auto giy = blockIdx.y * blockDim.y + y, gix = blockIdx.x * blockDim.x + x; + + auto id = gix + giy * stride3.y; // low to high dim, inner to outer + if (gix < len3.x and giy < len3.y) { + buf[y][x] = round(data[id] * ebx2_r); // prequant (fp presence) + } + __syncthreads(); // necessary to ensure correctness + + T delta = buf[y][x] - ((x > 0 ? buf[y][x - 1] : 0) + // dist=1 + (y > 0 ? buf[y - 1][x] : 0) - // dist=1 + (x > 0 and y > 0 ? buf[y - 1][x - 1] : 0)); // dist=2 + + bool quantizable = fabs(delta) < radius; + T candidate = delta + radius; + if (gix < len3.x and giy < len3.y) { + data[id] = (1 - quantizable) * candidate; // output; reuse data for outlier + eq[id] = quantizable * static_cast(candidate); + } +} + +template +__global__ void c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* eq, T* outlier) +{ + __shared__ T buf[BLK][BLK][BLK + 1]; + + auto z = threadIdx.z, y = threadIdx.y, x = threadIdx.x; + auto giz = blockIdx.z * blockDim.z + z, giy = blockIdx.y * blockDim.y + y, gix = blockIdx.x * blockDim.x + x; + + auto id = gix + giy * stride3.y + giz * stride3.z; // low to high in dim, inner to outer + if (gix < len3.x and giy < len3.y and giz < len3.z) { + buf[z][y][x] = round(data[id] * ebx2_r); // prequant (fp presence) + } + __syncthreads(); // necessary to ensure correctness + + T delta = buf[z][y][x] - ((z > 0 and y > 0 and x > 0 ? buf[z - 1][y - 1][x - 1] : 0) // dist=3 + - (y > 0 and x > 0 ? buf[z][y - 1][x - 1] : 0) // dist=2 + - (z > 0 and x > 0 ? buf[z - 1][y][x - 1] : 0) // + - (z > 0 and y > 0 ? buf[z - 1][y - 1][x] : 0) // + + (x > 0 ? buf[z][y][x - 1] : 0) // dist=1 + + (y > 0 ? buf[z][y - 1][x] : 0) // + + (z > 0 ? 
buf[z - 1][y][x] : 0)); // + + bool quantizable = fabs(delta) < radius; + T candidate = delta + radius; + if (gix < len3.x and giy < len3.y and giz < len3.z) { + data[id] = (1 - quantizable) * candidate; // output; reuse data for outlier + eq[id] = quantizable * static_cast(candidate); + } +} + +template +__global__ void x_lorenzo_1d1l(EQ* eq, T* scattered_outlier, dim3 len3, dim3 stride3, int radius, FP ebx2, T* xdata) +{ + __shared__ T buf[BLK]; + + auto id = blockIdx.x * blockDim.x + threadIdx.x; + + if (id < len3.x) + buf[threadIdx.x] = scattered_outlier[id] + static_cast(eq[id]) - radius; // fuse + else + buf[threadIdx.x] = 0; + __syncthreads(); + + for (auto d = 1; d < BLK; d *= 2) { + T n = 0; + if (threadIdx.x >= d) n = buf[threadIdx.x - d]; // like __shfl_up_sync(0x1f, var, d); warp_sync + __syncthreads(); + if (threadIdx.x >= d) buf[threadIdx.x] += n; + __syncthreads(); + } + + if (id < len3.x) { xdata[id] = buf[threadIdx.x] * ebx2; } +} + +template +__global__ void x_lorenzo_2d1l(EQ* eq, T* scattered_outlier, dim3 len3, dim3 stride3, int radius, FP ebx2, T* xdata) +{ + __shared__ T buf[BLK][BLK + 1]; + + auto giy = blockIdx.y * blockDim.y + threadIdx.y, gix = blockIdx.x * blockDim.x + threadIdx.x; + size_t id = gix + giy * stride3.y; + + if (gix < len3.x and giy < len3.y) + buf[threadIdx.y][threadIdx.x] = scattered_outlier[id] + static_cast(eq[id]) - radius; // fuse + else + buf[threadIdx.y][threadIdx.x] = 0; + __syncthreads(); + + for (auto d = 1; d < BLK; d *= 2) { + T n = 0; + if (threadIdx.x >= d) n = buf[threadIdx.y][threadIdx.x - d]; + __syncthreads(); + if (threadIdx.x >= d) buf[threadIdx.y][threadIdx.x] += n; + __syncthreads(); + } + + for (auto d = 1; d < BLK; d *= 2) { + T n = 0; + if (threadIdx.y >= d) n = buf[threadIdx.y - d][threadIdx.x]; + __syncthreads(); + if (threadIdx.y >= d) buf[threadIdx.y][threadIdx.x] += n; + __syncthreads(); + } + + if (gix < len3.x and giy < len3.y) { xdata[id] = buf[threadIdx.y][threadIdx.x] * ebx2; } +} + +template +__global__ void x_lorenzo_3d1l(EQ* eq, T* scattered_outlier, dim3 len3, dim3 stride3, int radius, FP ebx2, T* xdata) +{ + __shared__ T buf[BLK][BLK][BLK + 1]; + + auto giz = blockIdx.z * BLK + threadIdx.z, giy = blockIdx.y * BLK + threadIdx.y, + gix = blockIdx.x * BLK + threadIdx.x; + size_t id = gix + giy * stride3.y + giz * stride3.z; // low to high in dim, inner to outer + + if (gix < len3.x and giy < len3.y and giz < len3.z) + buf[threadIdx.z][threadIdx.y][threadIdx.x] = scattered_outlier[id] + static_cast(eq[id]) - radius; // id + else + buf[threadIdx.z][threadIdx.y][threadIdx.x] = 0; + __syncthreads(); + + for (auto dist = 1; dist < BLK; dist *= 2) { + T addend = 0; + if (threadIdx.x >= dist) addend = buf[threadIdx.z][threadIdx.y][threadIdx.x - dist]; + __syncthreads(); + if (threadIdx.x >= dist) buf[threadIdx.z][threadIdx.y][threadIdx.x] += addend; + __syncthreads(); + } + + for (auto dist = 1; dist < BLK; dist *= 2) { + T addend = 0; + if (threadIdx.y >= dist) addend = buf[threadIdx.z][threadIdx.y - dist][threadIdx.x]; + __syncthreads(); + if (threadIdx.y >= dist) buf[threadIdx.z][threadIdx.y][threadIdx.x] += addend; + __syncthreads(); + } + + for (auto dist = 1; dist < BLK; dist *= 2) { + T addend = 0; + if (threadIdx.z >= dist) addend = buf[threadIdx.z - dist][threadIdx.y][threadIdx.x]; + __syncthreads(); + if (threadIdx.z >= dist) buf[threadIdx.z][threadIdx.y][threadIdx.x] += addend; + __syncthreads(); + } + + if (gix < len3.x and giy < len3.y and giz < len3.z) { + xdata[id] = 
buf[threadIdx.z][threadIdx.y][threadIdx.x] * ebx2; + } +} + +} // namespace prototype +} // namespace __kernel +} // namespace cuda +} // namespace psz + +#endif diff --git a/qtensor/compression/cusz/src/kernel/detail/lorenzo_serial.inl b/qtensor/compression/cusz/src/kernel/detail/lorenzo_serial.inl new file mode 100644 index 00000000..b00ec690 --- /dev/null +++ b/qtensor/compression/cusz/src/kernel/detail/lorenzo_serial.inl @@ -0,0 +1,326 @@ +/** + * @file lorenzo_serial.inl + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-03-13 + * + * (C) 2023 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef E0B87BA8_BEDC_4CBE_B5EE_C0C5875E07D6 +#define E0B87BA8_BEDC_4CBE_B5EE_C0C5875E07D6 + +#include +#include "cusz/it.hh" +#include "cusz/nd.h" + +using std::cout; +using std::endl; + +#define SETUP_1D_BASIC \ + psz_dim3 grid_dim, block_idx, thread_idx; \ + auto gx = [&]() -> uint32_t { return block_idx.x * BLK + thread_idx.x; }; \ + auto gidx = [&]() -> uint32_t { return block_idx.x * BLK + thread_idx.x; }; \ + auto check_boundary = [&]() { return gx() < len3.x; }; \ + grid_dim.x = (len3.x - 1) / BLK + 1; +#define SETUP_1D_DATABUF \ + constexpr auto PADDING = 1; \ + auto _buf1 = new psz_buf(); \ + auto& buf1 = *_buf1; \ + auto databuf_it = [&](auto x) -> T& { return buf1(thread_idx.x + x + PADDING); }; +#define SETUP_1D_EQBUF \ + auto _buf2 = new psz_buf(); \ + auto& buf2 = *_buf2; \ + auto eqbuf_it = [&](auto dx) -> EQ& { return buf2(thread_idx.x + dx); }; +#define PFOR_GRID_1D() for (block_idx.x = 0; block_idx.x < grid_dim.x; block_idx.x++) +#define PFOR_BLOCK_1D() for (thread_idx.x = 0; thread_idx.x < BLK; thread_idx.x++) + +#define SETUP_2D_BASIC \ + psz_dim3 grid_dim, block_idx, thread_idx; \ + auto gx = [&]() -> uint32_t { return block_idx.x * BLK + thread_idx.x; }; \ + auto gy = [&]() -> uint32_t { return block_idx.y * BLK + thread_idx.y; }; \ + auto gidx = [&]() -> uint32_t { return gy() * stride3.y + gx(); }; \ + auto check_boundary = [&]() { return gx() < len3.x and gy() < len3.y; }; \ + grid_dim.x = (len3.x - 1) / BLK + 1; \ + grid_dim.y = (len3.y - 1) / BLK + 1; +#define SETUP_2D_DATABUF \ + constexpr auto PADDING = 1; \ + auto _buf1 = new psz_buf(); \ + auto& buf1 = *_buf1; \ + auto databuf_it = [&](auto dx, auto dy) -> T& { \ + return buf1(thread_idx.x + dx + PADDING, thread_idx.y + dy + PADDING); \ + }; +#define SETUP_2D_EQBUF \ + auto _buf2 = new psz_buf(); \ + auto& buf2 = *_buf2; \ + auto eqbuf_it = [&](auto dx, auto dy) -> EQ& { return buf2(thread_idx.x + dx, thread_idx.y + dy); }; +#define PFOR_GRID_2D() \ + for (block_idx.y = 0; block_idx.y < grid_dim.y; block_idx.y++) \ + for (block_idx.x = 0; block_idx.x < grid_dim.x; block_idx.x++) +#define PFOR_BLOCK_2D() \ + for (thread_idx.y = 0; thread_idx.y < BLK; thread_idx.y++) \ + for (thread_idx.x = 0; thread_idx.x < BLK; thread_idx.x++) + +#define SETUP_3D_BASIC \ + psz_dim3 grid_dim, block_idx, thread_idx; \ + auto gx = [&]() -> uint32_t { return block_idx.x * BLK + thread_idx.x; }; \ + auto gy = [&]() -> uint32_t { return block_idx.y * BLK + thread_idx.y; }; \ + auto gz = [&]() -> uint32_t { return block_idx.z * BLK + thread_idx.z; }; \ + auto gidx = [&]() -> uint32_t { return gz() * stride3.z + gy() * stride3.y + gx(); }; \ + auto check_boundary = [&]() { return gx() < len3.x and gy() < len3.y and gz() < len3.z; }; \ + grid_dim.x = (len3.x - 1) / BLK + 1; \ + grid_dim.y = (len3.y - 1) / BLK + 1; \ + grid_dim.z = (len3.z - 1) / BLK + 1; +#define SETUP_3D_DATABUF \ + constexpr auto PADDING = 1; 
\ + auto _buf1 = new psz_buf(); \ + auto& buf1 = *_buf1; \ + auto databuf_it = [&](auto dx, auto dy, auto dz) -> T& { \ + return buf1(thread_idx.x + dx + PADDING, thread_idx.y + dy + PADDING, thread_idx.z + dz + PADDING); \ + }; +#define SETUP_3D_EQBUF \ + auto _buf2 = new psz_buf(); \ + auto& buf2 = *_buf2; \ + auto eqbuf_it = [&](auto dx, auto dy, auto dz) -> EQ& { \ + return buf2(thread_idx.x + dx, thread_idx.y + dy, thread_idx.z + dz); \ + }; +#define PFOR_GRID_3D() \ + for (block_idx.z = 0; block_idx.z < grid_dim.z; block_idx.z++) \ + for (block_idx.y = 0; block_idx.y < grid_dim.y; block_idx.y++) \ + for (block_idx.x = 0; block_idx.x < grid_dim.x; block_idx.x++) +#define PFOR_BLOCK_3D() \ + for (thread_idx.z = 0; thread_idx.z < BLK; thread_idx.z++) \ + for (thread_idx.y = 0; thread_idx.y < BLK; thread_idx.y++) \ + for (thread_idx.x = 0; thread_idx.x < BLK; thread_idx.x++) + +namespace psz { +namespace serial { +namespace __kernel { + +template < + typename T, + typename EQ = int32_t, + typename FP = T, + int BLK = 256, + typename OUTLIER = struct psz_outlier_serial> +void c_lorenzo_1d1l(T* data, psz_dim3 len3, psz_dim3 stride3, int radius, FP ebx2_r, EQ* eq, OUTLIER* outlier) { + SETUP_1D_BASIC; + SETUP_1D_DATABUF; + SETUP_1D_EQBUF; + + // per-thread ("real" kernel) + auto threadview_load = [&]() { + if (check_boundary()) databuf_it(0) = data[gidx()] * ebx2_r; + }; + auto threadview_process = [&]() { + auto delta = databuf_it(0) - databuf_it(-1); + if (delta > radius) { + outlier->record(delta, gidx()); + eqbuf_it(0) = 0; + } + else { + eqbuf_it(0) = delta; + } + }; + auto threadview_store = [&]() { + if (check_boundary()) eq[gidx()] = eqbuf_it(0); + }; + + //////////////////////////////////////// + PFOR_GRID_1D() { PFOR_BLOCK_1D() threadview_load(); } + PFOR_GRID_1D() { PFOR_BLOCK_1D() threadview_process(); } + PFOR_GRID_1D() { PFOR_BLOCK_1D() threadview_store(); } + + delete _buf1; + delete _buf2; + +} + +template +void x_lorenzo_1d1l(EQ* eq, T* scattered_outlier, psz_dim3 len3, psz_dim3 stride3, int radius, FP ebx2, T* xdata) +{ + SETUP_1D_BASIC; + SETUP_1D_DATABUF; + + // per-thread ("real" kernel) + auto threadview_load = [&]() { + if (check_boundary()) databuf_it(0) = eq[gidx()] + scattered_outlier[gidx()]; + }; + auto threadview_partial_sum = [&]() { + if (thread_idx.x > 0) databuf_it(0) += databuf_it(-1); + }; + auto threadview_store = [&]() { + if (check_boundary()) xdata[gidx()] = databuf_it(0) * ebx2; + }; + + //////////////////////////////////////// + PFOR_GRID_1D() { PFOR_BLOCK_1D() threadview_load(); } + PFOR_GRID_1D() { PFOR_BLOCK_1D() threadview_partial_sum(); } + PFOR_GRID_1D() { PFOR_BLOCK_1D() threadview_store(); } + + delete _buf1; +} + +template < + typename T, + typename EQ = int32_t, + typename FP = T, + int BLK = 16, + typename OUTLIER = struct psz_outlier_serial> +void c_lorenzo_2d1l(T* data, psz_dim3 len3, psz_dim3 stride3, int radius, FP ebx2_r, EQ* eq, OUTLIER* outlier) { + SETUP_2D_BASIC; + SETUP_2D_DATABUF; + SETUP_2D_EQBUF; + + // per-thread ("real" kernel) + auto threadview_load = [&]() { + if (check_boundary()) databuf_it(0, 0) = data[gidx()] * ebx2_r; + }; + auto threadview_process = [&]() { + auto delta = databuf_it(0, 0) - (databuf_it(-1, 0) + databuf_it(0, -1) - databuf_it(-1, -1)); + if (delta > radius) { + outlier->record(delta, gidx()); + eqbuf_it(0, 0) = 0; + } + else { + eqbuf_it(0, 0) = delta; + } + }; + auto threadview_store = [&]() { + if (check_boundary()) eq[gidx()] = eqbuf_it(0, 0); + }; + + //////////////////////////////////////// + 
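+    // note: PFOR_GRID_2D/PFOR_BLOCK_2D expand to plain nested loops that walk the
+    // emulated grid and block; running load, predict/quantize, and store as three
+    // separate full passes mirrors the phase separation of the GPU kernels.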
PFOR_GRID_2D() { PFOR_BLOCK_2D() threadview_load(); } + PFOR_GRID_2D() { PFOR_BLOCK_2D() threadview_process(); } + PFOR_GRID_2D() { PFOR_BLOCK_2D() threadview_store(); } + + delete _buf1; + delete _buf2; +} + +template +void x_lorenzo_2d1l(EQ* eq, T* scattered_outlier, psz_dim3 len3, psz_dim3 stride3, int radius, FP ebx2, T* xdata) +{ + SETUP_2D_BASIC; + SETUP_2D_DATABUF; + + // per-thread ("real" kernel) + auto threadview_load = [&]() { + if (check_boundary()) databuf_it(0, 0) = eq[gidx()] + scattered_outlier[gidx()]; + }; + auto threadview_partial_sum_x = [&]() { + if (thread_idx.x > 0) databuf_it(0, 0) += databuf_it(-1, 0); + }; + auto threadview_partial_sum_y = [&]() { + if (thread_idx.y > 0) databuf_it(0, 0) += databuf_it(0, -1); + }; + auto threadview_store = [&]() { + if (check_boundary()) xdata[gidx()] = databuf_it(0, 0) * ebx2; + }; + + //////////////////////////////////////// + PFOR_GRID_2D() { PFOR_BLOCK_2D() threadview_load(); } + PFOR_GRID_2D() + { + PFOR_BLOCK_2D() threadview_partial_sum_x(); + PFOR_BLOCK_2D() threadview_partial_sum_y(); + } + PFOR_GRID_2D() { PFOR_BLOCK_2D() threadview_store(); } + + delete _buf1; +} + +template < + typename T, + typename EQ = int32_t, + typename FP = T, + int BLK = 8, + typename OUTLIER = struct psz_outlier_serial> +void c_lorenzo_3d1l(T* data, psz_dim3 len3, psz_dim3 stride3, int radius, FP ebx2_r, EQ* eq, OUTLIER* outlier) { + SETUP_3D_BASIC; + SETUP_3D_DATABUF; + SETUP_3D_EQBUF; + + // per-thread ("real" kernel) + auto threadview_load = [&]() { + if (check_boundary()) databuf_it(0, 0, 0) = data[gidx()] * ebx2_r; + }; + auto threadview_process = [&]() { + auto delta = databuf_it(0, 0, 0) - + (databuf_it(-1, -1, -1) - databuf_it(0, -1, -1) - databuf_it(-1, 0, -1) - databuf_it(-1, -1, 0) + + databuf_it(0, 0, -1) + databuf_it(0, -1, 0) + databuf_it(-1, 0, 0)); + if (delta > radius) { + outlier->record(delta, gidx()); + eqbuf_it(0, 0, 0) = 0; + } + else { + eqbuf_it(0, 0, 0) = delta; + } + }; + auto threadview_store = [&]() { + if (check_boundary()) eq[gidx()] = eqbuf_it(0, 0, 0); + }; + + //////////////////////////////////////// + PFOR_GRID_3D() { PFOR_BLOCK_3D() threadview_load(); } + PFOR_GRID_3D() { PFOR_BLOCK_3D() threadview_process(); } + PFOR_GRID_3D() { PFOR_BLOCK_3D() threadview_store(); } + + delete _buf1; + delete _buf2; +} + +template +void x_lorenzo_3d1l(EQ* eq, T* scattered_outlier, psz_dim3 len3, psz_dim3 stride3, int radius, FP ebx2, T* xdata) +{ + SETUP_3D_BASIC; + SETUP_3D_DATABUF; + + // per-thread ("real" kernel) + auto threadview_load = [&]() { + if (check_boundary()) databuf_it(0, 0, 0) = eq[gidx()] + scattered_outlier[gidx()]; + }; + auto threadview_partial_sum_x = [&]() { + if (thread_idx.x > 0) databuf_it(0, 0, 0) += databuf_it(-1, 0, 0); + }; + auto threadview_partial_sum_y = [&]() { + if (thread_idx.y > 0) databuf_it(0, 0, 0) += databuf_it(0, -1, 0); + }; + auto threadview_partial_sum_z = [&]() { + if (thread_idx.z > 0) databuf_it(0, 0, 0) += databuf_it(0, 0, -1); + }; + auto threadview_store = [&]() { + if (check_boundary()) xdata[gidx()] = databuf_it(0, 0, 0) * ebx2; + }; + + //////////////////////////////////////// + PFOR_GRID_3D() { PFOR_BLOCK_3D() threadview_load(); } + PFOR_GRID_3D() + { + PFOR_BLOCK_3D() threadview_partial_sum_x(); + PFOR_BLOCK_3D() threadview_partial_sum_y(); + PFOR_BLOCK_3D() threadview_partial_sum_z(); + } + PFOR_GRID_3D() { PFOR_BLOCK_3D() threadview_store(); } + + delete _buf1; +} + +} // namespace __kernel +} // namespace serial +} // namespace psz + +#undef SETUP_1D +#undef 
PFOR_GRID_1D +#undef PFOR_BLOCK_1D +#undef SETUP_2D_BASIC +#undef PFOR_GRID_2D +#undef PFOR_BLOCK_2D +#undef SETUP_3D +#undef PFOR_GRID_3D +#undef PFOR_BLOCK_3D + +#endif /* E0B87BA8_BEDC_4CBE_B5EE_C0C5875E07D6 */ diff --git a/qtensor/compression/cusz/src/kernel/detail/lorenzo_var.inl b/qtensor/compression/cusz/src/kernel/detail/lorenzo_var.inl new file mode 100644 index 00000000..b5563275 --- /dev/null +++ b/qtensor/compression/cusz/src/kernel/detail/lorenzo_var.inl @@ -0,0 +1,530 @@ +/** + * @file lorenzo_var.inl + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-09-29 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef E2BEA52A_4D2E_4966_9135_6CE8B8E05762 +#define E2BEA52A_4D2E_4966_9135_6CE8B8E05762 + +#include + +#if __has_include() +// #pragma message __FILE__ ": (CUDA 11 onward), cub from system path" +#include +#else +// #pragma message __FILE__ ": (CUDA 10 or earlier), cub from git submodule" +#include "../../third_party/cub/cub/cub.cuh" +#endif + +#if __cplusplus >= 201703L +#define CONSTEXPR constexpr +#else +#define CONSTEXPR +#endif + +#define TIX threadIdx.x +#define TIY threadIdx.y +#define TIZ threadIdx.z +#define BIX blockIdx.x +#define BIY blockIdx.y +#define BIZ blockIdx.z +#define BDX blockDim.x +#define BDY blockDim.y +#define BDZ blockDim.z + +#include "utils/cuda_err.cuh" +#include "utils/timer.h" + +namespace cusz { +namespace experimental { + +template +__forceinline__ __device__ void +pred1d(Data thread_scope[SEQ], volatile bool* shmem_signum, volatile ErrCtrl* shmem_delta, Data from_last_stripe = 0) +{ + if CONSTEXPR (FIRST_POINT) { // i == 0 + Data delta = thread_scope[0] - from_last_stripe; + shmem_signum[0 + TIX * SEQ] = delta < 0; // signnum + shmem_delta[0 + TIX * SEQ] = static_cast(fabs(delta)); + } + else { +#pragma unroll + for (auto i = 1; i < SEQ; i++) { + Data delta = thread_scope[i] - thread_scope[i - 1]; + shmem_signum[i + TIX * SEQ] = delta < 0; // signum + shmem_delta[i + TIX * SEQ] = static_cast(fabs(delta)); + } + __syncthreads(); + } +} + +template +__forceinline__ __device__ void load1d( + Data* data, + unsigned int dimx, + unsigned int id_base, + volatile Data* shmem_data, + Data thread_scope[SEQ], + Data& from_last_stripe, + FP ebx2_r) +{ +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto id = id_base + TIX + i * NTHREAD; + if (id < dimx) { shmem_data[TIX + i * NTHREAD] = round(data[id] * ebx2_r); } + } + __syncthreads(); + + for (auto i = 0; i < SEQ; i++) thread_scope[i] = shmem_data[TIX * SEQ + i]; + + if (TIX > 0) from_last_stripe = shmem_data[TIX * SEQ - 1]; + __syncthreads(); +} + +template +__forceinline__ __device__ void write1d( + volatile bool* shmem_signum, + bool* signum, + unsigned int dimx, + unsigned int id_base, + volatile ErrCtrl* shmem_delta = nullptr, + ErrCtrl* delta = nullptr) +{ +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto id = id_base + TIX + i * NTHREAD; + if (id < dimx) { + signum[id] = shmem_signum[TIX + i * NTHREAD]; + delta[id] = shmem_delta[TIX + i * NTHREAD]; + } + } +} + +template +__forceinline__ __device__ void load2d_prequant( + Data* data, + Data center[YSEQ + 1], + unsigned int dimx, + unsigned int dimy, + unsigned int stridey, + unsigned int gix, + unsigned int giy_base, + FP ebx2_r) +{ + auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; + +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + if (gix < dimx and giy_base + i < dimy) center[i + 1] = round(data[get_gid(i)] * ebx2_r); + } + auto tmp = 
__shfl_up_sync(0xffffffff, center[YSEQ], 16); // same-warp, next-16 + if (TIY == 1) center[0] = tmp; +} + +template +__forceinline__ __device__ void pred2d(Data center[YSEQ + 1]) +{ + /* prediction + original form: Data delta = center[i] - center[i - 1] + west[i] - west[i - 1]; + short form: Data delta = center[i] - west[i]; + */ +#pragma unroll + for (auto i = YSEQ; i > 0; i--) { + center[i] -= center[i - 1]; + auto west = __shfl_up_sync(0xffffffff, center[i], 1, 16); + if (TIX > 0) center[i] -= west; + } + __syncthreads(); +} + +template +__forceinline__ __device__ void postquant_write2d( + Data center[YSEQ + 1], + ErrCtrl* delta, + bool* signum, + unsigned int dimx, + unsigned int dimy, + unsigned int stridey, + unsigned int gix, + unsigned int giy_base) +{ + /******************************************************************************** + * Depending on whether postquant is delayed in compression, deside separating + * data-type signum and uint-type quantcode when writing to DRAM (or not). + ********************************************************************************/ + auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; + +#pragma unroll + for (auto i = 1; i < YSEQ + 1; i++) { + auto gid = get_gid(i - 1); + + if (gix < dimx and giy_base + i - 1 < dimy) { + signum[gid] = center[i] < 0; // output; reuse data for signum + delta[gid] = static_cast(fabs(center[i])); + } + } +} + +template < + typename Data, + typename ErrCtrl, + typename FP, + int BLOCK, + int SEQ> +__global__ void c_lorenzo_1d1l( // + Data* data, + ErrCtrl* delta, + bool* signum, + dim3 len3, + dim3 stride3, + FP ebx2_r) +{ + constexpr auto NTHREAD = BLOCK / SEQ; + + __shared__ struct { + Data data[BLOCK]; + ErrCtrl delta[BLOCK]; + bool signum[BLOCK]; + } shmem; + + auto id_base = BIX * BLOCK; + + Data thread_scope[SEQ]; + Data from_last_stripe{0}; + + /******************************************************************************** + * load from DRAM using striped layout, perform prequant + ********************************************************************************/ + load1d(data, len3.x, id_base, shmem.data, thread_scope, from_last_stripe, ebx2_r); + + /******************************************************************************** + * delta and signum + ********************************************************************************/ + pred1d(thread_scope, shmem.signum, shmem.delta, from_last_stripe); + pred1d(thread_scope, shmem.signum, shmem.delta); + write1d(shmem.signum, signum, len3.x, id_base, shmem.delta, delta); +} + +template +__global__ void c_lorenzo_2d1l_16x16data_mapto16x2( + Data* data, // input + ErrCtrl* delta, // output + bool* signum, // output + dim3 len3, + dim3 stride3, + FP ebx2_r) +{ + constexpr auto BLOCK = 16; + constexpr auto YSEQ = 8; + + Data center[YSEQ + 1] = {0}; // nw n + // w center + + auto gix = BIX * BDX + TIX; // BDX == 16 + auto giy_base = BIY * BLOCK + TIY * YSEQ; // BDY * YSEQ = BLOCK == 16 + // clang-format off + load2d_prequant(data, center, len3.x, len3.y, stride3.y, gix, giy_base, ebx2_r); + pred2d(center); + postquant_write2d(center, delta, signum, len3.x, len3.y, stride3.y, gix, giy_base); + // clang-format on +} + +template +__global__ void c_lorenzo_3d1l_32x8x8data_mapto32x1x8( + Data* data, // input + ErrCtrl* delta, // output + bool* signum, // output + dim3 len3, + dim3 stride3, + FP ebx2_r) +{ + constexpr auto BLOCK = 8; + __shared__ Data shmem[8][8][32]; + + auto z = TIZ; + + auto gix = BIX * (BLOCK * 4) + TIX; + auto giy_base = BIY * 
BLOCK; + auto giz = BIZ * BLOCK + z; + auto base_id = gix + giy_base * stride3.y + giz * stride3.z; + + /******************************************************************************** + * load from DRAM, perform prequant + ********************************************************************************/ + if (gix < len3.x and giz < len3.z) { + for (auto y = 0; y < BLOCK; y++) { + if (giy_base + y < len3.y) { + shmem[z][y][TIX] = round(data[base_id + y * stride3.y] * ebx2_r); // prequant (fp presence) + } + } + } + __syncthreads(); // necessary to ensure correctness + + auto x = TIX % 8; + + for (auto y = 0; y < BLOCK; y++) { + Data delta_val; + + // prediction + delta_val = shmem[z][y][TIX] - ((z > 0 and y > 0 and x > 0 ? shmem[z - 1][y - 1][TIX - 1] : 0) // dist=3 + - (y > 0 and x > 0 ? shmem[z][y - 1][TIX - 1] : 0) // dist=2 + - (z > 0 and x > 0 ? shmem[z - 1][y][TIX - 1] : 0) // + - (z > 0 and y > 0 ? shmem[z - 1][y - 1][TIX] : 0) // + + (x > 0 ? shmem[z][y][TIX - 1] : 0) // dist=1 + + (y > 0 ? shmem[z][y - 1][TIX] : 0) // + + (z > 0 ? shmem[z - 1][y][TIX] : 0)); // + + auto id = base_id + (y * stride3.y); + + // delta and signum + if (gix < len3.x and (giy_base + y) < len3.y and giz < len3.z) { + signum[id] = delta_val < 0; + delta[id] = static_cast(fabs(delta_val)); + } + } + /* EOF */ +} + +template +__global__ void x_lorenzo_1d1l( // + bool* signum, + ErrCtrl* delta, + Data* xdata, + dim3 len3, + dim3 stride3, + FP ebx2) +{ + constexpr auto block_dim = BLOCK / SEQ; // dividable + + // coalesce-load (warp-striped) and transpose in shmem (similar for store) + typedef cub::BlockLoad BlockLoadT_signum; + typedef cub::BlockLoad BlockLoadT_delta; + typedef cub::BlockStore BlockStoreT_xdata; + typedef cub::BlockScan + BlockScanT_xdata; // TODO autoselect algorithm + + __shared__ union TempStorage { // overlap shared memory space + typename BlockLoadT_signum::TempStorage load_signum; + typename BlockLoadT_delta::TempStorage load_delta; + typename BlockStoreT_xdata::TempStorage store_xdata; + typename BlockScanT_xdata::TempStorage scan_xdata; + } temp_storage; + + // thread-scope tiled data + struct ThreadData { + Data xdata[SEQ]; + bool signum[SEQ]; + } thread_scope; + ErrCtrl thread_scope_delta[SEQ]; + + /******************************************************************************** + * load to thread-private array (fuse at the same time) + * (BIX * BDX * SEQ) denotes the start of the data chunk that belongs to this thread block + ********************************************************************************/ + BlockLoadT_delta(temp_storage.load_delta).Load(delta + (BIX * BDX) * SEQ, thread_scope_delta); + __syncthreads(); // barrier for shmem reuse + BlockLoadT_signum(temp_storage.load_signum).Load(signum + (BIX * BDX) * SEQ, thread_scope.signum); + __syncthreads(); // barrier for shmem reuse + +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto id = (BIX * BDX + TIX) * SEQ + i; + thread_scope.xdata[i] = id < len3.x // + ? (thread_scope.signum[i] ? 
-1 : 1) * static_cast(thread_scope_delta[i]) + : 0; + } + __syncthreads(); + + /******************************************************************************** + * perform partial-sum using cub::InclusiveSum + ********************************************************************************/ + BlockScanT_xdata(temp_storage.scan_xdata).InclusiveSum(thread_scope.xdata, thread_scope.xdata); + __syncthreads(); // barrier for shmem reuse + + /******************************************************************************** + * scale by ebx2 and write to DRAM + ********************************************************************************/ +#pragma unroll + for (auto i = 0; i < SEQ; i++) thread_scope.xdata[i] *= ebx2; + __syncthreads(); // barrier for shmem reuse + + BlockStoreT_xdata(temp_storage.store_xdata).Store(xdata + (BIX * BDX) * SEQ, thread_scope.xdata); +} + +template +__global__ void +x_lorenzo_2d1l_16x16data_mapto16x2(bool* signum, ErrCtrl* delta, Data* xdata, dim3 len3, dim3 stride3, FP ebx2) +{ + constexpr auto BLOCK = 16; + constexpr auto YSEQ = BLOCK / 2; // sequentiality in y direction + static_assert(BLOCK == 16, "In one case, we need BLOCK for 2D == 16"); + + __shared__ Data intermediate[BLOCK]; // TODO use warp shuffle to eliminate this + Data thread_scope[YSEQ]; + /* + . ------> gix (x) + | t00 t01 t02 t03 ... t0f + | ts00_0 ts00_0 ts00_0 ts00_0 + giy ts00_1 ts00_1 ts00_1 ts00_1 + (y) | | | | + ts00_7 ts00_7 ts00_7 ts00_7 + + | t10 t11 t12 t13 ... t1f + | ts00_0 ts00_0 ts00_0 ts00_0 + giy ts00_1 ts00_1 ts00_1 ts00_1 + (y) | | | | + ts00_7 ts00_7 ts00_7 ts00_7 + */ + + auto gix = BIX * BLOCK + TIX; + auto giy_base = BIY * BLOCK + TIY * YSEQ; // BDY * YSEQ = BLOCK == 16 + auto get_gid = [&](auto i) { return (giy_base + i) * stride3.y + gix; }; + + /******************************************************************************** + * load to thread-private array (fuse at the same time) + ********************************************************************************/ +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + auto gid = get_gid(i); + // even if we hit the else branch, all threads in a warp hit the y-boundary simultaneously + if (gix < len3.x and giy_base + i < len3.y) + thread_scope[i] = (signum[gid] ? -1 : 1) * static_cast(delta[gid]); // fuse + else + thread_scope[i] = 0; // TODO set as init state? 
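+        // thread_scope[i] now holds the signed prediction error (sign from signum,
+        // magnitude from delta), or 0 outside the domain; the y-axis partial sums and
+        // in-warp x-axis scans below reconstruct the prequantized values from it.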
+ } + + /******************************************************************************** + * partial-sum along y-axis, sequantially + ********************************************************************************/ + for (auto i = 1; i < YSEQ; i++) thread_scope[i] += thread_scope[i - 1]; + // two-pass: store for cross-threadscope update + if (TIY == 0) intermediate[TIX] = thread_scope[YSEQ - 1]; + __syncthreads(); + // two-pass: load and update + if (TIY == 1) { + auto tmp = intermediate[TIX]; +#pragma unroll + for (auto& i : thread_scope) i += tmp; + } + + /******************************************************************************** + * in-warp partial-sum along x-axis + ********************************************************************************/ +#pragma unroll + for (auto& i : thread_scope) { + for (auto d = 1; d < BLOCK; d *= 2) { + Data n = __shfl_up_sync(0xffffffff, i, d, 16); + if (TIX >= d) i += n; + } + i *= ebx2; + } + + /******************************************************************************** + * write to DRAM + ********************************************************************************/ +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + auto gid = get_gid(i); + if (gix < len3.x and giy_base + i < len3.y) xdata[gid] = thread_scope[i]; + } +} + +template +__global__ void +x_lorenzo_3d1l_32x8x8data_mapto32x1x8(bool* signum, ErrCtrl* delta, Data* xdata, dim3 len3, dim3 stride3, FP ebx2) +{ + constexpr auto BLOCK = 8; + constexpr auto YSEQ = BLOCK; + static_assert(BLOCK == 8, "In one case, we need BLOCK for 3D == 8"); + + __shared__ Data intermediate[BLOCK][4][8]; + Data thread_scope[YSEQ]; + + auto seg_id = TIX / 8; + auto seg_tix = TIX % 8; + + auto gix = BIX * (4 * BLOCK) + TIX, giy_base = BIY * BLOCK, giz = BIZ * BLOCK + TIZ; + auto get_gid = [&](auto y) { return giz * stride3.z + (giy_base + y) * stride3.y + gix; }; + + /******************************************************************************** + * load to thread-private array (fuse at the same time) + ********************************************************************************/ +#pragma unroll + for (auto y = 0; y < YSEQ; y++) { + auto gid = get_gid(y); + if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) + thread_scope[y] = (signum[gid] ? 
-1 : 1) * static_cast(delta[gid]); + else + thread_scope[y] = 0; + } + + /******************************************************************************** + * partial-sum along y-axis, sequantially + ********************************************************************************/ + for (auto y = 1; y < YSEQ; y++) thread_scope[y] += thread_scope[y - 1]; + + /******************************************************************************** + * ND partial-sums along x- and z-axis + * in-warp shuffle used: in order to perform, it's transposed after X-partial sum + ********************************************************************************/ + auto dist = 1; + Data addend; + +#pragma unroll + for (auto i = 0; i < BLOCK; i++) { + Data val = thread_scope[i]; + + for (dist = 1; dist < BLOCK; dist *= 2) { + addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + // x-z transpose + intermediate[TIZ][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][TIZ]; + __syncthreads(); + + for (dist = 1; dist < BLOCK; dist *= 2) { + addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + intermediate[TIZ][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][TIZ]; + __syncthreads(); + + thread_scope[i] = val; + } + + /******************************************************************************** + * write to DRAM + ********************************************************************************/ +#pragma unroll + for (auto y = 0; y < YSEQ; y++) { + if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) { xdata[get_gid(y)] = thread_scope[y] * ebx2; } + } + /* EOF */ +} + +} // namespace experimental +} // namespace cusz + +#undef TIX +#undef TIY +#undef TIZ +#undef BIX +#undef BIY +#undef BIZ +#undef BDX +#undef BDY +#undef BDZ + +#endif /* E2BEA52A_4D2E_4966_9135_6CE8B8E05762 */ diff --git a/qtensor/compression/cusz/src/kernel/detail/spline3.inl b/qtensor/compression/cusz/src/kernel/detail/spline3.inl new file mode 100644 index 00000000..2c4f1213 --- /dev/null +++ b/qtensor/compression/cusz/src/kernel/detail/spline3.inl @@ -0,0 +1,746 @@ +/** + * @file spline3.inl + * @author Jiannan Tian + * @brief + * @version 0.2 + * @date 2021-05-15 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_KERNEL_SPLINE3_CUH +#define CUSZ_KERNEL_SPLINE3_CUH + +#include +#include +#include +#include "utils/cuda_err.cuh" + +#define SPLINE3_COMPR true +#define SPLINE3_DECOMPR false + +#if __cplusplus >= 201703L +#define CONSTEXPR constexpr +#else +#define CONSTEXPR +#endif + +#define TIX threadIdx.x +#define TIY threadIdx.y +#define TIZ threadIdx.z +#define BIX blockIdx.x +#define BIY blockIdx.y +#define BIZ blockIdx.z +#define BDX blockDim.x +#define BDY blockDim.y +#define BDZ blockDim. 
+ +using DIM = unsigned int; +using STRIDE = unsigned int; +using DIM3 = dim3; +using STRIDE3 = dim3; + +constexpr int BLOCK8 = 8; +constexpr int BLOCK32 = 32; + +#define SHM_ERROR shm_errctrl + +namespace cusz { + +/******************************************************************************** + * host API + ********************************************************************************/ + +template < + typename TITER, + typename EITER, + typename FP = float, + int LINEAR_BLOCK_SIZE = 256, + bool PROBE_PRED_ERROR = false> +__global__ void c_spline3d_infprecis_32x8x8data( + TITER data, + DIM3 data_size, + STRIDE3 data_leap, + EITER errctrl, + DIM3 errctrl_size, + STRIDE3 errctrl_leap, + TITER anchor, + STRIDE3 anchor_leap, + FP eb_r, + FP ebx2, + int radius, + TITER pred_error = nullptr, + TITER compress_error = nullptr); + +template < + typename EITER, + typename TITER, + typename FP = float, + int LINEAR_BLOCK_SIZE = 256> +__global__ void x_spline3d_infprecis_32x8x8data( + EITER errctrl, // input 1 + DIM3 errctrl_size, // + STRIDE3 errctrl_leap, // + TITER anchor, // input 2 + DIM3 anchor_size, // + STRIDE3 anchor_leap, // + TITER data, // output + DIM3 data_size, // + STRIDE3 data_leap, // + FP eb_r, + FP ebx2, + int radius); + +namespace device_api { +/******************************************************************************** + * device API + ********************************************************************************/ +template < + typename T1, + typename T2, + typename FP, + int LINEAR_BLOCK_SIZE, + bool WORKFLOW = SPLINE3_COMPR, + bool PROBE_PRED_ERROR = false> +__device__ void spline3d_layout2_interpolate( + volatile T1 shm_data[9][9][33], + volatile T2 shm_errctrl[9][9][33], + FP eb_r, + FP ebx2, + int radius); +} // namespace device_api + +} // namespace cusz + +/******************************************************************************** + * helper function + ********************************************************************************/ + +namespace { + +template +__forceinline__ __device__ bool xyz33x9x9_predicate(unsigned int x, unsigned int y, unsigned int z) +{ + if CONSTEXPR (INCLUSIVE) { // + return x <= 32 and y <= 8 and z <= 8; + } + else { + return x < 32 and y < 8 and z < 8; + } +} + +// control block_id3 in function call +template +__device__ void +spline3d_print_block_from_GPU(T volatile a[9][9][33], int radius = 512, bool compress = true, bool print_errctrl = true) +{ + for (auto z = 0; z < ZEND; z++) { + printf("\nprint from GPU, z=%d\n", z); + printf(" "); + for (auto i = 0; i < 33; i++) printf("%3d", i); + printf("\n"); + + for (auto y = 0; y < YEND; y++) { + printf("y=%d ", y); + for (auto x = 0; x < XEND; x++) { // + if CONSTEXPR (PRINT_FP) { printf("%.2e\t", (float)a[z][y][x]); } + else { + T c = print_errctrl ? 
a[z][y][x] - radius : a[z][y][x]; + if (compress) { + if (c == 0) { printf("%3c", '.'); } + else { + if (abs(c) >= 10) { printf("%3c", '*'); } + else { + if (print_errctrl) { printf("%3d", c); } + else { + printf("%4.2f", c); + } + } + } + } + else { + if (print_errctrl) { printf("%3d", c); } + else { + printf("%4.2f", c); + } + } + } + } + printf("\n"); + } + } + printf("\nGPU print end\n\n"); +} + +template +__device__ void +c_reset_scratch_33x9x9data(volatile T1 shm_data[9][9][33], volatile T2 shm_errctrl[9][9][33], int radius) +{ + // alternatively, reinterprete cast volatile T?[][][] to 1D + for (auto _tix = TIX; _tix < 33 * 9 * 9; _tix += LINEAR_BLOCK_SIZE) { + auto x = (_tix % 33); + auto y = (_tix / 33) % 9; + auto z = (_tix / 33) / 9; + + shm_data[z][y][x] = 0; + /***************************************************************************** + okay to use + ******************************************************************************/ + if (x % 8 == 0 and y % 8 == 0 and z % 8 == 0) shm_errctrl[z][y][x] = radius; + /***************************************************************************** + alternatively + ******************************************************************************/ + // shm_errctrl[z][y][x] = radius; + } + __syncthreads(); +} + +template +__device__ void c_gather_anchor(T1* data, DIM3 data_size, STRIDE3 data_leap, T1* anchor, STRIDE3 anchor_leap) +{ + auto x = (TIX % 32) + BIX * 32; + auto y = (TIX / 32) % 8 + BIY * 8; + auto z = (TIX / 32) / 8 + BIZ * 8; + + bool pred1 = x % 8 == 0 and y % 8 == 0 and z % 8 == 0; + bool pred2 = x < data_size.x and y < data_size.y and z < data_size.z; + + if (pred1 and pred2) { + auto data_id = x + y * data_leap.y + z * data_leap.z; + auto anchor_id = (x / 8) + (y / 8) * anchor_leap.y + (z / 8) * anchor_leap.z; + anchor[anchor_id] = data[data_id]; + } + __syncthreads(); +} + +/* + * use shmem, erroneous +template +__device__ void c_gather_anchor(volatile T1 shm_data[9][9][33], T1* anchor, STRIDE3 anchor_leap) +{ + constexpr auto NUM_ITERS = 33 * 9 * 9 / LINEAR_BLOCK_SIZE + 1; // 11 iterations + for (auto i = 0; i < NUM_ITERS; i++) { + auto _tix = i * LINEAR_BLOCK_SIZE + TIX; + + if (_tix < 33 * 9 * 9) { + auto x = (_tix % 33); + auto y = (_tix / 33) % 9; + auto z = (_tix / 33) / 9; + + if (x % 8 == 0 and y % 8 == 0 and z % 8 == 0) { + auto aid = ((x / 8) + BIX * 4) + // + ((y / 8) + BIY) * anchor_leap.y + // + ((z / 8) + BIZ) * anchor_leap.z; // + anchor[aid] = shm_data[z][y][x]; + } + } + } + __syncthreads(); +} +*/ + +template +__device__ void x_reset_scratch_33x9x9data( + volatile T1 shm_xdata[9][9][33], + volatile T2 shm_errctrl[9][9][33], + T1* anchor, // + DIM3 anchor_size, // + STRIDE3 anchor_leap) +{ + for (auto _tix = TIX; _tix < 33 * 9 * 9; _tix += LINEAR_BLOCK_SIZE) { + auto x = (_tix % 33); + auto y = (_tix / 33) % 9; + auto z = (_tix / 33) / 9; + + shm_errctrl[z][y][x] = 0; // TODO explicitly handle zero-padding + /***************************************************************************** + okay to use + ******************************************************************************/ + if (x % 8 == 0 and y % 8 == 0 and z % 8 == 0) { + shm_xdata[z][y][x] = 0; + + auto ax = ((x / 8) + BIX * 4); + auto ay = ((y / 8) + BIY); + auto az = ((z / 8) + BIZ); + + if (ax < anchor_size.x and ay < anchor_size.y and az < anchor_size.z) + shm_xdata[z][y][x] = anchor[ax + ay * anchor_leap.y + az * anchor_leap.z]; + } + /***************************************************************************** + alternatively + 
******************************************************************************/ + // shm_errctrl[z][y][x] = radius; + } + + __syncthreads(); +} + +template +__device__ void +global2shmem_33x9x9data(Input* data, DIM3 data_size, STRIDE3 data_leap, volatile Input shm_data[9][9][33]) +{ + constexpr auto TOTAL = 33 * 9 * 9; + + for (auto _tix = TIX; _tix < TOTAL; _tix += LINEAR_BLOCK_SIZE) { + auto x = (_tix % 33); + auto y = (_tix / 33) % 9; + auto z = (_tix / 33) / 9; + auto gx = (x + BIX * BLOCK32); + auto gy = (y + BIY * BLOCK8); + auto gz = (z + BIZ * BLOCK8); + auto gid = gx + gy * data_leap.y + gz * data_leap.z; + + if (gx < data_size.x and gy < data_size.y and gz < data_size.z) shm_data[z][y][x] = data[gid]; + } + __syncthreads(); +} + +template +__device__ void +shmem2global_32x8x8data(volatile Output shm_data[9][9][33], Output* data, DIM3 data_size, STRIDE3 data_leap) +{ + constexpr auto TOTAL = 32 * 8 * 8; + + for (auto _tix = TIX; _tix < TOTAL; _tix += LINEAR_BLOCK_SIZE) { + auto x = (_tix % 32); + auto y = (_tix / 32) % 8; + auto z = (_tix / 32) / 8; + auto gx = (x + BIX * BLOCK32); + auto gy = (y + BIY * BLOCK8); + auto gz = (z + BIZ * BLOCK8); + auto gid = gx + gy * data_leap.y + gz * data_leap.z; + + if (gx < data_size.x and gy < data_size.y and gz < data_size.z) data[gid] = shm_data[z][y][x]; + } + __syncthreads(); +} + +template < + typename T1, + typename T2, + typename FP, + typename LAMBDAX, + typename LAMBDAY, + typename LAMBDAZ, + bool BLUE, + bool YELLOW, + bool HOLLOW, + int LINEAR_BLOCK_SIZE, + int BLOCK_DIMX, + int BLOCK_DIMY, + bool COARSEN, + int BLOCK_DIMZ, + bool BORDER_INCLUSIVE, + bool WORKFLOW> +__forceinline__ __device__ void interpolate_stage( + volatile T1 shm_data[9][9][33], + volatile T2 shm_errctrl[9][9][33], + LAMBDAX xmap, + LAMBDAY ymap, + LAMBDAZ zmap, + int unit, + FP eb_r, + FP ebx2, + int radius) +{ + static_assert(BLOCK_DIMX * BLOCK_DIMY * (COARSEN ? 1 : BLOCK_DIMZ) <= LINEAR_BLOCK_SIZE, "block oversized"); + static_assert((BLUE or YELLOW or HOLLOW) == true, "must be one hot"); + static_assert((BLUE and YELLOW) == false, "must be only one hot (1)"); + static_assert((BLUE and YELLOW) == false, "must be only one hot (2)"); + static_assert((YELLOW and HOLLOW) == false, "must be only one hot (3)"); + + auto run = [&](auto x, auto y, auto z) { + if (xyz33x9x9_predicate(x, y, z)) { + T1 pred = 0; + + if CONSTEXPR (BLUE) { // + pred = (shm_data[z - unit][y][x] + shm_data[z + unit][y][x]) / 2; + } + if CONSTEXPR (YELLOW) { // + pred = (shm_data[z][y][x - unit] + shm_data[z][y][x + unit]) / 2; + } + if CONSTEXPR (HOLLOW) { // + pred = (shm_data[z][y - unit][x] + shm_data[z][y + unit][x]) / 2; + } + + if CONSTEXPR (WORKFLOW == SPLINE3_COMPR) { + auto err = shm_data[z][y][x] - pred; + decltype(err) code; + // TODO unsafe, did not deal with the out-of-cap case + { + code = fabs(err) * eb_r + 1; + code = err < 0 ? 
-code : code; + code = int(code / 2) + radius; + } + shm_errctrl[z][y][x] = code; // TODO double check if unsigned type works + shm_data[z][y][x] = pred + (code - radius) * ebx2; + } + else { // TODO == DECOMPRESSS and static_assert + auto code = shm_errctrl[z][y][x]; + shm_data[z][y][x] = pred + (code - radius) * ebx2; + } + } + }; + // -------------------------------------------------------------------------------- // + + if CONSTEXPR (COARSEN) { + constexpr auto TOTAL = BLOCK_DIMX * BLOCK_DIMY * BLOCK_DIMZ; + for (auto _tix = TIX; _tix < TOTAL; _tix += LINEAR_BLOCK_SIZE) { + auto itix = (_tix % BLOCK_DIMX); + auto itiy = (_tix / BLOCK_DIMX) % BLOCK_DIMY; + auto itiz = (_tix / BLOCK_DIMX) / BLOCK_DIMY; + auto x = xmap(itix, unit); + auto y = ymap(itiy, unit); + auto z = zmap(itiz, unit); + run(x, y, z); + } + } + else { + auto itix = (TIX % BLOCK_DIMX); + auto itiy = (TIX / BLOCK_DIMX) % BLOCK_DIMY; + auto itiz = (TIX / BLOCK_DIMX) / BLOCK_DIMY; + auto x = xmap(itix, unit); + auto y = ymap(itiy, unit); + auto z = zmap(itiz, unit); + run(x, y, z); + } + __syncthreads(); +} + +} // namespace + +/********************************************************************************/ + +template +__device__ void cusz::device_api::spline3d_layout2_interpolate( + volatile T1 shm_data[9][9][33], + volatile T2 shm_errctrl[9][9][33], + FP eb_r, + FP ebx2, + int radius) +{ + auto xblue = [] __device__(int _tix, int unit) -> int { return unit * (_tix * 2); }; + auto yblue = [] __device__(int _tiy, int unit) -> int { return unit * (_tiy * 2); }; + auto zblue = [] __device__(int _tiz, int unit) -> int { return unit * (_tiz * 2 + 1); }; + + auto xyellow = [] __device__(int _tix, int unit) -> int { return unit * (_tix * 2 + 1); }; + auto yyellow = [] __device__(int _tiy, int unit) -> int { return unit * (_tiy * 2); }; + auto zyellow = [] __device__(int _tiz, int unit) -> int { return unit * (_tiz); }; + + auto xhollow = [] __device__(int _tix, int unit) -> int { return unit * (_tix); }; + auto yhollow = [] __device__(int _tiy, int unit) -> int { return unit * (_tiy * 2 + 1); }; + auto zhollow = [] __device__(int _tiz, int unit) -> int { return unit * (_tiz); }; + + constexpr auto COARSEN = true; + constexpr auto NO_COARSEN = false; + constexpr auto BORDER_INCLUSIVE = true; + constexpr auto BORDER_EXCLUSIVE = false; + + int unit = 4; + + // iteration 1 + interpolate_stage< + T1, T2, FP, decltype(xblue), decltype(yblue), decltype(zblue), // + true, false, false, LINEAR_BLOCK_SIZE, 5, 2, NO_COARSEN, 1, BORDER_INCLUSIVE, WORKFLOW>( + shm_data, shm_errctrl, xblue, yblue, zblue, unit, eb_r, ebx2, radius); + interpolate_stage< + T1, T2, FP, decltype(xyellow), decltype(yyellow), decltype(zyellow), // + false, true, false, LINEAR_BLOCK_SIZE, 4, 2, NO_COARSEN, 3, BORDER_INCLUSIVE, WORKFLOW>( + shm_data, shm_errctrl, xyellow, yyellow, zyellow, unit, eb_r, ebx2, radius); + interpolate_stage< + T1, T2, FP, decltype(xhollow), decltype(yhollow), decltype(zhollow), // + false, false, true, LINEAR_BLOCK_SIZE, 9, 1, NO_COARSEN, 3, BORDER_INCLUSIVE, WORKFLOW>( + shm_data, shm_errctrl, xhollow, yhollow, zhollow, unit, eb_r, ebx2, radius); + + unit = 2; + + // iteration 2, TODO switch y-z order + interpolate_stage< + T1, T2, FP, decltype(xblue), decltype(yblue), decltype(zblue), // + true, false, false, LINEAR_BLOCK_SIZE, 9, 3, NO_COARSEN, 2, BORDER_INCLUSIVE, WORKFLOW>( + shm_data, shm_errctrl, xblue, yblue, zblue, unit, eb_r, ebx2, radius); + interpolate_stage< + T1, T2, FP, decltype(xyellow), decltype(yyellow), 
decltype(zyellow), // + false, true, false, LINEAR_BLOCK_SIZE, 8, 3, NO_COARSEN, 5, BORDER_INCLUSIVE, WORKFLOW>( + shm_data, shm_errctrl, xyellow, yyellow, zyellow, unit, eb_r, ebx2, radius); + interpolate_stage< + T1, T2, FP, decltype(xhollow), decltype(yhollow), decltype(zhollow), // + false, false, true, LINEAR_BLOCK_SIZE, 17, 2, NO_COARSEN, 5, BORDER_INCLUSIVE, WORKFLOW>( + shm_data, shm_errctrl, xhollow, yhollow, zhollow, unit, eb_r, ebx2, radius); + + unit = 1; + + // iteration 3 + interpolate_stage< + T1, T2, FP, decltype(xblue), decltype(yblue), decltype(zblue), // + true, false, false, LINEAR_BLOCK_SIZE, 17, 5, COARSEN, 4, BORDER_INCLUSIVE, WORKFLOW>( + shm_data, shm_errctrl, xblue, yblue, zblue, unit, eb_r, ebx2, radius); + interpolate_stage< + T1, T2, FP, decltype(xyellow), decltype(yyellow), decltype(zyellow), // + false, true, false, LINEAR_BLOCK_SIZE, 16, 5, COARSEN, 9, BORDER_INCLUSIVE, WORKFLOW>( + shm_data, shm_errctrl, xyellow, yyellow, zyellow, unit, eb_r, ebx2, radius); + /****************************************************************************** + test only: last step inclusive + ******************************************************************************/ + // interpolate_stage< + // T1, T2, FP, decltype(xhollow), decltype(yhollow), decltype(zhollow), // + // false, false, true, LINEAR_BLOCK_SIZE, 33, 4, COARSEN, 9, BORDER_INCLUSIVE, WORKFLOW>( + // shm_data, shm_errctrl, xhollow, yhollow, zhollow, unit, eb_r, ebx2, radius); + /****************************************************************************** + production + ******************************************************************************/ + interpolate_stage< + T1, T2, FP, decltype(xhollow), decltype(yhollow), decltype(zhollow), // + false, false, true, LINEAR_BLOCK_SIZE, 32, 4, COARSEN, 8, BORDER_EXCLUSIVE, WORKFLOW>( + shm_data, shm_errctrl, xhollow, yhollow, zhollow, unit, eb_r, ebx2, radius); + + /****************************************************************************** + test only: print a block + ******************************************************************************/ + // if (TIX == 0 and BIX == 0 and BIY == 0 and BIZ == 0) { spline3d_print_block_from_GPU(shm_errctrl); } + // if (TIX == 0 and BIX == 0 and BIY == 0 and BIZ == 0) { spline3d_print_block_from_GPU(shm_data); } +} + +/******************************************************************************** + * host API/kernel + ********************************************************************************/ + +template +__global__ void cusz::c_spline3d_infprecis_32x8x8data( + TITER data, + DIM3 data_size, + STRIDE3 data_leap, + EITER errctrl, + DIM3 errctrl_size, + STRIDE3 errctrl_leap, + TITER anchor, + STRIDE3 anchor_leap, + FP eb_r, + FP ebx2, + int radius, + TITER pred_error, + TITER compress_error) +{ + // compile time variables + using T = typename std::remove_pointer::type; + using E = typename std::remove_pointer::type; + + if CONSTEXPR (PROBE_PRED_ERROR) { + // TODO + } + else { + __shared__ struct { + T data[9][9][33]; + E errctrl[9][9][33]; + } shmem; + + c_reset_scratch_33x9x9data(shmem.data, shmem.errctrl, radius); + global2shmem_33x9x9data(data, data_size, data_leap, shmem.data); + + // version 1, use shmem, erroneous + // c_gather_anchor(shmem.data, anchor, anchor_leap); + // version 2, use global mem, correct + c_gather_anchor(data, data_size, data_leap, anchor, anchor_leap); + + cusz::device_api::spline3d_layout2_interpolate( + shmem.data, shmem.errctrl, eb_r, ebx2, radius); + 
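+        // after the in-shared-memory interpolation, shmem.errctrl holds the quant codes
+        // for the padded 33x9x9 tile; only the 32x8x8 core is flushed to the global
+        // errctrl array below (anchors were gathered separately above).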
shmem2global_32x8x8data(shmem.errctrl, errctrl, errctrl_size, errctrl_leap); + } +} + +template < + typename EITER, + typename TITER, + typename FP, + int LINEAR_BLOCK_SIZE> +__global__ void cusz::x_spline3d_infprecis_32x8x8data( + EITER errctrl, // input 1 + DIM3 errctrl_size, // + STRIDE3 errctrl_leap, // + TITER anchor, // input 2 + DIM3 anchor_size, // + STRIDE3 anchor_leap, // + TITER data, // output + DIM3 data_size, // + STRIDE3 data_leap, // + FP eb_r, + FP ebx2, + int radius) +{ + // compile time variables + using E = typename std::remove_pointer::type; + using T = typename std::remove_pointer::type; + + __shared__ struct { + E errctrl[9][9][33]; + T data[9][9][33]; + } shmem; + + x_reset_scratch_33x9x9data(shmem.data, shmem.errctrl, anchor, anchor_size, anchor_leap); + global2shmem_33x9x9data(errctrl, errctrl_size, errctrl_leap, shmem.errctrl); + cusz::device_api::spline3d_layout2_interpolate( + shmem.data, shmem.errctrl, eb_r, ebx2, radius); + shmem2global_32x8x8data(shmem.data, data, data_size, data_leap); +} + +#undef TIX +#undef TIY +#undef TIZ +#undef BIX +#undef BIY +#undef BIZ +#undef BDX +#undef BDY +#undef BDZ + +template +void launch_construct_Spline3( + T* data, + dim3 const len3, + T* anchor, + dim3 const an_len3, + E* errctrl, + dim3 const ec_len3, + double const eb, + int const radius, + float& time_elapsed, + cudaStream_t stream) +{ + auto divide3 = [](dim3 len, dim3 sublen) { + return dim3( + (len.x - 1) / sublen.x + 1, // + (len.y - 1) / sublen.y + 1, // + (len.z - 1) / sublen.z + 1); + }; + + auto ndim = [&]() { + if (len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + + constexpr auto SUBLEN_3D = dim3(32, 8, 8); + constexpr auto SEQ_3D = dim3(1, 8, 1); + constexpr auto BLOCK_3D = dim3(256, 1, 1); + auto GRID_3D = divide3(len3, SUBLEN_3D); + + { + constexpr auto SUBLEN_TOTAL = SUBLEN_3D.x * SUBLEN_3D.y * SUBLEN_3D.z; + constexpr auto SEQ_TOTAL = SEQ_3D.x * SEQ_3D.y * SEQ_3D.z; + constexpr auto BLOCK_TOTAL = BLOCK_3D.x * BLOCK_3D.y * BLOCK_3D.z; + + // static_assert(SUBLEN_TOTAL / SEQ_TOTAL == BLOCK_TOTAL, "parallelism does not match!"); + if (SUBLEN_TOTAL / SEQ_TOTAL != BLOCK_TOTAL) throw std::runtime_error("parallelism does not match!"); + } + + //////////////////////////////////////// + + auto ebx2 = eb * 2; + auto eb_r = 1 / eb; + auto leap3 = dim3(1, len3.x, len3.x * len3.y); + auto ec_leap3 = dim3(1, ec_len3.x, ec_len3.x * ec_len3.y); + auto an_leap3 = dim3(1, an_len3.x, an_len3.x * an_len3.y); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + auto d = ndim(); + + if (d == 1) { // + throw std::runtime_error("Spline1 not implemented"); + } + else if (d == 2) { + throw std::runtime_error("Spline2 not implemented"); + } + else if (d == 3) { + cusz::c_spline3d_infprecis_32x8x8data // + <<>> // + (data, len3, leap3, // + errctrl, ec_len3, ec_leap3, // + anchor, an_leap3, // + eb_r, ebx2, radius); + } + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + TIME_ELAPSED_CUDAEVENT(&time_elapsed); + + DESTROY_CUDAEVENT_PAIR; +} + +template +void launch_reconstruct_Spline3( + T* xdata, + dim3 const len3, + T* anchor, + dim3 const an_len3, + E* errctrl, + dim3 const ec_len3, + double const eb, + int const radius, + float& time_elapsed, + cudaStream_t stream) +{ + auto divide3 = [](dim3 len, dim3 sublen) { + return dim3( + (len.x - 1) / sublen.x + 1, // + (len.y - 1) / sublen.y + 1, // + (len.z - 1) / sublen.z + 1); + }; + + /* + auto ndim = [&]() { + if 
(len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + */ + + constexpr auto SUBLEN_3D = dim3(32, 8, 8); + constexpr auto SEQ_3D = dim3(1, 8, 1); + constexpr auto BLOCK_3D = dim3(256, 1, 1); + auto GRID_3D = divide3(len3, SUBLEN_3D); + + { + constexpr auto SUBLEN_TOTAL = SUBLEN_3D.x * SUBLEN_3D.y * SUBLEN_3D.z; + constexpr auto SEQ_TOTAL = SEQ_3D.x * SEQ_3D.y * SEQ_3D.z; + constexpr auto BLOCK_TOTAL = BLOCK_3D.x * BLOCK_3D.y * BLOCK_3D.z; + + // static_assert(SUBLEN_TOTAL / SEQ_TOTAL == BLOCK_TOTAL, "parallelism does not match!"); + if (SUBLEN_TOTAL / SEQ_TOTAL != BLOCK_TOTAL) throw std::runtime_error("parallelism does not match!"); + } + + //////////////////////////////////////// + + auto ebx2 = eb * 2; + auto eb_r = 1 / eb; + auto leap3 = dim3(1, len3.x, len3.x * len3.y); + auto ec_leap3 = dim3(1, ec_len3.x, ec_len3.x * ec_len3.y); + auto an_leap3 = dim3(1, an_len3.x, an_len3.x * an_len3.y); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + cusz::x_spline3d_infprecis_32x8x8data // + <<>> // + (errctrl, ec_len3, ec_leap3, // + anchor, an_len3, an_leap3, // + xdata, len3, leap3, // + eb_r, ebx2, radius); + + STOP_CUDAEVENT_RECORDING(stream); + + CHECK_CUDA(cudaStreamSynchronize(stream)); + + TIME_ELAPSED_CUDAEVENT(&time_elapsed); + DESTROY_CUDAEVENT_PAIR; +} + +#endif diff --git a/qtensor/compression/cusz/src/kernel/detail/subroutine.inl b/qtensor/compression/cusz/src/kernel/detail/subroutine.inl new file mode 100644 index 00000000..2aa5bb5c --- /dev/null +++ b/qtensor/compression/cusz/src/kernel/detail/subroutine.inl @@ -0,0 +1,1074 @@ +/** + * @file subroutine.inl + * @author Jiannan Tian + * @brief subroutines of kernels + * @version 0.4 + * @date 2022-12-22 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include +#include +#include "cusz/pn.hh" +#include "pipeline/compaction_g.inl" +#include "subsub.inl" + +namespace psz { +namespace cuda { +namespace __device { + +//////// 1D + +namespace v0 { + +// compression load +template +__forceinline__ __device__ void load_prequant_1d( + T* data, + uint32_t dimx, + uint32_t id_base, + volatile T* shmem, + T private_buffer[SEQ], + T& prev, + FP ebx2_r); + +// decompression load +template +__forceinline__ __device__ void load_fuse_1d( + EQ* quant, + T* outlier, + uint32_t dimx, + uint32_t id_base, + int radius, + volatile T* shmem, + T private_buffer[SEQ]); + +namespace delta_only { + +template +__forceinline__ __device__ void +load_1d(EQ* quant, uint32_t dimx, uint32_t id_base, volatile T* shmem, T private_buffer[SEQ]); + +} + +// compression and decompression store +template +__forceinline__ __device__ void write_1d( // + volatile T1* shmem_a1, + volatile T2* shmem_a2, + uint32_t dimx, + uint32_t id_base, + T1* a1, + T2* a2); + +// compression pred-quant, method 1 +template +__forceinline__ __device__ void predict_quantize__no_outlier_1d( // + T private_buffer[SEQ], + volatile EQ* shmem_quant, + T prev = 0); + +// compression pred-quant, method 2 +template +__forceinline__ __device__ void predict_quantize_1d( // + T private_buffer[SEQ], + volatile EQ* shmem_quant, + volatile T* shmem_outlier, + int radius, + T prev = 0); + +namespace compaction { + +template < + typename T, + typename EQ, + int SEQ, + bool FIRST_POINT, + typename Compaction = CompactionDRAM> +__forceinline__ __device__ void predict_quantize_1d( // + T thp_buffer[SEQ], + volatile EQ* s_quant, + uint32_t dimx, + int radius, + uint32_t g_id_base, + Compaction 
g_outlier, + T prev = 0); + +} + +// decompression pred-quant +template +__forceinline__ __device__ void block_scan_1d( + T private_buffer[SEQ], + T ebx2, + volatile T* exchange_in, + volatile T* exchange_out, + volatile T* shmem_buffer); + +} // namespace v0 + +namespace v1_pn { + +template +__forceinline__ __device__ void +load_fuse_1d(EQ* quant, T* outlier, uint32_t dimx, uint32_t id_base, volatile T* shmem, T private_buffer[SEQ]); + +template +__forceinline__ __device__ void +predict_quantize__no_outlier_1d(T private_buffer[SEQ], volatile EQ* shmem_quant, T prev); + +template +__forceinline__ __device__ void +predict_quantize_1d(T private_buffer[SEQ], volatile EQ* shmem_quant, volatile T* shmem_outlier, int radius, T prev); + +namespace compaction { + +template +__forceinline__ __device__ void predict_quantize_1d( + T thp_buffer[SEQ], + volatile EQ* s_quant, + uint32_t dimx, + int radius, + uint32_t g_idx_base, + Compaction outlier, + T prev); + +} + +namespace delta_only { + +template +__forceinline__ __device__ void +load_1d(EQ* quant, uint32_t dimx, uint32_t id_base, volatile T* shmem, T private_buffer[SEQ]); + +} + +} // namespace v1_pn + +//////// 2D + +namespace v0 { + +template +__forceinline__ __device__ void load_prequant_2d( + T* data, + uint32_t dimx, + uint32_t gix, + uint32_t dimy, + uint32_t giy_base, + uint32_t stridey, + FP ebx2_r, + T center[YSEQ + 1]); + +template +__forceinline__ __device__ void predict_2d(T center[YSEQ + 1]); + +template +__forceinline__ __device__ void quantize_write_2d( + T delta[YSEQ + 1], + uint32_t dimx, + uint32_t gix, + uint32_t dimy, + uint32_t giy_base, + uint32_t stridey, + int radius, + EQ* quant, + T* outlier); + +namespace delta_only { + +template +__forceinline__ __device__ void quantize_write_2d( + T delta[YSEQ + 1], + uint32_t dimx, + uint32_t gix, + uint32_t dimy, + uint32_t giy_base, + uint32_t stridey, + EQ* quant); + +} + +namespace compaction { + +template +__forceinline__ __device__ void quantize_write_2d( + T delta[YSEQ + 1], + uint32_t dimx, + uint32_t gix, + uint32_t dimy, + uint32_t giy_base, + uint32_t stridey, + int radius, + EQ* quant, + Compaction outlier); + +}; + +// decompression load +template +__forceinline__ __device__ void load_fuse_2d( + EQ* quant, + T* outlier, + uint32_t dimx, + uint32_t gix, + uint32_t dimy, + uint32_t giy_base, + uint32_t stridey, + int radius, + T private_buffer[YSEQ]); + +namespace delta_only { +// decompression load +template +__forceinline__ __device__ void load_2d( + EQ* quant, + uint32_t dimx, + uint32_t gix, + uint32_t dimy, + uint32_t giy_base, + uint32_t stridey, + T private_buffer[YSEQ]); + +} // namespace delta_only + +template +__forceinline__ __device__ void block_scan_2d( // + T thread_private[YSEQ], + volatile T* intermediate, + FP ebx2); + +template +__forceinline__ __device__ void decomp_write_2d( + T thread_private[YSEQ], + uint32_t dimx, + uint32_t gix, + uint32_t dimy, + uint32_t giy_base, + uint32_t stridey, + T* xdata); + +} // namespace v0 + +namespace v1_pn { + +namespace compaction { +template +__forceinline__ __device__ void quantize_write_2d( + // clang-format off + T delta[YSEQ + 1], + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + int radius, + EQ* quant, + Compaction outlier + // clang-format on +); + +} + +template +__forceinline__ __device__ void load_fuse_2d( + // clang-format off + EQ* quant, + T* outlier, + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + int radius, + T 
thread_private[YSEQ] + // clang-format on +); + +namespace delta_only { + +template +__forceinline__ __device__ void load_2d( + // clang-format off + EQ* quant, + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + T thread_private[YSEQ] + // clang-format on +); + +template +__forceinline__ __device__ void quantize_write_2d( + T delta[YSEQ + 1], + uint32_t dimx, + uint32_t gix, + uint32_t dimy, + uint32_t giy_base, + uint32_t stridey, + EQ* quant); + +} // namespace delta_only + +} // namespace v1_pn + +//////// 3D + +namespace v0 { + +// TODO move subroutines for 3D here + +} + +} // namespace __device +} // namespace cuda +} // namespace psz + +//////////////////////////////////////////////////////////////////////////////// + +//////// 1D + +template +__forceinline__ __device__ void psz::cuda::__device::v0::load_prequant_1d( + T* data, + uint32_t dimx, + uint32_t id_base, + volatile T* shmem, + T private_buffer[SEQ], + T& prev, // TODO use pointer? + FP ebx2_r) +{ +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto id = id_base + threadIdx.x + i * NTHREAD; + if (id < dimx) shmem[threadIdx.x + i * NTHREAD] = round(data[id] * ebx2_r); + } + __syncthreads(); + +#pragma unroll + for (auto i = 0; i < SEQ; i++) private_buffer[i] = shmem[threadIdx.x * SEQ + i]; + if (threadIdx.x > 0) prev = shmem[threadIdx.x * SEQ - 1]; + __syncthreads(); +} + +template +__forceinline__ __device__ void psz::cuda::__device::v0::load_fuse_1d( + EQ* quant, + T* outlier, + uint32_t dimx, + uint32_t id_base, + int radius, + volatile T* shmem, + T private_buffer[SEQ]) +{ +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto local_id = threadIdx.x + i * NTHREAD; + auto id = id_base + local_id; + if (id < dimx) shmem[local_id] = outlier[id] + static_cast(quant[id]) - radius; + } + __syncthreads(); + +#pragma unroll + for (auto i = 0; i < SEQ; i++) private_buffer[i] = shmem[threadIdx.x * SEQ + i]; + __syncthreads(); +} + +template +__forceinline__ __device__ void psz::cuda::__device::v1_pn::load_fuse_1d( + EQ* quant, + T* outlier, + uint32_t dimx, + uint32_t id_base, + volatile T* shmem, + T private_buffer[SEQ]) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename psz::typing::Int::T; + +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto local_id = threadIdx.x + i * NTHREAD; + auto id = id_base + local_id; + if (id < dimx) shmem[local_id] = outlier[id] + PN::decode(quant[id]); + } + __syncthreads(); + +#pragma unroll + for (auto i = 0; i < SEQ; i++) private_buffer[i] = shmem[threadIdx.x * SEQ + i]; + __syncthreads(); +} + +template +__forceinline__ __device__ void psz::cuda::__device::v0::delta_only::load_1d( + EQ* quant, + uint32_t dimx, + uint32_t id_base, + volatile T* shmem, + T private_buffer[SEQ]) +{ +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto local_id = threadIdx.x + i * NTHREAD; + auto id = id_base + local_id; + if (id < dimx) shmem[local_id] = static_cast(quant[id]); + } + __syncthreads(); + +#pragma unroll + for (auto i = 0; i < SEQ; i++) private_buffer[i] = shmem[threadIdx.x * SEQ + i]; + __syncthreads(); +} + +template +__forceinline__ __device__ void psz::cuda::__device::v1_pn::delta_only::load_1d( + EQ* quant, + uint32_t dimx, + uint32_t id_base, + volatile T* shmem, + T private_buffer[SEQ]) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename psz::typing::Int::T; + +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto local_id = threadIdx.x + i * NTHREAD; + auto id = id_base + 
local_id; + if (id < dimx) shmem[local_id] = PN::decode(quant[id]); + } + __syncthreads(); + +#pragma unroll + for (auto i = 0; i < SEQ; i++) private_buffer[i] = shmem[threadIdx.x * SEQ + i]; + __syncthreads(); +} + +template // TODO remove NO_OUTLIER, use nullable +__forceinline__ __device__ void psz::cuda::__device::v0::write_1d( + volatile T1* shmem_a1, + volatile T2* shmem_a2, + uint32_t dimx, + uint32_t id_base, + T1* a1, + T2* a2) +{ +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto id = id_base + threadIdx.x + i * NTHREAD; + if (id < dimx) { + if (NO_OUTLIER) { // + a1[id] = shmem_a1[threadIdx.x + i * NTHREAD]; + } + else { + a1[id] = shmem_a1[threadIdx.x + i * NTHREAD]; + a2[id] = shmem_a2[threadIdx.x + i * NTHREAD]; + } + } + } +} + +template +__forceinline__ __device__ void psz::cuda::__device::v0::predict_quantize__no_outlier_1d( // + T private_buffer[SEQ], + volatile EQ* shmem_quant, + T prev) +{ + auto quantize_1d = [&](T& cur, T& prev, uint32_t idx) { + shmem_quant[idx + threadIdx.x * SEQ] = static_cast(cur - prev); + }; + + if (FIRST_POINT) { // i == 0 + quantize_1d(private_buffer[0], prev, 0); + } + else { +#pragma unroll + for (auto i = 1; i < SEQ; i++) quantize_1d(private_buffer[i], private_buffer[i - 1], i); + __syncthreads(); + } +} + +template +__forceinline__ __device__ void psz::cuda::__device::v0::predict_quantize_1d( + T private_buffer[SEQ], + volatile EQ* shmem_quant, + volatile T* shmem_outlier, + int radius, + T prev) +{ + auto quantize_1d = [&](T& cur, T& prev, uint32_t idx) { + T delta = cur - prev; + bool quantizable = fabs(delta) < radius; + T candidate = delta + radius; + + // otherwise, need to reset shared memory (to 0) + shmem_quant[idx + threadIdx.x * SEQ] = quantizable * static_cast(candidate); + shmem_outlier[idx + threadIdx.x * SEQ] = (not quantizable) * candidate; + }; + + if (FIRST_POINT) { // i == 0 + quantize_1d(private_buffer[0], prev, 0); + } + else { +#pragma unroll + for (auto i = 1; i < SEQ; i++) quantize_1d(private_buffer[i], private_buffer[i - 1], i); + __syncthreads(); + } +} + +template +__forceinline__ __device__ void psz::cuda::__device::v0::compaction::predict_quantize_1d( + T thp_buffer[SEQ], + volatile EQ* s_quant, + uint32_t dimx, // put x-related + int radius, + uint32_t g_idx_base, // TODO this file `id_base` to `g_idx_base` + Compaction outlier, + T prev) +{ + auto quantize_1d = [&](T& cur, T& prev, uint32_t inloop_idx) { + T delta = cur - prev; + bool quantizable = fabs(delta) < radius; + T candidate = delta + radius; + + auto inblock_idx = inloop_idx + threadIdx.x * SEQ; // TODO this file use `inblock_idx` + + // though quantizable, need to set non-quantizable position as 0 + s_quant[inblock_idx] = quantizable * static_cast(candidate); + + // very small chance running into this block + if (not quantizable) { + auto g_idx = inblock_idx + g_idx_base; + if (g_idx < dimx) { + auto cur_idx = atomicAdd(outlier.count, 1); + outlier.val[cur_idx] = candidate; + outlier.idx[cur_idx] = g_idx; + } + } + }; + + if (FIRST_POINT) { // i == 0 + quantize_1d(thp_buffer[0], prev, 0); + } + else { +#pragma unroll + for (auto i = 1; i < SEQ; i++) quantize_1d(thp_buffer[i], thp_buffer[i - 1], i); + __syncthreads(); // TODO move __syncthreads() outside this subroutine? 
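+        // How the compaction variant records outliers (illustrative numbers only;
+        // radius = 512 is an assumed example value, not a fixed default):
+        //   delta = -3  -> quantizable, s_quant[idx] = -3 + 512 = 509, no global traffic
+        //   delta = 700 -> not quantizable, s_quant[idx] = 0, one slot is claimed via
+        //                  atomicAdd(outlier.count, 1); outlier.val gets 700 + 512 = 1212
+        //                  and outlier.idx gets the global element index.
+        // The dense outlier array of v0::predict_quantize_1d is thus replaced by a
+        // sparse (idx, val) list whose final length is *outlier.count.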
+ } +} + +template +__forceinline__ __device__ void psz::cuda::__device::v1_pn::compaction::predict_quantize_1d( + T thp_buffer[SEQ], + volatile EQ* s_quant, + uint32_t dimx, // put x-related + int radius, + uint32_t g_idx_base, // TODO this file `id_base` to `g_idx_base` + Compaction outlier, + T prev) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename psz::typing::Int::T; + + auto quantize_1d = [&](T& cur, T& prev, uint32_t inloop_idx) { + T delta = cur - prev; + bool quantizable = fabs(delta) < radius; + UI UI_delta = PN::encode(static_cast(delta)); + + auto inblock_idx = inloop_idx + threadIdx.x * SEQ; // TODO this file use `inblock_idx` + + // though quantizable, need to set non-quantizable position as 0 + s_quant[inblock_idx] = quantizable * UI_delta; + + // very small chance running into this block + if (not quantizable) { + auto g_idx = inblock_idx + g_idx_base; + if (g_idx < dimx) { + auto cur_idx = atomicAdd(outlier.count, 1); + outlier.val[cur_idx] = delta; + outlier.idx[cur_idx] = g_idx; + } + } + }; + + if (FIRST_POINT) { // i == 0 + quantize_1d(thp_buffer[0], prev, 0); + } + else { +#pragma unroll + for (auto i = 1; i < SEQ; i++) quantize_1d(thp_buffer[i], thp_buffer[i - 1], i); + __syncthreads(); // TODO move __syncthreads() outside this subroutine? + } +} + +// decompression pred-quant +template +__forceinline__ __device__ void psz::cuda::__device::v0::block_scan_1d( + T private_buffer[SEQ], + T ebx2, + volatile T* exchange_in, + volatile T* exchange_out, + volatile T* shmem_buffer) +{ + namespace wave32 = psz::cuda::__device::wave32; + wave32::intrawarp_inclusivescan_1d(private_buffer); + wave32::intrablock_exclusivescan_1d(private_buffer, exchange_in, exchange_out); + + // put back to shmem +#pragma unroll + for (auto i = 0; i < SEQ; i++) shmem_buffer[threadIdx.x * SEQ + i] = private_buffer[i] * ebx2; + __syncthreads(); +} + +// v1_pn: quantization code uses PN::encode +template +__forceinline__ __device__ void psz::cuda::__device::v1_pn::predict_quantize__no_outlier_1d( // + T private_buffer[SEQ], + volatile EQ* shmem_quant, + T prev) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename psz::typing::Int::T; + + auto quantize_1d = [&](T& cur, T& prev, uint32_t idx) { + UI UI_delta = PN::encode(static_cast(cur - prev)); + shmem_quant[idx + threadIdx.x * SEQ] = UI_delta; + }; + + if (FIRST_POINT) { // i == 0 + quantize_1d(private_buffer[0], prev, 0); + } + else { +#pragma unroll + for (auto i = 1; i < SEQ; i++) quantize_1d(private_buffer[i], private_buffer[i - 1], i); + __syncthreads(); + } +} + +// template +// __forceinline__ __device__ void psz::cuda::__device::v1_pn::predict_quantize_1d( +// T private_buffer[SEQ], +// volatile EQ* shmem_quant, +// volatile T* shmem_outlier, +// int radius, +// T prev) +// { +// constexpr auto BYTEWIDTH = sizeof(EQ); +// using UI = EQ; +// using I = typename psz::typing::Int::T; + +// auto quantize_1d = [&](T& cur, T& prev, uint32_t idx) { +// T delta = cur - prev; +// bool quantizable = fabs(delta) < radius; +// UI UI_delta = PN::encode(static_cast(delta)); + +// // otherwise, need to reset shared memory (to 0) +// shmem_quant[idx + threadIdx.x * SEQ] = quantizable * UI_delta; +// shmem_outlier[idx + threadIdx.x * SEQ] = (not quantizable) * delta; +// }; + +// if (FIRST_POINT) { // i == 0 +// quantize_1d(private_buffer[0], prev, 0); +// } +// else { +// #pragma unroll +// for (auto i = 1; i < SEQ; i++) quantize_1d(private_buffer[i], private_buffer[i - 1], i); +// 
__syncthreads(); +// } +// } + +//////////////////////////////////////////////////////////////////////////////// + +//////// 2D + +template +__forceinline__ __device__ void psz::cuda::__device::v0::load_prequant_2d( + // clang-format off + T* data, + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + FP ebx2_r, + T center[YSEQ + 1] + // clang-format on +) +{ + auto g_id = [&](auto iy) { return (giy_base + iy) * stridey + gix; }; + + // use a warp as two half-warps + // block_dim = (16, 2, 1) makes a full warp internally + +#pragma unroll + for (auto iy = 0; iy < YSEQ; iy++) { + if (gix < dimx and giy_base + iy < dimy) center[iy + 1] = round(data[g_id(iy)] * ebx2_r); + } + auto tmp = __shfl_up_sync(0xffffffff, center[YSEQ], 16, 32); // same-warp, next-16 + if (threadIdx.y == 1) center[0] = tmp; +} + +template +__forceinline__ __device__ void psz::cuda::__device::v0::predict_2d(T center[YSEQ + 1]) +{ + /* + Lorenzo 2D (1-layer) illustration + NW N NE + notation W C E "->" to predict + -------- SW S SE + + normal data layout | considering register file + col(k-1) col(k) | thread(k-1) thread(k) + | + r(i-1) -west[i-1] +center[i-1] | -center(k-1)[i-1] +center(k)[i-1] + r(i ) +west[i] ->center[i] | +center(k-1)[i] ->center(k)[i] + + calculation + ----------- + delta = center[i] - (center[i-1] + west[i] - west[i-1]) + = (center[i] - center[i-1]) - (west[i] - west[i-1]) + + With center[i] -= center[i-1] and west[i] -= west[i-1], + delta = center[i] - west[i] + + For thread(k), + delta(k) = center(k)[i] - center(k-1)[i] + = center(k)[i] - SHFL_UP(center(k)[i], 1, HALF_WARP) + */ + +#pragma unroll + for (auto i = YSEQ; i > 0; i--) { + // with center[i-1] intact in this iteration + center[i] -= center[i - 1]; + // within a halfwarp (32/2) + auto west = __shfl_up_sync(0xffffffff, center[i], 1, 16); + if (threadIdx.x > 0) center[i] -= west; // delta + } + __syncthreads(); +} + +template +__forceinline__ __device__ void psz::cuda::__device::v0::quantize_write_2d( + // clang-format off + T delta[YSEQ + 1], + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + int radius, + EQ* quant, + T* outlier + // clang-format on +) +{ + auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; + +#pragma unroll + for (auto i = 1; i < YSEQ + 1; i++) { + auto gid = get_gid(i - 1); + + if (gix < dimx and giy_base + (i - 1) < dimy) { + bool quantizable = fabs(delta[i]) < radius; + T candidate = delta[i] + radius; + + // outlier array is not in sparse form in this version + quant[gid] = quantizable * static_cast(candidate); + outlier[gid] = (not quantizable) * candidate; + } + } +} + +template +__forceinline__ __device__ void psz::cuda::__device::v0::delta_only::quantize_write_2d( + // clang-format off + T delta[YSEQ + 1], + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + EQ* quant + // clang-format on +) +{ + auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; + +#pragma unroll + for (auto i = 1; i < YSEQ + 1; i++) { + auto gid = get_gid(i - 1); + if (gix < dimx and giy_base + (i - 1) < dimy) quant[gid] = static_cast(delta[i]); + } +} + +template +__forceinline__ __device__ void psz::cuda::__device::v1_pn::delta_only::quantize_write_2d( + // clang-format off + T delta[YSEQ + 1], + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + EQ* quant + // clang-format on +) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename 
psz::typing::Int::T; + + auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; + +#pragma unroll + for (auto i = 1; i < YSEQ + 1; i++) { + auto gid = get_gid(i - 1); + if (gix < dimx and giy_base + (i - 1) < dimy) quant[gid] = PN::encode(static_cast(delta[i])); + } +} + +template +__forceinline__ __device__ void psz::cuda::__device::v0::compaction::quantize_write_2d( + // clang-format off + T delta[YSEQ + 1], + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + int radius, + EQ* quant, + Compaction outlier + // clang-format on +) +{ + auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; + +#pragma unroll + for (auto i = 1; i < YSEQ + 1; i++) { + auto gid = get_gid(i - 1); + + if (gix < dimx and giy_base + (i - 1) < dimy) { + bool quantizable = fabs(delta[i]) < radius; + T candidate = delta[i] + radius; + + // The non-quantizable is recorded as "0" (radius). + quant[gid] = quantizable * static_cast(candidate); + + if (not quantizable) { + auto cur_idx = atomicAdd(outlier.count, 1); + outlier.idx[cur_idx] = gid; + outlier.val[cur_idx] = candidate; + } + } + } +} + +template +__forceinline__ __device__ void psz::cuda::__device::v1_pn::compaction::quantize_write_2d( + // clang-format off + T delta[YSEQ + 1], + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + int radius, + EQ* quant, + Compaction outlier + // clang-format on +) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename psz::typing::Int::T; + + auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; + +#pragma unroll + for (auto i = 1; i < YSEQ + 1; i++) { + auto gid = get_gid(i - 1); + + if (gix < dimx and giy_base + (i - 1) < dimy) { + bool quantizable = fabs(delta[i]) < radius; + UI UI_delta = PN::encode(static_cast(delta[i])); + + // The non-quantizable is recorded as "0" (radius). + quant[gid] = quantizable * UI_delta; + + if (not quantizable) { + auto cur_idx = atomicAdd(outlier.count, 1); + outlier.idx[cur_idx] = gid; + outlier.val[cur_idx] = delta[i]; + } + } + } +} + +// load to thread-private array (fuse at the same time) +template +__forceinline__ __device__ void psz::cuda::__device::v0::load_fuse_2d( + // clang-format off + EQ* quant, + T* outlier, + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + int radius, + T thread_private[YSEQ] + // clang-format on +) +{ + auto get_gid = [&](auto iy) { return (giy_base + iy) * stridey + gix; }; + +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + auto gid = get_gid(i); + // even if we hit the else branch, all threads in a warp hit the y-boundary simultaneously + if (gix < dimx and (giy_base + i) < dimy) + thread_private[i] = outlier[gid] + static_cast(quant[gid]) - radius; // fuse + else + thread_private[i] = 0; // TODO set as init state? 
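+        // Why "outlier[gid] + quant[gid] - radius" recovers the signed delta in one pass
+        // (numbers are only an illustration, with an assumed radius = 512, matching the
+        // v0::quantize_write_2d encoding above):
+        //   quantizable point:  quant = delta + radius = 509, outlier = 0
+        //                       -> 0 + 509 - 512 = -3   (the original delta)
+        //   outlier point:      quant = 0, outlier = delta + radius = 1212
+        //                       -> 1212 + 0 - 512 = 700
+        // Exactly one of the two arrays carries the value, so a single add fuses them.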
+ } +} + +// load to thread-private array (fuse at the same time) +template +__forceinline__ __device__ void psz::cuda::__device::v1_pn::load_fuse_2d( + // clang-format off + EQ* quant, + T* outlier, + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + int radius, + T thread_private[YSEQ] + // clang-format on +) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename psz::typing::Int::T; + + auto get_gid = [&](auto iy) { return (giy_base + iy) * stridey + gix; }; + +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + auto gid = get_gid(i); + // even if we hit the else branch, all threads in a warp hit the y-boundary simultaneously + if (gix < dimx and (giy_base + i) < dimy) + thread_private[i] = outlier[gid] + PN::decode(quant[gid]); // fuse + else + thread_private[i] = 0; // TODO set as init state? + } +} + +// load to thread-private array (fuse at the same time) +template +__forceinline__ __device__ void psz::cuda::__device::v0::delta_only::load_2d( + // clang-format off + EQ* quant, + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + T thread_private[YSEQ] + // clang-format on +) +{ + auto get_gid = [&](auto iy) { return (giy_base + iy) * stridey + gix; }; + +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + auto gid = get_gid(i); + // even if we hit the else branch, all threads in a warp hit the y-boundary simultaneously + if (gix < dimx and (giy_base + i) < dimy) + thread_private[i] = static_cast(quant[gid]); + else + thread_private[i] = 0; // TODO set as init state? + } +} + +// load to thread-private array (fuse at the same time) +template +__forceinline__ __device__ void psz::cuda::__device::v1_pn::delta_only::load_2d( + // clang-format off + EQ* quant, + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + T thread_private[YSEQ] + // clang-format on +) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename psz::typing::Int::T; + + auto get_gid = [&](auto iy) { return (giy_base + iy) * stridey + gix; }; + +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + auto gid = get_gid(i); + // even if we hit the else branch, all threads in a warp hit the y-boundary simultaneously + if (gix < dimx and (giy_base + i) < dimy) + thread_private[i] = PN::decode(quant[gid]); + else + thread_private[i] = 0; // TODO set as init state? + } +} + +// partial-sum along y-axis, sequantially +// then, in-warp partial-sum along x-axis +template +__forceinline__ __device__ void +psz::cuda::__device::v0::block_scan_2d(T thread_private[YSEQ], volatile T* intermediate, FP ebx2) +{ + // ------> gix (x) + // + // | t(0,0) t(0,1) t(0,2) t(0,3) ... t(0,f) + // | + // | thp(0,0)[0] thp(0,0)[0] thp(0,0)[0] thp(0,0)[0] + // giy thp(0,0)[1] thp(0,0)[1] thp(0,0)[1] thp(0,0)[1] + // (y) | | | | + // thp(0,0)[7] thp(0,0)[7] thp(0,0)[7] thp(0,0)[7] + // + // | t(1,0) t(1,1) t(1,2) t(1,3) ... 
t(1,f) + // | + // | thp(1,0)[0] thp(1,0)[0] thp(1,0)[0] thp(1,0)[0] + // giy thp(1,0)[1] thp(1,0)[1] thp(1,0)[1] thp(1,0)[1] + // (y) | | | | + // thp(1,0)[7] thp(1,0)[7] thp(1,0)[7] thp(1,0)[7] + + constexpr auto BLOCK = 16; + + for (auto i = 1; i < YSEQ; i++) thread_private[i] += thread_private[i - 1]; + // two-pass: store for cross-thread-private update + // TODO shuffle up by 16 in the same warp + if (threadIdx.y == 0) intermediate[threadIdx.x] = thread_private[YSEQ - 1]; + __syncthreads(); + // broadcast the partial-sum result from a previous segment + if (threadIdx.y == 1) { + auto tmp = intermediate[threadIdx.x]; +#pragma unroll + for (auto i = 0; i < YSEQ; i++) thread_private[i] += tmp; // regression as pointer + } + // implicit sync as there is half-warp divergence + +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + for (auto d = 1; d < BLOCK; d *= 2) { + T n = __shfl_up_sync(0xffffffff, thread_private[i], d, 16); // half-warp shuffle + if (threadIdx.x >= d) thread_private[i] += n; + } + thread_private[i] *= ebx2; // scale accordingly + } +} + +// write to DRAM +template +__forceinline__ __device__ void psz::cuda::__device::v0::decomp_write_2d( + // clang-format off + T thread_private[YSEQ], + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + T* xdata + // clang-format on +) +{ + auto get_gid = [&](auto iy) { return (giy_base + iy) * stridey + gix; }; + +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + auto gid = get_gid(i); + if (gix < dimx and (giy_base + i) < dimy) xdata[gid] = thread_private[i]; + } +} + +//////////////////////////////////////////////////////////////////////////////// + +//////// 3D diff --git a/qtensor/compression/cusz/src/kernel/detail/subsub.inl b/qtensor/compression/cusz/src/kernel/detail/subsub.inl new file mode 100644 index 00000000..4d34fdc6 --- /dev/null +++ b/qtensor/compression/cusz/src/kernel/detail/subsub.inl @@ -0,0 +1,92 @@ +/** + * @file subsub.inl + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2022-12-26 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +namespace psz { +namespace cuda { +namespace __device { + +namespace wave32 { +template +__forceinline__ __device__ void intrawarp_inclusivescan_1d( // + T private_buffer[SEQ]); + +template +__forceinline__ __device__ void intrablock_exclusivescan_1d( // + T private_buffer[SEQ], + volatile T* exchange_in, + volatile T* exchange_out); +} // namespace wave32 + +} // namespace __device +} // namespace cuda +} // namespace psz + +template +__forceinline__ __device__ void psz::cuda::__device::wave32::intrawarp_inclusivescan_1d(T private_buffer[SEQ]) +{ + for (auto i = 1; i < SEQ; i++) private_buffer[i] += private_buffer[i - 1]; + T addend = private_buffer[SEQ - 1]; + + // in-warp shuffle + for (auto d = 1; d < 32; d *= 2) { + T n = __shfl_up_sync(0xffffffff, addend, d, 32); + if (threadIdx.x % 32 >= d) addend += n; + } + // exclusive scan + T prev_addend = __shfl_up_sync(0xffffffff, addend, 1, 32); + + // propagate + if (threadIdx.x % 32 > 0) + for (auto i = 0; i < SEQ; i++) private_buffer[i] += prev_addend; +} + +template +__forceinline__ __device__ void psz::cuda::__device::wave32::intrablock_exclusivescan_1d( + T private_buffer[SEQ], + volatile T* exchange_in, + volatile T* exchange_out) +{ + constexpr auto NWARP = NTHREAD / 32; + static_assert(NWARP <= 32, "too big"); + + auto warp_id = threadIdx.x / 32; + auto lane_id = threadIdx.x % 32; + + if (lane_id == 31) exchange_in[warp_id] = private_buffer[SEQ - 1]; 
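+    // Block-level step of the scan (illustrative walk-through; the warp totals below
+    // are made-up numbers): with NTHREAD = 128 there are NWARP = 4 warps. Each warp's
+    // lane 31 has just published its inclusive total to exchange_in, e.g. {10, 7, 3, 5}.
+    // The branch below turns these into exclusive prefixes in exchange_out, i.e.
+    // {0, 10, 17, 20}; every thread then adds exchange_out[warp_id] to each element of
+    // its private buffer, completing the block-wide exclusive scan.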
+ __syncthreads(); + + if (NWARP <= 8) { + if (threadIdx.x == 0) { + exchange_out[0] = 0; + for (auto i = 1; i < NWARP; i++) exchange_out[i] = exchange_out[i - 1] + exchange_in[i - 1]; + } + } + else if (NWARP <= 32) { + if (threadIdx.x <= 32) { + auto addend = exchange_in[threadIdx.x]; + + for (auto d = 1; d < 32; d *= 2) { + T n = __shfl_up_sync(0xffffffff, addend, d, 32); + if (threadIdx.x >= d) addend += n; + } + // exclusive scan + T prev_addend = __shfl_up_sync(0xffffffff, addend, 1, 32); + exchange_out[warp_id] = (warp_id > 0) * prev_addend; + } + } + // else-case handled by static_assert + __syncthreads(); + + // propagate + auto addend = exchange_out[warp_id]; + for (auto i = 0; i < SEQ; i++) private_buffer[i] += addend; + __syncthreads(); +}; diff --git a/qtensor/compression/cusz/src/kernel/lorenzo.cu b/qtensor/compression/cusz/src/kernel/lorenzo.cu new file mode 100644 index 00000000..fe5e6a25 --- /dev/null +++ b/qtensor/compression/cusz/src/kernel/lorenzo.cu @@ -0,0 +1,209 @@ +/** + * @file lorenzo.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-01 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "cusz/type.h" +#include "utils/cuda_err.cuh" +#include "utils/timer.h" + +#include "kernel/lorenzo_all.hh" + +// #include "detail/lorenzo.inl" +#include "detail/lorenzo23.inl" + +template +cusz_error_status compress_predict_lorenzo_i( + T* const data, + dim3 const len3, + double const eb, + int const radius, + EQ* const eq, + T* const outlier, + uint32_t* outlier_idx, + uint32_t* num_outliers, + float* time_elapsed, + cudaStream_t stream) +{ + auto divide3 = [](dim3 len, dim3 sublen) { + return dim3( + (len.x - 1) / sublen.x + 1, // + (len.y - 1) / sublen.y + 1, // + (len.z - 1) / sublen.z + 1); + }; + + auto ndim = [&]() { + if (len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + + constexpr auto SUBLEN_1D = 256; + constexpr auto SEQ_1D = 4; // x-sequentiality == 4 + constexpr auto BLOCK_1D = dim3(256 / 4, 1, 1); + auto GRID_1D = divide3(len3, SUBLEN_1D); + + constexpr auto SUBLEN_2D = dim3(16, 16, 1); + // constexpr auto SEQ_2D = dim3(1, 8, 1); // y-sequentiality == 8 + constexpr auto BLOCK_2D = dim3(16, 2, 1); + auto GRID_2D = divide3(len3, SUBLEN_2D); + + constexpr auto SUBLEN_3D = dim3(32, 8, 8); + // constexpr auto SEQ_3D = dim3(1, 8, 1); // y-sequentiality == 8 + // constexpr auto BLOCK_3D = dim3(32, 1, 8); // for v0 + constexpr auto BLOCK_3D = dim3(32, 8, 1); // for v0::r1_shfl + auto GRID_3D = divide3(len3, SUBLEN_3D); + + auto d = ndim(); + + // error bound + auto ebx2 = eb * 2; + auto ebx2_r = 1 / ebx2; + auto leap3 = dim3(1, len3.x, len3.x * len3.y); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + if (d == 1) { + //::cusz::c_lorenzo_1d1l + //<<>>(data, eq, outlier, len3, leap3, radius, ebx2_r); + + psz::cuda::__kernel::v0::c_lorenzo_1d1l + <<>>(data, len3, leap3, radius, ebx2_r, eq, outlier); + } + else if (d == 2) { + //::cusz::c_lorenzo_2d1l_16x16data_mapto16x2 + //<<>>(data, eq, outlier, len3, leap3, radius, ebx2_r); + psz::cuda::__kernel::v0::c_lorenzo_2d1l + <<>>(data, len3, leap3, radius, ebx2_r, eq, outlier); + } + else if (d == 3) { + //::cusz::c_lorenzo_3d1l_32x8x8data_mapto32x1x8 + //<<>>(data, eq, outlier, len3, leap3, radius, ebx2_r); + psz::cuda::__kernel::v0::c_lorenzo_3d1l + <<>>(data, len3, leap3, radius, ebx2_r, eq, outlier); + } + + STOP_CUDAEVENT_RECORDING(stream); + 
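+    // Note on the error-bound scaling set up earlier in this function (worked example
+    // with an assumed eb = 0.01; the kernels only ever see ebx2_r and ebx2):
+    //   prequant code   q  = round(x * ebx2_r) = round(x / (2*eb))
+    //   reconstruction  x' = q * ebx2          = 2*eb * q
+    // so |x - x'| <= eb. E.g. x = 0.137 -> q = round(6.85) = 7 -> x' = 0.14,
+    // an error of 0.003, within the 0.01 bound.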
CHECK_CUDA(cudaStreamSynchronize(stream)); + TIME_ELAPSED_CUDAEVENT(time_elapsed); + DESTROY_CUDAEVENT_PAIR; + + return CUSZ_SUCCESS; +} + +template +cusz_error_status decompress_predict_lorenzo_i( + EQ* eq, + dim3 const len3, + T* outlier, + uint32_t* outlier_idx, + uint32_t const num_outliers, + double const eb, + int const radius, + T* xdata, + float* time_elapsed, + cudaStream_t stream) +{ + auto divide3 = [](dim3 len, dim3 sublen) { + return dim3( + (len.x - 1) / sublen.x + 1, // + (len.y - 1) / sublen.y + 1, // + (len.z - 1) / sublen.z + 1); + }; + + auto ndim = [&]() { + if (len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + + constexpr auto SUBLEN_1D = 256; + constexpr auto SEQ_1D = 8; // x-sequentiality == 8 + constexpr auto BLOCK_1D = dim3(256 / 8, 1, 1); + auto GRID_1D = divide3(len3, SUBLEN_1D); + + constexpr auto SUBLEN_2D = dim3(16, 16, 1); + // constexpr auto SEQ_2D = dim3(1, 8, 1); // y-sequentiality == 8 + constexpr auto BLOCK_2D = dim3(16, 2, 1); + auto GRID_2D = divide3(len3, SUBLEN_2D); + + constexpr auto SUBLEN_3D = dim3(32, 8, 8); + // constexpr auto SEQ_3D = dim3(1, 8, 1); // y-sequentiality == 8 + constexpr auto BLOCK_3D = dim3(32, 1, 8); + auto GRID_3D = divide3(len3, SUBLEN_3D); + + // error bound + auto ebx2 = eb * 2; + auto ebx2_r = 1 / ebx2; + auto leap3 = dim3(1, len3.x, len3.x * len3.y); + + auto d = ndim(); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + if (d == 1) { + //::cusz::x_lorenzo_1d1l + //<<>>(outlier, eq, xdata, len3, leap3, radius, ebx2); + psz::cuda::__kernel::v0::x_lorenzo_1d1l + <<>>(eq, outlier, len3, leap3, radius, ebx2, xdata); + } + else if (d == 2) { + //::cusz::x_lorenzo_2d1l_16x16data_mapto16x2 + //<<>>(outlier, eq, xdata, len3, leap3, radius, ebx2); + psz::cuda::__kernel::v0::x_lorenzo_2d1l + <<>>(eq, outlier, len3, leap3, radius, ebx2, xdata); + } + else if (d == 3) { + //::cusz::x_lorenzo_3d1l_32x8x8data_mapto32x1x8 + //<<>>(outlier, eq, xdata, len3, leap3, radius, ebx2); + psz::cuda::__kernel::v0::x_lorenzo_3d1l + <<>>(eq, outlier, len3, leap3, radius, ebx2, xdata); + } + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + TIME_ELAPSED_CUDAEVENT(time_elapsed); + DESTROY_CUDAEVENT_PAIR; + + return CUSZ_SUCCESS; +} + +#define CPP_TEMPLATE_INIT_AND_C_WRAPPER(T, EQ) \ + template cusz_error_status compress_predict_lorenzo_i( \ + T* const data, dim3 const len3, double const eb, int const radius, EQ* const eq, T* const outlier, \ + uint32_t* outlier_idx, uint32_t* num_outliers, float* time_elapsed, cudaStream_t stream); \ + \ + template cusz_error_status decompress_predict_lorenzo_i( \ + EQ * eq, dim3 const len3, T* outlier, uint32_t* outlier_idx, uint32_t const num_outliers, double const eb, \ + int const radius, T* xdata, float* time_elapsed, cudaStream_t stream); + +// before 2023 +CPP_TEMPLATE_INIT_AND_C_WRAPPER(float, uint8_t); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(float, uint16_t); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(float, uint32_t); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(float, float); + +CPP_TEMPLATE_INIT_AND_C_WRAPPER(double, uint8_t); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(double, uint16_t); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(double, uint32_t); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(double, float); + +// 2023 +CPP_TEMPLATE_INIT_AND_C_WRAPPER(float, int32_t); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(double, int32_t); + +#undef CPP_TEMPLATE_INIT_AND_C_WRAPPER diff --git a/qtensor/compression/cusz/src/kernel/lorenzo_proto.cu 
b/qtensor/compression/cusz/src/kernel/lorenzo_proto.cu new file mode 100644 index 00000000..3dcbadb3 --- /dev/null +++ b/qtensor/compression/cusz/src/kernel/lorenzo_proto.cu @@ -0,0 +1,176 @@ +/** + * @file claunch_cuda_proto.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-09-22 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "cusz/type.h" +#include "utils/cuda_err.cuh" +#include "utils/timer.h" + +#include "kernel/lorenzo_all.h" +#include "kernel/lorenzo_all.hh" + +#include "detail/lorenzo_proto.inl" + +template +cusz_error_status compress_predict_lorenzo_iproto( + T* const data, + dim3 const len3, + double const eb, + int const radius, + EQ* const eq, + T* outlier, + uint32_t* outlier_idx, + uint32_t* num_outliers, + float* time_elapsed, + cudaStream_t stream) +{ + auto divide3 = [](dim3 len, dim3 sublen) { + return dim3((len.x - 1) / sublen.x + 1, (len.y - 1) / sublen.y + 1, (len.z - 1) / sublen.z + 1); + }; + + auto ndim = [&]() { + if (len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + + constexpr auto SUBLEN_1D = 256; + constexpr auto BLOCK_1D = dim3(256, 1, 1); + auto GRID_1D = divide3(len3, SUBLEN_1D); + + constexpr auto SUBLEN_2D = dim3(16, 16, 1); + constexpr auto BLOCK_2D = dim3(16, 16, 1); + auto GRID_2D = divide3(len3, SUBLEN_2D); + + constexpr auto SUBLEN_3D = dim3(8, 8, 8); + constexpr auto BLOCK_3D = dim3(8, 8, 8); + auto GRID_3D = divide3(len3, SUBLEN_3D); + + // error bound + auto ebx2 = eb * 2; + auto ebx2_r = 1 / ebx2; + auto leap3 = dim3(1, len3.x, len3.x * len3.y); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + using namespace psz::cuda::__kernel::prototype; + + if (ndim() == 1) { + c_lorenzo_1d1l<<>>(data, len3, leap3, radius, ebx2_r, eq, outlier); + } + else if (ndim() == 2) { + c_lorenzo_2d1l<<>>(data, len3, leap3, radius, ebx2_r, eq, outlier); + } + else if (ndim() == 3) { + c_lorenzo_3d1l<<>>(data, len3, leap3, radius, ebx2_r, eq, outlier); + } + else { + throw std::runtime_error("Lorenzo only works for 123-D."); + } + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + TIME_ELAPSED_CUDAEVENT(time_elapsed); + DESTROY_CUDAEVENT_PAIR; + + return CUSZ_SUCCESS; +} + +template +cusz_error_status decompress_predict_lorenzo_iproto( + EQ* eq, + dim3 const len3, + T* outlier, + uint32_t* outlier_idx, + uint32_t const num_outliers, + double const eb, + int const radius, + T* xdata, + float* time_elapsed, + cudaStream_t stream) +{ + auto divide3 = [](dim3 len, dim3 sublen) { + return dim3((len.x - 1) / sublen.x + 1, (len.y - 1) / sublen.y + 1, (len.z - 1) / sublen.z + 1); + }; + + auto ndim = [&]() { + if (len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + + constexpr auto SUBLEN_1D = 256; + constexpr auto BLOCK_1D = dim3(256, 1, 1); + auto GRID_1D = divide3(len3, SUBLEN_1D); + + constexpr auto SUBLEN_2D = dim3(16, 16, 1); + constexpr auto BLOCK_2D = dim3(16, 16, 1); + auto GRID_2D = divide3(len3, SUBLEN_2D); + + constexpr auto SUBLEN_3D = dim3(8, 8, 8); + constexpr auto BLOCK_3D = dim3(8, 8, 8); + auto GRID_3D = divide3(len3, SUBLEN_3D); + + // error bound + auto ebx2 = eb * 2; + auto ebx2_r = 1 / ebx2; + auto leap3 = dim3(1, len3.x, len3.x * len3.y); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + using namespace psz::cuda::__kernel::prototype; + + if (ndim() == 1) { + x_lorenzo_1d1l<<>>(eq, outlier, 
len3, leap3, radius, ebx2, xdata); + } + else if (ndim() == 2) { + x_lorenzo_2d1l<<>>(eq, outlier, len3, leap3, radius, ebx2, xdata); + } + else if (ndim() == 3) { + x_lorenzo_3d1l<<>>(eq, outlier, len3, leap3, radius, ebx2, xdata); + } + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + TIME_ELAPSED_CUDAEVENT(time_elapsed); + DESTROY_CUDAEVENT_PAIR; + + return CUSZ_SUCCESS; +} + +#define CPP_TEMPLATE_INIT_AND_C_WRAPPER(Tliteral, Eliteral, FPliteral, T, EQ, FP) \ + template cusz_error_status compress_predict_lorenzo_iproto( \ + T* const, dim3 const, double const, int const, EQ* const, T* const, uint32_t*, uint32_t*, float*, \ + cudaStream_t); \ + \ + template cusz_error_status decompress_predict_lorenzo_iproto( \ + EQ*, dim3 const, T*, uint32_t*, uint32_t const, double const, int const, T*, float*, cudaStream_t); + +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui8, fp32, float, uint8_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui16, fp32, float, uint16_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui32, fp32, float, uint32_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, fp32, fp32, float, float, float); + +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui8, fp64, double, uint8_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui16, fp64, double, uint16_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui32, fp64, double, uint32_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, fp32, fp64, double, float, double); + +#undef CPP_TEMPLATE_INIT_AND_C_WRAPPER diff --git a/qtensor/compression/cusz/src/kernel/lorenzo_serial.cc b/qtensor/compression/cusz/src/kernel/lorenzo_serial.cc new file mode 100644 index 00000000..b274bc23 --- /dev/null +++ b/qtensor/compression/cusz/src/kernel/lorenzo_serial.cc @@ -0,0 +1,118 @@ +/** + * @file lorenzo.cu + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-03-16 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "detail/lorenzo_serial.inl" +#include "cusz/type.h" + +template > +cusz_error_status serial_compress_predict_lorenzo_i( + T* const data, + psz_dim3 const len3, + double const eb, + int const radius, + EQ* const eq, + OUTLIER* outlier, + float* time_elapsed) +{ + auto divide3 = [](psz_dim3 len, psz_dim3 sublen) { + return psz_dim3{(len.x - 1) / sublen.x + 1, (len.y - 1) / sublen.y + 1, (len.z - 1) / sublen.z + 1}; + }; + + auto ndim = [&]() { + if (len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + + auto d = ndim(); + + // error bound + auto ebx2 = eb * 2; + auto ebx2_r = 1 / ebx2; + auto leap3 = psz_dim3{1, len3.x, len3.x * len3.y}; + + if (d == 1) { + psz::serial::__kernel::c_lorenzo_1d1l(data, len3, leap3, radius, ebx2_r, eq, outlier); + } + else if (d == 2) { + psz::serial::__kernel::c_lorenzo_2d1l(data, len3, leap3, radius, ebx2_r, eq, outlier); + } + else if (d == 3) { + psz::serial::__kernel::c_lorenzo_3d1l(data, len3, leap3, radius, ebx2_r, eq, outlier); + } + + return CUSZ_SUCCESS; +} + +template +cusz_error_status serial_decompress_predict_lorenzo_i( + EQ* eq, + psz_dim3 const len3, + T* outlier, + double const eb, + int const radius, + T* xdata, + float* time_elapsed) +{ + auto divide3 = [](psz_dim3 len, psz_dim3 sublen) { + return psz_dim3{(len.x - 1) / sublen.x + 1, (len.y - 1) / sublen.y + 1, (len.z - 1) / sublen.z + 1}; + }; + + auto ndim = [&]() { + if (len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + + // error 
bound + auto ebx2 = eb * 2; + auto ebx2_r = 1 / ebx2; + auto leap3 = psz_dim3{1, len3.x, len3.x * len3.y}; + + auto d = ndim(); + + if (d == 1) { + psz::serial::__kernel::x_lorenzo_1d1l(eq, outlier, len3, leap3, radius, ebx2, xdata); + } + else if (d == 2) { + psz::serial::__kernel::x_lorenzo_2d1l(eq, outlier, len3, leap3, radius, ebx2, xdata); + } + else if (d == 3) { + psz::serial::__kernel::x_lorenzo_3d1l(eq, outlier, len3, leap3, radius, ebx2, xdata); + } + + return CUSZ_SUCCESS; +} + +#define CPP_TEMPLATE_INIT_AND_C_WRAPPER(Tliteral, Eliteral, FPliteral, T, EQ, FP) \ + template cusz_error_status serial_compress_predict_lorenzo_i( \ + T* const, psz_dim3 const, double const, int const, EQ* const, psz_outlier_serial*, float*); \ + \ + template cusz_error_status serial_decompress_predict_lorenzo_i( \ + EQ*, psz_dim3 const, T*, double const, int const, T*, float*); + +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui8, fp32, float, uint8_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui16, fp32, float, uint16_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui32, fp32, float, uint32_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, fp32, fp32, float, float, float); + +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui8, fp64, double, uint8_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui16, fp64, double, uint16_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui32, fp64, double, uint32_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, fp32, fp64, double, float, double); + +#undef CPP_TEMPLATE_INIT_AND_C_WRAPPER diff --git a/qtensor/compression/cusz/src/kernel/lorenzo_var.cu b/qtensor/compression/cusz/src/kernel/lorenzo_var.cu new file mode 100644 index 00000000..8fc3ff39 --- /dev/null +++ b/qtensor/compression/cusz/src/kernel/lorenzo_var.cu @@ -0,0 +1,206 @@ +/** + * @file lorenzo_var.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-27 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "cusz/type.h" +#include "utils/cuda_err.cuh" +#include "utils/timer.h" + +#include "kernel/lorenzo_all.h" +#include "kernel/lorenzo_all.hh" + +#include "detail/lorenzo_var.inl" + +template +cusz_error_status asz::experimental::compress_predict_lorenzo_ivar( + T* data, + dim3 const len3, + double const eb, + DeltaT* delta, + bool* signum, + float* time_elapsed, + cudaStream_t stream) +{ + auto pardeg3 = [](dim3 len, dim3 sublen) { + return dim3( + (len.x - 1) / sublen.x + 1, // + (len.y - 1) / sublen.y + 1, // + (len.z - 1) / sublen.z + 1); + }; + + auto ndim = [&]() { + if (len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + + constexpr auto SUBLEN_1D = 256; + constexpr auto SEQ_1D = 4; // x-sequentiality == 4 + constexpr auto BLOCK_1D = dim3(256 / 4, 1, 1); + auto GRID_1D = pardeg3(len3, SUBLEN_1D); + + constexpr auto SUBLEN_2D = dim3(16, 16, 1); + // constexpr auto SEQ_2D = dim3(1, 8, 1); // y-sequentiality == 8 + constexpr auto BLOCK_2D = dim3(16, 2, 1); + auto GRID_2D = pardeg3(len3, SUBLEN_2D); + + constexpr auto SUBLEN_3D = dim3(32, 8, 8); + // constexpr auto SEQ_3D = dim3(1, 8, 1); // y-sequentiality == 8 + constexpr auto BLOCK_3D = dim3(32, 1, 8); + auto GRID_3D = pardeg3(len3, SUBLEN_3D); + + // error bound + auto ebx2 = eb * 2; + auto ebx2_r = 1 / ebx2; + auto leap3 = dim3(1, len3.x, len3.x * len3.y); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + if (ndim() == 1) { + cusz::experimental::c_lorenzo_1d1l // + <<>> // + (data, delta, signum, len3, leap3, 
ebx2_r); + } + else if (ndim() == 2) { + cusz::experimental::c_lorenzo_2d1l_16x16data_mapto16x2 // + <<>> // + (data, delta, signum, len3, leap3, ebx2_r); + } + else if (ndim() == 3) { + cusz::experimental::c_lorenzo_3d1l_32x8x8data_mapto32x1x8 // + <<>> // + (data, delta, signum, len3, leap3, ebx2_r); + } + else { + throw std::runtime_error("Lorenzo only works for 123-D."); + } + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + TIME_ELAPSED_CUDAEVENT(time_elapsed); + DESTROY_CUDAEVENT_PAIR; + + return CUSZ_SUCCESS; +} + +template +cusz_error_status asz::experimental::decompress_predict_lorenzo_ivar( + DeltaT* delta, + bool* signum, + dim3 const len3, + double const eb, + T* xdata, + float* time_elapsed, + cudaStream_t stream) +{ + auto pardeg3 = [](dim3 len, dim3 sublen) { + return dim3( + (len.x - 1) / sublen.x + 1, // + (len.y - 1) / sublen.y + 1, // + (len.z - 1) / sublen.z + 1); + }; + + auto ndim = [&]() { + if (len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + + constexpr auto SUBLEN_1D = 256; + // constexpr auto SEQ_1D = 8; // x-sequentiality == 8 + constexpr auto BLOCK_1D = dim3(256 / 8, 1, 1); + auto GRID_1D = pardeg3(len3, SUBLEN_1D); + + constexpr auto SUBLEN_2D = dim3(16, 16, 1); + // constexpr auto SEQ_2D = dim3(1, 8, 1); // y-sequentiality == 8 + constexpr auto BLOCK_2D = dim3(16, 2, 1); + auto GRID_2D = pardeg3(len3, SUBLEN_2D); + + constexpr auto SUBLEN_3D = dim3(32, 8, 8); + // constexpr auto SEQ_3D = dim3(1, 8, 1); // y-sequentiality == 8 + constexpr auto BLOCK_3D = dim3(32, 1, 8); + auto GRID_3D = pardeg3(len3, SUBLEN_3D); + + // error bound + auto ebx2 = eb * 2; + auto ebx2_r = 1 / ebx2; + auto leap3 = dim3(1, len3.x, len3.x * len3.y); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + if (ndim() == 1) { + cusz::experimental::x_lorenzo_1d1l // + <<>> // + (signum, delta, xdata, len3, leap3, ebx2); + } + else if (ndim() == 2) { + cusz::experimental::x_lorenzo_2d1l_16x16data_mapto16x2 // + <<>> // + (signum, delta, xdata, len3, leap3, ebx2); + } + else { + cusz::experimental::x_lorenzo_3d1l_32x8x8data_mapto32x1x8 // + <<>> // + (signum, delta, xdata, len3, leap3, ebx2); + } + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + TIME_ELAPSED_CUDAEVENT(time_elapsed); + DESTROY_CUDAEVENT_PAIR; + + return CUSZ_SUCCESS; +} + +#define CPP_TEMPLATE_INIT_AND_C_WRAPPER(Tliteral, Eliteral, FPliteral, T, E, FP) \ + template cusz_error_status asz::experimental::compress_predict_lorenzo_ivar( \ + T*, dim3 const, double const, E*, bool*, float*, cudaStream_t); \ + \ + template cusz_error_status asz::experimental::decompress_predict_lorenzo_ivar( \ + E*, bool*, dim3 const, double const, T*, float*, cudaStream_t); \ + \ + cusz_error_status compress_predict_lorenzo_ivar_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + T* const data, dim3 const len3, double const eb, E* delta, bool* signum, float* time_elapsed, \ + cudaStream_t stream) \ + { \ + asz::experimental::compress_predict_lorenzo_ivar( \ + data, len3, eb, delta, signum, time_elapsed, stream); \ + return CUSZ_SUCCESS; \ + } \ + \ + cusz_error_status decompress_predict_lorenzo_ivar_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + E* delta, bool* signum, dim3 const len3, double const eb, T* xdata, float* time_elapsed, cudaStream_t stream) \ + { \ + asz::experimental::decompress_predict_lorenzo_ivar( \ + delta, signum, len3, eb, xdata, time_elapsed, stream); \ + return CUSZ_SUCCESS; \ + 
} + +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui8, fp32, float, uint8_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui16, fp32, float, uint16_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui32, fp32, float, uint32_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, fp32, fp32, float, float, float); + +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui8, fp64, double, uint8_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui16, fp64, double, uint16_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui32, fp64, double, uint32_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, fp32, fp64, double, float, double); + +#undef CPP_TEMPLATE_INIT_AND_C_WRAPPER diff --git a/qtensor/compression/cusz/src/kernel/preprocess.cuh b/qtensor/compression/cusz/src/kernel/preprocess.cuh new file mode 100644 index 00000000..f7c321f7 --- /dev/null +++ b/qtensor/compression/cusz/src/kernel/preprocess.cuh @@ -0,0 +1,65 @@ +/** + * @file preprocess.cuh + * @author Jiannan Tian + * @brief Filters for preprocessing of cuSZ. + * @version 0.3 + * @date 2020-09-20 + * (created) 2020-05-03 (rev) 2021-06-21 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef CUSZ_KERNEL_PREPROCESS_CUH +#define CUSZ_KERNEL_PREPROCESS_CUH + +#include + +#include "common.hh" + +using std::cout; +using std::endl; + +namespace cusz { + +#include + +template +__global__ void log_transform() +{ + static_assert(std::is_floating_point::value, "[log_transform] must be floating-point type."); +} + +template +__global__ void binning2d(Data* input, Data* output, size_t d0, size_t d1, size_t new_d0, size_t new_d1) +{ + auto y = threadIdx.y; + auto x = threadIdx.x; + auto yid = blockIdx.y * blockDim.y + y; + auto xid = blockIdx.x * blockDim.x + x; + + __shared__ Data s[tBLK][tBLK]; + + if (yid >= new_d1 or xid >= new_d0) return; + + int xblk = (xid + 1) * DOWNSCALE_FACTOR >= d0 ? d0 - xid * DOWNSCALE_FACTOR : DOWNSCALE_FACTOR; + int yblk = (yid + 1) * DOWNSCALE_FACTOR >= d1 ? 
d1 - yid * DOWNSCALE_FACTOR : DOWNSCALE_FACTOR; + s[y][x] = 0; + + for (int j = 0; j < yblk; j++) + for (int i = 0; i < xblk; i++) + s[y][x] += input[(yid * DOWNSCALE_FACTOR + j) * d0 + (xid * DOWNSCALE_FACTOR + i)]; + + output[yid * new_d0 + xid] = s[y][x] / static_cast(yblk * xblk); +} +} // namespace cusz + +template __global__ void cusz::binning2d(float*, float*, size_t, size_t, size_t, size_t); +template __global__ void cusz::binning2d(double*, double*, size_t, size_t, size_t, size_t); +// template __global__ void cusz::binning2d(I1*, I1*, size_t, size_t, size_t, size_t); +// template __global__ void cusz::binning2d(I2*, I2*, size_t, size_t, size_t, size_t); +// template __global__ void cusz::binning2d(I4*, I4*, size_t, size_t, size_t, size_t); +// template __global__ void cusz::binning2d(I8*, I8*, size_t, size_t, size_t, size_t); + +#endif diff --git a/qtensor/compression/cusz/src/kernel/rle.cuh b/qtensor/compression/cusz/src/kernel/rle.cuh new file mode 100644 index 00000000..6f01cff4 --- /dev/null +++ b/qtensor/compression/cusz/src/kernel/rle.cuh @@ -0,0 +1,74 @@ +// modified from thrust example +// attach the license below when push to master branch +// https://github.com/NVIDIA/thrust/blob/main/LICENSE + +/** + * @file rle.cuh + * @author Jiannan Tian + * @brief + * @version 0.2 + * @date 2021-04-01 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef KERNEL_RLE_CUH +#define KERNEL_RLE_CUH + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +using const_gen = thrust::constant_iterator; +using counter = thrust::counting_iterator; + +namespace kernel { + +template +void RunLengthEncoding(T* d_fullfmt_data, const size_t N, T* d_compact_data, int* d_lengths, size_t& num_runs) +{ + thrust::device_ptr input = thrust::device_pointer_cast(d_fullfmt_data); + thrust::device_ptr output = thrust::device_pointer_cast(d_compact_data); + thrust::device_ptr lengths = thrust::device_pointer_cast(d_lengths); + // compute the output size (run lengths) + num_runs = thrust::reduce_by_key( + input, input + N, // input::key (symbol) + const_gen(1), // input::value (count) + output, // output::key (symbol) + lengths) // output::value (count) + .first - + output; +} + +template +void RunLengthDecoding(T* d_fullfmt_data, const size_t N, T* d_compact_data, int* d_lengths, const size_t num_runs) +{ + thrust::device_ptr output = thrust::device_pointer_cast(d_fullfmt_data); + thrust::device_ptr input = thrust::device_pointer_cast(d_compact_data); + thrust::device_ptr lengths = thrust::device_pointer_cast(d_lengths); + + // scan the lengths + thrust::inclusive_scan(lengths, lengths + num_runs, lengths); + + // compute input index for each output element + thrust::device_vector indices(N); + thrust::lower_bound( + lengths, lengths + N, // + counter(1), counter(N + 1), // + indices.begin()); + + thrust::encode(indices.begin(), indices.end(), input, output); +} + +} // namespace kernel + +#endif diff --git a/qtensor/compression/cusz/src/kernel/spv_gpu.cu b/qtensor/compression/cusz/src/kernel/spv_gpu.cu new file mode 100644 index 00000000..96b665a7 --- /dev/null +++ b/qtensor/compression/cusz/src/kernel/spv_gpu.cu @@ -0,0 +1,60 @@ +/** + * @file spv_gpu.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-29 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/spv_gpu.inl" +#include "kernel/spv_gpu.h" +#include "kernel/spv_gpu.hh" + +#define 
SPV(Tliteral, Mliteral, T, M) \ + void spv_gather_T##Tliteral##_M##Mliteral( \ + T* in, size_t const in_len, T* d_val, uint32_t* d_idx, int* nnz, float* milliseconds, cudaStream_t stream) \ + { \ + psz::detail::spv_gather(in, in_len, d_val, d_idx, nnz, milliseconds, stream); \ + } \ + \ + void spv_scatter_T##Tliteral##_M##Mliteral( \ + T* d_val, uint32_t* d_idx, int const nnz, T* decoded, float* milliseconds, cudaStream_t stream) \ + { \ + psz::detail::spv_scatter(d_val, d_idx, nnz, decoded, milliseconds, stream); \ + } + +SPV(ui8, ui32, uint8_t, uint32_t) +SPV(ui16, ui32, uint16_t, uint32_t) +SPV(ui32, ui32, uint32_t, uint32_t) +SPV(ui64, ui32, uint64_t, uint32_t) +SPV(fp32, ui32, float, uint32_t) +SPV(fp64, ui32, double, uint32_t) + +#undef SPV + +#define SPV(Tliteral, Mliteral, T, M) \ + template <> \ + void psz::spv_gather( \ + T * in, size_t const in_len, T* d_val, uint32_t* d_idx, int* nnz, float* milliseconds, cudaStream_t stream) \ + { \ + spv_gather_T##Tliteral##_M##Mliteral(in, in_len, d_val, d_idx, nnz, milliseconds, stream); \ + } \ + \ + template <> \ + void psz::spv_scatter( \ + T * d_val, uint32_t * d_idx, int const nnz, T* decoded, float* milliseconds, cudaStream_t stream) \ + { \ + spv_scatter_T##Tliteral##_M##Mliteral(d_val, d_idx, nnz, decoded, milliseconds, stream); \ + } + +SPV(ui8, ui32, uint8_t, uint32_t) +SPV(ui16, ui32, uint16_t, uint32_t) +SPV(ui32, ui32, uint32_t, uint32_t) +SPV(ui64, ui32, uint64_t, uint32_t) +SPV(fp32, ui32, float, uint32_t) +SPV(fp64, ui32, double, uint32_t) + +#undef SPV diff --git a/qtensor/compression/cusz/src/kernel/v2_lorenzo.cu b/qtensor/compression/cusz/src/kernel/v2_lorenzo.cu new file mode 100644 index 00000000..fb2c22ed --- /dev/null +++ b/qtensor/compression/cusz/src/kernel/v2_lorenzo.cu @@ -0,0 +1,118 @@ +/** + * @file v2_lorenzo.cu + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-01-23 + * + * (C) 2023 by Indiana University, Argonne National Laboratory + * + */ + +#include "cusz/type.h" +#include "utils/cuda_err.cuh" +#include "utils/timer.h" + +#include "kernel/lorenzo_all.hh" +#include "kernel/v2_lorenzo.hh" + +template +cusz_error_status v2_compress_predict_lorenzo_i( + T* const data, + dim3 const len3, + double const eb, + int const radius, + E* const errctrl, + dim3 const placeholder_2, + T* const anchor, + dim3 const placeholder_1, + CompactionDRAM outlier, + float* time_elapsed, + cudaStream_t stream) +{ + auto divide3 = [](dim3 len, dim3 sublen) { + return dim3( + (len.x - 1) / sublen.x + 1, // + (len.y - 1) / sublen.y + 1, // + (len.z - 1) / sublen.z + 1); + }; + + auto ndim = [&]() { + if (len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + + constexpr auto SUBLEN_1D = 256; + constexpr auto SEQ_1D = 4; // x-sequentiality == 4 + constexpr auto BLOCK_1D = dim3(256 / 4, 1, 1); + auto GRID_1D = divide3(len3, SUBLEN_1D); + + constexpr auto SUBLEN_2D = dim3(16, 16, 1); + constexpr auto BLOCK_2D = dim3(16, 2, 1); + auto GRID_2D = divide3(len3, SUBLEN_2D); + + constexpr auto SUBLEN_3D = dim3(32, 8, 8); + constexpr auto BLOCK_3D = dim3(32, 8, 1); // for v0::r1_shfl + auto GRID_3D = divide3(len3, SUBLEN_3D); + + auto d = ndim(); + + // error bound + auto ebx2 = eb * 2; + auto ebx2_r = 1 / ebx2; + auto leap3 = dim3(1, len3.x, len3.x * len3.y); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + if (d == 1) { + psz::cuda::__kernel::v0::compaction::c_lorenzo_1d1l + <<>>(data, len3, leap3, radius, ebx2_r, errctrl, outlier); + } + else 
if (d == 2) { + psz::cuda::__kernel::v0::compaction::c_lorenzo_2d1l + <<>>(data, len3, leap3, radius, ebx2_r, errctrl, outlier); + } + else if (d == 3) { + psz::cuda::__kernel::v0::compaction::c_lorenzo_3d1l + <<>>(data, len3, leap3, radius, ebx2_r, errctrl, outlier); + } + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + TIME_ELAPSED_CUDAEVENT(time_elapsed); + DESTROY_CUDAEVENT_PAIR; + + return CUSZ_SUCCESS; +} + +#define CPP_TEMPLATE_INIT_AND_C_WRAPPER(Tliteral, Eliteral, FPliteral, T, E, FP) \ + template cusz_error_status v2_compress_predict_lorenzo_i( \ + T* const, dim3 const, double const, int const, E* const, dim3 const, T* const, dim3 const, \ + struct CompactionDRAM, float*, cudaStream_t); \ + \ + // cusz_error_status v2_compress_predict_lorenzo_i_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + // T* const data, dim3 const len3, T* const anchor, dim3 const placeholder_1, E* const errctrl, \ + // dim3 const placeholder_2, T* outlier, double const eb, int const radius, float* time_elapsed, \ + // cudaStream_t stream) \ + // { \ + // return v2_compress_predict_lorenzo_i( \ + // data, len3, eb, radius, errctrl, placeholder_2, anchor, placeholder_1, outlier, nullptr, nullptr, \ + // time_elapsed, stream); \ + // } + +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui8, fp32, float, uint8_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui16, fp32, float, uint16_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui32, fp32, float, uint32_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, fp32, fp32, float, float, float); + +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui8, fp64, double, uint8_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui16, fp64, double, uint16_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui32, fp64, double, uint32_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, fp32, fp64, double, float, double); + +#undef CPP_TEMPLATE_INIT_AND_C_WRAPPER diff --git a/qtensor/compression/cusz/src/pipeline/v2_compressor.cc b/qtensor/compression/cusz/src/pipeline/v2_compressor.cc new file mode 100644 index 00000000..73ee3c83 --- /dev/null +++ b/qtensor/compression/cusz/src/pipeline/v2_compressor.cc @@ -0,0 +1,112 @@ +/** + * @file v2_compressor.cc + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-01-29 + * + * (C) 2023 by Indiana University, Argonne National Laboratory + * + */ + +#include "pipeline/v2_compressor.hh" +#include "common/configs.hh" +#include "framework.hh" + +namespace psz { + +template +v2_Compressor::~v2_Compressor() +{ + pimpl.reset(); +} + +template +v2_Compressor::v2_Compressor() : pimpl{std::make_unique()} +{ +} + +template +v2_Compressor::v2_Compressor(const v2_Compressor& old) : pimpl{std::make_unique(*old.pimpl)} +{ +} + +template +v2_Compressor& v2_Compressor::operator=(const v2_Compressor& old) +{ + *pimpl = *old.pimpl; + return *this; +} + +template +v2_Compressor::v2_Compressor(v2_Compressor&&) = default; + +template +v2_Compressor& v2_Compressor::operator=(v2_Compressor&&) = default; + +//------------------------------------------------------------------------------ + +template +void v2_Compressor::init(Context* config) +{ + pimpl->init(config); +} + +template +void v2_Compressor::init(v2_header* config) +{ + pimpl->init(config); +} + +template +void v2_Compressor::compress( + Context* config, + v2_Compressor::T* uncompressed, + BYTE*& compressed, + size_t& compressed_len, + cudaStream_t stream, + bool dbg_print) +{ + pimpl->compress(config, uncompressed, compressed, compressed_len, stream, dbg_print); +} + 
+template +void v2_Compressor::decompress( + v2_header* config, + BYTE* compressed, + v2_Compressor::T* decompressed, + cudaStream_t stream, + bool dbg_print) +{ + pimpl->decompress(config, compressed, decompressed, stream, dbg_print); +} + +// template +// void v2_Compressor::clear_buffer() +// { +// pimpl->clear_buffer(); +// } + +// getter + +template +void v2_Compressor::export_header(v2_header& header) +{ + pimpl->export_header(header); +} + +template +void v2_Compressor::export_header(v2_header* header) +{ + pimpl->export_header(header); +} + +// template +// void v2_Compressor::export_timerecord(TimeRecord* ext_timerecord) +// { +// pimpl->export_timerecord(ext_timerecord); +// } + +} // namespace psz + +template class psz::v2_Compressor>; \ No newline at end of file diff --git a/qtensor/compression/cusz/src/pipeline/v2_compressor_impl.cu b/qtensor/compression/cusz/src/pipeline/v2_compressor_impl.cu new file mode 100644 index 00000000..32eeb39d --- /dev/null +++ b/qtensor/compression/cusz/src/pipeline/v2_compressor_impl.cu @@ -0,0 +1,15 @@ +/** + * @file v2_compressor_impl.cu + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-01-23 + * + * (C) 2023 by Indiana University, Argonne National Laboratory + * + */ + +#include "framework.hh" +#include "v2_compressor_impl.inl" + +template class psz::v2_Compressor>::impl; \ No newline at end of file diff --git a/qtensor/compression/cusz/src/pipeline/v2_compressor_impl.inl b/qtensor/compression/cusz/src/pipeline/v2_compressor_impl.inl new file mode 100644 index 00000000..2a2788f4 --- /dev/null +++ b/qtensor/compression/cusz/src/pipeline/v2_compressor_impl.inl @@ -0,0 +1,239 @@ +/** + * @file v2_compressor_impl.inl + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-01-23 + * + * (C) 2023 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef F4D645B7_B2E3_41AB_BCFD_DCF919C4C56D +#define F4D645B7_B2E3_41AB_BCFD_DCF919C4C56D + +#include + +#include "component.hh" +#include "header.h" +#include "pipeline/v2_compressor.hh" +// #include "kernel/cpplaunch_cuda.hh" +#include "kernel/v2_lorenzo.hh" +#include "stat/stat_g.hh" +#include "utils/cuda_err.cuh" + +#include "../detail/spv_gpu.inl" +#include "../kernel/detail/lorenzo23.inl" + +#define TEMPLATE_TYPE template +#define IMPL v2_Compressor::impl + +#define ARCHIVE(VAR, FIELD) \ + if (segments[v2_header::FIELD] != 0 and VAR != nullptr) { \ + auto dst = var_archive() + header.entry[v2_header::FIELD]; \ + auto src = reinterpret_cast(VAR); \ + CHECK_CUDA(cudaMemcpyAsync(dst, src, segments[v2_header::FIELD], cudaMemcpyDeviceToDevice, stream)); \ + } + +#define ACCESS_VAR(SYM, TYPE) reinterpret_cast(in_compressed + header->entry[v2_header::SYM]) + +namespace psz { + +TEMPLATE_TYPE +IMPL::impl() +{ + codec = new Codec; + // TODO re-enable fallback codec + // fb_codec = new FallbackCodec; +} + +TEMPLATE_TYPE +void IMPL::destroy() +{ + if (codec) delete codec; + // if (fb_codec) delete codec; + + // also deallocate buffer +} + +TEMPLATE_TYPE +void IMPL::init(Context* config) { __init(config); } + +TEMPLATE_TYPE +void IMPL::init(v2_header* config) { __init(config); } + +TEMPLATE_TYPE +template +void IMPL::__init(ContextOrHeader* c) +{ + static_assert( + std::is_same::value or // + std::is_same::value, + "[v2_Compressor::impl::init] not a valid compressor config type."); + + auto len = c->x * c->y * c->z; + // TODO allocate anchor + + // allocate eq + cudaMalloc(&d_errctrl, len * sizeof(EQ)); // to overlap with one of vle/hf buffers + + // allocate outlier + 
outlier.allocate(len / sp_factor, true); + + // allocate vle/hf + codec->init(len, c->radius * 2, c->vle_pardeg); + // TODO disable fallback codec for now +} + +TEMPLATE_TYPE +void IMPL::compress( + Context* c, + T* uncompressed, + BYTE*& compressed, + size_t& compressed_len, + cudaStream_t stream, + bool dbg_print) +{ + auto const eb = c->eb; + auto const radius = c->radius; + auto const pardeg = c->vle_pardeg; + + if (dbg_print) { + printf("[dbg] eb: %lf\n", eb); + printf("[dbg] radius: %d\n", radius); + printf("[dbg] pardeg: %d\n", pardeg); + // printf("[dbg] codecs_in_use: %d\n", codecs_in_use); + printf("[dbg] sp_factor: %d\n", sp_factor); + } + + data_len3 = dim3(c->x, c->y, c->z); + data_len = c->x * c->y * c->z; + + header.sp.factor = sp_factor; + + BYTE* d_codec_out{nullptr}; + size_t codec_outlen{0}; + + // size_t sublen; + auto booklen = radius * 2; + + /******************************************************************************/ + + // TODO version clarification + // with compaction + v2_compress_predict_lorenzo_i( + uncompressed, data_len3, eb, radius, d_errctrl, dim3(1, 1, 1), d_anchor, dim3(1, 1, 1), outlier, + &comp_time.construct, stream); + + outlier.make_count_host_accessible(stream); + + asz::stat::histogram(d_errctrl, data_len, d_freq, booklen, &comp_time.hist, stream); + + CHECK_CUDA(cudaStreamSynchronize(stream)); + + // TODO overlapping memory + codec->encode(d_errctrl, data_len, d_codec_out, codec_outlen, stream); + + CHECK_CUDA(cudaStreamSynchronize(stream)); + + // update header + { + header.x = c->x, header.y = c->y, header.z = c->z, header.w = 1; + header.sp.count = outlier.access_count_on_host(); + // TODO the new + { + // header.config.radius = radius, header.config.eb = eb; + // header.hf.pardeg = pardeg; + } + + // the compat + { + header.radius = radius, header.eb = eb; + header.vle_pardeg = pardeg; + } + + // header.byte_vle = 4; // regardless of fallback codec + }; + + size_t segments[v2_header::END] = {0}; + + // gather archive + { + // calculate offsets + segments[v2_header::HEADER] = sizeof(v2_header); + segments[v2_header::ANCHOR] = 0; // placeholder + segments[v2_header::SP_IDX] = outlier.access_count_on_host() * sizeof(IDX); + segments[v2_header::SP_VAL] = outlier.access_count_on_host() * sizeof(T); + segments[v2_header::HF] = codec_outlen; + + header.entry[0] = 0; + for (auto i = 1; i < v2_header::END + 1; i++) { header.entry[i] = segments[i - 1]; } + for (auto i = 1; i < v2_header::END + 1; i++) { header.entry[i] += header.entry[i - 1]; } + + CHECK_CUDA(cudaStreamSynchronize(stream)); + + // memcpy + ARCHIVE(d_anchor, ANCHOR); + ARCHIVE(outlier.idx, SP_IDX); + ARCHIVE(outlier.val, SP_VAL); + ARCHIVE(d_codec_out, HF); + + CHECK_CUDA(cudaStreamSynchronize(stream)); + } + + // output + compressed_len = header.entry[v2_header::END]; + compressed = var_archive(); + + // collect_compress_timerecord(); +} + +TEMPLATE_TYPE +void IMPL::decompress(v2_header* header, BYTE* in_compressed, T* out_decompressed, cudaStream_t stream, bool dbg_print) +{ + // TODO host having copy of header when compressing + if (not header) { + header = new v2_header; + CHECK_CUDA(cudaMemcpyAsync(header, in_compressed, sizeof(v2_header), cudaMemcpyDeviceToHost, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + } + + data_len3 = dim3(header->x, header->y, header->z); + + // use_fallback_codec = header->byte_vle == 8; + // auto const vle_pardeg = header->hf.pardeg; + + // The inputs of components are from `compressed`. 
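+ // note (editor's addition): each archive segment read below is located via header->entry[], the byte offsets built as prefix sums of the segment sizes recorded during compression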
+ // auto d_anchor = ACCESS_VAR(ANCHOR, T); + auto d_vle = ACCESS_VAR(HF, BYTE); + auto d_spidx = ACCESS_VAR(SP_IDX, IDX); + auto d_spval = ACCESS_VAR(SP_VAL, T); + + // wire and aliasing + auto d_outlier = out_decompressed; + auto d_xdata = out_decompressed; + + psz::detail::spv_scatter(d_spval, d_spidx, header->sp.count, d_outlier, &decomp_time.scatter, stream); + + codec->decode(d_vle, d_errctrl); + + decompress_predict_lorenzo_i( + d_errctrl, data_len3, // + d_outlier, // + nullptr, 0, // TODO remove + header->eb, header->radius, + d_xdata, // output + &decomp_time.reconstruct, stream); + + // collect_decompress_timerecord(); + + // clear state for the next decompression after reporting + // use_fallback_codec = false; +} + +} // namespace psz + +#undef TEMPLATE_TYPE +#undef IMPL + +#endif /* F4D645B7_B2E3_41AB_BCFD_DCF919C4C56D */ diff --git a/qtensor/compression/cusz/src/stat/cmpg1_1.cu b/qtensor/compression/cusz/src/stat/cmpg1_1.cu new file mode 100644 index 00000000..ccf91661 --- /dev/null +++ b/qtensor/compression/cusz/src/stat/cmpg1_1.cu @@ -0,0 +1,30 @@ +/** + * @file cmpg1.cu + * @author Jiannan Tian + * @brief (split to speed up build process; part 1) + * @version 0.3 + * @date 2022-10-09 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_DESCRIPTION(Tliteral, T) \ + void thrustgpu_get_extrema_rawptr_T##Tliteral(T* d_ptr, size_t len, T res[4]) \ + { \ + psz::detail::thrustgpu_get_extrema_rawptr(d_ptr, len, res); \ + } \ + \ + template <> \ + void psz::thrustgpu_get_extrema_rawptr(T* d_ptr, size_t len, T res[4]) \ + { \ + thrustgpu_get_extrema_rawptr_T##Tliteral(d_ptr, len, res); \ + } + +THRUSTGPU_DESCRIPTION(ui8, uint8_t) + +#undef THRUSTGPU_DESCRIPTION diff --git a/qtensor/compression/cusz/src/stat/cmpg1_2.cu b/qtensor/compression/cusz/src/stat/cmpg1_2.cu new file mode 100644 index 00000000..8b44a9e6 --- /dev/null +++ b/qtensor/compression/cusz/src/stat/cmpg1_2.cu @@ -0,0 +1,30 @@ +/** + * @file cmpg1_2.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_DESCRIPTION(Tliteral, T) \ + void thrustgpu_get_extrema_rawptr_T##Tliteral(T* d_ptr, size_t len, T res[4]) \ + { \ + psz::detail::thrustgpu_get_extrema_rawptr(d_ptr, len, res); \ + } \ + \ + template <> \ + void psz::thrustgpu_get_extrema_rawptr(T* d_ptr, size_t len, T res[4]) \ + { \ + thrustgpu_get_extrema_rawptr_T##Tliteral(d_ptr, len, res); \ + } + +THRUSTGPU_DESCRIPTION(ui16, uint16_t) + +#undef THRUSTGPU_DESCRIPTION \ No newline at end of file diff --git a/qtensor/compression/cusz/src/stat/cmpg1_3.cu b/qtensor/compression/cusz/src/stat/cmpg1_3.cu new file mode 100644 index 00000000..169741bc --- /dev/null +++ b/qtensor/compression/cusz/src/stat/cmpg1_3.cu @@ -0,0 +1,30 @@ +/** + * @file cmpg1_3.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_DESCRIPTION(Tliteral, T) \ + void thrustgpu_get_extrema_rawptr_T##Tliteral(T* d_ptr, size_t len, T res[4]) \ + { \ + psz::detail::thrustgpu_get_extrema_rawptr(d_ptr, len, res); \ + } \ + \ + template 
<> \ + void psz::thrustgpu_get_extrema_rawptr(T* d_ptr, size_t len, T res[4]) \ + { \ + thrustgpu_get_extrema_rawptr_T##Tliteral(d_ptr, len, res); \ + } + +THRUSTGPU_DESCRIPTION(ui32, uint32_t) + +#undef THRUSTGPU_DESCRIPTION \ No newline at end of file diff --git a/qtensor/compression/cusz/src/stat/cmpg1_4.cu b/qtensor/compression/cusz/src/stat/cmpg1_4.cu new file mode 100644 index 00000000..4ec93b20 --- /dev/null +++ b/qtensor/compression/cusz/src/stat/cmpg1_4.cu @@ -0,0 +1,30 @@ +/** + * @file cmpg1_4.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_DESCRIPTION(Tliteral, T) \ + void thrustgpu_get_extrema_rawptr_T##Tliteral(T* d_ptr, size_t len, T res[4]) \ + { \ + psz::detail::thrustgpu_get_extrema_rawptr(d_ptr, len, res); \ + } \ + \ + template <> \ + void psz::thrustgpu_get_extrema_rawptr(T* d_ptr, size_t len, T res[4]) \ + { \ + thrustgpu_get_extrema_rawptr_T##Tliteral(d_ptr, len, res); \ + } + +THRUSTGPU_DESCRIPTION(fp32, float) + +#undef THRUSTGPU_DESCRIPTION \ No newline at end of file diff --git a/qtensor/compression/cusz/src/stat/cmpg1_5.cu b/qtensor/compression/cusz/src/stat/cmpg1_5.cu new file mode 100644 index 00000000..3b08e576 --- /dev/null +++ b/qtensor/compression/cusz/src/stat/cmpg1_5.cu @@ -0,0 +1,30 @@ +/** + * @file cmpg1_5.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_DESCRIPTION(Tliteral, T) \ + void thrustgpu_get_extrema_rawptr_T##Tliteral(T* d_ptr, size_t len, T res[4]) \ + { \ + psz::detail::thrustgpu_get_extrema_rawptr(d_ptr, len, res); \ + } \ + \ + template <> \ + void psz::thrustgpu_get_extrema_rawptr(T* d_ptr, size_t len, T res[4]) \ + { \ + thrustgpu_get_extrema_rawptr_T##Tliteral(d_ptr, len, res); \ + } + +THRUSTGPU_DESCRIPTION(fp64, double) + +#undef THRUSTGPU_DESCRIPTION \ No newline at end of file diff --git a/qtensor/compression/cusz/src/stat/cmpg2.cu b/qtensor/compression/cusz/src/stat/cmpg2.cu new file mode 100644 index 00000000..0ece52b5 --- /dev/null +++ b/qtensor/compression/cusz/src/stat/cmpg2.cu @@ -0,0 +1,34 @@ +/** + * @file cmp2g.cu + * @author Jiannan Tian + * @brief (split to speed up build process; part 2) + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_COMPARE_LOSSLESS(Tliteral, T) \ + bool thrustgpu_identical_T##Tliteral(T* d1, T* d2, size_t const len) \ + { \ + return psz::detail::thrustgpu_identical(d1, d2, len); \ + } \ + \ + template <> \ + bool psz::thrustgpu_identical(T * d1, T * d2, size_t const len) \ + { \ + return thrustgpu_identical_T##Tliteral(d1, d2, len); \ + } + +THRUSTGPU_COMPARE_LOSSLESS(fp32, float) +THRUSTGPU_COMPARE_LOSSLESS(fp64, double) +THRUSTGPU_COMPARE_LOSSLESS(ui8, uint8_t) +THRUSTGPU_COMPARE_LOSSLESS(ui16, uint16_t) +THRUSTGPU_COMPARE_LOSSLESS(ui32, uint32_t) + +#undef THRUSTGPU_COMPARE_LOSSLESS diff --git a/qtensor/compression/cusz/src/stat/cmpg3.cu b/qtensor/compression/cusz/src/stat/cmpg3.cu new file mode 100644 index 00000000..05c7af97 --- /dev/null +++ 
b/qtensor/compression/cusz/src/stat/cmpg3.cu @@ -0,0 +1,32 @@ +/** + * @file cmp3g.cu + * @author Jiannan Tian + * @brief (split to speed up build process; part 3) + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_COMPARE_LOSSY(Tliteral, T) \ + bool thrustgpu_error_bounded_T##Tliteral( \ + T* a, T* b, size_t const len, double const eb, size_t* first_faulty_idx = nullptr) \ + { \ + return psz::detail::thrustgpu_error_bounded(a, b, len, eb, first_faulty_idx); \ + } \ + \ + template <> \ + bool psz::thrustgpu_error_bounded(T * a, T * b, size_t const len, double const eb, size_t* first_faulty_idx) \ + { \ + return thrustgpu_error_bounded_T##Tliteral(a, b, len, eb, first_faulty_idx); \ + } + +THRUSTGPU_COMPARE_LOSSY(fp32, float); +THRUSTGPU_COMPARE_LOSSY(fp64, double); + +#undef THRUSTGPU_COMPARE_LOSSY diff --git a/qtensor/compression/cusz/src/stat/cmpg4_1.cu b/qtensor/compression/cusz/src/stat/cmpg4_1.cu new file mode 100644 index 00000000..b3e5edaf --- /dev/null +++ b/qtensor/compression/cusz/src/stat/cmpg4_1.cu @@ -0,0 +1,24 @@ +/** + * @file cmpg4_1.cu + * @author Jiannan Tian + * @brief (split to speed up build process; part 4) + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_ASSESS(Tliteral, T) \ + void thrustgpu_assess_quality_T##Tliteral(cusz_stats* s, T* xdata, T* odata, size_t const len) \ + { \ + psz::detail::thrustgpu_assess_quality(s, xdata, odata, len); \ + } + +THRUSTGPU_ASSESS(fp32, float); + +#undef THRUSTGPU_ASSESS diff --git a/qtensor/compression/cusz/src/stat/cmpg4_2.cu b/qtensor/compression/cusz/src/stat/cmpg4_2.cu new file mode 100644 index 00000000..7a62b06d --- /dev/null +++ b/qtensor/compression/cusz/src/stat/cmpg4_2.cu @@ -0,0 +1,25 @@ +/** + * @file cmpg4_2.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_ASSESS(Tliteral, T) \ + template <> \ + void psz::thrustgpu_assess_quality(cusz_stats * s, T * xdata, T * odata, size_t const len) \ + { \ + thrustgpu_assess_quality_T##Tliteral(s, xdata, odata, len); \ + } + +THRUSTGPU_ASSESS(fp32, float); + +#undef THRUSTGPU_ASSESS diff --git a/qtensor/compression/cusz/src/stat/cmpg4_3.cu b/qtensor/compression/cusz/src/stat/cmpg4_3.cu new file mode 100644 index 00000000..b9361bfb --- /dev/null +++ b/qtensor/compression/cusz/src/stat/cmpg4_3.cu @@ -0,0 +1,24 @@ +/** + * @file cmpg4_3.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_ASSESS(Tliteral, T) \ + void thrustgpu_assess_quality_T##Tliteral(cusz_stats* s, T* xdata, T* odata, size_t const len) \ + { \ + psz::detail::thrustgpu_assess_quality(s, xdata, odata, len); \ + } + +THRUSTGPU_ASSESS(fp64, double); + +#undef THRUSTGPU_ASSESS \ No newline at end of file diff --git a/qtensor/compression/cusz/src/stat/cmpg4_4.cu b/qtensor/compression/cusz/src/stat/cmpg4_4.cu new file mode 
100644 index 00000000..4df3919f --- /dev/null +++ b/qtensor/compression/cusz/src/stat/cmpg4_4.cu @@ -0,0 +1,25 @@ +/** + * @file cmpg4_4.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_ASSESS(Tliteral, T) \ + template <> \ + void psz::thrustgpu_assess_quality(cusz_stats * s, T * xdata, T * odata, size_t const len) \ + { \ + thrustgpu_assess_quality_T##Tliteral(s, xdata, odata, len); \ + } + +THRUSTGPU_ASSESS(fp64, double); + +#undef THRUSTGPU_ASSESS \ No newline at end of file diff --git a/qtensor/compression/cusz/src/stat/compare_cpu.cc b/qtensor/compression/cusz/src/stat/compare_cpu.cc new file mode 100644 index 00000000..c9432bb4 --- /dev/null +++ b/qtensor/compression/cusz/src/stat/compare_cpu.cc @@ -0,0 +1,43 @@ +/** + * @file _compare.cc + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-09 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_cpu.inl" +#include "stat/compare.h" + +#define CPPSTD_COMPARE_LOSSLESS(Tliteral, T) \ + bool cppstd_identical_T##Tliteral(T* d1, T* d2, size_t const len) \ + { \ + return psz::detail::cppstd_identical(d1, d2, len); \ + } + +#define CPPSTD_COMPARE_LOSSY(Tliteral, T) \ + bool cppstd_error_bounded_T##Tliteral( \ + T* a, T* b, size_t const len, double const eb, size_t* first_faulty_idx = nullptr) \ + { \ + return psz::detail::cppstd_error_bounded(a, b, len, eb, first_faulty_idx); \ + } \ + \ + void cppstd_assess_quality_T##Tliteral(cusz_stats* s, T* xdata, T* odata, size_t const len) \ + { \ + psz::detail::cppstd_assess_quality(s, xdata, odata, len); \ + } + +CPPSTD_COMPARE_LOSSLESS(fp32, float) +CPPSTD_COMPARE_LOSSLESS(fp64, double) +CPPSTD_COMPARE_LOSSLESS(ui8, uint8_t) +CPPSTD_COMPARE_LOSSLESS(ui16, uint16_t) +CPPSTD_COMPARE_LOSSLESS(ui32, uint32_t) + +CPPSTD_COMPARE_LOSSY(fp32, float) +CPPSTD_COMPARE_LOSSY(fp64, double) + +#undef CPPSTD_COMPARE_LOSSLESS +#undef CPPSTD_COMPARE_LOSSY diff --git a/qtensor/compression/cusz/src/stat/stat.cc b/qtensor/compression/cusz/src/stat/stat.cc new file mode 100644 index 00000000..e69de29b diff --git a/qtensor/compression/cusz/src/stat/stat_g.cu b/qtensor/compression/cusz/src/stat/stat_g.cu new file mode 100644 index 00000000..2fcc81c6 --- /dev/null +++ b/qtensor/compression/cusz/src/stat/stat_g.cu @@ -0,0 +1,96 @@ +/** + * @file stat_g.cu + * @author Cody Rivera, Jiannan Tian + * @brief Fast histogramming from [Gómez-Luna et al. 
2013], wrapper + * @version 0.3 + * @date 2022-11-02 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../kernel/detail/hist.inl" + +#include "cusz/type.h" +#include "stat/stat.h" +#include "stat/stat_g.hh" + +template +cusz_error_status asz::stat::histogram( + T* in_data, + size_t const in_len, + uint32_t* out_freq, + int const num_buckets, + float* milliseconds, + cudaStream_t stream) +{ + int device_id, max_bytes, num_SMs; + int items_per_thread, r_per_block, grid_dim, block_dim, shmem_use; + + cudaGetDevice(&device_id); + cudaDeviceGetAttribute(&num_SMs, cudaDevAttrMultiProcessorCount, device_id); + + auto query_maxbytes = [&]() { + int max_bytes_opt_in; + cudaDeviceGetAttribute(&max_bytes, cudaDevAttrMaxSharedMemoryPerBlock, device_id); + + // account for opt-in extra shared memory on certain architectures + cudaDeviceGetAttribute(&max_bytes_opt_in, cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id); + max_bytes = std::max(max_bytes, max_bytes_opt_in); + + // config kernel attribute + cudaFuncSetAttribute( + kernel::p2013Histogram, cudaFuncAttributeMaxDynamicSharedMemorySize, max_bytes); + }; + + auto optimize_launch = [&]() { + items_per_thread = 1; + r_per_block = (max_bytes / sizeof(int)) / (num_buckets + 1); + grid_dim = num_SMs; + // fits to size + block_dim = ((((in_len / (grid_dim * items_per_thread)) + 1) / 64) + 1) * 64; + while (block_dim > 1024) { + if (r_per_block <= 1) { block_dim = 1024; } + else { + r_per_block /= 2; + grid_dim *= 2; + block_dim = ((((in_len / (grid_dim * items_per_thread)) + 1) / 64) + 1) * 64; + } + } + shmem_use = ((num_buckets + 1) * r_per_block) * sizeof(int); + }; + + query_maxbytes(); + optimize_launch(); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + kernel::p2013Histogram<<>> // + (in_data, out_freq, in_len, num_buckets, r_per_block); + + STOP_CUDAEVENT_RECORDING(stream); + + cudaStreamSynchronize(stream); + TIME_ELAPSED_CUDAEVENT(milliseconds); + DESTROY_CUDAEVENT_PAIR; + + return CUSZ_SUCCESS; +} + +#define INIT_HIST_AND_C(Tname, T) \ + template cusz_error_status asz::stat::histogram(T*, size_t const, uint32_t*, int const, float*, cudaStream_t); \ + \ + cusz_error_status histogram_T##Tname( \ + T* in_data, size_t const in_len, uint32_t* out_freq, int const num_buckets, float* milliseconds, \ + cudaStream_t stream) \ + { \ + return asz::stat::histogram(in_data, in_len, out_freq, num_buckets, milliseconds, stream); \ + } + +INIT_HIST_AND_C(ui8, uint8_t) +INIT_HIST_AND_C(ui16, uint16_t) +INIT_HIST_AND_C(ui32, uint32_t) +INIT_HIST_AND_C(ui64, uint64_t) + +#undef INIT_HIST_AND_C \ No newline at end of file diff --git a/qtensor/compression/cusz/src/utils/dbg_print.cuh b/qtensor/compression/cusz/src/utils/dbg_print.cuh new file mode 100644 index 00000000..19334e2e --- /dev/null +++ b/qtensor/compression/cusz/src/utils/dbg_print.cuh @@ -0,0 +1,132 @@ +#ifndef UTILS_DBG_PRINT_CUH +#define UTILS_DBG_PRINT_CUH + +/** + * @file dbg_print.cuh + * @author Jiannan Tian + * @brief + * @version 0.2 + * @date 2020-09-20 + * Created on 2020-03-17 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +template +__global__ void print_deflated(Q* coded, size_t gid) +{ + if (blockIdx.x * blockDim.x + threadIdx.x != gid) return; + printf("print after deflating\n"); + // for_each(coded, coded + PART_SIZE, [](Q& i) { print_by_type(i, '_', '\n'); }); + for (size_t i = 0; i < PART_SIZE; i++) { 
print_by_type(*(coded + i), '_', '\n'); } + printf("\n"); +} + +template +__global__ void print_histogram(T* freq, size_t size, size_t radius = 20) +{ + const int DICT_SIZE = size; /* Dynamic sizing */ + if (blockIdx.x * blockDim.x + threadIdx.x == 0) { + for (size_t i = DICT_SIZE / 2 - radius; i < DICT_SIZE / 2 + radius; i++) { + if (i % 10 == 0) printf("\n"); + printf("%4lu: %-12lu", i, static_cast(freq[i])); + } + printf("\n"); + } +} + +template +__device__ __host__ void print_by_type(T num, char sep = '_', char ending = '\n') +{ + for (size_t j = 0; j < sizeof(T) * CHAR_BIT; j++) { + printf("%u", (num >> ((sizeof(T) * CHAR_BIT - 1) - j)) & 0x01u); + if (j != 0 and j != sizeof(T) * CHAR_BIT - 1 and j % 8 == 7) printf("%c", sep); + } + printf("%c", ending); +} + +// MSB to LSB +template +__device__ __host__ void print_code_only(T num, size_t bitwidth, char sep = '_', char ending = '\n') +{ + for (size_t j = 0; j < bitwidth; j++) { + printf("%u", (num >> ((bitwidth - 1) - j)) & 0x01u); + if (j != 0 and j != bitwidth - 1 and j % 8 == 7) printf("%c", sep); + } + printf("%c", ending); +} + +template +__device__ __host__ void snippet_print_bitset_full(T num) +{ + print_by_type(num, '_', '\t'); + size_t bitwidth = *((uint8_t*)&num + sizeof(T) - 1); + // size_t code_bitwidth = ((static_cast(0xffu) << (sizeof(T) * 8 - 8)) & num) >> (sizeof(T) * 8 - 8); + printf("len: %3lu\tcode: ", bitwidth); + print_code_only(num, bitwidth, '\0', '\n'); +} + +template +__global__ void print_codebook(T* codebook, size_t len) +{ + if (blockIdx.x * blockDim.x + threadIdx.x != 0) return; + printf("--------------------------------------------------------------------------------\n"); + printf("printing codebook\n"); + printf("--------------------------------------------------------------------------------\n"); + __shared__ T buffer; + for (size_t i = 0; i < len; i++) { + buffer = codebook[i]; + if (buffer == ~((T)0x0)) continue; + printf("%5lu\t", i); + snippet_print_bitset_full(buffer); + } + printf("--------------------------------------------------------------------------------\n"); + printf("done printing codebook\n"); + printf("--------------------------------------------------------------------------------\n"); +} + +template +__global__ void get_entropy(T* freq) +{ +} + +// TODO real GPU version +template +__global__ void get_theoretical_dense_Huffman_coded_length(T* codebook, Q* freq, size_t codebook_len) +{ +} + +// template +//__global__ void print_Huffman_coded_before_deflating(T* coded, size_t len=200) { +// if (blockIdx.x * blockDim.x + threadIdx.x != 0) return; +// printf("print Huffman coded before it is deflated\n"); +// for (size_t i = 0; i < 200; i++) { +// if (coded[i] == ~((T)0x0)) continue; +// printf("%5lu\t", i); +// snippet_print_bitset_full(coded[i]); +// } +// printf("\n"); +//} + +template +__global__ void print_Huffman_coded_before_deflating(T* coded, size_t len) +{ + if (blockIdx.x != 0) return; + size_t gid = blockDim.x * blockIdx.x + threadIdx.x; + if (coded[gid] == ~((T)0x0)) return; + printf("%5lu\t", gid); + snippet_print_bitset_full(coded[gid]); + + // if (coded[i] == ~((T)0x0)) continue; + // printf("print Huffman coded before it is deflated\n"); + // for (size_t i = 0; i < 200; i++) { + // if (coded[i] == ~((T)0x0)) continue; + // printf("%5lu\t", i); + // snippet_print_bitset_full(coded[i]); + // } + // printf("\n"); +} + +#endif \ No newline at end of file diff --git a/qtensor/compression/cusz/src/utils/print_gpu.cu b/qtensor/compression/cusz/src/utils/print_gpu.cu new file 
mode 100644 index 00000000..9fd20040 --- /dev/null +++ b/qtensor/compression/cusz/src/utils/print_gpu.cu @@ -0,0 +1,121 @@ +/** + * @file print_gpu.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-09-23 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +// #include "../detail/print_gpu.inl" +#include +#include +#include +#include "utils/print_gpu.h" +#include "utils/print_gpu.hh" + +#define PRINT_INT_LESS_THAN_64(Tliteral, T) \ + void peek_device_data_T##Tliteral(T* d_arr, size_t num, size_t offset) \ + { \ + thrust::for_each( \ + thrust::device, d_arr, d_arr + num, [=] __device__(const T i) { printf("%d\t", (int32_t)i); }); \ + printf("\n"); \ + } + +PRINT_INT_LESS_THAN_64(i8, int8_t) +PRINT_INT_LESS_THAN_64(i16, int16_t) +PRINT_INT_LESS_THAN_64(i32, int32_t) + +void peek_device_data_Ti64(int64_t* d_arr, size_t num, size_t offset) +{ + thrust::for_each(thrust::device, d_arr, d_arr + num, [=] __device__(const int64_t i) { printf("%ld\t", i); }); + printf("\n"); +} + +#define PRINT_UINT_LESS_THAN_64(Tliteral, T) \ + void peek_device_data_T##Tliteral(T* d_arr, size_t num, size_t offset) \ + { \ + thrust::for_each( \ + thrust::device, d_arr, d_arr + num, [=] __device__(const T i) { printf("%u\t", (uint32_t)i); }); \ + printf("\n"); \ + } + +PRINT_UINT_LESS_THAN_64(ui8, uint8_t) +PRINT_UINT_LESS_THAN_64(ui16, uint16_t) +PRINT_UINT_LESS_THAN_64(ui32, uint32_t) + +void peek_device_data_Tui64(uint64_t* d_arr, size_t num, size_t offset) +{ + thrust::for_each(thrust::device, d_arr, d_arr + num, [=] __device__(const uint64_t i) { printf("%lu\t", i); }); + printf("\n"); +} + +void peek_device_data_Tfp32(float* d_arr, size_t num, size_t offset) +{ + thrust::for_each(thrust::device, d_arr, d_arr + num, [=] __device__(const float i) { printf("%.7f\t", i); }); + printf("\n"); +} + +void peek_device_data_Tfp64(double* d_arr, size_t num, size_t offset) +{ + thrust::for_each(thrust::device, d_arr, d_arr + num, [=] __device__(const double i) { printf("%.7lf\t", i); }); + printf("\n"); +} + +template +void psz::peek_device_data(T* d_arr, size_t num, size_t offset) +{ + if (std::is_same::value) { // + peek_device_data_Ti8((int8_t*)d_arr, num, offset); + } + else if (std::is_same::value) { + peek_device_data_Ti16((int16_t*)d_arr, num, offset); + } + else if (std::is_same::value) { + peek_device_data_Ti32((int32_t*)d_arr, num, offset); + } + else if (std::is_same::value) { + peek_device_data_Ti64((int64_t*)d_arr, num, offset); + } + else if (std::is_same::value) { + peek_device_data_Tui8((uint8_t*)d_arr, num, offset); + } + else if (std::is_same::value) { + peek_device_data_Tui16((uint16_t*)d_arr, num, offset); + } + else if (std::is_same::value) { + peek_device_data_Tui32((uint32_t*)d_arr, num, offset); + } + else if (std::is_same::value) { + peek_device_data_Tui64((uint64_t*)d_arr, num, offset); + } + else if (std::is_same::value) { + peek_device_data_Tfp32((float*)d_arr, num, offset); + } + else if (std::is_same::value) { + peek_device_data_Tfp64((double*)d_arr, num, offset); + } + else { + std::runtime_error("peek_device_data cannot accept this type."); + } +} + +#define CPP_PEEK(Tliteral, T) template void psz::peek_device_data(T * d_arr, size_t num, size_t offset); + +CPP_PEEK(i8, int8_t); +CPP_PEEK(i16, int16_t); +CPP_PEEK(i32, int32_t); +CPP_PEEK(i64, int64_t); +CPP_PEEK(ui8, uint8_t); +CPP_PEEK(ui16, uint16_t); +CPP_PEEK(ui32, uint32_t); +CPP_PEEK(ui64, uint64_t); +CPP_PEEK(fp32, float); +CPP_PEEK(fp64, double); + +#undef CPP_PEEK + +#undef 
PRINT_INT_LESS_THAN_64 +#undef PRINT_UINT_LESS_THAN_64 diff --git a/qtensor/compression/cusz/src/utils/timer_cpu.cc b/qtensor/compression/cusz/src/utils/timer_cpu.cc new file mode 100644 index 00000000..3983bc0f --- /dev/null +++ b/qtensor/compression/cusz/src/utils/timer_cpu.cc @@ -0,0 +1,30 @@ +/** + * @file timer_cpu.cc + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-31 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "utils/timer.h" + +#include +#include + +using hires = std::chrono::high_resolution_clock; +using duration_t = std::chrono::duration; +using hires_clock_t = std::chrono::time_point; + +struct asz_timer { + hires_clock_t start, stop; +}; + +// cpu timer specific +asz_timer* asz_cputimer_create() { return new asz_timer; } +void asz_cputimer_destroy(asz_timer* t) { delete t; } +void asz_cputimer_start(asz_timer* t) { t->start = hires::now(); } +void asz_cputimer_end(asz_timer* t) { t->stop = hires::now(); } +double asz_cputime_elapsed(asz_timer* t) { return static_cast((t->stop) - (t->start)).count(); } diff --git a/qtensor/compression/cusz/src/utils/timer_gpu.cu b/qtensor/compression/cusz/src/utils/timer_gpu.cu new file mode 100644 index 00000000..a44ee4bf --- /dev/null +++ b/qtensor/compression/cusz/src/utils/timer_gpu.cu @@ -0,0 +1,82 @@ +/** + * @file timer_gpu.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-31 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include +#include +#include "utils/timer.h" + +typedef struct asz_cudatimer { + cudaEvent_t a, b; + float milliseconds; + cudaStream_t stream; + + asz_cudatimer() { create(); } + asz_cudatimer(cudaStream_t stream) + { + create(); + this->stream = stream; + } + + void create() + { + cudaEventCreate(&a); + cudaEventCreate(&b); + } + + void destroy() + { + cudaEventDestroy(a); + cudaEventDestroy(b); + } + + // stream not involved + void start() { cudaEventRecord(a); } + + void stop() + { + cudaEventRecord(b); + cudaEventSynchronize(b); + } + + // stream involved + void stream_start() + { + cudaEventRecord(a, stream); // set event as not occurred + } + + void stream_stop() + { + cudaEventRecord(b, stream); + cudaEventSynchronize(b); // block host until `stream` meets `stop` + } + + // get time + float time_elapsed() + { + cudaEventElapsedTime(&milliseconds, a, b); + std::cout << "milliseconds: " << milliseconds << std::endl; + return milliseconds; + } +} asz_cudatimer; + +// cuda timer specific +asz_cudatimer* asz_cudatimer_create() { return new asz_cudatimer{}; } +void asz_cudatimer_destroy(asz_cudatimer* t) { t->destroy(); } +void asz_cudatimer_start(asz_cudatimer* t) { t->start(); } +void asz_cudatimer_end(asz_cudatimer* t) { t->stop(); } +double asz_cudatime_elapsed(asz_cudatimer* t) { return t->time_elapsed() / 1000; } + +// cuda streamtimer specific +asz_cudatimer* asz_cudastreamtimer_create(void* stream) { return new asz_cudatimer((cudaStream_t)stream); } +void asz_cudastreamtimer_destroy(asz_cudatimer* t) { t->destroy(); } +void asz_cudastreamtimer_start(asz_cudatimer* t) { t->stream_start(); } +void asz_cudastreamtimer_end(asz_cudatimer* t) { t->stream_stop(); } +double asz_cudastreamtime_elapsed(asz_cudatimer* t) { return t->time_elapsed() / 1000; } diff --git a/qtensor/compression/cusz/src/utils/vis_stat.hh b/qtensor/compression/cusz/src/utils/vis_stat.hh new file mode 100644 index 00000000..60099138 --- /dev/null +++ b/qtensor/compression/cusz/src/utils/vis_stat.hh @@ -0,0 +1,137 @@ 
+#ifndef UTILS_VIS_STAT_HH +#define UTILS_VIS_STAT_HH + +/** + * @file vis_stat.hh + * @author Jiannan Tian + * @brief Analysis and visualization of datum. + * @version 0.1 + * @date 2020-09-20 + * Created on 2020-02-09 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +using std::cerr; +using std::cout; +using std::endl; +using std::tuple; + +template +double GetEntropy(T* code, size_t l, size_t cap = 1024) +{ + if (cap == 0) { + cerr << "wrong cap" << endl; + exit(-1); + } + auto arr = new size_t[cap](); + for (size_t i = 0; i < l; i++) arr[code[i]]++; + std::vector raw(arr, arr + cap); + std::vector frequencies; + std::copy_if(raw.begin(), raw.end(), std::back_inserter(frequencies), [](double& e) { return e != 0; }); + double entropy = 0; + for (auto freq : frequencies) { entropy += -(freq * 1.0 / l) * log2(freq * 1.0 / l); } + + // cout << "entropy:\t" << entropy << endl; + delete[] arr; + return entropy; +} + +// TODO automatically omit bins that are less than 1% +template +void VisualizeHistogram( + const std::string& tag, + T* _d_POD, + size_t l, + size_t _bins = 16, + bool log_freq = false, + double override_min = 0, + double override_max = 0, + bool eliminate_zeros = false, + bool use_scientific_notation = true) +{ + std::vector _d(_d_POD, _d_POD + l); + std::vector _d_nonzero; + // std::vector arr; + // arr.reserve(_bins); + // for (size_t i = 0; i< _bins; i++) arr.push_back(0); + auto arr = new size_t[_bins](); + + if (eliminate_zeros) { + std::copy_if(_d.begin(), _d.end(), std::back_inserter(_d_nonzero), [](int i) { return i != 0; }); + } + double Min = *std::min_element(_d.begin(), _d.end()); + double Max = *std::max_element(_d.begin(), _d.end()); + // double sum = std::accumulate(_d.begin(), _d.end(), 0); + double rng = Max - Min; + // double avg = sum / l; + + cout << "\e[7m[[" << tag << "]]\e[0m"; + if (override_max > override_min) { + cout << "zoom into " << override_min << "--" << override_max << endl; + std::tie(Max, Min, rng) = std::make_tuple(override_max, override_min, override_max - override_min); + } + double step = rng / _bins; + for (size_t i = 0; i < l; i++) arr[static_cast((_d[i] - Min) / step)]++; + std::vector _viz(arr, arr + _bins); + // std::vector _viz(arr); + + // visualization + printf("\tbins:\t%zu\tbin_width:\t%lf\n", _bins, step); + // printf("count:\t%zu\tmin:\t%lf\tmax:\t%lf\trng:\t%lf\n", l, Min, Max, rng); + cout << "count:\t" << l << "\t"; + cout << "min:\t" << Min << "\t"; + cout << "max:\t" << Max << "\t"; + cout << "rng:\t" << rng << endl; + + if (log_freq) { + cout << "using log_freq" << endl; + std::for_each(_viz.begin(), _viz.end(), [](size_t& n) { n = log2(n); }); + } + + size_t longest = *std::max_element(_viz.begin(), _viz.end()); + size_t bar_str_len = 64; // scale according to the longest + std::for_each(_viz.begin(), _viz.end(), [&](size_t& n) { + n = static_cast(n / static_cast(longest) * bar_str_len); + }); + + for (size_t i = 0; i < _bins; i++) { + // normalize to width + cout << "|" + << "\33[43m"; + + for (size_t j = 0; j < bar_str_len + 1; j++) { + if (j < _viz[i]) + cout << "-"; + else if (j == _viz[i]) + cout << "\33[0m" + << "+"; + else + cout << " "; + } + cout.precision(2); + cout << " "; + if (use_scientific_notation) cout << std::scientific; + cout << Min + i * step << " -- " << Min + (i + 1) * step; + cout << " "; + cout << 
std::setw((int)log10(l) + 2); + cout << arr[i]; + cout << " "; + cout << std::defaultfloat << std::setw(5) << arr[i] / static_cast(l) * 100 << "%" << endl; + } + cout << endl; + // delete[] arr; +} + +#endif diff --git a/qtensor/compression/szp/src/cuSZp_entry.cu b/qtensor/compression/szp/src/cuSZp_entry.cu index b92d4e41..a04d8348 100644 --- a/qtensor/compression/szp/src/cuSZp_entry.cu +++ b/qtensor/compression/szp/src/cuSZp_entry.cu @@ -76,7 +76,7 @@ void SZp_decompress_hostptr(float* decData, unsigned char* cmpBytes, size_t nbEl dim3 blockSize(bsize); dim3 gridSize(gsize); SZp_decompress_kernel<<>>(d_decData, d_cmpData, d_cmpOffset, d_flag, errorBound, nbEle); - + // Move data back to CPU. cudaMemcpy(decData, d_decData, sizeof(float)*nbEle, cudaMemcpyDeviceToHost); @@ -109,7 +109,7 @@ void SZp_compress_deviceptr(float* d_oriData, unsigned char* d_cmpBytes, size_t dim3 blockSize(bsize); dim3 gridSize(gsize); SZp_compress_kernel<<>>(d_oriData, d_cmpBytes, d_cmpOffset, d_flag, errorBound, nbEle); - + cudaDeviceSynchronize(); // Obtain compression ratio and move data back to CPU. cudaMemcpy(&glob_sync, d_cmpOffset+cmpOffSize-2, sizeof(unsigned int), cudaMemcpyDeviceToHost); *cmpSize = (size_t)glob_sync + (nbEle+31)/32; @@ -140,8 +140,8 @@ void SZp_decompress_deviceptr(float* d_decData, unsigned char* d_cmpBytes, size_ dim3 blockSize(bsize); dim3 gridSize(gsize); SZp_decompress_kernel<<>>(d_decData, d_cmpBytes, d_cmpOffset, d_flag, errorBound, nbEle); - + cudaDeviceSynchronize(); // Free memoy that is used. cudaFree(d_cmpOffset); cudaFree(d_flag); -} \ No newline at end of file +} diff --git a/qtensor/compression/szp/src/cuSZp_wrapper.cu b/qtensor/compression/szp/src/cuSZp_wrapper.cu index b71bda71..4d83f283 100644 --- a/qtensor/compression/szp/src/cuSZp_wrapper.cu +++ b/qtensor/compression/szp/src/cuSZp_wrapper.cu @@ -17,7 +17,7 @@ extern "C"{ cudaMalloc((void**)&d_finalCmpBytes, *outSize); cudaMemcpy(d_finalCmpBytes, d_cmpBytes, *outSize, cudaMemcpyDeviceToDevice); cudaFree(d_cmpBytes); - + //cudaFree(oriData); return d_finalCmpBytes; } @@ -30,7 +30,8 @@ extern "C"{ cudaStreamCreate(&stream); cudaMalloc((void**)&d_decData, sizeof(float)*nbEle); SZp_decompress_deviceptr(d_decData, cmpBytes, nbEle, cmpSize, errorBound, stream); - return d_decData; + cudaFree(cmpBytes); + return d_decData; } } diff --git a/qtensor/compression/szp/src/cuSZp_wrapper.py b/qtensor/compression/szp/src/cuSZp_wrapper.py index ef2d3272..6f4053ba 100644 --- a/qtensor/compression/szp/src/cuSZp_wrapper.py +++ b/qtensor/compression/szp/src/cuSZp_wrapper.py @@ -7,8 +7,8 @@ import torch from pathlib import Path -LIB_PATH = str(Path(__file__).parent/'libcuszp_wrapper.so') - +#LIB_PATH = str(Path(__file__).parent/'libcuszp_wrapper.so') +LIB_PATH = '/home/mkshah5/QTensor/qtensor/compression/szp/src/libcuszp_wrapper.so' # unsigned char* cuSZp_device_compress(float *oriData, size_t *outSize, float absErrBound, size_t nbEle){ def get_device_compress(): @@ -41,10 +41,10 @@ def cuszp_device_compress(oriData, absErrBound, nbEle,threshold): outSize = ctypes.pointer(variable) oriData = oriData.flatten() - ori_real = oriData.real - ori_imag = oriData.imag - oriData = cp.concatenate((ori_real, ori_imag)) - sample = oriData[::2] + #ori_real = oriData.real + #ori_imag = oriData.imag + #oriData = cp.concatenate((ori_real, ori_imag)) + #sample = oriData[::2] d = cp.amax(oriData) - cp.amin(oriData) @@ -58,11 +58,11 @@ def cuszp_device_compress(oriData, absErrBound, nbEle,threshold): s_1 = time.time() #print(cp.get_array_module(oriData)) 
truth_values = cp.absolute(oriData)<=threshold - oriData[truth_values] = 0.0 + #oriData[truth_values] = 0.0 truth_values = cp.invert(truth_values) - oriData = oriData[truth_values] + # oriData = oriData[truth_values] bitmap = truth_values - nbEle = oriData.shape[0] + nbEle = oriData.shape[0]*2 oriData_p = ctypes.cast(oriData.data.ptr, ctypes.POINTER(c_float)) @@ -72,13 +72,13 @@ def cuszp_device_compress(oriData, absErrBound, nbEle,threshold): mempool = cp.get_default_memory_pool() pinned_mempool = cp.get_default_pinned_memory_pool() - del oriData + #del oriData #print("tg and max time (s): "+str(time.time()-s_1)) #print("bitmap shape: "+str(bitmap.shape[0])) #print("percent nonzero bytes: "+str(bitmap[cp.nonzero(bitmap)].shape[0]/bitmap.shape[0])) #print("CR") - print((ori_nbEle*4)/(outSize[0] + bitmap.shape[0]/8)) + #print((ori_nbEle*4)/(outSize[0] + bitmap.shape[0]/8)) return (o_bytes,bitmap, absErrBound), outSize @@ -88,9 +88,9 @@ def cuszp_device_decompress(nbEle, cmpBytes, cmpSize, owner, dtype): #print("bitmap len:" +str(len(bitmap))) #print(nbEle) #tmp_nbEle = nbEle - tmp_nbEle = cp.count_nonzero(bitmap).item() + # tmp_nbEle = cp.count_nonzero(bitmap).item() # print(tmp_nbEle) - nbEle_p = ctypes.c_size_t(tmp_nbEle) + nbEle_p = ctypes.c_size_t(nbEle) # size_t nbEle, unsigned char* cmpBytes, size_t cmpSize, float errorBound newData = __cuszp_device_decompress(nbEle_p,cmpBytes, np.ulonglong(cmpSize), np.float32(absErrBound)) @@ -104,16 +104,16 @@ def cuszp_device_decompress(nbEle, cmpBytes, cmpSize, owner, dtype): # -- pointer_for_free = decompressed_int.value # self.decompressed_own.append(decompressed_int.value) - mem = cp.cuda.UnownedMemory(decompressed_int.value, tmp_nbEle, owner, device_id=0) + mem = cp.cuda.UnownedMemory(decompressed_int.value, nbEle, owner, device_id=0) mem_ptr = cp.cuda.memory.MemoryPointer(mem, 0) #print("mem ptr") #print(mem_ptr) - arr = cp.ndarray(shape=tmp_nbEle, dtype=cp.float32, memptr=mem_ptr) + arr = cp.ndarray(shape=nbEle, dtype=cp.float32, memptr=mem_ptr) # print("attempt alloc") - res = cp.zeros(nbEle,dtype=cp.float32) + # res = cp.zeros(nbEle,dtype=cp.float32) # print("alloc passed") ## need to convert newData to cupy - cp.putmask(res,bitmap,arr) + # cp.putmask(res,bitmap,arr) mempool = cp.get_default_memory_pool() pinned_mempool = cp.get_default_pinned_memory_pool() #del arr @@ -121,17 +121,17 @@ def cuszp_device_decompress(nbEle, cmpBytes, cmpSize, owner, dtype): #print(res[0]) #print(res[int(nbEle/2)]) #reshaped_data = arr.reshape(-1,2) - reshaped_data = res.reshape(-1,2) - - c_res = reshaped_data.view(dtype=cp.complex64) + reshaped_data = arr.reshape(-1,2) + #c_res = arr + c_res = reshaped_data.view(dtype=np.complex64) #print(c_res[0]) #c_res = cp.zeros(int(nbEle/2), np.complex64) #c_res.real = res[0:int(nbEle/2)] #c_res.imag = res[int(nbEle/2):] #del res - del bitmap - mempool.free_all_blocks() - pinned_mempool.free_all_blocks() + #del bitmap + #mempool.free_all_blocks() + #pinned_mempool.free_all_blocks() return (c_res, pointer_for_free) @@ -173,17 +173,18 @@ def __init__(self): # variable = ctypes.c_size_t(0) # outSize = ctypes.pointer(variable) - s_time = time.time() - o_bytes, outSize = cuszp_device_compress(in_vector_gpu, r2r_error, DATA_SIZE,r2r_threshold) - print("Time python: "+str(time.time()-s_time)) - print(outSize[0]) - print("Compress Success...starting decompress ") - comp = Comp() - - s_time = time.time() - (d_bytes,ptr )= cuszp_device_decompress(DATA_SIZE, o_bytes,outSize[0], comp, in_vector_gpu.dtype) + for i in range(30): + 
s_time = time.time() + o_bytes, outSize = cuszp_device_compress(in_vector_gpu, r2r_error, DATA_SIZE,r2r_threshold) + print("Time python: "+str(time.time()-s_time)) + print(outSize[0]) + print("Compress Success...starting decompress ") + comp = Comp() + + s_time = time.time() + (d_bytes,ptr )= cuszp_device_decompress(DATA_SIZE, o_bytes,outSize[0], comp, in_vector_gpu.dtype) - print("Time python: "+str(time.time()-s_time)) + print("Time python: "+str(time.time()-s_time)) #for i in d_bytes: # print(i) - print("Decompress Success") + print("Decompress Success") From 87184a95904e01a071fc9609630c367cdfd63c5a Mon Sep 17 00:00:00 2001 From: Milan Kartik Shah Date: Wed, 7 Jun 2023 12:53:51 -0400 Subject: [PATCH 086/126] Added PyTorch based lossy compressor --- qtensor/compression/CompressedTensor.py | 308 +++++++++--------- qtensor/compression/Compressor.py | 25 +- qtensor/compression/compressed_contraction.py | 3 +- .../compression/torch_quant/torch_quant.py | 139 ++++++++ qtensor/contraction_backends/compression.py | 256 +++++++-------- 5 files changed, 440 insertions(+), 291 deletions(-) create mode 100644 qtensor/compression/torch_quant/torch_quant.py diff --git a/qtensor/compression/CompressedTensor.py b/qtensor/compression/CompressedTensor.py index 3f9181d2..08a0c390 100644 --- a/qtensor/compression/CompressedTensor.py +++ b/qtensor/compression/CompressedTensor.py @@ -1,154 +1,154 @@ -import itertools -import numpy as np -from qtree.optimizer import Tensor -from qtree.system_defs import NP_ARRAY_TYPE -from .Compressor import NumpyCompressor, Compressor - -def iterate_indices(indices: list): - if len(indices)==0: - return [tuple()] - ranges = [range(v.size) for v in indices] - return itertools.product(*ranges) - - -class CompressedTensor(Tensor): - """ - Extension of the Tensor class that holds compressed data - - The data array is split along several indices S into 2^|S| parts - - """ - def __init__(self, name, indices, - data_key=None, data=None, - slice_indices=[], - compressor:Compressor=NumpyCompressor() - ): - """ - Initialize the tensor - name: str, - the name of the tensor. Used only for display/convenience. - May be not unique. - indices: tuple, - Indices of the tensor - shape: tuple, - shape of a tensor - data_key: int - Key to find tensor's data in the global storage - data: np.array - Actual data of the tensor. Default None. - Usually is not supplied at initialization. 
- slice_indices: list[Var] - indices along which the tensor is split into chunks - """ - super().__init__(name, indices, data_key=data_key, data=data) - self.slice_indices = slice_indices - self.compressor = compressor - if data is not None: - self._dtype = data.dtype - else: - self._dtype = None - - @classmethod - def empty(cls, name, indices, slice_indices=[], compressor=NumpyCompressor(), dtype:type=NP_ARRAY_TYPE): - t = super().empty(name, indices, dtype) - t.compressor = compressor - if slice_indices: - t.compress_indices(slice_indices) - return t - - def compress_indices(self, indices: list): - """ - Slice the self.data along dimensions in `indices`, - store them compressed - - Does not support compressing when already compressed - """ - slice_dict = { - i: slice(None) for i in self.indices - } - data_chunks = [] - for ivals in iterate_indices(indices): - for ix, ival in zip(indices, ivals): - slice_dict[ix] = ival# slice(ival, ival+1) - dslice = self.data[tuple(slice_dict[i] for i in self.indices)] - - data_chunks.append( - self.compressor.compress(dslice) - ) - del dslice - self._data = data_chunks - self.slice_indices = indices - - @property - def dtype(self): - """ - DataType of wrapped chunks. - """ - return self._dtype - - @property - def array_indices(self): - return [x for x in self.indices if x not in self.slice_indices] - - def get_chunk(self, ivals): - dims = [v.size for v in self.slice_indices] - if len(ivals)==0: - flat_ix = 0 - else: - flat_ix = np.ravel_multi_index(ivals, dims) - ptr = self._data[flat_ix] - return self.compressor.decompress(ptr) - - def set_chunk(self, ivals, chunk: np.ndarray): - # -- Check for consistent data types between chunks - if self._dtype is None: - self._dtype = chunk.dtype - else: - assert self.dtype == chunk.dtype, f"Chunk dtype {chunk.dtype} does not match tensor dtype {self.dtype}" - # -- - - if self._data is None: - self._data = np.empty(2**len(self.slice_indices), dtype=object) - dims = [v.size for v in self.slice_indices] - if len(ivals)==0: - flat_ix = 0 - else: - flat_ix = np.ravel_multi_index(ivals, dims) - self._data[flat_ix] = self.compressor.compress(chunk) - - def __getitem__(self, key): - """ - Get a slice of the tensor along the indices in `key` - Currently slicing over all compressed indices is required. 
- Slices over compressed indices must be ints - """ - slices_ints, new_indices = self._parse_getitem_key(key) - slice_dict = {} - chunk_slices_ints = [] - compression_ints = [] - for ix, ival in zip(self.indices, slices_ints): - slice_dict[ix] = ival - if ix in self.slice_indices: - compression_ints.append(ival) - else: - chunk_slices_ints.append(ival) - chunk = self.get_chunk(compression_ints) - new_name = f"{self.name}[sliced]" - # careful: chunk will not be collected even if slice is small - chunk_slice = chunk[tuple(chunk_slices_ints)] - return Tensor(new_name, new_indices, data=chunk_slice) - - - def __str__(self): - array_ix = ','.join(map(str, self.array_indices)) - split_ix= ','.join(map(str, self.slice_indices)) - return f'{self._name}{{{split_ix}}}({array_ix})' - - def copy(self, name=None, indices=None, data_key=None, data=None): - raise NotImplementedError() - - def __repr__(self): - return self.__str__() - - - +import itertools +import numpy as np +from qtree.optimizer import Tensor +from qtree.system_defs import NP_ARRAY_TYPE +from .Compressor import NumpyCompressor, Compressor + +def iterate_indices(indices: list): + if len(indices)==0: + return [tuple()] + ranges = [range(v.size) for v in indices] + return itertools.product(*ranges) + + +class CompressedTensor(Tensor): + """ + Extension of the Tensor class that holds compressed data + + The data array is split along several indices S into 2^|S| parts + + """ + def __init__(self, name, indices, + data_key=None, data=None, + slice_indices=[], + compressor:Compressor=NumpyCompressor() + ): + """ + Initialize the tensor + name: str, + the name of the tensor. Used only for display/convenience. + May be not unique. + indices: tuple, + Indices of the tensor + shape: tuple, + shape of a tensor + data_key: int + Key to find tensor's data in the global storage + data: np.array + Actual data of the tensor. Default None. + Usually is not supplied at initialization. + slice_indices: list[Var] + indices along which the tensor is split into chunks + """ + super().__init__(name, indices, data_key=data_key, data=data) + self.slice_indices = slice_indices + self.compressor = compressor + if data is not None: + self._dtype = data.dtype + else: + self._dtype = None + + @classmethod + def empty(cls, name, indices, slice_indices=[], compressor=NumpyCompressor(), dtype:type=NP_ARRAY_TYPE): + t = super().empty(name, indices, dtype) + t.compressor = compressor + if slice_indices: + t.compress_indices(slice_indices) + return t + + def compress_indices(self, indices: list): + """ + Slice the self.data along dimensions in `indices`, + store them compressed + + Does not support compressing when already compressed + """ + slice_dict = { + i: slice(None) for i in self.indices + } + data_chunks = [] + for ivals in iterate_indices(indices): + for ix, ival in zip(indices, ivals): + slice_dict[ix] = ival# slice(ival, ival+1) + dslice = self.data[tuple(slice_dict[i] for i in self.indices)] + + data_chunks.append( + self.compressor.compress(dslice) + ) + del dslice + self._data = data_chunks + self.slice_indices = indices + + @property + def dtype(self): + """ + DataType of wrapped chunks. 
+ """ + return self._dtype + + @property + def array_indices(self): + return [x for x in self.indices if x not in self.slice_indices] + + def get_chunk(self, ivals): + dims = [v.size for v in self.slice_indices] + if len(ivals)==0: + flat_ix = 0 + else: + flat_ix = np.ravel_multi_index(ivals, dims) + ptr = self._data[flat_ix] + return self.compressor.decompress(ptr) + + def set_chunk(self, ivals, chunk: np.ndarray): + # -- Check for consistent data types between chunks + if self._dtype is None: + self._dtype = chunk.dtype + else: + assert self.dtype == chunk.dtype, f"Chunk dtype {chunk.dtype} does not match tensor dtype {self.dtype}" + # -- + + if self._data is None: + self._data = np.empty(2**len(self.slice_indices), dtype=object) + dims = [v.size for v in self.slice_indices] + if len(ivals)==0: + flat_ix = 0 + else: + flat_ix = np.ravel_multi_index(ivals, dims) + self._data[flat_ix] = self.compressor.compress(chunk) + + def __getitem__(self, key): + """ + Get a slice of the tensor along the indices in `key` + Currently slicing over all compressed indices is required. + Slices over compressed indices must be ints + """ + slices_ints, new_indices = self._parse_getitem_key(key) + slice_dict = {} + chunk_slices_ints = [] + compression_ints = [] + for ix, ival in zip(self.indices, slices_ints): + slice_dict[ix] = ival + if ix in self.slice_indices: + compression_ints.append(ival) + else: + chunk_slices_ints.append(ival) + chunk = self.get_chunk(compression_ints) + new_name = f"{self.name}[sliced]" + # careful: chunk will not be collected even if slice is small + chunk_slice = chunk[tuple(chunk_slices_ints)] + return Tensor(new_name, new_indices, data=chunk_slice) + + + def __str__(self): + array_ix = ','.join(map(str, self.array_indices)) + split_ix= ','.join(map(str, self.slice_indices)) + return f'{self._name}{{{split_ix}}}({array_ix})' + + def copy(self, name=None, indices=None, data_key=None, data=None): + raise NotImplementedError() + + def __repr__(self): + return self.__str__() + + + diff --git a/qtensor/compression/Compressor.py b/qtensor/compression/Compressor.py index 49d2f9d9..bb924819 100644 --- a/qtensor/compression/Compressor.py +++ b/qtensor/compression/Compressor.py @@ -5,16 +5,19 @@ print(Path(__file__).parent/'szx/src/') sys.path.append(str(Path(__file__).parent/'szx/src/')) sys.path.append('./szx/src') -sys.path.append(str(Path(__file__).parent/'szp/src/')) -sys.path.append('./szp/src') +# sys.path.append(str(Path(__file__).parent/'szp/src/')) +# sys.path.append('./szp/src') -sys.path.append(str(Path(__file__).parent/'cusz/src')) -sys.path.append('./cusz/src') +# sys.path.append(str(Path(__file__).parent/'cusz/src')) +# sys.path.append('./cusz/src') +sys.path.append(str(Path(__file__).parent/'torch_quant')) +sys.path.append('./torch_quant') try: from cuszx_wrapper import cuszx_host_compress, cuszx_host_decompress, cuszx_device_compress, cuszx_device_decompress - from cuSZp_wrapper import cuszp_device_compress, cuszp_device_decompress - from cusz_wrapper import cusz_device_compress, cusz_device_decompress + # from cuSZp_wrapper import cuszp_device_compress, cuszp_device_decompress + # from cusz_wrapper import cusz_device_compress, cusz_device_decompress + from torch_quant import quant_device_compress, quant_device_decompress except: print("import failed") # Silently fail on missing build of cuszx @@ -119,6 +122,8 @@ def free_decompressed(self): import cupy print("Cleanup", len(self.decompressed_own)) for x in self.decompressed_own: + if x == None: + continue print("CUDA 
Free", x) cupy.cuda.runtime.free(x) self.decompressed_own = [] @@ -146,7 +151,9 @@ def compress(self, data): dtype = data.dtype cmp_bytes, outSize_ptr = self.cuszx_compress(isCuPy, data, num_elements_eff, self.r2r_error, self.r2r_threshold) - return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr.contents.value) + return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr) + + # return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr.contents.value) def compress_size(self, ptr): return ptr[5] @@ -190,7 +197,7 @@ def cuszx_compress(self, isCuPy, data, num_elements, r2r_error, r2r_threshold): else: #cmp_bytes, outSize_ptr = cuszp_device_compress(data, r2r_error, num_elements, r2r_threshold) - cmp_bytes, outSize_ptr = cusz_device_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) + cmp_bytes, outSize_ptr = quant_device_compress(data, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) return cmp_bytes, outSize_ptr ### Decompression API with cuSZx ### @@ -209,5 +216,5 @@ def cuszx_decompress(self, isCuPy, cmp_bytes, cmpsize, num_elements, owner, dtyp else: #decompressed_data = cuszp_device_decompress(num_elements, cmp_bytes, cmpsize, owner,dtype) # oriData, absErrBound, nbEle, blockSize,threshold - decompressed_data = cusz_device_decompress(num_elements, cmp_bytes, owner,dtype) + decompressed_data = quant_device_decompress(num_elements, cmp_bytes, owner,dtype) return decompressed_data diff --git a/qtensor/compression/compressed_contraction.py b/qtensor/compression/compressed_contraction.py index eb6ee0c2..041eaf27 100644 --- a/qtensor/compression/compressed_contraction.py +++ b/qtensor/compression/compressed_contraction.py @@ -40,7 +40,8 @@ def contract_two_tensors(A, B, T_out, einsum=np.einsum): result_ints = [relabel_dict_int[int(i)] for i in result_indices] else: result_ints = list(map(int, result_indices)) - + print(A.data.shape) + print(B.data.shape) out = einsum(A.data, A_ints, B.data, B_ints, result_ints) if len(result_ints)>0: # This copying is reqiured because cupy doesn't support `out` argument. 
diff --git a/qtensor/compression/torch_quant/torch_quant.py b/qtensor/compression/torch_quant/torch_quant.py new file mode 100644 index 00000000..09c1b9d5 --- /dev/null +++ b/qtensor/compression/torch_quant/torch_quant.py @@ -0,0 +1,139 @@ +import numpy as np +import ctypes +from ctypes import * +import random +from qtensor.tools.lazy_import import cupy as cp +import time +import torch + +from pathlib import Path + + + +def quant_device_compress(oriData, nbEle, blockSize,threshold): + #print(nbEle) + ori_nbEle = nbEle + variable = ctypes.c_size_t(0) + outSize = ctypes.pointer(variable) + + oriData = oriData.flatten() + ori_real = oriData.real + ori_imag = oriData.imag + oriData = cp.concatenate((ori_real, ori_imag)) + sample = oriData[::2] + max_val = cp.amax(oriData).get() + min_val = cp.amin(oriData).get() + d = max_val - min_val + if d.dtype == np.complex64: + d = d.real + threshold = threshold*(d) + s_1 = time.time() + truth_values = abs(oriData)<=threshold + oriData[truth_values] = 0.0 + print("Percent nonzero: "+str(cp.count_nonzero(oriData)/oriData.shape[0])) + + nbEle = oriData.shape[0] + + # oriData = cp.reshape(oriData, (-1, blockSize)) # Reshape to blocksize + tensor = torch.as_tensor(oriData, device='cuda') + print("Min val: "+str(min_val)+" range: "+str(d)) + zero_point = int((min_val/d)*127) + + q_tensor = torch.quantize_per_tensor(tensor, 0.1, zero_point, dtype=torch.qint8) + + return (q_tensor), nbEle/4 + + +def quant_device_decompress(nbEle, cmpBytes, owner, dtype): + (q_tensor) = cmpBytes + restored = torch.dequantize(q_tensor) + arr = cp.asarray(restored) + # uint8_t* cmpbytes, size_t len, size_t compressed_len, float r2r_error + + # decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, num_elements_eff) + # -- Workaround to convert GPU pointer to int + # p_decompressed_ptr = ctypes.addressof(newData) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + # p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + # decompressed_int = p_decompressed_int.contents + # # -- + # pointer_for_free = decompressed_int.value + # # self.decompressed_own.append(decompressed_int.value) + # mem = cp.cuda.UnownedMemory(decompressed_int.value, nbEle*4, owner, device_id=0) + # mem_ptr = cp.cuda.memory.MemoryPointer(mem, 0) + #print("mem ptr") + #print(mem_ptr) + # arr = cp.ndarray(shape=(nbEle,), dtype=np.float32, memptr=mem_ptr) + + # res = cp.zeros((nbEle,)) + # ## need to convert newData to cupy + # cp.place(res,bitmap,arr) + + c_res = cp.zeros(int(nbEle/2), np.complex64) + c_res.real = arr[0:int(nbEle/2)] + c_res.imag = arr[int(nbEle/2):] + return (c_res, None) + +### Example of device compress/decompress wrapper usage +class Comp(): + def __init__(self): + self.name = "dummy" + +def free_compressed(ptr): + p_ptr = ctypes.addressof(ptr) + p_int = ctypes.cast(p_ptr, ctypes.POINTER(ctypes.c_uint64)) + decomp_int = p_int.contents + cp.cuda.runtime.free(decomp_int.value) + + +if __name__ == "__main__": + + DATA_SIZE = int(1024) + MAX_D = 10.0 + MIN_D = -10.0 + RANGE = MAX_D - MIN_D + r2r_threshold = 0.002 + r2r_error = 0.0001 + + in_vector = np.fromfile("all_sample.bin", dtype=np.complex64) + #print(np.max(in_vector)) + DATA_SIZE = len(in_vector) + #range_vr = np.max(in_vector)-np.min(in_vector) + #r2r_threshold = r2r_threshold*range_vr + #r2r_error = r2r_error*range_vr + #in_vector = np.zeros((DATA_SIZE,)) + #for i in range(0,int(DATA_SIZE/4)): + # in_vector[i] = 0.0 + #for i in range(int(DATA_SIZE/4), 
int(2*DATA_SIZE/4)): + # in_vector[i] = 5.0 + #for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): + # in_vector[i] = random.uniform(MIN_D, MAX_D) + #for i in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): + # in_vector[i] = -7.0 + #for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): + # in_vector[i] = 0.001 + + print(DATA_SIZE) + #in_vector = in_vector.astype('float32') + in_vector_gpu = cp.asarray(in_vector) + + # variable = ctypes.c_size_t(0) + # outSize = ctypes.pointer(variable) + for i in range(200): + s_time = time.time() + o_bytes, outSize = quant_device_compress(in_vector_gpu, DATA_SIZE, 256, r2r_threshold) + print("Time python: "+str(time.time()-s_time)) + # print(outSize[0]) + print("Compress Success...starting decompress ") + comp = Comp() + + s_time = time.time() + (d_bytes,ptr )= quant_device_decompress(DATA_SIZE*2, o_bytes, comp, in_vector_gpu.dtype) + + # free_compressed(o_bytes[0]) + # cp.cuda.runtime.free(ptr) + print("Time python: "+str(time.time()-s_time)) + #for i in d_bytes: + # print(i) + print("Decompress Success") diff --git a/qtensor/contraction_backends/compression.py b/qtensor/contraction_backends/compression.py index 6bc09558..c7f6bbf0 100644 --- a/qtensor/contraction_backends/compression.py +++ b/qtensor/contraction_backends/compression.py @@ -1,127 +1,129 @@ -from qtensor.contraction_backends import ContractionBackend -from qtensor.compression import Compressor, CompressedTensor, Tensor -from qtensor.compression.compressed_contraction import compressed_contract, compressed_sum -from qtensor.contraction_backends.common import slice_numpy_tensor -from qtree.optimizer import Tensor - -class CompressionBackend(ContractionBackend): - """ - Compression bucket contraction backend. - - This backend "decorates" another backend, by using compression in - pairwise contraction. If the result tensor has more than `max_tw` indices, - it is sliced and the contraction result is compressed before proceeding to - next slice. - """ - def __init__(self, backend, compressor:Compressor, max_tw:int): - """ - Arguments: - backend: the backend to use for contraction - compressor: the compressor to use for compression - max_tw: threshold for triggering compression. - - """ - self.backend = backend - self.compressor = compressor - self.max_tw = max_tw - - def _get_backend_specific_fns(self, backend): - ## Hacky way to extend backends - if 'cupy' in backend.__class__.__name__.lower(): - import cupy as cp - return cp.einsum, cp.array - elif 'torch' in backend.__class__.__name__.lower(): - import torch - return torch.einsum, torch.tensor - else: - import numpy as np - return np.einsum, lambda x: x - - def process_bucket(self, bucket, no_sum=False): - """ - Process a bucket. 
- - This uses `self.backend.process_bucket` in combination with - compression.compressed_contraction.compressed_contract - """ - ctr_kw = dict(zip(['einsum', 'move_data'], self._get_backend_specific_fns(self.backend))) - bucket.sort(key=lambda x: len(x.indices)) - print("Processing bucket", bucket) - accum = bucket[0] - for t in bucket[1:-1]: - accum = compressed_contract( - accum, t, [], self.max_tw, self.compressor, - **ctr_kw - ) - if len(bucket)>1: - t = bucket[-1] - total_ixs = sorted( - set().union(*[t.indices, accum.indices]) - , key=int, reverse=True - ) - accum_new = compressed_contract( - accum, t, [total_ixs[-1]], self.max_tw, self.compressor - ,**ctr_kw - ) - # free data - import cupy - for t in [accum, t]: - if isinstance(t, CompressedTensor): - for c in t.data: - cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = c - import ctypes - p_decompressed_ptr = ctypes.addressof(cmp_bytes[0]) - # cast to int64 pointer - # (effectively converting pointer to pointer to addr to pointer to int64) - p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) - decompressed_int = p_decompressed_int.contents - print("Freeing mem", decompressed_int.value) - cupy.cuda.runtime.free(decompressed_int.value) - t.compressor.compressor.free_decompressed() - #raise ValueError("Done") - else: - #print("PTR", t.data.data.ptr) - #cupy.cuda.runtime.free(t.data.data.ptr) - pass - - accum = accum_new - - return accum - else: - if len(accum.indices) < 1: - return accum - indices = (accum.indices[-1], ) - res = compressed_sum(accum, indices, self.compressor, self.max_tw, **ctr_kw) - if isinstance(accum, CompressedTensor): - import cupy - for c in accum.data: - cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = c - import ctypes - p_decompressed_ptr = ctypes.addressof(cmp_bytes[0]) - # cast to int64 pointer - # (effectively converting pointer to pointer to addr to pointer to int64) - p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) - decompressed_int = p_decompressed_int.contents - print("Freeing mem", decompressed_int.value) - cupy.cuda.runtime.free(decompressed_int.value) - accum.compressor.compressor.free_decompressed() - return res - - def get_sliced_buckets(self, buckets, data_dict, slice_dict): - """ - Slice buckets accounding to `slice_dict` - - This delegates to `self.backend`, assuming that buckets don't have - tensors with more than `self.max_tw` indices. - """ - # Note: to support large tensors (more than `max_tw`), - # just iterate through sliced bucket tensors and compress if needed - return self.backend.get_sliced_buckets(buckets, data_dict, slice_dict) - - def get_result_data(self, result): - """ - Get result data from `result` tensor. - - This assumes that the result has at most `self.max_tw` indices. - """ - return self.backend.get_result_data(result) +from qtensor.contraction_backends import ContractionBackend +from qtensor.compression import Compressor, CompressedTensor, Tensor +from qtensor.compression.compressed_contraction import compressed_contract, compressed_sum +from qtensor.contraction_backends.common import slice_numpy_tensor +from qtree.optimizer import Tensor + +class CompressionBackend(ContractionBackend): + """ + Compression bucket contraction backend. + + This backend "decorates" another backend, by using compression in + pairwise contraction. If the result tensor has more than `max_tw` indices, + it is sliced and the contraction result is compressed before proceeding to + next slice. 
+ """ + def __init__(self, backend, compressor:Compressor, max_tw:int): + """ + Arguments: + backend: the backend to use for contraction + compressor: the compressor to use for compression + max_tw: threshold for triggering compression. + + """ + self.backend = backend + self.compressor = compressor + self.max_tw = max_tw + + def _get_backend_specific_fns(self, backend): + ## Hacky way to extend backends + if 'cupy' in backend.__class__.__name__.lower(): + import cupy as cp + return cp.einsum, cp.array + elif 'torch' in backend.__class__.__name__.lower(): + import torch + return torch.einsum, torch.tensor + else: + import numpy as np + return np.einsum, lambda x: x + + def process_bucket(self, bucket, no_sum=False): + """ + Process a bucket. + + This uses `self.backend.process_bucket` in combination with + compression.compressed_contraction.compressed_contract + """ + ctr_kw = dict(zip(['einsum', 'move_data'], self._get_backend_specific_fns(self.backend))) + bucket.sort(key=lambda x: len(x.indices)) + print("Processing bucket", bucket) + accum = bucket[0] + for t in bucket[1:-1]: + accum = compressed_contract( + accum, t, [], self.max_tw, self.compressor, + **ctr_kw + ) + if len(bucket)>1: + t = bucket[-1] + total_ixs = sorted( + set().union(*[t.indices, accum.indices]) + , key=int, reverse=True + ) + accum_new = compressed_contract( + accum, t, [total_ixs[-1]], self.max_tw, self.compressor + ,**ctr_kw + ) + # free data + import cupy + for t in [accum, t]: + if isinstance(t, CompressedTensor): + for c in t.data: + cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = c + del cmp_bytes + # import ctypes + # p_decompressed_ptr = ctypes.addressof(cmp_bytes[0]) + # # cast to int64 pointer + # # (effectively converting pointer to pointer to addr to pointer to int64) + # p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + # decompressed_int = p_decompressed_int.contents + # print("Freeing mem", decompressed_int.value) + # cupy.cuda.runtime.free(decompressed_int.value) + t.compressor.compressor.free_decompressed() + #raise ValueError("Done") + else: + #print("PTR", t.data.data.ptr) + #cupy.cuda.runtime.free(t.data.data.ptr) + pass + + accum = accum_new + + return accum + else: + if len(accum.indices) < 1: + return accum + indices = (accum.indices[-1], ) + res = compressed_sum(accum, indices, self.compressor, self.max_tw, **ctr_kw) + if isinstance(accum, CompressedTensor): + import cupy + for c in accum.data: + cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = c + del cmp_bytes + #import ctypes + #p_decompressed_ptr = ctypes.addressof(cmp_bytes[0]) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + #p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + #decompressed_int = p_decompressed_int.contents + #print("Freeing mem", decompressed_int.value) + #cupy.cuda.runtime.free(decompressed_int.value) + accum.compressor.compressor.free_decompressed() + return res + + def get_sliced_buckets(self, buckets, data_dict, slice_dict): + """ + Slice buckets accounding to `slice_dict` + + This delegates to `self.backend`, assuming that buckets don't have + tensors with more than `self.max_tw` indices. + """ + # Note: to support large tensors (more than `max_tw`), + # just iterate through sliced bucket tensors and compress if needed + return self.backend.get_sliced_buckets(buckets, data_dict, slice_dict) + + def get_result_data(self, result): + """ + Get result data from `result` tensor. 
+ + This assumes that the result has at most `self.max_tw` indices. + """ + return self.backend.get_result_data(result) From 6a5d47238d8ca3251578839f46aa55bab1e7c782 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Mon, 12 Jun 2023 14:19:25 -0500 Subject: [PATCH 087/126] torch transpose changes --- qtensor/contraction_backends/torch.py | 44 ++++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/qtensor/contraction_backends/torch.py b/qtensor/contraction_backends/torch.py index 2180be40..3df1bf16 100644 --- a/qtensor/contraction_backends/torch.py +++ b/qtensor/contraction_backends/torch.py @@ -3,7 +3,7 @@ import numpy as np from qtree import np_framework from qtensor.contraction_backends import ContractionBackend -from .common import slice_numpy_tensor, get_einsum_expr +from .common import get_slice_bounds, get_einsum_expr, slice_numpy_tensor import string CHARS = string.ascii_lowercase + string.ascii_uppercase @@ -34,6 +34,42 @@ def get_einsum_expr_bucket(bucket, all_indices_list, result_indices): +def permute_torch_tensor_data(data:np.ndarray, indices_in, indices_out): + """ + Permute the data of a numpy tensor to the given indices_out. + + Returns: + permuted data + """ + # permute indices + out_locs = {idx: i for i, idx in enumerate(indices_out)} + perm = [out_locs[i] for i in indices_in] + # permute tensor + return torch.permute(data, perm) + +def slice_torch_tensor(data:np.ndarray, indices_in, indices_out, slice_dict): + """ + Args: + data : np.ndarray + indices_in: list of `qtree.optimizer.Var` + indices_out: list of `qtree.optimizer.Var` + slice_dict: dict of `qtree.optimizer.Var` to `slice` + + Returns: + new data, new indices + """ + slice_bounds = get_slice_bounds(slice_dict, indices_in) + s_data = data[slice_bounds] + indices_sliced = [ + i for sl, i in zip(slice_bounds, indices_in) if not isinstance(sl, int) + ] + indices_sized = [v.copy(size=size) for v, size in zip(indices_sliced, s_data.shape)] + indices_out = [v for v in indices_out if not isinstance(slice_dict.get(v, None), int)] + assert len(indices_sized) == len(s_data.shape) + assert len(indices_sliced) == len(s_data.shape) + st_data = permute_torch_tensor_data(s_data, indices_sliced, indices_out) + return st_data, indices_out + class TorchBackend(ContractionBackend): def __init__(self, device='cpu'): @@ -147,8 +183,6 @@ def get_sliced_buckets(self, buckets, data_dict, slice_dict): out_indices = list(sorted(tensor.indices, key=int, reverse=True)) data = data_dict[tensor.data_key] # Works for torch tensors just fine - data, new_indices = slice_numpy_tensor(data, tensor.indices, out_indices, slice_dict) - if not isinstance(data, torch.Tensor): if self.device == 'gpu' and torch.cuda.is_available(): cuda = torch.device('cuda') @@ -158,6 +192,8 @@ def get_sliced_buckets(self, buckets, data_dict, slice_dict): else: data = data.type(torch.complex128) # slice data + data, new_indices = slice_torch_tensor(data, tensor.indices, out_indices, slice_dict) + sliced_bucket.append( tensor.copy(indices=new_indices, data=data)) sliced_buckets.append(sliced_bucket) @@ -165,4 +201,4 @@ def get_sliced_buckets(self, buckets, data_dict, slice_dict): return sliced_buckets def get_result_data(self, result): - return np.transpose(result.data) + return torch.permute(result.data, tuple(reversed(range(result.data.ndim)))) From af8c59e4ceac678547b61fd0607d9631a3960c96 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Fri, 16 Jun 2023 19:04:09 +0000 Subject: [PATCH 088/126] fix DOS newline characters --- 
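Note on the torch transpose change in the previous patch (087): replacing np.transpose with torch.permute over reversed dimensions preserves the same axis reversal for the result data. A minimal check, with small illustrative shapes that are not part of the patch:

    import numpy as np
    import torch

    a = np.arange(24, dtype=np.float32).reshape(2, 3, 4)
    t = torch.from_numpy(a)

    np_T = np.transpose(a)                                       # default: reverse all axes
    torch_T = torch.permute(t, tuple(reversed(range(t.ndim))))   # same axis reversal

    assert torch_T.shape == np_T.shape == (4, 3, 2)
    assert np.allclose(torch_T.numpy(), np_T)

Keeping the data as a torch tensor end to end avoids an implicit device-to-host copy that np.transpose would force on GPU-resident results.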
qtensor/contraction_backends/compression.py | 258 ++++++++++---------- 1 file changed, 129 insertions(+), 129 deletions(-) diff --git a/qtensor/contraction_backends/compression.py b/qtensor/contraction_backends/compression.py index c7f6bbf0..1cb548ef 100644 --- a/qtensor/contraction_backends/compression.py +++ b/qtensor/contraction_backends/compression.py @@ -1,129 +1,129 @@ -from qtensor.contraction_backends import ContractionBackend -from qtensor.compression import Compressor, CompressedTensor, Tensor -from qtensor.compression.compressed_contraction import compressed_contract, compressed_sum -from qtensor.contraction_backends.common import slice_numpy_tensor -from qtree.optimizer import Tensor - -class CompressionBackend(ContractionBackend): - """ - Compression bucket contraction backend. - - This backend "decorates" another backend, by using compression in - pairwise contraction. If the result tensor has more than `max_tw` indices, - it is sliced and the contraction result is compressed before proceeding to - next slice. - """ - def __init__(self, backend, compressor:Compressor, max_tw:int): - """ - Arguments: - backend: the backend to use for contraction - compressor: the compressor to use for compression - max_tw: threshold for triggering compression. - - """ - self.backend = backend - self.compressor = compressor - self.max_tw = max_tw - - def _get_backend_specific_fns(self, backend): - ## Hacky way to extend backends - if 'cupy' in backend.__class__.__name__.lower(): - import cupy as cp - return cp.einsum, cp.array - elif 'torch' in backend.__class__.__name__.lower(): - import torch - return torch.einsum, torch.tensor - else: - import numpy as np - return np.einsum, lambda x: x - - def process_bucket(self, bucket, no_sum=False): - """ - Process a bucket. 
- - This uses `self.backend.process_bucket` in combination with - compression.compressed_contraction.compressed_contract - """ - ctr_kw = dict(zip(['einsum', 'move_data'], self._get_backend_specific_fns(self.backend))) - bucket.sort(key=lambda x: len(x.indices)) - print("Processing bucket", bucket) - accum = bucket[0] - for t in bucket[1:-1]: - accum = compressed_contract( - accum, t, [], self.max_tw, self.compressor, - **ctr_kw - ) - if len(bucket)>1: - t = bucket[-1] - total_ixs = sorted( - set().union(*[t.indices, accum.indices]) - , key=int, reverse=True - ) - accum_new = compressed_contract( - accum, t, [total_ixs[-1]], self.max_tw, self.compressor - ,**ctr_kw - ) - # free data - import cupy - for t in [accum, t]: - if isinstance(t, CompressedTensor): - for c in t.data: - cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = c - del cmp_bytes - # import ctypes - # p_decompressed_ptr = ctypes.addressof(cmp_bytes[0]) - # # cast to int64 pointer - # # (effectively converting pointer to pointer to addr to pointer to int64) - # p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) - # decompressed_int = p_decompressed_int.contents - # print("Freeing mem", decompressed_int.value) - # cupy.cuda.runtime.free(decompressed_int.value) - t.compressor.compressor.free_decompressed() - #raise ValueError("Done") - else: - #print("PTR", t.data.data.ptr) - #cupy.cuda.runtime.free(t.data.data.ptr) - pass - - accum = accum_new - - return accum - else: - if len(accum.indices) < 1: - return accum - indices = (accum.indices[-1], ) - res = compressed_sum(accum, indices, self.compressor, self.max_tw, **ctr_kw) - if isinstance(accum, CompressedTensor): - import cupy - for c in accum.data: - cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = c - del cmp_bytes - #import ctypes - #p_decompressed_ptr = ctypes.addressof(cmp_bytes[0]) - # cast to int64 pointer - # (effectively converting pointer to pointer to addr to pointer to int64) - #p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) - #decompressed_int = p_decompressed_int.contents - #print("Freeing mem", decompressed_int.value) - #cupy.cuda.runtime.free(decompressed_int.value) - accum.compressor.compressor.free_decompressed() - return res - - def get_sliced_buckets(self, buckets, data_dict, slice_dict): - """ - Slice buckets accounding to `slice_dict` - - This delegates to `self.backend`, assuming that buckets don't have - tensors with more than `self.max_tw` indices. - """ - # Note: to support large tensors (more than `max_tw`), - # just iterate through sliced bucket tensors and compress if needed - return self.backend.get_sliced_buckets(buckets, data_dict, slice_dict) - - def get_result_data(self, result): - """ - Get result data from `result` tensor. - - This assumes that the result has at most `self.max_tw` indices. - """ - return self.backend.get_result_data(result) +from qtensor.contraction_backends import ContractionBackend +from qtensor.compression import Compressor, CompressedTensor, Tensor +from qtensor.compression.compressed_contraction import compressed_contract, compressed_sum +from qtensor.contraction_backends.common import slice_numpy_tensor +from qtree.optimizer import Tensor + +class CompressionBackend(ContractionBackend): + """ + Compression bucket contraction backend. + + This backend "decorates" another backend, by using compression in + pairwise contraction. 
If the result tensor has more than `max_tw` indices, + it is sliced and the contraction result is compressed before proceeding to + next slice. + """ + def __init__(self, backend, compressor:Compressor, max_tw:int): + """ + Arguments: + backend: the backend to use for contraction + compressor: the compressor to use for compression + max_tw: threshold for triggering compression. + + """ + self.backend = backend + self.compressor = compressor + self.max_tw = max_tw + + def _get_backend_specific_fns(self, backend): + ## Hacky way to extend backends + if 'cupy' in backend.__class__.__name__.lower(): + import cupy as cp + return cp.einsum, cp.array + elif 'torch' in backend.__class__.__name__.lower(): + import torch + return torch.einsum, torch.tensor + else: + import numpy as np + return np.einsum, lambda x: x + + def process_bucket(self, bucket, no_sum=False): + """ + Process a bucket. + + This uses `self.backend.process_bucket` in combination with + compression.compressed_contraction.compressed_contract + """ + ctr_kw = dict(zip(['einsum', 'move_data'], self._get_backend_specific_fns(self.backend))) + bucket.sort(key=lambda x: len(x.indices)) + print("Processing bucket", bucket) + accum = bucket[0] + for t in bucket[1:-1]: + accum = compressed_contract( + accum, t, [], self.max_tw, self.compressor, + **ctr_kw + ) + if len(bucket)>1: + t = bucket[-1] + total_ixs = sorted( + set().union(*[t.indices, accum.indices]) + , key=int, reverse=True + ) + accum_new = compressed_contract( + accum, t, [total_ixs[-1]], self.max_tw, self.compressor + ,**ctr_kw + ) + # free data + import cupy + for t in [accum, t]: + if isinstance(t, CompressedTensor): + for c in t.data: + cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = c + del cmp_bytes + # import ctypes + # p_decompressed_ptr = ctypes.addressof(cmp_bytes[0]) + # # cast to int64 pointer + # # (effectively converting pointer to pointer to addr to pointer to int64) + # p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + # decompressed_int = p_decompressed_int.contents + # print("Freeing mem", decompressed_int.value) + # cupy.cuda.runtime.free(decompressed_int.value) + t.compressor.compressor.free_decompressed() + #raise ValueError("Done") + else: + #print("PTR", t.data.data.ptr) + #cupy.cuda.runtime.free(t.data.data.ptr) + pass + + accum = accum_new + + return accum + else: + if len(accum.indices) < 1: + return accum + indices = (accum.indices[-1], ) + res = compressed_sum(accum, indices, self.compressor, self.max_tw, **ctr_kw) + if isinstance(accum, CompressedTensor): + import cupy + for c in accum.data: + cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = c + del cmp_bytes + #import ctypes + #p_decompressed_ptr = ctypes.addressof(cmp_bytes[0]) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + #p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + #decompressed_int = p_decompressed_int.contents + #print("Freeing mem", decompressed_int.value) + #cupy.cuda.runtime.free(decompressed_int.value) + accum.compressor.compressor.free_decompressed() + return res + + def get_sliced_buckets(self, buckets, data_dict, slice_dict): + """ + Slice buckets accounding to `slice_dict` + + This delegates to `self.backend`, assuming that buckets don't have + tensors with more than `self.max_tw` indices. 
+ """ + # Note: to support large tensors (more than `max_tw`), + # just iterate through sliced bucket tensors and compress if needed + return self.backend.get_sliced_buckets(buckets, data_dict, slice_dict) + + def get_result_data(self, result): + """ + Get result data from `result` tensor. + + This assumes that the result has at most `self.max_tw` indices. + """ + return self.backend.get_result_data(result) From a0591f25cb5d269113e8949d682beefd22dc044c Mon Sep 17 00:00:00 2001 From: Milan Shah Date: Fri, 16 Jun 2023 15:23:54 -0400 Subject: [PATCH 089/126] Updated scale and zero point for quantization --- qtensor/compression/torch_quant/torch_quant.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/qtensor/compression/torch_quant/torch_quant.py b/qtensor/compression/torch_quant/torch_quant.py index 09c1b9d5..49c4a54c 100644 --- a/qtensor/compression/torch_quant/torch_quant.py +++ b/qtensor/compression/torch_quant/torch_quant.py @@ -37,9 +37,10 @@ def quant_device_compress(oriData, nbEle, blockSize,threshold): # oriData = cp.reshape(oriData, (-1, blockSize)) # Reshape to blocksize tensor = torch.as_tensor(oriData, device='cuda') print("Min val: "+str(min_val)+" range: "+str(d)) - zero_point = int((min_val/d)*127) + scale = d/255.0 + zero_point = -1*round(min_val*scale) - 128 - q_tensor = torch.quantize_per_tensor(tensor, 0.1, zero_point, dtype=torch.qint8) + q_tensor = torch.quantize_per_tensor(tensor, scale, zero_point, dtype=torch.qint8) return (q_tensor), nbEle/4 From e8ebe8ca6c1f9e9a20818c9c1dc82598faeb8f3f Mon Sep 17 00:00:00 2001 From: Milan Kartik Shah Date: Fri, 23 Jun 2023 16:09:55 -0400 Subject: [PATCH 090/126] Updated to add threshold+grouping --- qtensor/compression/Compressor.py | 19 ++++++--- .../compression/torch_quant/torch_quant.py | 40 +++++++++++++------ 2 files changed, 42 insertions(+), 17 deletions(-) diff --git a/qtensor/compression/Compressor.py b/qtensor/compression/Compressor.py index bb924819..6a4d117e 100644 --- a/qtensor/compression/Compressor.py +++ b/qtensor/compression/Compressor.py @@ -12,7 +12,7 @@ # sys.path.append('./cusz/src') sys.path.append(str(Path(__file__).parent/'torch_quant')) sys.path.append('./torch_quant') - +import torch try: from cuszx_wrapper import cuszx_host_compress, cuszx_host_decompress, cuszx_device_compress, cuszx_device_decompress # from cuSZp_wrapper import cuszp_device_compress, cuszp_device_decompress @@ -122,10 +122,16 @@ def free_decompressed(self): import cupy print("Cleanup", len(self.decompressed_own)) for x in self.decompressed_own: - if x == None: - continue - print("CUDA Free", x) - cupy.cuda.runtime.free(x) + #print(x) + #if x == None: + # continue + #else: + #print("CUDA Free", x) + #cupy.cuda.runtime.free(x) + del x + cupy.get_default_memory_pool().free_all_blocks() + cupy.get_default_pinned_memory_pool().free_all_blocks() + torch.cuda.empty_cache() self.decompressed_own = [] def free_compressed(self, ptr): @@ -177,6 +183,7 @@ def decompress(self, obj): # mem = cupy.cuda.UnownedMemory(decompressed_int.value, num_elements_eff, self, device_id=0) # mem_ptr = cupy.cuda.memory.MemoryPointer(mem, 0) arr = cupy.reshape(arr_cp, shape) + self.decompressed_own.append(arr) # arr = cupy.ndarray(shape, dtype=dtype, memptr=mem_ptr) return arr @@ -198,6 +205,8 @@ def cuszx_compress(self, isCuPy, data, num_elements, r2r_error, r2r_threshold): #cmp_bytes, outSize_ptr = cuszp_device_compress(data, r2r_error, num_elements, r2r_threshold) cmp_bytes, outSize_ptr = quant_device_compress(data, num_elements, 
CUSZX_BLOCKSIZE, r2r_threshold) + del data + torch.cuda.empty_cache() return cmp_bytes, outSize_ptr ### Decompression API with cuSZx ### diff --git a/qtensor/compression/torch_quant/torch_quant.py b/qtensor/compression/torch_quant/torch_quant.py index 49c4a54c..e6cd0e0b 100644 --- a/qtensor/compression/torch_quant/torch_quant.py +++ b/qtensor/compression/torch_quant/torch_quant.py @@ -30,23 +30,36 @@ def quant_device_compress(oriData, nbEle, blockSize,threshold): s_1 = time.time() truth_values = abs(oriData)<=threshold oriData[truth_values] = 0.0 + truth_values = cp.invert(truth_values) + ori_len = oriData.shape[0] print("Percent nonzero: "+str(cp.count_nonzero(oriData)/oriData.shape[0])) - + oriData = oriData[truth_values] + nbEle = oriData.shape[0] # oriData = cp.reshape(oriData, (-1, blockSize)) # Reshape to blocksize tensor = torch.as_tensor(oriData, device='cuda') - print("Min val: "+str(min_val)+" range: "+str(d)) - scale = d/255.0 - zero_point = -1*round(min_val*scale) - 128 + print("Min val: "+str(cp.amin(oriData).get())+" range: "+str(d)) +# scale = d/255.0 +# zero_point = -1*round(min_val*scale) - 128 + scale = d/((2**8) - 1) + zero_point = -1*round(min_val*scale) - 2^7 +# q_tensor = torch.quantize_per_tensor(tensor, scale, zero_point, dtype=torch.qint8) + q_tensor = torch.quantize_per_tensor(tensor, scale, zero_point, dtype=torch.qint8) - - return (q_tensor), nbEle/4 + del tensor + torch.cuda.empty_cache() + #q_ten2 = torch.dequantize(q_tensor) + #print(tensor) + #print(q_ten2) + #print("Max PW error") + #print(torch.max(torch.div(torch.abs(torch.sub(tensor[tensor!=0.0],q_ten2[tensor!=0.0])),tensor[tensor!=0.0]))) + return (q_tensor, truth_values), (nbEle/4)+(ori_len/8) def quant_device_decompress(nbEle, cmpBytes, owner, dtype): - (q_tensor) = cmpBytes + (q_tensor, bitmap) = cmpBytes restored = torch.dequantize(q_tensor) arr = cp.asarray(restored) # uint8_t* cmpbytes, size_t len, size_t compressed_len, float r2r_error @@ -66,14 +79,17 @@ def quant_device_decompress(nbEle, cmpBytes, owner, dtype): #print("mem ptr") #print(mem_ptr) # arr = cp.ndarray(shape=(nbEle,), dtype=np.float32, memptr=mem_ptr) - - # res = cp.zeros((nbEle,)) + #print(nbEle) + res = cp.zeros((nbEle,)) # ## need to convert newData to cupy - # cp.place(res,bitmap,arr) + cp.place(res,bitmap,arr) c_res = cp.zeros(int(nbEle/2), np.complex64) - c_res.real = arr[0:int(nbEle/2)] - c_res.imag = arr[int(nbEle/2):] + #c_res.real = arr[0:int(nbEle/2)] + #c_res.imag = arr[int(nbEle/2):] + + c_res.real = res[0:int(nbEle/2)] + c_res.imag = res[int(nbEle/2):] return (c_res, None) ### Example of device compress/decompress wrapper usage From c13ee3af93a669900e5d31d920813b0812e19611 Mon Sep 17 00:00:00 2001 From: Milan Kartik Shah Date: Fri, 23 Jun 2023 16:22:14 -0400 Subject: [PATCH 091/126] Added packbits call to compress bitmap --- qtensor/compression/torch_quant/torch_quant.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/qtensor/compression/torch_quant/torch_quant.py b/qtensor/compression/torch_quant/torch_quant.py index e6cd0e0b..fb578a0b 100644 --- a/qtensor/compression/torch_quant/torch_quant.py +++ b/qtensor/compression/torch_quant/torch_quant.py @@ -50,16 +50,19 @@ def quant_device_compress(oriData, nbEle, blockSize,threshold): q_tensor = torch.quantize_per_tensor(tensor, scale, zero_point, dtype=torch.qint8) del tensor torch.cuda.empty_cache() + bitmap = cp.packbits(truth_values) + del truth_values #q_ten2 = torch.dequantize(q_tensor) #print(tensor) #print(q_ten2) #print("Max PW error") 
#print(torch.max(torch.div(torch.abs(torch.sub(tensor[tensor!=0.0],q_ten2[tensor!=0.0])),tensor[tensor!=0.0]))) - return (q_tensor, truth_values), (nbEle/4)+(ori_len/8) + return (q_tensor, bitmap), (nbEle/4)+(ori_len/8) def quant_device_decompress(nbEle, cmpBytes, owner, dtype): (q_tensor, bitmap) = cmpBytes + bitmap = cp.unpackbits(bitmap) restored = torch.dequantize(q_tensor) arr = cp.asarray(restored) # uint8_t* cmpbytes, size_t len, size_t compressed_len, float r2r_error From 311211f8eaffa9cae2d36086bc6a87da723a2101 Mon Sep 17 00:00:00 2001 From: Milan Kartik Shah Date: Mon, 3 Jul 2023 12:36:11 -0400 Subject: [PATCH 092/126] Updated zero point and grouping criteria --- .../compression/torch_quant/torch_quant.py | 41 +++++++++++++------ 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/qtensor/compression/torch_quant/torch_quant.py b/qtensor/compression/torch_quant/torch_quant.py index fb578a0b..c5f04fc6 100644 --- a/qtensor/compression/torch_quant/torch_quant.py +++ b/qtensor/compression/torch_quant/torch_quant.py @@ -32,37 +32,47 @@ def quant_device_compress(oriData, nbEle, blockSize,threshold): oriData[truth_values] = 0.0 truth_values = cp.invert(truth_values) ori_len = oriData.shape[0] - print("Percent nonzero: "+str(cp.count_nonzero(oriData)/oriData.shape[0])) - oriData = oriData[truth_values] + nonzero_percent = cp.count_nonzero(oriData)/oriData.shape[0] + print("Percent nonzero: "+str(nonzero_percent)) + + isGrouped = False + if nonzero_percent<=0.5: + isGrouped=True + oriData = oriData[truth_values] nbEle = oriData.shape[0] # oriData = cp.reshape(oriData, (-1, blockSize)) # Reshape to blocksize tensor = torch.as_tensor(oriData, device='cuda') - print("Min val: "+str(cp.amin(oriData).get())+" range: "+str(d)) + # print("Min val: "+str(cp.amin(oriData).get())+" range: "+str(d)) # scale = d/255.0 # zero_point = -1*round(min_val*scale) - 128 scale = d/((2**8) - 1) - zero_point = -1*round(min_val*scale) - 2^7 + #zero_point = -1*round(min_val*scale) + zero_point = -1*round(min_val*scale)+32 # q_tensor = torch.quantize_per_tensor(tensor, scale, zero_point, dtype=torch.qint8) q_tensor = torch.quantize_per_tensor(tensor, scale, zero_point, dtype=torch.qint8) del tensor torch.cuda.empty_cache() - bitmap = cp.packbits(truth_values) + if isGrouped: + bitmap = cp.packbits(truth_values) + else: + bitmap = None del truth_values #q_ten2 = torch.dequantize(q_tensor) #print(tensor) #print(q_ten2) #print("Max PW error") #print(torch.max(torch.div(torch.abs(torch.sub(tensor[tensor!=0.0],q_ten2[tensor!=0.0])),tensor[tensor!=0.0]))) - return (q_tensor, bitmap), (nbEle/4)+(ori_len/8) + return (q_tensor, bitmap, isGrouped), (nbEle/4)+(ori_len/8) def quant_device_decompress(nbEle, cmpBytes, owner, dtype): - (q_tensor, bitmap) = cmpBytes - bitmap = cp.unpackbits(bitmap) + (q_tensor, bitmap, isGrouped) = cmpBytes + if isGrouped: + bitmap = cp.unpackbits(bitmap) restored = torch.dequantize(q_tensor) arr = cp.asarray(restored) # uint8_t* cmpbytes, size_t len, size_t compressed_len, float r2r_error @@ -83,16 +93,21 @@ def quant_device_decompress(nbEle, cmpBytes, owner, dtype): #print(mem_ptr) # arr = cp.ndarray(shape=(nbEle,), dtype=np.float32, memptr=mem_ptr) #print(nbEle) - res = cp.zeros((nbEle,)) + if isGrouped: + res = cp.zeros((nbEle,)) # ## need to convert newData to cupy - cp.place(res,bitmap,arr) + cp.place(res,bitmap,arr) - c_res = cp.zeros(int(nbEle/2), np.complex64) + c_res = cp.zeros(int(nbEle/2), np.complex64) #c_res.real = arr[0:int(nbEle/2)] #c_res.imag = arr[int(nbEle/2):] - 
c_res.real = res[0:int(nbEle/2)] - c_res.imag = res[int(nbEle/2):] + c_res.real = res[0:int(nbEle/2)] + c_res.imag = res[int(nbEle/2):] + else: + c_res = cp.zeros(int(nbEle/2), np.complex64) + c_res.real = arr[0:int(nbEle/2)] + c_res.imag = arr[int(nbEle/2):] return (c_res, None) ### Example of device compress/decompress wrapper usage From 4712784ae56482950d085b1e11fc389fd079dade Mon Sep 17 00:00:00 2001 From: Milan Kartik Shah Date: Fri, 7 Jul 2023 11:00:59 -0400 Subject: [PATCH 093/126] Quantize per channel --- .../torch_quant/torch_quant_perchannel.py | 194 ++++++++++++++++++ 1 file changed, 194 insertions(+) create mode 100644 qtensor/compression/torch_quant/torch_quant_perchannel.py diff --git a/qtensor/compression/torch_quant/torch_quant_perchannel.py b/qtensor/compression/torch_quant/torch_quant_perchannel.py new file mode 100644 index 00000000..4278aaa6 --- /dev/null +++ b/qtensor/compression/torch_quant/torch_quant_perchannel.py @@ -0,0 +1,194 @@ +import numpy as np +import ctypes +from ctypes import * +import random +from qtensor.tools.lazy_import import cupy as cp +import time +import torch + +from pathlib import Path + +BS = 16 + +def quant_device_compress(oriData, nbEle, blockSize,threshold): + #print(nbEle) + ori_nbEle = nbEle + variable = ctypes.c_size_t(0) + outSize = ctypes.pointer(variable) + + oriData = oriData.flatten() + ori_real = oriData.real + ori_imag = oriData.imag + oriData = cp.concatenate((ori_real, ori_imag)) + sample = oriData[::2] + max_val = cp.amax(oriData).get() + min_val = cp.amin(oriData).get() + d = max_val - min_val + if d.dtype == np.complex64: + d = d.real + threshold = threshold*(d) + s_1 = time.time() + truth_values = abs(oriData)<=threshold + oriData[truth_values] = 0.0 + truth_values = cp.invert(truth_values) + ori_len = oriData.shape[0] + nonzero_percent = cp.count_nonzero(oriData)/oriData.shape[0] + print("Percent nonzero: "+str(nonzero_percent)) + + isGrouped = False + if nonzero_percent<=0.00: + isGrouped=True + oriData = oriData[truth_values] + + nbEle = oriData.shape[0] + + # oriData = cp.reshape(oriData, (-1, blockSize)) # Reshape to blocksize + tensor = torch.as_tensor(oriData, device='cuda') + # print("Min val: "+str(cp.amin(oriData).get())+" range: "+str(d)) +# scale = d/255.0 +# zero_point = -1*round(min_val*scale) - 128 + tensor = torch.reshape(tensor, (-1, BS)) + maxs = torch.flatten(torch.max(tensor, dim=1)[0]) + mins = torch.flatten(torch.min(tensor, dim=1)[0]) + + #scales = torch.ones(tensor.shape[0], device='cuda') + #scales = torch.mul(scales, d/255.0) + #print(d) + #print(torch.max(torch.sub(maxs,mins))) + scales = torch.abs(torch.sub(maxs,mins))/127.0 + zero_points = torch.zeros(tensor.shape[0], device='cuda') + #zero_points = torch.abs(torch.round(torch.div(mins,scales)))-127 + + #print(zero_points) + + #scale = d/((2**8) - 1) + #zero_point = -1*round(min_val*scale) + #zero_point = -1*round(min_val*scale)+32 +# q_tensor = torch.quantize_per_tensor(tensor, scale, zero_point, dtype=torch.qint8) + #tensor = torch.flatten(tensor) + #tensor = torch.split(tensor, BS) + #print(maxs) + #print(mins) + #print(scales) + + q_tensor = torch.quantize_per_channel(tensor, scales, zero_points,0, dtype=torch.qint8) + #q_tensor = torch.quantize_per_tensor(tensor, scale, zero_point, dtype=torch.qint8) + del tensor + torch.cuda.empty_cache() + if isGrouped: + bitmap = cp.packbits(truth_values) + else: + bitmap = None + del truth_values + #q_ten2 = torch.dequantize(q_tensor) + #print(tensor) + #print(q_ten2) + #print("Max PW error") + 
#print(torch.max(torch.div(torch.abs(torch.sub(tensor[tensor!=0.0],q_ten2[tensor!=0.0])),tensor[tensor!=0.0]))) + return (q_tensor, bitmap, isGrouped), (nbEle/2)+(ori_len/8) + + +def quant_device_decompress(nbEle, cmpBytes, owner, dtype): + (q_tensor, bitmap, isGrouped) = cmpBytes + if isGrouped: + bitmap = cp.unpackbits(bitmap) + restored = torch.flatten(torch.dequantize(q_tensor)) + + arr = cp.asarray(restored) + # uint8_t* cmpbytes, size_t len, size_t compressed_len, float r2r_error + + # decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, num_elements_eff) + # -- Workaround to convert GPU pointer to int + # p_decompressed_ptr = ctypes.addressof(newData) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + # p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + # decompressed_int = p_decompressed_int.contents + # # -- + # pointer_for_free = decompressed_int.value + # # self.decompressed_own.append(decompressed_int.value) + # mem = cp.cuda.UnownedMemory(decompressed_int.value, nbEle*4, owner, device_id=0) + # mem_ptr = cp.cuda.memory.MemoryPointer(mem, 0) + #print("mem ptr") + #print(mem_ptr) + # arr = cp.ndarray(shape=(nbEle,), dtype=np.float32, memptr=mem_ptr) + #print(nbEle) + if isGrouped: + res = cp.zeros((nbEle,)) + # ## need to convert newData to cupy + cp.place(res,bitmap,arr) + + c_res = cp.zeros(int(nbEle/2), np.complex64) + #c_res.real = arr[0:int(nbEle/2)] + #c_res.imag = arr[int(nbEle/2):] + + c_res.real = res[0:int(nbEle/2)] + c_res.imag = res[int(nbEle/2):] + else: + c_res = cp.zeros(int(nbEle/2), np.complex64) + c_res.real = arr[0:int(nbEle/2)] + c_res.imag = arr[int(nbEle/2):] + return (c_res, None) + +### Example of device compress/decompress wrapper usage +class Comp(): + def __init__(self): + self.name = "dummy" + +def free_compressed(ptr): + p_ptr = ctypes.addressof(ptr) + p_int = ctypes.cast(p_ptr, ctypes.POINTER(ctypes.c_uint64)) + decomp_int = p_int.contents + cp.cuda.runtime.free(decomp_int.value) + + +if __name__ == "__main__": + + DATA_SIZE = int(1024) + MAX_D = 10.0 + MIN_D = -10.0 + RANGE = MAX_D - MIN_D + r2r_threshold = 0.002 + r2r_error = 0.0001 + + in_vector = np.fromfile("all_sample.bin", dtype=np.complex64) + #print(np.max(in_vector)) + DATA_SIZE = len(in_vector) + #range_vr = np.max(in_vector)-np.min(in_vector) + #r2r_threshold = r2r_threshold*range_vr + #r2r_error = r2r_error*range_vr + #in_vector = np.zeros((DATA_SIZE,)) + #for i in range(0,int(DATA_SIZE/4)): + # in_vector[i] = 0.0 + #for i in range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): + # in_vector[i] = 5.0 + #for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): + # in_vector[i] = random.uniform(MIN_D, MAX_D) + #for i in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): + # in_vector[i] = -7.0 + #for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): + # in_vector[i] = 0.001 + + print(DATA_SIZE) + #in_vector = in_vector.astype('float32') + in_vector_gpu = cp.asarray(in_vector) + + # variable = ctypes.c_size_t(0) + # outSize = ctypes.pointer(variable) + for i in range(200): + s_time = time.time() + o_bytes, outSize = quant_device_compress(in_vector_gpu, DATA_SIZE, 256, r2r_threshold) + print("Time python: "+str(time.time()-s_time)) + # print(outSize[0]) + print("Compress Success...starting decompress ") + comp = Comp() + + s_time = time.time() + (d_bytes,ptr )= quant_device_decompress(DATA_SIZE*2, o_bytes, comp, in_vector_gpu.dtype) + + # free_compressed(o_bytes[0]) + # cp.cuda.runtime.free(ptr) + print("Time 
python: "+str(time.time()-s_time)) + #for i in d_bytes: + # print(i) + print("Decompress Success") From 46b6994bb8f93caaeef4926e7ec5c65415a79a70 Mon Sep 17 00:00:00 2001 From: Milan Kartik Shah Date: Fri, 7 Jul 2023 12:49:03 -0400 Subject: [PATCH 094/126] Added grouping to perchannel quantization --- .../torch_quant/torch_quant_perchannel.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/qtensor/compression/torch_quant/torch_quant_perchannel.py b/qtensor/compression/torch_quant/torch_quant_perchannel.py index 4278aaa6..a41606b2 100644 --- a/qtensor/compression/torch_quant/torch_quant_perchannel.py +++ b/qtensor/compression/torch_quant/torch_quant_perchannel.py @@ -8,7 +8,7 @@ from pathlib import Path -BS = 16 +BS = 32 def quant_device_compress(oriData, nbEle, blockSize,threshold): #print(nbEle) @@ -36,7 +36,7 @@ def quant_device_compress(oriData, nbEle, blockSize,threshold): print("Percent nonzero: "+str(nonzero_percent)) isGrouped = False - if nonzero_percent<=0.00: + if nonzero_percent<=0.5: isGrouped=True oriData = oriData[truth_values] @@ -47,6 +47,14 @@ def quant_device_compress(oriData, nbEle, blockSize,threshold): # print("Min val: "+str(cp.amin(oriData).get())+" range: "+str(d)) # scale = d/255.0 # zero_point = -1*round(min_val*scale) - 128 + if isGrouped: + pad_rows = int(nbEle/BS) + if nbEle%BS != 0: + pad_rows +=1 + + padded = torch.zeros(pad_rows*BS, device='cuda') + padded[:nbEle] = tensor + tensor = padded tensor = torch.reshape(tensor, (-1, BS)) maxs = torch.flatten(torch.max(tensor, dim=1)[0]) mins = torch.flatten(torch.min(tensor, dim=1)[0]) @@ -57,7 +65,8 @@ def quant_device_compress(oriData, nbEle, blockSize,threshold): #print(torch.max(torch.sub(maxs,mins))) scales = torch.abs(torch.sub(maxs,mins))/127.0 zero_points = torch.zeros(tensor.shape[0], device='cuda') - #zero_points = torch.abs(torch.round(torch.div(mins,scales)))-127 + #zero_points = torch.round(torch.div(torch.add(maxs,mins)/2,scales)) + #zero_points = torch.neg(torch.round(torch.div(mins,scales)))+64 #print(zero_points) From afe92ffc454e23810f758ade784ebdbe87b8af72 Mon Sep 17 00:00:00 2001 From: Milan Shah Date: Mon, 10 Jul 2023 13:41:34 -0400 Subject: [PATCH 095/126] Added all compressors to Compressor.py --- qtensor/compression/Compressor.py | 235 ++++++++++++++++++++++++++++-- 1 file changed, 226 insertions(+), 9 deletions(-) diff --git a/qtensor/compression/Compressor.py b/qtensor/compression/Compressor.py index 6a4d117e..fda3670b 100644 --- a/qtensor/compression/Compressor.py +++ b/qtensor/compression/Compressor.py @@ -8,16 +8,16 @@ # sys.path.append(str(Path(__file__).parent/'szp/src/')) # sys.path.append('./szp/src') -# sys.path.append(str(Path(__file__).parent/'cusz/src')) -# sys.path.append('./cusz/src') +sys.path.append(str(Path(__file__).parent/'cusz/src')) +sys.path.append('./cusz/src') sys.path.append(str(Path(__file__).parent/'torch_quant')) sys.path.append('./torch_quant') import torch try: from cuszx_wrapper import cuszx_host_compress, cuszx_host_decompress, cuszx_device_compress, cuszx_device_decompress # from cuSZp_wrapper import cuszp_device_compress, cuszp_device_decompress - # from cusz_wrapper import cusz_device_compress, cusz_device_decompress - from torch_quant import quant_device_compress, quant_device_decompress + from cusz_wrapper import cusz_device_compress, cusz_device_decompress + from torch_quant_perchannel import quant_device_compress, quant_device_decompress except: print("import failed") # Silently fail on missing build of cuszx @@ -112,6 
+112,222 @@ def decompress(self, ptr): ptr.seek(0) return np.load(ptr)['arr_0'] +class TorchCompressor(Compressor): + def __init__(self, r2r_error=1e-3, r2r_threshold=1e-3): + self.r2r_error = r2r_error + self.r2r_threshold = r2r_threshold + self.decompressed_own = [] + + def free_decompressed(self): + import cupy + print("Cleanup", len(self.decompressed_own)) + for x in self.decompressed_own: + del x + cupy.get_default_memory_pool().free_all_blocks() + cupy.get_default_pinned_memory_pool().free_all_blocks() + torch.cuda.empty_cache() + self.decompressed_own = [] + + def free_compressed(self, ptr): + import ctypes, cupy + cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = ptr + p_decompressed_ptr = ctypes.addressof(cmp_bytes[0]) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + decompressed_int = p_decompressed_int.contents + cupy.cuda.runtime.free(decompressed_int.value) + + def compress(self, data): + import cupy + if isinstance(data, cupy.ndarray): + isCuPy = True + else: + isCuPy = False + num_elements = data.size + # Adapt numele depending on itemsize + itemsize = data.dtype.itemsize + num_elements_eff = int(num_elements*itemsize/4) + + dtype = data.dtype + cmp_bytes, outSize_ptr = self.cuszx_compress(isCuPy, data, num_elements_eff, self.r2r_error, self.r2r_threshold) + return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr) + + # return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr.contents.value) + + def compress_size(self, ptr): + return ptr[5] + + def decompress(self, obj): + import cupy + import ctypes + cmp_bytes, num_elements_eff, isCuPy, shape, dtype, cmpsize = obj + decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, cmpsize, num_elements_eff, self, dtype) + arr_cp = decompressed_ptr[0] + + arr = cupy.reshape(arr_cp, shape) + self.decompressed_own.append(arr) + return arr + + ### Compression API with cuSZx ### + # Parameters: + # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array + # - data = Numpy or Cupy ndarray, assumed to be 1-D, np.float32 type + # - num_elements = Number of floating point elements in data + # - r2r_error = relative-to-value-range error bound for lossy compression + # - r2r_threshold = relative-to-value-range threshold to floor values to zero + # Returns: + # - cmp_bytes = Unsigned char pointer to compressed bytes + # - outSize_ptr = Pointer to size_t representing length in bytes of cmp_bytes + def cuszx_compress(self, isCuPy, data, num_elements, r2r_error, r2r_threshold): + + if not isCuPy: + cmp_bytes, outSize_ptr = cuszx_host_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) + else: + #cmp_bytes, outSize_ptr = cuszp_device_compress(data, r2r_error, num_elements, r2r_threshold) + + cmp_bytes, outSize_ptr = quant_device_compress(data, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) + del data + torch.cuda.empty_cache() + return cmp_bytes, outSize_ptr + + ### Decompression API with cuSZx ### + # Parameters: + # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array + # - cmp_bytes = Unsigned char pointer to compressed bytes + # - num_elements = Number of floating point elements in original data + # Returns: + # - decompressed_data = Float32 pointer to decompressed data + # + # Notes: Use ctypes to cast decompressed data to Numpy or CuPy type + + def cuszx_decompress(self, isCuPy, cmp_bytes, cmpsize, 
num_elements, owner, dtype): + if not isCuPy: + decompressed_data = cuszx_host_decompress(num_elements, cmp_bytes) + else: + #decompressed_data = cuszp_device_decompress(num_elements, cmp_bytes, cmpsize, owner,dtype) +# oriData, absErrBound, nbEle, blockSize,threshold + decompressed_data = quant_device_decompress(num_elements, cmp_bytes, owner,dtype) + return decompressed_data + + +class CUSZXCompressor(Compressor): + def __init__(self, r2r_error=1e-3, r2r_threshold=1e-3): + self.r2r_error = r2r_error + self.r2r_threshold = r2r_threshold + self.decompressed_own = [] + + def free_decompressed(self): + import cupy + print("Cleanup", len(self.decompressed_own)) + for x in self.decompressed_own: + #print(x) + #if x == None: + # continue + #else: + #print("CUDA Free", x) + #cupy.cuda.runtime.free(x) + del x + cupy.get_default_memory_pool().free_all_blocks() + cupy.get_default_pinned_memory_pool().free_all_blocks() + torch.cuda.empty_cache() + self.decompressed_own = [] + + def free_compressed(self, ptr): + import ctypes, cupy + cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = ptr + p_decompressed_ptr = ctypes.addressof(cmp_bytes[0]) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + decompressed_int = p_decompressed_int.contents + cupy.cuda.runtime.free(decompressed_int.value) + + def compress(self, data): + import cupy + if isinstance(data, cupy.ndarray): + isCuPy = True + else: + isCuPy = False + num_elements = data.size + # Adapt numele depending on itemsize + itemsize = data.dtype.itemsize + num_elements_eff = int(num_elements*itemsize/4) + + dtype = data.dtype + cmp_bytes, outSize_ptr = self.cuszx_compress(isCuPy, data, num_elements_eff, self.r2r_error, self.r2r_threshold) + return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr) + + # return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr.contents.value) + + def compress_size(self, ptr): + return ptr[5] + + def decompress(self, obj): + import cupy + import ctypes + cmp_bytes, num_elements_eff, isCuPy, shape, dtype, cmpsize = obj + decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, cmpsize, num_elements_eff, self, dtype) + arr_cp = decompressed_ptr[0] + self.decompressed_own.append(decompressed_ptr[1]) + + # -- Workaround to convert GPU pointer to int + # p_decompressed_ptr = ctypes.addressof(decompressed_ptr) + # # cast to int64 pointer + # # (effectively converting pointer to pointer to addr to pointer to int64) + # p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + # decompressed_int = p_decompressed_int.contents + # # -- + # self.decompressed_own.append(decompressed_int.value) + # mem = cupy.cuda.UnownedMemory(decompressed_int.value, num_elements_eff, self, device_id=0) + # mem_ptr = cupy.cuda.memory.MemoryPointer(mem, 0) + arr = cupy.reshape(arr_cp, shape) + # self.decompressed_own.append(arr) + # arr = cupy.ndarray(shape, dtype=dtype, memptr=mem_ptr) + return arr + + ### Compression API with cuSZx ### + # Parameters: + # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array + # - data = Numpy or Cupy ndarray, assumed to be 1-D, np.float32 type + # - num_elements = Number of floating point elements in data + # - r2r_error = relative-to-value-range error bound for lossy compression + # - r2r_threshold = relative-to-value-range threshold to floor values to zero + # Returns: + # - cmp_bytes 
= Unsigned char pointer to compressed bytes + # - outSize_ptr = Pointer to size_t representing length in bytes of cmp_bytes + def cuszx_compress(self, isCuPy, data, num_elements, r2r_error, r2r_threshold): + + if not isCuPy: + cmp_bytes, outSize_ptr = cuszx_host_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) + else: + #cmp_bytes, outSize_ptr = cuszp_device_compress(data, r2r_error, num_elements, r2r_threshold) + cmp_bytes, outSize_ptr = cuszx_device_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE,r2r_threshold) + # cmp_bytes, outSize_ptr = quant_device_compress(data, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) + del data + torch.cuda.empty_cache() + return cmp_bytes, outSize_ptr + + ### Decompression API with cuSZx ### + # Parameters: + # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array + # - cmp_bytes = Unsigned char pointer to compressed bytes + # - num_elements = Number of floating point elements in original data + # Returns: + # - decompressed_data = Float32 pointer to decompressed data + # + # Notes: Use ctypes to cast decompressed data to Numpy or CuPy type + + def cuszx_decompress(self, isCuPy, cmp_bytes, cmpsize, num_elements, owner, dtype): + if not isCuPy: + decompressed_data = cuszx_host_decompress(num_elements, cmp_bytes) + else: + # cuszx_device_decompress(nbEle, cmpBytes, owner, dtype) + decompressed_data = cuszx_device_decompress(num_elements, cmp_bytes, owner,dtype) +# oriData, absErrBound, nbEle, blockSize,threshold + # decompressed_data = quant_device_decompress(num_elements, cmp_bytes, owner,dtype) + return decompressed_data + class CUSZCompressor(Compressor): def __init__(self, r2r_error=1e-3, r2r_threshold=1e-3): self.r2r_error = r2r_error @@ -203,8 +419,8 @@ def cuszx_compress(self, isCuPy, data, num_elements, r2r_error, r2r_threshold): cmp_bytes, outSize_ptr = cuszx_host_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) else: #cmp_bytes, outSize_ptr = cuszp_device_compress(data, r2r_error, num_elements, r2r_threshold) - - cmp_bytes, outSize_ptr = quant_device_compress(data, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) + cmp_bytes, outSize_ptr = cusz_device_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE,r2r_threshold) + # cmp_bytes, outSize_ptr = quant_device_compress(data, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) del data torch.cuda.empty_cache() return cmp_bytes, outSize_ptr @@ -223,7 +439,8 @@ def cuszx_decompress(self, isCuPy, cmp_bytes, cmpsize, num_elements, owner, dtyp if not isCuPy: decompressed_data = cuszx_host_decompress(num_elements, cmp_bytes) else: - #decompressed_data = cuszp_device_decompress(num_elements, cmp_bytes, cmpsize, owner,dtype) + # cuszx_device_decompress(nbEle, cmpBytes, owner, dtype) + decompressed_data = cusz_device_decompress(num_elements, cmp_bytes, owner,dtype) # oriData, absErrBound, nbEle, blockSize,threshold - decompressed_data = quant_device_decompress(num_elements, cmp_bytes, owner,dtype) - return decompressed_data + # decompressed_data = quant_device_decompress(num_elements, cmp_bytes, owner,dtype) + return decompressed_data \ No newline at end of file From ec6aee82149610b2f8dd407d16a3231ed0555f38 Mon Sep 17 00:00:00 2001 From: Milan Shah Date: Tue, 11 Jul 2023 11:36:51 -0400 Subject: [PATCH 096/126] Can change compressor with flag --- bench/qc_simulation/src/simulators/qtensor.py | 8 ++++++++ qtensor/compression/Compressor.py | 20 +++++++++---------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git 
a/bench/qc_simulation/src/simulators/qtensor.py b/bench/qc_simulation/src/simulators/qtensor.py index c64f728c..15da83ce 100644 --- a/bench/qc_simulation/src/simulators/qtensor.py +++ b/bench/qc_simulation/src/simulators/qtensor.py @@ -183,9 +183,17 @@ def simulate(in_file, out_file, backend = qtensor.contraction_backends.get_backend(backend) if compress is not None: if compress == 'szx': + print(f"{r2r_error=} {r2r_threshold=}") + compressor = qtensor.compression.CUSZXCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) + compressor = qtensor.compression.ProfileCompressor(compressor) + elif compress == 'cusz': print(f"{r2r_error=} {r2r_threshold=}") compressor = qtensor.compression.CUSZCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) compressor = qtensor.compression.ProfileCompressor(compressor) + elif compress == 'torch': + print(f"{r2r_error=} {r2r_threshold=}") + compressor = qtensor.compression.TorchCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) + compressor = qtensor.compression.ProfileCompressor(compressor) else: raise ValueError(f"Unknown compression algorithm: {compress}") backend = qtensor.contraction_backends.CompressionBackend(backend, compressor, M) diff --git a/qtensor/compression/Compressor.py b/qtensor/compression/Compressor.py index fda3670b..76b22087 100644 --- a/qtensor/compression/Compressor.py +++ b/qtensor/compression/Compressor.py @@ -226,11 +226,11 @@ def free_decompressed(self): # continue #else: #print("CUDA Free", x) - #cupy.cuda.runtime.free(x) - del x - cupy.get_default_memory_pool().free_all_blocks() - cupy.get_default_pinned_memory_pool().free_all_blocks() - torch.cuda.empty_cache() + cupy.cuda.runtime.free(x) + # del x + # cupy.get_default_memory_pool().free_all_blocks() + # cupy.get_default_pinned_memory_pool().free_all_blocks() + # torch.cuda.empty_cache() self.decompressed_own = [] def free_compressed(self, ptr): @@ -343,11 +343,11 @@ def free_decompressed(self): # continue #else: #print("CUDA Free", x) - #cupy.cuda.runtime.free(x) - del x - cupy.get_default_memory_pool().free_all_blocks() - cupy.get_default_pinned_memory_pool().free_all_blocks() - torch.cuda.empty_cache() + cupy.cuda.runtime.free(x) + # del x + # cupy.get_default_memory_pool().free_all_blocks() + # cupy.get_default_pinned_memory_pool().free_all_blocks() + # torch.cuda.empty_cache() self.decompressed_own = [] def free_compressed(self, ptr): From df1f8c1b0fe3e4b4493ff588e1529d93d3635005 Mon Sep 17 00:00:00 2001 From: Milan Shah Date: Tue, 11 Jul 2023 11:53:17 -0400 Subject: [PATCH 097/126] Bug fixes for freeing pointers --- bench/qc_simulation/src/simulators/qtensor.py | 7 ++++--- qtensor/compression/Compressor.py | 8 ++++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/bench/qc_simulation/src/simulators/qtensor.py b/bench/qc_simulation/src/simulators/qtensor.py index 15da83ce..86ef216c 100644 --- a/bench/qc_simulation/src/simulators/qtensor.py +++ b/bench/qc_simulation/src/simulators/qtensor.py @@ -175,6 +175,7 @@ def simulate(in_file, out_file, """ import time from qtensor.contraction_algos import bucket_elimination + from qtensor.compression.Compressor import CUSZCompressor, CUSZXCompressor, TorchCompressor import cupy cupy.cuda.profiler.start() prep_data = read_preps(in_file) @@ -184,15 +185,15 @@ def simulate(in_file, out_file, if compress is not None: if compress == 'szx': print(f"{r2r_error=} {r2r_threshold=}") - compressor = qtensor.compression.CUSZXCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) + compressor = 
CUSZXCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) compressor = qtensor.compression.ProfileCompressor(compressor) elif compress == 'cusz': print(f"{r2r_error=} {r2r_threshold=}") - compressor = qtensor.compression.CUSZCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) + compressor = CUSZCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) compressor = qtensor.compression.ProfileCompressor(compressor) elif compress == 'torch': print(f"{r2r_error=} {r2r_threshold=}") - compressor = qtensor.compression.TorchCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) + compressor = TorchCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) compressor = qtensor.compression.ProfileCompressor(compressor) else: raise ValueError(f"Unknown compression algorithm: {compress}") diff --git a/qtensor/compression/Compressor.py b/qtensor/compression/Compressor.py index 76b22087..ac94fdf2 100644 --- a/qtensor/compression/Compressor.py +++ b/qtensor/compression/Compressor.py @@ -256,9 +256,9 @@ def compress(self, data): dtype = data.dtype cmp_bytes, outSize_ptr = self.cuszx_compress(isCuPy, data, num_elements_eff, self.r2r_error, self.r2r_threshold) - return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr) + # return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr) - # return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr.contents.value) + return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr.contents.value) def compress_size(self, ptr): return ptr[5] @@ -373,9 +373,9 @@ def compress(self, data): dtype = data.dtype cmp_bytes, outSize_ptr = self.cuszx_compress(isCuPy, data, num_elements_eff, self.r2r_error, self.r2r_threshold) - return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr) + # return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr) - # return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr.contents.value) + return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr.contents.value) def compress_size(self, ptr): return ptr[5] From d34e0d71057836da3f6314b9676c27a2049a0811 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Tue, 18 Jul 2023 13:07:05 +0000 Subject: [PATCH 098/126] add qaoa parameters config to circuit genm --- bench/qc_simulation/src/circuit_gen/qaoa.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/bench/qc_simulation/src/circuit_gen/qaoa.py b/bench/qc_simulation/src/circuit_gen/qaoa.py index 6f5a002d..8dac5be6 100644 --- a/bench/qc_simulation/src/circuit_gen/qaoa.py +++ b/bench/qc_simulation/src/circuit_gen/qaoa.py @@ -1,5 +1,6 @@ import networkx import numpy as np +from qtensor.tools import BETHE_QAOA_VALUES def generate_ibm_connectivity(arch): """ @@ -27,6 +28,8 @@ def coupling_map_from_provider(p_class): # IBM quantum volume 64 from qiskit.providers.fake_provider import FakeCairoV2 return coupling_map_from_provider(FakeCairoV2) + else: + raise ValueError("IBM architecture {} not supported".format(arch)) def save_terms_format(file, terms): """ @@ -45,8 +48,10 @@ def generate_graph(n, d, type="random"): elif type[:4] == "ibm_": arch = type[4:] return generate_ibm_connectivity(arch) + else: + raise ValueError("Unknown graph type {}".format(type)) -def generate_maxcut(out_file, N, p, d, graph_type='random', seed=None): +def generate_maxcut(out_file, N, p, d, graph_type='random', seed=None, parameters='random'): """ Generate a random regular maxcut 
problem @@ -55,16 +60,23 @@ def generate_maxcut(out_file, N, p, d, graph_type='random', seed=None): N (int): Number of nodes p (int): Number of layers d (int): Random regular graph degree + parameters (str): One of ["random", "fixed_angles"] Returns: str: Path to output file """ - G = generate_graph(N, d, graph_type) + G: networkx.Graph = generate_graph(N, d, graph_type) terms = [] for u, v in G.edges: terms.append((1, (u, v))) - gamma = np.random.uniform(0, 2 * np.pi, p) - beta = np.random.uniform(0, np.pi, p) + if parameters == "random": + gamma = np.random.uniform(0, 2 * np.pi, p) + beta = np.random.uniform(0, np.pi, p) + elif parameters == "fixed_angles": + gammabeta = np.array(BETHE_QAOA_VALUES[str(p)]['angles']) + gamma, beta = gammabeta[:p]*2, gammabeta[p:] + else: + raise ValueError("Unknown parameters type {}. Use one of ['random', 'fixed_angles']".format(parameters)) pb = {"terms": terms, "gamma": gamma.tolist(), "beta": beta.tolist()} return save_terms_format(out_file, pb) From 27d44a2e6f74ef61e3545c231e064240b3116c26 Mon Sep 17 00:00:00 2001 From: Milan Kartik Shah Date: Mon, 24 Jul 2023 12:12:44 -0400 Subject: [PATCH 099/126] Added new compressor: combines quantization with lossless compression of quant codes --- bench/qc_simulation/src/simulators/qtensor.py | 530 +++++++++--------- qtensor/compression/Compressor.py | 122 +++- qtensor/compression/newsz/README_NEWSZ.md | 15 + qtensor/compression/newsz/newsz.cu | 248 ++++++++ qtensor/compression/newsz/newsz.h | 3 + qtensor/compression/newsz/newsz_wrapper.cu | 21 + qtensor/compression/newsz/newsz_wrapper.py | 159 ++++++ 7 files changed, 834 insertions(+), 264 deletions(-) create mode 100644 qtensor/compression/newsz/README_NEWSZ.md create mode 100644 qtensor/compression/newsz/newsz.cu create mode 100644 qtensor/compression/newsz/newsz.h create mode 100644 qtensor/compression/newsz/newsz_wrapper.cu create mode 100644 qtensor/compression/newsz/newsz_wrapper.py diff --git a/bench/qc_simulation/src/simulators/qtensor.py b/bench/qc_simulation/src/simulators/qtensor.py index 86ef216c..d80f68ad 100644 --- a/bench/qc_simulation/src/simulators/qtensor.py +++ b/bench/qc_simulation/src/simulators/qtensor.py @@ -1,263 +1,267 @@ -import qtensor -import qtree -import numpy as np - -# -- QAOA generic parser - -class QAOAComposer(qtensor.DefaultQAOAComposer): - def __init__(self, N, terms, **kwargs): - self.n_qubits = N - # from ccomp (Can't call DefaultQAOA Composer since need graph) - self.builder = self._get_builder() - # gamma and beta - self.params = kwargs - # - self.terms = terms - self.qubit_map = {n: i for i, n in enumerate(range(N))} - - def cost_operator_circuit(self, gamma): - for factor, term in self.terms: - t_mapped = [self.qubit_map[i] for i in term] - self.append_Z_term(term, gamma) - - def append_Z_term(self, term, gamma): - if len(term) == 2: - self.apply_gate(self.operators.ZZ, term[0], term[1], alpha=2*gamma) - #self.apply_gate(qtensor.OpFactory.ZZFull, term[0], term[1], alpha=2*gamma) - elif len(term) == 4: - self.apply_gate(self.operators.Z4, *term, alpha=2*gamma) - else: - raise ValueError(f"Invalid QAOA term length: {len(term)}") - - def mixer_operator(self, beta): - qubits = self.qubit_map.values() - for qubit in qubits: - self.x_term(qubit, beta) - -def parse_qaoa(data): - import json - data = json.loads(data) - terms = data["terms"] - gamma = np.array(data["gamma"])/np.pi/2 - beta = np.array(data["beta"])/np.pi - N = len(set(sum([t[1] for t in terms], []))) - composer = QAOAComposer(N, terms, gamma=gamma, beta=beta) - 
composer.ansatz_state() - return composer.circuit -# -- - -def read_circ(circ_f, type=None): - - if type is None: - type = circ_f.path.name.split(".")[-1] - - print("Reading circuit of type", type) - if type == "jsonterms": - b = circ_f.f.read() - return parse_qaoa(b) - - elif type == "qasm": - from qiskit import QuantumCircuit - b = circ_f.f.read() - str = b.decode('utf-8') - - qiskit_circuit = QuantumCircuit.from_qasm_str(str) - return qtree.operators.from_qiskit_circuit(qiskit_circuit) - else: - b = circ_f.f.read() - str = b.decode('utf-8') - import io - f = io.StringIO(str) - N, circ = qtree.operators.read_circuit_stream(f) - return sum(circ, []) - -def read_preps(prep_f): - import pickle - return pickle.load(prep_f.f) - -def write_preps(peo, prep_f): - import pickle - pickle.dump(peo, open(prep_f, 'wb')) - -def write_json(data, out_file): - import json - with open(out_file, 'w') as f: - json.dump(data, f) - # This newline plays nice when cat-ing multiple files - f.write('\n') - -def preprocess(in_file, out_file, O='greedy', S=None, M=30, after_slice='run-again'): - """ - Arguments: - in_file: input file - out_file: output file - O: ordering algorithm - S: slicing algorithm - M: Memory limit for slicing - """ - circ = read_circ(in_file) - tn = qtensor.optimisation.QtreeTensorNet.from_qtree_gates(circ) - opt = qtensor.toolbox.get_ordering_algo(O) - if S: - # ignore argument type mismatch for pyright -- opt can be `Optimizer` - # pyright: reportGeneralTypeIssues=false - opt = qtensor.optimisation.TreeTrimSplitter( - tw_bias=0, max_tw=M, base_ordering=opt, - peo_after_slice_strategy=after_slice - ) - - peo, par_vars, _ = opt.optimize(tn) - # --dbg - import networkx as nx - graph = tn.get_line_graph() - ignore_vars = tn.bra_vars + tn.ket_vars - for pv in par_vars: - graph.remove_node(int(pv)) - components = list(nx.connected_components(graph)) - print(f"Sliced graph # nodes: {graph.number_of_nodes()} and #components: {len(components)} with sizes {[len(c) for c in components]}") - print(f"peo size without par_vars and ignore_vars: {len(peo) - len(par_vars) - len(ignore_vars)}") - - print() - # -- - else: - peo, _ = opt.optimize(tn) - par_vars = [] - print("W", opt.treewidth) - # -- qtensor_estim - prep_data = (peo, par_vars, tn) - write_preps(prep_data, out_file) - - -def estimate(in_file, out_file, C=100, M=30, F=1e12, T=1e9, **kwargs): - """ - Arguments: - in_file: file with preprocessed data - out_file: file to write the results to - C: Compression ratio - M: Memory limit in log2(b/16) - F: assumed FLOPS - T: Throughput of compression - """ - from qtensor.compression.cost_estimation import compressed_contraction_cost, Cost - from dataclasses import asdict - import json - prep_data = read_preps(in_file) - peo, par_vars, tn = prep_data - - tn.slice({i: slice(0, 1) for i in par_vars}) - peo = peo[:len(peo) - len(par_vars)] - costs: list[Cost] = compressed_contraction_cost(tn, peo, mem_limit=M, compression_ratio=C) - totals: Cost = sum(costs[1:], costs[0]) - time = totals.time(F, T, T, M) - C = asdict(totals) - C['time'] = time*2**len(par_vars) - C['slices'] = 2**len(par_vars) - print("C", C) - out_file += ".json" - write_json(C, out_file) - return out_file - -def simulate(in_file, out_file, - backend='einsum', - compress=None, - M=29, - r2r_error=1e-3, r2r_threshold=1e-3, - **kwargs): - """ - Args: - in_file: file with preprocessed data - out_file: file to write the results to - backend: backend to use - compress: compression algorithm - M: memory threshold for compression - r2r_error: 
relative error for compression - r2r_threshold: relative threshold for compression - """ - import time - from qtensor.contraction_algos import bucket_elimination - from qtensor.compression.Compressor import CUSZCompressor, CUSZXCompressor, TorchCompressor - import cupy - cupy.cuda.profiler.start() - prep_data = read_preps(in_file) - peo, par_vars, tn = prep_data - - backend = qtensor.contraction_backends.get_backend(backend) - if compress is not None: - if compress == 'szx': - print(f"{r2r_error=} {r2r_threshold=}") - compressor = CUSZXCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) - compressor = qtensor.compression.ProfileCompressor(compressor) - elif compress == 'cusz': - print(f"{r2r_error=} {r2r_threshold=}") - compressor = CUSZCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) - compressor = qtensor.compression.ProfileCompressor(compressor) - elif compress == 'torch': - print(f"{r2r_error=} {r2r_threshold=}") - compressor = TorchCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) - compressor = qtensor.compression.ProfileCompressor(compressor) - else: - raise ValueError(f"Unknown compression algorithm: {compress}") - backend = qtensor.contraction_backends.CompressionBackend(backend, compressor, M) - from qtensor.contraction_backends.performance_measurement_decorator import MemProfBackend - backend = MemProfBackend(backend) - - relabelid = {} - for tensor in tn.tensors: - for i in tensor.indices: - relabelid[int(i)] = i - - slice_ext = {relabelid[int(i)]: 0 for i in par_vars} - - if len(par_vars) > 0: - print("Parvars", par_vars) - print(f"Detected {len(par_vars)} slice variables") - sim = qtensor.QtreeSimulator(backend=backend) - sim.tn = tn - sim.tn.backend = backend - sim.peo = peo - sim._slice_relabel_buckets(slice_ext) - buckets = sim.tn.buckets - # --dbg - #ignore_vars = sim.tn.bra_vars + sim.tn.ket_vars - #graph = qtree.graph_model.importers.buckets2graph(buckets, ignore_vars) - #graph, label_dict = qtree.graph_model.relabel_graph_nodes( - #graph, dict(zip(graph.nodes, np.array(list(graph.nodes)) - 127*2)) - #) - #import networkx as nx - #components = list(nx.connected_components(graph)) - #print(f"Sliced graph # nodes: {graph.number_of_nodes()} and #components: {len(components)} with sizes {[len(c) for c in components]}") - #print(f"peo size without par_vars and ignore_vars: {len(peo) - len(ignore_vars)}") - # -- - - start = time.time() - for i in range(2**0): - print(f"P {i}", end='', flush=True) - bcopy = [b[:] for b in buckets] - res = bucket_elimination( - bcopy, backend, - n_var_nosum=len(tn.free_vars) - ) - del bcopy - print("Result", res.data.flatten()[0]) - time.sleep(0.5) - sim_result = backend.get_result_data(res).flatten()[0] - print("Simulation result:", sim_result) - end = time.time() - print("Elapsed", end - start) - out_file += ".json" - C = {'time': 2**len(par_vars)*(end - start)} - C['elapsed'] = (end - start) - C['memory'] = backend.max_mem - C['memory_history'] = backend.mem_history - C['nvmemory'] = backend.nvsmi_max_mem - C['result'] = { - "Re": np.real(sim_result).tolist(), - "Im": np.imag(sim_result).tolist() - } - if compress is not None: - if isinstance(compressor, qtensor.compression.ProfileCompressor): - C['compression'] = compressor.get_profile_data_json() - - write_json(C, out_file) - cupy.cuda.profiler.stop() - return out_file +import qtensor +import qtree +import numpy as np + +# -- QAOA generic parser + +class QAOAComposer(qtensor.DefaultQAOAComposer): + def __init__(self, N, terms, **kwargs): + self.n_qubits = N 
+ # from ccomp (Can't call DefaultQAOA Composer since need graph) + self.builder = self._get_builder() + # gamma and beta + self.params = kwargs + # + self.terms = terms + self.qubit_map = {n: i for i, n in enumerate(range(N))} + + def cost_operator_circuit(self, gamma): + for factor, term in self.terms: + t_mapped = [self.qubit_map[i] for i in term] + self.append_Z_term(term, gamma) + + def append_Z_term(self, term, gamma): + if len(term) == 2: + self.apply_gate(self.operators.ZZ, term[0], term[1], alpha=2*gamma) + #self.apply_gate(qtensor.OpFactory.ZZFull, term[0], term[1], alpha=2*gamma) + elif len(term) == 4: + self.apply_gate(self.operators.Z4, *term, alpha=2*gamma) + else: + raise ValueError(f"Invalid QAOA term length: {len(term)}") + + def mixer_operator(self, beta): + qubits = self.qubit_map.values() + for qubit in qubits: + self.x_term(qubit, beta) + +def parse_qaoa(data): + import json + data = json.loads(data) + terms = data["terms"] + gamma = np.array(data["gamma"])/np.pi/2 + beta = np.array(data["beta"])/np.pi + N = len(set(sum([t[1] for t in terms], []))) + composer = QAOAComposer(N, terms, gamma=gamma, beta=beta) + composer.ansatz_state() + return composer.circuit +# -- + +def read_circ(circ_f, type=None): + + if type is None: + type = circ_f.path.name.split(".")[-1] + + print("Reading circuit of type", type) + if type == "jsonterms": + b = circ_f.f.read() + return parse_qaoa(b) + + elif type == "qasm": + from qiskit import QuantumCircuit + b = circ_f.f.read() + str = b.decode('utf-8') + + qiskit_circuit = QuantumCircuit.from_qasm_str(str) + return qtree.operators.from_qiskit_circuit(qiskit_circuit) + else: + b = circ_f.f.read() + str = b.decode('utf-8') + import io + f = io.StringIO(str) + N, circ = qtree.operators.read_circuit_stream(f) + return sum(circ, []) + +def read_preps(prep_f): + import pickle + return pickle.load(prep_f.f) + +def write_preps(peo, prep_f): + import pickle + pickle.dump(peo, open(prep_f, 'wb')) + +def write_json(data, out_file): + import json + with open(out_file, 'w') as f: + json.dump(data, f) + # This newline plays nice when cat-ing multiple files + f.write('\n') + +def preprocess(in_file, out_file, O='greedy', S=None, M=30, after_slice='run-again'): + """ + Arguments: + in_file: input file + out_file: output file + O: ordering algorithm + S: slicing algorithm + M: Memory limit for slicing + """ + circ = read_circ(in_file) + tn = qtensor.optimisation.QtreeTensorNet.from_qtree_gates(circ) + opt = qtensor.toolbox.get_ordering_algo(O) + if S: + # ignore argument type mismatch for pyright -- opt can be `Optimizer` + # pyright: reportGeneralTypeIssues=false + opt = qtensor.optimisation.TreeTrimSplitter( + tw_bias=0, max_tw=M, base_ordering=opt, + peo_after_slice_strategy=after_slice + ) + + peo, par_vars, _ = opt.optimize(tn) + # --dbg + import networkx as nx + graph = tn.get_line_graph() + ignore_vars = tn.bra_vars + tn.ket_vars + for pv in par_vars: + graph.remove_node(int(pv)) + components = list(nx.connected_components(graph)) + print(f"Sliced graph # nodes: {graph.number_of_nodes()} and #components: {len(components)} with sizes {[len(c) for c in components]}") + print(f"peo size without par_vars and ignore_vars: {len(peo) - len(par_vars) - len(ignore_vars)}") + + print() + # -- + else: + peo, _ = opt.optimize(tn) + par_vars = [] + print("W", opt.treewidth) + # -- qtensor_estim + prep_data = (peo, par_vars, tn) + write_preps(prep_data, out_file) + + +def estimate(in_file, out_file, C=100, M=30, F=1e12, T=1e9, **kwargs): + """ + Arguments: + 
in_file: file with preprocessed data + out_file: file to write the results to + C: Compression ratio + M: Memory limit in log2(b/16) + F: assumed FLOPS + T: Throughput of compression + """ + from qtensor.compression.cost_estimation import compressed_contraction_cost, Cost + from dataclasses import asdict + import json + prep_data = read_preps(in_file) + peo, par_vars, tn = prep_data + + tn.slice({i: slice(0, 1) for i in par_vars}) + peo = peo[:len(peo) - len(par_vars)] + costs: list[Cost] = compressed_contraction_cost(tn, peo, mem_limit=M, compression_ratio=C) + totals: Cost = sum(costs[1:], costs[0]) + time = totals.time(F, T, T, M) + C = asdict(totals) + C['time'] = time*2**len(par_vars) + C['slices'] = 2**len(par_vars) + print("C", C) + out_file += ".json" + write_json(C, out_file) + return out_file + +def simulate(in_file, out_file, + backend='einsum', + compress=None, + M=29, + r2r_error=1e-3, r2r_threshold=1e-3, + **kwargs): + """ + Args: + in_file: file with preprocessed data + out_file: file to write the results to + backend: backend to use + compress: compression algorithm + M: memory threshold for compression + r2r_error: relative error for compression + r2r_threshold: relative threshold for compression + """ + import time + from qtensor.contraction_algos import bucket_elimination + from qtensor.compression.Compressor import CUSZCompressor, CUSZXCompressor, TorchCompressor, NEWSZCompressor + import cupy + cupy.cuda.profiler.start() + prep_data = read_preps(in_file) + peo, par_vars, tn = prep_data + + backend = qtensor.contraction_backends.get_backend(backend) + if compress is not None: + if compress == 'szx': + print(f"{r2r_error=} {r2r_threshold=}") + compressor = CUSZXCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) + compressor = qtensor.compression.ProfileCompressor(compressor) + elif compress == 'cusz': + print(f"{r2r_error=} {r2r_threshold=}") + compressor = CUSZCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) + compressor = qtensor.compression.ProfileCompressor(compressor) + elif compress == 'torch': + print(f"{r2r_error=} {r2r_threshold=}") + compressor = TorchCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) + compressor = qtensor.compression.ProfileCompressor(compressor) + elif compress == 'newsz': + print(f"{r2r_error=} {r2r_threshold=}") + compressor = NEWSZCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) + compressor = qtensor.compression.ProfileCompressor(compressor) + else: + raise ValueError(f"Unknown compression algorithm: {compress}") + backend = qtensor.contraction_backends.CompressionBackend(backend, compressor, M) + from qtensor.contraction_backends.performance_measurement_decorator import MemProfBackend + backend = MemProfBackend(backend) + + relabelid = {} + for tensor in tn.tensors: + for i in tensor.indices: + relabelid[int(i)] = i + + slice_ext = {relabelid[int(i)]: 0 for i in par_vars} + + if len(par_vars) > 0: + print("Parvars", par_vars) + print(f"Detected {len(par_vars)} slice variables") + sim = qtensor.QtreeSimulator(backend=backend) + sim.tn = tn + sim.tn.backend = backend + sim.peo = peo + sim._slice_relabel_buckets(slice_ext) + buckets = sim.tn.buckets + # --dbg + #ignore_vars = sim.tn.bra_vars + sim.tn.ket_vars + #graph = qtree.graph_model.importers.buckets2graph(buckets, ignore_vars) + #graph, label_dict = qtree.graph_model.relabel_graph_nodes( + #graph, dict(zip(graph.nodes, np.array(list(graph.nodes)) - 127*2)) + #) + #import networkx as nx + #components = list(nx.connected_components(graph)) + 
#print(f"Sliced graph # nodes: {graph.number_of_nodes()} and #components: {len(components)} with sizes {[len(c) for c in components]}") + #print(f"peo size without par_vars and ignore_vars: {len(peo) - len(ignore_vars)}") + # -- + + start = time.time() + for i in range(2**0): + print(f"P {i}", end='', flush=True) + bcopy = [b[:] for b in buckets] + res = bucket_elimination( + bcopy, backend, + n_var_nosum=len(tn.free_vars) + ) + del bcopy + print("Result", res.data.flatten()[0]) + time.sleep(0.5) + sim_result = backend.get_result_data(res).flatten()[0] + print("Simulation result:", sim_result) + end = time.time() + print("Elapsed", end - start) + out_file += ".json" + C = {'time': 2**len(par_vars)*(end - start)} + C['elapsed'] = (end - start) + C['memory'] = backend.max_mem + C['memory_history'] = backend.mem_history + C['nvmemory'] = backend.nvsmi_max_mem + C['result'] = { + "Re": np.real(sim_result).tolist(), + "Im": np.imag(sim_result).tolist() + } + if compress is not None: + if isinstance(compressor, qtensor.compression.ProfileCompressor): + C['compression'] = compressor.get_profile_data_json() + + write_json(C, out_file) + cupy.cuda.profiler.stop() + return out_file diff --git a/qtensor/compression/Compressor.py b/qtensor/compression/Compressor.py index ac94fdf2..71302263 100644 --- a/qtensor/compression/Compressor.py +++ b/qtensor/compression/Compressor.py @@ -12,12 +12,16 @@ sys.path.append('./cusz/src') sys.path.append(str(Path(__file__).parent/'torch_quant')) sys.path.append('./torch_quant') +sys.path.append(str(Path(__file__).parent/'newsz')) +sys.path.append('./newsz') + import torch try: from cuszx_wrapper import cuszx_host_compress, cuszx_host_decompress, cuszx_device_compress, cuszx_device_decompress # from cuSZp_wrapper import cuszp_device_compress, cuszp_device_decompress from cusz_wrapper import cusz_device_compress, cusz_device_decompress from torch_quant_perchannel import quant_device_compress, quant_device_decompress + from newsz_wrapper import newsz_device_compress, newsz_device_decompress except: print("import failed") # Silently fail on missing build of cuszx @@ -210,6 +214,122 @@ def cuszx_decompress(self, isCuPy, cmp_bytes, cmpsize, num_elements, owner, dtyp decompressed_data = quant_device_decompress(num_elements, cmp_bytes, owner,dtype) return decompressed_data +class NEWSZCompressor(Compressor): + def __init__(self, r2r_error=1e-3, r2r_threshold=1e-3): + self.r2r_error = r2r_error + self.r2r_threshold = r2r_threshold + self.decompressed_own = [] + + def free_decompressed(self): + import cupy + print("Cleanup", len(self.decompressed_own)) + for x in self.decompressed_own: + #print(x) + #if x == None: + # continue + #else: + #print("CUDA Free", x) + cupy.cuda.runtime.free(x) + # del x + # cupy.get_default_memory_pool().free_all_blocks() + # cupy.get_default_pinned_memory_pool().free_all_blocks() + # torch.cuda.empty_cache() + self.decompressed_own = [] + + def free_compressed(self, ptr): + import ctypes, cupy + cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = ptr + p_decompressed_ptr = ctypes.addressof(cmp_bytes[0]) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + decompressed_int = p_decompressed_int.contents + cupy.cuda.runtime.free(decompressed_int.value) + + def compress(self, data): + import cupy + if isinstance(data, cupy.ndarray): + isCuPy = True + else: + isCuPy = False + num_elements = data.size + # Adapt 
numele depending on itemsize + itemsize = data.dtype.itemsize + num_elements_eff = int(num_elements*itemsize/4) + + dtype = data.dtype + cmp_bytes, outSize_ptr = self.cuszx_compress(isCuPy, data, num_elements_eff, self.r2r_error, self.r2r_threshold) + # return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr) + + return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr.contents.value) + + def compress_size(self, ptr): + return ptr[5] + + def decompress(self, obj): + import cupy + import ctypes + cmp_bytes, num_elements_eff, isCuPy, shape, dtype, cmpsize = obj + decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, cmpsize, num_elements_eff, self, dtype) + arr_cp = decompressed_ptr[0] + self.decompressed_own.append(decompressed_ptr[1]) + + # -- Workaround to convert GPU pointer to int + # p_decompressed_ptr = ctypes.addressof(decompressed_ptr) + # # cast to int64 pointer + # # (effectively converting pointer to pointer to addr to pointer to int64) + # p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + # decompressed_int = p_decompressed_int.contents + # # -- + # self.decompressed_own.append(decompressed_int.value) + # mem = cupy.cuda.UnownedMemory(decompressed_int.value, num_elements_eff, self, device_id=0) + # mem_ptr = cupy.cuda.memory.MemoryPointer(mem, 0) + arr = cupy.reshape(arr_cp, shape) + # self.decompressed_own.append(arr) + # arr = cupy.ndarray(shape, dtype=dtype, memptr=mem_ptr) + return arr + + ### Compression API with cuSZx ### + # Parameters: + # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array + # - data = Numpy or Cupy ndarray, assumed to be 1-D, np.float32 type + # - num_elements = Number of floating point elements in data + # - r2r_error = relative-to-value-range error bound for lossy compression + # - r2r_threshold = relative-to-value-range threshold to floor values to zero + # Returns: + # - cmp_bytes = Unsigned char pointer to compressed bytes + # - outSize_ptr = Pointer to size_t representing length in bytes of cmp_bytes + def cuszx_compress(self, isCuPy, data, num_elements, r2r_error, r2r_threshold): + + if not isCuPy: + cmp_bytes, outSize_ptr = cuszx_host_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) + else: + #cmp_bytes, outSize_ptr = cuszp_device_compress(data, r2r_error, num_elements, r2r_threshold) + cmp_bytes, outSize_ptr = newsz_device_compress(data,num_elements, CUSZX_BLOCKSIZE,r2r_threshold) + # cmp_bytes, outSize_ptr = quant_device_compress(data, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) + del data + torch.cuda.empty_cache() + return cmp_bytes, outSize_ptr + + ### Decompression API with cuSZx ### + # Parameters: + # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array + # - cmp_bytes = Unsigned char pointer to compressed bytes + # - num_elements = Number of floating point elements in original data + # Returns: + # - decompressed_data = Float32 pointer to decompressed data + # + # Notes: Use ctypes to cast decompressed data to Numpy or CuPy type + + def cuszx_decompress(self, isCuPy, cmp_bytes, cmpsize, num_elements, owner, dtype): + if not isCuPy: + decompressed_data = cuszx_host_decompress(num_elements, cmp_bytes) + else: + # cuszx_device_decompress(nbEle, cmpBytes, owner, dtype) + decompressed_data = newsz_device_decompress(num_elements, cmp_bytes, owner,dtype) +# oriData, absErrBound, nbEle, blockSize,threshold + # decompressed_data = quant_device_decompress(num_elements, cmp_bytes, owner,dtype) + 
return decompressed_data class CUSZXCompressor(Compressor): def __init__(self, r2r_error=1e-3, r2r_threshold=1e-3): @@ -443,4 +563,4 @@ def cuszx_decompress(self, isCuPy, cmp_bytes, cmpsize, num_elements, owner, dtyp decompressed_data = cusz_device_decompress(num_elements, cmp_bytes, owner,dtype) # oriData, absErrBound, nbEle, blockSize,threshold # decompressed_data = quant_device_decompress(num_elements, cmp_bytes, owner,dtype) - return decompressed_data \ No newline at end of file + return decompressed_data diff --git a/qtensor/compression/newsz/README_NEWSZ.md b/qtensor/compression/newsz/README_NEWSZ.md new file mode 100644 index 00000000..e6dbcda2 --- /dev/null +++ b/qtensor/compression/newsz/README_NEWSZ.md @@ -0,0 +1,15 @@ +# Building newSZ + +1. Clone the NVCOMP repository from https://github.com/NVIDIA/nvcomp.git + +2. Change to 'branch-2.2' branch. (`git checkout branch-2.2`) + +3. Follow build instructions in NVCOMP repository (you can ignore -DNVCOMP_EXTS_ROOT flag) + +4. Copy shared library `nvcomp/build/lib/libnvcomp.so` to current directory (`qtensor/compression/newsz/`) + +5. Run the following command: `nvcc --shared --compiler-options '-fPIC' -lnvcomp -o libnewsz_wrapper.so *.cu --library-path= --library=nvcomp -I/PATH_TO_NVCOMP/nvcomp/build/include/` + +# Running newSZ + +- Specify --compress=newsz when running main.py diff --git a/qtensor/compression/newsz/newsz.cu b/qtensor/compression/newsz/newsz.cu new file mode 100644 index 00000000..3ef211d5 --- /dev/null +++ b/qtensor/compression/newsz/newsz.cu @@ -0,0 +1,248 @@ +#include +#include "newsz.h" +#include +#include +#include +// #include "cuCompactor.cuh" + +#include "nvcomp/lz4.hpp" +#include "nvcomp.hpp" +#include "nvcomp/nvcompManagerFactory.hpp" + +#define BLKS 40 +#define THDS 128 +#define FULL_MASK 0xffffffff + +__device__ int g_ints; + +struct int_predicate +{ + + __host__ __device__ + bool operator()(const int x) + { + return x>0; + } +}; + +struct to_copy +{ + __host__ __device__ + bool operator()(const uint8_t x) + { + return x==1; + } +}; + + + + +__global__ void compress(float *data, float *scales, float *zeropts, int8_t *out){ + int bid = blockIdx.x; + int tid = threadIdx.x; + extern __shared__ float scratchpad[]; + __shared__ float min; + __shared__ float max; + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage1; + + float item = data[blockIdx.x*blockDim.x+threadIdx.x]; + + float tmax = BlockReduce(temp_storage1).Reduce(item, cub::Max()); + float tmin = BlockReduce(temp_storage1).Reduce(item, cub::Min()); + + if (threadIdx.x==0) + { + max = tmax; + min = tmin; + } + + __syncthreads(); + + float vrange = max - min; + float scale = vrange/((2^8) - 1); + int zeropt = -1*lrintf(min*scale) - (2^7); + + int q_item = lrintf(item/scale) + zeropt; + + // Clamp quantized value + if(q_item>127)q_item = 127; + if(q_item <-128)q_item = -128; + int8_t q_val = (int8_t)(0xff & q_item); + out[blockIdx.x*blockDim.x+threadIdx.x] = q_val; + if (threadIdx.x==0) + { + scales[blockIdx.x] = scale; + zeropts[blockIdx.x]= zeropt; + } + +} + +__global__ void decompress(int8_t *q_data, float *scales, float *zeropts, float *out){ + int bid = blockIdx.x; + int tid = threadIdx.x; + extern __shared__ float scratchpad[]; + __shared__ float min; + __shared__ float max; + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage1; + + int8_t q_val = q_data[blockIdx.x*blockDim.x+threadIdx.x]; + + out[blockIdx.x*blockDim.x+threadIdx.x] = (q_val - 
zeropts[bid])*scales[bid]; +} + +__global__ void p_ints(){ + printf("codebook entries used: %d\n", g_ints); +} + +unsigned char* SZ_device_compress(float *data, size_t num_elements, int blocksize, size_t *outsize){ + float *scales, *zeropts; + int8_t *q_out; + unsigned char *cmpbytes; + int num_blocks = num_elements/blocksize; + + cudaMalloc(&scales, sizeof(float)*num_blocks); + cudaMalloc(&zeropts,sizeof(float)*num_blocks); + cudaMalloc(&q_out, num_elements); + + using namespace nvcomp; + + cudaStream_t stream; + cudaStreamCreate(&stream); + + const int chunk_size = 1 << 16; + nvcompType_t data_type = NVCOMP_TYPE_CHAR; + + + + compress<<>>(data, scales, zeropts, q_out); + cudaDeviceSynchronize(); + + LZ4Manager nvcomp_manager{chunk_size, data_type, stream}; + CompressionConfig comp_config = nvcomp_manager.configure_compression(num_elements); + + uint8_t* comp_buffer; + cudaMalloc(&comp_buffer, comp_config.max_compressed_buffer_size); + + nvcomp_manager.compress((const uint8_t *)q_out, comp_buffer, comp_config); + + size_t c_size = nvcomp_manager.get_compressed_output_size(comp_buffer); + cudaFree(q_out); + + *outsize = sizeof(float)*(num_blocks+num_blocks)+c_size; + cudaMalloc(&cmpbytes, *outsize); + + cudaMemcpy(cmpbytes, (unsigned char *)scales, sizeof(float)*num_blocks, cudaMemcpyDeviceToDevice); + cudaMemcpy(cmpbytes+sizeof(float)*num_blocks, (unsigned char *)zeropts, sizeof(float)*num_blocks, cudaMemcpyDeviceToDevice); + cudaMemcpy(cmpbytes+sizeof(float)*num_blocks+sizeof(float)*num_blocks, comp_buffer, c_size, cudaMemcpyDeviceToDevice); + + float h_firstscale; + cudaMemcpy(&h_firstscale, cmpbytes, sizeof(float), cudaMemcpyDeviceToHost); + cudaFree(scales); + cudaFree(zeropts); + cudaFree(comp_buffer); + return cmpbytes; +} + +float* SZ_device_decompress(unsigned char *cmpbytes, size_t num_elements, int blocksize, size_t *cmpsize){ + float *scales, *zeropts; + uint8_t *q_cmp; + int8_t *q_vals; + float *out; + int num_blocks = num_elements/blocksize; + size_t c_size = *cmpsize-(2*sizeof(float)*num_blocks); + + float first_val, *d_first; + + cudaMalloc(&d_first, sizeof(float)); + cudaMemcpy((unsigned char *)&first_val, cmpbytes, sizeof(float), cudaMemcpyDeviceToHost); + + + + cudaMalloc((void **)&scales, sizeof(float)*num_blocks); + cudaMalloc((void **)&zeropts,sizeof(float)*num_blocks); + cudaMalloc((void **)&q_cmp, c_size); + cudaMemcpy((unsigned char *)scales, cmpbytes, sizeof(float)*num_blocks, cudaMemcpyDeviceToDevice); + + cudaMemcpy((unsigned char *)zeropts, cmpbytes+sizeof(float)*num_blocks, sizeof(float)*num_blocks, cudaMemcpyDeviceToDevice); + + cudaMemcpy(q_cmp, cmpbytes+sizeof(float)*num_blocks+sizeof(float)*num_blocks, c_size, cudaMemcpyDeviceToDevice); + cudaStream_t stream; + cudaStreamCreate(&stream); + + const int chunk_size = 1 << 16; + + + nvcompType_t data_type = NVCOMP_TYPE_CHAR; + + auto decomp_manager = nvcomp::create_manager(q_cmp, stream); + + nvcomp::DecompressionConfig decomp_config = decomp_manager->configure_decompression((uint8_t *)q_cmp); + cudaMalloc(&q_vals, num_elements); + + decomp_manager->decompress((uint8_t*)q_vals, (uint8_t*)q_cmp, decomp_config); + cudaFree(q_cmp); + + cudaMalloc(&out, sizeof(float)*num_elements); + + decompress<<>>(q_vals, scales, zeropts, out); + cudaDeviceSynchronize(); + + cudaFree(scales); + cudaFree(zeropts); + cudaFree(q_vals); + + return out; +} + +int main(int argc, char** argv){ + char oriFilePath[640], outputFilePath[645]; + float* data; + size_t nbEle; + if(argc < 3) + { + printf("Usage: testfloat_compress_fastmode2 
[srcFilePath] [block size] [err bound] [--cuda]\n"); + printf("Example: testfloat_compress_fastmode2 testfloat_8_8_128.dat 64 1E-3 --cuda\n"); + exit(0); + } + + sprintf(oriFilePath, "%s", argv[1]); + int blockSize = atoi(argv[2]); + float errBound = atof(argv[3]); + nbEle = atoi(argv[4]); + + data = (float*)malloc(sizeof(float)*nbEle); + sprintf(outputFilePath, "%s.sznew", oriFilePath); + + FILE *in_file; + in_file = fopen(oriFilePath, "rb"); + + fread(data, sizeof(float), nbEle, in_file); + fclose(in_file); + + float max = data[0]; + float min = data[0]; + for(int i=0;i=max){ + max = data[i]; + } + if(data[i]<=min){ + min = data[i]; + } + } + errBound = errBound*(max-min); + + // Move to device + float *d_data; + unsigned char *cmpbytes; + size_t outsize; + cudaMalloc(&d_data, sizeof(float)*nbEle); + cudaMemcpy(d_data, data, sizeof(float)*nbEle, cudaMemcpyHostToDevice); + //SZ_device_compress(d_data, nbEle, errBound, blockSize, cmpbytes, &outsize); + + cudaFree(d_data); + +} diff --git a/qtensor/compression/newsz/newsz.h b/qtensor/compression/newsz/newsz.h new file mode 100644 index 00000000..c537b889 --- /dev/null +++ b/qtensor/compression/newsz/newsz.h @@ -0,0 +1,3 @@ + +unsigned char* SZ_device_compress(float *data, size_t num_elements, int blocksize, size_t *outsize); +float* SZ_device_decompress(unsigned char *cmpbytes, size_t num_elements, int blocksize, size_t *cmpsize); diff --git a/qtensor/compression/newsz/newsz_wrapper.cu b/qtensor/compression/newsz/newsz_wrapper.cu new file mode 100644 index 00000000..d067560d --- /dev/null +++ b/qtensor/compression/newsz/newsz_wrapper.cu @@ -0,0 +1,21 @@ +#include "newsz.h" +#include + +extern "C"{ + + unsigned char* newSZ_device_compress(float *oriData, size_t *outSize, size_t nbEle, int blockSize){ + //unsigned char* cmpbytes; + return SZ_device_compress(oriData, nbEle, blockSize, outSize); + //printf("in wrap cmpbytes: %p\n", cmpbytes); + //return cmpbytes; + } + + float* newSZ_device_decompress(size_t nbEle, unsigned char* cmpBytes, int blocksize, size_t cmpsize){ + size_t *cmpsize_ptr; + *cmpsize_ptr = cmpsize; + + float *res = SZ_device_decompress(cmpBytes, nbEle, blocksize, cmpsize_ptr); + return res; + } + +} diff --git a/qtensor/compression/newsz/newsz_wrapper.py b/qtensor/compression/newsz/newsz_wrapper.py new file mode 100644 index 00000000..af5e3741 --- /dev/null +++ b/qtensor/compression/newsz/newsz_wrapper.py @@ -0,0 +1,159 @@ +import numpy as np +import ctypes +from ctypes import * +import random +from qtensor.tools.lazy_import import cupy as cp +import time +import torch + +from pathlib import Path +LIB_PATH = str(Path(__file__).parent/'libnewsz_wrapper.so') +#LIB_PATH = './libnewsz_wrapper.so' +NVCOMP_PATH=r'/home/mkshah5/nvcomp/build/lib/libnvcomp.so' + +# unsigned char* newSZ_device_compress(float *oriData, size_t *outSize, size_t nbEle, int blockSize) +def get_device_compress(): + dll_base = ctypes.CDLL(NVCOMP_PATH,mode=ctypes.RTLD_GLOBAL) + dll = ctypes.CDLL(LIB_PATH, mode=ctypes.RTLD_GLOBAL) + func = dll.newSZ_device_compress + func.argtypes = [POINTER(c_float), POINTER(c_size_t), c_size_t, c_int] + func.restype = POINTER(c_ubyte) + return func + +# float* newSZ_device_decompress(size_t nbEle, unsigned char* cmpBytes, int blocksize, size_t cmpsize) +def get_device_decompress(): + + dll_base = ctypes.CDLL(NVCOMP_PATH,mode=ctypes.RTLD_GLOBAL) + dll = ctypes.CDLL(LIB_PATH, mode=ctypes.RTLD_GLOBAL) + func = dll.newSZ_device_decompress + func.argtypes = [c_size_t, POINTER(c_ubyte), c_int, c_size_t] + func.restype = 
POINTER(c_float) + return func + + +def newsz_device_compress(oriData, nbEle, blockSize,threshold): + __cuszx_device_compress = get_device_compress() + ori_nbEle = nbEle + variable = ctypes.c_size_t(0) + outSize = ctypes.pointer(variable) + + oriData = oriData.flatten() + ori_real = oriData.real + ori_imag = oriData.imag + oriData = cp.concatenate((ori_real, ori_imag)) + sample = oriData[::2] + d = cp.amax(oriData) - cp.amin(oriData) + d = d.get() + if d.dtype == np.complex64: + d = d.real + threshold = threshold*(d) + truth_values = abs(oriData)<=threshold + oriData[truth_values] = 0.0 + nbEle = oriData.shape[0] + + + oriData_p = ctypes.cast(oriData.data.ptr, ctypes.POINTER(c_float)) + # newSZ_device_compress(float *oriData, size_t *outSize, size_t nbEle, int blockSize) + o_bytes = __cuszx_device_compress(oriData_p, outSize, np.ulonglong(nbEle), np.int32(blockSize)) + #print("testing") + #print(o_bytes.value) + return (o_bytes,outSize.contents.value, blockSize), outSize + + +def newsz_device_decompress(nbEle, cmpBytes, owner, dtype): + __cuszx_device_decompress=get_device_decompress() + (cmpBytes, cmpsize, blockSize) = cmpBytes + + nbEle_p = ctypes.c_size_t(nbEle) + # float* newSZ_device_decompress(size_t nbEle, unsigned char* cmpBytes, int blocksize, size_t cmpsize) + newData = __cuszx_device_decompress(nbEle_p, cmpBytes, np.int32(blockSize), ctypes.c_size_t(cmpsize)) + # decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, num_elements_eff) + # -- Workaround to convert GPU pointer to int + p_decompressed_ptr = ctypes.addressof(newData) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + decompressed_int = p_decompressed_int.contents + # -- + pointer_for_free = decompressed_int.value + # self.decompressed_own.append(decompressed_int.value) + mem = cp.cuda.UnownedMemory(decompressed_int.value, nbEle*4, owner, device_id=0) + mem_ptr = cp.cuda.memory.MemoryPointer(mem, 0) + #print("mem ptr") + #print(mem_ptr) + arr = cp.ndarray(shape=(nbEle,), dtype=np.float32, memptr=mem_ptr) + # res = cp.zeros((nbEle,)) + # ## need to convert newData to cupy + # cp.place(res,bitmap,arr) + + c_res = cp.zeros(int(nbEle/2), np.complex64) + c_res.real = arr[0:int(nbEle/2)] + c_res.imag = arr[int(nbEle/2):] + return (c_res, pointer_for_free) + +### Example of device compress/decompress wrapper usage +class Comp(): + def __init__(self): + self.name = "dummy" + +def free_compressed(ptr): + p_ptr = ctypes.addressof(ptr) + p_int = ctypes.cast(p_ptr, ctypes.POINTER(ctypes.c_uint64)) + decomp_int = p_int.contents + #cp.cuda.runtime.free(decomp_int.value) + + +if __name__ == "__main__": + + DATA_SIZE = int(1024) + MAX_D = 10.0 + MIN_D = -10.0 + RANGE = MAX_D - MIN_D + r2r_threshold = 0.002 + r2r_error = 0.0001 + + in_vector = np.fromfile("all_sample.bin", dtype=np.complex64) + #print(np.max(in_vector)) + DATA_SIZE = len(in_vector) + #range_vr = np.max(in_vector)-np.min(in_vector) + #r2r_threshold = r2r_threshold*range_vr + #r2r_error = r2r_error*range_vr + #in_vector = np.zeros((DATA_SIZE,)) + #for i in range(0,int(DATA_SIZE/4)): + # in_vector[i] = 0.0 + #for i in range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): + # in_vector[i] = 5.0 + #for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): + # in_vector[i] = random.uniform(MIN_D, MAX_D) + #for i in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): + # in_vector[i] = -7.0 + #for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): + # 
in_vector[i] = 0.001 + + print(DATA_SIZE) + #in_vector = in_vector.astype('float32') + in_vector_gpu = cp.asarray(in_vector) + + # variable = ctypes.c_size_t(0) + # outSize = ctypes.pointer(variable) + #print(in_vector[0:16]) + for i in range(2): + s_time = time.time() + #o_bytes, outSize = cusz_device_compress(in_vector_gpu, r2r_error, DATA_SIZE, 256, r2r_threshold) + + o_bytes, outSize = newsz_device_compress(in_vector_gpu, DATA_SIZE, 256,r2r_threshold) + print("Time python: "+str(time.time()-s_time)) + print(outSize[0]) + print("Compress Success...starting decompress ") + comp = Comp() + + s_time = time.time() + #(d_bytes,ptr )= cusz_device_decompress(DATA_SIZE*2, o_bytes, comp, in_vector_gpu.dtype) + + (d_bytes, ptr) = newsz_device_decompress(DATA_SIZE*2, o_bytes, comp, in_vector_gpu.dtype) + free_compressed(o_bytes[0]) + cp.cuda.runtime.free(ptr) + print("Time python: "+str(time.time()-s_time)) + #for i in d_bytes: + # print(i) + print("Decompress Success") From fac16fd76536417948603d767f51d9f914e680cf Mon Sep 17 00:00:00 2001 From: Milan Kartik Shah Date: Mon, 24 Jul 2023 12:32:57 -0400 Subject: [PATCH 100/126] Updated lib paths in newsz_wrapper.py --- qtensor/compression/newsz/newsz_wrapper.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/qtensor/compression/newsz/newsz_wrapper.py b/qtensor/compression/newsz/newsz_wrapper.py index af5e3741..d40304fb 100644 --- a/qtensor/compression/newsz/newsz_wrapper.py +++ b/qtensor/compression/newsz/newsz_wrapper.py @@ -8,8 +8,10 @@ from pathlib import Path LIB_PATH = str(Path(__file__).parent/'libnewsz_wrapper.so') + +NVCOMP_PATH = str(Path(__file__).parent/'libnvcomp.so') +#NVCOMP_PATH= './libnvcomp.so' #LIB_PATH = './libnewsz_wrapper.so' -NVCOMP_PATH=r'/home/mkshah5/nvcomp/build/lib/libnvcomp.so' # unsigned char* newSZ_device_compress(float *oriData, size_t *outSize, size_t nbEle, int blockSize) def get_device_compress(): @@ -100,7 +102,7 @@ def free_compressed(ptr): p_ptr = ctypes.addressof(ptr) p_int = ctypes.cast(p_ptr, ctypes.POINTER(ctypes.c_uint64)) decomp_int = p_int.contents - #cp.cuda.runtime.free(decomp_int.value) + cp.cuda.runtime.free(decomp_int.value) if __name__ == "__main__": @@ -137,7 +139,7 @@ def free_compressed(ptr): # variable = ctypes.c_size_t(0) # outSize = ctypes.pointer(variable) #print(in_vector[0:16]) - for i in range(2): + for i in range(200): s_time = time.time() #o_bytes, outSize = cusz_device_compress(in_vector_gpu, r2r_error, DATA_SIZE, 256, r2r_threshold) From 1e728c5b3a354eaa2ed44b591439b7ba44aa2f27 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Fri, 4 Aug 2023 19:09:18 +0000 Subject: [PATCH 101/126] add WriteToDiskCompressor --- bench/qc_simulation/src/simulators/qtensor.py | 4 ++ qtensor/compression/Compressor.py | 38 +++++++++++++++++++ qtensor/compression/compressed_contraction.py | 4 +- qtensor/contraction_backends/compression.py | 5 ++- 4 files changed, 47 insertions(+), 4 deletions(-) diff --git a/bench/qc_simulation/src/simulators/qtensor.py b/bench/qc_simulation/src/simulators/qtensor.py index d80f68ad..e206feb6 100644 --- a/bench/qc_simulation/src/simulators/qtensor.py +++ b/bench/qc_simulation/src/simulators/qtensor.py @@ -176,6 +176,7 @@ def simulate(in_file, out_file, import time from qtensor.contraction_algos import bucket_elimination from qtensor.compression.Compressor import CUSZCompressor, CUSZXCompressor, TorchCompressor, NEWSZCompressor + from qtensor.compression.Compressor import WriteToDiskCompressor import cupy cupy.cuda.profiler.start() prep_data = 
read_preps(in_file) @@ -199,6 +200,9 @@ def simulate(in_file, out_file, print(f"{r2r_error=} {r2r_threshold=}") compressor = NEWSZCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) compressor = qtensor.compression.ProfileCompressor(compressor) + elif compress == 'disk': + compressor = WriteToDiskCompressor(f'/grand/QTensor/compression/data/tensors_compressed_M{M}/') + compressor = qtensor.compression.ProfileCompressor(compressor) else: raise ValueError(f"Unknown compression algorithm: {compress}") backend = qtensor.contraction_backends.CompressionBackend(backend, compressor, M) diff --git a/qtensor/compression/Compressor.py b/qtensor/compression/Compressor.py index 71302263..4e537489 100644 --- a/qtensor/compression/Compressor.py +++ b/qtensor/compression/Compressor.py @@ -564,3 +564,41 @@ def cuszx_decompress(self, isCuPy, cmp_bytes, cmpsize, num_elements, owner, dtyp # oriData, absErrBound, nbEle, blockSize,threshold # decompressed_data = quant_device_decompress(num_elements, cmp_bytes, owner,dtype) return decompressed_data + +class WriteToDiskCompressor(Compressor): + def __init__(self, path): + from pathlib import Path + Path(path).mkdir(exist_ok=True, parents=True) + self.path = path + + def _gen_random_filename(self, info): + dtype, shape, isCupy = info + k = np.random.randint(0, 100000000) + s = hex(k)[2:] + return self.path + f'/qtensor_data_{s}_{str(dtype)}.bin' + + def compress(self, data): + import cupy + if isinstance(data, cupy.ndarray): + isCupy=False + else: + isCupy=True + fname = self._gen_random_filename((data.dtype, data.shape, isCupy)) + data.tofile(fname) + return (fname, data.dtype, data.shape, isCupy) + + def compress_size(self, ptr): + return 0.1 + + def decompress(self, obj): + import cupy + fname, dtype, shape, isCupy = obj + if isCupy: + return cupy.fromfile(fname).view(dtype).reshape(shape) + else: + return np.fromfile(fname).view(dtype).reshape(shape) + + def free_compressed(self, ptr): + pass + def free_decompressed(self): + pass \ No newline at end of file diff --git a/qtensor/compression/compressed_contraction.py b/qtensor/compression/compressed_contraction.py index 041eaf27..81deb7cd 100644 --- a/qtensor/compression/compressed_contraction.py +++ b/qtensor/compression/compressed_contraction.py @@ -40,8 +40,8 @@ def contract_two_tensors(A, B, T_out, einsum=np.einsum): result_ints = [relabel_dict_int[int(i)] for i in result_indices] else: result_ints = list(map(int, result_indices)) - print(A.data.shape) - print(B.data.shape) + #print(A.data.shape) + #print(B.data.shape) out = einsum(A.data, A_ints, B.data, B_ints, result_ints) if len(result_ints)>0: # This copying is reqiured because cupy doesn't support `out` argument. 
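The WriteToDiskCompressor added in this patch spills tensors to disk instead of compressing them in GPU memory, and CompressionBackend only relies on the handle it gets back from compress() being accepted by decompress()/free_compressed() later. As a rough, self-contained illustration of that contract (not the code from this patch: the class name, temporary-file handling, and demo values below are hypothetical, and it uses plain NumPy on the host rather than CuPy), a minimal round-trip sketch might look like this:

import os
import tempfile
import numpy as np

class RoundTripDiskCompressor:
    def compress(self, data):
        # "Compression" here is just spilling the array to a file and
        # returning an opaque handle with enough metadata to restore it.
        fd, fname = tempfile.mkstemp(suffix=".bin")
        os.close(fd)
        data.tofile(fname)
        return (fname, data.dtype, data.shape)

    def compress_size(self, handle):
        # Report the on-disk footprint of the stored chunk.
        fname, _, _ = handle
        return os.path.getsize(fname)

    def decompress(self, handle):
        # Restore the array exactly as it was written.
        fname, dtype, shape = handle
        return np.fromfile(fname, dtype=dtype).reshape(shape)

    def free_compressed(self, handle):
        os.remove(handle[0])

    def free_decompressed(self):
        pass  # nothing is cached on the host side in this sketch

if __name__ == "__main__":
    comp = RoundTripDiskCompressor()
    original = (np.random.rand(4, 8) + 1j * np.random.rand(4, 8)).astype(np.complex64)
    handle = comp.compress(original)
    restored = comp.decompress(handle)
    assert np.array_equal(original, restored)
    print("round trip ok,", comp.compress_size(handle), "bytes on disk")
    comp.free_compressed(handle)

The same handle-in/handle-out shape is what the GPU compressors above (CUSZX, torch quantization, newsz) expose, which is why the backend can swap them via the --compress flag without other changes.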
diff --git a/qtensor/contraction_backends/compression.py b/qtensor/contraction_backends/compression.py index 1cb548ef..994c1699 100644 --- a/qtensor/contraction_backends/compression.py +++ b/qtensor/contraction_backends/compression.py @@ -68,8 +68,9 @@ def process_bucket(self, bucket, no_sum=False): for t in [accum, t]: if isinstance(t, CompressedTensor): for c in t.data: - cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = c - del cmp_bytes + if len(c)==6: + cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = c + del cmp_bytes # import ctypes # p_decompressed_ptr = ctypes.addressof(cmp_bytes[0]) # # cast to int64 pointer From cdd9c352f685e5fabc0b5a66d32e734ca1fcd372 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Fri, 11 Aug 2023 03:54:05 +0000 Subject: [PATCH 102/126] Minor cleanup in Compressor.py --- qtensor/compression/Compressor.py | 118 ++++++++++++------------------ 1 file changed, 48 insertions(+), 70 deletions(-) diff --git a/qtensor/compression/Compressor.py b/qtensor/compression/Compressor.py index 4e537489..02a5b4da 100644 --- a/qtensor/compression/Compressor.py +++ b/qtensor/compression/Compressor.py @@ -15,6 +15,7 @@ sys.path.append(str(Path(__file__).parent/'newsz')) sys.path.append('./newsz') + import torch try: from cuszx_wrapper import cuszx_host_compress, cuszx_host_decompress, cuszx_device_compress, cuszx_device_decompress @@ -29,6 +30,22 @@ CUSZX_BLOCKSIZE = 256 +# -- helper functions + +def _get_data_info(data): + import cupy + if isinstance(data, cupy.ndarray): + isCuPy = True + else: + isCuPy = False + num_elements = data.size + # Adapt numele depending on itemsize + itemsize = data.dtype.itemsize + num_elements_eff = int(num_elements*itemsize/4) + return isCuPy, num_elements_eff + +# -- Compressor classes + class Compressor(): def compress(self, data): raise NotImplementedError @@ -143,16 +160,7 @@ def free_compressed(self, ptr): cupy.cuda.runtime.free(decompressed_int.value) def compress(self, data): - import cupy - if isinstance(data, cupy.ndarray): - isCuPy = True - else: - isCuPy = False - num_elements = data.size - # Adapt numele depending on itemsize - itemsize = data.dtype.itemsize - num_elements_eff = int(num_elements*itemsize/4) - + isCupy, num_elements_eff = _get_data_info(data) dtype = data.dtype cmp_bytes, outSize_ptr = self.cuszx_compress(isCuPy, data, num_elements_eff, self.r2r_error, self.r2r_threshold) return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr) @@ -247,16 +255,7 @@ def free_compressed(self, ptr): cupy.cuda.runtime.free(decompressed_int.value) def compress(self, data): - import cupy - if isinstance(data, cupy.ndarray): - isCuPy = True - else: - isCuPy = False - num_elements = data.size - # Adapt numele depending on itemsize - itemsize = data.dtype.itemsize - num_elements_eff = int(num_elements*itemsize/4) - + isCuPy, num_elements_eff = _get_data_info(data) dtype = data.dtype cmp_bytes, outSize_ptr = self.cuszx_compress(isCuPy, data, num_elements_eff, self.r2r_error, self.r2r_threshold) # return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr) @@ -304,9 +303,9 @@ def cuszx_compress(self, isCuPy, data, num_elements, r2r_error, r2r_threshold): if not isCuPy: cmp_bytes, outSize_ptr = cuszx_host_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) else: - #cmp_bytes, outSize_ptr = cuszp_device_compress(data, r2r_error, num_elements, r2r_threshold) - cmp_bytes, outSize_ptr = newsz_device_compress(data,num_elements, CUSZX_BLOCKSIZE,r2r_threshold) - # cmp_bytes, outSize_ptr = 
quant_device_compress(data, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) + print('Before compress') + cmp_bytes, outSize_ptr = newsz_device_compress(data, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) + print('After compress') del data torch.cuda.empty_cache() return cmp_bytes, outSize_ptr @@ -364,16 +363,7 @@ def free_compressed(self, ptr): cupy.cuda.runtime.free(decompressed_int.value) def compress(self, data): - import cupy - if isinstance(data, cupy.ndarray): - isCuPy = True - else: - isCuPy = False - num_elements = data.size - # Adapt numele depending on itemsize - itemsize = data.dtype.itemsize - num_elements_eff = int(num_elements*itemsize/4) - + isCuPy, num_elements_eff = _get_data_info(data) dtype = data.dtype cmp_bytes, outSize_ptr = self.cuszx_compress(isCuPy, data, num_elements_eff, self.r2r_error, self.r2r_threshold) # return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr) @@ -406,17 +396,19 @@ def decompress(self, obj): # arr = cupy.ndarray(shape, dtype=dtype, memptr=mem_ptr) return arr - ### Compression API with cuSZx ### - # Parameters: - # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array - # - data = Numpy or Cupy ndarray, assumed to be 1-D, np.float32 type - # - num_elements = Number of floating point elements in data - # - r2r_error = relative-to-value-range error bound for lossy compression - # - r2r_threshold = relative-to-value-range threshold to floor values to zero - # Returns: - # - cmp_bytes = Unsigned char pointer to compressed bytes - # - outSize_ptr = Pointer to size_t representing length in bytes of cmp_bytes def cuszx_compress(self, isCuPy, data, num_elements, r2r_error, r2r_threshold): + """ + ## Compression API with cuSZx ### + Parameters: + - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array + - data = Numpy or Cupy ndarray, assumed to be 1-D, np.float32 type + - num_elements = Number of floating point elements in data + - r2r_error = relative-to-value-range error bound for lossy compression + - r2r_threshold = relative-to-value-range threshold to floor values to zero + Returns: + - cmp_bytes = Unsigned char pointer to compressed bytes + - outSize_ptr = Pointer to size_t representing length in bytes of cmp_bytes + """ if not isCuPy: cmp_bytes, outSize_ptr = cuszx_host_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) @@ -428,17 +420,19 @@ def cuszx_compress(self, isCuPy, data, num_elements, r2r_error, r2r_threshold): torch.cuda.empty_cache() return cmp_bytes, outSize_ptr - ### Decompression API with cuSZx ### - # Parameters: - # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array - # - cmp_bytes = Unsigned char pointer to compressed bytes - # - num_elements = Number of floating point elements in original data - # Returns: - # - decompressed_data = Float32 pointer to decompressed data - # - # Notes: Use ctypes to cast decompressed data to Numpy or CuPy type def cuszx_decompress(self, isCuPy, cmp_bytes, cmpsize, num_elements, owner, dtype): + """ + ## Decompression API with cuSZx ### + Parameters: + - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array + - cmp_bytes = Unsigned char pointer to compressed bytes + - num_elements = Number of floating point elements in original data + Returns: + - decompressed_data = Float32 pointer to decompressed data + + Notes: Use ctypes to cast decompressed data to Numpy or CuPy type + """ if not isCuPy: decompressed_data = cuszx_host_decompress(num_elements, cmp_bytes) else: @@ -458,11 +452,6 @@ 
def free_decompressed(self): import cupy print("Cleanup", len(self.decompressed_own)) for x in self.decompressed_own: - #print(x) - #if x == None: - # continue - #else: - #print("CUDA Free", x) cupy.cuda.runtime.free(x) # del x # cupy.get_default_memory_pool().free_all_blocks() @@ -481,20 +470,10 @@ def free_compressed(self, ptr): cupy.cuda.runtime.free(decompressed_int.value) def compress(self, data): - import cupy - if isinstance(data, cupy.ndarray): - isCuPy = True - else: - isCuPy = False - num_elements = data.size - # Adapt numele depending on itemsize - itemsize = data.dtype.itemsize - num_elements_eff = int(num_elements*itemsize/4) + isCuPy, num_elements_eff = _get_data_info(data) dtype = data.dtype cmp_bytes, outSize_ptr = self.cuszx_compress(isCuPy, data, num_elements_eff, self.r2r_error, self.r2r_threshold) - # return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr) - return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr.contents.value) def compress_size(self, ptr): @@ -538,7 +517,6 @@ def cuszx_compress(self, isCuPy, data, num_elements, r2r_error, r2r_threshold): if not isCuPy: cmp_bytes, outSize_ptr = cuszx_host_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) else: - #cmp_bytes, outSize_ptr = cuszp_device_compress(data, r2r_error, num_elements, r2r_threshold) cmp_bytes, outSize_ptr = cusz_device_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE,r2r_threshold) # cmp_bytes, outSize_ptr = quant_device_compress(data, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) del data @@ -601,4 +579,4 @@ def decompress(self, obj): def free_compressed(self, ptr): pass def free_decompressed(self): - pass \ No newline at end of file + pass From ac03eb505487e85865aa893a2c6d3084c7533bf8 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Fri, 11 Aug 2023 18:08:11 -0400 Subject: [PATCH 103/126] Change pynauty to pynauty-nice --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index be3da9b8..641d2156 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ ,'click' ,'qtensor-qtree' ,'lazy-import' - ,'pynauty-nice' + ,'pynauty' ,'sarge' ,'cartesian-explorer' From 957ddb540536cf6c507aec8ac5a6007a4983613c Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Fri, 10 Nov 2023 04:33:50 +0000 Subject: [PATCH 104/126] add energy simulation to compression bench --- bench/qc_simulation/main.py | 8 +- .../src/simulators/qtensor_energy.py | 277 ++++++++++++++++++ requirements.txt | 2 - test-requirements.txt | 5 + 4 files changed, 288 insertions(+), 4 deletions(-) create mode 100644 bench/qc_simulation/src/simulators/qtensor_energy.py delete mode 100644 requirements.txt create mode 100644 test-requirements.txt diff --git a/bench/qc_simulation/main.py b/bench/qc_simulation/main.py index 141508dc..7203b4fe 100755 --- a/bench/qc_simulation/main.py +++ b/bench/qc_simulation/main.py @@ -116,11 +116,14 @@ def update_index(index_file, **kwargs): from src.simulators.qtensor import preprocess as qtensor_preprocess from src.simulators.qtensor import estimate as qtensor_estimate from src.simulators.qtensor import simulate as qtensor_simulate +from src.simulators.qtensor_energy import simulate as qtensor_simulate_energy +from src.simulators.qtensor_energy import preprocess as qtensor_preprocess_energy from src.circuit_gen.qaoa import generate_maxcut # -- Main sim_preprocessors = { - 'qtensor': qtensor_preprocess + 'qtensor': qtensor_preprocess, + 'qtensor_energy': qtensor_preprocess_energy } sim_estimators = { @@ -128,7 
+131,8 @@ def update_index(index_file, **kwargs): } sim_simulators = { - 'qtensor': qtensor_simulate + 'qtensor': qtensor_simulate, + 'qtensor_energy': qtensor_simulate_energy } circ_generators = { diff --git a/bench/qc_simulation/src/simulators/qtensor_energy.py b/bench/qc_simulation/src/simulators/qtensor_energy.py new file mode 100644 index 00000000..d9689064 --- /dev/null +++ b/bench/qc_simulation/src/simulators/qtensor_energy.py @@ -0,0 +1,277 @@ +import qtensor +import qtree +import networkx as nx +import numpy as np + +# -- QAOA generic parser + +def parse_qaoa_composer(data): + import json + data = json.loads(data) + terms = data["terms"] + gamma = np.array(data["gamma"])/np.pi/2 + beta = np.array(data["beta"])/np.pi + N = len(set(sum([t[1] for t in terms], []))) + G = nx.Graph() + for factor, term in terms: + G.add_edge(*term) + composer = qtensor.DefaultQAOAComposer(G, gamma=gamma, beta=beta) + return composer +# -- + +def read_circ(circ_f, type=None): + + if type is None: + type = circ_f.path.name.split(".")[-1] + + print("Reading circuit of type", type) + if type == "jsonterms": + b = circ_f.f.read() + return parse_qaoa_composer(b) + + elif type == "qasm": + raise Exception("only jsonterms is supported for energy calculations") + +def read_preps(prep_f): + import pickle + return pickle.load(prep_f.f) + +def write_preps(peo, prep_f): + import pickle + pickle.dump(peo, open(prep_f, 'wb')) + +def write_json(data, out_file): + import json + with open(out_file, 'w') as f: + json.dump(data, f) + # This newline plays nice when cat-ing multiple files + f.write('\n') + +def preprocess_circ(circ, S, O, M, after_slice): + tn = qtensor.optimisation.QtreeTensorNet.from_qtree_gates(circ) + opt = qtensor.toolbox.get_ordering_algo(O) + if S: + # ignore argument type mismatch for pyright -- opt can be `Optimizer` + # pyright: reportGeneralTypeIssues=false + opt = qtensor.optimisation.TreeTrimSplitter( + tw_bias=0, max_tw=M, base_ordering=opt, + peo_after_slice_strategy=after_slice + ) + + peo, par_vars, _ = opt.optimize(tn) + # --dbg + graph = tn.get_line_graph() + ignore_vars = tn.bra_vars + tn.ket_vars + for pv in par_vars: + graph.remove_node(int(pv)) + components = list(nx.connected_components(graph)) + print(f"Sliced graph # nodes: {graph.number_of_nodes()} and #components: {len(components)} with sizes {[len(c) for c in components]}") + print(f"peo size without par_vars and ignore_vars: {len(peo) - len(par_vars) - len(ignore_vars)}") + + print() + # -- + else: + peo, _ = opt.optimize(tn) + par_vars = [] + #print("W", opt.treewidth) + return (peo, par_vars, tn), opt.treewidth + +def preprocess(in_file, out_file, O='greedy', S=None, M=30, after_slice='run-again'): + """ + Arguments: + in_file: input file + out_file: output file + O: ordering algorithm + S: slicing algorithm + M: Memory limit for slicing + """ + import copy + composer = read_circ(in_file) + G = composer.graph + prep_data = [] + for edge in G.edges: + c_copy = copy.deepcopy(composer) + c_copy.energy_expectation_lightcone(edge) + e_prep, treewidth = preprocess_circ(c_copy.circuit, S, O, M, after_slice) + if treewidth>25: + prep_data.append(e_prep) + write_preps(prep_data, out_file) + print(f"Wrote {len(prep_data)} preparations of lightcones") + return prep_data + +def estimate(in_file, out_file, C=100, M=30, F=1e12, T=1e9, **kwargs): + """ + Arguments: + in_file: file with preprocessed data + out_file: file to write the results to + C: Compression ratio + M: Memory limit in log2(b/16) + F: assumed FLOPS + T: Throughput of 
compression + """ + from qtensor.compression.cost_estimation import compressed_contraction_cost, Cost + from dataclasses import asdict + import json + prep_data = read_preps(in_file) + peo, par_vars, tn = prep_data + + tn.slice({i: slice(0, 1) for i in par_vars}) + peo = peo[:len(peo) - len(par_vars)] + costs: list[Cost] = compressed_contraction_cost(tn, peo, mem_limit=M, compression_ratio=C) + totals: Cost = sum(costs[1:], costs[0]) + time = totals.time(F, T, T, M) + C = asdict(totals) + C['time'] = time*2**len(par_vars) + C['slices'] = 2**len(par_vars) + print("C", C) + out_file += ".json" + write_json(C, out_file) + return out_file + +def simulate(in_file, out_file, + backend='einsum', + compress=None, + M=29, + r2r_error=1e-3, r2r_threshold=1e-3, + **kwargs): + import cupy + prep_data = read_preps(in_file) + cupy.cuda.profiler.start() + + C = dict( + time=0, + elapsed=0, + memory=0, + memory_history=[], + nvmemory=0, + result = dict(Re=0, Im=0), + compression=dict(compress=[], decompress=[]) + ) + + for prep_lightcone in prep_data[:5]: + print(prep_lightcone) + r = simulate_preps_lightcone(prep_lightcone, backend, compress, M, + r2r_error, + r2r_threshold,**kwargs) + C['time'] += r['time'] + C['elapsed'] += r['elapsed'] + C['memory'] = max(C['memory'], r['memory']) + C['nvmemory'] = max(C['nvmemory'], r['nvmemory']) + C['memory_history'] += r['memory_history'] + C['result']['Re'] += r['result']['Re'] + C['result']['Im'] += r['result']['Im'] + if r.get('compression'): + C['compression']['compress'] += r['compression']['compress'] + C['compression']['decompress'] += r['compression']['decompress'] + + out_file += ".json" + write_json(C, out_file) + return out_file + cupy.cuda.profiler.stop() + +def simulate_preps_lightcone(prep_data, + backend='einsum', + compress=None, + M=29, + r2r_error=1e-3, r2r_threshold=1e-3, + **kwargs): + """ + Args: + in_file: file with preprocessed data + out_file: file to write the results to + backend: backend to use + compress: compression algorithm + M: memory threshold for compression + r2r_error: relative error for compression + r2r_threshold: relative threshold for compression + """ + import time + from qtensor.contraction_algos import bucket_elimination + from qtensor.compression.Compressor import CUSZCompressor, CUSZXCompressor, TorchCompressor, NEWSZCompressor + #from qtensor.compression.Compressor import WriteToDiskCompressor + import cupy + peo, par_vars, tn = prep_data + + backend = qtensor.contraction_backends.get_backend(backend) + if compress is not None: + if compress == 'szx': + print(f"{r2r_error=} {r2r_threshold=}") + compressor = CUSZXCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) + compressor = qtensor.compression.ProfileCompressor(compressor) + elif compress == 'cusz': + print(f"{r2r_error=} {r2r_threshold=}") + compressor = CUSZCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) + compressor = qtensor.compression.ProfileCompressor(compressor) + elif compress == 'torch': + print(f"{r2r_error=} {r2r_threshold=}") + compressor = TorchCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) + compressor = qtensor.compression.ProfileCompressor(compressor) + elif compress == 'newsz': + print(f"{r2r_error=} {r2r_threshold=}") + compressor = NEWSZCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) + compressor = qtensor.compression.ProfileCompressor(compressor) + elif compress == 'disk': + compressor = WriteToDiskCompressor(f'/grand/QTensor/compression/data/tensors_compressed_M{M}/') + compressor = 
qtensor.compression.ProfileCompressor(compressor) + else: + raise ValueError(f"Unknown compression algorithm: {compress}") + backend = qtensor.contraction_backends.CompressionBackend(backend, compressor, M) + from qtensor.contraction_backends.performance_measurement_decorator import MemProfBackend + backend = MemProfBackend(backend) + + relabelid = {} + for tensor in tn.tensors: + for i in tensor.indices: + relabelid[int(i)] = i + + slice_ext = {relabelid[int(i)]: 0 for i in par_vars} + + if len(par_vars) > 0: + print("Parvars", par_vars) + print(f"Detected {len(par_vars)} slice variables") + sim = qtensor.QtreeSimulator(backend=backend) + sim.tn = tn + sim.tn.backend = backend + sim.peo = peo + sim._slice_relabel_buckets(slice_ext) + buckets = sim.tn.buckets + # --dbg + #ignore_vars = sim.tn.bra_vars + sim.tn.ket_vars + #graph = qtree.graph_model.importers.buckets2graph(buckets, ignore_vars) + #graph, label_dict = qtree.graph_model.relabel_graph_nodes( + #graph, dict(zip(graph.nodes, np.array(list(graph.nodes)) - 127*2)) + #) + #import networkx as nx + #components = list(nx.connected_components(graph)) + #print(f"Sliced graph # nodes: {graph.number_of_nodes()} and #components: {len(components)} with sizes {[len(c) for c in components]}") + #print(f"peo size without par_vars and ignore_vars: {len(peo) - len(ignore_vars)}") + # -- + + start = time.time() + for i in range(2**0): + print(f"P {i}", end='', flush=True) + bcopy = [b[:] for b in buckets] + res = bucket_elimination( + bcopy, backend, + n_var_nosum=len(tn.free_vars) + ) + del bcopy + print("Result", res.data.flatten()[0]) + #time.sleep(0.5) + sim_result = backend.get_result_data(res).flatten()[0] + print("Simulation result:", sim_result) + end = time.time() + print("Elapsed", end - start) + C = {'time': 2**len(par_vars)*(end - start)} + C['elapsed'] = (end - start) + C['memory'] = backend.max_mem + C['memory_history'] = backend.mem_history + C['nvmemory'] = backend.nvsmi_max_mem + C['result'] = { + "Re": np.real(sim_result).tolist(), + "Im": np.imag(sim_result).tolist() + } + if compress is not None: + if isinstance(compressor, qtensor.compression.ProfileCompressor): + C['compression'] = compressor.get_profile_data_json() + return C diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 2279da1b..00000000 --- a/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -mongocat -PyInquirer diff --git a/test-requirements.txt b/test-requirements.txt new file mode 100644 index 00000000..720fe010 --- /dev/null +++ b/test-requirements.txt @@ -0,0 +1,5 @@ +mongocat +pytest-xdist +cirq +qiskit +tabulate From 5f5a15c58913db75d9f34a8424b852f68009028e Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Fri, 1 Dec 2023 17:34:14 +0000 Subject: [PATCH 105/126] update in slicing history shape --- qtensor/contraction_backends/common.py | 5 ++++- qtensor/optimisation/Optimizer.py | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/qtensor/contraction_backends/common.py b/qtensor/contraction_backends/common.py index d09441e7..e3c0fbc5 100644 --- a/qtensor/contraction_backends/common.py +++ b/qtensor/contraction_backends/common.py @@ -44,8 +44,11 @@ def slice_numpy_tensor(data:np.ndarray, indices_in, indices_out, slice_dict): i for sl, i in zip(slice_bounds, indices_in) if not isinstance(sl, int) ] indices_sized = [v.copy(size=size) for v, size in zip(indices_sliced, s_data.shape)] + #print("indices_sized", indices_sized) + #print("Slice bounds", slice_bounds) + #print("Slice dict", slice_dict) + #print("data shape, 
sliced data shape", data.shape, s_data.shape) indices_out = [v for v in indices_out if not isinstance(slice_dict.get(v, None), int)] - assert len(indices_sized) == len(s_data.shape) assert len(indices_sliced) == len(s_data.shape) st_data = permute_np_tensor_data(s_data, indices_sliced, indices_out) return st_data, indices_out diff --git a/qtensor/optimisation/Optimizer.py b/qtensor/optimisation/Optimizer.py index 605a8805..dfa38957 100644 --- a/qtensor/optimisation/Optimizer.py +++ b/qtensor/optimisation/Optimizer.py @@ -318,7 +318,7 @@ def _split_graph(self, p_graph, max_tw): peo_ints = self.peo_ints tw = self.treewidth self._slice_hist = [] - self._slice_hist.append([0, tw]) + self._slice_hist.append([0, tw, peo_ints]) log.info('Treewidth: {}', tw) log.info('Target treewidth: {}', max_tw) result = [] @@ -354,7 +354,7 @@ def _split_graph(self, p_graph, max_tw): peo_ints, path = self._update_peo_after_slice(p_graph, result) tw = max(path) - self._slice_hist.append([pv_cnt, tw]) + self._slice_hist.append([pv_cnt, tw, peo_ints]) delta = tw - max_tw return peo_ints, result From 547d05b57fbb938f15e55a6f14c5d8c48b04d86b Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Fri, 1 Dec 2023 14:16:24 -0600 Subject: [PATCH 106/126] add more info for compression profiling --- qtensor/compression/Compressor.py | 3 +++ .../performance_measurement_decorator.py | 23 +++++++++++++++---- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/qtensor/compression/Compressor.py b/qtensor/compression/Compressor.py index 71302263..ced72ad0 100644 --- a/qtensor/compression/Compressor.py +++ b/qtensor/compression/Compressor.py @@ -101,6 +101,9 @@ def get_profile_stats(self): compress_ratios = np.mean([x.size_in/x.size_out for x in compress]) compress_size = sum([x.size_out for x in compress]) return compress_time, decompress_time, compress_size, compress_ratios + + def compress_size(self, ptr): + return self.compressor.compress_size(ptr) # -- class NumpyCompressor(Compressor): diff --git a/qtensor/contraction_backends/performance_measurement_decorator.py b/qtensor/contraction_backends/performance_measurement_decorator.py index cedb3e45..39efffa7 100644 --- a/qtensor/contraction_backends/performance_measurement_decorator.py +++ b/qtensor/contraction_backends/performance_measurement_decorator.py @@ -1,7 +1,7 @@ import numpy as np from dataclasses import dataclass from qtensor.contraction_backends import ContractionBackend, NumpyBackend -from qtensor.contraction_backends.compression import CompressionBackend +from qtensor.contraction_backends.compression import CompressionBackend, CompressedTensor from pyrofiler import timing from qtensor.tools.lazy_import import torch, pandas import string @@ -40,6 +40,14 @@ def max_mem(self): def nvsmi_max_mem(self): mems = [m['nvmem'] for m in self.mem_history] return max(mems) + @property + def cupy_buffer_max_mem(self): + mems = [m['cupy_bufsize'] for m in self.mem_history] + return max(mems) + @property + def object_max_mem(self): + mems = [m['objmem'] for m in self.mem_history] + return max(mems) def check_store(self): import cupy @@ -54,6 +62,8 @@ def check_store(self): continue else: size = self.tensor_size(tensor) + if isinstance(tensor, CompressedTensor): + print("Tensor", tensor, "size", size) total_mem += size for key in deleted_keys: self.object_keys.remove(key) @@ -70,9 +80,12 @@ def check_store(self): mem=gpu_mem, cupy_bufsize=mempool.total_bytes(), nvmem = self._get_nvsmi_mem(), + cupybuf=mempool.total_bytes(), + objmem=total_mem, 
tensors_sizes=[len(tensor.indices) for tensor in self.object_store.values()] )) # -- + print('MH', self.mem_history[-1]) if cupy_mem>1024**2: self._print("CuPy memory usage", cupy_mem/1024/1024, "MB. Total MB:", mempool.total_bytes()/1024**2) @@ -80,12 +93,12 @@ def tensor_size(self, tensor)->int: from qtensor.compression import Tensor, CompressedTensor if tensor.data is None: return 0 - if isinstance(tensor, Tensor): - return tensor.data.nbytes - elif isinstance(tensor, CompressedTensor): - chunks = tensor.data + if isinstance(tensor, CompressedTensor): + chunks = tensor._data sizes = [tensor.compressor.compress_size(x) for x in chunks] return sum(sizes) + elif isinstance(tensor, Tensor): + return tensor.data.nbytes else: raise ValueError("Unknown tensor type") From 4ce7d0cdf68a00871b2a28662cee716230bfcee3 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Wed, 13 Dec 2023 22:34:48 -0600 Subject: [PATCH 107/126] compressed contraction memory leak testing --- qtensor/compression/Compressor.py | 9 +++-- qtensor/compression/__init__.py | 8 +++- qtensor/compression/tests/test_memory_leak.py | 40 ++++++++++++++----- 3 files changed, 43 insertions(+), 14 deletions(-) diff --git a/qtensor/compression/Compressor.py b/qtensor/compression/Compressor.py index ced72ad0..a92b8a12 100644 --- a/qtensor/compression/Compressor.py +++ b/qtensor/compression/Compressor.py @@ -351,9 +351,9 @@ def free_decompressed(self): #print("CUDA Free", x) cupy.cuda.runtime.free(x) # del x - # cupy.get_default_memory_pool().free_all_blocks() - # cupy.get_default_pinned_memory_pool().free_all_blocks() - # torch.cuda.empty_cache() + cupy.get_default_memory_pool().free_all_blocks() + cupy.get_default_pinned_memory_pool().free_all_blocks() + torch.cuda.empty_cache() self.decompressed_own = [] def free_compressed(self, ptr): @@ -365,6 +365,9 @@ def free_compressed(self, ptr): p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) decompressed_int = p_decompressed_int.contents cupy.cuda.runtime.free(decompressed_int.value) + cupy.get_default_memory_pool().free_all_blocks() + cupy.get_default_pinned_memory_pool().free_all_blocks() + torch.cuda.empty_cache() def compress(self, data): import cupy diff --git a/qtensor/compression/__init__.py b/qtensor/compression/__init__.py index cf248bee..9e320426 100644 --- a/qtensor/compression/__init__.py +++ b/qtensor/compression/__init__.py @@ -1,4 +1,10 @@ -from .Compressor import Compressor, NumpyCompressor, CUSZCompressor, ProfileCompressor +from .Compressor import ( + Compressor, + NumpyCompressor, + CUSZCompressor, + CUSZXCompressor, + ProfileCompressor, +) from .CompressedTensor import CompressedTensor, Tensor from .compressed_contraction import compressed_contract, compressed_sum from .cost_estimation import compressed_contraction_cost diff --git a/qtensor/compression/tests/test_memory_leak.py b/qtensor/compression/tests/test_memory_leak.py index 4ae02b93..fa6ea09e 100644 --- a/qtensor/compression/tests/test_memory_leak.py +++ b/qtensor/compression/tests/test_memory_leak.py @@ -1,32 +1,52 @@ """ Run `watch -n 0.1 nvidia-smi` and then run this test """ -from qtensor.compression import CUSZCompressor +from qtensor.compression import CUSZXCompressor import cupy import ctypes + +def _init_nvsmi(): + import nvidia_smi + nvidia_smi.nvmlInit() + nvsmi_handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0) + return nvsmi_handle + +def _get_nvsmi_mem(handle): + import nvidia_smi + info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle) + mem = info.used + return mem + + def 
free_compressed(ptr): cmp_bytes, *_ = ptr - p_decompressed_ptr = ctypes.addressof(cmp_bytes) + p_decompressed_ptr = ctypes.addressof(cmp_bytes[0]) # cast to int64 pointer # (effectively converting pointer to pointer to addr to pointer to int64) - p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + p_decompressed_int = ctypes.cast( + p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64) + ) decompressed_int = p_decompressed_int.contents cupy.cuda.runtime.free(decompressed_int.value) + def test_leak(): - N = 1024*1024//2 # 32MB + N = 1024 * 1024 * 32 # 32MB a = cupy.zeros(N, dtype=float) - a[::1024] = .01 + a[::1024] = 0.01 for i in range(1000): - a[32*i] = .005*(i%5+1) + a[32 * i] = 0.005 * (i % 5 + 1) + _nvsmi_handle = _init_nvsmi() + print(f"Original, [0]={a[0]}, [1024]={a[1024]}") - c = CUSZCompressor(r2r_error=1e-2, r2r_threshold=1e-2) + c = CUSZXCompressor(r2r_error=1e-2, r2r_threshold=1e-2) for i in range(200): out = c.compress(a) - print(i, "Compression ratio", 4*N/c.compress_size(out)) + print(i, "Compression ratio", 4 * N / c.compress_size(out)) b = c.decompress(out) - a[:] = b - print(i, "Decompressed, 0, 1024", b[0], b[1024]) + #a[:] = b + print(i, f"Decompressed, [0]={b[0]}, [1024]={b[1024]}") c.free_decompressed() free_compressed(out) + print(f"Memory usage: {_get_nvsmi_mem(_nvsmi_handle) / 1024 ** 3} GB") From 547c5f5f12ce826b236d3e1978c6f776ebe86d0c Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Thu, 14 Dec 2023 23:35:54 -0600 Subject: [PATCH 108/126] fix test_leak for cusz. only complex64 works --- qtensor/compression/Compressor.py | 8 ++--- qtensor/compression/tests/test_memory_leak.py | 31 ++++++++----------- 2 files changed, 17 insertions(+), 22 deletions(-) diff --git a/qtensor/compression/Compressor.py b/qtensor/compression/Compressor.py index a92b8a12..69d8a284 100644 --- a/qtensor/compression/Compressor.py +++ b/qtensor/compression/Compressor.py @@ -352,8 +352,8 @@ def free_decompressed(self): cupy.cuda.runtime.free(x) # del x cupy.get_default_memory_pool().free_all_blocks() - cupy.get_default_pinned_memory_pool().free_all_blocks() - torch.cuda.empty_cache() + #cupy.get_default_pinned_memory_pool().free_all_blocks() + #torch.cuda.empty_cache() self.decompressed_own = [] def free_compressed(self, ptr): @@ -366,8 +366,8 @@ def free_compressed(self, ptr): decompressed_int = p_decompressed_int.contents cupy.cuda.runtime.free(decompressed_int.value) cupy.get_default_memory_pool().free_all_blocks() - cupy.get_default_pinned_memory_pool().free_all_blocks() - torch.cuda.empty_cache() + #cupy.get_default_pinned_memory_pool().free_all_blocks() + #torch.cuda.empty_cache() def compress(self, data): import cupy diff --git a/qtensor/compression/tests/test_memory_leak.py b/qtensor/compression/tests/test_memory_leak.py index fa6ea09e..8163b577 100644 --- a/qtensor/compression/tests/test_memory_leak.py +++ b/qtensor/compression/tests/test_memory_leak.py @@ -19,34 +19,29 @@ def _get_nvsmi_mem(handle): return mem -def free_compressed(ptr): - cmp_bytes, *_ = ptr - p_decompressed_ptr = ctypes.addressof(cmp_bytes[0]) - # cast to int64 pointer - # (effectively converting pointer to pointer to addr to pointer to int64) - p_decompressed_int = ctypes.cast( - p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64) - ) - decompressed_int = p_decompressed_int.contents - cupy.cuda.runtime.free(decompressed_int.value) - - def test_leak(): - N = 1024 * 1024 * 32 # 32MB - a = cupy.zeros(N, dtype=float) + dtype = cupy.complex64 + dtype_size = dtype(0).nbytes + MB_elems = 
int(1024 ** 2 / dtype_size) + MB_target = 128 + N = MB_target * MB_elems + print(f"== Testing memory leak with {N} elements and {MB_target} MB array ==") + + a = cupy.zeros(N, dtype=dtype) a[::1024] = 0.01 + a[::8] = cupy.random.rand(N // 8) * 0.01 for i in range(1000): a[32 * i] = 0.005 * (i % 5 + 1) _nvsmi_handle = _init_nvsmi() print(f"Original, [0]={a[0]}, [1024]={a[1024]}") c = CUSZXCompressor(r2r_error=1e-2, r2r_threshold=1e-2) - for i in range(200): + for i in range(100): out = c.compress(a) print(i, "Compression ratio", 4 * N / c.compress_size(out)) b = c.decompress(out) - #a[:] = b + a[:] = b print(i, f"Decompressed, [0]={b[0]}, [1024]={b[1024]}") c.free_decompressed() - free_compressed(out) - print(f"Memory usage: {_get_nvsmi_mem(_nvsmi_handle) / 1024 ** 3} GB") + c.free_compressed(out) + print(f"== [{i}] Memory usage: {_get_nvsmi_mem(_nvsmi_handle) / 1024 ** 3} GB ==") From 12e21f85f21c64a1f8f60e100558a81f2e864a00 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Fri, 15 Dec 2023 00:24:50 -0600 Subject: [PATCH 109/126] add test test for leak in contraction --- qtensor/compression/Compressor.py | 6 ++ qtensor/compression/compressed_contraction.py | 8 +- qtensor/compression/tests/test_memory_leak.py | 76 ++++++++++++++++--- 3 files changed, 75 insertions(+), 15 deletions(-) diff --git a/qtensor/compression/Compressor.py b/qtensor/compression/Compressor.py index 69d8a284..63f1f323 100644 --- a/qtensor/compression/Compressor.py +++ b/qtensor/compression/Compressor.py @@ -104,6 +104,12 @@ def get_profile_stats(self): def compress_size(self, ptr): return self.compressor.compress_size(ptr) + + def free_decompressed(self): + self.compressor.free_decompressed() + + def free_compressed(self, ptr): + self.compressor.free_compressed(ptr) # -- class NumpyCompressor(Compressor): diff --git a/qtensor/compression/compressed_contraction.py b/qtensor/compression/compressed_contraction.py index 041eaf27..893987f9 100644 --- a/qtensor/compression/compressed_contraction.py +++ b/qtensor/compression/compressed_contraction.py @@ -40,8 +40,8 @@ def contract_two_tensors(A, B, T_out, einsum=np.einsum): result_ints = [relabel_dict_int[int(i)] for i in result_indices] else: result_ints = list(map(int, result_indices)) - print(A.data.shape) - print(B.data.shape) + #print(A.data.shape) + #print(B.data.shape) out = einsum(A.data, A_ints, B.data, B_ints, result_ints) if len(result_ints)>0: # This copying is reqiured because cupy doesn't support `out` argument. 
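The compressed_contract hunk below drops the profiler unwrapping (compressor.compressor.free_decompressed()) in favour of the forwarding methods just added to ProfileCompressor, so the chunk loop can release decompressed buffers through whichever compressor object it was handed. A minimal sketch of that forwarding contract (DummyCompressor is a made-up stand-in for the real CUSZX/NEWSZ wrappers, for illustration only):

    from qtensor.compression.Compressor import Compressor, ProfileCompressor

    class DummyCompressor(Compressor):          # hypothetical stand-in compressor
        def compress(self, data): return data
        def decompress(self, ptr): return ptr
        def compress_size(self, ptr): return ptr.nbytes
        def free_decompressed(self): pass
        def free_compressed(self, ptr): pass

    comp = ProfileCompressor(DummyCompressor())
    comp.free_decompressed()                    # forwarded to the wrapped compressor
    # compressed_contract only sees `comp`; it never has to unwrap the profiler.
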
@@ -119,13 +119,13 @@ def compressed_contract(A:Tensor, B: Tensor, C_ixs = [v for v in result_chunk_ixs if v not in exist_compressed] C = Tensor('tmp', indices=C_ixs, data=chunk_view) - contract_two_tensors(A_slice, B_slice, C) + contract_two_tensors(A_slice, B_slice, C, einsum=einsum) # Free temp slices #import cupy #print("Flags", A_slice.data.flags, B_slice.data.flags, C.data.flags) #cupy.cuda.runtime.free(A_slice.data.data.ptr) #cupy.cuda.runtime.free(B_slice.data.data.ptr) - compressor.compressor.free_decompressed() + compressor.free_decompressed() if len(need_compressed)==0: R = Tensor(new_tensor_name, result_indices, data=chunk) else: diff --git a/qtensor/compression/tests/test_memory_leak.py b/qtensor/compression/tests/test_memory_leak.py index 8163b577..4a5fde72 100644 --- a/qtensor/compression/tests/test_memory_leak.py +++ b/qtensor/compression/tests/test_memory_leak.py @@ -3,45 +3,99 @@ """ from qtensor.compression import CUSZXCompressor import cupy -import ctypes +import numpy as np def _init_nvsmi(): import nvidia_smi + nvidia_smi.nvmlInit() nvsmi_handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0) return nvsmi_handle + def _get_nvsmi_mem(handle): import nvidia_smi + info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle) mem = info.used return mem -def test_leak(): +def test_leak_compress(): dtype = cupy.complex64 dtype_size = dtype(0).nbytes - MB_elems = int(1024 ** 2 / dtype_size) + MB_elems = int(1024**2 / dtype_size) MB_target = 128 N = MB_target * MB_elems print(f"== Testing memory leak with {N} elements and {MB_target} MB array ==") + c = CUSZXCompressor(r2r_error=1e-2, r2r_threshold=1e-2) + import qtensor + + c = qtensor.compression.ProfileCompressor(c) + _nvsmi_handle = _init_nvsmi() a = cupy.zeros(N, dtype=dtype) a[::1024] = 0.01 - a[::8] = cupy.random.rand(N // 8) * 0.01 + a[::8] = cupy.random.rand(N // 8) for i in range(1000): - a[32 * i] = 0.005 * (i % 5 + 1) - _nvsmi_handle = _init_nvsmi() + a[32 * i + 1] = 0.005 * (i % 5 + 1) print(f"Original, [0]={a[0]}, [1024]={a[1024]}") - c = CUSZXCompressor(r2r_error=1e-2, r2r_threshold=1e-2) - for i in range(100): + for j in range(100): out = c.compress(a) print(i, "Compression ratio", 4 * N / c.compress_size(out)) b = c.decompress(out) - a[:] = b - print(i, f"Decompressed, [0]={b[0]}, [1024]={b[1024]}") + # a[:] = b + print(j, f"Decompressed, [0]={b[0]}, [1024]={b[1024]}") c.free_decompressed() c.free_compressed(out) - print(f"== [{i}] Memory usage: {_get_nvsmi_mem(_nvsmi_handle) / 1024 ** 3} GB ==") + print( + f"== [{j}] Memory usage: {_get_nvsmi_mem(_nvsmi_handle) / 1024 ** 3} GB ==" + ) + + +def test_leak_contract(): + from qtensor.compression.CompressedTensor import Tensor + import qtensor + from qtree.optimizer import Var + from qtensor.compression.compressed_contraction import compressed_contract + + dtype = cupy.complex64 + dtype_size = dtype(0).nbytes + MB_elems = int(1024**2 / dtype_size) + MB_target = 128 # target for largest tensor + N = MB_target * MB_elems + W_target = int(np.log2(N)) + print(f"== Testing memory leak with {N} elements and {MB_target} MB array ==") + c = CUSZXCompressor(r2r_error=1e-2, r2r_threshold=1e-2) + c = qtensor.compression.ProfileCompressor(c) + _nvsmi_handle = _init_nvsmi() + + As, Bs = W_target - 4, W_target - 2 + common_num = int((As + Bs - W_target)/2) + print(f"Common indices: {common_num}, W_target: {W_target}") + avars = [Var(i) for i in range(As)] + bvars = [Var(i) for i in range(common_num)] + [ + Var(i) for i in range(As, As + Bs - common_num) + ] + print("A vars", avars) + print("B 
vars", bvars) + TA = Tensor.empty("A", avars) + TB = Tensor.empty("B", bvars) + + for j in range(100): + res = compressed_contract( + TA, + TB, + avars[:common_num], + W_target, + c, + einsum=cupy.einsum, + move_data=cupy.array, + ) + print(f"Result indices: {res.indices}") + print(f"Result: {res}") + print( + f"== [{j}] Memory usage: {_get_nvsmi_mem(_nvsmi_handle) / 1024 ** 3} GB ==" + ) From d66c90b9724beba3f3907ea34cb24cda54d44afe Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Fri, 15 Dec 2023 22:28:45 -0600 Subject: [PATCH 110/126] fix memory leak problems with cuszx --- qtensor/compression/CompressedTensor.py | 61 +++++++++++-------- qtensor/compression/szx/src/cuszx_entry.cu | 1 + qtensor/compression/tests/test_memory_leak.py | 15 +++-- 3 files changed, 49 insertions(+), 28 deletions(-) diff --git a/qtensor/compression/CompressedTensor.py b/qtensor/compression/CompressedTensor.py index 08a0c390..2d9d25df 100644 --- a/qtensor/compression/CompressedTensor.py +++ b/qtensor/compression/CompressedTensor.py @@ -4,8 +4,9 @@ from qtree.system_defs import NP_ARRAY_TYPE from .Compressor import NumpyCompressor, Compressor + def iterate_indices(indices: list): - if len(indices)==0: + if len(indices) == 0: return [tuple()] ranges = [range(v.size) for v in indices] return itertools.product(*ranges) @@ -18,11 +19,16 @@ class CompressedTensor(Tensor): The data array is split along several indices S into 2^|S| parts """ - def __init__(self, name, indices, - data_key=None, data=None, - slice_indices=[], - compressor:Compressor=NumpyCompressor() - ): + + def __init__( + self, + name, + indices, + data_key=None, + data=None, + slice_indices=[], + compressor: Compressor = NumpyCompressor(), + ): """ Initialize the tensor name: str, @@ -49,7 +55,14 @@ def __init__(self, name, indices, self._dtype = None @classmethod - def empty(cls, name, indices, slice_indices=[], compressor=NumpyCompressor(), dtype:type=NP_ARRAY_TYPE): + def empty( + cls, + name, + indices, + slice_indices=[], + compressor=NumpyCompressor(), + dtype: type = NP_ARRAY_TYPE, + ): t = super().empty(name, indices, dtype) t.compressor = compressor if slice_indices: @@ -63,18 +76,14 @@ def compress_indices(self, indices: list): Does not support compressing when already compressed """ - slice_dict = { - i: slice(None) for i in self.indices - } + slice_dict = {i: slice(None) for i in self.indices} data_chunks = [] for ivals in iterate_indices(indices): for ix, ival in zip(indices, ivals): - slice_dict[ix] = ival# slice(ival, ival+1) + slice_dict[ix] = ival # slice(ival, ival+1) dslice = self.data[tuple(slice_dict[i] for i in self.indices)] - data_chunks.append( - self.compressor.compress(dslice) - ) + data_chunks.append(self.compressor.compress(dslice)) del dslice self._data = data_chunks self.slice_indices = indices @@ -92,7 +101,7 @@ def array_indices(self): def get_chunk(self, ivals): dims = [v.size for v in self.slice_indices] - if len(ivals)==0: + if len(ivals) == 0: flat_ix = 0 else: flat_ix = np.ravel_multi_index(ivals, dims) @@ -104,13 +113,15 @@ def set_chunk(self, ivals, chunk: np.ndarray): if self._dtype is None: self._dtype = chunk.dtype else: - assert self.dtype == chunk.dtype, f"Chunk dtype {chunk.dtype} does not match tensor dtype {self.dtype}" + assert ( + self.dtype == chunk.dtype + ), f"Chunk dtype {chunk.dtype} does not match tensor dtype {self.dtype}" # -- if self._data is None: - self._data = np.empty(2**len(self.slice_indices), dtype=object) + self._data = np.empty(2 ** len(self.slice_indices), dtype=object) dims = [v.size 
for v in self.slice_indices] - if len(ivals)==0: + if len(ivals) == 0: flat_ix = 0 else: flat_ix = np.ravel_multi_index(ivals, dims) @@ -138,11 +149,10 @@ def __getitem__(self, key): chunk_slice = chunk[tuple(chunk_slices_ints)] return Tensor(new_name, new_indices, data=chunk_slice) - def __str__(self): - array_ix = ','.join(map(str, self.array_indices)) - split_ix= ','.join(map(str, self.slice_indices)) - return f'{self._name}{{{split_ix}}}({array_ix})' + array_ix = ",".join(map(str, self.array_indices)) + split_ix = ",".join(map(str, self.slice_indices)) + return f"{self._name}{{{split_ix}}}({array_ix})" def copy(self, name=None, indices=None, data_key=None, data=None): raise NotImplementedError() @@ -150,5 +160,8 @@ def copy(self, name=None, indices=None, data_key=None, data=None): def __repr__(self): return self.__str__() - - + def __del__(self): + if self._data is not None: + for chunk in self._data: + self.compressor.free_compressed(chunk) + del self diff --git a/qtensor/compression/szx/src/cuszx_entry.cu b/qtensor/compression/szx/src/cuszx_entry.cu index 18f60e09..7f5f78e1 100644 --- a/qtensor/compression/szx/src/cuszx_entry.cu +++ b/qtensor/compression/szx/src/cuszx_entry.cu @@ -898,6 +898,7 @@ size_t better_post_proc(size_t *outSize, float *oriData, unsigned char *meta, //outBytes = (unsigned char*)malloc(out_size); unsigned char* r = outBytes; unsigned char* r_old = outBytes; + // cudaDeviceSynchronize(); printf("%s\n",cudaGetLastError()); checkCudaErrors(cudaMemset(r, SZx_VER_MAJOR, sizeof(char))); checkCudaErrors(cudaMemset(r+1, SZx_VER_MINOR, sizeof(char))); checkCudaErrors(cudaMemset(r+2, 1, sizeof(char))); diff --git a/qtensor/compression/tests/test_memory_leak.py b/qtensor/compression/tests/test_memory_leak.py index 4a5fde72..e0ca675c 100644 --- a/qtensor/compression/tests/test_memory_leak.py +++ b/qtensor/compression/tests/test_memory_leak.py @@ -64,7 +64,7 @@ def test_leak_contract(): dtype = cupy.complex64 dtype_size = dtype(0).nbytes MB_elems = int(1024**2 / dtype_size) - MB_target = 128 # target for largest tensor + MB_target = 64 # target for largest tensor N = MB_target * MB_elems W_target = int(np.log2(N)) print(f"== Testing memory leak with {N} elements and {MB_target} MB array ==") @@ -73,7 +73,7 @@ def test_leak_contract(): _nvsmi_handle = _init_nvsmi() As, Bs = W_target - 4, W_target - 2 - common_num = int((As + Bs - W_target)/2) + common_num = int((As + Bs - W_target) / 2) print(f"Common indices: {common_num}, W_target: {W_target}") avars = [Var(i) for i in range(As)] bvars = [Var(i) for i in range(common_num)] + [ @@ -82,20 +82,27 @@ def test_leak_contract(): print("A vars", avars) print("B vars", bvars) TA = Tensor.empty("A", avars) + TA.data = np.random.rand(*TA.shape).astype(dtype) TB = Tensor.empty("B", bvars) + TB.data = np.random.rand(*TB.shape).astype(dtype) + _mem_histories = [] for j in range(100): res = compressed_contract( TA, TB, avars[:common_num], - W_target, + W_target - 1, c, einsum=cupy.einsum, move_data=cupy.array, ) + [c.free_compressed(x) for x in res.data] print(f"Result indices: {res.indices}") print(f"Result: {res}") + _mem = _get_nvsmi_mem(_nvsmi_handle) / 1024**3 + print(f"== [{j}] Memory usage: {_mem} GB ==") + _mem_histories.append(_mem) print( - f"== [{j}] Memory usage: {_get_nvsmi_mem(_nvsmi_handle) / 1024 ** 3} GB ==" + f"== [{j}] Memory history: {[np.round(x, 2) for x in _mem_histories]} GB ==" ) From b4a112f4570741d286329bc86b3c11337b0b45bd Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Fri, 15 Dec 2023 23:39:01 -0600 Subject: 
[PATCH 111/126] fix line endings From 354b4e0ac9dd4c93e9b60119fb3890895c5cc6c2 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Fri, 15 Dec 2023 23:44:07 -0600 Subject: [PATCH 112/126] replace crlf with lf --- qtensor/compression/CompressedTensor.py | 334 +- qtensor/compression/Compressor.py | 1156 +++--- qtensor/compression/newsz/nvcomp | 1 + qtensor/compression/szx/cuda-samples | 1 + qtensor/compression/szx/src/cuszx_entry.cu | 3922 ++++++++++---------- 5 files changed, 2708 insertions(+), 2706 deletions(-) create mode 160000 qtensor/compression/newsz/nvcomp create mode 160000 qtensor/compression/szx/cuda-samples diff --git a/qtensor/compression/CompressedTensor.py b/qtensor/compression/CompressedTensor.py index 2d9d25df..dfe9d2a8 100644 --- a/qtensor/compression/CompressedTensor.py +++ b/qtensor/compression/CompressedTensor.py @@ -1,167 +1,167 @@ -import itertools -import numpy as np -from qtree.optimizer import Tensor -from qtree.system_defs import NP_ARRAY_TYPE -from .Compressor import NumpyCompressor, Compressor - - -def iterate_indices(indices: list): - if len(indices) == 0: - return [tuple()] - ranges = [range(v.size) for v in indices] - return itertools.product(*ranges) - - -class CompressedTensor(Tensor): - """ - Extension of the Tensor class that holds compressed data - - The data array is split along several indices S into 2^|S| parts - - """ - - def __init__( - self, - name, - indices, - data_key=None, - data=None, - slice_indices=[], - compressor: Compressor = NumpyCompressor(), - ): - """ - Initialize the tensor - name: str, - the name of the tensor. Used only for display/convenience. - May be not unique. - indices: tuple, - Indices of the tensor - shape: tuple, - shape of a tensor - data_key: int - Key to find tensor's data in the global storage - data: np.array - Actual data of the tensor. Default None. - Usually is not supplied at initialization. - slice_indices: list[Var] - indices along which the tensor is split into chunks - """ - super().__init__(name, indices, data_key=data_key, data=data) - self.slice_indices = slice_indices - self.compressor = compressor - if data is not None: - self._dtype = data.dtype - else: - self._dtype = None - - @classmethod - def empty( - cls, - name, - indices, - slice_indices=[], - compressor=NumpyCompressor(), - dtype: type = NP_ARRAY_TYPE, - ): - t = super().empty(name, indices, dtype) - t.compressor = compressor - if slice_indices: - t.compress_indices(slice_indices) - return t - - def compress_indices(self, indices: list): - """ - Slice the self.data along dimensions in `indices`, - store them compressed - - Does not support compressing when already compressed - """ - slice_dict = {i: slice(None) for i in self.indices} - data_chunks = [] - for ivals in iterate_indices(indices): - for ix, ival in zip(indices, ivals): - slice_dict[ix] = ival # slice(ival, ival+1) - dslice = self.data[tuple(slice_dict[i] for i in self.indices)] - - data_chunks.append(self.compressor.compress(dslice)) - del dslice - self._data = data_chunks - self.slice_indices = indices - - @property - def dtype(self): - """ - DataType of wrapped chunks. 
- """ - return self._dtype - - @property - def array_indices(self): - return [x for x in self.indices if x not in self.slice_indices] - - def get_chunk(self, ivals): - dims = [v.size for v in self.slice_indices] - if len(ivals) == 0: - flat_ix = 0 - else: - flat_ix = np.ravel_multi_index(ivals, dims) - ptr = self._data[flat_ix] - return self.compressor.decompress(ptr) - - def set_chunk(self, ivals, chunk: np.ndarray): - # -- Check for consistent data types between chunks - if self._dtype is None: - self._dtype = chunk.dtype - else: - assert ( - self.dtype == chunk.dtype - ), f"Chunk dtype {chunk.dtype} does not match tensor dtype {self.dtype}" - # -- - - if self._data is None: - self._data = np.empty(2 ** len(self.slice_indices), dtype=object) - dims = [v.size for v in self.slice_indices] - if len(ivals) == 0: - flat_ix = 0 - else: - flat_ix = np.ravel_multi_index(ivals, dims) - self._data[flat_ix] = self.compressor.compress(chunk) - - def __getitem__(self, key): - """ - Get a slice of the tensor along the indices in `key` - Currently slicing over all compressed indices is required. - Slices over compressed indices must be ints - """ - slices_ints, new_indices = self._parse_getitem_key(key) - slice_dict = {} - chunk_slices_ints = [] - compression_ints = [] - for ix, ival in zip(self.indices, slices_ints): - slice_dict[ix] = ival - if ix in self.slice_indices: - compression_ints.append(ival) - else: - chunk_slices_ints.append(ival) - chunk = self.get_chunk(compression_ints) - new_name = f"{self.name}[sliced]" - # careful: chunk will not be collected even if slice is small - chunk_slice = chunk[tuple(chunk_slices_ints)] - return Tensor(new_name, new_indices, data=chunk_slice) - - def __str__(self): - array_ix = ",".join(map(str, self.array_indices)) - split_ix = ",".join(map(str, self.slice_indices)) - return f"{self._name}{{{split_ix}}}({array_ix})" - - def copy(self, name=None, indices=None, data_key=None, data=None): - raise NotImplementedError() - - def __repr__(self): - return self.__str__() - - def __del__(self): - if self._data is not None: - for chunk in self._data: - self.compressor.free_compressed(chunk) - del self +import itertools +import numpy as np +from qtree.optimizer import Tensor +from qtree.system_defs import NP_ARRAY_TYPE +from .Compressor import NumpyCompressor, Compressor + + +def iterate_indices(indices: list): + if len(indices) == 0: + return [tuple()] + ranges = [range(v.size) for v in indices] + return itertools.product(*ranges) + + +class CompressedTensor(Tensor): + """ + Extension of the Tensor class that holds compressed data + + The data array is split along several indices S into 2^|S| parts + + """ + + def __init__( + self, + name, + indices, + data_key=None, + data=None, + slice_indices=[], + compressor: Compressor = NumpyCompressor(), + ): + """ + Initialize the tensor + name: str, + the name of the tensor. Used only for display/convenience. + May be not unique. + indices: tuple, + Indices of the tensor + shape: tuple, + shape of a tensor + data_key: int + Key to find tensor's data in the global storage + data: np.array + Actual data of the tensor. Default None. + Usually is not supplied at initialization. 
+ slice_indices: list[Var] + indices along which the tensor is split into chunks + """ + super().__init__(name, indices, data_key=data_key, data=data) + self.slice_indices = slice_indices + self.compressor = compressor + if data is not None: + self._dtype = data.dtype + else: + self._dtype = None + + @classmethod + def empty( + cls, + name, + indices, + slice_indices=[], + compressor=NumpyCompressor(), + dtype: type = NP_ARRAY_TYPE, + ): + t = super().empty(name, indices, dtype) + t.compressor = compressor + if slice_indices: + t.compress_indices(slice_indices) + return t + + def compress_indices(self, indices: list): + """ + Slice the self.data along dimensions in `indices`, + store them compressed + + Does not support compressing when already compressed + """ + slice_dict = {i: slice(None) for i in self.indices} + data_chunks = [] + for ivals in iterate_indices(indices): + for ix, ival in zip(indices, ivals): + slice_dict[ix] = ival # slice(ival, ival+1) + dslice = self.data[tuple(slice_dict[i] for i in self.indices)] + + data_chunks.append(self.compressor.compress(dslice)) + del dslice + self._data = data_chunks + self.slice_indices = indices + + @property + def dtype(self): + """ + DataType of wrapped chunks. + """ + return self._dtype + + @property + def array_indices(self): + return [x for x in self.indices if x not in self.slice_indices] + + def get_chunk(self, ivals): + dims = [v.size for v in self.slice_indices] + if len(ivals) == 0: + flat_ix = 0 + else: + flat_ix = np.ravel_multi_index(ivals, dims) + ptr = self._data[flat_ix] + return self.compressor.decompress(ptr) + + def set_chunk(self, ivals, chunk: np.ndarray): + # -- Check for consistent data types between chunks + if self._dtype is None: + self._dtype = chunk.dtype + else: + assert ( + self.dtype == chunk.dtype + ), f"Chunk dtype {chunk.dtype} does not match tensor dtype {self.dtype}" + # -- + + if self._data is None: + self._data = np.empty(2 ** len(self.slice_indices), dtype=object) + dims = [v.size for v in self.slice_indices] + if len(ivals) == 0: + flat_ix = 0 + else: + flat_ix = np.ravel_multi_index(ivals, dims) + self._data[flat_ix] = self.compressor.compress(chunk) + + def __getitem__(self, key): + """ + Get a slice of the tensor along the indices in `key` + Currently slicing over all compressed indices is required. 
+ Slices over compressed indices must be ints + """ + slices_ints, new_indices = self._parse_getitem_key(key) + slice_dict = {} + chunk_slices_ints = [] + compression_ints = [] + for ix, ival in zip(self.indices, slices_ints): + slice_dict[ix] = ival + if ix in self.slice_indices: + compression_ints.append(ival) + else: + chunk_slices_ints.append(ival) + chunk = self.get_chunk(compression_ints) + new_name = f"{self.name}[sliced]" + # careful: chunk will not be collected even if slice is small + chunk_slice = chunk[tuple(chunk_slices_ints)] + return Tensor(new_name, new_indices, data=chunk_slice) + + def __str__(self): + array_ix = ",".join(map(str, self.array_indices)) + split_ix = ",".join(map(str, self.slice_indices)) + return f"{self._name}{{{split_ix}}}({array_ix})" + + def copy(self, name=None, indices=None, data_key=None, data=None): + raise NotImplementedError() + + def __repr__(self): + return self.__str__() + + def __del__(self): + if self._data is not None: + for chunk in self._data: + self.compressor.free_compressed(chunk) + del self diff --git a/qtensor/compression/Compressor.py b/qtensor/compression/Compressor.py index 63f1f323..d9d8fbfe 100644 --- a/qtensor/compression/Compressor.py +++ b/qtensor/compression/Compressor.py @@ -1,578 +1,578 @@ -import io -import sys -import numpy as np -from pathlib import Path -print(Path(__file__).parent/'szx/src/') -sys.path.append(str(Path(__file__).parent/'szx/src/')) -sys.path.append('./szx/src') -# sys.path.append(str(Path(__file__).parent/'szp/src/')) -# sys.path.append('./szp/src') - -sys.path.append(str(Path(__file__).parent/'cusz/src')) -sys.path.append('./cusz/src') -sys.path.append(str(Path(__file__).parent/'torch_quant')) -sys.path.append('./torch_quant') -sys.path.append(str(Path(__file__).parent/'newsz')) -sys.path.append('./newsz') - -import torch -try: - from cuszx_wrapper import cuszx_host_compress, cuszx_host_decompress, cuszx_device_compress, cuszx_device_decompress - # from cuSZp_wrapper import cuszp_device_compress, cuszp_device_decompress - from cusz_wrapper import cusz_device_compress, cusz_device_decompress - from torch_quant_perchannel import quant_device_compress, quant_device_decompress - from newsz_wrapper import newsz_device_compress, newsz_device_decompress -except: - print("import failed") - # Silently fail on missing build of cuszx - pass - -CUSZX_BLOCKSIZE = 256 - -class Compressor(): - def compress(self, data): - raise NotImplementedError - - def decompress(self, ptr): - raise NotImplementedError - - def compress_size(self, ptr): - return ptr.nbytes - -# -- Debugging and profiling - -import time -from dataclasses import dataclass, asdict -@dataclass -class CompressMeasure: - time: float = 0 - size_in: int = 0 - size_out: int = 0 - label: str = '' - - def __str__(self): - compress_ratio = self.size_in / self.size_out - return (f'Measure: {self.time:.3f}s, ' - f'{self.size_in/1024**2:.2f}MB -> {self.size_out/1024**2:.2f}MB ({compress_ratio:.3f} in/out ratio)' - ) - -class ProfileCompressor(Compressor): - def __init__(self, compressor:Compressor, trace=True): - self.trace = trace - self.compressor = compressor - self.profile_data = {'compress': [], 'decompress': []} - - def compress(self, data): - start = time.time() - ptr = self.compressor.compress(data) - end = time.time() - out_size = self.compressor.compress_size(ptr) - cmeasure = CompressMeasure(end-start, data.nbytes, out_size) - self.profile_data['compress'].append(cmeasure) - if self.trace: - print(f'Compress: {cmeasure}') - return ptr - - def 
decompress(self, ptr): - start = time.time() - data = self.compressor.decompress(ptr) - end = time.time() - in_size = self.compressor.compress_size(ptr) - dmeasure = CompressMeasure(end-start, in_size, data.nbytes) - self.profile_data['decompress'].append(dmeasure) - if self.trace: - print(f'Decompress: {dmeasure}') - return data - - def get_profile_data(self): - return self.profile_data['compress'], self.profile_data['decompress'] - - def get_profile_data_json(self): - compress, decompress = self.get_profile_data() - return { - 'compress': [asdict(c) for c in compress], - 'decompress': [asdict(c) for c in decompress], - } - - def get_profile_stats(self): - compress, decompress = self.get_profile_data() - compress_time = sum([x.time for x in compress]) - decompress_time = sum([x.time for x in decompress]) - compress_ratios = np.mean([x.size_in/x.size_out for x in compress]) - compress_size = sum([x.size_out for x in compress]) - return compress_time, decompress_time, compress_size, compress_ratios - - def compress_size(self, ptr): - return self.compressor.compress_size(ptr) - - def free_decompressed(self): - self.compressor.free_decompressed() - - def free_compressed(self, ptr): - self.compressor.free_compressed(ptr) -# -- - -class NumpyCompressor(Compressor): - def compress(self, data): - comp = io.BytesIO() - np.savez_compressed(comp, data) - return comp - - def compress_size(self, ptr): - return ptr.getbuffer().nbytes - - def decompress(self, ptr): - ptr.seek(0) - return np.load(ptr)['arr_0'] - -class TorchCompressor(Compressor): - def __init__(self, r2r_error=1e-3, r2r_threshold=1e-3): - self.r2r_error = r2r_error - self.r2r_threshold = r2r_threshold - self.decompressed_own = [] - - def free_decompressed(self): - import cupy - print("Cleanup", len(self.decompressed_own)) - for x in self.decompressed_own: - del x - cupy.get_default_memory_pool().free_all_blocks() - cupy.get_default_pinned_memory_pool().free_all_blocks() - torch.cuda.empty_cache() - self.decompressed_own = [] - - def free_compressed(self, ptr): - import ctypes, cupy - cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = ptr - p_decompressed_ptr = ctypes.addressof(cmp_bytes[0]) - # cast to int64 pointer - # (effectively converting pointer to pointer to addr to pointer to int64) - p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) - decompressed_int = p_decompressed_int.contents - cupy.cuda.runtime.free(decompressed_int.value) - - def compress(self, data): - import cupy - if isinstance(data, cupy.ndarray): - isCuPy = True - else: - isCuPy = False - num_elements = data.size - # Adapt numele depending on itemsize - itemsize = data.dtype.itemsize - num_elements_eff = int(num_elements*itemsize/4) - - dtype = data.dtype - cmp_bytes, outSize_ptr = self.cuszx_compress(isCuPy, data, num_elements_eff, self.r2r_error, self.r2r_threshold) - return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr) - - # return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr.contents.value) - - def compress_size(self, ptr): - return ptr[5] - - def decompress(self, obj): - import cupy - import ctypes - cmp_bytes, num_elements_eff, isCuPy, shape, dtype, cmpsize = obj - decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, cmpsize, num_elements_eff, self, dtype) - arr_cp = decompressed_ptr[0] - - arr = cupy.reshape(arr_cp, shape) - self.decompressed_own.append(arr) - return arr - - ### Compression API with cuSZx ### - # Parameters: - # - isCuPy = boolean, true if data is 
CuPy array, otherwise is numpy array - # - data = Numpy or Cupy ndarray, assumed to be 1-D, np.float32 type - # - num_elements = Number of floating point elements in data - # - r2r_error = relative-to-value-range error bound for lossy compression - # - r2r_threshold = relative-to-value-range threshold to floor values to zero - # Returns: - # - cmp_bytes = Unsigned char pointer to compressed bytes - # - outSize_ptr = Pointer to size_t representing length in bytes of cmp_bytes - def cuszx_compress(self, isCuPy, data, num_elements, r2r_error, r2r_threshold): - - if not isCuPy: - cmp_bytes, outSize_ptr = cuszx_host_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) - else: - #cmp_bytes, outSize_ptr = cuszp_device_compress(data, r2r_error, num_elements, r2r_threshold) - - cmp_bytes, outSize_ptr = quant_device_compress(data, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) - del data - torch.cuda.empty_cache() - return cmp_bytes, outSize_ptr - - ### Decompression API with cuSZx ### - # Parameters: - # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array - # - cmp_bytes = Unsigned char pointer to compressed bytes - # - num_elements = Number of floating point elements in original data - # Returns: - # - decompressed_data = Float32 pointer to decompressed data - # - # Notes: Use ctypes to cast decompressed data to Numpy or CuPy type - - def cuszx_decompress(self, isCuPy, cmp_bytes, cmpsize, num_elements, owner, dtype): - if not isCuPy: - decompressed_data = cuszx_host_decompress(num_elements, cmp_bytes) - else: - #decompressed_data = cuszp_device_decompress(num_elements, cmp_bytes, cmpsize, owner,dtype) -# oriData, absErrBound, nbEle, blockSize,threshold - decompressed_data = quant_device_decompress(num_elements, cmp_bytes, owner,dtype) - return decompressed_data - -class NEWSZCompressor(Compressor): - def __init__(self, r2r_error=1e-3, r2r_threshold=1e-3): - self.r2r_error = r2r_error - self.r2r_threshold = r2r_threshold - self.decompressed_own = [] - - def free_decompressed(self): - import cupy - print("Cleanup", len(self.decompressed_own)) - for x in self.decompressed_own: - #print(x) - #if x == None: - # continue - #else: - #print("CUDA Free", x) - cupy.cuda.runtime.free(x) - # del x - # cupy.get_default_memory_pool().free_all_blocks() - # cupy.get_default_pinned_memory_pool().free_all_blocks() - # torch.cuda.empty_cache() - self.decompressed_own = [] - - def free_compressed(self, ptr): - import ctypes, cupy - cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = ptr - p_decompressed_ptr = ctypes.addressof(cmp_bytes[0]) - # cast to int64 pointer - # (effectively converting pointer to pointer to addr to pointer to int64) - p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) - decompressed_int = p_decompressed_int.contents - cupy.cuda.runtime.free(decompressed_int.value) - - def compress(self, data): - import cupy - if isinstance(data, cupy.ndarray): - isCuPy = True - else: - isCuPy = False - num_elements = data.size - # Adapt numele depending on itemsize - itemsize = data.dtype.itemsize - num_elements_eff = int(num_elements*itemsize/4) - - dtype = data.dtype - cmp_bytes, outSize_ptr = self.cuszx_compress(isCuPy, data, num_elements_eff, self.r2r_error, self.r2r_threshold) - # return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr) - - return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr.contents.value) - - def compress_size(self, ptr): - return ptr[5] - - def decompress(self, obj): 
- import cupy - import ctypes - cmp_bytes, num_elements_eff, isCuPy, shape, dtype, cmpsize = obj - decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, cmpsize, num_elements_eff, self, dtype) - arr_cp = decompressed_ptr[0] - self.decompressed_own.append(decompressed_ptr[1]) - - # -- Workaround to convert GPU pointer to int - # p_decompressed_ptr = ctypes.addressof(decompressed_ptr) - # # cast to int64 pointer - # # (effectively converting pointer to pointer to addr to pointer to int64) - # p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) - # decompressed_int = p_decompressed_int.contents - # # -- - # self.decompressed_own.append(decompressed_int.value) - # mem = cupy.cuda.UnownedMemory(decompressed_int.value, num_elements_eff, self, device_id=0) - # mem_ptr = cupy.cuda.memory.MemoryPointer(mem, 0) - arr = cupy.reshape(arr_cp, shape) - # self.decompressed_own.append(arr) - # arr = cupy.ndarray(shape, dtype=dtype, memptr=mem_ptr) - return arr - - ### Compression API with cuSZx ### - # Parameters: - # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array - # - data = Numpy or Cupy ndarray, assumed to be 1-D, np.float32 type - # - num_elements = Number of floating point elements in data - # - r2r_error = relative-to-value-range error bound for lossy compression - # - r2r_threshold = relative-to-value-range threshold to floor values to zero - # Returns: - # - cmp_bytes = Unsigned char pointer to compressed bytes - # - outSize_ptr = Pointer to size_t representing length in bytes of cmp_bytes - def cuszx_compress(self, isCuPy, data, num_elements, r2r_error, r2r_threshold): - - if not isCuPy: - cmp_bytes, outSize_ptr = cuszx_host_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) - else: - #cmp_bytes, outSize_ptr = cuszp_device_compress(data, r2r_error, num_elements, r2r_threshold) - cmp_bytes, outSize_ptr = newsz_device_compress(data,num_elements, CUSZX_BLOCKSIZE,r2r_threshold) - # cmp_bytes, outSize_ptr = quant_device_compress(data, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) - del data - torch.cuda.empty_cache() - return cmp_bytes, outSize_ptr - - ### Decompression API with cuSZx ### - # Parameters: - # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array - # - cmp_bytes = Unsigned char pointer to compressed bytes - # - num_elements = Number of floating point elements in original data - # Returns: - # - decompressed_data = Float32 pointer to decompressed data - # - # Notes: Use ctypes to cast decompressed data to Numpy or CuPy type - - def cuszx_decompress(self, isCuPy, cmp_bytes, cmpsize, num_elements, owner, dtype): - if not isCuPy: - decompressed_data = cuszx_host_decompress(num_elements, cmp_bytes) - else: - # cuszx_device_decompress(nbEle, cmpBytes, owner, dtype) - decompressed_data = newsz_device_decompress(num_elements, cmp_bytes, owner,dtype) -# oriData, absErrBound, nbEle, blockSize,threshold - # decompressed_data = quant_device_decompress(num_elements, cmp_bytes, owner,dtype) - return decompressed_data - -class CUSZXCompressor(Compressor): - def __init__(self, r2r_error=1e-3, r2r_threshold=1e-3): - self.r2r_error = r2r_error - self.r2r_threshold = r2r_threshold - self.decompressed_own = [] - - def free_decompressed(self): - import cupy - print("Cleanup", len(self.decompressed_own)) - for x in self.decompressed_own: - #print(x) - #if x == None: - # continue - #else: - #print("CUDA Free", x) - cupy.cuda.runtime.free(x) - # del x - cupy.get_default_memory_pool().free_all_blocks() - 
#cupy.get_default_pinned_memory_pool().free_all_blocks() - #torch.cuda.empty_cache() - self.decompressed_own = [] - - def free_compressed(self, ptr): - import ctypes, cupy - cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = ptr - p_decompressed_ptr = ctypes.addressof(cmp_bytes[0]) - # cast to int64 pointer - # (effectively converting pointer to pointer to addr to pointer to int64) - p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) - decompressed_int = p_decompressed_int.contents - cupy.cuda.runtime.free(decompressed_int.value) - cupy.get_default_memory_pool().free_all_blocks() - #cupy.get_default_pinned_memory_pool().free_all_blocks() - #torch.cuda.empty_cache() - - def compress(self, data): - import cupy - if isinstance(data, cupy.ndarray): - isCuPy = True - else: - isCuPy = False - num_elements = data.size - # Adapt numele depending on itemsize - itemsize = data.dtype.itemsize - num_elements_eff = int(num_elements*itemsize/4) - - dtype = data.dtype - cmp_bytes, outSize_ptr = self.cuszx_compress(isCuPy, data, num_elements_eff, self.r2r_error, self.r2r_threshold) - # return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr) - - return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr.contents.value) - - def compress_size(self, ptr): - return ptr[5] - - def decompress(self, obj): - import cupy - import ctypes - cmp_bytes, num_elements_eff, isCuPy, shape, dtype, cmpsize = obj - decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, cmpsize, num_elements_eff, self, dtype) - arr_cp = decompressed_ptr[0] - self.decompressed_own.append(decompressed_ptr[1]) - - # -- Workaround to convert GPU pointer to int - # p_decompressed_ptr = ctypes.addressof(decompressed_ptr) - # # cast to int64 pointer - # # (effectively converting pointer to pointer to addr to pointer to int64) - # p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) - # decompressed_int = p_decompressed_int.contents - # # -- - # self.decompressed_own.append(decompressed_int.value) - # mem = cupy.cuda.UnownedMemory(decompressed_int.value, num_elements_eff, self, device_id=0) - # mem_ptr = cupy.cuda.memory.MemoryPointer(mem, 0) - arr = cupy.reshape(arr_cp, shape) - # self.decompressed_own.append(arr) - # arr = cupy.ndarray(shape, dtype=dtype, memptr=mem_ptr) - return arr - - ### Compression API with cuSZx ### - # Parameters: - # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array - # - data = Numpy or Cupy ndarray, assumed to be 1-D, np.float32 type - # - num_elements = Number of floating point elements in data - # - r2r_error = relative-to-value-range error bound for lossy compression - # - r2r_threshold = relative-to-value-range threshold to floor values to zero - # Returns: - # - cmp_bytes = Unsigned char pointer to compressed bytes - # - outSize_ptr = Pointer to size_t representing length in bytes of cmp_bytes - def cuszx_compress(self, isCuPy, data, num_elements, r2r_error, r2r_threshold): - - if not isCuPy: - cmp_bytes, outSize_ptr = cuszx_host_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) - else: - #cmp_bytes, outSize_ptr = cuszp_device_compress(data, r2r_error, num_elements, r2r_threshold) - cmp_bytes, outSize_ptr = cuszx_device_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE,r2r_threshold) - # cmp_bytes, outSize_ptr = quant_device_compress(data, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) - del data - torch.cuda.empty_cache() - return cmp_bytes, outSize_ptr - - 
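    # A hedged round-trip sketch of this compressor's API (illustrative only;
    # assumes a CuPy float32 array `x` already on the GPU and a built cuszx
    # extension):
    #
    #     comp = CUSZXCompressor(r2r_error=1e-3, r2r_threshold=1e-3)
    #     ptr = comp.compress(x)    # (cmp_bytes, n_eff, isCuPy, shape, dtype, nbytes)
    #     y = comp.decompress(ptr)  # CuPy array restored to x.shape
    #     comp.free_compressed(ptr)
    #     comp.free_decompressed()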
### Decompression API with cuSZx ### - # Parameters: - # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array - # - cmp_bytes = Unsigned char pointer to compressed bytes - # - num_elements = Number of floating point elements in original data - # Returns: - # - decompressed_data = Float32 pointer to decompressed data - # - # Notes: Use ctypes to cast decompressed data to Numpy or CuPy type - - def cuszx_decompress(self, isCuPy, cmp_bytes, cmpsize, num_elements, owner, dtype): - if not isCuPy: - decompressed_data = cuszx_host_decompress(num_elements, cmp_bytes) - else: - # cuszx_device_decompress(nbEle, cmpBytes, owner, dtype) - decompressed_data = cuszx_device_decompress(num_elements, cmp_bytes, owner,dtype) -# oriData, absErrBound, nbEle, blockSize,threshold - # decompressed_data = quant_device_decompress(num_elements, cmp_bytes, owner,dtype) - return decompressed_data - -class CUSZCompressor(Compressor): - def __init__(self, r2r_error=1e-3, r2r_threshold=1e-3): - self.r2r_error = r2r_error - self.r2r_threshold = r2r_threshold - self.decompressed_own = [] - - def free_decompressed(self): - import cupy - print("Cleanup", len(self.decompressed_own)) - for x in self.decompressed_own: - #print(x) - #if x == None: - # continue - #else: - #print("CUDA Free", x) - cupy.cuda.runtime.free(x) - # del x - # cupy.get_default_memory_pool().free_all_blocks() - # cupy.get_default_pinned_memory_pool().free_all_blocks() - # torch.cuda.empty_cache() - self.decompressed_own = [] - - def free_compressed(self, ptr): - import ctypes, cupy - cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = ptr - p_decompressed_ptr = ctypes.addressof(cmp_bytes[0]) - # cast to int64 pointer - # (effectively converting pointer to pointer to addr to pointer to int64) - p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) - decompressed_int = p_decompressed_int.contents - cupy.cuda.runtime.free(decompressed_int.value) - - def compress(self, data): - import cupy - if isinstance(data, cupy.ndarray): - isCuPy = True - else: - isCuPy = False - num_elements = data.size - # Adapt numele depending on itemsize - itemsize = data.dtype.itemsize - num_elements_eff = int(num_elements*itemsize/4) - - dtype = data.dtype - cmp_bytes, outSize_ptr = self.cuszx_compress(isCuPy, data, num_elements_eff, self.r2r_error, self.r2r_threshold) - # return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr) - - return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr.contents.value) - - def compress_size(self, ptr): - return ptr[5] - - def decompress(self, obj): - import cupy - import ctypes - cmp_bytes, num_elements_eff, isCuPy, shape, dtype, cmpsize = obj - decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, cmpsize, num_elements_eff, self, dtype) - arr_cp = decompressed_ptr[0] - #self.decompressed_own.append(decompressed_ptr[1]) - - # -- Workaround to convert GPU pointer to int - # p_decompressed_ptr = ctypes.addressof(decompressed_ptr) - # # cast to int64 pointer - # # (effectively converting pointer to pointer to addr to pointer to int64) - # p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) - # decompressed_int = p_decompressed_int.contents - # # -- - # self.decompressed_own.append(decompressed_int.value) - # mem = cupy.cuda.UnownedMemory(decompressed_int.value, num_elements_eff, self, device_id=0) - # mem_ptr = cupy.cuda.memory.MemoryPointer(mem, 0) - arr = cupy.reshape(arr_cp, shape) - 
self.decompressed_own.append(arr) - # arr = cupy.ndarray(shape, dtype=dtype, memptr=mem_ptr) - return arr - - ### Compression API with cuSZx ### - # Parameters: - # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array - # - data = Numpy or Cupy ndarray, assumed to be 1-D, np.float32 type - # - num_elements = Number of floating point elements in data - # - r2r_error = relative-to-value-range error bound for lossy compression - # - r2r_threshold = relative-to-value-range threshold to floor values to zero - # Returns: - # - cmp_bytes = Unsigned char pointer to compressed bytes - # - outSize_ptr = Pointer to size_t representing length in bytes of cmp_bytes - def cuszx_compress(self, isCuPy, data, num_elements, r2r_error, r2r_threshold): - - if not isCuPy: - cmp_bytes, outSize_ptr = cuszx_host_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) - else: - #cmp_bytes, outSize_ptr = cuszp_device_compress(data, r2r_error, num_elements, r2r_threshold) - cmp_bytes, outSize_ptr = cusz_device_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE,r2r_threshold) - # cmp_bytes, outSize_ptr = quant_device_compress(data, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) - del data - torch.cuda.empty_cache() - return cmp_bytes, outSize_ptr - - ### Decompression API with cuSZx ### - # Parameters: - # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array - # - cmp_bytes = Unsigned char pointer to compressed bytes - # - num_elements = Number of floating point elements in original data - # Returns: - # - decompressed_data = Float32 pointer to decompressed data - # - # Notes: Use ctypes to cast decompressed data to Numpy or CuPy type - - def cuszx_decompress(self, isCuPy, cmp_bytes, cmpsize, num_elements, owner, dtype): - if not isCuPy: - decompressed_data = cuszx_host_decompress(num_elements, cmp_bytes) - else: - # cuszx_device_decompress(nbEle, cmpBytes, owner, dtype) - decompressed_data = cusz_device_decompress(num_elements, cmp_bytes, owner,dtype) -# oriData, absErrBound, nbEle, blockSize,threshold - # decompressed_data = quant_device_decompress(num_elements, cmp_bytes, owner,dtype) - return decompressed_data +import io +import sys +import numpy as np +from pathlib import Path +print(Path(__file__).parent/'szx/src/') +sys.path.append(str(Path(__file__).parent/'szx/src/')) +sys.path.append('./szx/src') +# sys.path.append(str(Path(__file__).parent/'szp/src/')) +# sys.path.append('./szp/src') + +sys.path.append(str(Path(__file__).parent/'cusz/src')) +sys.path.append('./cusz/src') +sys.path.append(str(Path(__file__).parent/'torch_quant')) +sys.path.append('./torch_quant') +sys.path.append(str(Path(__file__).parent/'newsz')) +sys.path.append('./newsz') + +import torch +try: + from cuszx_wrapper import cuszx_host_compress, cuszx_host_decompress, cuszx_device_compress, cuszx_device_decompress + # from cuSZp_wrapper import cuszp_device_compress, cuszp_device_decompress + from cusz_wrapper import cusz_device_compress, cusz_device_decompress + from torch_quant_perchannel import quant_device_compress, quant_device_decompress + from newsz_wrapper import newsz_device_compress, newsz_device_decompress +except: + print("import failed") + # Silently fail on missing build of cuszx + pass + +CUSZX_BLOCKSIZE = 256 + +class Compressor(): + def compress(self, data): + raise NotImplementedError + + def decompress(self, ptr): + raise NotImplementedError + + def compress_size(self, ptr): + return ptr.nbytes + +# -- Debugging and profiling + +import time +from dataclasses 
import dataclass, asdict +@dataclass +class CompressMeasure: + time: float = 0 + size_in: int = 0 + size_out: int = 0 + label: str = '' + + def __str__(self): + compress_ratio = self.size_in / self.size_out + return (f'Measure: {self.time:.3f}s, ' + f'{self.size_in/1024**2:.2f}MB -> {self.size_out/1024**2:.2f}MB ({compress_ratio:.3f} in/out ratio)' + ) + +class ProfileCompressor(Compressor): + def __init__(self, compressor:Compressor, trace=True): + self.trace = trace + self.compressor = compressor + self.profile_data = {'compress': [], 'decompress': []} + + def compress(self, data): + start = time.time() + ptr = self.compressor.compress(data) + end = time.time() + out_size = self.compressor.compress_size(ptr) + cmeasure = CompressMeasure(end-start, data.nbytes, out_size) + self.profile_data['compress'].append(cmeasure) + if self.trace: + print(f'Compress: {cmeasure}') + return ptr + + def decompress(self, ptr): + start = time.time() + data = self.compressor.decompress(ptr) + end = time.time() + in_size = self.compressor.compress_size(ptr) + dmeasure = CompressMeasure(end-start, in_size, data.nbytes) + self.profile_data['decompress'].append(dmeasure) + if self.trace: + print(f'Decompress: {dmeasure}') + return data + + def get_profile_data(self): + return self.profile_data['compress'], self.profile_data['decompress'] + + def get_profile_data_json(self): + compress, decompress = self.get_profile_data() + return { + 'compress': [asdict(c) for c in compress], + 'decompress': [asdict(c) for c in decompress], + } + + def get_profile_stats(self): + compress, decompress = self.get_profile_data() + compress_time = sum([x.time for x in compress]) + decompress_time = sum([x.time for x in decompress]) + compress_ratios = np.mean([x.size_in/x.size_out for x in compress]) + compress_size = sum([x.size_out for x in compress]) + return compress_time, decompress_time, compress_size, compress_ratios + + def compress_size(self, ptr): + return self.compressor.compress_size(ptr) + + def free_decompressed(self): + self.compressor.free_decompressed() + + def free_compressed(self, ptr): + self.compressor.free_compressed(ptr) +# -- + +class NumpyCompressor(Compressor): + def compress(self, data): + comp = io.BytesIO() + np.savez_compressed(comp, data) + return comp + + def compress_size(self, ptr): + return ptr.getbuffer().nbytes + + def decompress(self, ptr): + ptr.seek(0) + return np.load(ptr)['arr_0'] + +class TorchCompressor(Compressor): + def __init__(self, r2r_error=1e-3, r2r_threshold=1e-3): + self.r2r_error = r2r_error + self.r2r_threshold = r2r_threshold + self.decompressed_own = [] + + def free_decompressed(self): + import cupy + print("Cleanup", len(self.decompressed_own)) + for x in self.decompressed_own: + del x + cupy.get_default_memory_pool().free_all_blocks() + cupy.get_default_pinned_memory_pool().free_all_blocks() + torch.cuda.empty_cache() + self.decompressed_own = [] + + def free_compressed(self, ptr): + import ctypes, cupy + cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = ptr + p_decompressed_ptr = ctypes.addressof(cmp_bytes[0]) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + decompressed_int = p_decompressed_int.contents + cupy.cuda.runtime.free(decompressed_int.value) + + def compress(self, data): + import cupy + if isinstance(data, cupy.ndarray): + isCuPy = True + else: + isCuPy = False + num_elements = data.size + # Adapt numele depending on 
itemsize + itemsize = data.dtype.itemsize + num_elements_eff = int(num_elements*itemsize/4) + + dtype = data.dtype + cmp_bytes, outSize_ptr = self.cuszx_compress(isCuPy, data, num_elements_eff, self.r2r_error, self.r2r_threshold) + return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr) + + # return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr.contents.value) + + def compress_size(self, ptr): + return ptr[5] + + def decompress(self, obj): + import cupy + import ctypes + cmp_bytes, num_elements_eff, isCuPy, shape, dtype, cmpsize = obj + decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, cmpsize, num_elements_eff, self, dtype) + arr_cp = decompressed_ptr[0] + + arr = cupy.reshape(arr_cp, shape) + self.decompressed_own.append(arr) + return arr + + ### Compression API with cuSZx ### + # Parameters: + # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array + # - data = Numpy or Cupy ndarray, assumed to be 1-D, np.float32 type + # - num_elements = Number of floating point elements in data + # - r2r_error = relative-to-value-range error bound for lossy compression + # - r2r_threshold = relative-to-value-range threshold to floor values to zero + # Returns: + # - cmp_bytes = Unsigned char pointer to compressed bytes + # - outSize_ptr = Pointer to size_t representing length in bytes of cmp_bytes + def cuszx_compress(self, isCuPy, data, num_elements, r2r_error, r2r_threshold): + + if not isCuPy: + cmp_bytes, outSize_ptr = cuszx_host_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) + else: + #cmp_bytes, outSize_ptr = cuszp_device_compress(data, r2r_error, num_elements, r2r_threshold) + + cmp_bytes, outSize_ptr = quant_device_compress(data, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) + del data + torch.cuda.empty_cache() + return cmp_bytes, outSize_ptr + + ### Decompression API with cuSZx ### + # Parameters: + # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array + # - cmp_bytes = Unsigned char pointer to compressed bytes + # - num_elements = Number of floating point elements in original data + # Returns: + # - decompressed_data = Float32 pointer to decompressed data + # + # Notes: Use ctypes to cast decompressed data to Numpy or CuPy type + + def cuszx_decompress(self, isCuPy, cmp_bytes, cmpsize, num_elements, owner, dtype): + if not isCuPy: + decompressed_data = cuszx_host_decompress(num_elements, cmp_bytes) + else: + #decompressed_data = cuszp_device_decompress(num_elements, cmp_bytes, cmpsize, owner,dtype) +# oriData, absErrBound, nbEle, blockSize,threshold + decompressed_data = quant_device_decompress(num_elements, cmp_bytes, owner,dtype) + return decompressed_data + +class NEWSZCompressor(Compressor): + def __init__(self, r2r_error=1e-3, r2r_threshold=1e-3): + self.r2r_error = r2r_error + self.r2r_threshold = r2r_threshold + self.decompressed_own = [] + + def free_decompressed(self): + import cupy + print("Cleanup", len(self.decompressed_own)) + for x in self.decompressed_own: + #print(x) + #if x == None: + # continue + #else: + #print("CUDA Free", x) + cupy.cuda.runtime.free(x) + # del x + # cupy.get_default_memory_pool().free_all_blocks() + # cupy.get_default_pinned_memory_pool().free_all_blocks() + # torch.cuda.empty_cache() + self.decompressed_own = [] + + def free_compressed(self, ptr): + import ctypes, cupy + cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = ptr + p_decompressed_ptr = ctypes.addressof(cmp_bytes[0]) + # cast to int64 pointer + # (effectively converting 
pointer to pointer to addr to pointer to int64) + p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + decompressed_int = p_decompressed_int.contents + cupy.cuda.runtime.free(decompressed_int.value) + + def compress(self, data): + import cupy + if isinstance(data, cupy.ndarray): + isCuPy = True + else: + isCuPy = False + num_elements = data.size + # Adapt numele depending on itemsize + itemsize = data.dtype.itemsize + num_elements_eff = int(num_elements*itemsize/4) + + dtype = data.dtype + cmp_bytes, outSize_ptr = self.cuszx_compress(isCuPy, data, num_elements_eff, self.r2r_error, self.r2r_threshold) + # return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr) + + return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr.contents.value) + + def compress_size(self, ptr): + return ptr[5] + + def decompress(self, obj): + import cupy + import ctypes + cmp_bytes, num_elements_eff, isCuPy, shape, dtype, cmpsize = obj + decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, cmpsize, num_elements_eff, self, dtype) + arr_cp = decompressed_ptr[0] + self.decompressed_own.append(decompressed_ptr[1]) + + # -- Workaround to convert GPU pointer to int + # p_decompressed_ptr = ctypes.addressof(decompressed_ptr) + # # cast to int64 pointer + # # (effectively converting pointer to pointer to addr to pointer to int64) + # p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + # decompressed_int = p_decompressed_int.contents + # # -- + # self.decompressed_own.append(decompressed_int.value) + # mem = cupy.cuda.UnownedMemory(decompressed_int.value, num_elements_eff, self, device_id=0) + # mem_ptr = cupy.cuda.memory.MemoryPointer(mem, 0) + arr = cupy.reshape(arr_cp, shape) + # self.decompressed_own.append(arr) + # arr = cupy.ndarray(shape, dtype=dtype, memptr=mem_ptr) + return arr + + ### Compression API with cuSZx ### + # Parameters: + # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array + # - data = Numpy or Cupy ndarray, assumed to be 1-D, np.float32 type + # - num_elements = Number of floating point elements in data + # - r2r_error = relative-to-value-range error bound for lossy compression + # - r2r_threshold = relative-to-value-range threshold to floor values to zero + # Returns: + # - cmp_bytes = Unsigned char pointer to compressed bytes + # - outSize_ptr = Pointer to size_t representing length in bytes of cmp_bytes + def cuszx_compress(self, isCuPy, data, num_elements, r2r_error, r2r_threshold): + + if not isCuPy: + cmp_bytes, outSize_ptr = cuszx_host_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) + else: + #cmp_bytes, outSize_ptr = cuszp_device_compress(data, r2r_error, num_elements, r2r_threshold) + cmp_bytes, outSize_ptr = newsz_device_compress(data,num_elements, CUSZX_BLOCKSIZE,r2r_threshold) + # cmp_bytes, outSize_ptr = quant_device_compress(data, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) + del data + torch.cuda.empty_cache() + return cmp_bytes, outSize_ptr + + ### Decompression API with cuSZx ### + # Parameters: + # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array + # - cmp_bytes = Unsigned char pointer to compressed bytes + # - num_elements = Number of floating point elements in original data + # Returns: + # - decompressed_data = Float32 pointer to decompressed data + # + # Notes: Use ctypes to cast decompressed data to Numpy or CuPy type + + def cuszx_decompress(self, isCuPy, cmp_bytes, cmpsize, num_elements, 
owner, dtype): + if not isCuPy: + decompressed_data = cuszx_host_decompress(num_elements, cmp_bytes) + else: + # cuszx_device_decompress(nbEle, cmpBytes, owner, dtype) + decompressed_data = newsz_device_decompress(num_elements, cmp_bytes, owner,dtype) +# oriData, absErrBound, nbEle, blockSize,threshold + # decompressed_data = quant_device_decompress(num_elements, cmp_bytes, owner,dtype) + return decompressed_data + +class CUSZXCompressor(Compressor): + def __init__(self, r2r_error=1e-3, r2r_threshold=1e-3): + self.r2r_error = r2r_error + self.r2r_threshold = r2r_threshold + self.decompressed_own = [] + + def free_decompressed(self): + import cupy + print("Cleanup", len(self.decompressed_own)) + for x in self.decompressed_own: + #print(x) + #if x == None: + # continue + #else: + #print("CUDA Free", x) + cupy.cuda.runtime.free(x) + # del x + cupy.get_default_memory_pool().free_all_blocks() + #cupy.get_default_pinned_memory_pool().free_all_blocks() + #torch.cuda.empty_cache() + self.decompressed_own = [] + + def free_compressed(self, ptr): + import ctypes, cupy + cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = ptr + p_decompressed_ptr = ctypes.addressof(cmp_bytes[0]) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + decompressed_int = p_decompressed_int.contents + cupy.cuda.runtime.free(decompressed_int.value) + cupy.get_default_memory_pool().free_all_blocks() + #cupy.get_default_pinned_memory_pool().free_all_blocks() + #torch.cuda.empty_cache() + + def compress(self, data): + import cupy + if isinstance(data, cupy.ndarray): + isCuPy = True + else: + isCuPy = False + num_elements = data.size + # Adapt numele depending on itemsize + itemsize = data.dtype.itemsize + num_elements_eff = int(num_elements*itemsize/4) + + dtype = data.dtype + cmp_bytes, outSize_ptr = self.cuszx_compress(isCuPy, data, num_elements_eff, self.r2r_error, self.r2r_threshold) + # return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr) + + return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr.contents.value) + + def compress_size(self, ptr): + return ptr[5] + + def decompress(self, obj): + import cupy + import ctypes + cmp_bytes, num_elements_eff, isCuPy, shape, dtype, cmpsize = obj + decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, cmpsize, num_elements_eff, self, dtype) + arr_cp = decompressed_ptr[0] + self.decompressed_own.append(decompressed_ptr[1]) + + # -- Workaround to convert GPU pointer to int + # p_decompressed_ptr = ctypes.addressof(decompressed_ptr) + # # cast to int64 pointer + # # (effectively converting pointer to pointer to addr to pointer to int64) + # p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + # decompressed_int = p_decompressed_int.contents + # # -- + # self.decompressed_own.append(decompressed_int.value) + # mem = cupy.cuda.UnownedMemory(decompressed_int.value, num_elements_eff, self, device_id=0) + # mem_ptr = cupy.cuda.memory.MemoryPointer(mem, 0) + arr = cupy.reshape(arr_cp, shape) + # self.decompressed_own.append(arr) + # arr = cupy.ndarray(shape, dtype=dtype, memptr=mem_ptr) + return arr + + ### Compression API with cuSZx ### + # Parameters: + # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array + # - data = Numpy or Cupy ndarray, assumed to be 1-D, np.float32 type + # - num_elements = Number of floating point elements in data + # - 
r2r_error = relative-to-value-range error bound for lossy compression + # - r2r_threshold = relative-to-value-range threshold to floor values to zero + # Returns: + # - cmp_bytes = Unsigned char pointer to compressed bytes + # - outSize_ptr = Pointer to size_t representing length in bytes of cmp_bytes + def cuszx_compress(self, isCuPy, data, num_elements, r2r_error, r2r_threshold): + + if not isCuPy: + cmp_bytes, outSize_ptr = cuszx_host_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) + else: + #cmp_bytes, outSize_ptr = cuszp_device_compress(data, r2r_error, num_elements, r2r_threshold) + cmp_bytes, outSize_ptr = cuszx_device_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE,r2r_threshold) + # cmp_bytes, outSize_ptr = quant_device_compress(data, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) + del data + torch.cuda.empty_cache() + return cmp_bytes, outSize_ptr + + ### Decompression API with cuSZx ### + # Parameters: + # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array + # - cmp_bytes = Unsigned char pointer to compressed bytes + # - num_elements = Number of floating point elements in original data + # Returns: + # - decompressed_data = Float32 pointer to decompressed data + # + # Notes: Use ctypes to cast decompressed data to Numpy or CuPy type + + def cuszx_decompress(self, isCuPy, cmp_bytes, cmpsize, num_elements, owner, dtype): + if not isCuPy: + decompressed_data = cuszx_host_decompress(num_elements, cmp_bytes) + else: + # cuszx_device_decompress(nbEle, cmpBytes, owner, dtype) + decompressed_data = cuszx_device_decompress(num_elements, cmp_bytes, owner,dtype) +# oriData, absErrBound, nbEle, blockSize,threshold + # decompressed_data = quant_device_decompress(num_elements, cmp_bytes, owner,dtype) + return decompressed_data + +class CUSZCompressor(Compressor): + def __init__(self, r2r_error=1e-3, r2r_threshold=1e-3): + self.r2r_error = r2r_error + self.r2r_threshold = r2r_threshold + self.decompressed_own = [] + + def free_decompressed(self): + import cupy + print("Cleanup", len(self.decompressed_own)) + for x in self.decompressed_own: + #print(x) + #if x == None: + # continue + #else: + #print("CUDA Free", x) + cupy.cuda.runtime.free(x) + # del x + # cupy.get_default_memory_pool().free_all_blocks() + # cupy.get_default_pinned_memory_pool().free_all_blocks() + # torch.cuda.empty_cache() + self.decompressed_own = [] + + def free_compressed(self, ptr): + import ctypes, cupy + cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = ptr + p_decompressed_ptr = ctypes.addressof(cmp_bytes[0]) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + decompressed_int = p_decompressed_int.contents + cupy.cuda.runtime.free(decompressed_int.value) + + def compress(self, data): + import cupy + if isinstance(data, cupy.ndarray): + isCuPy = True + else: + isCuPy = False + num_elements = data.size + # Adapt numele depending on itemsize + itemsize = data.dtype.itemsize + num_elements_eff = int(num_elements*itemsize/4) + + dtype = data.dtype + cmp_bytes, outSize_ptr = self.cuszx_compress(isCuPy, data, num_elements_eff, self.r2r_error, self.r2r_threshold) + # return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr) + + return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr.contents.value) + + def compress_size(self, ptr): + return ptr[5] + + def decompress(self, obj): + import cupy + 
import ctypes + cmp_bytes, num_elements_eff, isCuPy, shape, dtype, cmpsize = obj + decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, cmpsize, num_elements_eff, self, dtype) + arr_cp = decompressed_ptr[0] + #self.decompressed_own.append(decompressed_ptr[1]) + + # -- Workaround to convert GPU pointer to int + # p_decompressed_ptr = ctypes.addressof(decompressed_ptr) + # # cast to int64 pointer + # # (effectively converting pointer to pointer to addr to pointer to int64) + # p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + # decompressed_int = p_decompressed_int.contents + # # -- + # self.decompressed_own.append(decompressed_int.value) + # mem = cupy.cuda.UnownedMemory(decompressed_int.value, num_elements_eff, self, device_id=0) + # mem_ptr = cupy.cuda.memory.MemoryPointer(mem, 0) + arr = cupy.reshape(arr_cp, shape) + self.decompressed_own.append(arr) + # arr = cupy.ndarray(shape, dtype=dtype, memptr=mem_ptr) + return arr + + ### Compression API with cuSZx ### + # Parameters: + # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array + # - data = Numpy or Cupy ndarray, assumed to be 1-D, np.float32 type + # - num_elements = Number of floating point elements in data + # - r2r_error = relative-to-value-range error bound for lossy compression + # - r2r_threshold = relative-to-value-range threshold to floor values to zero + # Returns: + # - cmp_bytes = Unsigned char pointer to compressed bytes + # - outSize_ptr = Pointer to size_t representing length in bytes of cmp_bytes + def cuszx_compress(self, isCuPy, data, num_elements, r2r_error, r2r_threshold): + + if not isCuPy: + cmp_bytes, outSize_ptr = cuszx_host_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) + else: + #cmp_bytes, outSize_ptr = cuszp_device_compress(data, r2r_error, num_elements, r2r_threshold) + cmp_bytes, outSize_ptr = cusz_device_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE,r2r_threshold) + # cmp_bytes, outSize_ptr = quant_device_compress(data, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) + del data + torch.cuda.empty_cache() + return cmp_bytes, outSize_ptr + + ### Decompression API with cuSZx ### + # Parameters: + # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array + # - cmp_bytes = Unsigned char pointer to compressed bytes + # - num_elements = Number of floating point elements in original data + # Returns: + # - decompressed_data = Float32 pointer to decompressed data + # + # Notes: Use ctypes to cast decompressed data to Numpy or CuPy type + + def cuszx_decompress(self, isCuPy, cmp_bytes, cmpsize, num_elements, owner, dtype): + if not isCuPy: + decompressed_data = cuszx_host_decompress(num_elements, cmp_bytes) + else: + # cuszx_device_decompress(nbEle, cmpBytes, owner, dtype) + decompressed_data = cusz_device_decompress(num_elements, cmp_bytes, owner,dtype) +# oriData, absErrBound, nbEle, blockSize,threshold + # decompressed_data = quant_device_decompress(num_elements, cmp_bytes, owner,dtype) + return decompressed_data diff --git a/qtensor/compression/newsz/nvcomp b/qtensor/compression/newsz/nvcomp new file mode 160000 index 00000000..a6e4e64a --- /dev/null +++ b/qtensor/compression/newsz/nvcomp @@ -0,0 +1 @@ +Subproject commit a6e4e64a177e07cd2e5c8c5e07bb66ffefceae84 diff --git a/qtensor/compression/szx/cuda-samples b/qtensor/compression/szx/cuda-samples new file mode 160000 index 00000000..e4789153 --- /dev/null +++ b/qtensor/compression/szx/cuda-samples @@ -0,0 +1 @@ +Subproject commit 
e4789153d539b2d2f3976050057a52a1518abcf0 diff --git a/qtensor/compression/szx/src/cuszx_entry.cu b/qtensor/compression/szx/src/cuszx_entry.cu index 7f5f78e1..213cb689 100644 --- a/qtensor/compression/szx/src/cuszx_entry.cu +++ b/qtensor/compression/szx/src/cuszx_entry.cu @@ -1,1961 +1,1961 @@ -#include "cuszx_entry.h" -#include "szx_defines.h" -#include "szx_BytesToolkit.h" -#include "szx_TypeManager.h" -#include "timingGPU.h" -#include "szx.h" -#include -#include -#include -#include -#include -#include -#include - -#define SPARSITY_LEVEL 0.25 -#define BLOCKS 40 -#define THREADS_PER_BLOCK 256 - -TimingGPU timer_GPU; -void bin(unsigned n) -{ - unsigned i; - for (i = 1 << 31; i > 0; i = i / 2) - (n & i) ? printf("1") : printf("0"); -} - -__host__ __device__ size_t convert_state_to_out(unsigned char* meta, size_t length, unsigned char *result){ - size_t out_length; - - if(length%4==0) - out_length = length/4; - else - out_length = length/4+1; - - for (size_t i = 0; i < out_length; i++) - { - uint8_t tmp = 0; - - for (size_t j = 0; j < 4; j++) - { - if (i*4 + j < length) - { - tmp |= (0x03 & meta[i*4+j]) << 2*j; - } - - } - result[i] = tmp; - } - return out_length; -} - -__global__ void convert_state_to_out_kernel(unsigned char* meta, size_t length, unsigned char *result, size_t out_length){ - - - for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < out_length; i += blockDim.x*gridDim.x){ - uint8_t tmp = 0; - - for (size_t j = 0; j < 4; j++) - { - if (i*4 + j < length) - { - tmp |= (0x03 & meta[i*4+j]) << 2*j; - } - - } - result[i] = tmp; - } -} - -__global__ void convert_out_to_state_kernel(size_t nbBlocks, unsigned char* cmp, unsigned char* out_state, size_t state_length, int *num_state2blks, int *ncBlocks){ - for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < state_length; i += blockDim.x*gridDim.x){ - for (size_t j = 0; j < 4; j++) - { - if (4*i + j < nbBlocks) - { - out_state[4*i + j]= (cmp[i] >> 2*j) & 0x03; - if (out_state[4*i+j] == 2) - { - atomicAdd(num_state2blks, 1); - }else if(out_state[4*i+j]==3){ - atomicAdd(ncBlocks, 1); - } - - } - - } - } -} - -// nbBlocks, r, stateNBBytes, stateArray -__host__ __device__ size_t convert_out_to_state(size_t nbBlocks, unsigned char* cmp, unsigned char* out_state){ - size_t state_length; - if(nbBlocks%4==0) - state_length = nbBlocks/4; - else - state_length = nbBlocks/4+1; - - for (size_t i = 0; i < state_length; i++) - { - for (size_t j = 0; j < 4; j++) - { - if (4*i + j < nbBlocks) - { - out_state[4*i + j]= (cmp[i] >> 2*j) & 0x03; - } - - } - } - return nbBlocks; -} - -__host__ __device__ size_t convert_block2_to_out(unsigned char *result, uint32_t numBlocks, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig){ - size_t out_length = 0; - - memcpy(result, blk_idx, numBlocks*sizeof(uint32_t)); - out_length += numBlocks*4; - memcpy(result+out_length, blk_vals, num_sig*sizeof(float)); - out_length += num_sig*sizeof(float); - memcpy(result+out_length, blk_subidx, num_sig*sizeof(uint8_t)); - out_length += num_sig*sizeof(uint8_t); - memcpy(result+out_length, blk_sig, numBlocks*sizeof(uint8_t)); - out_length+= numBlocks*sizeof(uint8_t); - - return out_length; -} - -__global__ void convert_block2_to_out_kernel(unsigned char *result, uint32_t numBlocks, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig){ - - size_t out_length = 0; - unsigned char *tmp_result = result; - for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < numBlocks; i += blockDim.x*gridDim.x){ - 
uint32_t local_blkidx = blk_idx[i]; - tmp_result[4*i] = (local_blkidx) & 0xff; - tmp_result[4*i+1] = (local_blkidx >> (8*1)) & 0xff; - tmp_result[4*i+2] = (local_blkidx >> (8*2)) & 0xff; - tmp_result[4*i+3] = (local_blkidx >> (8*3)) & 0xff; - } - // memcpy(result, blk_idx, numBlocks*sizeof(uint32_t)); - out_length += numBlocks*4; - tmp_result = result+out_length; - - for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < num_sig; i += blockDim.x*gridDim.x){ - float value = blk_vals[i]; - memcpy(&tmp_result[4*i], &value, sizeof(float)); - //unsigned char *v = () - //tmp_result[(int)4*i] = (unsigned char)((value) & 0xff); - //tmp_result[(int)4*i+1] = (unsigned char)((value >> (8*1)) & 0xff); - //tmp_result[(int)4*i+2] = (unsigned char)((value >> (8*2)) & 0xff); - //tmp_result[(int)4*i+3] = (unsigned char)((value >> (8*3)) & 0xff); - } - // memcpy(result+out_length, blk_vals, num_sig*sizeof(float)); - out_length += num_sig*sizeof(float); - tmp_result = result+out_length; - - for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < num_sig; i += blockDim.x*gridDim.x){ - tmp_result[i] = blk_subidx[i]; - - } - - out_length += num_sig*sizeof(uint8_t); - tmp_result = result+out_length; - - for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < numBlocks; i += blockDim.x*gridDim.x){ - tmp_result[i] = blk_sig[i]; - - } - out_length+= numBlocks*sizeof(uint8_t); - - // return out_length; -} - -__global__ void convert_out_to_block2_kernel(unsigned char *in_cmp, uint32_t numBlocks, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig){ - size_t out_length = 0; - - unsigned char *tmp_result = in_cmp; - for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < numBlocks; i += blockDim.x*gridDim.x){ - - uint32_t local_blkidx = (tmp_result[4*i] & 0xff) | ((tmp_result[4*i+1] & 0xff) << (8*1)) - | ((tmp_result[4*i+2] & 0xff) << (8*2)) | ((tmp_result[4*i+3] & 0xff) << (8*3)); - blk_idx[i] = local_blkidx; - } - // memcpy(result, blk_idx, numBlocks*sizeof(uint32_t)); - out_length += numBlocks*4; - tmp_result = in_cmp+out_length; - - for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < num_sig; i += blockDim.x*gridDim.x){ - float value = 0.0; - memcpy(&value, &tmp_result[4*i], sizeof(float)); - blk_vals[i] = value; - - //unsigned char *v = () - //tmp_result[(int)4*i] = (unsigned char)((value) & 0xff); - //tmp_result[(int)4*i+1] = (unsigned char)((value >> (8*1)) & 0xff); - //tmp_result[(int)4*i+2] = (unsigned char)((value >> (8*2)) & 0xff); - //tmp_result[(int)4*i+3] = (unsigned char)((value >> (8*3)) & 0xff); - } - // memcpy(result+out_length, blk_vals, num_sig*sizeof(float)); - out_length += num_sig*sizeof(float); - tmp_result = in_cmp+out_length; - - for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < num_sig; i += blockDim.x*gridDim.x){ - blk_subidx[i] = tmp_result[i]; - - } - - out_length += num_sig*sizeof(uint8_t); - tmp_result = in_cmp+out_length; - - for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < numBlocks; i += blockDim.x*gridDim.x){ - blk_sig[i] = tmp_result[i]; - - } - out_length+= numBlocks*sizeof(uint8_t); -} - -__host__ __device__ size_t convert_out_to_block2(unsigned char *in_cmp, uint32_t numBlocks, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig){ - size_t out_length = 0; - memcpy(blk_idx, in_cmp, numBlocks*sizeof(uint32_t)); - out_length += numBlocks*4; - memcpy(blk_vals, in_cmp+out_length,num_sig*sizeof(float)); - out_length += num_sig*sizeof(float); - memcpy(blk_subidx, in_cmp+out_length, 
num_sig*sizeof(uint8_t)); - out_length += num_sig*sizeof(uint8_t); - memcpy(blk_sig, in_cmp+out_length, numBlocks*sizeof(uint8_t)); - out_length += numBlocks*sizeof(uint8_t); -// printf("outlength: %d\n",out_length); - return out_length; -} - -int _post_proc(float *oriData, unsigned char *meta, short *offsets, unsigned char *midBytes, unsigned char *outBytes, size_t nbEle, int blockSize, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig) -{ - int out_size = 0; - - size_t nbConstantBlocks = 0; - size_t nbBlocks = nbEle/blockSize; - size_t ncBytes = blockSize/4; - size_t mSize = sizeof(float)+1+ncBytes; //Number of bytes for each data block's metadata. - out_size += 5+sizeof(size_t)+sizeof(float)*nbBlocks; - if (nbBlocks%8==0) - out_size += nbBlocks/8; - else - out_size += nbBlocks/8+1; - int s0 = 0; - int s1 = 0; - int s2 = 0; - int s3 = 0; - for (int i=0; i>>(d_oriData, threshold, nbEle); - // cudaDeviceSynchronize(); - dim3 dimBlock(32, blockSize/32); - dim3 dimGrid(65536, 1); - const int sMemsize = blockSize * sizeof(float) + dimBlock.y * sizeof(int); - compress_float<<>>(d_oriData, d_meta, d_offsets, d_midBytes, absErrBound, blockSize, nbBlocks, mSize, sparsity_level, d_blk_idx, d_blk_subidx,d_blk_vals, threshold, d_blk_sig); - cudaError_t err = cudaGetLastError(); // Get error code - //printf("CUDA Error: %s\n", cudaGetErrorString(err)); - //printf("GPU compression timing: %f ms\n", timer_GPU.GetCounter()); - cudaDeviceSynchronize(); - get_numsig<<<1,1>>>(d_num_sig); - cudaDeviceSynchronize(); - - checkCudaErrors(cudaMemcpy(num_sig, d_num_sig, sizeof(uint64_t), cudaMemcpyDeviceToHost)); - - blk_idx = (uint32_t *)malloc(nbBlocks*sizeof(uint32_t)); - blk_vals= (float *)malloc((*num_sig)*sizeof(float)); - blk_subidx = (uint8_t *)malloc((*num_sig)*sizeof(uint8_t)); - blk_sig = (uint8_t *)malloc(nbBlocks*sizeof(uint8_t)); - - checkCudaErrors(cudaMemcpy(meta, d_meta, msz, cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(offsets, d_offsets, nbBlocks*sizeof(short), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(midBytes, d_midBytes, mbsz, cudaMemcpyDeviceToHost)); - - - checkCudaErrors(cudaMemcpy(blk_idx, d_blk_idx, nbBlocks*sizeof(uint32_t), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(blk_vals,d_blk_vals, (*num_sig)*sizeof(float), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(blk_subidx,d_blk_subidx, (*num_sig)*sizeof(uint8_t), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(blk_sig,d_blk_sig, (nbBlocks)*sizeof(uint8_t), cudaMemcpyDeviceToHost)); - - size_t maxPreservedBufferSize = sizeof(float)*nbEle; - unsigned char* outBytes = (unsigned char*)malloc(maxPreservedBufferSize); - memset(outBytes, 0, maxPreservedBufferSize); - - outSize = (size_t *)malloc(sizeof(size_t)); - //outSize[0] = _post_proc(oriData, meta, offsets, midBytes, outBytes, nbEle, blockSize, *num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); - - *outSize = _post_proc(oriData, meta, offsets, midBytes, outBytes, nbEle, blockSize, *num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); -// printf("Beginning free\n"); - // printf("outsize %p \n", outBytes); - free(blk_idx); - free(blk_subidx); - free(blk_vals); - free(meta); - free(offsets); - free(midBytes); - checkCudaErrors(cudaFree(d_meta)); - checkCudaErrors(cudaFree(d_offsets)); - checkCudaErrors(cudaFree(d_midBytes)); - return outBytes; -} - -void cuSZx_fast_decompress_args_unpredictable_blocked_float(float** newData, size_t nbEle, unsigned char* cmpBytes) -{ - uint32_t *blk_idx, *d_blk_idx; - 
uint8_t *blk_subidx, *d_blk_subidx; - uint8_t *blk_sig, *d_blk_sig; - float *blk_vals, *d_blk_vals; - size_t num_sig, *d_num_sig; - - *newData = (float*)malloc(sizeof(float)*nbEle); - memset(*newData, 0, sizeof(float)*nbEle); - - unsigned char* r = cmpBytes; - r += 4; - int blockSize = r[0]; //get block size - if(blockSize == 0)blockSize = 256; - r++; - size_t nbConstantBlocks = bytesToLong_bigEndian(r); //get number of constant blocks - r += sizeof(size_t); - num_sig = bytesToSize(r); - r += sizeof(size_t); - size_t nbBlocks = nbEle/blockSize; - size_t ncBlocks = 0; - size_t num_state2_blks = 0; - // size_t ncBlocks = nbBlocks - nbConstantBlocks; //get number of constant blocks - size_t stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; - size_t ncLeading = blockSize/4; - size_t mSize = sizeof(float)+1+ncLeading; //Number of bytes for each data block's metadata. - unsigned char* stateArray = (unsigned char*)malloc(nbBlocks); - unsigned char* d_stateArray; - cudaMalloc(&d_stateArray, nbBlocks); - float* constantMedianArray = (float*)malloc(nbConstantBlocks*sizeof(float)); - - - - blk_idx = (uint32_t *)malloc(nbBlocks*sizeof(uint32_t)); - blk_vals= (float *)malloc((num_sig)*sizeof(float)); - blk_subidx = (uint8_t *)malloc((num_sig)*sizeof(uint8_t)); - blk_sig = (uint8_t *)malloc(nbBlocks*sizeof(uint8_t)); - - // printf("Converting state array\n"); - convert_out_to_state(nbBlocks, r, stateArray); - // convertByteArray2IntArray_fast_1b_args(nbBlocks, r, stateNBBytes, stateArray); //get the stateArray - for (size_t i = 0; i < nbBlocks; i++) - { - if (stateArray[i] == 2) - { - num_state2_blks++; - }else if(stateArray[i] == 3){ - ncBlocks++; - } - } - - r += stateNBBytes; - unsigned char* data = (unsigned char*)malloc(ncBlocks*blockSize*sizeof(float)); - memset(data, 0, ncBlocks*blockSize*sizeof(float)); - // printf("converting block vals\n"); - size_t to_add = convert_out_to_block2(r, nbBlocks, (uint64_t)num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); - r+= to_add; - // checkCudaErrors(cudaMalloc((void **)&d_num_sig, sizeof(uint64_t))); - // num_sig = (uint64_t *)malloc(sizeof(uint64_t)); - checkCudaErrors(cudaMalloc((void **)&d_blk_idx, nbBlocks*sizeof(uint32_t))); - // blk_idx = malloc() - checkCudaErrors(cudaMalloc((void **)&d_blk_subidx, num_sig*sizeof(uint8_t))); - - checkCudaErrors(cudaMalloc((void **)&d_blk_vals, num_sig*sizeof(float))); - - checkCudaErrors(cudaMalloc((void **)&d_blk_sig, nbBlocks*sizeof(uint8_t))); - - checkCudaErrors(cudaMemcpy(d_blk_idx, blk_idx, nbBlocks*sizeof(uint32_t), cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_blk_vals, blk_vals, (num_sig)*sizeof(float), cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_blk_subidx, blk_subidx, (num_sig)*sizeof(uint8_t), cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_stateArray, stateArray, nbBlocks, cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_blk_sig, blk_sig, nbBlocks*sizeof(uint8_t), cudaMemcpyHostToDevice)); - - - size_t i = 0, j = 0, k = 0; //k is used to keep track of constant block index - memcpy((*newData)+nbBlocks*blockSize, r, (nbEle%blockSize)*sizeof(float)); - r += (nbEle%blockSize)*sizeof(float); - float* fr = (float*)r; //fr is the starting address of constant median values. 
- for(i = 0;i < nbConstantBlocks;i++, j+=4) //get the median values for constant-value blocks - constantMedianArray[i] = fr[i]; - r += nbConstantBlocks*sizeof(float); - unsigned char* p = r + ncBlocks * sizeof(short); - for(i = 0;i < ncBlocks;i++){ - int leng = (int)bytesToShort(r)+mSize; - r += sizeof(short); - if (leng > blockSize*sizeof(float)) - { - printf("Warning: compressed block is larger than the original block!\n"); - exit(0); - } - memcpy(data+i*blockSize*sizeof(float), p, leng); - p += leng; - } - - unsigned char* d_data; - float *d_newdata; - checkCudaErrors(cudaMalloc((void**)&d_data, ncBlocks*blockSize*sizeof(float))); - checkCudaErrors(cudaMemcpy(d_data, data, ncBlocks*blockSize*sizeof(float), cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMalloc(&d_newdata, nbBlocks*blockSize*sizeof(float))); - - timer_GPU.StartCounter(); - dim3 dimBlock(32, blockSize/32); - dim3 dimGrid(65536, 1); - const int sMemsize = blockSize * sizeof(float) + dimBlock.y * sizeof(int); - decompress_state2<<>>(d_newdata, d_stateArray,d_blk_idx, d_blk_vals, d_blk_subidx,blockSize, d_blk_sig); - decompress_float<<>>(d_data, blockSize, ncBlocks, mSize); - cudaError_t err = cudaGetLastError(); // Get error code - //printf("CUDA Error: %s\n", cudaGetErrorString(err)); - //printf("GPU decompression timing: %f ms\n", timer_GPU.GetCounter()); - cudaDeviceSynchronize(); - checkCudaErrors(cudaMemcpy(data, d_data, ncBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(*newData, d_newdata, nbBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); - float* fdata = (float*)data; - - int nb=0, nc=0; - for (i=0;i1) printf("data%i:%f\n",i, Median); - for (j=0;j>56); - b[1] = (unsigned char)(num>>48); - b[2] = (unsigned char)(num>>40); - b[3] = (unsigned char)(num>>32); - b[4] = (unsigned char)(num>>24); - b[5] = (unsigned char)(num>>16); - b[6] = (unsigned char)(num>>8); - b[7] = (unsigned char)(num); -// if(dataEndianType==LITTLE_ENDIAN_DATA) -// symTransform_8bytes(*b); -} - -inline void longToBytes_bigEndian_memset(unsigned char *b, unsigned long num) -{ - checkCudaErrors(cudaMemset(&b[0], (unsigned char)(num>>56), sizeof(char))); - checkCudaErrors(cudaMemset(&b[1], (unsigned char)(num>>48), sizeof(char))); - checkCudaErrors(cudaMemset(&b[2], (unsigned char)(num>>40), sizeof(char))); - checkCudaErrors(cudaMemset(&b[3], (unsigned char)(num>>32), sizeof(char))); - checkCudaErrors(cudaMemset(&b[4], (unsigned char)(num>>24), sizeof(char))); - checkCudaErrors(cudaMemset(&b[5], (unsigned char)(num>>16), sizeof(char))); - checkCudaErrors(cudaMemset(&b[6], (unsigned char)(num>>8), sizeof(char))); - checkCudaErrors(cudaMemset(&b[7], (unsigned char)(num), sizeof(char))); -// if(dataEndianType==LITTLE_ENDIAN_DATA) -// symTransform_8bytes(*b); -} - -__device__ inline void shortToBytes_d(unsigned char* b, short value) -{ - lint16 buf; - buf.svalue = value; - memcpy(b, buf.byte, 2); -} - - - -__global__ void getNumNonConstantBlocks(size_t nbBlocks, short *offsets, unsigned char *meta, int blockSize, int *nonconstant, int *out_size){ - for (int tid = blockDim.x*blockIdx.x + threadIdx.x; tid < nbBlocks; tid += blockDim.x*gridDim.x){ - if (meta[tid] == 3){ - atomicAdd(nonconstant, 1); - atomicAdd(out_size,1+(blockSize/4)+offsets[tid]); - } - } -} - -__global__ void generateFlags(unsigned char *states, uint64_t *cBlk_flags, uint64_t *ncBlk_flags,uint64_t* offset_indices,short* offsets, size_t nbBlocks){ - for (int tid = blockDim.x*blockIdx.x + threadIdx.x; tid < nbBlocks; tid += 
blockDim.x*gridDim.x){ - if (states[tid] == 0 || states[tid] == 1) - { - cBlk_flags[tid] = 1; - ncBlk_flags[tid] = 0; - offset_indices[tid] = 0; - }else if(states[tid]==3){ - ncBlk_flags[tid] = 1; - cBlk_flags[tid] = 0; - offset_indices[tid] = (uint64_t) offsets[tid]; - }else{ - cBlk_flags[tid] = 0; - ncBlk_flags[tid] = 0; - offset_indices[tid] = 0; - } - - } -} - -__global__ void nccopy_kernel2(unsigned char * c, unsigned char* o, unsigned char *nc, unsigned char* midBytes, unsigned char* meta, - size_t nbBlocks, int blockSize, short *offsets, size_t mSize, uint64_t *cBlk_indices, uint64_t *ncBlk_indices, uint64_t* offset_indices){ - // printf("blockdim %d blockidx %d threadidx %d griddim %d\n", blockDim.x, blockIdx.x, threadIdx.x, gridDim.x); - int i; - int num_threads = (blockDim.x*gridDim.x); - int tid = blockDim.x*blockIdx.x + threadIdx.x; - int blocks_per_thread = nbBlocks/num_threads; - int start_idx = tid*blocks_per_thread; - int end_idx = start_idx+blocks_per_thread; - - if (tid == num_threads-1) - { - end_idx = nbBlocks; - } - - unsigned char* tmp_o = o+(sizeof(short)*ncBlk_indices[start_idx]); - unsigned char* tmp_nc= nc+(mSize*ncBlk_indices[i] + offset_indices[i]*ncBlk_indices[i]); - for (i=start_idx; i>>(meta, cBlk_indices, ncBlk_indices, offset_indices, offsets, nbBlocks); - cudaDeviceSynchronize(); - - thrust::exclusive_scan(thrust::device, cBlk_indices, cBlk_indices + nbBlocks, cBlk_indices, 0); - thrust::exclusive_scan(thrust::device, ncBlk_indices, ncBlk_indices + nbBlocks, ncBlk_indices, 0); - thrust::exclusive_scan(thrust::device, offset_indices, offset_indices + nbBlocks, offset_indices, 0); - - nccopy_kernel<<>>(c, o, nc, midBytes, meta, nbBlocks, blockSize, offsets, mSize, cBlk_indices,ncBlk_indices,offset_indices,final_nc); - // nccopy_kernel2<<<1,1>>>(c, o, nc, midBytes, meta, nbBlocks, blockSize, offsets, mSize, cBlk_indices,ncBlk_indices,offset_indices); - - cudaDeviceSynchronize(); - - //printf("nc: %p\n", nc); - // printf("%s\n", cudaGetErrorString(cudaGetLastError())); - // set_nc<<<1,1>>>(nc, offsets, offset_indices, ncBlk_indices, mSize, nbBlocks); - // cudaDeviceSynchronize(); - // printf("ncblockcpy: %f ms\n", timer2.GetCounter()); - checkCudaErrors(cudaFree(cBlk_indices)); - checkCudaErrors(cudaFree(ncBlk_indices)); - checkCudaErrors(cudaFree(offset_indices)); -} - -void ncblkCopy_h(unsigned char * c, unsigned char* o, unsigned char *nc, unsigned char* midBytes, unsigned char* meta, - size_t nbBlocks, int blockSize, short *offsets, size_t mSize){ - unsigned char *tmp_states; - unsigned char *ncold = nc; - uint64_t col_off = 0; - short *tmp_offsets; - tmp_offsets = (short*)malloc(sizeof(short)*nbBlocks); - tmp_states = (unsigned char *)malloc(sizeof(char)*nbBlocks); - checkCudaErrors(cudaMemcpy(tmp_states, meta, sizeof(char)*nbBlocks, cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(tmp_offsets,offsets,sizeof(short)*nbBlocks,cudaMemcpyDeviceToHost)); - cudaStream_t stream[3]; - cudaStreamCreate(&stream[0]); - cudaStreamCreate(&stream[1]); - cudaStreamCreate(&stream[2]); - - //printf("here\n"); - //checkCudaErrors(cudaMemcpy((void**)&d_offsets, nbBlocks*sizeof(short))); - for (int i = 0; i < nbBlocks; i++) - { - if(tmp_states[i]==3){ - // shortToBytes_d(o, offsets[i]); - // buf = (unsigned char*) - -// printf("here2\n"); - cudaMemcpyAsync(o, offsets+i, 2, cudaMemcpyDeviceToDevice, stream[0]); - o += sizeof(short); - - // printf("here2.1\n"); - // printf("offsets %ld\n", col_off); - cudaMemcpyAsync(nc, meta+(nbBlocks+i*mSize), mSize, 
cudaMemcpyDeviceToDevice, stream[1]); - // memcpy(nc, meta+(nbBlocks+i*mSize), mSize); - - nc += mSize; - - // printf("here2.2\n"); - //checkCudaErrors(cudaMemcpy(buf, offsets+i, sizeof(short), cudaMemcpyDeviceToHost)); - - // //printf("here2.3 %d\n", buf); - cudaMemcpyAsync(nc, midBytes+(i*blockSize*sizeof(float)), (int)tmp_offsets[i], cudaMemcpyDeviceToDevice, stream[2]); - // memcpy(nc, midBytes+(i*blockSize*sizeof(float)), offsets[i]); - nc += tmp_offsets[i]; - col_off+=tmp_offsets[i]; - -/// printf("here2.4\n"); - } - } - cudaStreamDestroy(stream[0]); - cudaStreamDestroy(stream[1]); - cudaStreamDestroy(stream[2]); - - free(tmp_states); - free(tmp_offsets); -} - -__global__ void ncblkCopy(unsigned char * c, unsigned char* o, unsigned char *nc, unsigned char* midBytes, unsigned char* meta, - size_t nbBlocks, int blockSize, short *offsets, size_t mSize) -{ - for (int i=blockDim.x*blockIdx.x + threadIdx.x; i>>(nbBlocks, offsets, meta, blockSize, nonconstant_d, out_size_d); - cudaDeviceSynchronize(); - - checkCudaErrors(cudaMemcpy(&nonconstant_h, nonconstant_d, sizeof(int), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(&tmp_outsize, out_size_d, sizeof(int), cudaMemcpyDeviceToHost)); - - nbConstantBlocks = nbBlocks - nonconstant_h; - out_size_h+=tmp_outsize; - - out_size_h += (nbBlocks-nbConstantBlocks)*sizeof(short)+(nbEle%blockSize)*sizeof(float); - - //outBytes = (unsigned char*)malloc(out_size); - unsigned char* r = outBytes; - unsigned char* r_old = outBytes; - // cudaDeviceSynchronize(); printf("%s\n",cudaGetLastError()); - checkCudaErrors(cudaMemset(r, SZx_VER_MAJOR, sizeof(char))); - checkCudaErrors(cudaMemset(r+1, SZx_VER_MINOR, sizeof(char))); - checkCudaErrors(cudaMemset(r+2, 1, sizeof(char))); - checkCudaErrors(cudaMemset(r+3, 0, sizeof(char))); - checkCudaErrors(cudaMemset(r+4, blockSize, sizeof(char))); - - r=r+5; //1 byte - //sizeToBytes(r, nbConstantBlocks); - longToBytes_bigEndian_memset(r, nbConstantBlocks); - r += sizeof(size_t); - //sizeToBytes(r, (size_t) num_sig); - longToBytes_bigEndian_memset(r, (unsigned long)num_sig); - r += sizeof(size_t); - size_t out_length; - - if(nbBlocks%4==0) - out_length = nbBlocks/4; - else - out_length = nbBlocks/4+1; - - convert_state_to_out_kernel<<>>(meta, nbBlocks, r, out_length); - r+=out_length; - convert_block2_to_out_kernel<<>>(r, nbBlocks,num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); - r += nbBlocks*4 + num_sig*sizeof(float) + num_sig*sizeof(uint8_t) + nbBlocks*sizeof(uint8_t); - - checkCudaErrors(cudaMemcpy(r, oriData+nbBlocks*blockSize, (nbEle%blockSize)*sizeof(float), cudaMemcpyDeviceToDevice)); - // memcpy(r, oriData+nbBlocks*blockSize, (nbEle%blockSize)*sizeof(float)); - r += (nbEle%blockSize)*sizeof(float); - unsigned char* c = r; - unsigned char* o = c+nbConstantBlocks*sizeof(float); - unsigned char* nc = o+(nbBlocks-nbConstantBlocks)*sizeof(short); - // ncblkCopy<<<1,1>>>(c, o, nc, midBytes, meta,nbBlocks, blockSize, offsets, mSize); - - // ncblkCopy_h(c, o, nc, midBytes, meta,nbBlocks, blockSize, offsets, mSize); - ncblkCopy_fast(c, o, nc, midBytes, meta,nbBlocks, blockSize, offsets, mSize, nc_diff); - // cudaDeviceSynchronize(); - size_t h_nc_diff; - cudaMemcpy(&h_nc_diff,nc_diff, sizeof(size_t),cudaMemcpyDeviceToHost); - return (size_t) (nc+h_nc_diff-r_old); - // checkCudaErrors(cudaMemcpy(outSize, (size_t)(nc-r_old), sizeof(size_t))); - // *outSize = (size_t) (nc-r_old); - // return outBytes; -} - -__global__ void device_post_proc(size_t *outSize, float *oriData, unsigned char *meta, - short *offsets, 
unsigned char *midBytes, unsigned char *outBytes, - size_t nbEle, int blockSize, uint64_t num_sig, uint32_t *blk_idx, - float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig) -{ - int out_size = 0; - - size_t nbConstantBlocks = 0; - size_t nbBlocks = nbEle/blockSize; - size_t ncBytes = blockSize/4; - size_t mSize = sizeof(float)+1+ncBytes; //Number of bytes for each data block's metadata. - out_size += 5+sizeof(size_t)+sizeof(float)*nbBlocks; - if (nbBlocks%8==0) - out_size += nbBlocks/8; - else - out_size += nbBlocks/8+1; - int s0 = 0; - int s1 = 0; - int s2 = 0; - int s3 = 0; - for (int i=0; i()); -// // dmin = thrust::reduce(oriData, oriData+nbEle, 1, thrust::minimum()); -// cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, oriData, dmax, nbEle); -// cudaMalloc(&d_temp_storage, temp_storage_bytes); -// cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, oriData, dmax, nbEle); - -// cudaFree(d_temp_storage); -// cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, oriData, dmin, nbEle); -// cudaMalloc(&d_temp_storage, temp_storage_bytes); -// cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, oriData, dmin, nbEle); - -// cudaFree(d_temp_storage); -// // thrust::pair result = thrust::minmax_element(thrust::device, oriData,oriData+nbEle); -// //printf("here\n"); -// cudaMemcpy(hmin, dmin, sizeof(float), cudaMemcpyDeviceToHost); -// cudaMemcpy(hmax, dmax,sizeof(float), cudaMemcpyDeviceToHost); -// absErrBound = absErrBound*(hmax-hmin); -// threshold = threshold*(hmax-hmin); - // // printf("%f\n",absErrBound); - // cudaFree(dmin); - // cudaFree(dmax); - float sparsity_level = SPARSITY_LEVEL; - - // Set the input data as the function parameter, this should be a device pointer - - float* d_oriData = oriData; - // cudaMalloc((void**)&d_oriData, sizeof(float)*nbEle); - // cudaMemcpy(d_oriData, oriData, sizeof(float)*nbEle, cudaMemcpyHostToDevice); - - size_t nbBlocks = nbEle/blockSize; - size_t remainCount = nbEle%blockSize; - size_t actualNBBlocks = remainCount==0 ? nbBlocks : nbBlocks+1; - - size_t ncBytes = blockSize/4; - //ncBytes = (blockSize+1)%4==0 ? ncBytes : ncBytes+1; //Bytes to store one non-constant block data. - size_t mSize = sizeof(float)+1+ncBytes; //Number of bytes for each data block's metadata. 
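The compress kernel below is launched with a 32 x (blockSize/32) thread block, a 65536 x 1 grid, and blockSize*sizeof(float) + dimBlock.y*sizeof(int) bytes of dynamic shared memory. A minimal grid-stride skeleton under those launch parameters (an assumption-laden sketch, not compress_float itself):

    __global__ void block_skeleton(const float *in, float *out, int blockSize, size_t nbBlocks)
    {
        extern __shared__ float sbuf[];                      // blockSize floats of dynamic shared memory
        int lane = threadIdx.y * blockDim.x + threadIdx.x;   // 0 .. blockSize-1
        for (size_t b = blockIdx.x; b < nbBlocks; b += gridDim.x) {
            sbuf[lane] = in[b * (size_t)blockSize + lane];   // stage one data block
            __syncthreads();
            out[b * (size_t)blockSize + lane] = sbuf[lane];  // per-block work would go here
            __syncthreads();
        }
    }
    // dim3 dimBlock(32, blockSize/32); dim3 dimGrid(65536, 1);
    // int sMem = blockSize*sizeof(float) + dimBlock.y*sizeof(int);
    // block_skeleton<<<dimGrid, dimBlock, sMem>>>(d_in, d_out, blockSize, nbBlocks);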
- size_t msz = (1+mSize) * nbBlocks * sizeof(unsigned char); - size_t mbsz = sizeof(float) * nbEle * sizeof(unsigned char); - - // These are host pointers and do not need to be allocated - - // unsigned char *meta = (unsigned char*)malloc(msz); - // short *offsets = (short*)malloc(nbBlocks*sizeof(short)); - // unsigned char *midBytes = (unsigned char*)malloc(mbsz); - - unsigned char* d_meta; - unsigned char* d_midBytes; - short* d_offsets; - - uint32_t *blk_idx, *d_blk_idx; - uint8_t *blk_sig, *d_blk_sig; - uint8_t *blk_subidx, *d_blk_subidx; - float *blk_vals, *d_blk_vals; - uint64_t *num_sig, *d_num_sig; - - checkCudaErrors(cudaMalloc((void **)&d_num_sig, sizeof(uint64_t))); - num_sig = (uint64_t *)malloc(sizeof(uint64_t)); - checkCudaErrors(cudaMalloc((void **)&d_blk_idx, nbBlocks*sizeof(uint32_t))); - // blk_idx = malloc() - checkCudaErrors(cudaMalloc((void **)&d_blk_subidx, nbEle*sizeof(uint8_t))); - - checkCudaErrors(cudaMalloc((void **)&d_blk_vals, nbEle*sizeof(float))); - - checkCudaErrors(cudaMalloc((void **)&d_blk_sig, nbBlocks*sizeof(uint8_t))); - - checkCudaErrors(cudaMalloc((void**)&d_meta, msz)); - //checkCudaErrors(cudaMemcpy(d_meta, meta, msz, cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemset(d_meta, 0, msz)); - checkCudaErrors(cudaMalloc((void**)&d_offsets, nbBlocks*sizeof(short))); - checkCudaErrors(cudaMemset(d_offsets, 0, nbBlocks*sizeof(short))); - checkCudaErrors(cudaMalloc((void**)&d_midBytes, mbsz)); - checkCudaErrors(cudaMemset(d_midBytes, 0, mbsz)); - - - // apply_threshold<<<80,256>>>(d_oriData, threshold, nbEle); - // cudaDeviceSynchronize(); - dim3 dimBlock(32, blockSize/32); - dim3 dimGrid(65536, 1); - const int sMemsize = blockSize * sizeof(float) + dimBlock.y * sizeof(int); - //printf("Malloc end timestamp: %f ms\n", timer_GPU.GetCounter()); - compress_float<<>>(d_oriData, d_meta, d_offsets, d_midBytes, absErrBound, blockSize, nbBlocks, mSize, sparsity_level, d_blk_idx, d_blk_subidx,d_blk_vals, threshold, d_blk_sig); - cudaError_t err = cudaGetLastError(); // Get error code - // printf("CUDA Error: %s\n", cudaGetErrorString(err)); - //printf("GPU compression timestamp: %f ms\n", timer_GPU.GetCounter()); - cudaDeviceSynchronize(); - get_numsig<<<1,1>>>(d_num_sig); - cudaDeviceSynchronize(); - - checkCudaErrors(cudaMemcpy(num_sig, d_num_sig, sizeof(uint64_t), cudaMemcpyDeviceToHost)); - - // These are allocations and memcpys to host pointers, do not need them - - // blk_idx = (uint32_t *)malloc(nbBlocks*sizeof(uint32_t)); - // blk_vals= (float *)malloc((*num_sig)*sizeof(float)); - // blk_subidx = (uint8_t *)malloc((*num_sig)*sizeof(uint8_t)); - // blk_sig = (uint8_t *)malloc(nbBlocks*sizeof(uint8_t)); - - // checkCudaErrors(cudaMemcpy(meta, d_meta, msz, cudaMemcpyDeviceToHost)); - // checkCudaErrors(cudaMemcpy(offsets, d_offsets, nbBlocks*sizeof(short), cudaMemcpyDeviceToHost)); - // checkCudaErrors(cudaMemcpy(midBytes, d_midBytes, mbsz, cudaMemcpyDeviceToHost)); - - - // checkCudaErrors(cudaMemcpy(blk_idx, d_blk_idx, nbBlocks*sizeof(uint32_t), cudaMemcpyDeviceToHost)); - // checkCudaErrors(cudaMemcpy(blk_vals,d_blk_vals, (*num_sig)*sizeof(float), cudaMemcpyDeviceToHost)); - // checkCudaErrors(cudaMemcpy(blk_subidx,d_blk_subidx, (*num_sig)*sizeof(uint8_t), cudaMemcpyDeviceToHost)); - // checkCudaErrors(cudaMemcpy(blk_sig,d_blk_sig, (nbBlocks)*sizeof(uint8_t), cudaMemcpyDeviceToHost)); - - - size_t maxPreservedBufferSize = sizeof(float)*nbEle; - unsigned char *d_outBytes; - // unsigned char* outBytes = (unsigned char*)malloc(maxPreservedBufferSize); - // 
memset(outBytes, 0, maxPreservedBufferSize); - checkCudaErrors(cudaMalloc(&d_outBytes, maxPreservedBufferSize)); - - size_t *d_outSize; - - checkCudaErrors(cudaMalloc(&d_outSize, sizeof(size_t))); - - // device_post_proc<<<1,1>>>(d_outSize, d_oriData, d_meta, d_offsets, d_midBytes, d_outBytes, nbEle, blockSize, *num_sig, d_blk_idx, d_blk_vals, d_blk_subidx, d_blk_sig); - *outSize = better_post_proc(d_outSize, d_oriData, d_meta, d_offsets, d_midBytes, d_outBytes, nbEle, blockSize, *num_sig, d_blk_idx, d_blk_vals, d_blk_subidx, d_blk_sig); - //cudaDeviceSynchronize(); - - //checkCudaErrors(cudaMemcpy(outSize, d_outSize, sizeof(size_t), cudaMemcpyDeviceToHost)); - - // printf("completed compression\n"); - //free(blk_idx); - //free(blk_subidx); - //free(blk_vals); - // free(meta); - // free(offsets); - // free(midBytes); - checkCudaErrors(cudaFree(d_num_sig)); - checkCudaErrors(cudaFree(d_blk_idx)); - checkCudaErrors(cudaFree(d_blk_subidx)); - checkCudaErrors(cudaFree(d_blk_vals)); - checkCudaErrors(cudaFree(d_blk_sig)); - - checkCudaErrors(cudaFree(d_meta)); - checkCudaErrors(cudaFree(d_offsets)); - checkCudaErrors(cudaFree(d_midBytes)); - - unsigned char *d_newout; - - *outSize = *outSize; - size_t os = *outSize; - - checkCudaErrors(cudaMalloc(&d_newout, os)); - //fin_copy<<<40,256>>>(d_outBytes, d_newout,os); - checkCudaErrors(cudaMemcpy(d_newout, d_outBytes, os, cudaMemcpyDeviceToDevice)); - cudaDeviceSynchronize(); - - checkCudaErrors(cudaFree(d_outBytes)); - printf("Compression end timestamp: %f ms\n", timer_GPU.GetCounter()); - - err = cudaGetLastError(); // Get error code - printf("CUDA Error: %s\n", cudaGetErrorString(err)); - return d_newout; - //return d_outBytes; -} - -__device__ inline long bytesToLong_bigEndian(unsigned char* b) { - long temp = 0; - long res = 0; - - res <<= 8; - temp = b[0] & 0xff; - res |= temp; - - res <<= 8; - temp = b[1] & 0xff; - res |= temp; - - res <<= 8; - temp = b[2] & 0xff; - res |= temp; - - res <<= 8; - temp = b[3] & 0xff; - res |= temp; - - res <<= 8; - temp = b[4] & 0xff; - res |= temp; - - res <<= 8; - temp = b[5] & 0xff; - res |= temp; - - res <<= 8; - temp = b[6] & 0xff; - res |= temp; - - res <<= 8; - temp = b[7] & 0xff; - res |= temp; - - return res; -} - -__device__ inline size_t bytesToSize(unsigned char* bytes) -{ - size_t result = bytesToLong_bigEndian(bytes);//8 - return result; -} - -__device__ inline short bytesToShort(unsigned char* bytes) -{ - lint16 buf; - memcpy(buf.byte, bytes, 2); - - return buf.svalue; -} - -__global__ void decompress_get_stats(float *newData, size_t nbEle, unsigned char* cmpBytes, - size_t *numSigValues, int *bs, - size_t *numConstantBlks, size_t *numBlks, - size_t *mSizeptr, unsigned char *newCmpBytes -){ - unsigned char* r = cmpBytes; - - size_t num_sig; - r += 4; - int blockSize = (int) r[0]; //get block size - - if(blockSize == 0)blockSize = 256; - r++; - size_t nbConstantBlocks = bytesToLong_bigEndian(r); //get number of constant blocks - r += sizeof(size_t); - num_sig = bytesToSize(r); - - r += sizeof(size_t); - size_t nbBlocks = nbEle/blockSize; - size_t ncBlocks = 0; - size_t num_state2_blks = 0; - // size_t ncBlocks = nbBlocks - nbConstantBlocks; //get number of constant blocks - size_t stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; - size_t ncLeading = blockSize/4; - size_t mSize = sizeof(float)+1+ncLeading; //Number of bytes for each data block's metadata. 
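The header fields above are written most-significant byte first; a self-contained round trip of that byte order (pack_be64/unpack_be64 are hypothetical helper names, not from this patch):

    #include <assert.h>
    #include <stdint.h>

    static void pack_be64(unsigned char *b, uint64_t v) {
        for (int i = 0; i < 8; i++)
            b[i] = (unsigned char)(v >> (8 * (7 - i)));      /* MSB first, as in longToBytes_bigEndian */
    }

    static uint64_t unpack_be64(const unsigned char *b) {
        uint64_t v = 0;
        for (int i = 0; i < 8; i++)
            v = (v << 8) | b[i];                             /* as in bytesToLong_bigEndian */
        return v;
    }

    int main(void) {
        unsigned char buf[8];
        pack_be64(buf, 0x0102030405060708ULL);
        assert(buf[0] == 0x01 && buf[7] == 0x08);
        assert(unpack_be64(buf) == 0x0102030405060708ULL);
        return 0;
    }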
- - *mSizeptr = mSize; - - *numConstantBlks = nbConstantBlocks; - *numBlks = nbBlocks; - *numSigValues = num_sig; - *bs = blockSize; - newCmpBytes = r; - -} - - void setup_data_stateArray_better(float *newData, size_t nbEle, unsigned char* r, - size_t num_sig, int blockSize, - size_t nbConstantBlocks, size_t nbBlocks, size_t *ncBlks, - unsigned char *stateArray, unsigned char *newR -){ - - //printf("ma\n"); - // blockSize = 256; - r += 4; - r++; - r += sizeof(size_t); - r += sizeof(size_t); - int ncBlocks, *ncBlocks_d; - size_t stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; - int num_state2_blks, *num_state2_d; - checkCudaErrors(cudaMalloc((void **)&num_state2_d, sizeof(int))); - checkCudaErrors(cudaMalloc((void **)&ncBlocks_d, sizeof(int))); - checkCudaErrors(cudaMemset(num_state2_d, 0, sizeof(int))); - checkCudaErrors(cudaMemset(ncBlocks_d, 0, sizeof(int))); - - //printf("ma2\n"); -// printf("Converting state array\n"); - // printf("cmp %d\n", (int)r[0]); - // printf("state %d\n", (int)stateArray[0]); - // convert_out_to_state(nbBlocks, r, stateArray); - convert_out_to_state_kernel<<>>(nbBlocks,r,stateArray,stateNBBytes, - num_state2_d, ncBlocks_d); - // printf("state %d\n", (int)stateArray[0]); - // convertByteArray2IntArray_fast_1b_args(nbBlocks, r, stateNBBytes, stateArray); //get the stateArray - cudaDeviceSynchronize(); - - //printf("ma3\n"); - r += stateNBBytes; - newR = r; - cudaMemcpy(&ncBlocks, ncBlocks_d, sizeof(int), cudaMemcpyDeviceToHost); - - //printf("ma4\n"); - *ncBlks = ncBlocks; - - //printf("ma4\n"); - } - -__global__ void setup_data_stateArray(float *newData, size_t nbEle, unsigned char* r, - size_t num_sig, int blockSize, - size_t nbConstantBlocks, size_t nbBlocks, size_t *ncBlks, - unsigned char *stateArray, unsigned char *newR -){ - // blockSize = 256; - r += 4; - r++; - r += sizeof(size_t); - r += sizeof(size_t); - size_t ncBlocks = 0; - size_t stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; - size_t num_state2_blks = 0; -// printf("Converting state array\n"); - // printf("cmp %d\n", (int)r[0]); - // printf("state %d\n", (int)stateArray[0]); - convert_out_to_state(nbBlocks, r, stateArray); - // convert_out_to_state_kernel<<<40,256>>>(nbBlocks,r,stateArray,stateNBBytes); - // printf("state %d\n", (int)stateArray[0]); - // convertByteArray2IntArray_fast_1b_args(nbBlocks, r, stateNBBytes, stateArray); //get the stateArray - for (size_t i = 0; i < nbBlocks; i++) - { - if (stateArray[i] == 2) - { - num_state2_blks++; - }else if(stateArray[i] == 3){ - ncBlocks++; - } - } - - r += stateNBBytes; - newR = r; - *ncBlks = ncBlocks; -} - -__global__ void decomp_startup_kernel(unsigned char* r, size_t nbConstantBlocks, -unsigned char *data, int blockSize, size_t mSize, size_t ncBlocks, float *constantMedianArray, uint64_t* g_leng){ - unsigned char * fr = r; //fr is the starting address of constant median values. 
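The kernels that follow replace the serial walk over compressed blocks with a two-pass scheme: one pass records each block's compressed length in g_leng, thrust::exclusive_scan turns those lengths into start offsets, and a second pass copies every block independently. A minimal sketch of that pattern (kernel and variable names here are assumptions, not the patch's):

    #include <stdint.h>
    #include <thrust/execution_policy.h>
    #include <thrust/scan.h>

    __global__ void gather_blocks(const unsigned char *src, unsigned char *dst,
                                  const uint64_t *len, const uint64_t *off,
                                  size_t nBlocks, size_t dstStride)
    {
        for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < nBlocks;
             i += (size_t)blockDim.x * gridDim.x)
            memcpy(dst + i * dstStride, src + off[i], len[i]);   // each block located via its scanned offset
    }

    // Host side: lengths -> exclusive start offsets, then one independent copy per block.
    //   thrust::exclusive_scan(thrust::device, d_len, d_len + nBlocks, d_off, 0);
    //   gather_blocks<<<40, 256>>>(d_src, d_dst, d_len, d_off, nBlocks, dstStride);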
- int i = 0, j = 0, k = 0; - // printf("%p\n", r); - unsigned char tmp_r[4]; - tmp_r[0]=fr[0]; - tmp_r[1]=fr[1]; - tmp_r[2]=fr[2]; - tmp_r[3]=fr[3]; - - -// printf("nbconstant: %f\n", ((float*)tmp_r)[0]); -// nbConstantBlocks - for(i = blockDim.x*blockIdx.x + threadIdx.x; i < nbConstantBlocks; i += blockDim.x*gridDim.x){ //get the median values for constant-value blocks - - tmp_r[0]=fr[4*i]; - tmp_r[1]=fr[4*i+1]; - tmp_r[2]=fr[4*i+2]; - tmp_r[3]=fr[4*i+3]; - float tmp = ((float*)tmp_r)[0]; - constantMedianArray[i] = tmp; - //printf("%d %f\n", i, tmp); - } - - -/** PROBLEM AREA, CAN FIX WITH PARALLELIZATION BUT WATCH *FR and *P **/ - - // if(threadIdx.x==0 && blockIdx.x==0){ - fr += nbConstantBlocks*sizeof(float); - unsigned char* p = fr + ncBlocks * sizeof(short); - unsigned char* basefr = fr; - unsigned char* basep = p; - for(i = blockDim.x*blockIdx.x + threadIdx.x;i < ncBlocks;i+=blockDim.x*gridDim.x){ - fr = basefr+(sizeof(short)*i); - int leng = (int)bytesToShort(fr)+mSize; - g_leng[i] = (uint64_t)leng; - // fr += sizeof(short); - if (leng > blockSize*sizeof(float)) - { - printf("Warning: compressed block is larger than the original block!\n"); - return; - // exit(0); - } - // memcpy(data+i*blockSize*sizeof(float), p, leng); - - // p += leng; - } - - // } -} - -__global__ void decompress_ncblk_kernel(unsigned char* r, size_t nbConstantBlocks, -unsigned char *data, int blockSize, size_t mSize, size_t ncBlocks, float *constantMedianArray, uint64_t* g_leng){ - unsigned char * fr = r; - fr += nbConstantBlocks*sizeof(float); - unsigned char* p = fr + ncBlocks * sizeof(short); - unsigned char* basefr = fr; - unsigned char* basep = p; - - for(int i = blockDim.x*blockIdx.x + threadIdx.x;i < ncBlocks;i+=blockDim.x*gridDim.x){ - fr = basefr+(sizeof(short)*i); - int leng = (int)bytesToShort(fr)+mSize; - - - // g_leng[i] = leng; - // // fr += sizeof(short); - // if (leng > blockSize*sizeof(float)) - // { - // printf("Warning: compressed block is larger than the original block!\n"); - // return; - // // exit(0); - // } - p = basep + g_leng[i]; - - memcpy(data+i*blockSize*sizeof(float), p, leng); - - // p += leng; - } -} - -void decompress_startup_better(float *newData, size_t nbEle, unsigned char* r, - uint32_t *blk_idx, uint8_t *blk_subidx, uint8_t *blk_sig, - float *blk_vals, size_t num_sig, int blockSize, - size_t nbConstantBlocks, size_t nbBlocks, size_t ncBlocks, - unsigned char *stateArray, float* constantMedianArray, unsigned char *data, - size_t mSize, unsigned char *newCmpBytes -){ - // blockSize = 256; - size_t nb_tmp = (int) nbEle/blockSize; - uint64_t* g_leng; - /** - * Structures to return: - * blk_idx, blk_subidx, blk_sig, blk_vals, numSigValues (pointer) - * bs (pointer to blockSize), numConstantBlks (pointer), numBlks (pointer) - * ncBlks (pointer), stateArray, constantMedianArray - */ - - - size_t stateNBBytes = nb_tmp%4==0 ? 
nb_tmp/4 : nb_tmp/4+1; - - r += 4; - r++; - r += sizeof(size_t); - r += sizeof(size_t); - - r += stateNBBytes; - - convert_out_to_block2_kernel<<>>(r, nbBlocks, (uint64_t)num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); - size_t to_add = nbBlocks*4 + num_sig*sizeof(float) + num_sig*sizeof(uint8_t) + nbBlocks*sizeof(uint8_t); - r+= to_add; - - size_t i = 0, j = 0, k = 0; //k is used to keep track of constant block index - - // printf("before mallocs in kernel\n"); - checkCudaErrors(cudaMemcpy((newData)+nbBlocks*blockSize, r, (nbEle%blockSize)*sizeof(float), cudaMemcpyDeviceToDevice)); - checkCudaErrors(cudaMalloc(&g_leng, sizeof(uint64_t)*ncBlocks)); - // memcpy((newData)+nbBlocks*blockSize, r, (nbEle%blockSize)*sizeof(float)); - - //printf("before mallocs in kernel %p\n", r); - r += (nbEle%blockSize)*sizeof(float); - //printf("r: %p\n", r); - //printf("%d, %d, %d\n",nbEle, 256, nbEle%256); - decomp_startup_kernel<<>>(r, nbConstantBlocks,data, blockSize, mSize, ncBlocks, constantMedianArray, g_leng); - cudaDeviceSynchronize(); - - thrust::exclusive_scan(thrust::device, g_leng, g_leng + ncBlocks, g_leng, 0); - - decompress_ncblk_kernel<<>>(r, nbConstantBlocks, data, blockSize, mSize, ncBlocks, constantMedianArray, g_leng); - cudaDeviceSynchronize(); - - // cudaError_t err = cudaGetLastError(); // Get error code - - // printf("CUDA Error: %s\n", cudaGetErrorString(err)); - cudaFree(g_leng); - - // err = cudaGetLastError(); // Get error code - // printf("CUDA Error: %s\n", cudaGetErrorString(err)); - r += nbConstantBlocks*sizeof(float); - - newCmpBytes = r; - -} - -__global__ void decompress_startup(float *newData, size_t nbEle, unsigned char* r, - uint32_t *blk_idx, uint8_t *blk_subidx, uint8_t *blk_sig, - float *blk_vals, size_t num_sig, int blockSize, - size_t nbConstantBlocks, size_t nbBlocks, size_t ncBlocks, - unsigned char *stateArray, float* constantMedianArray, unsigned char *data, - size_t mSize, unsigned char *newCmpBytes -){ - // blockSize = 256; - size_t nb_tmp = (int) nbEle/blockSize; - /** - * Structures to return: - * blk_idx, blk_subidx, blk_sig, blk_vals, numSigValues (pointer) - * bs (pointer to blockSize), numConstantBlks (pointer), numBlks (pointer) - * ncBlks (pointer), stateArray, constantMedianArray - */ - - // size_t ncBlocks = 0; - // size_t stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; - // size_t num_state2_blks = 0; - // printf("Converting state array\n"); - // convert_out_to_state(nbBlocks, r, stateArray); - // printf("state %d\n", (int)stateArray[0]); - // // convertByteArray2IntArray_fast_1b_args(nbBlocks, r, stateNBBytes, stateArray); //get the stateArray - // for (size_t i = 0; i < nbBlocks; i++) - // { - // if (stateArray[i] == 2) - // { - // num_state2_blks++; - // }else if(stateArray[i] == 3){ - // ncBlocks++; - // } - // } - // size_t stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; - - size_t stateNBBytes = nb_tmp%4==0 ? 
nb_tmp/4 : nb_tmp/4+1; - //printf("%p\n", r); - r += 4; - r++; - r += sizeof(size_t); - r += sizeof(size_t); - //printf("statenb %d %d\n", stateNBBytes, nb_tmp); - r += stateNBBytes; - // data = (unsigned char*)malloc(ncBlocks*blockSize*sizeof(float)); - // memset(data, 0, ncBlocks*blockSize*sizeof(float)); - // printf("converting block vals %d\n", data[0]); - size_t to_add = convert_out_to_block2(r, nbBlocks, (uint64_t)num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); - r+= to_add; - - size_t i = 0, j = 0, k = 0; //k is used to keep track of constant block index - - // printf("before mallocs in kernel\n"); - - memcpy((newData)+nbBlocks*blockSize, r, (nbEle%blockSize)*sizeof(float)); - - //printf("before mallocs in kernel %p\n", r); - r += (nbEle%blockSize)*sizeof(float); - //printf("r: %p\n", r); - //printf("%d, %d, %d\n",nbEle, 256, nbEle%256); - unsigned char * fr = r; //fr is the starting address of constant median values. - - // printf("%p\n", r); - unsigned char tmp_r[4]; - tmp_r[0]=r[0]; - tmp_r[1]=r[1]; - tmp_r[2]=r[2]; - tmp_r[3]=r[3]; - - -// printf("nbconstant: %f\n", ((float*)tmp_r)[0]); - for(i = 0;i < nbConstantBlocks;i++, j+=4){ //get the median values for constant-value blocks - - tmp_r[0]=r[j]; - tmp_r[1]=r[j+1]; - tmp_r[2]=r[j+2]; - tmp_r[3]=r[j+3]; - float tmp = ((float*)tmp_r)[0]; -// printf("median: %f\n", tmp); - constantMedianArray[i] = tmp; - - // printf("%d %f\n", i, tmp); - } - //printf("after constantmedian\n"); - r += nbConstantBlocks*sizeof(float); - unsigned char* p = r + ncBlocks * sizeof(short); - for(i = 0;i < ncBlocks;i++){ - int leng = (int)bytesToShort(r)+mSize; - r += sizeof(short); - if (leng > blockSize*sizeof(float)) - { - printf("Warning: compressed block is larger than the original block!\n"); - return; - // exit(0); - } -// printf("before memcpy\n"); - memcpy(data+i*blockSize*sizeof(float), p, leng); - // printf("after memcpy\n"); - p += leng; - } - - newCmpBytes = r; -// printf("before mallocs in kernel\n"); - - // printf("nb blocks: %d\n", nbBlocks); -} - -__global__ void cBlkCopy_decompress(int nb, float* constantMedianArray, float *newData, int blockSize, int i){ - int j; - float Median = constantMedianArray[nb]; - // j = threadIdx.x; j < blockSize; j += blockDim.x - for (j = threadIdx.x; j < blockSize; j += blockDim.x) - *((newData)+i*blockSize+j) = Median; -} - -__global__ void ncBlkCopy_decompress(int blockSize, float *newData, int nc, float *fdata, int i){ - int j; - for (j = threadIdx.x; j < blockSize; j += blockDim.x) - *((newData)+i*blockSize+j) = fdata[nc*blockSize+j]; -} - -void decompress_post_proc_better(unsigned char *data, float *newData, int blockSize, - size_t nbBlocks, size_t ncBlocks, unsigned char *stateArray, - float *constantMedianArray -){ - // checkCudaErrors(cudaMemcpy(data, d_data, ncBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); - // checkCudaErrors(cudaMemcpy(*newData, d_newdata, nbBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); - float* fdata = (float*)data; - int i,j; - int nb=0, nc=0; - //printf("h1\n"); - for (i=0;i>>(nb, constantMedianArray, newData, blockSize, i); - nb++; - }else if(state==3){ - ncBlkCopy_decompress<<<1,256>>>(blockSize, newData, nc, fdata, i); - nc++; - } - } - cudaDeviceSynchronize(); - //for(int k = 0; k < nbBlocks*blockSize;k++){ -// printf("%f\n", newData[k]); - // } -} - -__global__ void print_newdata(float *newData, size_t nbBlocks, int blockSize){ - for (size_t i = 0; i < nbBlocks*blockSize; i++) - { - printf("%f\n", newData[i]); - } - -} - -__global__ void 
generateNbNc(size_t nbBlocks, size_t ncBlocks, unsigned char *stateArray, uint64_t* nbs, uint64_t* ncs){ - for(int i = blockDim.x*blockIdx.x + threadIdx.x;i < nbBlocks;i+=blockDim.x*gridDim.x){ - unsigned char state = stateArray[i]; - if(state==0||state==1){ - nbs[i] = 1; - ncs[i] = 0; - }else if(state==3){ - nbs[i] = 0; - ncs[i] = 1; - }else{ - nbs[i] = 0; - ncs[i] = 0; - } - } -} - -__global__ void decompress_final_set(unsigned char *data, float *newData, int blockSize, - size_t nbBlocks, size_t ncBlocks, unsigned char *stateArray, - float *constantMedianArray, uint64_t* nb, uint64_t* nc){ - float* fdata = (float*)data; - for (int i = blockIdx.x;i < nbBlocks;i+=gridDim.x){ - if (stateArray[i]==0 || stateArray[i]==1){ - float Median = constantMedianArray[nb[i]]; - // if (Median>1) printf("data%i:%f\n",i, Median); - for (int j = threadIdx.x; j < blockSize; j += blockDim.x) - *((newData)+i*blockSize+j) = Median; - // nb++; - }else if(stateArray[i]==3){ - for (int j = threadIdx.x; j < blockSize; j += blockDim.x) - *((newData)+i*blockSize+j) = fdata[nc[i]*blockSize+j]; - // nc++; - } - __syncthreads(); - } -} - -void decompress_post_proc_fast(unsigned char *data, float *newData, int blockSize, - size_t nbBlocks, size_t ncBlocks, unsigned char *stateArray, - float *constantMedianArray -){ - // checkCudaErrors(cudaMemcpy(data, d_data, ncBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); - // checkCudaErrors(cudaMemcpy(*newData, d_newdata, nbBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); - - int i,j; - uint64_t *nb, *nc; - checkCudaErrors(cudaMalloc(&nb, sizeof(uint64_t)*nbBlocks)); - checkCudaErrors(cudaMalloc(&nc, sizeof(uint64_t)*nbBlocks)); - - generateNbNc<<>>(nbBlocks, ncBlocks, stateArray, nb,nc); - cudaDeviceSynchronize(); - thrust::exclusive_scan(thrust::device, nb, nb + nbBlocks, nb, 0); - thrust::exclusive_scan(thrust::device, nc, nc + nbBlocks, nc, 0); - - decompress_final_set<<>>(data, newData, blockSize,nbBlocks, ncBlocks, stateArray,constantMedianArray, nb, nc); - cudaDeviceSynchronize(); - cudaFree(nb); - cudaFree(nc); -} - -__global__ void decompress_post_proc(unsigned char *data, float *newData, int blockSize, - size_t nbBlocks, size_t ncBlocks, unsigned char *stateArray, - float *constantMedianArray -){ - // checkCudaErrors(cudaMemcpy(data, d_data, ncBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); - // checkCudaErrors(cudaMemcpy(*newData, d_newdata, nbBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); - float* fdata = (float*)data; - int i,j; - int nb=0, nc=0; - // if (blockIdx.x == 0) - // { - // for (i=0;i1) printf("data%i:%f\n",i, Median); - // for (j = threadIdx.x; j < blockSize; j += blockDim.x) - // *((newData)+i*blockSize+j) = Median; - // nb++; - // } - // } - // }else{ - // for (i=0;i1) printf("data%i:%f\n",i, Median); - for (j = threadIdx.x; j < blockSize; j += blockDim.x) - *((newData)+i*blockSize+j) = Median; - nb++; - }else if(stateArray[i]==3){ - for (j = threadIdx.x; j < blockSize; j += blockDim.x) - *((newData)+i*blockSize+j) = fdata[nc*blockSize+j]; - nc++; - } - } - - //for(int k = 0; k < nbBlocks*blockSize;k++){ -// printf("%f\n", newData[k]); - // } -} - -float* device_ptr_cuSZx_decompress_float(size_t nbEle, unsigned char* cmpBytes) -{ - /** - * Assume the following are device pointers - * - * unsigned char* cmpBytes - * float** newData - * - */ - - uint32_t *blk_idx; - uint8_t *blk_subidx; - uint8_t *blk_sig; - float *blk_vals, *constantMedianArray; - size_t *num_sig, *mSize, mSize_h, num_sig_h; - int *blockSize, bs; - 
size_t *nbConstantBlocks, *nbBlocks, *ncBlocks, nbBlocks_h, ncBlocks_h, nbConstantBlocks_h; - unsigned char *stateArray, *data; - float *newData; - timer_GPU.StartCounter(); - unsigned char *oldCmpBytes = cmpBytes; - //*newData = (float*)malloc(sizeof(float)*nbEle); -// printf("cmpbytes check %d\n", (int)cmpBytes[0]); -// printf("new check %f\n", *newData[0]); - // printf("malloc\n"); - checkCudaErrors(cudaMalloc((void**)&num_sig, sizeof(size_t))); - checkCudaErrors(cudaMalloc((void**)&blockSize, sizeof(int))); - checkCudaErrors(cudaMalloc((void**)&nbConstantBlocks, sizeof(size_t))); - checkCudaErrors(cudaMalloc((void**)&nbBlocks, sizeof(size_t))); - checkCudaErrors(cudaMalloc((void**)&ncBlocks, sizeof(size_t))); - checkCudaErrors(cudaMalloc((void**)&mSize, sizeof(size_t))); - checkCudaErrors(cudaMalloc((void**)&newData, sizeof(float)*nbEle)); - - decompress_get_stats<<<1,1>>>(newData, nbEle, cmpBytes, - num_sig, blockSize, - nbConstantBlocks, nbBlocks, - mSize, cmpBytes - ); - cudaDeviceSynchronize(); - - cudaError_t err = cudaGetLastError(); // Get error code - //printf("CUDA Error: %s\n", cudaGetErrorString(err)); - checkCudaErrors(cudaMemcpy(&nbBlocks_h, nbBlocks, sizeof(size_t), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(&nbConstantBlocks_h, nbConstantBlocks, sizeof(size_t), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(&bs, blockSize, sizeof(int), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(&mSize_h, mSize, sizeof(size_t), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(&num_sig_h, num_sig, sizeof(size_t), cudaMemcpyDeviceToHost)); - - - checkCudaErrors(cudaMalloc((void**)&stateArray, nbBlocks_h)); - checkCudaErrors(cudaMalloc((void**)&constantMedianArray, nbConstantBlocks_h*sizeof(float))); - - checkCudaErrors(cudaMalloc((void**)&blk_idx, nbBlocks_h*sizeof(uint32_t))); - checkCudaErrors(cudaMalloc((void**)&blk_vals, num_sig_h*sizeof(float))); - checkCudaErrors(cudaMalloc((void**)&blk_subidx, num_sig_h*sizeof(uint8_t))); - checkCudaErrors(cudaMalloc((void**)&blk_sig, nbBlocks_h*sizeof(uint8_t))); - - unsigned char* tmp_r = cmpBytes; - unsigned char* newR; - setup_data_stateArray_better(newData, nbEle, tmp_r, - num_sig_h, bs, - nbConstantBlocks_h, nbBlocks_h, &ncBlocks_h, - stateArray, newR); - - - - // setup_data_stateArray<<<1,1>>>(newData, nbEle, cmpBytes, - // num_sig_h, bs, - // nbConstantBlocks_h, nbBlocks_h, ncBlocks, - // stateArray, cmpBytes - // ); - // cudaDeviceSynchronize(); - - // printf("%s\n", cudaGetErrorString(cudaGetLastError())); - // checkCudaErrors(cudaMemcpy(&ncBlocks_h, ncBlocks, sizeof(size_t), cudaMemcpyDeviceToHost)); - - checkCudaErrors(cudaMalloc((void**)&data, ncBlocks_h*bs*sizeof(float))); - - // err = cudaGetLastError(); // Get error code - // printf("CUDA start Error: %s\n", cudaGetErrorString(err)); - // cmpBytes = newCmpBytes; - // data = (unsigned char*)malloc(ncBlocks*blockSize*sizeof(float)); - // memset(data, 0, ncBlocks*blockSize*sizeof(float)); - // stateArray = (unsigned char*)malloc(nbBlocks); - - // // unsigned char* d_stateArray; - // // cudaMalloc(&d_stateArray, nbBlocks); - // constantMedianArray = (float*)malloc(nbConstantBlocks*sizeof(float)); - - // blk_idx = (uint32_t *)malloc(nbBlocks*sizeof(uint32_t)); - // blk_vals= (float *)malloc((num_sig)*sizeof(float)); - // blk_subidx = (uint8_t *)malloc((num_sig)*sizeof(uint8_t)); - // blk_sig = (uint8_t *)malloc(nbBlocks*sizeof(uint8_t)); - - //printf("%s\n", cudaGetErrorString(cudaGetLastError())); - //test_nbBlks = (size_t *)malloc(sizeof(size_t)); - 
// printf("malloc\n"); - - - tmp_r = cmpBytes; - decompress_startup_better(newData, nbEle, tmp_r, - blk_idx, blk_subidx, blk_sig, - blk_vals, num_sig_h, bs, - nbConstantBlocks_h, nbBlocks_h, ncBlocks_h, - stateArray, constantMedianArray, data, - mSize_h, newR); - - - // err = cudaGetLastError(); // Get error code - // printf("CUDA start Error: %s\n", cudaGetErrorString(err)); - //decompress_startup<<<1,1>>>(newData, nbEle, cmpBytes, - // blk_idx, blk_subidx, blk_sig, - // blk_vals, num_sig_h, bs, - // nbConstantBlocks_h, nbBlocks_h, ncBlocks_h, - // stateArray, constantMedianArray, data, mSize_h, cmpBytes); - //cudaDeviceSynchronize(); - // cmpBytes = newCmpBytes; - - //printf("%s\n", cudaGetErrorString(cudaGetLastError())); - - // unsigned char* d_data; - float *d_newdata; - // checkCudaErrors(cudaMalloc((void**)&d_data, ncBlocks*blockSize*sizeof(float))); - // checkCudaErrors(cudaMemcpy(d_data, data, ncBlocks*blockSize*sizeof(float), cudaMemcpyHostToDevice)); - // printf("nblocks: %d bs: %d ncblock %d\n", nbBlocks_h, bs, ncBlocks_h); - checkCudaErrors(cudaMalloc(&d_newdata, nbBlocks_h*bs*sizeof(float))); - - // err = cudaGetLastError(); // Get error code - // printf("CUDA dec main Error: %s\n", cudaGetErrorString(err)); - - dim3 dimBlock(32, bs/32); - dim3 dimGrid(65536, 1); - const int sMemsize = bs * sizeof(float) + dimBlock.y * sizeof(int); - decompress_state2<<>>(d_newdata, stateArray,blk_idx, blk_vals, blk_subidx, bs, blk_sig); - cudaDeviceSynchronize(); - - // err = cudaGetLastError(); // Get error code - // printf("CUDA dec main Error: %s\n", cudaGetErrorString(err)); - decompress_float<<>>(data, bs, ncBlocks_h, mSize_h); - //printf("GPU decompression timing: %f ms\n", timer_GPU.GetCounter()); - cudaDeviceSynchronize(); - - // err = cudaGetLastError(); // Get error code - // printf("CUDA dec main Error: %s\n", cudaGetErrorString(err)); - - // checkCudaErrors(cudaMemcpy(data, d_data, ncBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(newData, d_newdata, nbBlocks_h*bs*sizeof(float), cudaMemcpyDeviceToDevice)); - cudaFree(d_newdata); - - // decompress_post_proc<<<1,1>>>(data, newData, bs, - // nbBlocks_h, ncBlocks_h, stateArray, - // constantMedianArray); - // cudaDeviceSynchronize(); - decompress_post_proc_fast(data, newData, bs, - nbBlocks_h, ncBlocks_h, stateArray, - constantMedianArray); - err = cudaGetLastError(); // Get error code - //printf("CUDA Error: %s\n", cudaGetErrorString(err)); - //printf("GPU decompression timing: %f ms\n", timer_GPU.GetCounter()); - // print_newdata<<<1,1>>>(newData, nbBlocks_h, bs); - cudaFree(stateArray); - cudaFree(constantMedianArray); - cudaFree(data); - cudaFree(blk_idx); - cudaFree(blk_subidx); - cudaFree(blk_vals); - cudaFree(blk_sig); - return newData; - -} - +#include "cuszx_entry.h" +#include "szx_defines.h" +#include "szx_BytesToolkit.h" +#include "szx_TypeManager.h" +#include "timingGPU.h" +#include "szx.h" +#include +#include +#include +#include +#include +#include +#include + +#define SPARSITY_LEVEL 0.25 +#define BLOCKS 40 +#define THREADS_PER_BLOCK 256 + +TimingGPU timer_GPU; +void bin(unsigned n) +{ + unsigned i; + for (i = 1 << 31; i > 0; i = i / 2) + (n & i) ? 
printf("1") : printf("0");
+}
+
+__host__ __device__ size_t convert_state_to_out(unsigned char* meta, size_t length, unsigned char *result){
+    size_t out_length;
+
+    if(length%4==0)
+        out_length = length/4;
+    else
+        out_length = length/4+1;
+
+    for (size_t i = 0; i < out_length; i++)
+    {
+        uint8_t tmp = 0;
+
+        for (size_t j = 0; j < 4; j++)
+        {
+            if (i*4 + j < length)
+            {
+                tmp |= (0x03 & meta[i*4+j]) << 2*j;
+            }
+
+        }
+        result[i] = tmp;
+    }
+    return out_length;
+}
+
+__global__ void convert_state_to_out_kernel(unsigned char* meta, size_t length, unsigned char *result, size_t out_length){
+
+
+    for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < out_length; i += blockDim.x*gridDim.x){
+        uint8_t tmp = 0;
+
+        for (size_t j = 0; j < 4; j++)
+        {
+            if (i*4 + j < length)
+            {
+                tmp |= (0x03 & meta[i*4+j]) << 2*j;
+            }
+
+        }
+        result[i] = tmp;
+    }
+}
+
+__global__ void convert_out_to_state_kernel(size_t nbBlocks, unsigned char* cmp, unsigned char* out_state, size_t state_length, int *num_state2blks, int *ncBlocks){
+    for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < state_length; i += blockDim.x*gridDim.x){
+        for (size_t j = 0; j < 4; j++)
+        {
+            if (4*i + j < nbBlocks)
+            {
+                out_state[4*i + j]= (cmp[i] >> 2*j) & 0x03;
+                if (out_state[4*i+j] == 2)
+                {
+                    atomicAdd(num_state2blks, 1);
+                }else if(out_state[4*i+j]==3){
+                    atomicAdd(ncBlocks, 1);
+                }
+
+            }
+
+        }
+    }
+}
+
+// nbBlocks, r, stateNBBytes, stateArray
+__host__ __device__ size_t convert_out_to_state(size_t nbBlocks, unsigned char* cmp, unsigned char* out_state){
+    size_t state_length;
+    if(nbBlocks%4==0)
+        state_length = nbBlocks/4;
+    else
+        state_length = nbBlocks/4+1;
+
+    for (size_t i = 0; i < state_length; i++)
+    {
+        for (size_t j = 0; j < 4; j++)
+        {
+            if (4*i + j < nbBlocks)
+            {
+                out_state[4*i + j]= (cmp[i] >> 2*j) & 0x03;
+            }
+
+        }
+    }
+    return nbBlocks;
+}
+
+__host__ __device__ size_t convert_block2_to_out(unsigned char *result, uint32_t numBlocks, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig){
+    size_t out_length = 0;
+
+    memcpy(result, blk_idx, numBlocks*sizeof(uint32_t));
+    out_length += numBlocks*4;
+    memcpy(result+out_length, blk_vals, num_sig*sizeof(float));
+    out_length += num_sig*sizeof(float);
+    memcpy(result+out_length, blk_subidx, num_sig*sizeof(uint8_t));
+    out_length += num_sig*sizeof(uint8_t);
+    memcpy(result+out_length, blk_sig, numBlocks*sizeof(uint8_t));
+    out_length+= numBlocks*sizeof(uint8_t);
+
+    return out_length;
+}
+
+__global__ void convert_block2_to_out_kernel(unsigned char *result, uint32_t numBlocks, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig){
+
+    size_t out_length = 0;
+    unsigned char *tmp_result = result;
+    for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < numBlocks; i += blockDim.x*gridDim.x){
+        uint32_t local_blkidx = blk_idx[i];
+        tmp_result[4*i] = (local_blkidx) & 0xff;
+        tmp_result[4*i+1] = (local_blkidx >> (8*1)) & 0xff;
+        tmp_result[4*i+2] = (local_blkidx >> (8*2)) & 0xff;
+        tmp_result[4*i+3] = (local_blkidx >> (8*3)) & 0xff;
+    }
+    // memcpy(result, blk_idx, numBlocks*sizeof(uint32_t));
+    out_length += numBlocks*4;
+    tmp_result = result+out_length;
+
+    for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < num_sig; i += blockDim.x*gridDim.x){
+        float value = blk_vals[i];
+        memcpy(&tmp_result[4*i], &value, sizeof(float));
+        //unsigned char *v = ()
+        //tmp_result[(int)4*i] = (unsigned char)((value) & 0xff);
+        //tmp_result[(int)4*i+1] = (unsigned char)((value >> (8*1)) & 0xff);
+
//tmp_result[(int)4*i+2] = (unsigned char)((value >> (8*2)) & 0xff); + //tmp_result[(int)4*i+3] = (unsigned char)((value >> (8*3)) & 0xff); + } + // memcpy(result+out_length, blk_vals, num_sig*sizeof(float)); + out_length += num_sig*sizeof(float); + tmp_result = result+out_length; + + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < num_sig; i += blockDim.x*gridDim.x){ + tmp_result[i] = blk_subidx[i]; + + } + + out_length += num_sig*sizeof(uint8_t); + tmp_result = result+out_length; + + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < numBlocks; i += blockDim.x*gridDim.x){ + tmp_result[i] = blk_sig[i]; + + } + out_length+= numBlocks*sizeof(uint8_t); + + // return out_length; +} + +__global__ void convert_out_to_block2_kernel(unsigned char *in_cmp, uint32_t numBlocks, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig){ + size_t out_length = 0; + + unsigned char *tmp_result = in_cmp; + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < numBlocks; i += blockDim.x*gridDim.x){ + + uint32_t local_blkidx = (tmp_result[4*i] & 0xff) | ((tmp_result[4*i+1] & 0xff) << (8*1)) + | ((tmp_result[4*i+2] & 0xff) << (8*2)) | ((tmp_result[4*i+3] & 0xff) << (8*3)); + blk_idx[i] = local_blkidx; + } + // memcpy(result, blk_idx, numBlocks*sizeof(uint32_t)); + out_length += numBlocks*4; + tmp_result = in_cmp+out_length; + + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < num_sig; i += blockDim.x*gridDim.x){ + float value = 0.0; + memcpy(&value, &tmp_result[4*i], sizeof(float)); + blk_vals[i] = value; + + //unsigned char *v = () + //tmp_result[(int)4*i] = (unsigned char)((value) & 0xff); + //tmp_result[(int)4*i+1] = (unsigned char)((value >> (8*1)) & 0xff); + //tmp_result[(int)4*i+2] = (unsigned char)((value >> (8*2)) & 0xff); + //tmp_result[(int)4*i+3] = (unsigned char)((value >> (8*3)) & 0xff); + } + // memcpy(result+out_length, blk_vals, num_sig*sizeof(float)); + out_length += num_sig*sizeof(float); + tmp_result = in_cmp+out_length; + + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < num_sig; i += blockDim.x*gridDim.x){ + blk_subidx[i] = tmp_result[i]; + + } + + out_length += num_sig*sizeof(uint8_t); + tmp_result = in_cmp+out_length; + + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < numBlocks; i += blockDim.x*gridDim.x){ + blk_sig[i] = tmp_result[i]; + + } + out_length+= numBlocks*sizeof(uint8_t); +} + +__host__ __device__ size_t convert_out_to_block2(unsigned char *in_cmp, uint32_t numBlocks, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig){ + size_t out_length = 0; + memcpy(blk_idx, in_cmp, numBlocks*sizeof(uint32_t)); + out_length += numBlocks*4; + memcpy(blk_vals, in_cmp+out_length,num_sig*sizeof(float)); + out_length += num_sig*sizeof(float); + memcpy(blk_subidx, in_cmp+out_length, num_sig*sizeof(uint8_t)); + out_length += num_sig*sizeof(uint8_t); + memcpy(blk_sig, in_cmp+out_length, numBlocks*sizeof(uint8_t)); + out_length += numBlocks*sizeof(uint8_t); +// printf("outlength: %d\n",out_length); + return out_length; +} + +int _post_proc(float *oriData, unsigned char *meta, short *offsets, unsigned char *midBytes, unsigned char *outBytes, size_t nbEle, int blockSize, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig) +{ + int out_size = 0; + + size_t nbConstantBlocks = 0; + size_t nbBlocks = nbEle/blockSize; + size_t ncBytes = blockSize/4; + size_t mSize = sizeof(float)+1+ncBytes; //Number of bytes for each data block's metadata. 
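For reference, the 2-bit block-state packing implemented by convert_state_to_out / convert_out_to_state above reduces to the following host-side round trip (illustrative only):

    #include <stdio.h>

    int main(void) {
        unsigned char states[6] = {0, 1, 3, 2, 3, 0};   /* per-block states, values 0..3 */
        unsigned char packed[2] = {0, 0};               /* four states per byte          */
        for (int i = 0; i < 6; i++)
            packed[i / 4] |= (states[i] & 0x03) << (2 * (i % 4));
        for (int i = 0; i < 6; i++) {
            unsigned char s = (packed[i / 4] >> (2 * (i % 4))) & 0x03;
            printf("block %d -> state %u\n", i, s);     /* reproduces states[i] */
        }
        return 0;
    }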
+ out_size += 5+sizeof(size_t)+sizeof(float)*nbBlocks; + if (nbBlocks%8==0) + out_size += nbBlocks/8; + else + out_size += nbBlocks/8+1; + int s0 = 0; + int s1 = 0; + int s2 = 0; + int s3 = 0; + for (int i=0; i>>(d_oriData, threshold, nbEle); + // cudaDeviceSynchronize(); + dim3 dimBlock(32, blockSize/32); + dim3 dimGrid(65536, 1); + const int sMemsize = blockSize * sizeof(float) + dimBlock.y * sizeof(int); + compress_float<<>>(d_oriData, d_meta, d_offsets, d_midBytes, absErrBound, blockSize, nbBlocks, mSize, sparsity_level, d_blk_idx, d_blk_subidx,d_blk_vals, threshold, d_blk_sig); + cudaError_t err = cudaGetLastError(); // Get error code + //printf("CUDA Error: %s\n", cudaGetErrorString(err)); + //printf("GPU compression timing: %f ms\n", timer_GPU.GetCounter()); + cudaDeviceSynchronize(); + get_numsig<<<1,1>>>(d_num_sig); + cudaDeviceSynchronize(); + + checkCudaErrors(cudaMemcpy(num_sig, d_num_sig, sizeof(uint64_t), cudaMemcpyDeviceToHost)); + + blk_idx = (uint32_t *)malloc(nbBlocks*sizeof(uint32_t)); + blk_vals= (float *)malloc((*num_sig)*sizeof(float)); + blk_subidx = (uint8_t *)malloc((*num_sig)*sizeof(uint8_t)); + blk_sig = (uint8_t *)malloc(nbBlocks*sizeof(uint8_t)); + + checkCudaErrors(cudaMemcpy(meta, d_meta, msz, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(offsets, d_offsets, nbBlocks*sizeof(short), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(midBytes, d_midBytes, mbsz, cudaMemcpyDeviceToHost)); + + + checkCudaErrors(cudaMemcpy(blk_idx, d_blk_idx, nbBlocks*sizeof(uint32_t), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(blk_vals,d_blk_vals, (*num_sig)*sizeof(float), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(blk_subidx,d_blk_subidx, (*num_sig)*sizeof(uint8_t), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(blk_sig,d_blk_sig, (nbBlocks)*sizeof(uint8_t), cudaMemcpyDeviceToHost)); + + size_t maxPreservedBufferSize = sizeof(float)*nbEle; + unsigned char* outBytes = (unsigned char*)malloc(maxPreservedBufferSize); + memset(outBytes, 0, maxPreservedBufferSize); + + outSize = (size_t *)malloc(sizeof(size_t)); + //outSize[0] = _post_proc(oriData, meta, offsets, midBytes, outBytes, nbEle, blockSize, *num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); + + *outSize = _post_proc(oriData, meta, offsets, midBytes, outBytes, nbEle, blockSize, *num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); +// printf("Beginning free\n"); + // printf("outsize %p \n", outBytes); + free(blk_idx); + free(blk_subidx); + free(blk_vals); + free(meta); + free(offsets); + free(midBytes); + checkCudaErrors(cudaFree(d_meta)); + checkCudaErrors(cudaFree(d_offsets)); + checkCudaErrors(cudaFree(d_midBytes)); + return outBytes; +} + +void cuSZx_fast_decompress_args_unpredictable_blocked_float(float** newData, size_t nbEle, unsigned char* cmpBytes) +{ + uint32_t *blk_idx, *d_blk_idx; + uint8_t *blk_subidx, *d_blk_subidx; + uint8_t *blk_sig, *d_blk_sig; + float *blk_vals, *d_blk_vals; + size_t num_sig, *d_num_sig; + + *newData = (float*)malloc(sizeof(float)*nbEle); + memset(*newData, 0, sizeof(float)*nbEle); + + unsigned char* r = cmpBytes; + r += 4; + int blockSize = r[0]; //get block size + if(blockSize == 0)blockSize = 256; + r++; + size_t nbConstantBlocks = bytesToLong_bigEndian(r); //get number of constant blocks + r += sizeof(size_t); + num_sig = bytesToSize(r); + r += sizeof(size_t); + size_t nbBlocks = nbEle/blockSize; + size_t ncBlocks = 0; + size_t num_state2_blks = 0; + // size_t ncBlocks = nbBlocks - nbConstantBlocks; //get number of constant blocks + size_t 
stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; + size_t ncLeading = blockSize/4; + size_t mSize = sizeof(float)+1+ncLeading; //Number of bytes for each data block's metadata. + unsigned char* stateArray = (unsigned char*)malloc(nbBlocks); + unsigned char* d_stateArray; + cudaMalloc(&d_stateArray, nbBlocks); + float* constantMedianArray = (float*)malloc(nbConstantBlocks*sizeof(float)); + + + + blk_idx = (uint32_t *)malloc(nbBlocks*sizeof(uint32_t)); + blk_vals= (float *)malloc((num_sig)*sizeof(float)); + blk_subidx = (uint8_t *)malloc((num_sig)*sizeof(uint8_t)); + blk_sig = (uint8_t *)malloc(nbBlocks*sizeof(uint8_t)); + + // printf("Converting state array\n"); + convert_out_to_state(nbBlocks, r, stateArray); + // convertByteArray2IntArray_fast_1b_args(nbBlocks, r, stateNBBytes, stateArray); //get the stateArray + for (size_t i = 0; i < nbBlocks; i++) + { + if (stateArray[i] == 2) + { + num_state2_blks++; + }else if(stateArray[i] == 3){ + ncBlocks++; + } + } + + r += stateNBBytes; + unsigned char* data = (unsigned char*)malloc(ncBlocks*blockSize*sizeof(float)); + memset(data, 0, ncBlocks*blockSize*sizeof(float)); + // printf("converting block vals\n"); + size_t to_add = convert_out_to_block2(r, nbBlocks, (uint64_t)num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); + r+= to_add; + // checkCudaErrors(cudaMalloc((void **)&d_num_sig, sizeof(uint64_t))); + // num_sig = (uint64_t *)malloc(sizeof(uint64_t)); + checkCudaErrors(cudaMalloc((void **)&d_blk_idx, nbBlocks*sizeof(uint32_t))); + // blk_idx = malloc() + checkCudaErrors(cudaMalloc((void **)&d_blk_subidx, num_sig*sizeof(uint8_t))); + + checkCudaErrors(cudaMalloc((void **)&d_blk_vals, num_sig*sizeof(float))); + + checkCudaErrors(cudaMalloc((void **)&d_blk_sig, nbBlocks*sizeof(uint8_t))); + + checkCudaErrors(cudaMemcpy(d_blk_idx, blk_idx, nbBlocks*sizeof(uint32_t), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_blk_vals, blk_vals, (num_sig)*sizeof(float), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_blk_subidx, blk_subidx, (num_sig)*sizeof(uint8_t), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_stateArray, stateArray, nbBlocks, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_blk_sig, blk_sig, nbBlocks*sizeof(uint8_t), cudaMemcpyHostToDevice)); + + + size_t i = 0, j = 0, k = 0; //k is used to keep track of constant block index + memcpy((*newData)+nbBlocks*blockSize, r, (nbEle%blockSize)*sizeof(float)); + r += (nbEle%blockSize)*sizeof(float); + float* fr = (float*)r; //fr is the starting address of constant median values. 
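The trailing nbEle % blockSize elements never form a full block; the copy above moves them through verbatim, uncompressed. With illustrative numbers:

    /* Example numbers only. */
    size_t nbEle     = 1000;
    int    blockSize = 256;
    size_t nbBlocks  = nbEle / blockSize;   /* 3 full blocks              */
    size_t tail      = nbEle % blockSize;   /* 232 floats stored verbatim */
    /* memcpy(newData + nbBlocks*blockSize, r, tail*sizeof(float)); r += tail*sizeof(float); */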
+ for(i = 0;i < nbConstantBlocks;i++, j+=4) //get the median values for constant-value blocks + constantMedianArray[i] = fr[i]; + r += nbConstantBlocks*sizeof(float); + unsigned char* p = r + ncBlocks * sizeof(short); + for(i = 0;i < ncBlocks;i++){ + int leng = (int)bytesToShort(r)+mSize; + r += sizeof(short); + if (leng > blockSize*sizeof(float)) + { + printf("Warning: compressed block is larger than the original block!\n"); + exit(0); + } + memcpy(data+i*blockSize*sizeof(float), p, leng); + p += leng; + } + + unsigned char* d_data; + float *d_newdata; + checkCudaErrors(cudaMalloc((void**)&d_data, ncBlocks*blockSize*sizeof(float))); + checkCudaErrors(cudaMemcpy(d_data, data, ncBlocks*blockSize*sizeof(float), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMalloc(&d_newdata, nbBlocks*blockSize*sizeof(float))); + + timer_GPU.StartCounter(); + dim3 dimBlock(32, blockSize/32); + dim3 dimGrid(65536, 1); + const int sMemsize = blockSize * sizeof(float) + dimBlock.y * sizeof(int); + decompress_state2<<>>(d_newdata, d_stateArray,d_blk_idx, d_blk_vals, d_blk_subidx,blockSize, d_blk_sig); + decompress_float<<>>(d_data, blockSize, ncBlocks, mSize); + cudaError_t err = cudaGetLastError(); // Get error code + //printf("CUDA Error: %s\n", cudaGetErrorString(err)); + //printf("GPU decompression timing: %f ms\n", timer_GPU.GetCounter()); + cudaDeviceSynchronize(); + checkCudaErrors(cudaMemcpy(data, d_data, ncBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(*newData, d_newdata, nbBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); + float* fdata = (float*)data; + + int nb=0, nc=0; + for (i=0;i1) printf("data%i:%f\n",i, Median); + for (j=0;j>56); + b[1] = (unsigned char)(num>>48); + b[2] = (unsigned char)(num>>40); + b[3] = (unsigned char)(num>>32); + b[4] = (unsigned char)(num>>24); + b[5] = (unsigned char)(num>>16); + b[6] = (unsigned char)(num>>8); + b[7] = (unsigned char)(num); +// if(dataEndianType==LITTLE_ENDIAN_DATA) +// symTransform_8bytes(*b); +} + +inline void longToBytes_bigEndian_memset(unsigned char *b, unsigned long num) +{ + checkCudaErrors(cudaMemset(&b[0], (unsigned char)(num>>56), sizeof(char))); + checkCudaErrors(cudaMemset(&b[1], (unsigned char)(num>>48), sizeof(char))); + checkCudaErrors(cudaMemset(&b[2], (unsigned char)(num>>40), sizeof(char))); + checkCudaErrors(cudaMemset(&b[3], (unsigned char)(num>>32), sizeof(char))); + checkCudaErrors(cudaMemset(&b[4], (unsigned char)(num>>24), sizeof(char))); + checkCudaErrors(cudaMemset(&b[5], (unsigned char)(num>>16), sizeof(char))); + checkCudaErrors(cudaMemset(&b[6], (unsigned char)(num>>8), sizeof(char))); + checkCudaErrors(cudaMemset(&b[7], (unsigned char)(num), sizeof(char))); +// if(dataEndianType==LITTLE_ENDIAN_DATA) +// symTransform_8bytes(*b); +} + +__device__ inline void shortToBytes_d(unsigned char* b, short value) +{ + lint16 buf; + buf.svalue = value; + memcpy(b, buf.byte, 2); +} + + + +__global__ void getNumNonConstantBlocks(size_t nbBlocks, short *offsets, unsigned char *meta, int blockSize, int *nonconstant, int *out_size){ + for (int tid = blockDim.x*blockIdx.x + threadIdx.x; tid < nbBlocks; tid += blockDim.x*gridDim.x){ + if (meta[tid] == 3){ + atomicAdd(nonconstant, 1); + atomicAdd(out_size,1+(blockSize/4)+offsets[tid]); + } + } +} + +__global__ void generateFlags(unsigned char *states, uint64_t *cBlk_flags, uint64_t *ncBlk_flags,uint64_t* offset_indices,short* offsets, size_t nbBlocks){ + for (int tid = blockDim.x*blockIdx.x + threadIdx.x; tid < nbBlocks; tid += 
blockDim.x*gridDim.x){ + if (states[tid] == 0 || states[tid] == 1) + { + cBlk_flags[tid] = 1; + ncBlk_flags[tid] = 0; + offset_indices[tid] = 0; + }else if(states[tid]==3){ + ncBlk_flags[tid] = 1; + cBlk_flags[tid] = 0; + offset_indices[tid] = (uint64_t) offsets[tid]; + }else{ + cBlk_flags[tid] = 0; + ncBlk_flags[tid] = 0; + offset_indices[tid] = 0; + } + + } +} + +__global__ void nccopy_kernel2(unsigned char * c, unsigned char* o, unsigned char *nc, unsigned char* midBytes, unsigned char* meta, + size_t nbBlocks, int blockSize, short *offsets, size_t mSize, uint64_t *cBlk_indices, uint64_t *ncBlk_indices, uint64_t* offset_indices){ + // printf("blockdim %d blockidx %d threadidx %d griddim %d\n", blockDim.x, blockIdx.x, threadIdx.x, gridDim.x); + int i; + int num_threads = (blockDim.x*gridDim.x); + int tid = blockDim.x*blockIdx.x + threadIdx.x; + int blocks_per_thread = nbBlocks/num_threads; + int start_idx = tid*blocks_per_thread; + int end_idx = start_idx+blocks_per_thread; + + if (tid == num_threads-1) + { + end_idx = nbBlocks; + } + + unsigned char* tmp_o = o+(sizeof(short)*ncBlk_indices[start_idx]); + unsigned char* tmp_nc= nc+(mSize*ncBlk_indices[i] + offset_indices[i]*ncBlk_indices[i]); + for (i=start_idx; i>>(meta, cBlk_indices, ncBlk_indices, offset_indices, offsets, nbBlocks); + cudaDeviceSynchronize(); + + thrust::exclusive_scan(thrust::device, cBlk_indices, cBlk_indices + nbBlocks, cBlk_indices, 0); + thrust::exclusive_scan(thrust::device, ncBlk_indices, ncBlk_indices + nbBlocks, ncBlk_indices, 0); + thrust::exclusive_scan(thrust::device, offset_indices, offset_indices + nbBlocks, offset_indices, 0); + + nccopy_kernel<<>>(c, o, nc, midBytes, meta, nbBlocks, blockSize, offsets, mSize, cBlk_indices,ncBlk_indices,offset_indices,final_nc); + // nccopy_kernel2<<<1,1>>>(c, o, nc, midBytes, meta, nbBlocks, blockSize, offsets, mSize, cBlk_indices,ncBlk_indices,offset_indices); + + cudaDeviceSynchronize(); + + //printf("nc: %p\n", nc); + // printf("%s\n", cudaGetErrorString(cudaGetLastError())); + // set_nc<<<1,1>>>(nc, offsets, offset_indices, ncBlk_indices, mSize, nbBlocks); + // cudaDeviceSynchronize(); + // printf("ncblockcpy: %f ms\n", timer2.GetCounter()); + checkCudaErrors(cudaFree(cBlk_indices)); + checkCudaErrors(cudaFree(ncBlk_indices)); + checkCudaErrors(cudaFree(offset_indices)); +} + +void ncblkCopy_h(unsigned char * c, unsigned char* o, unsigned char *nc, unsigned char* midBytes, unsigned char* meta, + size_t nbBlocks, int blockSize, short *offsets, size_t mSize){ + unsigned char *tmp_states; + unsigned char *ncold = nc; + uint64_t col_off = 0; + short *tmp_offsets; + tmp_offsets = (short*)malloc(sizeof(short)*nbBlocks); + tmp_states = (unsigned char *)malloc(sizeof(char)*nbBlocks); + checkCudaErrors(cudaMemcpy(tmp_states, meta, sizeof(char)*nbBlocks, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(tmp_offsets,offsets,sizeof(short)*nbBlocks,cudaMemcpyDeviceToHost)); + cudaStream_t stream[3]; + cudaStreamCreate(&stream[0]); + cudaStreamCreate(&stream[1]); + cudaStreamCreate(&stream[2]); + + //printf("here\n"); + //checkCudaErrors(cudaMemcpy((void**)&d_offsets, nbBlocks*sizeof(short))); + for (int i = 0; i < nbBlocks; i++) + { + if(tmp_states[i]==3){ + // shortToBytes_d(o, offsets[i]); + // buf = (unsigned char*) + +// printf("here2\n"); + cudaMemcpyAsync(o, offsets+i, 2, cudaMemcpyDeviceToDevice, stream[0]); + o += sizeof(short); + + // printf("here2.1\n"); + // printf("offsets %ld\n", col_off); + cudaMemcpyAsync(nc, meta+(nbBlocks+i*mSize), mSize, 
cudaMemcpyDeviceToDevice, stream[1]); + // memcpy(nc, meta+(nbBlocks+i*mSize), mSize); + + nc += mSize; + + // printf("here2.2\n"); + //checkCudaErrors(cudaMemcpy(buf, offsets+i, sizeof(short), cudaMemcpyDeviceToHost)); + + // //printf("here2.3 %d\n", buf); + cudaMemcpyAsync(nc, midBytes+(i*blockSize*sizeof(float)), (int)tmp_offsets[i], cudaMemcpyDeviceToDevice, stream[2]); + // memcpy(nc, midBytes+(i*blockSize*sizeof(float)), offsets[i]); + nc += tmp_offsets[i]; + col_off+=tmp_offsets[i]; + +/// printf("here2.4\n"); + } + } + cudaStreamDestroy(stream[0]); + cudaStreamDestroy(stream[1]); + cudaStreamDestroy(stream[2]); + + free(tmp_states); + free(tmp_offsets); +} + +__global__ void ncblkCopy(unsigned char * c, unsigned char* o, unsigned char *nc, unsigned char* midBytes, unsigned char* meta, + size_t nbBlocks, int blockSize, short *offsets, size_t mSize) +{ + for (int i=blockDim.x*blockIdx.x + threadIdx.x; i>>(nbBlocks, offsets, meta, blockSize, nonconstant_d, out_size_d); + cudaDeviceSynchronize(); + + checkCudaErrors(cudaMemcpy(&nonconstant_h, nonconstant_d, sizeof(int), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(&tmp_outsize, out_size_d, sizeof(int), cudaMemcpyDeviceToHost)); + + nbConstantBlocks = nbBlocks - nonconstant_h; + out_size_h+=tmp_outsize; + + out_size_h += (nbBlocks-nbConstantBlocks)*sizeof(short)+(nbEle%blockSize)*sizeof(float); + + //outBytes = (unsigned char*)malloc(out_size); + unsigned char* r = outBytes; + unsigned char* r_old = outBytes; + // cudaDeviceSynchronize(); printf("%s\n",cudaGetLastError()); + checkCudaErrors(cudaMemset(r, SZx_VER_MAJOR, sizeof(char))); + checkCudaErrors(cudaMemset(r+1, SZx_VER_MINOR, sizeof(char))); + checkCudaErrors(cudaMemset(r+2, 1, sizeof(char))); + checkCudaErrors(cudaMemset(r+3, 0, sizeof(char))); + checkCudaErrors(cudaMemset(r+4, blockSize, sizeof(char))); + + r=r+5; //1 byte + //sizeToBytes(r, nbConstantBlocks); + longToBytes_bigEndian_memset(r, nbConstantBlocks); + r += sizeof(size_t); + //sizeToBytes(r, (size_t) num_sig); + longToBytes_bigEndian_memset(r, (unsigned long)num_sig); + r += sizeof(size_t); + size_t out_length; + + if(nbBlocks%4==0) + out_length = nbBlocks/4; + else + out_length = nbBlocks/4+1; + + convert_state_to_out_kernel<<>>(meta, nbBlocks, r, out_length); + r+=out_length; + convert_block2_to_out_kernel<<>>(r, nbBlocks,num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); + r += nbBlocks*4 + num_sig*sizeof(float) + num_sig*sizeof(uint8_t) + nbBlocks*sizeof(uint8_t); + + checkCudaErrors(cudaMemcpy(r, oriData+nbBlocks*blockSize, (nbEle%blockSize)*sizeof(float), cudaMemcpyDeviceToDevice)); + // memcpy(r, oriData+nbBlocks*blockSize, (nbEle%blockSize)*sizeof(float)); + r += (nbEle%blockSize)*sizeof(float); + unsigned char* c = r; + unsigned char* o = c+nbConstantBlocks*sizeof(float); + unsigned char* nc = o+(nbBlocks-nbConstantBlocks)*sizeof(short); + // ncblkCopy<<<1,1>>>(c, o, nc, midBytes, meta,nbBlocks, blockSize, offsets, mSize); + + // ncblkCopy_h(c, o, nc, midBytes, meta,nbBlocks, blockSize, offsets, mSize); + ncblkCopy_fast(c, o, nc, midBytes, meta,nbBlocks, blockSize, offsets, mSize, nc_diff); + // cudaDeviceSynchronize(); + size_t h_nc_diff; + cudaMemcpy(&h_nc_diff,nc_diff, sizeof(size_t),cudaMemcpyDeviceToHost); + return (size_t) (nc+h_nc_diff-r_old); + // checkCudaErrors(cudaMemcpy(outSize, (size_t)(nc-r_old), sizeof(size_t))); + // *outSize = (size_t) (nc-r_old); + // return outBytes; +} + +__global__ void device_post_proc(size_t *outSize, float *oriData, unsigned char *meta, + short *offsets, 
unsigned char *midBytes, unsigned char *outBytes, + size_t nbEle, int blockSize, uint64_t num_sig, uint32_t *blk_idx, + float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig) +{ + int out_size = 0; + + size_t nbConstantBlocks = 0; + size_t nbBlocks = nbEle/blockSize; + size_t ncBytes = blockSize/4; + size_t mSize = sizeof(float)+1+ncBytes; //Number of bytes for each data block's metadata. + out_size += 5+sizeof(size_t)+sizeof(float)*nbBlocks; + if (nbBlocks%8==0) + out_size += nbBlocks/8; + else + out_size += nbBlocks/8+1; + int s0 = 0; + int s1 = 0; + int s2 = 0; + int s3 = 0; + for (int i=0; i()); +// // dmin = thrust::reduce(oriData, oriData+nbEle, 1, thrust::minimum()); +// cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, oriData, dmax, nbEle); +// cudaMalloc(&d_temp_storage, temp_storage_bytes); +// cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, oriData, dmax, nbEle); + +// cudaFree(d_temp_storage); +// cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, oriData, dmin, nbEle); +// cudaMalloc(&d_temp_storage, temp_storage_bytes); +// cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, oriData, dmin, nbEle); + +// cudaFree(d_temp_storage); +// // thrust::pair result = thrust::minmax_element(thrust::device, oriData,oriData+nbEle); +// //printf("here\n"); +// cudaMemcpy(hmin, dmin, sizeof(float), cudaMemcpyDeviceToHost); +// cudaMemcpy(hmax, dmax,sizeof(float), cudaMemcpyDeviceToHost); +// absErrBound = absErrBound*(hmax-hmin); +// threshold = threshold*(hmax-hmin); + // // printf("%f\n",absErrBound); + // cudaFree(dmin); + // cudaFree(dmax); + float sparsity_level = SPARSITY_LEVEL; + + // Set the input data as the function parameter, this should be a device pointer + + float* d_oriData = oriData; + // cudaMalloc((void**)&d_oriData, sizeof(float)*nbEle); + // cudaMemcpy(d_oriData, oriData, sizeof(float)*nbEle, cudaMemcpyHostToDevice); + + size_t nbBlocks = nbEle/blockSize; + size_t remainCount = nbEle%blockSize; + size_t actualNBBlocks = remainCount==0 ? nbBlocks : nbBlocks+1; + + size_t ncBytes = blockSize/4; + //ncBytes = (blockSize+1)%4==0 ? ncBytes : ncBytes+1; //Bytes to store one non-constant block data. + size_t mSize = sizeof(float)+1+ncBytes; //Number of bytes for each data block's metadata. 
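/* A worked example of the sizes just derived, assuming blockSize = 256 (the
 * value the decompressor falls back to when the header stores 0):
 *   ncBytes = 256 / 4    = 64 bytes of required-byte info per block
 *   mSize   = 4 + 1 + 64 = 69 bytes of per-block metadata
 *                               (float median + 1 state byte + ncBytes)
 * For a 1 GiB float input (nbEle = 2^28, nbBlocks = 2^20) the buffers sized
 * just below come out to roughly
 *   msz  = (1 + mSize) * nbBlocks = 70 * 2^20 bytes  (~70 MiB of metadata)
 *   mbsz = sizeof(float) * nbEle  = 2^30 bytes       (1 GiB of mid-byte scratch)
 */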
+ size_t msz = (1+mSize) * nbBlocks * sizeof(unsigned char); + size_t mbsz = sizeof(float) * nbEle * sizeof(unsigned char); + + // These are host pointers and do not need to be allocated + + // unsigned char *meta = (unsigned char*)malloc(msz); + // short *offsets = (short*)malloc(nbBlocks*sizeof(short)); + // unsigned char *midBytes = (unsigned char*)malloc(mbsz); + + unsigned char* d_meta; + unsigned char* d_midBytes; + short* d_offsets; + + uint32_t *blk_idx, *d_blk_idx; + uint8_t *blk_sig, *d_blk_sig; + uint8_t *blk_subidx, *d_blk_subidx; + float *blk_vals, *d_blk_vals; + uint64_t *num_sig, *d_num_sig; + + checkCudaErrors(cudaMalloc((void **)&d_num_sig, sizeof(uint64_t))); + num_sig = (uint64_t *)malloc(sizeof(uint64_t)); + checkCudaErrors(cudaMalloc((void **)&d_blk_idx, nbBlocks*sizeof(uint32_t))); + // blk_idx = malloc() + checkCudaErrors(cudaMalloc((void **)&d_blk_subidx, nbEle*sizeof(uint8_t))); + + checkCudaErrors(cudaMalloc((void **)&d_blk_vals, nbEle*sizeof(float))); + + checkCudaErrors(cudaMalloc((void **)&d_blk_sig, nbBlocks*sizeof(uint8_t))); + + checkCudaErrors(cudaMalloc((void**)&d_meta, msz)); + //checkCudaErrors(cudaMemcpy(d_meta, meta, msz, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemset(d_meta, 0, msz)); + checkCudaErrors(cudaMalloc((void**)&d_offsets, nbBlocks*sizeof(short))); + checkCudaErrors(cudaMemset(d_offsets, 0, nbBlocks*sizeof(short))); + checkCudaErrors(cudaMalloc((void**)&d_midBytes, mbsz)); + checkCudaErrors(cudaMemset(d_midBytes, 0, mbsz)); + + + // apply_threshold<<<80,256>>>(d_oriData, threshold, nbEle); + // cudaDeviceSynchronize(); + dim3 dimBlock(32, blockSize/32); + dim3 dimGrid(65536, 1); + const int sMemsize = blockSize * sizeof(float) + dimBlock.y * sizeof(int); + //printf("Malloc end timestamp: %f ms\n", timer_GPU.GetCounter()); + compress_float<<>>(d_oriData, d_meta, d_offsets, d_midBytes, absErrBound, blockSize, nbBlocks, mSize, sparsity_level, d_blk_idx, d_blk_subidx,d_blk_vals, threshold, d_blk_sig); + cudaError_t err = cudaGetLastError(); // Get error code + // printf("CUDA Error: %s\n", cudaGetErrorString(err)); + //printf("GPU compression timestamp: %f ms\n", timer_GPU.GetCounter()); + cudaDeviceSynchronize(); + get_numsig<<<1,1>>>(d_num_sig); + cudaDeviceSynchronize(); + + checkCudaErrors(cudaMemcpy(num_sig, d_num_sig, sizeof(uint64_t), cudaMemcpyDeviceToHost)); + + // These are allocations and memcpys to host pointers, do not need them + + // blk_idx = (uint32_t *)malloc(nbBlocks*sizeof(uint32_t)); + // blk_vals= (float *)malloc((*num_sig)*sizeof(float)); + // blk_subidx = (uint8_t *)malloc((*num_sig)*sizeof(uint8_t)); + // blk_sig = (uint8_t *)malloc(nbBlocks*sizeof(uint8_t)); + + // checkCudaErrors(cudaMemcpy(meta, d_meta, msz, cudaMemcpyDeviceToHost)); + // checkCudaErrors(cudaMemcpy(offsets, d_offsets, nbBlocks*sizeof(short), cudaMemcpyDeviceToHost)); + // checkCudaErrors(cudaMemcpy(midBytes, d_midBytes, mbsz, cudaMemcpyDeviceToHost)); + + + // checkCudaErrors(cudaMemcpy(blk_idx, d_blk_idx, nbBlocks*sizeof(uint32_t), cudaMemcpyDeviceToHost)); + // checkCudaErrors(cudaMemcpy(blk_vals,d_blk_vals, (*num_sig)*sizeof(float), cudaMemcpyDeviceToHost)); + // checkCudaErrors(cudaMemcpy(blk_subidx,d_blk_subidx, (*num_sig)*sizeof(uint8_t), cudaMemcpyDeviceToHost)); + // checkCudaErrors(cudaMemcpy(blk_sig,d_blk_sig, (nbBlocks)*sizeof(uint8_t), cudaMemcpyDeviceToHost)); + + + size_t maxPreservedBufferSize = sizeof(float)*nbEle; + unsigned char *d_outBytes; + // unsigned char* outBytes = (unsigned char*)malloc(maxPreservedBufferSize); + // 
memset(outBytes, 0, maxPreservedBufferSize); + checkCudaErrors(cudaMalloc(&d_outBytes, maxPreservedBufferSize)); + + size_t *d_outSize; + + checkCudaErrors(cudaMalloc(&d_outSize, sizeof(size_t))); + + // device_post_proc<<<1,1>>>(d_outSize, d_oriData, d_meta, d_offsets, d_midBytes, d_outBytes, nbEle, blockSize, *num_sig, d_blk_idx, d_blk_vals, d_blk_subidx, d_blk_sig); + *outSize = better_post_proc(d_outSize, d_oriData, d_meta, d_offsets, d_midBytes, d_outBytes, nbEle, blockSize, *num_sig, d_blk_idx, d_blk_vals, d_blk_subidx, d_blk_sig); + //cudaDeviceSynchronize(); + + //checkCudaErrors(cudaMemcpy(outSize, d_outSize, sizeof(size_t), cudaMemcpyDeviceToHost)); + + // printf("completed compression\n"); + //free(blk_idx); + //free(blk_subidx); + //free(blk_vals); + // free(meta); + // free(offsets); + // free(midBytes); + checkCudaErrors(cudaFree(d_num_sig)); + checkCudaErrors(cudaFree(d_blk_idx)); + checkCudaErrors(cudaFree(d_blk_subidx)); + checkCudaErrors(cudaFree(d_blk_vals)); + checkCudaErrors(cudaFree(d_blk_sig)); + + checkCudaErrors(cudaFree(d_meta)); + checkCudaErrors(cudaFree(d_offsets)); + checkCudaErrors(cudaFree(d_midBytes)); + + unsigned char *d_newout; + + *outSize = *outSize; + size_t os = *outSize; + + checkCudaErrors(cudaMalloc(&d_newout, os)); + //fin_copy<<<40,256>>>(d_outBytes, d_newout,os); + checkCudaErrors(cudaMemcpy(d_newout, d_outBytes, os, cudaMemcpyDeviceToDevice)); + cudaDeviceSynchronize(); + + checkCudaErrors(cudaFree(d_outBytes)); + printf("Compression end timestamp: %f ms\n", timer_GPU.GetCounter()); + + err = cudaGetLastError(); // Get error code + printf("CUDA Error: %s\n", cudaGetErrorString(err)); + return d_newout; + //return d_outBytes; +} + +__device__ inline long bytesToLong_bigEndian(unsigned char* b) { + long temp = 0; + long res = 0; + + res <<= 8; + temp = b[0] & 0xff; + res |= temp; + + res <<= 8; + temp = b[1] & 0xff; + res |= temp; + + res <<= 8; + temp = b[2] & 0xff; + res |= temp; + + res <<= 8; + temp = b[3] & 0xff; + res |= temp; + + res <<= 8; + temp = b[4] & 0xff; + res |= temp; + + res <<= 8; + temp = b[5] & 0xff; + res |= temp; + + res <<= 8; + temp = b[6] & 0xff; + res |= temp; + + res <<= 8; + temp = b[7] & 0xff; + res |= temp; + + return res; +} + +__device__ inline size_t bytesToSize(unsigned char* bytes) +{ + size_t result = bytesToLong_bigEndian(bytes);//8 + return result; +} + +__device__ inline short bytesToShort(unsigned char* bytes) +{ + lint16 buf; + memcpy(buf.byte, bytes, 2); + + return buf.svalue; +} + +__global__ void decompress_get_stats(float *newData, size_t nbEle, unsigned char* cmpBytes, + size_t *numSigValues, int *bs, + size_t *numConstantBlks, size_t *numBlks, + size_t *mSizeptr, unsigned char *newCmpBytes +){ + unsigned char* r = cmpBytes; + + size_t num_sig; + r += 4; + int blockSize = (int) r[0]; //get block size + + if(blockSize == 0)blockSize = 256; + r++; + size_t nbConstantBlocks = bytesToLong_bigEndian(r); //get number of constant blocks + r += sizeof(size_t); + num_sig = bytesToSize(r); + + r += sizeof(size_t); + size_t nbBlocks = nbEle/blockSize; + size_t ncBlocks = 0; + size_t num_state2_blks = 0; + // size_t ncBlocks = nbBlocks - nbConstantBlocks; //get number of constant blocks + size_t stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; + size_t ncLeading = blockSize/4; + size_t mSize = sizeof(float)+1+ncLeading; //Number of bytes for each data block's metadata. 
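/* Layout of the compressed-stream header as parsed above (inferred from this
 * kernel and the matching writes in the compression path; treat the exact
 * offsets as a reading of the code, not a documented format):
 *   bytes 0-1 : SZx version (major, minor)
 *   bytes 2-3 : two flag/reserved bytes
 *   byte  4   : blockSize (0 is read back as 256)
 *   8 bytes   : nbConstantBlocks, big-endian
 *   8 bytes   : num_sig (number of significant values), big-endian
 *   then      : the 2-bit-per-block state array (stateNBBytes = ceil(nbBlocks/4)),
 *               the significant-value tables, the nbEle % blockSize trailing
 *               elements stored raw, the constant-block medians, the per-block
 *               length shorts, and finally the packed non-constant blocks.
 */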
+ + *mSizeptr = mSize; + + *numConstantBlks = nbConstantBlocks; + *numBlks = nbBlocks; + *numSigValues = num_sig; + *bs = blockSize; + newCmpBytes = r; + +} + + void setup_data_stateArray_better(float *newData, size_t nbEle, unsigned char* r, + size_t num_sig, int blockSize, + size_t nbConstantBlocks, size_t nbBlocks, size_t *ncBlks, + unsigned char *stateArray, unsigned char *newR +){ + + //printf("ma\n"); + // blockSize = 256; + r += 4; + r++; + r += sizeof(size_t); + r += sizeof(size_t); + int ncBlocks, *ncBlocks_d; + size_t stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; + int num_state2_blks, *num_state2_d; + checkCudaErrors(cudaMalloc((void **)&num_state2_d, sizeof(int))); + checkCudaErrors(cudaMalloc((void **)&ncBlocks_d, sizeof(int))); + checkCudaErrors(cudaMemset(num_state2_d, 0, sizeof(int))); + checkCudaErrors(cudaMemset(ncBlocks_d, 0, sizeof(int))); + + //printf("ma2\n"); +// printf("Converting state array\n"); + // printf("cmp %d\n", (int)r[0]); + // printf("state %d\n", (int)stateArray[0]); + // convert_out_to_state(nbBlocks, r, stateArray); + convert_out_to_state_kernel<<>>(nbBlocks,r,stateArray,stateNBBytes, + num_state2_d, ncBlocks_d); + // printf("state %d\n", (int)stateArray[0]); + // convertByteArray2IntArray_fast_1b_args(nbBlocks, r, stateNBBytes, stateArray); //get the stateArray + cudaDeviceSynchronize(); + + //printf("ma3\n"); + r += stateNBBytes; + newR = r; + cudaMemcpy(&ncBlocks, ncBlocks_d, sizeof(int), cudaMemcpyDeviceToHost); + + //printf("ma4\n"); + *ncBlks = ncBlocks; + + //printf("ma4\n"); + } + +__global__ void setup_data_stateArray(float *newData, size_t nbEle, unsigned char* r, + size_t num_sig, int blockSize, + size_t nbConstantBlocks, size_t nbBlocks, size_t *ncBlks, + unsigned char *stateArray, unsigned char *newR +){ + // blockSize = 256; + r += 4; + r++; + r += sizeof(size_t); + r += sizeof(size_t); + size_t ncBlocks = 0; + size_t stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; + size_t num_state2_blks = 0; +// printf("Converting state array\n"); + // printf("cmp %d\n", (int)r[0]); + // printf("state %d\n", (int)stateArray[0]); + convert_out_to_state(nbBlocks, r, stateArray); + // convert_out_to_state_kernel<<<40,256>>>(nbBlocks,r,stateArray,stateNBBytes); + // printf("state %d\n", (int)stateArray[0]); + // convertByteArray2IntArray_fast_1b_args(nbBlocks, r, stateNBBytes, stateArray); //get the stateArray + for (size_t i = 0; i < nbBlocks; i++) + { + if (stateArray[i] == 2) + { + num_state2_blks++; + }else if(stateArray[i] == 3){ + ncBlocks++; + } + } + + r += stateNBBytes; + newR = r; + *ncBlks = ncBlocks; +} + +__global__ void decomp_startup_kernel(unsigned char* r, size_t nbConstantBlocks, +unsigned char *data, int blockSize, size_t mSize, size_t ncBlocks, float *constantMedianArray, uint64_t* g_leng){ + unsigned char * fr = r; //fr is the starting address of constant median values. 
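/* The byte-by-byte staging into tmp_r below is effectively an unaligned float
 * load: fr walks a packed byte stream, so it is not guaranteed to be 4-byte
 * aligned, and a direct ((float*)fr) dereference could misread or fault on the
 * device. An equivalent helper (a sketch, not part of this patch) would be:
 *
 *   __device__ static inline float load_float_unaligned(const unsigned char *p)
 *   {
 *       float v;
 *       memcpy(&v, p, sizeof(float));  // memcpy tolerates unaligned sources
 *       return v;
 *   }
 *
 * used as constantMedianArray[i] = load_float_unaligned(fr + 4*i); in the
 * grid-stride loop that follows.
 */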
+ int i = 0, j = 0, k = 0; + // printf("%p\n", r); + unsigned char tmp_r[4]; + tmp_r[0]=fr[0]; + tmp_r[1]=fr[1]; + tmp_r[2]=fr[2]; + tmp_r[3]=fr[3]; + + +// printf("nbconstant: %f\n", ((float*)tmp_r)[0]); +// nbConstantBlocks + for(i = blockDim.x*blockIdx.x + threadIdx.x; i < nbConstantBlocks; i += blockDim.x*gridDim.x){ //get the median values for constant-value blocks + + tmp_r[0]=fr[4*i]; + tmp_r[1]=fr[4*i+1]; + tmp_r[2]=fr[4*i+2]; + tmp_r[3]=fr[4*i+3]; + float tmp = ((float*)tmp_r)[0]; + constantMedianArray[i] = tmp; + //printf("%d %f\n", i, tmp); + } + + +/** PROBLEM AREA, CAN FIX WITH PARALLELIZATION BUT WATCH *FR and *P **/ + + // if(threadIdx.x==0 && blockIdx.x==0){ + fr += nbConstantBlocks*sizeof(float); + unsigned char* p = fr + ncBlocks * sizeof(short); + unsigned char* basefr = fr; + unsigned char* basep = p; + for(i = blockDim.x*blockIdx.x + threadIdx.x;i < ncBlocks;i+=blockDim.x*gridDim.x){ + fr = basefr+(sizeof(short)*i); + int leng = (int)bytesToShort(fr)+mSize; + g_leng[i] = (uint64_t)leng; + // fr += sizeof(short); + if (leng > blockSize*sizeof(float)) + { + printf("Warning: compressed block is larger than the original block!\n"); + return; + // exit(0); + } + // memcpy(data+i*blockSize*sizeof(float), p, leng); + + // p += leng; + } + + // } +} + +__global__ void decompress_ncblk_kernel(unsigned char* r, size_t nbConstantBlocks, +unsigned char *data, int blockSize, size_t mSize, size_t ncBlocks, float *constantMedianArray, uint64_t* g_leng){ + unsigned char * fr = r; + fr += nbConstantBlocks*sizeof(float); + unsigned char* p = fr + ncBlocks * sizeof(short); + unsigned char* basefr = fr; + unsigned char* basep = p; + + for(int i = blockDim.x*blockIdx.x + threadIdx.x;i < ncBlocks;i+=blockDim.x*gridDim.x){ + fr = basefr+(sizeof(short)*i); + int leng = (int)bytesToShort(fr)+mSize; + + + // g_leng[i] = leng; + // // fr += sizeof(short); + // if (leng > blockSize*sizeof(float)) + // { + // printf("Warning: compressed block is larger than the original block!\n"); + // return; + // // exit(0); + // } + p = basep + g_leng[i]; + + memcpy(data+i*blockSize*sizeof(float), p, leng); + + // p += leng; + } +} + +void decompress_startup_better(float *newData, size_t nbEle, unsigned char* r, + uint32_t *blk_idx, uint8_t *blk_subidx, uint8_t *blk_sig, + float *blk_vals, size_t num_sig, int blockSize, + size_t nbConstantBlocks, size_t nbBlocks, size_t ncBlocks, + unsigned char *stateArray, float* constantMedianArray, unsigned char *data, + size_t mSize, unsigned char *newCmpBytes +){ + // blockSize = 256; + size_t nb_tmp = (int) nbEle/blockSize; + uint64_t* g_leng; + /** + * Structures to return: + * blk_idx, blk_subidx, blk_sig, blk_vals, numSigValues (pointer) + * bs (pointer to blockSize), numConstantBlks (pointer), numBlks (pointer) + * ncBlks (pointer), stateArray, constantMedianArray + */ + + + size_t stateNBBytes = nb_tmp%4==0 ? 
nb_tmp/4 : nb_tmp/4+1; + + r += 4; + r++; + r += sizeof(size_t); + r += sizeof(size_t); + + r += stateNBBytes; + + convert_out_to_block2_kernel<<>>(r, nbBlocks, (uint64_t)num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); + size_t to_add = nbBlocks*4 + num_sig*sizeof(float) + num_sig*sizeof(uint8_t) + nbBlocks*sizeof(uint8_t); + r+= to_add; + + size_t i = 0, j = 0, k = 0; //k is used to keep track of constant block index + + // printf("before mallocs in kernel\n"); + checkCudaErrors(cudaMemcpy((newData)+nbBlocks*blockSize, r, (nbEle%blockSize)*sizeof(float), cudaMemcpyDeviceToDevice)); + checkCudaErrors(cudaMalloc(&g_leng, sizeof(uint64_t)*ncBlocks)); + // memcpy((newData)+nbBlocks*blockSize, r, (nbEle%blockSize)*sizeof(float)); + + //printf("before mallocs in kernel %p\n", r); + r += (nbEle%blockSize)*sizeof(float); + //printf("r: %p\n", r); + //printf("%d, %d, %d\n",nbEle, 256, nbEle%256); + decomp_startup_kernel<<>>(r, nbConstantBlocks,data, blockSize, mSize, ncBlocks, constantMedianArray, g_leng); + cudaDeviceSynchronize(); + + thrust::exclusive_scan(thrust::device, g_leng, g_leng + ncBlocks, g_leng, 0); + + decompress_ncblk_kernel<<>>(r, nbConstantBlocks, data, blockSize, mSize, ncBlocks, constantMedianArray, g_leng); + cudaDeviceSynchronize(); + + // cudaError_t err = cudaGetLastError(); // Get error code + + // printf("CUDA Error: %s\n", cudaGetErrorString(err)); + cudaFree(g_leng); + + // err = cudaGetLastError(); // Get error code + // printf("CUDA Error: %s\n", cudaGetErrorString(err)); + r += nbConstantBlocks*sizeof(float); + + newCmpBytes = r; + +} + +__global__ void decompress_startup(float *newData, size_t nbEle, unsigned char* r, + uint32_t *blk_idx, uint8_t *blk_subidx, uint8_t *blk_sig, + float *blk_vals, size_t num_sig, int blockSize, + size_t nbConstantBlocks, size_t nbBlocks, size_t ncBlocks, + unsigned char *stateArray, float* constantMedianArray, unsigned char *data, + size_t mSize, unsigned char *newCmpBytes +){ + // blockSize = 256; + size_t nb_tmp = (int) nbEle/blockSize; + /** + * Structures to return: + * blk_idx, blk_subidx, blk_sig, blk_vals, numSigValues (pointer) + * bs (pointer to blockSize), numConstantBlks (pointer), numBlks (pointer) + * ncBlks (pointer), stateArray, constantMedianArray + */ + + // size_t ncBlocks = 0; + // size_t stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; + // size_t num_state2_blks = 0; + // printf("Converting state array\n"); + // convert_out_to_state(nbBlocks, r, stateArray); + // printf("state %d\n", (int)stateArray[0]); + // // convertByteArray2IntArray_fast_1b_args(nbBlocks, r, stateNBBytes, stateArray); //get the stateArray + // for (size_t i = 0; i < nbBlocks; i++) + // { + // if (stateArray[i] == 2) + // { + // num_state2_blks++; + // }else if(stateArray[i] == 3){ + // ncBlocks++; + // } + // } + // size_t stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; + + size_t stateNBBytes = nb_tmp%4==0 ? 
nb_tmp/4 : nb_tmp/4+1; + //printf("%p\n", r); + r += 4; + r++; + r += sizeof(size_t); + r += sizeof(size_t); + //printf("statenb %d %d\n", stateNBBytes, nb_tmp); + r += stateNBBytes; + // data = (unsigned char*)malloc(ncBlocks*blockSize*sizeof(float)); + // memset(data, 0, ncBlocks*blockSize*sizeof(float)); + // printf("converting block vals %d\n", data[0]); + size_t to_add = convert_out_to_block2(r, nbBlocks, (uint64_t)num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); + r+= to_add; + + size_t i = 0, j = 0, k = 0; //k is used to keep track of constant block index + + // printf("before mallocs in kernel\n"); + + memcpy((newData)+nbBlocks*blockSize, r, (nbEle%blockSize)*sizeof(float)); + + //printf("before mallocs in kernel %p\n", r); + r += (nbEle%blockSize)*sizeof(float); + //printf("r: %p\n", r); + //printf("%d, %d, %d\n",nbEle, 256, nbEle%256); + unsigned char * fr = r; //fr is the starting address of constant median values. + + // printf("%p\n", r); + unsigned char tmp_r[4]; + tmp_r[0]=r[0]; + tmp_r[1]=r[1]; + tmp_r[2]=r[2]; + tmp_r[3]=r[3]; + + +// printf("nbconstant: %f\n", ((float*)tmp_r)[0]); + for(i = 0;i < nbConstantBlocks;i++, j+=4){ //get the median values for constant-value blocks + + tmp_r[0]=r[j]; + tmp_r[1]=r[j+1]; + tmp_r[2]=r[j+2]; + tmp_r[3]=r[j+3]; + float tmp = ((float*)tmp_r)[0]; +// printf("median: %f\n", tmp); + constantMedianArray[i] = tmp; + + // printf("%d %f\n", i, tmp); + } + //printf("after constantmedian\n"); + r += nbConstantBlocks*sizeof(float); + unsigned char* p = r + ncBlocks * sizeof(short); + for(i = 0;i < ncBlocks;i++){ + int leng = (int)bytesToShort(r)+mSize; + r += sizeof(short); + if (leng > blockSize*sizeof(float)) + { + printf("Warning: compressed block is larger than the original block!\n"); + return; + // exit(0); + } +// printf("before memcpy\n"); + memcpy(data+i*blockSize*sizeof(float), p, leng); + // printf("after memcpy\n"); + p += leng; + } + + newCmpBytes = r; +// printf("before mallocs in kernel\n"); + + // printf("nb blocks: %d\n", nbBlocks); +} + +__global__ void cBlkCopy_decompress(int nb, float* constantMedianArray, float *newData, int blockSize, int i){ + int j; + float Median = constantMedianArray[nb]; + // j = threadIdx.x; j < blockSize; j += blockDim.x + for (j = threadIdx.x; j < blockSize; j += blockDim.x) + *((newData)+i*blockSize+j) = Median; +} + +__global__ void ncBlkCopy_decompress(int blockSize, float *newData, int nc, float *fdata, int i){ + int j; + for (j = threadIdx.x; j < blockSize; j += blockDim.x) + *((newData)+i*blockSize+j) = fdata[nc*blockSize+j]; +} + +void decompress_post_proc_better(unsigned char *data, float *newData, int blockSize, + size_t nbBlocks, size_t ncBlocks, unsigned char *stateArray, + float *constantMedianArray +){ + // checkCudaErrors(cudaMemcpy(data, d_data, ncBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); + // checkCudaErrors(cudaMemcpy(*newData, d_newdata, nbBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); + float* fdata = (float*)data; + int i,j; + int nb=0, nc=0; + //printf("h1\n"); + for (i=0;i>>(nb, constantMedianArray, newData, blockSize, i); + nb++; + }else if(state==3){ + ncBlkCopy_decompress<<<1,256>>>(blockSize, newData, nc, fdata, i); + nc++; + } + } + cudaDeviceSynchronize(); + //for(int k = 0; k < nbBlocks*blockSize;k++){ +// printf("%f\n", newData[k]); + // } +} + +__global__ void print_newdata(float *newData, size_t nbBlocks, int blockSize){ + for (size_t i = 0; i < nbBlocks*blockSize; i++) + { + printf("%f\n", newData[i]); + } + +} + +__global__ void 
generateNbNc(size_t nbBlocks, size_t ncBlocks, unsigned char *stateArray, uint64_t* nbs, uint64_t* ncs){ + for(int i = blockDim.x*blockIdx.x + threadIdx.x;i < nbBlocks;i+=blockDim.x*gridDim.x){ + unsigned char state = stateArray[i]; + if(state==0||state==1){ + nbs[i] = 1; + ncs[i] = 0; + }else if(state==3){ + nbs[i] = 0; + ncs[i] = 1; + }else{ + nbs[i] = 0; + ncs[i] = 0; + } + } +} + +__global__ void decompress_final_set(unsigned char *data, float *newData, int blockSize, + size_t nbBlocks, size_t ncBlocks, unsigned char *stateArray, + float *constantMedianArray, uint64_t* nb, uint64_t* nc){ + float* fdata = (float*)data; + for (int i = blockIdx.x;i < nbBlocks;i+=gridDim.x){ + if (stateArray[i]==0 || stateArray[i]==1){ + float Median = constantMedianArray[nb[i]]; + // if (Median>1) printf("data%i:%f\n",i, Median); + for (int j = threadIdx.x; j < blockSize; j += blockDim.x) + *((newData)+i*blockSize+j) = Median; + // nb++; + }else if(stateArray[i]==3){ + for (int j = threadIdx.x; j < blockSize; j += blockDim.x) + *((newData)+i*blockSize+j) = fdata[nc[i]*blockSize+j]; + // nc++; + } + __syncthreads(); + } +} + +void decompress_post_proc_fast(unsigned char *data, float *newData, int blockSize, + size_t nbBlocks, size_t ncBlocks, unsigned char *stateArray, + float *constantMedianArray +){ + // checkCudaErrors(cudaMemcpy(data, d_data, ncBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); + // checkCudaErrors(cudaMemcpy(*newData, d_newdata, nbBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); + + int i,j; + uint64_t *nb, *nc; + checkCudaErrors(cudaMalloc(&nb, sizeof(uint64_t)*nbBlocks)); + checkCudaErrors(cudaMalloc(&nc, sizeof(uint64_t)*nbBlocks)); + + generateNbNc<<>>(nbBlocks, ncBlocks, stateArray, nb,nc); + cudaDeviceSynchronize(); + thrust::exclusive_scan(thrust::device, nb, nb + nbBlocks, nb, 0); + thrust::exclusive_scan(thrust::device, nc, nc + nbBlocks, nc, 0); + + decompress_final_set<<>>(data, newData, blockSize,nbBlocks, ncBlocks, stateArray,constantMedianArray, nb, nc); + cudaDeviceSynchronize(); + cudaFree(nb); + cudaFree(nc); +} + +__global__ void decompress_post_proc(unsigned char *data, float *newData, int blockSize, + size_t nbBlocks, size_t ncBlocks, unsigned char *stateArray, + float *constantMedianArray +){ + // checkCudaErrors(cudaMemcpy(data, d_data, ncBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); + // checkCudaErrors(cudaMemcpy(*newData, d_newdata, nbBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); + float* fdata = (float*)data; + int i,j; + int nb=0, nc=0; + // if (blockIdx.x == 0) + // { + // for (i=0;i1) printf("data%i:%f\n",i, Median); + // for (j = threadIdx.x; j < blockSize; j += blockDim.x) + // *((newData)+i*blockSize+j) = Median; + // nb++; + // } + // } + // }else{ + // for (i=0;i1) printf("data%i:%f\n",i, Median); + for (j = threadIdx.x; j < blockSize; j += blockDim.x) + *((newData)+i*blockSize+j) = Median; + nb++; + }else if(stateArray[i]==3){ + for (j = threadIdx.x; j < blockSize; j += blockDim.x) + *((newData)+i*blockSize+j) = fdata[nc*blockSize+j]; + nc++; + } + } + + //for(int k = 0; k < nbBlocks*blockSize;k++){ +// printf("%f\n", newData[k]); + // } +} + +float* device_ptr_cuSZx_decompress_float(size_t nbEle, unsigned char* cmpBytes) +{ + /** + * Assume the following are device pointers + * + * unsigned char* cmpBytes + * float** newData + * + */ + + uint32_t *blk_idx; + uint8_t *blk_subidx; + uint8_t *blk_sig; + float *blk_vals, *constantMedianArray; + size_t *num_sig, *mSize, mSize_h, num_sig_h; + int *blockSize, bs; + 
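/* Note on the helpers this decompressor relies on: decompress_post_proc_fast
 * above (and, in the same spirit, ncblkCopy_fast on the compression side and
 * decompress_startup_better) replaces the serial running counters of the
 * original loops (nb, nc, moving byte offsets) with a three-step, fully
 * parallel pattern:
 *
 *   // 1. one flag per block: does the block belong to this class?
 *   generateNbNc<<<grid, threads>>>(nbBlocks, ncBlocks, stateArray, nb, nc);
 *   // 2. exclusive prefix sums turn the flags into stable output indices
 *   thrust::exclusive_scan(thrust::device, nb, nb + nbBlocks, nb, 0);
 *   thrust::exclusive_scan(thrust::device, nc, nc + nbBlocks, nc, 0);
 *   // 3. a scatter kernel indexes with nb[i] / nc[i]; no shared counter needed
 *   decompress_final_set<<<grid, threads>>>(data, newData, blockSize, nbBlocks,
 *                                           ncBlocks, stateArray,
 *                                           constantMedianArray, nb, nc);
 *
 * The launch parameters shown here are placeholders; the actual configuration
 * is chosen by the surrounding code.
 */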
size_t *nbConstantBlocks, *nbBlocks, *ncBlocks, nbBlocks_h, ncBlocks_h, nbConstantBlocks_h; + unsigned char *stateArray, *data; + float *newData; + timer_GPU.StartCounter(); + unsigned char *oldCmpBytes = cmpBytes; + //*newData = (float*)malloc(sizeof(float)*nbEle); +// printf("cmpbytes check %d\n", (int)cmpBytes[0]); +// printf("new check %f\n", *newData[0]); + // printf("malloc\n"); + checkCudaErrors(cudaMalloc((void**)&num_sig, sizeof(size_t))); + checkCudaErrors(cudaMalloc((void**)&blockSize, sizeof(int))); + checkCudaErrors(cudaMalloc((void**)&nbConstantBlocks, sizeof(size_t))); + checkCudaErrors(cudaMalloc((void**)&nbBlocks, sizeof(size_t))); + checkCudaErrors(cudaMalloc((void**)&ncBlocks, sizeof(size_t))); + checkCudaErrors(cudaMalloc((void**)&mSize, sizeof(size_t))); + checkCudaErrors(cudaMalloc((void**)&newData, sizeof(float)*nbEle)); + + decompress_get_stats<<<1,1>>>(newData, nbEle, cmpBytes, + num_sig, blockSize, + nbConstantBlocks, nbBlocks, + mSize, cmpBytes + ); + cudaDeviceSynchronize(); + + cudaError_t err = cudaGetLastError(); // Get error code + //printf("CUDA Error: %s\n", cudaGetErrorString(err)); + checkCudaErrors(cudaMemcpy(&nbBlocks_h, nbBlocks, sizeof(size_t), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(&nbConstantBlocks_h, nbConstantBlocks, sizeof(size_t), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(&bs, blockSize, sizeof(int), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(&mSize_h, mSize, sizeof(size_t), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(&num_sig_h, num_sig, sizeof(size_t), cudaMemcpyDeviceToHost)); + + + checkCudaErrors(cudaMalloc((void**)&stateArray, nbBlocks_h)); + checkCudaErrors(cudaMalloc((void**)&constantMedianArray, nbConstantBlocks_h*sizeof(float))); + + checkCudaErrors(cudaMalloc((void**)&blk_idx, nbBlocks_h*sizeof(uint32_t))); + checkCudaErrors(cudaMalloc((void**)&blk_vals, num_sig_h*sizeof(float))); + checkCudaErrors(cudaMalloc((void**)&blk_subidx, num_sig_h*sizeof(uint8_t))); + checkCudaErrors(cudaMalloc((void**)&blk_sig, nbBlocks_h*sizeof(uint8_t))); + + unsigned char* tmp_r = cmpBytes; + unsigned char* newR; + setup_data_stateArray_better(newData, nbEle, tmp_r, + num_sig_h, bs, + nbConstantBlocks_h, nbBlocks_h, &ncBlocks_h, + stateArray, newR); + + + + // setup_data_stateArray<<<1,1>>>(newData, nbEle, cmpBytes, + // num_sig_h, bs, + // nbConstantBlocks_h, nbBlocks_h, ncBlocks, + // stateArray, cmpBytes + // ); + // cudaDeviceSynchronize(); + + // printf("%s\n", cudaGetErrorString(cudaGetLastError())); + // checkCudaErrors(cudaMemcpy(&ncBlocks_h, ncBlocks, sizeof(size_t), cudaMemcpyDeviceToHost)); + + checkCudaErrors(cudaMalloc((void**)&data, ncBlocks_h*bs*sizeof(float))); + + // err = cudaGetLastError(); // Get error code + // printf("CUDA start Error: %s\n", cudaGetErrorString(err)); + // cmpBytes = newCmpBytes; + // data = (unsigned char*)malloc(ncBlocks*blockSize*sizeof(float)); + // memset(data, 0, ncBlocks*blockSize*sizeof(float)); + // stateArray = (unsigned char*)malloc(nbBlocks); + + // // unsigned char* d_stateArray; + // // cudaMalloc(&d_stateArray, nbBlocks); + // constantMedianArray = (float*)malloc(nbConstantBlocks*sizeof(float)); + + // blk_idx = (uint32_t *)malloc(nbBlocks*sizeof(uint32_t)); + // blk_vals= (float *)malloc((num_sig)*sizeof(float)); + // blk_subidx = (uint8_t *)malloc((num_sig)*sizeof(uint8_t)); + // blk_sig = (uint8_t *)malloc(nbBlocks*sizeof(uint8_t)); + + //printf("%s\n", cudaGetErrorString(cudaGetLastError())); + //test_nbBlks = (size_t *)malloc(sizeof(size_t)); + 
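/* Summary of the remaining, fully device-resident decompression stages invoked
 * below (a reading of the calls that follow, not a documented API):
 *   1. decompress_startup_better : unpacks the significant-value tables, the
 *      raw trailing remainder, the constant-block medians and the packed
 *      non-constant blocks into device buffers (constantMedianArray, data).
 *   2. decompress_state2         : rebuilds state-2 (sparse) blocks from
 *      blk_idx / blk_vals / blk_subidx / blk_sig into d_newdata.
 *   3. decompress_float          : decodes the ncBlocks_h non-constant blocks
 *      in place in `data`.
 *   4. decompress_post_proc_fast : scatters constant medians and decoded
 *      non-constant blocks into newData using prefix-summed block indices.
 * Only small statistics (block counts, mSize) ever cross back to the host, and
 * they are used solely to size the device allocations.
 */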
// printf("malloc\n"); + + + tmp_r = cmpBytes; + decompress_startup_better(newData, nbEle, tmp_r, + blk_idx, blk_subidx, blk_sig, + blk_vals, num_sig_h, bs, + nbConstantBlocks_h, nbBlocks_h, ncBlocks_h, + stateArray, constantMedianArray, data, + mSize_h, newR); + + + // err = cudaGetLastError(); // Get error code + // printf("CUDA start Error: %s\n", cudaGetErrorString(err)); + //decompress_startup<<<1,1>>>(newData, nbEle, cmpBytes, + // blk_idx, blk_subidx, blk_sig, + // blk_vals, num_sig_h, bs, + // nbConstantBlocks_h, nbBlocks_h, ncBlocks_h, + // stateArray, constantMedianArray, data, mSize_h, cmpBytes); + //cudaDeviceSynchronize(); + // cmpBytes = newCmpBytes; + + //printf("%s\n", cudaGetErrorString(cudaGetLastError())); + + // unsigned char* d_data; + float *d_newdata; + // checkCudaErrors(cudaMalloc((void**)&d_data, ncBlocks*blockSize*sizeof(float))); + // checkCudaErrors(cudaMemcpy(d_data, data, ncBlocks*blockSize*sizeof(float), cudaMemcpyHostToDevice)); + // printf("nblocks: %d bs: %d ncblock %d\n", nbBlocks_h, bs, ncBlocks_h); + checkCudaErrors(cudaMalloc(&d_newdata, nbBlocks_h*bs*sizeof(float))); + + // err = cudaGetLastError(); // Get error code + // printf("CUDA dec main Error: %s\n", cudaGetErrorString(err)); + + dim3 dimBlock(32, bs/32); + dim3 dimGrid(65536, 1); + const int sMemsize = bs * sizeof(float) + dimBlock.y * sizeof(int); + decompress_state2<<>>(d_newdata, stateArray,blk_idx, blk_vals, blk_subidx, bs, blk_sig); + cudaDeviceSynchronize(); + + // err = cudaGetLastError(); // Get error code + // printf("CUDA dec main Error: %s\n", cudaGetErrorString(err)); + decompress_float<<>>(data, bs, ncBlocks_h, mSize_h); + //printf("GPU decompression timing: %f ms\n", timer_GPU.GetCounter()); + cudaDeviceSynchronize(); + + // err = cudaGetLastError(); // Get error code + // printf("CUDA dec main Error: %s\n", cudaGetErrorString(err)); + + // checkCudaErrors(cudaMemcpy(data, d_data, ncBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(newData, d_newdata, nbBlocks_h*bs*sizeof(float), cudaMemcpyDeviceToDevice)); + cudaFree(d_newdata); + + // decompress_post_proc<<<1,1>>>(data, newData, bs, + // nbBlocks_h, ncBlocks_h, stateArray, + // constantMedianArray); + // cudaDeviceSynchronize(); + decompress_post_proc_fast(data, newData, bs, + nbBlocks_h, ncBlocks_h, stateArray, + constantMedianArray); + err = cudaGetLastError(); // Get error code + //printf("CUDA Error: %s\n", cudaGetErrorString(err)); + //printf("GPU decompression timing: %f ms\n", timer_GPU.GetCounter()); + // print_newdata<<<1,1>>>(newData, nbBlocks_h, bs); + cudaFree(stateArray); + cudaFree(constantMedianArray); + cudaFree(data); + cudaFree(blk_idx); + cudaFree(blk_subidx); + cudaFree(blk_vals); + cudaFree(blk_sig); + return newData; + +} + From 055aef5ffb3bcdb46832443836441386710c76b7 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Fri, 15 Dec 2023 23:48:11 -0600 Subject: [PATCH 113/126] replace crlf with lf --- .../src/simulators/qtensor_energy.py | 554 +++++++++--------- 1 file changed, 277 insertions(+), 277 deletions(-) diff --git a/bench/qc_simulation/src/simulators/qtensor_energy.py b/bench/qc_simulation/src/simulators/qtensor_energy.py index d9689064..18a8a2cf 100644 --- a/bench/qc_simulation/src/simulators/qtensor_energy.py +++ b/bench/qc_simulation/src/simulators/qtensor_energy.py @@ -1,277 +1,277 @@ -import qtensor -import qtree -import networkx as nx -import numpy as np - -# -- QAOA generic parser - -def parse_qaoa_composer(data): - import json - data = json.loads(data) - 
terms = data["terms"] - gamma = np.array(data["gamma"])/np.pi/2 - beta = np.array(data["beta"])/np.pi - N = len(set(sum([t[1] for t in terms], []))) - G = nx.Graph() - for factor, term in terms: - G.add_edge(*term) - composer = qtensor.DefaultQAOAComposer(G, gamma=gamma, beta=beta) - return composer -# -- - -def read_circ(circ_f, type=None): - - if type is None: - type = circ_f.path.name.split(".")[-1] - - print("Reading circuit of type", type) - if type == "jsonterms": - b = circ_f.f.read() - return parse_qaoa_composer(b) - - elif type == "qasm": - raise Exception("only jsonterms is supported for energy calculations") - -def read_preps(prep_f): - import pickle - return pickle.load(prep_f.f) - -def write_preps(peo, prep_f): - import pickle - pickle.dump(peo, open(prep_f, 'wb')) - -def write_json(data, out_file): - import json - with open(out_file, 'w') as f: - json.dump(data, f) - # This newline plays nice when cat-ing multiple files - f.write('\n') - -def preprocess_circ(circ, S, O, M, after_slice): - tn = qtensor.optimisation.QtreeTensorNet.from_qtree_gates(circ) - opt = qtensor.toolbox.get_ordering_algo(O) - if S: - # ignore argument type mismatch for pyright -- opt can be `Optimizer` - # pyright: reportGeneralTypeIssues=false - opt = qtensor.optimisation.TreeTrimSplitter( - tw_bias=0, max_tw=M, base_ordering=opt, - peo_after_slice_strategy=after_slice - ) - - peo, par_vars, _ = opt.optimize(tn) - # --dbg - graph = tn.get_line_graph() - ignore_vars = tn.bra_vars + tn.ket_vars - for pv in par_vars: - graph.remove_node(int(pv)) - components = list(nx.connected_components(graph)) - print(f"Sliced graph # nodes: {graph.number_of_nodes()} and #components: {len(components)} with sizes {[len(c) for c in components]}") - print(f"peo size without par_vars and ignore_vars: {len(peo) - len(par_vars) - len(ignore_vars)}") - - print() - # -- - else: - peo, _ = opt.optimize(tn) - par_vars = [] - #print("W", opt.treewidth) - return (peo, par_vars, tn), opt.treewidth - -def preprocess(in_file, out_file, O='greedy', S=None, M=30, after_slice='run-again'): - """ - Arguments: - in_file: input file - out_file: output file - O: ordering algorithm - S: slicing algorithm - M: Memory limit for slicing - """ - import copy - composer = read_circ(in_file) - G = composer.graph - prep_data = [] - for edge in G.edges: - c_copy = copy.deepcopy(composer) - c_copy.energy_expectation_lightcone(edge) - e_prep, treewidth = preprocess_circ(c_copy.circuit, S, O, M, after_slice) - if treewidth>25: - prep_data.append(e_prep) - write_preps(prep_data, out_file) - print(f"Wrote {len(prep_data)} preparations of lightcones") - return prep_data - -def estimate(in_file, out_file, C=100, M=30, F=1e12, T=1e9, **kwargs): - """ - Arguments: - in_file: file with preprocessed data - out_file: file to write the results to - C: Compression ratio - M: Memory limit in log2(b/16) - F: assumed FLOPS - T: Throughput of compression - """ - from qtensor.compression.cost_estimation import compressed_contraction_cost, Cost - from dataclasses import asdict - import json - prep_data = read_preps(in_file) - peo, par_vars, tn = prep_data - - tn.slice({i: slice(0, 1) for i in par_vars}) - peo = peo[:len(peo) - len(par_vars)] - costs: list[Cost] = compressed_contraction_cost(tn, peo, mem_limit=M, compression_ratio=C) - totals: Cost = sum(costs[1:], costs[0]) - time = totals.time(F, T, T, M) - C = asdict(totals) - C['time'] = time*2**len(par_vars) - C['slices'] = 2**len(par_vars) - print("C", C) - out_file += ".json" - write_json(C, out_file) - return 
out_file - -def simulate(in_file, out_file, - backend='einsum', - compress=None, - M=29, - r2r_error=1e-3, r2r_threshold=1e-3, - **kwargs): - import cupy - prep_data = read_preps(in_file) - cupy.cuda.profiler.start() - - C = dict( - time=0, - elapsed=0, - memory=0, - memory_history=[], - nvmemory=0, - result = dict(Re=0, Im=0), - compression=dict(compress=[], decompress=[]) - ) - - for prep_lightcone in prep_data[:5]: - print(prep_lightcone) - r = simulate_preps_lightcone(prep_lightcone, backend, compress, M, - r2r_error, - r2r_threshold,**kwargs) - C['time'] += r['time'] - C['elapsed'] += r['elapsed'] - C['memory'] = max(C['memory'], r['memory']) - C['nvmemory'] = max(C['nvmemory'], r['nvmemory']) - C['memory_history'] += r['memory_history'] - C['result']['Re'] += r['result']['Re'] - C['result']['Im'] += r['result']['Im'] - if r.get('compression'): - C['compression']['compress'] += r['compression']['compress'] - C['compression']['decompress'] += r['compression']['decompress'] - - out_file += ".json" - write_json(C, out_file) - return out_file - cupy.cuda.profiler.stop() - -def simulate_preps_lightcone(prep_data, - backend='einsum', - compress=None, - M=29, - r2r_error=1e-3, r2r_threshold=1e-3, - **kwargs): - """ - Args: - in_file: file with preprocessed data - out_file: file to write the results to - backend: backend to use - compress: compression algorithm - M: memory threshold for compression - r2r_error: relative error for compression - r2r_threshold: relative threshold for compression - """ - import time - from qtensor.contraction_algos import bucket_elimination - from qtensor.compression.Compressor import CUSZCompressor, CUSZXCompressor, TorchCompressor, NEWSZCompressor - #from qtensor.compression.Compressor import WriteToDiskCompressor - import cupy - peo, par_vars, tn = prep_data - - backend = qtensor.contraction_backends.get_backend(backend) - if compress is not None: - if compress == 'szx': - print(f"{r2r_error=} {r2r_threshold=}") - compressor = CUSZXCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) - compressor = qtensor.compression.ProfileCompressor(compressor) - elif compress == 'cusz': - print(f"{r2r_error=} {r2r_threshold=}") - compressor = CUSZCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) - compressor = qtensor.compression.ProfileCompressor(compressor) - elif compress == 'torch': - print(f"{r2r_error=} {r2r_threshold=}") - compressor = TorchCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) - compressor = qtensor.compression.ProfileCompressor(compressor) - elif compress == 'newsz': - print(f"{r2r_error=} {r2r_threshold=}") - compressor = NEWSZCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) - compressor = qtensor.compression.ProfileCompressor(compressor) - elif compress == 'disk': - compressor = WriteToDiskCompressor(f'/grand/QTensor/compression/data/tensors_compressed_M{M}/') - compressor = qtensor.compression.ProfileCompressor(compressor) - else: - raise ValueError(f"Unknown compression algorithm: {compress}") - backend = qtensor.contraction_backends.CompressionBackend(backend, compressor, M) - from qtensor.contraction_backends.performance_measurement_decorator import MemProfBackend - backend = MemProfBackend(backend) - - relabelid = {} - for tensor in tn.tensors: - for i in tensor.indices: - relabelid[int(i)] = i - - slice_ext = {relabelid[int(i)]: 0 for i in par_vars} - - if len(par_vars) > 0: - print("Parvars", par_vars) - print(f"Detected {len(par_vars)} slice variables") - sim = qtensor.QtreeSimulator(backend=backend) 
- sim.tn = tn - sim.tn.backend = backend - sim.peo = peo - sim._slice_relabel_buckets(slice_ext) - buckets = sim.tn.buckets - # --dbg - #ignore_vars = sim.tn.bra_vars + sim.tn.ket_vars - #graph = qtree.graph_model.importers.buckets2graph(buckets, ignore_vars) - #graph, label_dict = qtree.graph_model.relabel_graph_nodes( - #graph, dict(zip(graph.nodes, np.array(list(graph.nodes)) - 127*2)) - #) - #import networkx as nx - #components = list(nx.connected_components(graph)) - #print(f"Sliced graph # nodes: {graph.number_of_nodes()} and #components: {len(components)} with sizes {[len(c) for c in components]}") - #print(f"peo size without par_vars and ignore_vars: {len(peo) - len(ignore_vars)}") - # -- - - start = time.time() - for i in range(2**0): - print(f"P {i}", end='', flush=True) - bcopy = [b[:] for b in buckets] - res = bucket_elimination( - bcopy, backend, - n_var_nosum=len(tn.free_vars) - ) - del bcopy - print("Result", res.data.flatten()[0]) - #time.sleep(0.5) - sim_result = backend.get_result_data(res).flatten()[0] - print("Simulation result:", sim_result) - end = time.time() - print("Elapsed", end - start) - C = {'time': 2**len(par_vars)*(end - start)} - C['elapsed'] = (end - start) - C['memory'] = backend.max_mem - C['memory_history'] = backend.mem_history - C['nvmemory'] = backend.nvsmi_max_mem - C['result'] = { - "Re": np.real(sim_result).tolist(), - "Im": np.imag(sim_result).tolist() - } - if compress is not None: - if isinstance(compressor, qtensor.compression.ProfileCompressor): - C['compression'] = compressor.get_profile_data_json() - return C +import qtensor +import qtree +import networkx as nx +import numpy as np + +# -- QAOA generic parser + +def parse_qaoa_composer(data): + import json + data = json.loads(data) + terms = data["terms"] + gamma = np.array(data["gamma"])/np.pi/2 + beta = np.array(data["beta"])/np.pi + N = len(set(sum([t[1] for t in terms], []))) + G = nx.Graph() + for factor, term in terms: + G.add_edge(*term) + composer = qtensor.DefaultQAOAComposer(G, gamma=gamma, beta=beta) + return composer +# -- + +def read_circ(circ_f, type=None): + + if type is None: + type = circ_f.path.name.split(".")[-1] + + print("Reading circuit of type", type) + if type == "jsonterms": + b = circ_f.f.read() + return parse_qaoa_composer(b) + + elif type == "qasm": + raise Exception("only jsonterms is supported for energy calculations") + +def read_preps(prep_f): + import pickle + return pickle.load(prep_f.f) + +def write_preps(peo, prep_f): + import pickle + pickle.dump(peo, open(prep_f, 'wb')) + +def write_json(data, out_file): + import json + with open(out_file, 'w') as f: + json.dump(data, f) + # This newline plays nice when cat-ing multiple files + f.write('\n') + +def preprocess_circ(circ, S, O, M, after_slice): + tn = qtensor.optimisation.QtreeTensorNet.from_qtree_gates(circ) + opt = qtensor.toolbox.get_ordering_algo(O) + if S: + # ignore argument type mismatch for pyright -- opt can be `Optimizer` + # pyright: reportGeneralTypeIssues=false + opt = qtensor.optimisation.TreeTrimSplitter( + tw_bias=0, max_tw=M, base_ordering=opt, + peo_after_slice_strategy=after_slice + ) + + peo, par_vars, _ = opt.optimize(tn) + # --dbg + graph = tn.get_line_graph() + ignore_vars = tn.bra_vars + tn.ket_vars + for pv in par_vars: + graph.remove_node(int(pv)) + components = list(nx.connected_components(graph)) + print(f"Sliced graph # nodes: {graph.number_of_nodes()} and #components: {len(components)} with sizes {[len(c) for c in components]}") + print(f"peo size without par_vars and 
ignore_vars: {len(peo) - len(par_vars) - len(ignore_vars)}") + + print() + # -- + else: + peo, _ = opt.optimize(tn) + par_vars = [] + #print("W", opt.treewidth) + return (peo, par_vars, tn), opt.treewidth + +def preprocess(in_file, out_file, O='greedy', S=None, M=30, after_slice='run-again'): + """ + Arguments: + in_file: input file + out_file: output file + O: ordering algorithm + S: slicing algorithm + M: Memory limit for slicing + """ + import copy + composer = read_circ(in_file) + G = composer.graph + prep_data = [] + for edge in G.edges: + c_copy = copy.deepcopy(composer) + c_copy.energy_expectation_lightcone(edge) + e_prep, treewidth = preprocess_circ(c_copy.circuit, S, O, M, after_slice) + if treewidth>25: + prep_data.append(e_prep) + write_preps(prep_data, out_file) + print(f"Wrote {len(prep_data)} preparations of lightcones") + return prep_data + +def estimate(in_file, out_file, C=100, M=30, F=1e12, T=1e9, **kwargs): + """ + Arguments: + in_file: file with preprocessed data + out_file: file to write the results to + C: Compression ratio + M: Memory limit in log2(b/16) + F: assumed FLOPS + T: Throughput of compression + """ + from qtensor.compression.cost_estimation import compressed_contraction_cost, Cost + from dataclasses import asdict + import json + prep_data = read_preps(in_file) + peo, par_vars, tn = prep_data + + tn.slice({i: slice(0, 1) for i in par_vars}) + peo = peo[:len(peo) - len(par_vars)] + costs: list[Cost] = compressed_contraction_cost(tn, peo, mem_limit=M, compression_ratio=C) + totals: Cost = sum(costs[1:], costs[0]) + time = totals.time(F, T, T, M) + C = asdict(totals) + C['time'] = time*2**len(par_vars) + C['slices'] = 2**len(par_vars) + print("C", C) + out_file += ".json" + write_json(C, out_file) + return out_file + +def simulate(in_file, out_file, + backend='einsum', + compress=None, + M=29, + r2r_error=1e-3, r2r_threshold=1e-3, + **kwargs): + import cupy + prep_data = read_preps(in_file) + cupy.cuda.profiler.start() + + C = dict( + time=0, + elapsed=0, + memory=0, + memory_history=[], + nvmemory=0, + result = dict(Re=0, Im=0), + compression=dict(compress=[], decompress=[]) + ) + + for prep_lightcone in prep_data[:5]: + print(prep_lightcone) + r = simulate_preps_lightcone(prep_lightcone, backend, compress, M, + r2r_error, + r2r_threshold,**kwargs) + C['time'] += r['time'] + C['elapsed'] += r['elapsed'] + C['memory'] = max(C['memory'], r['memory']) + C['nvmemory'] = max(C['nvmemory'], r['nvmemory']) + C['memory_history'] += r['memory_history'] + C['result']['Re'] += r['result']['Re'] + C['result']['Im'] += r['result']['Im'] + if r.get('compression'): + C['compression']['compress'] += r['compression']['compress'] + C['compression']['decompress'] += r['compression']['decompress'] + + out_file += ".json" + write_json(C, out_file) + return out_file + cupy.cuda.profiler.stop() + +def simulate_preps_lightcone(prep_data, + backend='einsum', + compress=None, + M=29, + r2r_error=1e-3, r2r_threshold=1e-3, + **kwargs): + """ + Args: + in_file: file with preprocessed data + out_file: file to write the results to + backend: backend to use + compress: compression algorithm + M: memory threshold for compression + r2r_error: relative error for compression + r2r_threshold: relative threshold for compression + """ + import time + from qtensor.contraction_algos import bucket_elimination + from qtensor.compression.Compressor import CUSZCompressor, CUSZXCompressor, TorchCompressor, NEWSZCompressor + #from qtensor.compression.Compressor import WriteToDiskCompressor + import cupy + 
peo, par_vars, tn = prep_data + + backend = qtensor.contraction_backends.get_backend(backend) + if compress is not None: + if compress == 'szx': + print(f"{r2r_error=} {r2r_threshold=}") + compressor = CUSZXCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) + compressor = qtensor.compression.ProfileCompressor(compressor) + elif compress == 'cusz': + print(f"{r2r_error=} {r2r_threshold=}") + compressor = CUSZCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) + compressor = qtensor.compression.ProfileCompressor(compressor) + elif compress == 'torch': + print(f"{r2r_error=} {r2r_threshold=}") + compressor = TorchCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) + compressor = qtensor.compression.ProfileCompressor(compressor) + elif compress == 'newsz': + print(f"{r2r_error=} {r2r_threshold=}") + compressor = NEWSZCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) + compressor = qtensor.compression.ProfileCompressor(compressor) + elif compress == 'disk': + compressor = WriteToDiskCompressor(f'/grand/QTensor/compression/data/tensors_compressed_M{M}/') + compressor = qtensor.compression.ProfileCompressor(compressor) + else: + raise ValueError(f"Unknown compression algorithm: {compress}") + backend = qtensor.contraction_backends.CompressionBackend(backend, compressor, M) + from qtensor.contraction_backends.performance_measurement_decorator import MemProfBackend + backend = MemProfBackend(backend) + + relabelid = {} + for tensor in tn.tensors: + for i in tensor.indices: + relabelid[int(i)] = i + + slice_ext = {relabelid[int(i)]: 0 for i in par_vars} + + if len(par_vars) > 0: + print("Parvars", par_vars) + print(f"Detected {len(par_vars)} slice variables") + sim = qtensor.QtreeSimulator(backend=backend) + sim.tn = tn + sim.tn.backend = backend + sim.peo = peo + sim._slice_relabel_buckets(slice_ext) + buckets = sim.tn.buckets + # --dbg + #ignore_vars = sim.tn.bra_vars + sim.tn.ket_vars + #graph = qtree.graph_model.importers.buckets2graph(buckets, ignore_vars) + #graph, label_dict = qtree.graph_model.relabel_graph_nodes( + #graph, dict(zip(graph.nodes, np.array(list(graph.nodes)) - 127*2)) + #) + #import networkx as nx + #components = list(nx.connected_components(graph)) + #print(f"Sliced graph # nodes: {graph.number_of_nodes()} and #components: {len(components)} with sizes {[len(c) for c in components]}") + #print(f"peo size without par_vars and ignore_vars: {len(peo) - len(ignore_vars)}") + # -- + + start = time.time() + for i in range(2**0): + print(f"P {i}", end='', flush=True) + bcopy = [b[:] for b in buckets] + res = bucket_elimination( + bcopy, backend, + n_var_nosum=len(tn.free_vars) + ) + del bcopy + print("Result", res.data.flatten()[0]) + #time.sleep(0.5) + sim_result = backend.get_result_data(res).flatten()[0] + print("Simulation result:", sim_result) + end = time.time() + print("Elapsed", end - start) + C = {'time': 2**len(par_vars)*(end - start)} + C['elapsed'] = (end - start) + C['memory'] = backend.max_mem + C['memory_history'] = backend.mem_history + C['nvmemory'] = backend.nvsmi_max_mem + C['result'] = { + "Re": np.real(sim_result).tolist(), + "Im": np.imag(sim_result).tolist() + } + if compress is not None: + if isinstance(compressor, qtensor.compression.ProfileCompressor): + C['compression'] = compressor.get_profile_data_json() + return C From c644f8e55b5fe897323393fac207994043288a2d Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Fri, 15 Dec 2023 23:49:03 -0600 Subject: [PATCH 114/126] replace crlf with lf --- 
bench/qc_simulation/src/simulators/qtensor.py | 542 +-- qtensor/compression/CompressedTensor.py | 308 +- qtensor/compression/Compressor.py | 1164 ++--- qtensor/compression/newsz/nvcomp | 1 + qtensor/compression/szx/cuda-samples | 1 + qtensor/compression/szx/src/cuszx_entry.cu | 3920 ++++++++--------- 6 files changed, 2969 insertions(+), 2967 deletions(-) create mode 160000 qtensor/compression/newsz/nvcomp create mode 160000 qtensor/compression/szx/cuda-samples diff --git a/bench/qc_simulation/src/simulators/qtensor.py b/bench/qc_simulation/src/simulators/qtensor.py index e206feb6..493b6b77 100644 --- a/bench/qc_simulation/src/simulators/qtensor.py +++ b/bench/qc_simulation/src/simulators/qtensor.py @@ -1,271 +1,271 @@ -import qtensor -import qtree -import numpy as np - -# -- QAOA generic parser - -class QAOAComposer(qtensor.DefaultQAOAComposer): - def __init__(self, N, terms, **kwargs): - self.n_qubits = N - # from ccomp (Can't call DefaultQAOA Composer since need graph) - self.builder = self._get_builder() - # gamma and beta - self.params = kwargs - # - self.terms = terms - self.qubit_map = {n: i for i, n in enumerate(range(N))} - - def cost_operator_circuit(self, gamma): - for factor, term in self.terms: - t_mapped = [self.qubit_map[i] for i in term] - self.append_Z_term(term, gamma) - - def append_Z_term(self, term, gamma): - if len(term) == 2: - self.apply_gate(self.operators.ZZ, term[0], term[1], alpha=2*gamma) - #self.apply_gate(qtensor.OpFactory.ZZFull, term[0], term[1], alpha=2*gamma) - elif len(term) == 4: - self.apply_gate(self.operators.Z4, *term, alpha=2*gamma) - else: - raise ValueError(f"Invalid QAOA term length: {len(term)}") - - def mixer_operator(self, beta): - qubits = self.qubit_map.values() - for qubit in qubits: - self.x_term(qubit, beta) - -def parse_qaoa(data): - import json - data = json.loads(data) - terms = data["terms"] - gamma = np.array(data["gamma"])/np.pi/2 - beta = np.array(data["beta"])/np.pi - N = len(set(sum([t[1] for t in terms], []))) - composer = QAOAComposer(N, terms, gamma=gamma, beta=beta) - composer.ansatz_state() - return composer.circuit -# -- - -def read_circ(circ_f, type=None): - - if type is None: - type = circ_f.path.name.split(".")[-1] - - print("Reading circuit of type", type) - if type == "jsonterms": - b = circ_f.f.read() - return parse_qaoa(b) - - elif type == "qasm": - from qiskit import QuantumCircuit - b = circ_f.f.read() - str = b.decode('utf-8') - - qiskit_circuit = QuantumCircuit.from_qasm_str(str) - return qtree.operators.from_qiskit_circuit(qiskit_circuit) - else: - b = circ_f.f.read() - str = b.decode('utf-8') - import io - f = io.StringIO(str) - N, circ = qtree.operators.read_circuit_stream(f) - return sum(circ, []) - -def read_preps(prep_f): - import pickle - return pickle.load(prep_f.f) - -def write_preps(peo, prep_f): - import pickle - pickle.dump(peo, open(prep_f, 'wb')) - -def write_json(data, out_file): - import json - with open(out_file, 'w') as f: - json.dump(data, f) - # This newline plays nice when cat-ing multiple files - f.write('\n') - -def preprocess(in_file, out_file, O='greedy', S=None, M=30, after_slice='run-again'): - """ - Arguments: - in_file: input file - out_file: output file - O: ordering algorithm - S: slicing algorithm - M: Memory limit for slicing - """ - circ = read_circ(in_file) - tn = qtensor.optimisation.QtreeTensorNet.from_qtree_gates(circ) - opt = qtensor.toolbox.get_ordering_algo(O) - if S: - # ignore argument type mismatch for pyright -- opt can be `Optimizer` - # pyright: 
reportGeneralTypeIssues=false - opt = qtensor.optimisation.TreeTrimSplitter( - tw_bias=0, max_tw=M, base_ordering=opt, - peo_after_slice_strategy=after_slice - ) - - peo, par_vars, _ = opt.optimize(tn) - # --dbg - import networkx as nx - graph = tn.get_line_graph() - ignore_vars = tn.bra_vars + tn.ket_vars - for pv in par_vars: - graph.remove_node(int(pv)) - components = list(nx.connected_components(graph)) - print(f"Sliced graph # nodes: {graph.number_of_nodes()} and #components: {len(components)} with sizes {[len(c) for c in components]}") - print(f"peo size without par_vars and ignore_vars: {len(peo) - len(par_vars) - len(ignore_vars)}") - - print() - # -- - else: - peo, _ = opt.optimize(tn) - par_vars = [] - print("W", opt.treewidth) - # -- qtensor_estim - prep_data = (peo, par_vars, tn) - write_preps(prep_data, out_file) - - -def estimate(in_file, out_file, C=100, M=30, F=1e12, T=1e9, **kwargs): - """ - Arguments: - in_file: file with preprocessed data - out_file: file to write the results to - C: Compression ratio - M: Memory limit in log2(b/16) - F: assumed FLOPS - T: Throughput of compression - """ - from qtensor.compression.cost_estimation import compressed_contraction_cost, Cost - from dataclasses import asdict - import json - prep_data = read_preps(in_file) - peo, par_vars, tn = prep_data - - tn.slice({i: slice(0, 1) for i in par_vars}) - peo = peo[:len(peo) - len(par_vars)] - costs: list[Cost] = compressed_contraction_cost(tn, peo, mem_limit=M, compression_ratio=C) - totals: Cost = sum(costs[1:], costs[0]) - time = totals.time(F, T, T, M) - C = asdict(totals) - C['time'] = time*2**len(par_vars) - C['slices'] = 2**len(par_vars) - print("C", C) - out_file += ".json" - write_json(C, out_file) - return out_file - -def simulate(in_file, out_file, - backend='einsum', - compress=None, - M=29, - r2r_error=1e-3, r2r_threshold=1e-3, - **kwargs): - """ - Args: - in_file: file with preprocessed data - out_file: file to write the results to - backend: backend to use - compress: compression algorithm - M: memory threshold for compression - r2r_error: relative error for compression - r2r_threshold: relative threshold for compression - """ - import time - from qtensor.contraction_algos import bucket_elimination - from qtensor.compression.Compressor import CUSZCompressor, CUSZXCompressor, TorchCompressor, NEWSZCompressor - from qtensor.compression.Compressor import WriteToDiskCompressor - import cupy - cupy.cuda.profiler.start() - prep_data = read_preps(in_file) - peo, par_vars, tn = prep_data - - backend = qtensor.contraction_backends.get_backend(backend) - if compress is not None: - if compress == 'szx': - print(f"{r2r_error=} {r2r_threshold=}") - compressor = CUSZXCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) - compressor = qtensor.compression.ProfileCompressor(compressor) - elif compress == 'cusz': - print(f"{r2r_error=} {r2r_threshold=}") - compressor = CUSZCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) - compressor = qtensor.compression.ProfileCompressor(compressor) - elif compress == 'torch': - print(f"{r2r_error=} {r2r_threshold=}") - compressor = TorchCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) - compressor = qtensor.compression.ProfileCompressor(compressor) - elif compress == 'newsz': - print(f"{r2r_error=} {r2r_threshold=}") - compressor = NEWSZCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) - compressor = qtensor.compression.ProfileCompressor(compressor) - elif compress == 'disk': - compressor = 
WriteToDiskCompressor(f'/grand/QTensor/compression/data/tensors_compressed_M{M}/') - compressor = qtensor.compression.ProfileCompressor(compressor) - else: - raise ValueError(f"Unknown compression algorithm: {compress}") - backend = qtensor.contraction_backends.CompressionBackend(backend, compressor, M) - from qtensor.contraction_backends.performance_measurement_decorator import MemProfBackend - backend = MemProfBackend(backend) - - relabelid = {} - for tensor in tn.tensors: - for i in tensor.indices: - relabelid[int(i)] = i - - slice_ext = {relabelid[int(i)]: 0 for i in par_vars} - - if len(par_vars) > 0: - print("Parvars", par_vars) - print(f"Detected {len(par_vars)} slice variables") - sim = qtensor.QtreeSimulator(backend=backend) - sim.tn = tn - sim.tn.backend = backend - sim.peo = peo - sim._slice_relabel_buckets(slice_ext) - buckets = sim.tn.buckets - # --dbg - #ignore_vars = sim.tn.bra_vars + sim.tn.ket_vars - #graph = qtree.graph_model.importers.buckets2graph(buckets, ignore_vars) - #graph, label_dict = qtree.graph_model.relabel_graph_nodes( - #graph, dict(zip(graph.nodes, np.array(list(graph.nodes)) - 127*2)) - #) - #import networkx as nx - #components = list(nx.connected_components(graph)) - #print(f"Sliced graph # nodes: {graph.number_of_nodes()} and #components: {len(components)} with sizes {[len(c) for c in components]}") - #print(f"peo size without par_vars and ignore_vars: {len(peo) - len(ignore_vars)}") - # -- - - start = time.time() - for i in range(2**0): - print(f"P {i}", end='', flush=True) - bcopy = [b[:] for b in buckets] - res = bucket_elimination( - bcopy, backend, - n_var_nosum=len(tn.free_vars) - ) - del bcopy - print("Result", res.data.flatten()[0]) - time.sleep(0.5) - sim_result = backend.get_result_data(res).flatten()[0] - print("Simulation result:", sim_result) - end = time.time() - print("Elapsed", end - start) - out_file += ".json" - C = {'time': 2**len(par_vars)*(end - start)} - C['elapsed'] = (end - start) - C['memory'] = backend.max_mem - C['memory_history'] = backend.mem_history - C['nvmemory'] = backend.nvsmi_max_mem - C['result'] = { - "Re": np.real(sim_result).tolist(), - "Im": np.imag(sim_result).tolist() - } - if compress is not None: - if isinstance(compressor, qtensor.compression.ProfileCompressor): - C['compression'] = compressor.get_profile_data_json() - - write_json(C, out_file) - cupy.cuda.profiler.stop() - return out_file +import qtensor +import qtree +import numpy as np + +# -- QAOA generic parser + +class QAOAComposer(qtensor.DefaultQAOAComposer): + def __init__(self, N, terms, **kwargs): + self.n_qubits = N + # from ccomp (Can't call DefaultQAOA Composer since need graph) + self.builder = self._get_builder() + # gamma and beta + self.params = kwargs + # + self.terms = terms + self.qubit_map = {n: i for i, n in enumerate(range(N))} + + def cost_operator_circuit(self, gamma): + for factor, term in self.terms: + t_mapped = [self.qubit_map[i] for i in term] + self.append_Z_term(term, gamma) + + def append_Z_term(self, term, gamma): + if len(term) == 2: + self.apply_gate(self.operators.ZZ, term[0], term[1], alpha=2*gamma) + #self.apply_gate(qtensor.OpFactory.ZZFull, term[0], term[1], alpha=2*gamma) + elif len(term) == 4: + self.apply_gate(self.operators.Z4, *term, alpha=2*gamma) + else: + raise ValueError(f"Invalid QAOA term length: {len(term)}") + + def mixer_operator(self, beta): + qubits = self.qubit_map.values() + for qubit in qubits: + self.x_term(qubit, beta) + +def parse_qaoa(data): + import json + data = json.loads(data) + terms = 
data["terms"] + gamma = np.array(data["gamma"])/np.pi/2 + beta = np.array(data["beta"])/np.pi + N = len(set(sum([t[1] for t in terms], []))) + composer = QAOAComposer(N, terms, gamma=gamma, beta=beta) + composer.ansatz_state() + return composer.circuit +# -- + +def read_circ(circ_f, type=None): + + if type is None: + type = circ_f.path.name.split(".")[-1] + + print("Reading circuit of type", type) + if type == "jsonterms": + b = circ_f.f.read() + return parse_qaoa(b) + + elif type == "qasm": + from qiskit import QuantumCircuit + b = circ_f.f.read() + str = b.decode('utf-8') + + qiskit_circuit = QuantumCircuit.from_qasm_str(str) + return qtree.operators.from_qiskit_circuit(qiskit_circuit) + else: + b = circ_f.f.read() + str = b.decode('utf-8') + import io + f = io.StringIO(str) + N, circ = qtree.operators.read_circuit_stream(f) + return sum(circ, []) + +def read_preps(prep_f): + import pickle + return pickle.load(prep_f.f) + +def write_preps(peo, prep_f): + import pickle + pickle.dump(peo, open(prep_f, 'wb')) + +def write_json(data, out_file): + import json + with open(out_file, 'w') as f: + json.dump(data, f) + # This newline plays nice when cat-ing multiple files + f.write('\n') + +def preprocess(in_file, out_file, O='greedy', S=None, M=30, after_slice='run-again'): + """ + Arguments: + in_file: input file + out_file: output file + O: ordering algorithm + S: slicing algorithm + M: Memory limit for slicing + """ + circ = read_circ(in_file) + tn = qtensor.optimisation.QtreeTensorNet.from_qtree_gates(circ) + opt = qtensor.toolbox.get_ordering_algo(O) + if S: + # ignore argument type mismatch for pyright -- opt can be `Optimizer` + # pyright: reportGeneralTypeIssues=false + opt = qtensor.optimisation.TreeTrimSplitter( + tw_bias=0, max_tw=M, base_ordering=opt, + peo_after_slice_strategy=after_slice + ) + + peo, par_vars, _ = opt.optimize(tn) + # --dbg + import networkx as nx + graph = tn.get_line_graph() + ignore_vars = tn.bra_vars + tn.ket_vars + for pv in par_vars: + graph.remove_node(int(pv)) + components = list(nx.connected_components(graph)) + print(f"Sliced graph # nodes: {graph.number_of_nodes()} and #components: {len(components)} with sizes {[len(c) for c in components]}") + print(f"peo size without par_vars and ignore_vars: {len(peo) - len(par_vars) - len(ignore_vars)}") + + print() + # -- + else: + peo, _ = opt.optimize(tn) + par_vars = [] + print("W", opt.treewidth) + # -- qtensor_estim + prep_data = (peo, par_vars, tn) + write_preps(prep_data, out_file) + + +def estimate(in_file, out_file, C=100, M=30, F=1e12, T=1e9, **kwargs): + """ + Arguments: + in_file: file with preprocessed data + out_file: file to write the results to + C: Compression ratio + M: Memory limit in log2(b/16) + F: assumed FLOPS + T: Throughput of compression + """ + from qtensor.compression.cost_estimation import compressed_contraction_cost, Cost + from dataclasses import asdict + import json + prep_data = read_preps(in_file) + peo, par_vars, tn = prep_data + + tn.slice({i: slice(0, 1) for i in par_vars}) + peo = peo[:len(peo) - len(par_vars)] + costs: list[Cost] = compressed_contraction_cost(tn, peo, mem_limit=M, compression_ratio=C) + totals: Cost = sum(costs[1:], costs[0]) + time = totals.time(F, T, T, M) + C = asdict(totals) + C['time'] = time*2**len(par_vars) + C['slices'] = 2**len(par_vars) + print("C", C) + out_file += ".json" + write_json(C, out_file) + return out_file + +def simulate(in_file, out_file, + backend='einsum', + compress=None, + M=29, + r2r_error=1e-3, r2r_threshold=1e-3, + **kwargs): + """ 
+ Args: + in_file: file with preprocessed data + out_file: file to write the results to + backend: backend to use + compress: compression algorithm + M: memory threshold for compression + r2r_error: relative error for compression + r2r_threshold: relative threshold for compression + """ + import time + from qtensor.contraction_algos import bucket_elimination + from qtensor.compression.Compressor import CUSZCompressor, CUSZXCompressor, TorchCompressor, NEWSZCompressor + from qtensor.compression.Compressor import WriteToDiskCompressor + import cupy + cupy.cuda.profiler.start() + prep_data = read_preps(in_file) + peo, par_vars, tn = prep_data + + backend = qtensor.contraction_backends.get_backend(backend) + if compress is not None: + if compress == 'szx': + print(f"{r2r_error=} {r2r_threshold=}") + compressor = CUSZXCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) + compressor = qtensor.compression.ProfileCompressor(compressor) + elif compress == 'cusz': + print(f"{r2r_error=} {r2r_threshold=}") + compressor = CUSZCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) + compressor = qtensor.compression.ProfileCompressor(compressor) + elif compress == 'torch': + print(f"{r2r_error=} {r2r_threshold=}") + compressor = TorchCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) + compressor = qtensor.compression.ProfileCompressor(compressor) + elif compress == 'newsz': + print(f"{r2r_error=} {r2r_threshold=}") + compressor = NEWSZCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) + compressor = qtensor.compression.ProfileCompressor(compressor) + elif compress == 'disk': + compressor = WriteToDiskCompressor(f'/grand/QTensor/compression/data/tensors_compressed_M{M}/') + compressor = qtensor.compression.ProfileCompressor(compressor) + else: + raise ValueError(f"Unknown compression algorithm: {compress}") + backend = qtensor.contraction_backends.CompressionBackend(backend, compressor, M) + from qtensor.contraction_backends.performance_measurement_decorator import MemProfBackend + backend = MemProfBackend(backend) + + relabelid = {} + for tensor in tn.tensors: + for i in tensor.indices: + relabelid[int(i)] = i + + slice_ext = {relabelid[int(i)]: 0 for i in par_vars} + + if len(par_vars) > 0: + print("Parvars", par_vars) + print(f"Detected {len(par_vars)} slice variables") + sim = qtensor.QtreeSimulator(backend=backend) + sim.tn = tn + sim.tn.backend = backend + sim.peo = peo + sim._slice_relabel_buckets(slice_ext) + buckets = sim.tn.buckets + # --dbg + #ignore_vars = sim.tn.bra_vars + sim.tn.ket_vars + #graph = qtree.graph_model.importers.buckets2graph(buckets, ignore_vars) + #graph, label_dict = qtree.graph_model.relabel_graph_nodes( + #graph, dict(zip(graph.nodes, np.array(list(graph.nodes)) - 127*2)) + #) + #import networkx as nx + #components = list(nx.connected_components(graph)) + #print(f"Sliced graph # nodes: {graph.number_of_nodes()} and #components: {len(components)} with sizes {[len(c) for c in components]}") + #print(f"peo size without par_vars and ignore_vars: {len(peo) - len(ignore_vars)}") + # -- + + start = time.time() + for i in range(2**0): + print(f"P {i}", end='', flush=True) + bcopy = [b[:] for b in buckets] + res = bucket_elimination( + bcopy, backend, + n_var_nosum=len(tn.free_vars) + ) + del bcopy + print("Result", res.data.flatten()[0]) + time.sleep(0.5) + sim_result = backend.get_result_data(res).flatten()[0] + print("Simulation result:", sim_result) + end = time.time() + print("Elapsed", end - start) + out_file += ".json" + C = {'time': 
2**len(par_vars)*(end - start)} + C['elapsed'] = (end - start) + C['memory'] = backend.max_mem + C['memory_history'] = backend.mem_history + C['nvmemory'] = backend.nvsmi_max_mem + C['result'] = { + "Re": np.real(sim_result).tolist(), + "Im": np.imag(sim_result).tolist() + } + if compress is not None: + if isinstance(compressor, qtensor.compression.ProfileCompressor): + C['compression'] = compressor.get_profile_data_json() + + write_json(C, out_file) + cupy.cuda.profiler.stop() + return out_file diff --git a/qtensor/compression/CompressedTensor.py b/qtensor/compression/CompressedTensor.py index 08a0c390..3f9181d2 100644 --- a/qtensor/compression/CompressedTensor.py +++ b/qtensor/compression/CompressedTensor.py @@ -1,154 +1,154 @@ -import itertools -import numpy as np -from qtree.optimizer import Tensor -from qtree.system_defs import NP_ARRAY_TYPE -from .Compressor import NumpyCompressor, Compressor - -def iterate_indices(indices: list): - if len(indices)==0: - return [tuple()] - ranges = [range(v.size) for v in indices] - return itertools.product(*ranges) - - -class CompressedTensor(Tensor): - """ - Extension of the Tensor class that holds compressed data - - The data array is split along several indices S into 2^|S| parts - - """ - def __init__(self, name, indices, - data_key=None, data=None, - slice_indices=[], - compressor:Compressor=NumpyCompressor() - ): - """ - Initialize the tensor - name: str, - the name of the tensor. Used only for display/convenience. - May be not unique. - indices: tuple, - Indices of the tensor - shape: tuple, - shape of a tensor - data_key: int - Key to find tensor's data in the global storage - data: np.array - Actual data of the tensor. Default None. - Usually is not supplied at initialization. - slice_indices: list[Var] - indices along which the tensor is split into chunks - """ - super().__init__(name, indices, data_key=data_key, data=data) - self.slice_indices = slice_indices - self.compressor = compressor - if data is not None: - self._dtype = data.dtype - else: - self._dtype = None - - @classmethod - def empty(cls, name, indices, slice_indices=[], compressor=NumpyCompressor(), dtype:type=NP_ARRAY_TYPE): - t = super().empty(name, indices, dtype) - t.compressor = compressor - if slice_indices: - t.compress_indices(slice_indices) - return t - - def compress_indices(self, indices: list): - """ - Slice the self.data along dimensions in `indices`, - store them compressed - - Does not support compressing when already compressed - """ - slice_dict = { - i: slice(None) for i in self.indices - } - data_chunks = [] - for ivals in iterate_indices(indices): - for ix, ival in zip(indices, ivals): - slice_dict[ix] = ival# slice(ival, ival+1) - dslice = self.data[tuple(slice_dict[i] for i in self.indices)] - - data_chunks.append( - self.compressor.compress(dslice) - ) - del dslice - self._data = data_chunks - self.slice_indices = indices - - @property - def dtype(self): - """ - DataType of wrapped chunks. 
- """ - return self._dtype - - @property - def array_indices(self): - return [x for x in self.indices if x not in self.slice_indices] - - def get_chunk(self, ivals): - dims = [v.size for v in self.slice_indices] - if len(ivals)==0: - flat_ix = 0 - else: - flat_ix = np.ravel_multi_index(ivals, dims) - ptr = self._data[flat_ix] - return self.compressor.decompress(ptr) - - def set_chunk(self, ivals, chunk: np.ndarray): - # -- Check for consistent data types between chunks - if self._dtype is None: - self._dtype = chunk.dtype - else: - assert self.dtype == chunk.dtype, f"Chunk dtype {chunk.dtype} does not match tensor dtype {self.dtype}" - # -- - - if self._data is None: - self._data = np.empty(2**len(self.slice_indices), dtype=object) - dims = [v.size for v in self.slice_indices] - if len(ivals)==0: - flat_ix = 0 - else: - flat_ix = np.ravel_multi_index(ivals, dims) - self._data[flat_ix] = self.compressor.compress(chunk) - - def __getitem__(self, key): - """ - Get a slice of the tensor along the indices in `key` - Currently slicing over all compressed indices is required. - Slices over compressed indices must be ints - """ - slices_ints, new_indices = self._parse_getitem_key(key) - slice_dict = {} - chunk_slices_ints = [] - compression_ints = [] - for ix, ival in zip(self.indices, slices_ints): - slice_dict[ix] = ival - if ix in self.slice_indices: - compression_ints.append(ival) - else: - chunk_slices_ints.append(ival) - chunk = self.get_chunk(compression_ints) - new_name = f"{self.name}[sliced]" - # careful: chunk will not be collected even if slice is small - chunk_slice = chunk[tuple(chunk_slices_ints)] - return Tensor(new_name, new_indices, data=chunk_slice) - - - def __str__(self): - array_ix = ','.join(map(str, self.array_indices)) - split_ix= ','.join(map(str, self.slice_indices)) - return f'{self._name}{{{split_ix}}}({array_ix})' - - def copy(self, name=None, indices=None, data_key=None, data=None): - raise NotImplementedError() - - def __repr__(self): - return self.__str__() - - - +import itertools +import numpy as np +from qtree.optimizer import Tensor +from qtree.system_defs import NP_ARRAY_TYPE +from .Compressor import NumpyCompressor, Compressor + +def iterate_indices(indices: list): + if len(indices)==0: + return [tuple()] + ranges = [range(v.size) for v in indices] + return itertools.product(*ranges) + + +class CompressedTensor(Tensor): + """ + Extension of the Tensor class that holds compressed data + + The data array is split along several indices S into 2^|S| parts + + """ + def __init__(self, name, indices, + data_key=None, data=None, + slice_indices=[], + compressor:Compressor=NumpyCompressor() + ): + """ + Initialize the tensor + name: str, + the name of the tensor. Used only for display/convenience. + May be not unique. + indices: tuple, + Indices of the tensor + shape: tuple, + shape of a tensor + data_key: int + Key to find tensor's data in the global storage + data: np.array + Actual data of the tensor. Default None. + Usually is not supplied at initialization. 
+ slice_indices: list[Var] + indices along which the tensor is split into chunks + """ + super().__init__(name, indices, data_key=data_key, data=data) + self.slice_indices = slice_indices + self.compressor = compressor + if data is not None: + self._dtype = data.dtype + else: + self._dtype = None + + @classmethod + def empty(cls, name, indices, slice_indices=[], compressor=NumpyCompressor(), dtype:type=NP_ARRAY_TYPE): + t = super().empty(name, indices, dtype) + t.compressor = compressor + if slice_indices: + t.compress_indices(slice_indices) + return t + + def compress_indices(self, indices: list): + """ + Slice the self.data along dimensions in `indices`, + store them compressed + + Does not support compressing when already compressed + """ + slice_dict = { + i: slice(None) for i in self.indices + } + data_chunks = [] + for ivals in iterate_indices(indices): + for ix, ival in zip(indices, ivals): + slice_dict[ix] = ival# slice(ival, ival+1) + dslice = self.data[tuple(slice_dict[i] for i in self.indices)] + + data_chunks.append( + self.compressor.compress(dslice) + ) + del dslice + self._data = data_chunks + self.slice_indices = indices + + @property + def dtype(self): + """ + DataType of wrapped chunks. + """ + return self._dtype + + @property + def array_indices(self): + return [x for x in self.indices if x not in self.slice_indices] + + def get_chunk(self, ivals): + dims = [v.size for v in self.slice_indices] + if len(ivals)==0: + flat_ix = 0 + else: + flat_ix = np.ravel_multi_index(ivals, dims) + ptr = self._data[flat_ix] + return self.compressor.decompress(ptr) + + def set_chunk(self, ivals, chunk: np.ndarray): + # -- Check for consistent data types between chunks + if self._dtype is None: + self._dtype = chunk.dtype + else: + assert self.dtype == chunk.dtype, f"Chunk dtype {chunk.dtype} does not match tensor dtype {self.dtype}" + # -- + + if self._data is None: + self._data = np.empty(2**len(self.slice_indices), dtype=object) + dims = [v.size for v in self.slice_indices] + if len(ivals)==0: + flat_ix = 0 + else: + flat_ix = np.ravel_multi_index(ivals, dims) + self._data[flat_ix] = self.compressor.compress(chunk) + + def __getitem__(self, key): + """ + Get a slice of the tensor along the indices in `key` + Currently slicing over all compressed indices is required. 
+ Slices over compressed indices must be ints + """ + slices_ints, new_indices = self._parse_getitem_key(key) + slice_dict = {} + chunk_slices_ints = [] + compression_ints = [] + for ix, ival in zip(self.indices, slices_ints): + slice_dict[ix] = ival + if ix in self.slice_indices: + compression_ints.append(ival) + else: + chunk_slices_ints.append(ival) + chunk = self.get_chunk(compression_ints) + new_name = f"{self.name}[sliced]" + # careful: chunk will not be collected even if slice is small + chunk_slice = chunk[tuple(chunk_slices_ints)] + return Tensor(new_name, new_indices, data=chunk_slice) + + + def __str__(self): + array_ix = ','.join(map(str, self.array_indices)) + split_ix= ','.join(map(str, self.slice_indices)) + return f'{self._name}{{{split_ix}}}({array_ix})' + + def copy(self, name=None, indices=None, data_key=None, data=None): + raise NotImplementedError() + + def __repr__(self): + return self.__str__() + + + diff --git a/qtensor/compression/Compressor.py b/qtensor/compression/Compressor.py index 02a5b4da..ea342c25 100644 --- a/qtensor/compression/Compressor.py +++ b/qtensor/compression/Compressor.py @@ -1,582 +1,582 @@ -import io -import sys -import numpy as np -from pathlib import Path -print(Path(__file__).parent/'szx/src/') -sys.path.append(str(Path(__file__).parent/'szx/src/')) -sys.path.append('./szx/src') -# sys.path.append(str(Path(__file__).parent/'szp/src/')) -# sys.path.append('./szp/src') - -sys.path.append(str(Path(__file__).parent/'cusz/src')) -sys.path.append('./cusz/src') -sys.path.append(str(Path(__file__).parent/'torch_quant')) -sys.path.append('./torch_quant') -sys.path.append(str(Path(__file__).parent/'newsz')) -sys.path.append('./newsz') - - -import torch -try: - from cuszx_wrapper import cuszx_host_compress, cuszx_host_decompress, cuszx_device_compress, cuszx_device_decompress - # from cuSZp_wrapper import cuszp_device_compress, cuszp_device_decompress - from cusz_wrapper import cusz_device_compress, cusz_device_decompress - from torch_quant_perchannel import quant_device_compress, quant_device_decompress - from newsz_wrapper import newsz_device_compress, newsz_device_decompress -except: - print("import failed") - # Silently fail on missing build of cuszx - pass - -CUSZX_BLOCKSIZE = 256 - -# -- helper functions - -def _get_data_info(data): - import cupy - if isinstance(data, cupy.ndarray): - isCuPy = True - else: - isCuPy = False - num_elements = data.size - # Adapt numele depending on itemsize - itemsize = data.dtype.itemsize - num_elements_eff = int(num_elements*itemsize/4) - return isCuPy, num_elements_eff - -# -- Compressor classes - -class Compressor(): - def compress(self, data): - raise NotImplementedError - - def decompress(self, ptr): - raise NotImplementedError - - def compress_size(self, ptr): - return ptr.nbytes - -# -- Debugging and profiling - -import time -from dataclasses import dataclass, asdict -@dataclass -class CompressMeasure: - time: float = 0 - size_in: int = 0 - size_out: int = 0 - label: str = '' - - def __str__(self): - compress_ratio = self.size_in / self.size_out - return (f'Measure: {self.time:.3f}s, ' - f'{self.size_in/1024**2:.2f}MB -> {self.size_out/1024**2:.2f}MB ({compress_ratio:.3f} in/out ratio)' - ) - -class ProfileCompressor(Compressor): - def __init__(self, compressor:Compressor, trace=True): - self.trace = trace - self.compressor = compressor - self.profile_data = {'compress': [], 'decompress': []} - - def compress(self, data): - start = time.time() - ptr = self.compressor.compress(data) - end = time.time() - 
out_size = self.compressor.compress_size(ptr) - cmeasure = CompressMeasure(end-start, data.nbytes, out_size) - self.profile_data['compress'].append(cmeasure) - if self.trace: - print(f'Compress: {cmeasure}') - return ptr - - def decompress(self, ptr): - start = time.time() - data = self.compressor.decompress(ptr) - end = time.time() - in_size = self.compressor.compress_size(ptr) - dmeasure = CompressMeasure(end-start, in_size, data.nbytes) - self.profile_data['decompress'].append(dmeasure) - if self.trace: - print(f'Decompress: {dmeasure}') - return data - - def get_profile_data(self): - return self.profile_data['compress'], self.profile_data['decompress'] - - def get_profile_data_json(self): - compress, decompress = self.get_profile_data() - return { - 'compress': [asdict(c) for c in compress], - 'decompress': [asdict(c) for c in decompress], - } - - def get_profile_stats(self): - compress, decompress = self.get_profile_data() - compress_time = sum([x.time for x in compress]) - decompress_time = sum([x.time for x in decompress]) - compress_ratios = np.mean([x.size_in/x.size_out for x in compress]) - compress_size = sum([x.size_out for x in compress]) - return compress_time, decompress_time, compress_size, compress_ratios -# -- - -class NumpyCompressor(Compressor): - def compress(self, data): - comp = io.BytesIO() - np.savez_compressed(comp, data) - return comp - - def compress_size(self, ptr): - return ptr.getbuffer().nbytes - - def decompress(self, ptr): - ptr.seek(0) - return np.load(ptr)['arr_0'] - -class TorchCompressor(Compressor): - def __init__(self, r2r_error=1e-3, r2r_threshold=1e-3): - self.r2r_error = r2r_error - self.r2r_threshold = r2r_threshold - self.decompressed_own = [] - - def free_decompressed(self): - import cupy - print("Cleanup", len(self.decompressed_own)) - for x in self.decompressed_own: - del x - cupy.get_default_memory_pool().free_all_blocks() - cupy.get_default_pinned_memory_pool().free_all_blocks() - torch.cuda.empty_cache() - self.decompressed_own = [] - - def free_compressed(self, ptr): - import ctypes, cupy - cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = ptr - p_decompressed_ptr = ctypes.addressof(cmp_bytes[0]) - # cast to int64 pointer - # (effectively converting pointer to pointer to addr to pointer to int64) - p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) - decompressed_int = p_decompressed_int.contents - cupy.cuda.runtime.free(decompressed_int.value) - - def compress(self, data): - isCupy, num_elements_eff = _get_data_info(data) - dtype = data.dtype - cmp_bytes, outSize_ptr = self.cuszx_compress(isCuPy, data, num_elements_eff, self.r2r_error, self.r2r_threshold) - return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr) - - # return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr.contents.value) - - def compress_size(self, ptr): - return ptr[5] - - def decompress(self, obj): - import cupy - import ctypes - cmp_bytes, num_elements_eff, isCuPy, shape, dtype, cmpsize = obj - decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, cmpsize, num_elements_eff, self, dtype) - arr_cp = decompressed_ptr[0] - - arr = cupy.reshape(arr_cp, shape) - self.decompressed_own.append(arr) - return arr - - ### Compression API with cuSZx ### - # Parameters: - # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array - # - data = Numpy or Cupy ndarray, assumed to be 1-D, np.float32 type - # - num_elements = Number of floating point elements in data - # - r2r_error = 
relative-to-value-range error bound for lossy compression - # - r2r_threshold = relative-to-value-range threshold to floor values to zero - # Returns: - # - cmp_bytes = Unsigned char pointer to compressed bytes - # - outSize_ptr = Pointer to size_t representing length in bytes of cmp_bytes - def cuszx_compress(self, isCuPy, data, num_elements, r2r_error, r2r_threshold): - - if not isCuPy: - cmp_bytes, outSize_ptr = cuszx_host_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) - else: - #cmp_bytes, outSize_ptr = cuszp_device_compress(data, r2r_error, num_elements, r2r_threshold) - - cmp_bytes, outSize_ptr = quant_device_compress(data, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) - del data - torch.cuda.empty_cache() - return cmp_bytes, outSize_ptr - - ### Decompression API with cuSZx ### - # Parameters: - # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array - # - cmp_bytes = Unsigned char pointer to compressed bytes - # - num_elements = Number of floating point elements in original data - # Returns: - # - decompressed_data = Float32 pointer to decompressed data - # - # Notes: Use ctypes to cast decompressed data to Numpy or CuPy type - - def cuszx_decompress(self, isCuPy, cmp_bytes, cmpsize, num_elements, owner, dtype): - if not isCuPy: - decompressed_data = cuszx_host_decompress(num_elements, cmp_bytes) - else: - #decompressed_data = cuszp_device_decompress(num_elements, cmp_bytes, cmpsize, owner,dtype) -# oriData, absErrBound, nbEle, blockSize,threshold - decompressed_data = quant_device_decompress(num_elements, cmp_bytes, owner,dtype) - return decompressed_data - -class NEWSZCompressor(Compressor): - def __init__(self, r2r_error=1e-3, r2r_threshold=1e-3): - self.r2r_error = r2r_error - self.r2r_threshold = r2r_threshold - self.decompressed_own = [] - - def free_decompressed(self): - import cupy - print("Cleanup", len(self.decompressed_own)) - for x in self.decompressed_own: - #print(x) - #if x == None: - # continue - #else: - #print("CUDA Free", x) - cupy.cuda.runtime.free(x) - # del x - # cupy.get_default_memory_pool().free_all_blocks() - # cupy.get_default_pinned_memory_pool().free_all_blocks() - # torch.cuda.empty_cache() - self.decompressed_own = [] - - def free_compressed(self, ptr): - import ctypes, cupy - cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = ptr - p_decompressed_ptr = ctypes.addressof(cmp_bytes[0]) - # cast to int64 pointer - # (effectively converting pointer to pointer to addr to pointer to int64) - p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) - decompressed_int = p_decompressed_int.contents - cupy.cuda.runtime.free(decompressed_int.value) - - def compress(self, data): - isCuPy, num_elements_eff = _get_data_info(data) - dtype = data.dtype - cmp_bytes, outSize_ptr = self.cuszx_compress(isCuPy, data, num_elements_eff, self.r2r_error, self.r2r_threshold) - # return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr) - - return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr.contents.value) - - def compress_size(self, ptr): - return ptr[5] - - def decompress(self, obj): - import cupy - import ctypes - cmp_bytes, num_elements_eff, isCuPy, shape, dtype, cmpsize = obj - decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, cmpsize, num_elements_eff, self, dtype) - arr_cp = decompressed_ptr[0] - self.decompressed_own.append(decompressed_ptr[1]) - - # -- Workaround to convert GPU pointer to int - # p_decompressed_ptr = 
ctypes.addressof(decompressed_ptr) - # # cast to int64 pointer - # # (effectively converting pointer to pointer to addr to pointer to int64) - # p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) - # decompressed_int = p_decompressed_int.contents - # # -- - # self.decompressed_own.append(decompressed_int.value) - # mem = cupy.cuda.UnownedMemory(decompressed_int.value, num_elements_eff, self, device_id=0) - # mem_ptr = cupy.cuda.memory.MemoryPointer(mem, 0) - arr = cupy.reshape(arr_cp, shape) - # self.decompressed_own.append(arr) - # arr = cupy.ndarray(shape, dtype=dtype, memptr=mem_ptr) - return arr - - ### Compression API with cuSZx ### - # Parameters: - # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array - # - data = Numpy or Cupy ndarray, assumed to be 1-D, np.float32 type - # - num_elements = Number of floating point elements in data - # - r2r_error = relative-to-value-range error bound for lossy compression - # - r2r_threshold = relative-to-value-range threshold to floor values to zero - # Returns: - # - cmp_bytes = Unsigned char pointer to compressed bytes - # - outSize_ptr = Pointer to size_t representing length in bytes of cmp_bytes - def cuszx_compress(self, isCuPy, data, num_elements, r2r_error, r2r_threshold): - - if not isCuPy: - cmp_bytes, outSize_ptr = cuszx_host_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) - else: - print('Before compress') - cmp_bytes, outSize_ptr = newsz_device_compress(data, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) - print('After compress') - del data - torch.cuda.empty_cache() - return cmp_bytes, outSize_ptr - - ### Decompression API with cuSZx ### - # Parameters: - # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array - # - cmp_bytes = Unsigned char pointer to compressed bytes - # - num_elements = Number of floating point elements in original data - # Returns: - # - decompressed_data = Float32 pointer to decompressed data - # - # Notes: Use ctypes to cast decompressed data to Numpy or CuPy type - - def cuszx_decompress(self, isCuPy, cmp_bytes, cmpsize, num_elements, owner, dtype): - if not isCuPy: - decompressed_data = cuszx_host_decompress(num_elements, cmp_bytes) - else: - # cuszx_device_decompress(nbEle, cmpBytes, owner, dtype) - decompressed_data = newsz_device_decompress(num_elements, cmp_bytes, owner,dtype) -# oriData, absErrBound, nbEle, blockSize,threshold - # decompressed_data = quant_device_decompress(num_elements, cmp_bytes, owner,dtype) - return decompressed_data - -class CUSZXCompressor(Compressor): - def __init__(self, r2r_error=1e-3, r2r_threshold=1e-3): - self.r2r_error = r2r_error - self.r2r_threshold = r2r_threshold - self.decompressed_own = [] - - def free_decompressed(self): - import cupy - print("Cleanup", len(self.decompressed_own)) - for x in self.decompressed_own: - #print(x) - #if x == None: - # continue - #else: - #print("CUDA Free", x) - cupy.cuda.runtime.free(x) - # del x - # cupy.get_default_memory_pool().free_all_blocks() - # cupy.get_default_pinned_memory_pool().free_all_blocks() - # torch.cuda.empty_cache() - self.decompressed_own = [] - - def free_compressed(self, ptr): - import ctypes, cupy - cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = ptr - p_decompressed_ptr = ctypes.addressof(cmp_bytes[0]) - # cast to int64 pointer - # (effectively converting pointer to pointer to addr to pointer to int64) - p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) - decompressed_int = 
p_decompressed_int.contents - cupy.cuda.runtime.free(decompressed_int.value) - - def compress(self, data): - isCuPy, num_elements_eff = _get_data_info(data) - dtype = data.dtype - cmp_bytes, outSize_ptr = self.cuszx_compress(isCuPy, data, num_elements_eff, self.r2r_error, self.r2r_threshold) - # return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr) - - return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr.contents.value) - - def compress_size(self, ptr): - return ptr[5] - - def decompress(self, obj): - import cupy - import ctypes - cmp_bytes, num_elements_eff, isCuPy, shape, dtype, cmpsize = obj - decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, cmpsize, num_elements_eff, self, dtype) - arr_cp = decompressed_ptr[0] - self.decompressed_own.append(decompressed_ptr[1]) - - # -- Workaround to convert GPU pointer to int - # p_decompressed_ptr = ctypes.addressof(decompressed_ptr) - # # cast to int64 pointer - # # (effectively converting pointer to pointer to addr to pointer to int64) - # p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) - # decompressed_int = p_decompressed_int.contents - # # -- - # self.decompressed_own.append(decompressed_int.value) - # mem = cupy.cuda.UnownedMemory(decompressed_int.value, num_elements_eff, self, device_id=0) - # mem_ptr = cupy.cuda.memory.MemoryPointer(mem, 0) - arr = cupy.reshape(arr_cp, shape) - # self.decompressed_own.append(arr) - # arr = cupy.ndarray(shape, dtype=dtype, memptr=mem_ptr) - return arr - - def cuszx_compress(self, isCuPy, data, num_elements, r2r_error, r2r_threshold): - """ - ## Compression API with cuSZx ### - Parameters: - - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array - - data = Numpy or Cupy ndarray, assumed to be 1-D, np.float32 type - - num_elements = Number of floating point elements in data - - r2r_error = relative-to-value-range error bound for lossy compression - - r2r_threshold = relative-to-value-range threshold to floor values to zero - Returns: - - cmp_bytes = Unsigned char pointer to compressed bytes - - outSize_ptr = Pointer to size_t representing length in bytes of cmp_bytes - """ - - if not isCuPy: - cmp_bytes, outSize_ptr = cuszx_host_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) - else: - #cmp_bytes, outSize_ptr = cuszp_device_compress(data, r2r_error, num_elements, r2r_threshold) - cmp_bytes, outSize_ptr = cuszx_device_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE,r2r_threshold) - # cmp_bytes, outSize_ptr = quant_device_compress(data, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) - del data - torch.cuda.empty_cache() - return cmp_bytes, outSize_ptr - - - def cuszx_decompress(self, isCuPy, cmp_bytes, cmpsize, num_elements, owner, dtype): - """ - ## Decompression API with cuSZx ### - Parameters: - - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array - - cmp_bytes = Unsigned char pointer to compressed bytes - - num_elements = Number of floating point elements in original data - Returns: - - decompressed_data = Float32 pointer to decompressed data - - Notes: Use ctypes to cast decompressed data to Numpy or CuPy type - """ - if not isCuPy: - decompressed_data = cuszx_host_decompress(num_elements, cmp_bytes) - else: - # cuszx_device_decompress(nbEle, cmpBytes, owner, dtype) - decompressed_data = cuszx_device_decompress(num_elements, cmp_bytes, owner,dtype) -# oriData, absErrBound, nbEle, blockSize,threshold - # decompressed_data = 
quant_device_decompress(num_elements, cmp_bytes, owner,dtype) - return decompressed_data - -class CUSZCompressor(Compressor): - def __init__(self, r2r_error=1e-3, r2r_threshold=1e-3): - self.r2r_error = r2r_error - self.r2r_threshold = r2r_threshold - self.decompressed_own = [] - - def free_decompressed(self): - import cupy - print("Cleanup", len(self.decompressed_own)) - for x in self.decompressed_own: - cupy.cuda.runtime.free(x) - # del x - # cupy.get_default_memory_pool().free_all_blocks() - # cupy.get_default_pinned_memory_pool().free_all_blocks() - # torch.cuda.empty_cache() - self.decompressed_own = [] - - def free_compressed(self, ptr): - import ctypes, cupy - cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = ptr - p_decompressed_ptr = ctypes.addressof(cmp_bytes[0]) - # cast to int64 pointer - # (effectively converting pointer to pointer to addr to pointer to int64) - p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) - decompressed_int = p_decompressed_int.contents - cupy.cuda.runtime.free(decompressed_int.value) - - def compress(self, data): - isCuPy, num_elements_eff = _get_data_info(data) - - dtype = data.dtype - cmp_bytes, outSize_ptr = self.cuszx_compress(isCuPy, data, num_elements_eff, self.r2r_error, self.r2r_threshold) - return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr.contents.value) - - def compress_size(self, ptr): - return ptr[5] - - def decompress(self, obj): - import cupy - import ctypes - cmp_bytes, num_elements_eff, isCuPy, shape, dtype, cmpsize = obj - decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, cmpsize, num_elements_eff, self, dtype) - arr_cp = decompressed_ptr[0] - #self.decompressed_own.append(decompressed_ptr[1]) - - # -- Workaround to convert GPU pointer to int - # p_decompressed_ptr = ctypes.addressof(decompressed_ptr) - # # cast to int64 pointer - # # (effectively converting pointer to pointer to addr to pointer to int64) - # p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) - # decompressed_int = p_decompressed_int.contents - # # -- - # self.decompressed_own.append(decompressed_int.value) - # mem = cupy.cuda.UnownedMemory(decompressed_int.value, num_elements_eff, self, device_id=0) - # mem_ptr = cupy.cuda.memory.MemoryPointer(mem, 0) - arr = cupy.reshape(arr_cp, shape) - self.decompressed_own.append(arr) - # arr = cupy.ndarray(shape, dtype=dtype, memptr=mem_ptr) - return arr - - ### Compression API with cuSZx ### - # Parameters: - # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array - # - data = Numpy or Cupy ndarray, assumed to be 1-D, np.float32 type - # - num_elements = Number of floating point elements in data - # - r2r_error = relative-to-value-range error bound for lossy compression - # - r2r_threshold = relative-to-value-range threshold to floor values to zero - # Returns: - # - cmp_bytes = Unsigned char pointer to compressed bytes - # - outSize_ptr = Pointer to size_t representing length in bytes of cmp_bytes - def cuszx_compress(self, isCuPy, data, num_elements, r2r_error, r2r_threshold): - - if not isCuPy: - cmp_bytes, outSize_ptr = cuszx_host_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) - else: - cmp_bytes, outSize_ptr = cusz_device_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE,r2r_threshold) - # cmp_bytes, outSize_ptr = quant_device_compress(data, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) - del data - torch.cuda.empty_cache() - return cmp_bytes, outSize_ptr - - ### 
Decompression API with cuSZx ### - # Parameters: - # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array - # - cmp_bytes = Unsigned char pointer to compressed bytes - # - num_elements = Number of floating point elements in original data - # Returns: - # - decompressed_data = Float32 pointer to decompressed data - # - # Notes: Use ctypes to cast decompressed data to Numpy or CuPy type - - def cuszx_decompress(self, isCuPy, cmp_bytes, cmpsize, num_elements, owner, dtype): - if not isCuPy: - decompressed_data = cuszx_host_decompress(num_elements, cmp_bytes) - else: - # cuszx_device_decompress(nbEle, cmpBytes, owner, dtype) - decompressed_data = cusz_device_decompress(num_elements, cmp_bytes, owner,dtype) -# oriData, absErrBound, nbEle, blockSize,threshold - # decompressed_data = quant_device_decompress(num_elements, cmp_bytes, owner,dtype) - return decompressed_data - -class WriteToDiskCompressor(Compressor): - def __init__(self, path): - from pathlib import Path - Path(path).mkdir(exist_ok=True, parents=True) - self.path = path - - def _gen_random_filename(self, info): - dtype, shape, isCupy = info - k = np.random.randint(0, 100000000) - s = hex(k)[2:] - return self.path + f'/qtensor_data_{s}_{str(dtype)}.bin' - - def compress(self, data): - import cupy - if isinstance(data, cupy.ndarray): - isCupy=False - else: - isCupy=True - fname = self._gen_random_filename((data.dtype, data.shape, isCupy)) - data.tofile(fname) - return (fname, data.dtype, data.shape, isCupy) - - def compress_size(self, ptr): - return 0.1 - - def decompress(self, obj): - import cupy - fname, dtype, shape, isCupy = obj - if isCupy: - return cupy.fromfile(fname).view(dtype).reshape(shape) - else: - return np.fromfile(fname).view(dtype).reshape(shape) - - def free_compressed(self, ptr): - pass - def free_decompressed(self): - pass +import io +import sys +import numpy as np +from pathlib import Path +print(Path(__file__).parent/'szx/src/') +sys.path.append(str(Path(__file__).parent/'szx/src/')) +sys.path.append('./szx/src') +# sys.path.append(str(Path(__file__).parent/'szp/src/')) +# sys.path.append('./szp/src') + +sys.path.append(str(Path(__file__).parent/'cusz/src')) +sys.path.append('./cusz/src') +sys.path.append(str(Path(__file__).parent/'torch_quant')) +sys.path.append('./torch_quant') +sys.path.append(str(Path(__file__).parent/'newsz')) +sys.path.append('./newsz') + + +import torch +try: + from cuszx_wrapper import cuszx_host_compress, cuszx_host_decompress, cuszx_device_compress, cuszx_device_decompress + # from cuSZp_wrapper import cuszp_device_compress, cuszp_device_decompress + from cusz_wrapper import cusz_device_compress, cusz_device_decompress + from torch_quant_perchannel import quant_device_compress, quant_device_decompress + from newsz_wrapper import newsz_device_compress, newsz_device_decompress +except: + print("import failed") + # Silently fail on missing build of cuszx + pass + +CUSZX_BLOCKSIZE = 256 + +# -- helper functions + +def _get_data_info(data): + import cupy + if isinstance(data, cupy.ndarray): + isCuPy = True + else: + isCuPy = False + num_elements = data.size + # Adapt numele depending on itemsize + itemsize = data.dtype.itemsize + num_elements_eff = int(num_elements*itemsize/4) + return isCuPy, num_elements_eff + +# -- Compressor classes + +class Compressor(): + def compress(self, data): + raise NotImplementedError + + def decompress(self, ptr): + raise NotImplementedError + + def compress_size(self, ptr): + return ptr.nbytes + +# -- Debugging and profiling + +import time 
+from dataclasses import dataclass, asdict +@dataclass +class CompressMeasure: + time: float = 0 + size_in: int = 0 + size_out: int = 0 + label: str = '' + + def __str__(self): + compress_ratio = self.size_in / self.size_out + return (f'Measure: {self.time:.3f}s, ' + f'{self.size_in/1024**2:.2f}MB -> {self.size_out/1024**2:.2f}MB ({compress_ratio:.3f} in/out ratio)' + ) + +class ProfileCompressor(Compressor): + def __init__(self, compressor:Compressor, trace=True): + self.trace = trace + self.compressor = compressor + self.profile_data = {'compress': [], 'decompress': []} + + def compress(self, data): + start = time.time() + ptr = self.compressor.compress(data) + end = time.time() + out_size = self.compressor.compress_size(ptr) + cmeasure = CompressMeasure(end-start, data.nbytes, out_size) + self.profile_data['compress'].append(cmeasure) + if self.trace: + print(f'Compress: {cmeasure}') + return ptr + + def decompress(self, ptr): + start = time.time() + data = self.compressor.decompress(ptr) + end = time.time() + in_size = self.compressor.compress_size(ptr) + dmeasure = CompressMeasure(end-start, in_size, data.nbytes) + self.profile_data['decompress'].append(dmeasure) + if self.trace: + print(f'Decompress: {dmeasure}') + return data + + def get_profile_data(self): + return self.profile_data['compress'], self.profile_data['decompress'] + + def get_profile_data_json(self): + compress, decompress = self.get_profile_data() + return { + 'compress': [asdict(c) for c in compress], + 'decompress': [asdict(c) for c in decompress], + } + + def get_profile_stats(self): + compress, decompress = self.get_profile_data() + compress_time = sum([x.time for x in compress]) + decompress_time = sum([x.time for x in decompress]) + compress_ratios = np.mean([x.size_in/x.size_out for x in compress]) + compress_size = sum([x.size_out for x in compress]) + return compress_time, decompress_time, compress_size, compress_ratios +# -- + +class NumpyCompressor(Compressor): + def compress(self, data): + comp = io.BytesIO() + np.savez_compressed(comp, data) + return comp + + def compress_size(self, ptr): + return ptr.getbuffer().nbytes + + def decompress(self, ptr): + ptr.seek(0) + return np.load(ptr)['arr_0'] + +class TorchCompressor(Compressor): + def __init__(self, r2r_error=1e-3, r2r_threshold=1e-3): + self.r2r_error = r2r_error + self.r2r_threshold = r2r_threshold + self.decompressed_own = [] + + def free_decompressed(self): + import cupy + print("Cleanup", len(self.decompressed_own)) + for x in self.decompressed_own: + del x + cupy.get_default_memory_pool().free_all_blocks() + cupy.get_default_pinned_memory_pool().free_all_blocks() + torch.cuda.empty_cache() + self.decompressed_own = [] + + def free_compressed(self, ptr): + import ctypes, cupy + cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = ptr + p_decompressed_ptr = ctypes.addressof(cmp_bytes[0]) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + decompressed_int = p_decompressed_int.contents + cupy.cuda.runtime.free(decompressed_int.value) + + def compress(self, data): + isCupy, num_elements_eff = _get_data_info(data) + dtype = data.dtype + cmp_bytes, outSize_ptr = self.cuszx_compress(isCuPy, data, num_elements_eff, self.r2r_error, self.r2r_threshold) + return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr) + + # return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, 
outSize_ptr.contents.value) + + def compress_size(self, ptr): + return ptr[5] + + def decompress(self, obj): + import cupy + import ctypes + cmp_bytes, num_elements_eff, isCuPy, shape, dtype, cmpsize = obj + decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, cmpsize, num_elements_eff, self, dtype) + arr_cp = decompressed_ptr[0] + + arr = cupy.reshape(arr_cp, shape) + self.decompressed_own.append(arr) + return arr + + ### Compression API with cuSZx ### + # Parameters: + # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array + # - data = Numpy or Cupy ndarray, assumed to be 1-D, np.float32 type + # - num_elements = Number of floating point elements in data + # - r2r_error = relative-to-value-range error bound for lossy compression + # - r2r_threshold = relative-to-value-range threshold to floor values to zero + # Returns: + # - cmp_bytes = Unsigned char pointer to compressed bytes + # - outSize_ptr = Pointer to size_t representing length in bytes of cmp_bytes + def cuszx_compress(self, isCuPy, data, num_elements, r2r_error, r2r_threshold): + + if not isCuPy: + cmp_bytes, outSize_ptr = cuszx_host_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) + else: + #cmp_bytes, outSize_ptr = cuszp_device_compress(data, r2r_error, num_elements, r2r_threshold) + + cmp_bytes, outSize_ptr = quant_device_compress(data, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) + del data + torch.cuda.empty_cache() + return cmp_bytes, outSize_ptr + + ### Decompression API with cuSZx ### + # Parameters: + # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array + # - cmp_bytes = Unsigned char pointer to compressed bytes + # - num_elements = Number of floating point elements in original data + # Returns: + # - decompressed_data = Float32 pointer to decompressed data + # + # Notes: Use ctypes to cast decompressed data to Numpy or CuPy type + + def cuszx_decompress(self, isCuPy, cmp_bytes, cmpsize, num_elements, owner, dtype): + if not isCuPy: + decompressed_data = cuszx_host_decompress(num_elements, cmp_bytes) + else: + #decompressed_data = cuszp_device_decompress(num_elements, cmp_bytes, cmpsize, owner,dtype) +# oriData, absErrBound, nbEle, blockSize,threshold + decompressed_data = quant_device_decompress(num_elements, cmp_bytes, owner,dtype) + return decompressed_data + +class NEWSZCompressor(Compressor): + def __init__(self, r2r_error=1e-3, r2r_threshold=1e-3): + self.r2r_error = r2r_error + self.r2r_threshold = r2r_threshold + self.decompressed_own = [] + + def free_decompressed(self): + import cupy + print("Cleanup", len(self.decompressed_own)) + for x in self.decompressed_own: + #print(x) + #if x == None: + # continue + #else: + #print("CUDA Free", x) + cupy.cuda.runtime.free(x) + # del x + # cupy.get_default_memory_pool().free_all_blocks() + # cupy.get_default_pinned_memory_pool().free_all_blocks() + # torch.cuda.empty_cache() + self.decompressed_own = [] + + def free_compressed(self, ptr): + import ctypes, cupy + cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = ptr + p_decompressed_ptr = ctypes.addressof(cmp_bytes[0]) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + decompressed_int = p_decompressed_int.contents + cupy.cuda.runtime.free(decompressed_int.value) + + def compress(self, data): + isCuPy, num_elements_eff = _get_data_info(data) + dtype = data.dtype + cmp_bytes, outSize_ptr = 
self.cuszx_compress(isCuPy, data, num_elements_eff, self.r2r_error, self.r2r_threshold) + # return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr) + + return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr.contents.value) + + def compress_size(self, ptr): + return ptr[5] + + def decompress(self, obj): + import cupy + import ctypes + cmp_bytes, num_elements_eff, isCuPy, shape, dtype, cmpsize = obj + decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, cmpsize, num_elements_eff, self, dtype) + arr_cp = decompressed_ptr[0] + self.decompressed_own.append(decompressed_ptr[1]) + + # -- Workaround to convert GPU pointer to int + # p_decompressed_ptr = ctypes.addressof(decompressed_ptr) + # # cast to int64 pointer + # # (effectively converting pointer to pointer to addr to pointer to int64) + # p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + # decompressed_int = p_decompressed_int.contents + # # -- + # self.decompressed_own.append(decompressed_int.value) + # mem = cupy.cuda.UnownedMemory(decompressed_int.value, num_elements_eff, self, device_id=0) + # mem_ptr = cupy.cuda.memory.MemoryPointer(mem, 0) + arr = cupy.reshape(arr_cp, shape) + # self.decompressed_own.append(arr) + # arr = cupy.ndarray(shape, dtype=dtype, memptr=mem_ptr) + return arr + + ### Compression API with cuSZx ### + # Parameters: + # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array + # - data = Numpy or Cupy ndarray, assumed to be 1-D, np.float32 type + # - num_elements = Number of floating point elements in data + # - r2r_error = relative-to-value-range error bound for lossy compression + # - r2r_threshold = relative-to-value-range threshold to floor values to zero + # Returns: + # - cmp_bytes = Unsigned char pointer to compressed bytes + # - outSize_ptr = Pointer to size_t representing length in bytes of cmp_bytes + def cuszx_compress(self, isCuPy, data, num_elements, r2r_error, r2r_threshold): + + if not isCuPy: + cmp_bytes, outSize_ptr = cuszx_host_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) + else: + print('Before compress') + cmp_bytes, outSize_ptr = newsz_device_compress(data, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) + print('After compress') + del data + torch.cuda.empty_cache() + return cmp_bytes, outSize_ptr + + ### Decompression API with cuSZx ### + # Parameters: + # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array + # - cmp_bytes = Unsigned char pointer to compressed bytes + # - num_elements = Number of floating point elements in original data + # Returns: + # - decompressed_data = Float32 pointer to decompressed data + # + # Notes: Use ctypes to cast decompressed data to Numpy or CuPy type + + def cuszx_decompress(self, isCuPy, cmp_bytes, cmpsize, num_elements, owner, dtype): + if not isCuPy: + decompressed_data = cuszx_host_decompress(num_elements, cmp_bytes) + else: + # cuszx_device_decompress(nbEle, cmpBytes, owner, dtype) + decompressed_data = newsz_device_decompress(num_elements, cmp_bytes, owner,dtype) +# oriData, absErrBound, nbEle, blockSize,threshold + # decompressed_data = quant_device_decompress(num_elements, cmp_bytes, owner,dtype) + return decompressed_data + +class CUSZXCompressor(Compressor): + def __init__(self, r2r_error=1e-3, r2r_threshold=1e-3): + self.r2r_error = r2r_error + self.r2r_threshold = r2r_threshold + self.decompressed_own = [] + + def free_decompressed(self): + import cupy + print("Cleanup", len(self.decompressed_own)) + 
for x in self.decompressed_own: + #print(x) + #if x == None: + # continue + #else: + #print("CUDA Free", x) + cupy.cuda.runtime.free(x) + # del x + # cupy.get_default_memory_pool().free_all_blocks() + # cupy.get_default_pinned_memory_pool().free_all_blocks() + # torch.cuda.empty_cache() + self.decompressed_own = [] + + def free_compressed(self, ptr): + import ctypes, cupy + cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = ptr + p_decompressed_ptr = ctypes.addressof(cmp_bytes[0]) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + decompressed_int = p_decompressed_int.contents + cupy.cuda.runtime.free(decompressed_int.value) + + def compress(self, data): + isCuPy, num_elements_eff = _get_data_info(data) + dtype = data.dtype + cmp_bytes, outSize_ptr = self.cuszx_compress(isCuPy, data, num_elements_eff, self.r2r_error, self.r2r_threshold) + # return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr) + + return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr.contents.value) + + def compress_size(self, ptr): + return ptr[5] + + def decompress(self, obj): + import cupy + import ctypes + cmp_bytes, num_elements_eff, isCuPy, shape, dtype, cmpsize = obj + decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, cmpsize, num_elements_eff, self, dtype) + arr_cp = decompressed_ptr[0] + self.decompressed_own.append(decompressed_ptr[1]) + + # -- Workaround to convert GPU pointer to int + # p_decompressed_ptr = ctypes.addressof(decompressed_ptr) + # # cast to int64 pointer + # # (effectively converting pointer to pointer to addr to pointer to int64) + # p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + # decompressed_int = p_decompressed_int.contents + # # -- + # self.decompressed_own.append(decompressed_int.value) + # mem = cupy.cuda.UnownedMemory(decompressed_int.value, num_elements_eff, self, device_id=0) + # mem_ptr = cupy.cuda.memory.MemoryPointer(mem, 0) + arr = cupy.reshape(arr_cp, shape) + # self.decompressed_own.append(arr) + # arr = cupy.ndarray(shape, dtype=dtype, memptr=mem_ptr) + return arr + + def cuszx_compress(self, isCuPy, data, num_elements, r2r_error, r2r_threshold): + """ + ## Compression API with cuSZx ### + Parameters: + - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array + - data = Numpy or Cupy ndarray, assumed to be 1-D, np.float32 type + - num_elements = Number of floating point elements in data + - r2r_error = relative-to-value-range error bound for lossy compression + - r2r_threshold = relative-to-value-range threshold to floor values to zero + Returns: + - cmp_bytes = Unsigned char pointer to compressed bytes + - outSize_ptr = Pointer to size_t representing length in bytes of cmp_bytes + """ + + if not isCuPy: + cmp_bytes, outSize_ptr = cuszx_host_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) + else: + #cmp_bytes, outSize_ptr = cuszp_device_compress(data, r2r_error, num_elements, r2r_threshold) + cmp_bytes, outSize_ptr = cuszx_device_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE,r2r_threshold) + # cmp_bytes, outSize_ptr = quant_device_compress(data, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) + del data + torch.cuda.empty_cache() + return cmp_bytes, outSize_ptr + + + def cuszx_decompress(self, isCuPy, cmp_bytes, cmpsize, num_elements, owner, dtype): + """ + ## Decompression API with cuSZx ### + 
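At the class level these wrappers compose into a simple round trip: compress() returns an opaque tuple whose last element is the compressed size in bytes, compress_size() reads that size back, decompress() rebuilds the CuPy array, and the free_* methods release the GPU buffers. A rough usage sketch (assuming CuPy, torch and the cuSZx bindings are importable; the array x is purely illustrative):

    import cupy
    comp = CUSZXCompressor(r2r_error=1e-3, r2r_threshold=1e-3)
    x = cupy.random.rand(2**20).astype(cupy.float32)  # 1-D float32, as the API expects
    obj = comp.compress(x)                  # opaque tuple describing the compressed buffer
    print("compressed bytes:", comp.compress_size(obj))
    y = comp.decompress(obj)                # CuPy array restored to x's shape
    comp.free_decompressed()                # release buffers recorded by decompress()
    comp.free_compressed(obj)               # release the compressed buffer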
Parameters: + - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array + - cmp_bytes = Unsigned char pointer to compressed bytes + - num_elements = Number of floating point elements in original data + Returns: + - decompressed_data = Float32 pointer to decompressed data + + Notes: Use ctypes to cast decompressed data to Numpy or CuPy type + """ + if not isCuPy: + decompressed_data = cuszx_host_decompress(num_elements, cmp_bytes) + else: + # cuszx_device_decompress(nbEle, cmpBytes, owner, dtype) + decompressed_data = cuszx_device_decompress(num_elements, cmp_bytes, owner,dtype) +# oriData, absErrBound, nbEle, blockSize,threshold + # decompressed_data = quant_device_decompress(num_elements, cmp_bytes, owner,dtype) + return decompressed_data + +class CUSZCompressor(Compressor): + def __init__(self, r2r_error=1e-3, r2r_threshold=1e-3): + self.r2r_error = r2r_error + self.r2r_threshold = r2r_threshold + self.decompressed_own = [] + + def free_decompressed(self): + import cupy + print("Cleanup", len(self.decompressed_own)) + for x in self.decompressed_own: + cupy.cuda.runtime.free(x) + # del x + # cupy.get_default_memory_pool().free_all_blocks() + # cupy.get_default_pinned_memory_pool().free_all_blocks() + # torch.cuda.empty_cache() + self.decompressed_own = [] + + def free_compressed(self, ptr): + import ctypes, cupy + cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = ptr + p_decompressed_ptr = ctypes.addressof(cmp_bytes[0]) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + decompressed_int = p_decompressed_int.contents + cupy.cuda.runtime.free(decompressed_int.value) + + def compress(self, data): + isCuPy, num_elements_eff = _get_data_info(data) + + dtype = data.dtype + cmp_bytes, outSize_ptr = self.cuszx_compress(isCuPy, data, num_elements_eff, self.r2r_error, self.r2r_threshold) + return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr.contents.value) + + def compress_size(self, ptr): + return ptr[5] + + def decompress(self, obj): + import cupy + import ctypes + cmp_bytes, num_elements_eff, isCuPy, shape, dtype, cmpsize = obj + decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, cmpsize, num_elements_eff, self, dtype) + arr_cp = decompressed_ptr[0] + #self.decompressed_own.append(decompressed_ptr[1]) + + # -- Workaround to convert GPU pointer to int + # p_decompressed_ptr = ctypes.addressof(decompressed_ptr) + # # cast to int64 pointer + # # (effectively converting pointer to pointer to addr to pointer to int64) + # p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + # decompressed_int = p_decompressed_int.contents + # # -- + # self.decompressed_own.append(decompressed_int.value) + # mem = cupy.cuda.UnownedMemory(decompressed_int.value, num_elements_eff, self, device_id=0) + # mem_ptr = cupy.cuda.memory.MemoryPointer(mem, 0) + arr = cupy.reshape(arr_cp, shape) + self.decompressed_own.append(arr) + # arr = cupy.ndarray(shape, dtype=dtype, memptr=mem_ptr) + return arr + + ### Compression API with cuSZx ### + # Parameters: + # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array + # - data = Numpy or Cupy ndarray, assumed to be 1-D, np.float32 type + # - num_elements = Number of floating point elements in data + # - r2r_error = relative-to-value-range error bound for lossy compression + # - r2r_threshold = relative-to-value-range threshold to floor 
values to zero + # Returns: + # - cmp_bytes = Unsigned char pointer to compressed bytes + # - outSize_ptr = Pointer to size_t representing length in bytes of cmp_bytes + def cuszx_compress(self, isCuPy, data, num_elements, r2r_error, r2r_threshold): + + if not isCuPy: + cmp_bytes, outSize_ptr = cuszx_host_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) + else: + cmp_bytes, outSize_ptr = cusz_device_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE,r2r_threshold) + # cmp_bytes, outSize_ptr = quant_device_compress(data, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) + del data + torch.cuda.empty_cache() + return cmp_bytes, outSize_ptr + + ### Decompression API with cuSZx ### + # Parameters: + # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array + # - cmp_bytes = Unsigned char pointer to compressed bytes + # - num_elements = Number of floating point elements in original data + # Returns: + # - decompressed_data = Float32 pointer to decompressed data + # + # Notes: Use ctypes to cast decompressed data to Numpy or CuPy type + + def cuszx_decompress(self, isCuPy, cmp_bytes, cmpsize, num_elements, owner, dtype): + if not isCuPy: + decompressed_data = cuszx_host_decompress(num_elements, cmp_bytes) + else: + # cuszx_device_decompress(nbEle, cmpBytes, owner, dtype) + decompressed_data = cusz_device_decompress(num_elements, cmp_bytes, owner,dtype) +# oriData, absErrBound, nbEle, blockSize,threshold + # decompressed_data = quant_device_decompress(num_elements, cmp_bytes, owner,dtype) + return decompressed_data + +class WriteToDiskCompressor(Compressor): + def __init__(self, path): + from pathlib import Path + Path(path).mkdir(exist_ok=True, parents=True) + self.path = path + + def _gen_random_filename(self, info): + dtype, shape, isCupy = info + k = np.random.randint(0, 100000000) + s = hex(k)[2:] + return self.path + f'/qtensor_data_{s}_{str(dtype)}.bin' + + def compress(self, data): + import cupy + if isinstance(data, cupy.ndarray): + isCupy=False + else: + isCupy=True + fname = self._gen_random_filename((data.dtype, data.shape, isCupy)) + data.tofile(fname) + return (fname, data.dtype, data.shape, isCupy) + + def compress_size(self, ptr): + return 0.1 + + def decompress(self, obj): + import cupy + fname, dtype, shape, isCupy = obj + if isCupy: + return cupy.fromfile(fname).view(dtype).reshape(shape) + else: + return np.fromfile(fname).view(dtype).reshape(shape) + + def free_compressed(self, ptr): + pass + def free_decompressed(self): + pass diff --git a/qtensor/compression/newsz/nvcomp b/qtensor/compression/newsz/nvcomp new file mode 160000 index 00000000..a6e4e64a --- /dev/null +++ b/qtensor/compression/newsz/nvcomp @@ -0,0 +1 @@ +Subproject commit a6e4e64a177e07cd2e5c8c5e07bb66ffefceae84 diff --git a/qtensor/compression/szx/cuda-samples b/qtensor/compression/szx/cuda-samples new file mode 160000 index 00000000..e4789153 --- /dev/null +++ b/qtensor/compression/szx/cuda-samples @@ -0,0 +1 @@ +Subproject commit e4789153d539b2d2f3976050057a52a1518abcf0 diff --git a/qtensor/compression/szx/src/cuszx_entry.cu b/qtensor/compression/szx/src/cuszx_entry.cu index 4720bc1d..eec05606 100644 --- a/qtensor/compression/szx/src/cuszx_entry.cu +++ b/qtensor/compression/szx/src/cuszx_entry.cu @@ -1,1960 +1,1960 @@ -#include "cuszx_entry.h" -#include "szx_defines.h" -#include "szx_BytesToolkit.h" -#include "szx_TypeManager.h" -#include "timingGPU.h" -#include "szx.h" -#include -#include -#include -#include -#include -#include -#include - -#define SPARSITY_LEVEL 
0.25 -#define BLOCKS 40 -#define THREADS_PER_BLOCK 256 - -TimingGPU timer_GPU; -void bin(unsigned n) -{ - unsigned i; - for (i = 1 << 31; i > 0; i = i / 2) - (n & i) ? printf("1") : printf("0"); -} - -__host__ __device__ size_t convert_state_to_out(unsigned char* meta, size_t length, unsigned char *result){ - size_t out_length; - - if(length%4==0) - out_length = length/4; - else - out_length = length/4+1; - - for (size_t i = 0; i < out_length; i++) - { - uint8_t tmp = 0; - - for (size_t j = 0; j < 4; j++) - { - if (i*4 + j < length) - { - tmp |= (0x03 & meta[i*4+j]) << 2*j; - } - - } - result[i] = tmp; - } - return out_length; -} - -__global__ void convert_state_to_out_kernel(unsigned char* meta, size_t length, unsigned char *result, size_t out_length){ - - - for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < out_length; i += blockDim.x*gridDim.x){ - uint8_t tmp = 0; - - for (size_t j = 0; j < 4; j++) - { - if (i*4 + j < length) - { - tmp |= (0x03 & meta[i*4+j]) << 2*j; - } - - } - result[i] = tmp; - } -} - -__global__ void convert_out_to_state_kernel(size_t nbBlocks, unsigned char* cmp, unsigned char* out_state, size_t state_length, int *num_state2blks, int *ncBlocks){ - for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < state_length; i += blockDim.x*gridDim.x){ - for (size_t j = 0; j < 4; j++) - { - if (4*i + j < nbBlocks) - { - out_state[4*i + j]= (cmp[i] >> 2*j) & 0x03; - if (out_state[4*i+j] == 2) - { - atomicAdd(num_state2blks, 1); - }else if(out_state[4*i+j]==3){ - atomicAdd(ncBlocks, 1); - } - - } - - } - } -} - -// nbBlocks, r, stateNBBytes, stateArray -__host__ __device__ size_t convert_out_to_state(size_t nbBlocks, unsigned char* cmp, unsigned char* out_state){ - size_t state_length; - if(nbBlocks%4==0) - state_length = nbBlocks/4; - else - state_length = nbBlocks/4+1; - - for (size_t i = 0; i < state_length; i++) - { - for (size_t j = 0; j < 4; j++) - { - if (4*i + j < nbBlocks) - { - out_state[4*i + j]= (cmp[i] >> 2*j) & 0x03; - } - - } - } - return nbBlocks; -} - -__host__ __device__ size_t convert_block2_to_out(unsigned char *result, uint32_t numBlocks, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig){ - size_t out_length = 0; - - memcpy(result, blk_idx, numBlocks*sizeof(uint32_t)); - out_length += numBlocks*4; - memcpy(result+out_length, blk_vals, num_sig*sizeof(float)); - out_length += num_sig*sizeof(float); - memcpy(result+out_length, blk_subidx, num_sig*sizeof(uint8_t)); - out_length += num_sig*sizeof(uint8_t); - memcpy(result+out_length, blk_sig, numBlocks*sizeof(uint8_t)); - out_length+= numBlocks*sizeof(uint8_t); - - return out_length; -} - -__global__ void convert_block2_to_out_kernel(unsigned char *result, uint32_t numBlocks, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig){ - - size_t out_length = 0; - unsigned char *tmp_result = result; - for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < numBlocks; i += blockDim.x*gridDim.x){ - uint32_t local_blkidx = blk_idx[i]; - tmp_result[4*i] = (local_blkidx) & 0xff; - tmp_result[4*i+1] = (local_blkidx >> (8*1)) & 0xff; - tmp_result[4*i+2] = (local_blkidx >> (8*2)) & 0xff; - tmp_result[4*i+3] = (local_blkidx >> (8*3)) & 0xff; - } - // memcpy(result, blk_idx, numBlocks*sizeof(uint32_t)); - out_length += numBlocks*4; - tmp_result = result+out_length; - - for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < num_sig; i += blockDim.x*gridDim.x){ - float value = blk_vals[i]; - memcpy(&tmp_result[4*i], &value, sizeof(float)); - 
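// copy the raw float32 bytes of each significant value straight into the output stream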
//unsigned char *v = () - //tmp_result[(int)4*i] = (unsigned char)((value) & 0xff); - //tmp_result[(int)4*i+1] = (unsigned char)((value >> (8*1)) & 0xff); - //tmp_result[(int)4*i+2] = (unsigned char)((value >> (8*2)) & 0xff); - //tmp_result[(int)4*i+3] = (unsigned char)((value >> (8*3)) & 0xff); - } - // memcpy(result+out_length, blk_vals, num_sig*sizeof(float)); - out_length += num_sig*sizeof(float); - tmp_result = result+out_length; - - for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < num_sig; i += blockDim.x*gridDim.x){ - tmp_result[i] = blk_subidx[i]; - - } - - out_length += num_sig*sizeof(uint8_t); - tmp_result = result+out_length; - - for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < numBlocks; i += blockDim.x*gridDim.x){ - tmp_result[i] = blk_sig[i]; - - } - out_length+= numBlocks*sizeof(uint8_t); - - // return out_length; -} - -__global__ void convert_out_to_block2_kernel(unsigned char *in_cmp, uint32_t numBlocks, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig){ - size_t out_length = 0; - - unsigned char *tmp_result = in_cmp; - for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < numBlocks; i += blockDim.x*gridDim.x){ - - uint32_t local_blkidx = (tmp_result[4*i] & 0xff) | ((tmp_result[4*i+1] & 0xff) << (8*1)) - | ((tmp_result[4*i+2] & 0xff) << (8*2)) | ((tmp_result[4*i+3] & 0xff) << (8*3)); - blk_idx[i] = local_blkidx; - } - // memcpy(result, blk_idx, numBlocks*sizeof(uint32_t)); - out_length += numBlocks*4; - tmp_result = in_cmp+out_length; - - for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < num_sig; i += blockDim.x*gridDim.x){ - float value = 0.0; - memcpy(&value, &tmp_result[4*i], sizeof(float)); - blk_vals[i] = value; - - //unsigned char *v = () - //tmp_result[(int)4*i] = (unsigned char)((value) & 0xff); - //tmp_result[(int)4*i+1] = (unsigned char)((value >> (8*1)) & 0xff); - //tmp_result[(int)4*i+2] = (unsigned char)((value >> (8*2)) & 0xff); - //tmp_result[(int)4*i+3] = (unsigned char)((value >> (8*3)) & 0xff); - } - // memcpy(result+out_length, blk_vals, num_sig*sizeof(float)); - out_length += num_sig*sizeof(float); - tmp_result = in_cmp+out_length; - - for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < num_sig; i += blockDim.x*gridDim.x){ - blk_subidx[i] = tmp_result[i]; - - } - - out_length += num_sig*sizeof(uint8_t); - tmp_result = in_cmp+out_length; - - for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < numBlocks; i += blockDim.x*gridDim.x){ - blk_sig[i] = tmp_result[i]; - - } - out_length+= numBlocks*sizeof(uint8_t); -} - -__host__ __device__ size_t convert_out_to_block2(unsigned char *in_cmp, uint32_t numBlocks, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig){ - size_t out_length = 0; - memcpy(blk_idx, in_cmp, numBlocks*sizeof(uint32_t)); - out_length += numBlocks*4; - memcpy(blk_vals, in_cmp+out_length,num_sig*sizeof(float)); - out_length += num_sig*sizeof(float); - memcpy(blk_subidx, in_cmp+out_length, num_sig*sizeof(uint8_t)); - out_length += num_sig*sizeof(uint8_t); - memcpy(blk_sig, in_cmp+out_length, numBlocks*sizeof(uint8_t)); - out_length += numBlocks*sizeof(uint8_t); -// printf("outlength: %d\n",out_length); - return out_length; -} - -int _post_proc(float *oriData, unsigned char *meta, short *offsets, unsigned char *midBytes, unsigned char *outBytes, size_t nbEle, int blockSize, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig) -{ - int out_size = 0; - - size_t nbConstantBlocks = 0; - size_t nbBlocks = 
nbEle/blockSize; - size_t ncBytes = blockSize/4; - size_t mSize = sizeof(float)+1+ncBytes; //Number of bytes for each data block's metadata. - out_size += 5+sizeof(size_t)+sizeof(float)*nbBlocks; - if (nbBlocks%8==0) - out_size += nbBlocks/8; - else - out_size += nbBlocks/8+1; - int s0 = 0; - int s1 = 0; - int s2 = 0; - int s3 = 0; - for (int i=0; i>>(d_oriData, threshold, nbEle); - // cudaDeviceSynchronize(); - dim3 dimBlock(32, blockSize/32); - dim3 dimGrid(65536, 1); - const int sMemsize = blockSize * sizeof(float) + dimBlock.y * sizeof(int); - compress_float<<>>(d_oriData, d_meta, d_offsets, d_midBytes, absErrBound, blockSize, nbBlocks, mSize, sparsity_level, d_blk_idx, d_blk_subidx,d_blk_vals, threshold, d_blk_sig); - cudaError_t err = cudaGetLastError(); // Get error code - printf("CUDA Error: %s\n", cudaGetErrorString(err)); - printf("GPU compression timing: %f ms\n", timer_GPU.GetCounter()); - cudaDeviceSynchronize(); - get_numsig<<<1,1>>>(d_num_sig); - cudaDeviceSynchronize(); - - checkCudaErrors(cudaMemcpy(num_sig, d_num_sig, sizeof(uint64_t), cudaMemcpyDeviceToHost)); - - blk_idx = (uint32_t *)malloc(nbBlocks*sizeof(uint32_t)); - blk_vals= (float *)malloc((*num_sig)*sizeof(float)); - blk_subidx = (uint8_t *)malloc((*num_sig)*sizeof(uint8_t)); - blk_sig = (uint8_t *)malloc(nbBlocks*sizeof(uint8_t)); - - checkCudaErrors(cudaMemcpy(meta, d_meta, msz, cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(offsets, d_offsets, nbBlocks*sizeof(short), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(midBytes, d_midBytes, mbsz, cudaMemcpyDeviceToHost)); - - - checkCudaErrors(cudaMemcpy(blk_idx, d_blk_idx, nbBlocks*sizeof(uint32_t), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(blk_vals,d_blk_vals, (*num_sig)*sizeof(float), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(blk_subidx,d_blk_subidx, (*num_sig)*sizeof(uint8_t), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(blk_sig,d_blk_sig, (nbBlocks)*sizeof(uint8_t), cudaMemcpyDeviceToHost)); - - size_t maxPreservedBufferSize = sizeof(float)*nbEle; - unsigned char* outBytes = (unsigned char*)malloc(maxPreservedBufferSize); - memset(outBytes, 0, maxPreservedBufferSize); - - outSize = (size_t *)malloc(sizeof(size_t)); - //outSize[0] = _post_proc(oriData, meta, offsets, midBytes, outBytes, nbEle, blockSize, *num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); - - *outSize = _post_proc(oriData, meta, offsets, midBytes, outBytes, nbEle, blockSize, *num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); -// printf("Beginning free\n"); - // printf("outsize %p \n", outBytes); - free(blk_idx); - free(blk_subidx); - free(blk_vals); - free(meta); - free(offsets); - free(midBytes); - checkCudaErrors(cudaFree(d_meta)); - checkCudaErrors(cudaFree(d_offsets)); - checkCudaErrors(cudaFree(d_midBytes)); - return outBytes; -} - -void cuSZx_fast_decompress_args_unpredictable_blocked_float(float** newData, size_t nbEle, unsigned char* cmpBytes) -{ - uint32_t *blk_idx, *d_blk_idx; - uint8_t *blk_subidx, *d_blk_subidx; - uint8_t *blk_sig, *d_blk_sig; - float *blk_vals, *d_blk_vals; - size_t num_sig, *d_num_sig; - - *newData = (float*)malloc(sizeof(float)*nbEle); - memset(*newData, 0, sizeof(float)*nbEle); - - unsigned char* r = cmpBytes; - r += 4; - int blockSize = r[0]; //get block size - if(blockSize == 0)blockSize = 256; - r++; - size_t nbConstantBlocks = bytesToLong_bigEndian(r); //get number of constant blocks - r += sizeof(size_t); - num_sig = bytesToSize(r); - r += sizeof(size_t); - size_t nbBlocks = nbEle/blockSize; - size_t 
ncBlocks = 0; - size_t num_state2_blks = 0; - // size_t ncBlocks = nbBlocks - nbConstantBlocks; //get number of constant blocks - size_t stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; - size_t ncLeading = blockSize/4; - size_t mSize = sizeof(float)+1+ncLeading; //Number of bytes for each data block's metadata. - unsigned char* stateArray = (unsigned char*)malloc(nbBlocks); - unsigned char* d_stateArray; - cudaMalloc(&d_stateArray, nbBlocks); - float* constantMedianArray = (float*)malloc(nbConstantBlocks*sizeof(float)); - - - - blk_idx = (uint32_t *)malloc(nbBlocks*sizeof(uint32_t)); - blk_vals= (float *)malloc((num_sig)*sizeof(float)); - blk_subidx = (uint8_t *)malloc((num_sig)*sizeof(uint8_t)); - blk_sig = (uint8_t *)malloc(nbBlocks*sizeof(uint8_t)); - - // printf("Converting state array\n"); - convert_out_to_state(nbBlocks, r, stateArray); - // convertByteArray2IntArray_fast_1b_args(nbBlocks, r, stateNBBytes, stateArray); //get the stateArray - for (size_t i = 0; i < nbBlocks; i++) - { - if (stateArray[i] == 2) - { - num_state2_blks++; - }else if(stateArray[i] == 3){ - ncBlocks++; - } - } - - r += stateNBBytes; - unsigned char* data = (unsigned char*)malloc(ncBlocks*blockSize*sizeof(float)); - memset(data, 0, ncBlocks*blockSize*sizeof(float)); - // printf("converting block vals\n"); - size_t to_add = convert_out_to_block2(r, nbBlocks, (uint64_t)num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); - r+= to_add; - // checkCudaErrors(cudaMalloc((void **)&d_num_sig, sizeof(uint64_t))); - // num_sig = (uint64_t *)malloc(sizeof(uint64_t)); - checkCudaErrors(cudaMalloc((void **)&d_blk_idx, nbBlocks*sizeof(uint32_t))); - // blk_idx = malloc() - checkCudaErrors(cudaMalloc((void **)&d_blk_subidx, num_sig*sizeof(uint8_t))); - - checkCudaErrors(cudaMalloc((void **)&d_blk_vals, num_sig*sizeof(float))); - - checkCudaErrors(cudaMalloc((void **)&d_blk_sig, nbBlocks*sizeof(uint8_t))); - - checkCudaErrors(cudaMemcpy(d_blk_idx, blk_idx, nbBlocks*sizeof(uint32_t), cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_blk_vals, blk_vals, (num_sig)*sizeof(float), cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_blk_subidx, blk_subidx, (num_sig)*sizeof(uint8_t), cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_stateArray, stateArray, nbBlocks, cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_blk_sig, blk_sig, nbBlocks*sizeof(uint8_t), cudaMemcpyHostToDevice)); - - - size_t i = 0, j = 0, k = 0; //k is used to keep track of constant block index - memcpy((*newData)+nbBlocks*blockSize, r, (nbEle%blockSize)*sizeof(float)); - r += (nbEle%blockSize)*sizeof(float); - float* fr = (float*)r; //fr is the starting address of constant median values. 
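// remaining stream layout: nbConstantBlocks float32 medians, then ncBlocks int16 block lengths, then the variable-length non-constant block payloads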
- for(i = 0;i < nbConstantBlocks;i++, j+=4) //get the median values for constant-value blocks - constantMedianArray[i] = fr[i]; - r += nbConstantBlocks*sizeof(float); - unsigned char* p = r + ncBlocks * sizeof(short); - for(i = 0;i < ncBlocks;i++){ - int leng = (int)bytesToShort(r)+mSize; - r += sizeof(short); - if (leng > blockSize*sizeof(float)) - { - printf("Warning: compressed block is larger than the original block!\n"); - exit(0); - } - memcpy(data+i*blockSize*sizeof(float), p, leng); - p += leng; - } - - unsigned char* d_data; - float *d_newdata; - checkCudaErrors(cudaMalloc((void**)&d_data, ncBlocks*blockSize*sizeof(float))); - checkCudaErrors(cudaMemcpy(d_data, data, ncBlocks*blockSize*sizeof(float), cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMalloc(&d_newdata, nbBlocks*blockSize*sizeof(float))); - - timer_GPU.StartCounter(); - dim3 dimBlock(32, blockSize/32); - dim3 dimGrid(65536, 1); - const int sMemsize = blockSize * sizeof(float) + dimBlock.y * sizeof(int); - decompress_state2<<>>(d_newdata, d_stateArray,d_blk_idx, d_blk_vals, d_blk_subidx,blockSize, d_blk_sig); - decompress_float<<>>(d_data, blockSize, ncBlocks, mSize); - cudaError_t err = cudaGetLastError(); // Get error code - printf("CUDA Error: %s\n", cudaGetErrorString(err)); - printf("GPU decompression timing: %f ms\n", timer_GPU.GetCounter()); - cudaDeviceSynchronize(); - checkCudaErrors(cudaMemcpy(data, d_data, ncBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(*newData, d_newdata, nbBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); - float* fdata = (float*)data; - - int nb=0, nc=0; - for (i=0;i1) printf("data%i:%f\n",i, Median); - for (j=0;j>56); - b[1] = (unsigned char)(num>>48); - b[2] = (unsigned char)(num>>40); - b[3] = (unsigned char)(num>>32); - b[4] = (unsigned char)(num>>24); - b[5] = (unsigned char)(num>>16); - b[6] = (unsigned char)(num>>8); - b[7] = (unsigned char)(num); -// if(dataEndianType==LITTLE_ENDIAN_DATA) -// symTransform_8bytes(*b); -} - -inline void longToBytes_bigEndian_memset(unsigned char *b, unsigned long num) -{ - checkCudaErrors(cudaMemset(&b[0], (unsigned char)(num>>56), sizeof(char))); - checkCudaErrors(cudaMemset(&b[1], (unsigned char)(num>>48), sizeof(char))); - checkCudaErrors(cudaMemset(&b[2], (unsigned char)(num>>40), sizeof(char))); - checkCudaErrors(cudaMemset(&b[3], (unsigned char)(num>>32), sizeof(char))); - checkCudaErrors(cudaMemset(&b[4], (unsigned char)(num>>24), sizeof(char))); - checkCudaErrors(cudaMemset(&b[5], (unsigned char)(num>>16), sizeof(char))); - checkCudaErrors(cudaMemset(&b[6], (unsigned char)(num>>8), sizeof(char))); - checkCudaErrors(cudaMemset(&b[7], (unsigned char)(num), sizeof(char))); -// if(dataEndianType==LITTLE_ENDIAN_DATA) -// symTransform_8bytes(*b); -} - -__device__ inline void shortToBytes_d(unsigned char* b, short value) -{ - lint16 buf; - buf.svalue = value; - memcpy(b, buf.byte, 2); -} - - - -__global__ void getNumNonConstantBlocks(size_t nbBlocks, short *offsets, unsigned char *meta, int blockSize, int *nonconstant, int *out_size){ - for (int tid = blockDim.x*blockIdx.x + threadIdx.x; tid < nbBlocks; tid += blockDim.x*gridDim.x){ - if (meta[tid] == 3){ - atomicAdd(nonconstant, 1); - atomicAdd(out_size,1+(blockSize/4)+offsets[tid]); - } - } -} - -__global__ void generateFlags(unsigned char *states, uint64_t *cBlk_flags, uint64_t *ncBlk_flags,uint64_t* offset_indices,short* offsets, size_t nbBlocks){ - for (int tid = blockDim.x*blockIdx.x + threadIdx.x; tid < nbBlocks; tid += blockDim.x*gridDim.x){ 
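// classify each block: 0/1 -> constant, 3 -> non-constant (offsets[tid] payload bytes); exclusive scans over these flag arrays later give per-block output offsets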
- if (states[tid] == 0 || states[tid] == 1) - { - cBlk_flags[tid] = 1; - ncBlk_flags[tid] = 0; - offset_indices[tid] = 0; - }else if(states[tid]==3){ - ncBlk_flags[tid] = 1; - cBlk_flags[tid] = 0; - offset_indices[tid] = (uint64_t) offsets[tid]; - }else{ - cBlk_flags[tid] = 0; - ncBlk_flags[tid] = 0; - offset_indices[tid] = 0; - } - - } -} - -__global__ void nccopy_kernel2(unsigned char * c, unsigned char* o, unsigned char *nc, unsigned char* midBytes, unsigned char* meta, - size_t nbBlocks, int blockSize, short *offsets, size_t mSize, uint64_t *cBlk_indices, uint64_t *ncBlk_indices, uint64_t* offset_indices){ - // printf("blockdim %d blockidx %d threadidx %d griddim %d\n", blockDim.x, blockIdx.x, threadIdx.x, gridDim.x); - int i; - int num_threads = (blockDim.x*gridDim.x); - int tid = blockDim.x*blockIdx.x + threadIdx.x; - int blocks_per_thread = nbBlocks/num_threads; - int start_idx = tid*blocks_per_thread; - int end_idx = start_idx+blocks_per_thread; - - if (tid == num_threads-1) - { - end_idx = nbBlocks; - } - - unsigned char* tmp_o = o+(sizeof(short)*ncBlk_indices[start_idx]); - unsigned char* tmp_nc= nc+(mSize*ncBlk_indices[i] + offset_indices[i]*ncBlk_indices[i]); - for (i=start_idx; i>>(meta, cBlk_indices, ncBlk_indices, offset_indices, offsets, nbBlocks); - cudaDeviceSynchronize(); - - thrust::exclusive_scan(thrust::device, cBlk_indices, cBlk_indices + nbBlocks, cBlk_indices, 0); - thrust::exclusive_scan(thrust::device, ncBlk_indices, ncBlk_indices + nbBlocks, ncBlk_indices, 0); - thrust::exclusive_scan(thrust::device, offset_indices, offset_indices + nbBlocks, offset_indices, 0); - - nccopy_kernel<<>>(c, o, nc, midBytes, meta, nbBlocks, blockSize, offsets, mSize, cBlk_indices,ncBlk_indices,offset_indices,final_nc); - // nccopy_kernel2<<<1,1>>>(c, o, nc, midBytes, meta, nbBlocks, blockSize, offsets, mSize, cBlk_indices,ncBlk_indices,offset_indices); - - cudaDeviceSynchronize(); - - //printf("nc: %p\n", nc); - // printf("%s\n", cudaGetErrorString(cudaGetLastError())); - // set_nc<<<1,1>>>(nc, offsets, offset_indices, ncBlk_indices, mSize, nbBlocks); - // cudaDeviceSynchronize(); - // printf("ncblockcpy: %f ms\n", timer2.GetCounter()); - checkCudaErrors(cudaFree(cBlk_indices)); - checkCudaErrors(cudaFree(ncBlk_indices)); - checkCudaErrors(cudaFree(offset_indices)); -} - -void ncblkCopy_h(unsigned char * c, unsigned char* o, unsigned char *nc, unsigned char* midBytes, unsigned char* meta, - size_t nbBlocks, int blockSize, short *offsets, size_t mSize){ - unsigned char *tmp_states; - unsigned char *ncold = nc; - uint64_t col_off = 0; - short *tmp_offsets; - tmp_offsets = (short*)malloc(sizeof(short)*nbBlocks); - tmp_states = (unsigned char *)malloc(sizeof(char)*nbBlocks); - checkCudaErrors(cudaMemcpy(tmp_states, meta, sizeof(char)*nbBlocks, cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(tmp_offsets,offsets,sizeof(short)*nbBlocks,cudaMemcpyDeviceToHost)); - cudaStream_t stream[3]; - cudaStreamCreate(&stream[0]); - cudaStreamCreate(&stream[1]); - cudaStreamCreate(&stream[2]); - - //printf("here\n"); - //checkCudaErrors(cudaMemcpy((void**)&d_offsets, nbBlocks*sizeof(short))); - for (int i = 0; i < nbBlocks; i++) - { - if(tmp_states[i]==3){ - // shortToBytes_d(o, offsets[i]); - // buf = (unsigned char*) - -// printf("here2\n"); - cudaMemcpyAsync(o, offsets+i, 2, cudaMemcpyDeviceToDevice, stream[0]); - o += sizeof(short); - - // printf("here2.1\n"); - // printf("offsets %ld\n", col_off); - cudaMemcpyAsync(nc, meta+(nbBlocks+i*mSize), mSize, cudaMemcpyDeviceToDevice, stream[1]); - 
// memcpy(nc, meta+(nbBlocks+i*mSize), mSize); - - nc += mSize; - - // printf("here2.2\n"); - //checkCudaErrors(cudaMemcpy(buf, offsets+i, sizeof(short), cudaMemcpyDeviceToHost)); - - // //printf("here2.3 %d\n", buf); - cudaMemcpyAsync(nc, midBytes+(i*blockSize*sizeof(float)), (int)tmp_offsets[i], cudaMemcpyDeviceToDevice, stream[2]); - // memcpy(nc, midBytes+(i*blockSize*sizeof(float)), offsets[i]); - nc += tmp_offsets[i]; - col_off+=tmp_offsets[i]; - -/// printf("here2.4\n"); - } - } - cudaStreamDestroy(stream[0]); - cudaStreamDestroy(stream[1]); - cudaStreamDestroy(stream[2]); - - free(tmp_states); - free(tmp_offsets); -} - -__global__ void ncblkCopy(unsigned char * c, unsigned char* o, unsigned char *nc, unsigned char* midBytes, unsigned char* meta, - size_t nbBlocks, int blockSize, short *offsets, size_t mSize) -{ - for (int i=blockDim.x*blockIdx.x + threadIdx.x; i>>(nbBlocks, offsets, meta, blockSize, nonconstant_d, out_size_d); - cudaDeviceSynchronize(); - - checkCudaErrors(cudaMemcpy(&nonconstant_h, nonconstant_d, sizeof(int), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(&tmp_outsize, out_size_d, sizeof(int), cudaMemcpyDeviceToHost)); - - nbConstantBlocks = nbBlocks - nonconstant_h; - out_size_h+=tmp_outsize; - - out_size_h += (nbBlocks-nbConstantBlocks)*sizeof(short)+(nbEle%blockSize)*sizeof(float); - - //outBytes = (unsigned char*)malloc(out_size); - unsigned char* r = outBytes; - unsigned char* r_old = outBytes; - checkCudaErrors(cudaMemset(r, SZx_VER_MAJOR, sizeof(char))); - checkCudaErrors(cudaMemset(r+1, SZx_VER_MINOR, sizeof(char))); - checkCudaErrors(cudaMemset(r+2, 1, sizeof(char))); - checkCudaErrors(cudaMemset(r+3, 0, sizeof(char))); - checkCudaErrors(cudaMemset(r+4, blockSize, sizeof(char))); - - r=r+5; //1 byte - //sizeToBytes(r, nbConstantBlocks); - longToBytes_bigEndian_memset(r, nbConstantBlocks); - r += sizeof(size_t); - //sizeToBytes(r, (size_t) num_sig); - longToBytes_bigEndian_memset(r, (unsigned long)num_sig); - r += sizeof(size_t); - size_t out_length; - - if(nbBlocks%4==0) - out_length = nbBlocks/4; - else - out_length = nbBlocks/4+1; - - convert_state_to_out_kernel<<>>(meta, nbBlocks, r, out_length); - r+=out_length; - convert_block2_to_out_kernel<<>>(r, nbBlocks,num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); - r += nbBlocks*4 + num_sig*sizeof(float) + num_sig*sizeof(uint8_t) + nbBlocks*sizeof(uint8_t); - - checkCudaErrors(cudaMemcpy(r, oriData+nbBlocks*blockSize, (nbEle%blockSize)*sizeof(float), cudaMemcpyDeviceToDevice)); - // memcpy(r, oriData+nbBlocks*blockSize, (nbEle%blockSize)*sizeof(float)); - r += (nbEle%blockSize)*sizeof(float); - unsigned char* c = r; - unsigned char* o = c+nbConstantBlocks*sizeof(float); - unsigned char* nc = o+(nbBlocks-nbConstantBlocks)*sizeof(short); - // ncblkCopy<<<1,1>>>(c, o, nc, midBytes, meta,nbBlocks, blockSize, offsets, mSize); - - // ncblkCopy_h(c, o, nc, midBytes, meta,nbBlocks, blockSize, offsets, mSize); - ncblkCopy_fast(c, o, nc, midBytes, meta,nbBlocks, blockSize, offsets, mSize, nc_diff); - // cudaDeviceSynchronize(); - size_t h_nc_diff; - cudaMemcpy(&h_nc_diff,nc_diff, sizeof(size_t),cudaMemcpyDeviceToHost); - return (size_t) (nc+h_nc_diff-r_old); - // checkCudaErrors(cudaMemcpy(outSize, (size_t)(nc-r_old), sizeof(size_t))); - // *outSize = (size_t) (nc-r_old); - // return outBytes; -} - -__global__ void device_post_proc(size_t *outSize, float *oriData, unsigned char *meta, - short *offsets, unsigned char *midBytes, unsigned char *outBytes, - size_t nbEle, int blockSize, uint64_t num_sig, uint32_t 
*blk_idx, - float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig) -{ - int out_size = 0; - - size_t nbConstantBlocks = 0; - size_t nbBlocks = nbEle/blockSize; - size_t ncBytes = blockSize/4; - size_t mSize = sizeof(float)+1+ncBytes; //Number of bytes for each data block's metadata. - out_size += 5+sizeof(size_t)+sizeof(float)*nbBlocks; - if (nbBlocks%8==0) - out_size += nbBlocks/8; - else - out_size += nbBlocks/8+1; - int s0 = 0; - int s1 = 0; - int s2 = 0; - int s3 = 0; - for (int i=0; i()); -// // dmin = thrust::reduce(oriData, oriData+nbEle, 1, thrust::minimum()); -// cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, oriData, dmax, nbEle); -// cudaMalloc(&d_temp_storage, temp_storage_bytes); -// cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, oriData, dmax, nbEle); - -// cudaFree(d_temp_storage); -// cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, oriData, dmin, nbEle); -// cudaMalloc(&d_temp_storage, temp_storage_bytes); -// cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, oriData, dmin, nbEle); - -// cudaFree(d_temp_storage); -// // thrust::pair result = thrust::minmax_element(thrust::device, oriData,oriData+nbEle); -// //printf("here\n"); -// cudaMemcpy(hmin, dmin, sizeof(float), cudaMemcpyDeviceToHost); -// cudaMemcpy(hmax, dmax,sizeof(float), cudaMemcpyDeviceToHost); -// absErrBound = absErrBound*(hmax-hmin); -// threshold = threshold*(hmax-hmin); - // // printf("%f\n",absErrBound); - // cudaFree(dmin); - // cudaFree(dmax); - float sparsity_level = SPARSITY_LEVEL; - - // Set the input data as the function parameter, this should be a device pointer - - float* d_oriData = oriData; - // cudaMalloc((void**)&d_oriData, sizeof(float)*nbEle); - // cudaMemcpy(d_oriData, oriData, sizeof(float)*nbEle, cudaMemcpyHostToDevice); - - size_t nbBlocks = nbEle/blockSize; - size_t remainCount = nbEle%blockSize; - size_t actualNBBlocks = remainCount==0 ? nbBlocks : nbBlocks+1; - - size_t ncBytes = blockSize/4; - //ncBytes = (blockSize+1)%4==0 ? ncBytes : ncBytes+1; //Bytes to store one non-constant block data. - size_t mSize = sizeof(float)+1+ncBytes; //Number of bytes for each data block's metadata. 
- size_t msz = (1+mSize) * nbBlocks * sizeof(unsigned char); - size_t mbsz = sizeof(float) * nbEle * sizeof(unsigned char); - - // These are host pointers and do not need to be allocated - - // unsigned char *meta = (unsigned char*)malloc(msz); - // short *offsets = (short*)malloc(nbBlocks*sizeof(short)); - // unsigned char *midBytes = (unsigned char*)malloc(mbsz); - - unsigned char* d_meta; - unsigned char* d_midBytes; - short* d_offsets; - - uint32_t *blk_idx, *d_blk_idx; - uint8_t *blk_sig, *d_blk_sig; - uint8_t *blk_subidx, *d_blk_subidx; - float *blk_vals, *d_blk_vals; - uint64_t *num_sig, *d_num_sig; - - checkCudaErrors(cudaMalloc((void **)&d_num_sig, sizeof(uint64_t))); - num_sig = (uint64_t *)malloc(sizeof(uint64_t)); - checkCudaErrors(cudaMalloc((void **)&d_blk_idx, nbBlocks*sizeof(uint32_t))); - // blk_idx = malloc() - checkCudaErrors(cudaMalloc((void **)&d_blk_subidx, nbEle*sizeof(uint8_t))); - - checkCudaErrors(cudaMalloc((void **)&d_blk_vals, nbEle*sizeof(float))); - - checkCudaErrors(cudaMalloc((void **)&d_blk_sig, nbBlocks*sizeof(uint8_t))); - - checkCudaErrors(cudaMalloc((void**)&d_meta, msz)); - //checkCudaErrors(cudaMemcpy(d_meta, meta, msz, cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemset(d_meta, 0, msz)); - checkCudaErrors(cudaMalloc((void**)&d_offsets, nbBlocks*sizeof(short))); - checkCudaErrors(cudaMemset(d_offsets, 0, nbBlocks*sizeof(short))); - checkCudaErrors(cudaMalloc((void**)&d_midBytes, mbsz)); - checkCudaErrors(cudaMemset(d_midBytes, 0, mbsz)); - - - // apply_threshold<<<80,256>>>(d_oriData, threshold, nbEle); - // cudaDeviceSynchronize(); - dim3 dimBlock(32, blockSize/32); - dim3 dimGrid(65536, 1); - const int sMemsize = blockSize * sizeof(float) + dimBlock.y * sizeof(int); - //printf("Malloc end timestamp: %f ms\n", timer_GPU.GetCounter()); - compress_float<<>>(d_oriData, d_meta, d_offsets, d_midBytes, absErrBound, blockSize, nbBlocks, mSize, sparsity_level, d_blk_idx, d_blk_subidx,d_blk_vals, threshold, d_blk_sig); - cudaError_t err = cudaGetLastError(); // Get error code - // printf("CUDA Error: %s\n", cudaGetErrorString(err)); - //printf("GPU compression timestamp: %f ms\n", timer_GPU.GetCounter()); - cudaDeviceSynchronize(); - get_numsig<<<1,1>>>(d_num_sig); - cudaDeviceSynchronize(); - - checkCudaErrors(cudaMemcpy(num_sig, d_num_sig, sizeof(uint64_t), cudaMemcpyDeviceToHost)); - - // These are allocations and memcpys to host pointers, do not need them - - // blk_idx = (uint32_t *)malloc(nbBlocks*sizeof(uint32_t)); - // blk_vals= (float *)malloc((*num_sig)*sizeof(float)); - // blk_subidx = (uint8_t *)malloc((*num_sig)*sizeof(uint8_t)); - // blk_sig = (uint8_t *)malloc(nbBlocks*sizeof(uint8_t)); - - // checkCudaErrors(cudaMemcpy(meta, d_meta, msz, cudaMemcpyDeviceToHost)); - // checkCudaErrors(cudaMemcpy(offsets, d_offsets, nbBlocks*sizeof(short), cudaMemcpyDeviceToHost)); - // checkCudaErrors(cudaMemcpy(midBytes, d_midBytes, mbsz, cudaMemcpyDeviceToHost)); - - - // checkCudaErrors(cudaMemcpy(blk_idx, d_blk_idx, nbBlocks*sizeof(uint32_t), cudaMemcpyDeviceToHost)); - // checkCudaErrors(cudaMemcpy(blk_vals,d_blk_vals, (*num_sig)*sizeof(float), cudaMemcpyDeviceToHost)); - // checkCudaErrors(cudaMemcpy(blk_subidx,d_blk_subidx, (*num_sig)*sizeof(uint8_t), cudaMemcpyDeviceToHost)); - // checkCudaErrors(cudaMemcpy(blk_sig,d_blk_sig, (nbBlocks)*sizeof(uint8_t), cudaMemcpyDeviceToHost)); - - - size_t maxPreservedBufferSize = sizeof(float)*nbEle; - unsigned char *d_outBytes; - // unsigned char* outBytes = (unsigned char*)malloc(maxPreservedBufferSize); - // 
memset(outBytes, 0, maxPreservedBufferSize); - checkCudaErrors(cudaMalloc(&d_outBytes, maxPreservedBufferSize)); - - size_t *d_outSize; - - checkCudaErrors(cudaMalloc(&d_outSize, sizeof(size_t))); - - // device_post_proc<<<1,1>>>(d_outSize, d_oriData, d_meta, d_offsets, d_midBytes, d_outBytes, nbEle, blockSize, *num_sig, d_blk_idx, d_blk_vals, d_blk_subidx, d_blk_sig); - *outSize = better_post_proc(d_outSize, d_oriData, d_meta, d_offsets, d_midBytes, d_outBytes, nbEle, blockSize, *num_sig, d_blk_idx, d_blk_vals, d_blk_subidx, d_blk_sig); - //cudaDeviceSynchronize(); - - //checkCudaErrors(cudaMemcpy(outSize, d_outSize, sizeof(size_t), cudaMemcpyDeviceToHost)); - - // printf("completed compression\n"); - //free(blk_idx); - //free(blk_subidx); - //free(blk_vals); - // free(meta); - // free(offsets); - // free(midBytes); - checkCudaErrors(cudaFree(d_num_sig)); - checkCudaErrors(cudaFree(d_blk_idx)); - checkCudaErrors(cudaFree(d_blk_subidx)); - checkCudaErrors(cudaFree(d_blk_vals)); - checkCudaErrors(cudaFree(d_blk_sig)); - - checkCudaErrors(cudaFree(d_meta)); - checkCudaErrors(cudaFree(d_offsets)); - checkCudaErrors(cudaFree(d_midBytes)); - - unsigned char *d_newout; - - *outSize = *outSize; - size_t os = *outSize; - - checkCudaErrors(cudaMalloc(&d_newout, os)); - //fin_copy<<<40,256>>>(d_outBytes, d_newout,os); - checkCudaErrors(cudaMemcpy(d_newout, d_outBytes, os, cudaMemcpyDeviceToDevice)); - cudaDeviceSynchronize(); - - checkCudaErrors(cudaFree(d_outBytes)); - printf("Compression end timestamp: %f ms\n", timer_GPU.GetCounter()); - - err = cudaGetLastError(); // Get error code - printf("CUDA Error: %s\n", cudaGetErrorString(err)); - return d_newout; - //return d_outBytes; -} - -__device__ inline long bytesToLong_bigEndian(unsigned char* b) { - long temp = 0; - long res = 0; - - res <<= 8; - temp = b[0] & 0xff; - res |= temp; - - res <<= 8; - temp = b[1] & 0xff; - res |= temp; - - res <<= 8; - temp = b[2] & 0xff; - res |= temp; - - res <<= 8; - temp = b[3] & 0xff; - res |= temp; - - res <<= 8; - temp = b[4] & 0xff; - res |= temp; - - res <<= 8; - temp = b[5] & 0xff; - res |= temp; - - res <<= 8; - temp = b[6] & 0xff; - res |= temp; - - res <<= 8; - temp = b[7] & 0xff; - res |= temp; - - return res; -} - -__device__ inline size_t bytesToSize(unsigned char* bytes) -{ - size_t result = bytesToLong_bigEndian(bytes);//8 - return result; -} - -__device__ inline short bytesToShort(unsigned char* bytes) -{ - lint16 buf; - memcpy(buf.byte, bytes, 2); - - return buf.svalue; -} - -__global__ void decompress_get_stats(float *newData, size_t nbEle, unsigned char* cmpBytes, - size_t *numSigValues, int *bs, - size_t *numConstantBlks, size_t *numBlks, - size_t *mSizeptr, unsigned char *newCmpBytes -){ - unsigned char* r = cmpBytes; - - size_t num_sig; - r += 4; - int blockSize = (int) r[0]; //get block size - - if(blockSize == 0)blockSize = 256; - r++; - size_t nbConstantBlocks = bytesToLong_bigEndian(r); //get number of constant blocks - r += sizeof(size_t); - num_sig = bytesToSize(r); - - r += sizeof(size_t); - size_t nbBlocks = nbEle/blockSize; - size_t ncBlocks = 0; - size_t num_state2_blks = 0; - // size_t ncBlocks = nbBlocks - nbConstantBlocks; //get number of constant blocks - size_t stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; - size_t ncLeading = blockSize/4; - size_t mSize = sizeof(float)+1+ncLeading; //Number of bytes for each data block's metadata. 
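// hand the parsed header fields (metadata size, block counts, significant-value count, block size) back to the caller through the out-parameters below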
- - *mSizeptr = mSize; - - *numConstantBlks = nbConstantBlocks; - *numBlks = nbBlocks; - *numSigValues = num_sig; - *bs = blockSize; - newCmpBytes = r; - -} - - void setup_data_stateArray_better(float *newData, size_t nbEle, unsigned char* r, - size_t num_sig, int blockSize, - size_t nbConstantBlocks, size_t nbBlocks, size_t *ncBlks, - unsigned char *stateArray, unsigned char *newR -){ - - //printf("ma\n"); - // blockSize = 256; - r += 4; - r++; - r += sizeof(size_t); - r += sizeof(size_t); - int ncBlocks, *ncBlocks_d; - size_t stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; - int num_state2_blks, *num_state2_d; - checkCudaErrors(cudaMalloc((void **)&num_state2_d, sizeof(int))); - checkCudaErrors(cudaMalloc((void **)&ncBlocks_d, sizeof(int))); - checkCudaErrors(cudaMemset(num_state2_d, 0, sizeof(int))); - checkCudaErrors(cudaMemset(ncBlocks_d, 0, sizeof(int))); - - //printf("ma2\n"); -// printf("Converting state array\n"); - // printf("cmp %d\n", (int)r[0]); - // printf("state %d\n", (int)stateArray[0]); - // convert_out_to_state(nbBlocks, r, stateArray); - convert_out_to_state_kernel<<>>(nbBlocks,r,stateArray,stateNBBytes, - num_state2_d, ncBlocks_d); - // printf("state %d\n", (int)stateArray[0]); - // convertByteArray2IntArray_fast_1b_args(nbBlocks, r, stateNBBytes, stateArray); //get the stateArray - cudaDeviceSynchronize(); - - //printf("ma3\n"); - r += stateNBBytes; - newR = r; - cudaMemcpy(&ncBlocks, ncBlocks_d, sizeof(int), cudaMemcpyDeviceToHost); - - //printf("ma4\n"); - *ncBlks = ncBlocks; - - //printf("ma4\n"); - } - -__global__ void setup_data_stateArray(float *newData, size_t nbEle, unsigned char* r, - size_t num_sig, int blockSize, - size_t nbConstantBlocks, size_t nbBlocks, size_t *ncBlks, - unsigned char *stateArray, unsigned char *newR -){ - // blockSize = 256; - r += 4; - r++; - r += sizeof(size_t); - r += sizeof(size_t); - size_t ncBlocks = 0; - size_t stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; - size_t num_state2_blks = 0; -// printf("Converting state array\n"); - // printf("cmp %d\n", (int)r[0]); - // printf("state %d\n", (int)stateArray[0]); - convert_out_to_state(nbBlocks, r, stateArray); - // convert_out_to_state_kernel<<<40,256>>>(nbBlocks,r,stateArray,stateNBBytes); - // printf("state %d\n", (int)stateArray[0]); - // convertByteArray2IntArray_fast_1b_args(nbBlocks, r, stateNBBytes, stateArray); //get the stateArray - for (size_t i = 0; i < nbBlocks; i++) - { - if (stateArray[i] == 2) - { - num_state2_blks++; - }else if(stateArray[i] == 3){ - ncBlocks++; - } - } - - r += stateNBBytes; - newR = r; - *ncBlks = ncBlocks; -} - -__global__ void decomp_startup_kernel(unsigned char* r, size_t nbConstantBlocks, -unsigned char *data, int blockSize, size_t mSize, size_t ncBlocks, float *constantMedianArray, uint64_t* g_leng){ - unsigned char * fr = r; //fr is the starting address of constant median values. 
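// grid-stride pass: read one float32 median per constant block into constantMedianArray and record each non-constant block's byte length in g_leng (an exclusive scan over g_leng later yields each block's payload offset)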
- int i = 0, j = 0, k = 0; - // printf("%p\n", r); - unsigned char tmp_r[4]; - tmp_r[0]=fr[0]; - tmp_r[1]=fr[1]; - tmp_r[2]=fr[2]; - tmp_r[3]=fr[3]; - - -// printf("nbconstant: %f\n", ((float*)tmp_r)[0]); -// nbConstantBlocks - for(i = blockDim.x*blockIdx.x + threadIdx.x; i < nbConstantBlocks; i += blockDim.x*gridDim.x){ //get the median values for constant-value blocks - - tmp_r[0]=fr[4*i]; - tmp_r[1]=fr[4*i+1]; - tmp_r[2]=fr[4*i+2]; - tmp_r[3]=fr[4*i+3]; - float tmp = ((float*)tmp_r)[0]; - constantMedianArray[i] = tmp; - //printf("%d %f\n", i, tmp); - } - - -/** PROBLEM AREA, CAN FIX WITH PARALLELIZATION BUT WATCH *FR and *P **/ - - // if(threadIdx.x==0 && blockIdx.x==0){ - fr += nbConstantBlocks*sizeof(float); - unsigned char* p = fr + ncBlocks * sizeof(short); - unsigned char* basefr = fr; - unsigned char* basep = p; - for(i = blockDim.x*blockIdx.x + threadIdx.x;i < ncBlocks;i+=blockDim.x*gridDim.x){ - fr = basefr+(sizeof(short)*i); - int leng = (int)bytesToShort(fr)+mSize; - g_leng[i] = (uint64_t)leng; - // fr += sizeof(short); - if (leng > blockSize*sizeof(float)) - { - printf("Warning: compressed block is larger than the original block!\n"); - return; - // exit(0); - } - // memcpy(data+i*blockSize*sizeof(float), p, leng); - - // p += leng; - } - - // } -} - -__global__ void decompress_ncblk_kernel(unsigned char* r, size_t nbConstantBlocks, -unsigned char *data, int blockSize, size_t mSize, size_t ncBlocks, float *constantMedianArray, uint64_t* g_leng){ - unsigned char * fr = r; - fr += nbConstantBlocks*sizeof(float); - unsigned char* p = fr + ncBlocks * sizeof(short); - unsigned char* basefr = fr; - unsigned char* basep = p; - - for(int i = blockDim.x*blockIdx.x + threadIdx.x;i < ncBlocks;i+=blockDim.x*gridDim.x){ - fr = basefr+(sizeof(short)*i); - int leng = (int)bytesToShort(fr)+mSize; - - - // g_leng[i] = leng; - // // fr += sizeof(short); - // if (leng > blockSize*sizeof(float)) - // { - // printf("Warning: compressed block is larger than the original block!\n"); - // return; - // // exit(0); - // } - p = basep + g_leng[i]; - - memcpy(data+i*blockSize*sizeof(float), p, leng); - - // p += leng; - } -} - -void decompress_startup_better(float *newData, size_t nbEle, unsigned char* r, - uint32_t *blk_idx, uint8_t *blk_subidx, uint8_t *blk_sig, - float *blk_vals, size_t num_sig, int blockSize, - size_t nbConstantBlocks, size_t nbBlocks, size_t ncBlocks, - unsigned char *stateArray, float* constantMedianArray, unsigned char *data, - size_t mSize, unsigned char *newCmpBytes -){ - // blockSize = 256; - size_t nb_tmp = (int) nbEle/blockSize; - uint64_t* g_leng; - /** - * Structures to return: - * blk_idx, blk_subidx, blk_sig, blk_vals, numSigValues (pointer) - * bs (pointer to blockSize), numConstantBlks (pointer), numBlks (pointer) - * ncBlks (pointer), stateArray, constantMedianArray - */ - - - size_t stateNBBytes = nb_tmp%4==0 ? 
nb_tmp/4 : nb_tmp/4+1; - - r += 4; - r++; - r += sizeof(size_t); - r += sizeof(size_t); - - r += stateNBBytes; - - convert_out_to_block2_kernel<<>>(r, nbBlocks, (uint64_t)num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); - size_t to_add = nbBlocks*4 + num_sig*sizeof(float) + num_sig*sizeof(uint8_t) + nbBlocks*sizeof(uint8_t); - r+= to_add; - - size_t i = 0, j = 0, k = 0; //k is used to keep track of constant block index - - // printf("before mallocs in kernel\n"); - checkCudaErrors(cudaMemcpy((newData)+nbBlocks*blockSize, r, (nbEle%blockSize)*sizeof(float), cudaMemcpyDeviceToDevice)); - checkCudaErrors(cudaMalloc(&g_leng, sizeof(uint64_t)*ncBlocks)); - // memcpy((newData)+nbBlocks*blockSize, r, (nbEle%blockSize)*sizeof(float)); - - //printf("before mallocs in kernel %p\n", r); - r += (nbEle%blockSize)*sizeof(float); - //printf("r: %p\n", r); - //printf("%d, %d, %d\n",nbEle, 256, nbEle%256); - decomp_startup_kernel<<>>(r, nbConstantBlocks,data, blockSize, mSize, ncBlocks, constantMedianArray, g_leng); - cudaDeviceSynchronize(); - - thrust::exclusive_scan(thrust::device, g_leng, g_leng + ncBlocks, g_leng, 0); - - decompress_ncblk_kernel<<>>(r, nbConstantBlocks, data, blockSize, mSize, ncBlocks, constantMedianArray, g_leng); - cudaDeviceSynchronize(); - - // cudaError_t err = cudaGetLastError(); // Get error code - - // printf("CUDA Error: %s\n", cudaGetErrorString(err)); - cudaFree(g_leng); - - // err = cudaGetLastError(); // Get error code - // printf("CUDA Error: %s\n", cudaGetErrorString(err)); - r += nbConstantBlocks*sizeof(float); - - newCmpBytes = r; - -} - -__global__ void decompress_startup(float *newData, size_t nbEle, unsigned char* r, - uint32_t *blk_idx, uint8_t *blk_subidx, uint8_t *blk_sig, - float *blk_vals, size_t num_sig, int blockSize, - size_t nbConstantBlocks, size_t nbBlocks, size_t ncBlocks, - unsigned char *stateArray, float* constantMedianArray, unsigned char *data, - size_t mSize, unsigned char *newCmpBytes -){ - // blockSize = 256; - size_t nb_tmp = (int) nbEle/blockSize; - /** - * Structures to return: - * blk_idx, blk_subidx, blk_sig, blk_vals, numSigValues (pointer) - * bs (pointer to blockSize), numConstantBlks (pointer), numBlks (pointer) - * ncBlks (pointer), stateArray, constantMedianArray - */ - - // size_t ncBlocks = 0; - // size_t stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; - // size_t num_state2_blks = 0; - // printf("Converting state array\n"); - // convert_out_to_state(nbBlocks, r, stateArray); - // printf("state %d\n", (int)stateArray[0]); - // // convertByteArray2IntArray_fast_1b_args(nbBlocks, r, stateNBBytes, stateArray); //get the stateArray - // for (size_t i = 0; i < nbBlocks; i++) - // { - // if (stateArray[i] == 2) - // { - // num_state2_blks++; - // }else if(stateArray[i] == 3){ - // ncBlocks++; - // } - // } - // size_t stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; - - size_t stateNBBytes = nb_tmp%4==0 ? 
nb_tmp/4 : nb_tmp/4+1; - //printf("%p\n", r); - r += 4; - r++; - r += sizeof(size_t); - r += sizeof(size_t); - //printf("statenb %d %d\n", stateNBBytes, nb_tmp); - r += stateNBBytes; - // data = (unsigned char*)malloc(ncBlocks*blockSize*sizeof(float)); - // memset(data, 0, ncBlocks*blockSize*sizeof(float)); - // printf("converting block vals %d\n", data[0]); - size_t to_add = convert_out_to_block2(r, nbBlocks, (uint64_t)num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); - r+= to_add; - - size_t i = 0, j = 0, k = 0; //k is used to keep track of constant block index - - // printf("before mallocs in kernel\n"); - - memcpy((newData)+nbBlocks*blockSize, r, (nbEle%blockSize)*sizeof(float)); - - //printf("before mallocs in kernel %p\n", r); - r += (nbEle%blockSize)*sizeof(float); - //printf("r: %p\n", r); - //printf("%d, %d, %d\n",nbEle, 256, nbEle%256); - unsigned char * fr = r; //fr is the starting address of constant median values. - - // printf("%p\n", r); - unsigned char tmp_r[4]; - tmp_r[0]=r[0]; - tmp_r[1]=r[1]; - tmp_r[2]=r[2]; - tmp_r[3]=r[3]; - - -// printf("nbconstant: %f\n", ((float*)tmp_r)[0]); - for(i = 0;i < nbConstantBlocks;i++, j+=4){ //get the median values for constant-value blocks - - tmp_r[0]=r[j]; - tmp_r[1]=r[j+1]; - tmp_r[2]=r[j+2]; - tmp_r[3]=r[j+3]; - float tmp = ((float*)tmp_r)[0]; -// printf("median: %f\n", tmp); - constantMedianArray[i] = tmp; - - // printf("%d %f\n", i, tmp); - } - //printf("after constantmedian\n"); - r += nbConstantBlocks*sizeof(float); - unsigned char* p = r + ncBlocks * sizeof(short); - for(i = 0;i < ncBlocks;i++){ - int leng = (int)bytesToShort(r)+mSize; - r += sizeof(short); - if (leng > blockSize*sizeof(float)) - { - printf("Warning: compressed block is larger than the original block!\n"); - return; - // exit(0); - } -// printf("before memcpy\n"); - memcpy(data+i*blockSize*sizeof(float), p, leng); - // printf("after memcpy\n"); - p += leng; - } - - newCmpBytes = r; -// printf("before mallocs in kernel\n"); - - // printf("nb blocks: %d\n", nbBlocks); -} - -__global__ void cBlkCopy_decompress(int nb, float* constantMedianArray, float *newData, int blockSize, int i){ - int j; - float Median = constantMedianArray[nb]; - // j = threadIdx.x; j < blockSize; j += blockDim.x - for (j = threadIdx.x; j < blockSize; j += blockDim.x) - *((newData)+i*blockSize+j) = Median; -} - -__global__ void ncBlkCopy_decompress(int blockSize, float *newData, int nc, float *fdata, int i){ - int j; - for (j = threadIdx.x; j < blockSize; j += blockDim.x) - *((newData)+i*blockSize+j) = fdata[nc*blockSize+j]; -} - -void decompress_post_proc_better(unsigned char *data, float *newData, int blockSize, - size_t nbBlocks, size_t ncBlocks, unsigned char *stateArray, - float *constantMedianArray -){ - // checkCudaErrors(cudaMemcpy(data, d_data, ncBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); - // checkCudaErrors(cudaMemcpy(*newData, d_newdata, nbBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); - float* fdata = (float*)data; - int i,j; - int nb=0, nc=0; - //printf("h1\n"); - for (i=0;i>>(nb, constantMedianArray, newData, blockSize, i); - nb++; - }else if(state==3){ - ncBlkCopy_decompress<<<1,256>>>(blockSize, newData, nc, fdata, i); - nc++; - } - } - cudaDeviceSynchronize(); - //for(int k = 0; k < nbBlocks*blockSize;k++){ -// printf("%f\n", newData[k]); - // } -} - -__global__ void print_newdata(float *newData, size_t nbBlocks, int blockSize){ - for (size_t i = 0; i < nbBlocks*blockSize; i++) - { - printf("%f\n", newData[i]); - } - -} - -__global__ void 
generateNbNc(size_t nbBlocks, size_t ncBlocks, unsigned char *stateArray, uint64_t* nbs, uint64_t* ncs){ - for(int i = blockDim.x*blockIdx.x + threadIdx.x;i < nbBlocks;i+=blockDim.x*gridDim.x){ - unsigned char state = stateArray[i]; - if(state==0||state==1){ - nbs[i] = 1; - ncs[i] = 0; - }else if(state==3){ - nbs[i] = 0; - ncs[i] = 1; - }else{ - nbs[i] = 0; - ncs[i] = 0; - } - } -} - -__global__ void decompress_final_set(unsigned char *data, float *newData, int blockSize, - size_t nbBlocks, size_t ncBlocks, unsigned char *stateArray, - float *constantMedianArray, uint64_t* nb, uint64_t* nc){ - float* fdata = (float*)data; - for (int i = blockIdx.x;i < nbBlocks;i+=gridDim.x){ - if (stateArray[i]==0 || stateArray[i]==1){ - float Median = constantMedianArray[nb[i]]; - // if (Median>1) printf("data%i:%f\n",i, Median); - for (int j = threadIdx.x; j < blockSize; j += blockDim.x) - *((newData)+i*blockSize+j) = Median; - // nb++; - }else if(stateArray[i]==3){ - for (int j = threadIdx.x; j < blockSize; j += blockDim.x) - *((newData)+i*blockSize+j) = fdata[nc[i]*blockSize+j]; - // nc++; - } - __syncthreads(); - } -} - -void decompress_post_proc_fast(unsigned char *data, float *newData, int blockSize, - size_t nbBlocks, size_t ncBlocks, unsigned char *stateArray, - float *constantMedianArray -){ - // checkCudaErrors(cudaMemcpy(data, d_data, ncBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); - // checkCudaErrors(cudaMemcpy(*newData, d_newdata, nbBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); - - int i,j; - uint64_t *nb, *nc; - checkCudaErrors(cudaMalloc(&nb, sizeof(uint64_t)*nbBlocks)); - checkCudaErrors(cudaMalloc(&nc, sizeof(uint64_t)*nbBlocks)); - - generateNbNc<<>>(nbBlocks, ncBlocks, stateArray, nb,nc); - cudaDeviceSynchronize(); - thrust::exclusive_scan(thrust::device, nb, nb + nbBlocks, nb, 0); - thrust::exclusive_scan(thrust::device, nc, nc + nbBlocks, nc, 0); - - decompress_final_set<<>>(data, newData, blockSize,nbBlocks, ncBlocks, stateArray,constantMedianArray, nb, nc); - cudaDeviceSynchronize(); - cudaFree(nb); - cudaFree(nc); -} - -__global__ void decompress_post_proc(unsigned char *data, float *newData, int blockSize, - size_t nbBlocks, size_t ncBlocks, unsigned char *stateArray, - float *constantMedianArray -){ - // checkCudaErrors(cudaMemcpy(data, d_data, ncBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); - // checkCudaErrors(cudaMemcpy(*newData, d_newdata, nbBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); - float* fdata = (float*)data; - int i,j; - int nb=0, nc=0; - // if (blockIdx.x == 0) - // { - // for (i=0;i1) printf("data%i:%f\n",i, Median); - // for (j = threadIdx.x; j < blockSize; j += blockDim.x) - // *((newData)+i*blockSize+j) = Median; - // nb++; - // } - // } - // }else{ - // for (i=0;i1) printf("data%i:%f\n",i, Median); - for (j = threadIdx.x; j < blockSize; j += blockDim.x) - *((newData)+i*blockSize+j) = Median; - nb++; - }else if(stateArray[i]==3){ - for (j = threadIdx.x; j < blockSize; j += blockDim.x) - *((newData)+i*blockSize+j) = fdata[nc*blockSize+j]; - nc++; - } - } - - //for(int k = 0; k < nbBlocks*blockSize;k++){ -// printf("%f\n", newData[k]); - // } -} - -float* device_ptr_cuSZx_decompress_float(size_t nbEle, unsigned char* cmpBytes) -{ - /** - * Assume the following are device pointers - * - * unsigned char* cmpBytes - * float** newData - * - */ - - uint32_t *blk_idx; - uint8_t *blk_subidx; - uint8_t *blk_sig; - float *blk_vals, *constantMedianArray; - size_t *num_sig, *mSize, mSize_h, num_sig_h; - int *blockSize, bs; - 
size_t *nbConstantBlocks, *nbBlocks, *ncBlocks, nbBlocks_h, ncBlocks_h, nbConstantBlocks_h; - unsigned char *stateArray, *data; - float *newData; - timer_GPU.StartCounter(); - unsigned char *oldCmpBytes = cmpBytes; - //*newData = (float*)malloc(sizeof(float)*nbEle); -// printf("cmpbytes check %d\n", (int)cmpBytes[0]); -// printf("new check %f\n", *newData[0]); - // printf("malloc\n"); - checkCudaErrors(cudaMalloc((void**)&num_sig, sizeof(size_t))); - checkCudaErrors(cudaMalloc((void**)&blockSize, sizeof(int))); - checkCudaErrors(cudaMalloc((void**)&nbConstantBlocks, sizeof(size_t))); - checkCudaErrors(cudaMalloc((void**)&nbBlocks, sizeof(size_t))); - checkCudaErrors(cudaMalloc((void**)&ncBlocks, sizeof(size_t))); - checkCudaErrors(cudaMalloc((void**)&mSize, sizeof(size_t))); - checkCudaErrors(cudaMalloc((void**)&newData, sizeof(float)*nbEle)); - - decompress_get_stats<<<1,1>>>(newData, nbEle, cmpBytes, - num_sig, blockSize, - nbConstantBlocks, nbBlocks, - mSize, cmpBytes - ); - cudaDeviceSynchronize(); - - cudaError_t err = cudaGetLastError(); // Get error code - //printf("CUDA Error: %s\n", cudaGetErrorString(err)); - checkCudaErrors(cudaMemcpy(&nbBlocks_h, nbBlocks, sizeof(size_t), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(&nbConstantBlocks_h, nbConstantBlocks, sizeof(size_t), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(&bs, blockSize, sizeof(int), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(&mSize_h, mSize, sizeof(size_t), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(&num_sig_h, num_sig, sizeof(size_t), cudaMemcpyDeviceToHost)); - - - checkCudaErrors(cudaMalloc((void**)&stateArray, nbBlocks_h)); - checkCudaErrors(cudaMalloc((void**)&constantMedianArray, nbConstantBlocks_h*sizeof(float))); - - checkCudaErrors(cudaMalloc((void**)&blk_idx, nbBlocks_h*sizeof(uint32_t))); - checkCudaErrors(cudaMalloc((void**)&blk_vals, num_sig_h*sizeof(float))); - checkCudaErrors(cudaMalloc((void**)&blk_subidx, num_sig_h*sizeof(uint8_t))); - checkCudaErrors(cudaMalloc((void**)&blk_sig, nbBlocks_h*sizeof(uint8_t))); - - unsigned char* tmp_r = cmpBytes; - unsigned char* newR; - setup_data_stateArray_better(newData, nbEle, tmp_r, - num_sig_h, bs, - nbConstantBlocks_h, nbBlocks_h, &ncBlocks_h, - stateArray, newR); - - - - // setup_data_stateArray<<<1,1>>>(newData, nbEle, cmpBytes, - // num_sig_h, bs, - // nbConstantBlocks_h, nbBlocks_h, ncBlocks, - // stateArray, cmpBytes - // ); - // cudaDeviceSynchronize(); - - // printf("%s\n", cudaGetErrorString(cudaGetLastError())); - // checkCudaErrors(cudaMemcpy(&ncBlocks_h, ncBlocks, sizeof(size_t), cudaMemcpyDeviceToHost)); - - checkCudaErrors(cudaMalloc((void**)&data, ncBlocks_h*bs*sizeof(float))); - - // err = cudaGetLastError(); // Get error code - // printf("CUDA start Error: %s\n", cudaGetErrorString(err)); - // cmpBytes = newCmpBytes; - // data = (unsigned char*)malloc(ncBlocks*blockSize*sizeof(float)); - // memset(data, 0, ncBlocks*blockSize*sizeof(float)); - // stateArray = (unsigned char*)malloc(nbBlocks); - - // // unsigned char* d_stateArray; - // // cudaMalloc(&d_stateArray, nbBlocks); - // constantMedianArray = (float*)malloc(nbConstantBlocks*sizeof(float)); - - // blk_idx = (uint32_t *)malloc(nbBlocks*sizeof(uint32_t)); - // blk_vals= (float *)malloc((num_sig)*sizeof(float)); - // blk_subidx = (uint8_t *)malloc((num_sig)*sizeof(uint8_t)); - // blk_sig = (uint8_t *)malloc(nbBlocks*sizeof(uint8_t)); - - //printf("%s\n", cudaGetErrorString(cudaGetLastError())); - //test_nbBlks = (size_t *)malloc(sizeof(size_t)); - 
// printf("malloc\n"); - - - tmp_r = cmpBytes; - decompress_startup_better(newData, nbEle, tmp_r, - blk_idx, blk_subidx, blk_sig, - blk_vals, num_sig_h, bs, - nbConstantBlocks_h, nbBlocks_h, ncBlocks_h, - stateArray, constantMedianArray, data, - mSize_h, newR); - - - // err = cudaGetLastError(); // Get error code - // printf("CUDA start Error: %s\n", cudaGetErrorString(err)); - //decompress_startup<<<1,1>>>(newData, nbEle, cmpBytes, - // blk_idx, blk_subidx, blk_sig, - // blk_vals, num_sig_h, bs, - // nbConstantBlocks_h, nbBlocks_h, ncBlocks_h, - // stateArray, constantMedianArray, data, mSize_h, cmpBytes); - //cudaDeviceSynchronize(); - // cmpBytes = newCmpBytes; - - //printf("%s\n", cudaGetErrorString(cudaGetLastError())); - - // unsigned char* d_data; - float *d_newdata; - // checkCudaErrors(cudaMalloc((void**)&d_data, ncBlocks*blockSize*sizeof(float))); - // checkCudaErrors(cudaMemcpy(d_data, data, ncBlocks*blockSize*sizeof(float), cudaMemcpyHostToDevice)); - // printf("nblocks: %d bs: %d ncblock %d\n", nbBlocks_h, bs, ncBlocks_h); - checkCudaErrors(cudaMalloc(&d_newdata, nbBlocks_h*bs*sizeof(float))); - - // err = cudaGetLastError(); // Get error code - // printf("CUDA dec main Error: %s\n", cudaGetErrorString(err)); - - dim3 dimBlock(32, bs/32); - dim3 dimGrid(65536, 1); - const int sMemsize = bs * sizeof(float) + dimBlock.y * sizeof(int); - decompress_state2<<>>(d_newdata, stateArray,blk_idx, blk_vals, blk_subidx, bs, blk_sig); - cudaDeviceSynchronize(); - - // err = cudaGetLastError(); // Get error code - // printf("CUDA dec main Error: %s\n", cudaGetErrorString(err)); - decompress_float<<>>(data, bs, ncBlocks_h, mSize_h); - //printf("GPU decompression timing: %f ms\n", timer_GPU.GetCounter()); - cudaDeviceSynchronize(); - - // err = cudaGetLastError(); // Get error code - // printf("CUDA dec main Error: %s\n", cudaGetErrorString(err)); - - // checkCudaErrors(cudaMemcpy(data, d_data, ncBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(newData, d_newdata, nbBlocks_h*bs*sizeof(float), cudaMemcpyDeviceToDevice)); - cudaFree(d_newdata); - - // decompress_post_proc<<<1,1>>>(data, newData, bs, - // nbBlocks_h, ncBlocks_h, stateArray, - // constantMedianArray); - // cudaDeviceSynchronize(); - decompress_post_proc_fast(data, newData, bs, - nbBlocks_h, ncBlocks_h, stateArray, - constantMedianArray); - err = cudaGetLastError(); // Get error code - printf("CUDA Error: %s\n", cudaGetErrorString(err)); - printf("GPU decompression timing: %f ms\n", timer_GPU.GetCounter()); - // print_newdata<<<1,1>>>(newData, nbBlocks_h, bs); - cudaFree(stateArray); - cudaFree(constantMedianArray); - cudaFree(data); - cudaFree(blk_idx); - cudaFree(blk_subidx); - cudaFree(blk_vals); - cudaFree(blk_sig); - return newData; - -} - +#include "cuszx_entry.h" +#include "szx_defines.h" +#include "szx_BytesToolkit.h" +#include "szx_TypeManager.h" +#include "timingGPU.h" +#include "szx.h" +#include +#include +#include +#include +#include +#include +#include + +#define SPARSITY_LEVEL 0.25 +#define BLOCKS 40 +#define THREADS_PER_BLOCK 256 + +TimingGPU timer_GPU; +void bin(unsigned n) +{ + unsigned i; + for (i = 1 << 31; i > 0; i = i / 2) + (n & i) ? 
printf("1") : printf("0"); +} + +__host__ __device__ size_t convert_state_to_out(unsigned char* meta, size_t length, unsigned char *result){ + size_t out_length; + + if(length%4==0) + out_length = length/4; + else + out_length = length/4+1; + + for (size_t i = 0; i < out_length; i++) + { + uint8_t tmp = 0; + + for (size_t j = 0; j < 4; j++) + { + if (i*4 + j < length) + { + tmp |= (0x03 & meta[i*4+j]) << 2*j; + } + + } + result[i] = tmp; + } + return out_length; +} + +__global__ void convert_state_to_out_kernel(unsigned char* meta, size_t length, unsigned char *result, size_t out_length){ + + + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < out_length; i += blockDim.x*gridDim.x){ + uint8_t tmp = 0; + + for (size_t j = 0; j < 4; j++) + { + if (i*4 + j < length) + { + tmp |= (0x03 & meta[i*4+j]) << 2*j; + } + + } + result[i] = tmp; + } +} + +__global__ void convert_out_to_state_kernel(size_t nbBlocks, unsigned char* cmp, unsigned char* out_state, size_t state_length, int *num_state2blks, int *ncBlocks){ + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < state_length; i += blockDim.x*gridDim.x){ + for (size_t j = 0; j < 4; j++) + { + if (4*i + j < nbBlocks) + { + out_state[4*i + j]= (cmp[i] >> 2*j) & 0x03; + if (out_state[4*i+j] == 2) + { + atomicAdd(num_state2blks, 1); + }else if(out_state[4*i+j]==3){ + atomicAdd(ncBlocks, 1); + } + + } + + } + } +} + +// nbBlocks, r, stateNBBytes, stateArray +__host__ __device__ size_t convert_out_to_state(size_t nbBlocks, unsigned char* cmp, unsigned char* out_state){ + size_t state_length; + if(nbBlocks%4==0) + state_length = nbBlocks/4; + else + state_length = nbBlocks/4+1; + + for (size_t i = 0; i < state_length; i++) + { + for (size_t j = 0; j < 4; j++) + { + if (4*i + j < nbBlocks) + { + out_state[4*i + j]= (cmp[i] >> 2*j) & 0x03; + } + + } + } + return nbBlocks; +} + +__host__ __device__ size_t convert_block2_to_out(unsigned char *result, uint32_t numBlocks, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig){ + size_t out_length = 0; + + memcpy(result, blk_idx, numBlocks*sizeof(uint32_t)); + out_length += numBlocks*4; + memcpy(result+out_length, blk_vals, num_sig*sizeof(float)); + out_length += num_sig*sizeof(float); + memcpy(result+out_length, blk_subidx, num_sig*sizeof(uint8_t)); + out_length += num_sig*sizeof(uint8_t); + memcpy(result+out_length, blk_sig, numBlocks*sizeof(uint8_t)); + out_length+= numBlocks*sizeof(uint8_t); + + return out_length; +} + +__global__ void convert_block2_to_out_kernel(unsigned char *result, uint32_t numBlocks, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig){ + + size_t out_length = 0; + unsigned char *tmp_result = result; + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < numBlocks; i += blockDim.x*gridDim.x){ + uint32_t local_blkidx = blk_idx[i]; + tmp_result[4*i] = (local_blkidx) & 0xff; + tmp_result[4*i+1] = (local_blkidx >> (8*1)) & 0xff; + tmp_result[4*i+2] = (local_blkidx >> (8*2)) & 0xff; + tmp_result[4*i+3] = (local_blkidx >> (8*3)) & 0xff; + } + // memcpy(result, blk_idx, numBlocks*sizeof(uint32_t)); + out_length += numBlocks*4; + tmp_result = result+out_length; + + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < num_sig; i += blockDim.x*gridDim.x){ + float value = blk_vals[i]; + memcpy(&tmp_result[4*i], &value, sizeof(float)); + //unsigned char *v = () + //tmp_result[(int)4*i] = (unsigned char)((value) & 0xff); + //tmp_result[(int)4*i+1] = (unsigned char)((value >> (8*1)) & 0xff); + 
//tmp_result[(int)4*i+2] = (unsigned char)((value >> (8*2)) & 0xff); + //tmp_result[(int)4*i+3] = (unsigned char)((value >> (8*3)) & 0xff); + } + // memcpy(result+out_length, blk_vals, num_sig*sizeof(float)); + out_length += num_sig*sizeof(float); + tmp_result = result+out_length; + + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < num_sig; i += blockDim.x*gridDim.x){ + tmp_result[i] = blk_subidx[i]; + + } + + out_length += num_sig*sizeof(uint8_t); + tmp_result = result+out_length; + + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < numBlocks; i += blockDim.x*gridDim.x){ + tmp_result[i] = blk_sig[i]; + + } + out_length+= numBlocks*sizeof(uint8_t); + + // return out_length; +} + +__global__ void convert_out_to_block2_kernel(unsigned char *in_cmp, uint32_t numBlocks, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig){ + size_t out_length = 0; + + unsigned char *tmp_result = in_cmp; + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < numBlocks; i += blockDim.x*gridDim.x){ + + uint32_t local_blkidx = (tmp_result[4*i] & 0xff) | ((tmp_result[4*i+1] & 0xff) << (8*1)) + | ((tmp_result[4*i+2] & 0xff) << (8*2)) | ((tmp_result[4*i+3] & 0xff) << (8*3)); + blk_idx[i] = local_blkidx; + } + // memcpy(result, blk_idx, numBlocks*sizeof(uint32_t)); + out_length += numBlocks*4; + tmp_result = in_cmp+out_length; + + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < num_sig; i += blockDim.x*gridDim.x){ + float value = 0.0; + memcpy(&value, &tmp_result[4*i], sizeof(float)); + blk_vals[i] = value; + + //unsigned char *v = () + //tmp_result[(int)4*i] = (unsigned char)((value) & 0xff); + //tmp_result[(int)4*i+1] = (unsigned char)((value >> (8*1)) & 0xff); + //tmp_result[(int)4*i+2] = (unsigned char)((value >> (8*2)) & 0xff); + //tmp_result[(int)4*i+3] = (unsigned char)((value >> (8*3)) & 0xff); + } + // memcpy(result+out_length, blk_vals, num_sig*sizeof(float)); + out_length += num_sig*sizeof(float); + tmp_result = in_cmp+out_length; + + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < num_sig; i += blockDim.x*gridDim.x){ + blk_subidx[i] = tmp_result[i]; + + } + + out_length += num_sig*sizeof(uint8_t); + tmp_result = in_cmp+out_length; + + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < numBlocks; i += blockDim.x*gridDim.x){ + blk_sig[i] = tmp_result[i]; + + } + out_length+= numBlocks*sizeof(uint8_t); +} + +__host__ __device__ size_t convert_out_to_block2(unsigned char *in_cmp, uint32_t numBlocks, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig){ + size_t out_length = 0; + memcpy(blk_idx, in_cmp, numBlocks*sizeof(uint32_t)); + out_length += numBlocks*4; + memcpy(blk_vals, in_cmp+out_length,num_sig*sizeof(float)); + out_length += num_sig*sizeof(float); + memcpy(blk_subidx, in_cmp+out_length, num_sig*sizeof(uint8_t)); + out_length += num_sig*sizeof(uint8_t); + memcpy(blk_sig, in_cmp+out_length, numBlocks*sizeof(uint8_t)); + out_length += numBlocks*sizeof(uint8_t); +// printf("outlength: %d\n",out_length); + return out_length; +} + +int _post_proc(float *oriData, unsigned char *meta, short *offsets, unsigned char *midBytes, unsigned char *outBytes, size_t nbEle, int blockSize, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig) +{ + int out_size = 0; + + size_t nbConstantBlocks = 0; + size_t nbBlocks = nbEle/blockSize; + size_t ncBytes = blockSize/4; + size_t mSize = sizeof(float)+1+ncBytes; //Number of bytes for each data block's metadata. 
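+    /*
+     * Worked example of the sizes above, assuming the default blockSize of 256
+     * used elsewhere in this file (illustrative numbers only):
+     *
+     *     ncBytes = 256/4              = 64 bytes   (2 bits per value in a block)
+     *     mSize   = sizeof(float)+1+64 = 69 bytes   of metadata per non-constant block
+     *
+     * The per-block short stored in offsets[] then records the payload bytes on
+     * top of mSize, matching leng = bytesToShort(...) + mSize on the
+     * decompression side.
+     */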
+ out_size += 5+sizeof(size_t)+sizeof(float)*nbBlocks; + if (nbBlocks%8==0) + out_size += nbBlocks/8; + else + out_size += nbBlocks/8+1; + int s0 = 0; + int s1 = 0; + int s2 = 0; + int s3 = 0; + for (int i=0; i>>(d_oriData, threshold, nbEle); + // cudaDeviceSynchronize(); + dim3 dimBlock(32, blockSize/32); + dim3 dimGrid(65536, 1); + const int sMemsize = blockSize * sizeof(float) + dimBlock.y * sizeof(int); + compress_float<<>>(d_oriData, d_meta, d_offsets, d_midBytes, absErrBound, blockSize, nbBlocks, mSize, sparsity_level, d_blk_idx, d_blk_subidx,d_blk_vals, threshold, d_blk_sig); + cudaError_t err = cudaGetLastError(); // Get error code + printf("CUDA Error: %s\n", cudaGetErrorString(err)); + printf("GPU compression timing: %f ms\n", timer_GPU.GetCounter()); + cudaDeviceSynchronize(); + get_numsig<<<1,1>>>(d_num_sig); + cudaDeviceSynchronize(); + + checkCudaErrors(cudaMemcpy(num_sig, d_num_sig, sizeof(uint64_t), cudaMemcpyDeviceToHost)); + + blk_idx = (uint32_t *)malloc(nbBlocks*sizeof(uint32_t)); + blk_vals= (float *)malloc((*num_sig)*sizeof(float)); + blk_subidx = (uint8_t *)malloc((*num_sig)*sizeof(uint8_t)); + blk_sig = (uint8_t *)malloc(nbBlocks*sizeof(uint8_t)); + + checkCudaErrors(cudaMemcpy(meta, d_meta, msz, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(offsets, d_offsets, nbBlocks*sizeof(short), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(midBytes, d_midBytes, mbsz, cudaMemcpyDeviceToHost)); + + + checkCudaErrors(cudaMemcpy(blk_idx, d_blk_idx, nbBlocks*sizeof(uint32_t), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(blk_vals,d_blk_vals, (*num_sig)*sizeof(float), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(blk_subidx,d_blk_subidx, (*num_sig)*sizeof(uint8_t), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(blk_sig,d_blk_sig, (nbBlocks)*sizeof(uint8_t), cudaMemcpyDeviceToHost)); + + size_t maxPreservedBufferSize = sizeof(float)*nbEle; + unsigned char* outBytes = (unsigned char*)malloc(maxPreservedBufferSize); + memset(outBytes, 0, maxPreservedBufferSize); + + outSize = (size_t *)malloc(sizeof(size_t)); + //outSize[0] = _post_proc(oriData, meta, offsets, midBytes, outBytes, nbEle, blockSize, *num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); + + *outSize = _post_proc(oriData, meta, offsets, midBytes, outBytes, nbEle, blockSize, *num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); +// printf("Beginning free\n"); + // printf("outsize %p \n", outBytes); + free(blk_idx); + free(blk_subidx); + free(blk_vals); + free(meta); + free(offsets); + free(midBytes); + checkCudaErrors(cudaFree(d_meta)); + checkCudaErrors(cudaFree(d_offsets)); + checkCudaErrors(cudaFree(d_midBytes)); + return outBytes; +} + +void cuSZx_fast_decompress_args_unpredictable_blocked_float(float** newData, size_t nbEle, unsigned char* cmpBytes) +{ + uint32_t *blk_idx, *d_blk_idx; + uint8_t *blk_subidx, *d_blk_subidx; + uint8_t *blk_sig, *d_blk_sig; + float *blk_vals, *d_blk_vals; + size_t num_sig, *d_num_sig; + + *newData = (float*)malloc(sizeof(float)*nbEle); + memset(*newData, 0, sizeof(float)*nbEle); + + unsigned char* r = cmpBytes; + r += 4; + int blockSize = r[0]; //get block size + if(blockSize == 0)blockSize = 256; + r++; + size_t nbConstantBlocks = bytesToLong_bigEndian(r); //get number of constant blocks + r += sizeof(size_t); + num_sig = bytesToSize(r); + r += sizeof(size_t); + size_t nbBlocks = nbEle/blockSize; + size_t ncBlocks = 0; + size_t num_state2_blks = 0; + // size_t ncBlocks = nbBlocks - nbConstantBlocks; //get number of constant blocks + size_t 
stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; + size_t ncLeading = blockSize/4; + size_t mSize = sizeof(float)+1+ncLeading; //Number of bytes for each data block's metadata. + unsigned char* stateArray = (unsigned char*)malloc(nbBlocks); + unsigned char* d_stateArray; + cudaMalloc(&d_stateArray, nbBlocks); + float* constantMedianArray = (float*)malloc(nbConstantBlocks*sizeof(float)); + + + + blk_idx = (uint32_t *)malloc(nbBlocks*sizeof(uint32_t)); + blk_vals= (float *)malloc((num_sig)*sizeof(float)); + blk_subidx = (uint8_t *)malloc((num_sig)*sizeof(uint8_t)); + blk_sig = (uint8_t *)malloc(nbBlocks*sizeof(uint8_t)); + + // printf("Converting state array\n"); + convert_out_to_state(nbBlocks, r, stateArray); + // convertByteArray2IntArray_fast_1b_args(nbBlocks, r, stateNBBytes, stateArray); //get the stateArray + for (size_t i = 0; i < nbBlocks; i++) + { + if (stateArray[i] == 2) + { + num_state2_blks++; + }else if(stateArray[i] == 3){ + ncBlocks++; + } + } + + r += stateNBBytes; + unsigned char* data = (unsigned char*)malloc(ncBlocks*blockSize*sizeof(float)); + memset(data, 0, ncBlocks*blockSize*sizeof(float)); + // printf("converting block vals\n"); + size_t to_add = convert_out_to_block2(r, nbBlocks, (uint64_t)num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); + r+= to_add; + // checkCudaErrors(cudaMalloc((void **)&d_num_sig, sizeof(uint64_t))); + // num_sig = (uint64_t *)malloc(sizeof(uint64_t)); + checkCudaErrors(cudaMalloc((void **)&d_blk_idx, nbBlocks*sizeof(uint32_t))); + // blk_idx = malloc() + checkCudaErrors(cudaMalloc((void **)&d_blk_subidx, num_sig*sizeof(uint8_t))); + + checkCudaErrors(cudaMalloc((void **)&d_blk_vals, num_sig*sizeof(float))); + + checkCudaErrors(cudaMalloc((void **)&d_blk_sig, nbBlocks*sizeof(uint8_t))); + + checkCudaErrors(cudaMemcpy(d_blk_idx, blk_idx, nbBlocks*sizeof(uint32_t), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_blk_vals, blk_vals, (num_sig)*sizeof(float), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_blk_subidx, blk_subidx, (num_sig)*sizeof(uint8_t), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_stateArray, stateArray, nbBlocks, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_blk_sig, blk_sig, nbBlocks*sizeof(uint8_t), cudaMemcpyHostToDevice)); + + + size_t i = 0, j = 0, k = 0; //k is used to keep track of constant block index + memcpy((*newData)+nbBlocks*blockSize, r, (nbEle%blockSize)*sizeof(float)); + r += (nbEle%blockSize)*sizeof(float); + float* fr = (float*)r; //fr is the starting address of constant median values. 
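+    /*
+     * Layout of the compressed stream as parsed in this function (reconstructed
+     * from the pointer arithmetic around it; sizes in bytes, nbBlocks = nbEle/blockSize):
+     *
+     *     [0..3]  version/header bytes                 4
+     *     [4]     blockSize                            1
+     *             nbConstantBlocks (big-endian)        sizeof(size_t)
+     *             num_sig          (big-endian)        sizeof(size_t)
+     *             packed 2-bit block states            ceil(nbBlocks/4)
+     *             blk_idx | blk_vals | blk_subidx | blk_sig
+     *                                                  nbBlocks*4 + num_sig*4 + num_sig + nbBlocks
+     *             trailing remainder values            (nbEle % blockSize) * sizeof(float)
+     *             constant-block medians (fr)          nbConstantBlocks * sizeof(float)
+     *             per-block length shorts              ncBlocks * sizeof(short)
+     *             non-constant block payloads          per block: offsets[i] + mSize
+     */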
+ for(i = 0;i < nbConstantBlocks;i++, j+=4) //get the median values for constant-value blocks + constantMedianArray[i] = fr[i]; + r += nbConstantBlocks*sizeof(float); + unsigned char* p = r + ncBlocks * sizeof(short); + for(i = 0;i < ncBlocks;i++){ + int leng = (int)bytesToShort(r)+mSize; + r += sizeof(short); + if (leng > blockSize*sizeof(float)) + { + printf("Warning: compressed block is larger than the original block!\n"); + exit(0); + } + memcpy(data+i*blockSize*sizeof(float), p, leng); + p += leng; + } + + unsigned char* d_data; + float *d_newdata; + checkCudaErrors(cudaMalloc((void**)&d_data, ncBlocks*blockSize*sizeof(float))); + checkCudaErrors(cudaMemcpy(d_data, data, ncBlocks*blockSize*sizeof(float), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMalloc(&d_newdata, nbBlocks*blockSize*sizeof(float))); + + timer_GPU.StartCounter(); + dim3 dimBlock(32, blockSize/32); + dim3 dimGrid(65536, 1); + const int sMemsize = blockSize * sizeof(float) + dimBlock.y * sizeof(int); + decompress_state2<<>>(d_newdata, d_stateArray,d_blk_idx, d_blk_vals, d_blk_subidx,blockSize, d_blk_sig); + decompress_float<<>>(d_data, blockSize, ncBlocks, mSize); + cudaError_t err = cudaGetLastError(); // Get error code + printf("CUDA Error: %s\n", cudaGetErrorString(err)); + printf("GPU decompression timing: %f ms\n", timer_GPU.GetCounter()); + cudaDeviceSynchronize(); + checkCudaErrors(cudaMemcpy(data, d_data, ncBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(*newData, d_newdata, nbBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); + float* fdata = (float*)data; + + int nb=0, nc=0; + for (i=0;i1) printf("data%i:%f\n",i, Median); + for (j=0;j>56); + b[1] = (unsigned char)(num>>48); + b[2] = (unsigned char)(num>>40); + b[3] = (unsigned char)(num>>32); + b[4] = (unsigned char)(num>>24); + b[5] = (unsigned char)(num>>16); + b[6] = (unsigned char)(num>>8); + b[7] = (unsigned char)(num); +// if(dataEndianType==LITTLE_ENDIAN_DATA) +// symTransform_8bytes(*b); +} + +inline void longToBytes_bigEndian_memset(unsigned char *b, unsigned long num) +{ + checkCudaErrors(cudaMemset(&b[0], (unsigned char)(num>>56), sizeof(char))); + checkCudaErrors(cudaMemset(&b[1], (unsigned char)(num>>48), sizeof(char))); + checkCudaErrors(cudaMemset(&b[2], (unsigned char)(num>>40), sizeof(char))); + checkCudaErrors(cudaMemset(&b[3], (unsigned char)(num>>32), sizeof(char))); + checkCudaErrors(cudaMemset(&b[4], (unsigned char)(num>>24), sizeof(char))); + checkCudaErrors(cudaMemset(&b[5], (unsigned char)(num>>16), sizeof(char))); + checkCudaErrors(cudaMemset(&b[6], (unsigned char)(num>>8), sizeof(char))); + checkCudaErrors(cudaMemset(&b[7], (unsigned char)(num), sizeof(char))); +// if(dataEndianType==LITTLE_ENDIAN_DATA) +// symTransform_8bytes(*b); +} + +__device__ inline void shortToBytes_d(unsigned char* b, short value) +{ + lint16 buf; + buf.svalue = value; + memcpy(b, buf.byte, 2); +} + + + +__global__ void getNumNonConstantBlocks(size_t nbBlocks, short *offsets, unsigned char *meta, int blockSize, int *nonconstant, int *out_size){ + for (int tid = blockDim.x*blockIdx.x + threadIdx.x; tid < nbBlocks; tid += blockDim.x*gridDim.x){ + if (meta[tid] == 3){ + atomicAdd(nonconstant, 1); + atomicAdd(out_size,1+(blockSize/4)+offsets[tid]); + } + } +} + +__global__ void generateFlags(unsigned char *states, uint64_t *cBlk_flags, uint64_t *ncBlk_flags,uint64_t* offset_indices,short* offsets, size_t nbBlocks){ + for (int tid = blockDim.x*blockIdx.x + threadIdx.x; tid < nbBlocks; tid += blockDim.x*gridDim.x){ 
+ if (states[tid] == 0 || states[tid] == 1) + { + cBlk_flags[tid] = 1; + ncBlk_flags[tid] = 0; + offset_indices[tid] = 0; + }else if(states[tid]==3){ + ncBlk_flags[tid] = 1; + cBlk_flags[tid] = 0; + offset_indices[tid] = (uint64_t) offsets[tid]; + }else{ + cBlk_flags[tid] = 0; + ncBlk_flags[tid] = 0; + offset_indices[tid] = 0; + } + + } +} + +__global__ void nccopy_kernel2(unsigned char * c, unsigned char* o, unsigned char *nc, unsigned char* midBytes, unsigned char* meta, + size_t nbBlocks, int blockSize, short *offsets, size_t mSize, uint64_t *cBlk_indices, uint64_t *ncBlk_indices, uint64_t* offset_indices){ + // printf("blockdim %d blockidx %d threadidx %d griddim %d\n", blockDim.x, blockIdx.x, threadIdx.x, gridDim.x); + int i; + int num_threads = (blockDim.x*gridDim.x); + int tid = blockDim.x*blockIdx.x + threadIdx.x; + int blocks_per_thread = nbBlocks/num_threads; + int start_idx = tid*blocks_per_thread; + int end_idx = start_idx+blocks_per_thread; + + if (tid == num_threads-1) + { + end_idx = nbBlocks; + } + + unsigned char* tmp_o = o+(sizeof(short)*ncBlk_indices[start_idx]); + unsigned char* tmp_nc= nc+(mSize*ncBlk_indices[i] + offset_indices[i]*ncBlk_indices[i]); + for (i=start_idx; i>>(meta, cBlk_indices, ncBlk_indices, offset_indices, offsets, nbBlocks); + cudaDeviceSynchronize(); + + thrust::exclusive_scan(thrust::device, cBlk_indices, cBlk_indices + nbBlocks, cBlk_indices, 0); + thrust::exclusive_scan(thrust::device, ncBlk_indices, ncBlk_indices + nbBlocks, ncBlk_indices, 0); + thrust::exclusive_scan(thrust::device, offset_indices, offset_indices + nbBlocks, offset_indices, 0); + + nccopy_kernel<<>>(c, o, nc, midBytes, meta, nbBlocks, blockSize, offsets, mSize, cBlk_indices,ncBlk_indices,offset_indices,final_nc); + // nccopy_kernel2<<<1,1>>>(c, o, nc, midBytes, meta, nbBlocks, blockSize, offsets, mSize, cBlk_indices,ncBlk_indices,offset_indices); + + cudaDeviceSynchronize(); + + //printf("nc: %p\n", nc); + // printf("%s\n", cudaGetErrorString(cudaGetLastError())); + // set_nc<<<1,1>>>(nc, offsets, offset_indices, ncBlk_indices, mSize, nbBlocks); + // cudaDeviceSynchronize(); + // printf("ncblockcpy: %f ms\n", timer2.GetCounter()); + checkCudaErrors(cudaFree(cBlk_indices)); + checkCudaErrors(cudaFree(ncBlk_indices)); + checkCudaErrors(cudaFree(offset_indices)); +} + +void ncblkCopy_h(unsigned char * c, unsigned char* o, unsigned char *nc, unsigned char* midBytes, unsigned char* meta, + size_t nbBlocks, int blockSize, short *offsets, size_t mSize){ + unsigned char *tmp_states; + unsigned char *ncold = nc; + uint64_t col_off = 0; + short *tmp_offsets; + tmp_offsets = (short*)malloc(sizeof(short)*nbBlocks); + tmp_states = (unsigned char *)malloc(sizeof(char)*nbBlocks); + checkCudaErrors(cudaMemcpy(tmp_states, meta, sizeof(char)*nbBlocks, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(tmp_offsets,offsets,sizeof(short)*nbBlocks,cudaMemcpyDeviceToHost)); + cudaStream_t stream[3]; + cudaStreamCreate(&stream[0]); + cudaStreamCreate(&stream[1]); + cudaStreamCreate(&stream[2]); + + //printf("here\n"); + //checkCudaErrors(cudaMemcpy((void**)&d_offsets, nbBlocks*sizeof(short))); + for (int i = 0; i < nbBlocks; i++) + { + if(tmp_states[i]==3){ + // shortToBytes_d(o, offsets[i]); + // buf = (unsigned char*) + +// printf("here2\n"); + cudaMemcpyAsync(o, offsets+i, 2, cudaMemcpyDeviceToDevice, stream[0]); + o += sizeof(short); + + // printf("here2.1\n"); + // printf("offsets %ld\n", col_off); + cudaMemcpyAsync(nc, meta+(nbBlocks+i*mSize), mSize, cudaMemcpyDeviceToDevice, stream[1]); + 
// memcpy(nc, meta+(nbBlocks+i*mSize), mSize); + + nc += mSize; + + // printf("here2.2\n"); + //checkCudaErrors(cudaMemcpy(buf, offsets+i, sizeof(short), cudaMemcpyDeviceToHost)); + + // //printf("here2.3 %d\n", buf); + cudaMemcpyAsync(nc, midBytes+(i*blockSize*sizeof(float)), (int)tmp_offsets[i], cudaMemcpyDeviceToDevice, stream[2]); + // memcpy(nc, midBytes+(i*blockSize*sizeof(float)), offsets[i]); + nc += tmp_offsets[i]; + col_off+=tmp_offsets[i]; + +/// printf("here2.4\n"); + } + } + cudaStreamDestroy(stream[0]); + cudaStreamDestroy(stream[1]); + cudaStreamDestroy(stream[2]); + + free(tmp_states); + free(tmp_offsets); +} + +__global__ void ncblkCopy(unsigned char * c, unsigned char* o, unsigned char *nc, unsigned char* midBytes, unsigned char* meta, + size_t nbBlocks, int blockSize, short *offsets, size_t mSize) +{ + for (int i=blockDim.x*blockIdx.x + threadIdx.x; i>>(nbBlocks, offsets, meta, blockSize, nonconstant_d, out_size_d); + cudaDeviceSynchronize(); + + checkCudaErrors(cudaMemcpy(&nonconstant_h, nonconstant_d, sizeof(int), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(&tmp_outsize, out_size_d, sizeof(int), cudaMemcpyDeviceToHost)); + + nbConstantBlocks = nbBlocks - nonconstant_h; + out_size_h+=tmp_outsize; + + out_size_h += (nbBlocks-nbConstantBlocks)*sizeof(short)+(nbEle%blockSize)*sizeof(float); + + //outBytes = (unsigned char*)malloc(out_size); + unsigned char* r = outBytes; + unsigned char* r_old = outBytes; + checkCudaErrors(cudaMemset(r, SZx_VER_MAJOR, sizeof(char))); + checkCudaErrors(cudaMemset(r+1, SZx_VER_MINOR, sizeof(char))); + checkCudaErrors(cudaMemset(r+2, 1, sizeof(char))); + checkCudaErrors(cudaMemset(r+3, 0, sizeof(char))); + checkCudaErrors(cudaMemset(r+4, blockSize, sizeof(char))); + + r=r+5; //1 byte + //sizeToBytes(r, nbConstantBlocks); + longToBytes_bigEndian_memset(r, nbConstantBlocks); + r += sizeof(size_t); + //sizeToBytes(r, (size_t) num_sig); + longToBytes_bigEndian_memset(r, (unsigned long)num_sig); + r += sizeof(size_t); + size_t out_length; + + if(nbBlocks%4==0) + out_length = nbBlocks/4; + else + out_length = nbBlocks/4+1; + + convert_state_to_out_kernel<<>>(meta, nbBlocks, r, out_length); + r+=out_length; + convert_block2_to_out_kernel<<>>(r, nbBlocks,num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); + r += nbBlocks*4 + num_sig*sizeof(float) + num_sig*sizeof(uint8_t) + nbBlocks*sizeof(uint8_t); + + checkCudaErrors(cudaMemcpy(r, oriData+nbBlocks*blockSize, (nbEle%blockSize)*sizeof(float), cudaMemcpyDeviceToDevice)); + // memcpy(r, oriData+nbBlocks*blockSize, (nbEle%blockSize)*sizeof(float)); + r += (nbEle%blockSize)*sizeof(float); + unsigned char* c = r; + unsigned char* o = c+nbConstantBlocks*sizeof(float); + unsigned char* nc = o+(nbBlocks-nbConstantBlocks)*sizeof(short); + // ncblkCopy<<<1,1>>>(c, o, nc, midBytes, meta,nbBlocks, blockSize, offsets, mSize); + + // ncblkCopy_h(c, o, nc, midBytes, meta,nbBlocks, blockSize, offsets, mSize); + ncblkCopy_fast(c, o, nc, midBytes, meta,nbBlocks, blockSize, offsets, mSize, nc_diff); + // cudaDeviceSynchronize(); + size_t h_nc_diff; + cudaMemcpy(&h_nc_diff,nc_diff, sizeof(size_t),cudaMemcpyDeviceToHost); + return (size_t) (nc+h_nc_diff-r_old); + // checkCudaErrors(cudaMemcpy(outSize, (size_t)(nc-r_old), sizeof(size_t))); + // *outSize = (size_t) (nc-r_old); + // return outBytes; +} + +__global__ void device_post_proc(size_t *outSize, float *oriData, unsigned char *meta, + short *offsets, unsigned char *midBytes, unsigned char *outBytes, + size_t nbEle, int blockSize, uint64_t num_sig, uint32_t 
*blk_idx, + float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig) +{ + int out_size = 0; + + size_t nbConstantBlocks = 0; + size_t nbBlocks = nbEle/blockSize; + size_t ncBytes = blockSize/4; + size_t mSize = sizeof(float)+1+ncBytes; //Number of bytes for each data block's metadata. + out_size += 5+sizeof(size_t)+sizeof(float)*nbBlocks; + if (nbBlocks%8==0) + out_size += nbBlocks/8; + else + out_size += nbBlocks/8+1; + int s0 = 0; + int s1 = 0; + int s2 = 0; + int s3 = 0; + for (int i=0; i()); +// // dmin = thrust::reduce(oriData, oriData+nbEle, 1, thrust::minimum()); +// cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, oriData, dmax, nbEle); +// cudaMalloc(&d_temp_storage, temp_storage_bytes); +// cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, oriData, dmax, nbEle); + +// cudaFree(d_temp_storage); +// cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, oriData, dmin, nbEle); +// cudaMalloc(&d_temp_storage, temp_storage_bytes); +// cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, oriData, dmin, nbEle); + +// cudaFree(d_temp_storage); +// // thrust::pair result = thrust::minmax_element(thrust::device, oriData,oriData+nbEle); +// //printf("here\n"); +// cudaMemcpy(hmin, dmin, sizeof(float), cudaMemcpyDeviceToHost); +// cudaMemcpy(hmax, dmax,sizeof(float), cudaMemcpyDeviceToHost); +// absErrBound = absErrBound*(hmax-hmin); +// threshold = threshold*(hmax-hmin); + // // printf("%f\n",absErrBound); + // cudaFree(dmin); + // cudaFree(dmax); + float sparsity_level = SPARSITY_LEVEL; + + // Set the input data as the function parameter, this should be a device pointer + + float* d_oriData = oriData; + // cudaMalloc((void**)&d_oriData, sizeof(float)*nbEle); + // cudaMemcpy(d_oriData, oriData, sizeof(float)*nbEle, cudaMemcpyHostToDevice); + + size_t nbBlocks = nbEle/blockSize; + size_t remainCount = nbEle%blockSize; + size_t actualNBBlocks = remainCount==0 ? nbBlocks : nbBlocks+1; + + size_t ncBytes = blockSize/4; + //ncBytes = (blockSize+1)%4==0 ? ncBytes : ncBytes+1; //Bytes to store one non-constant block data. + size_t mSize = sizeof(float)+1+ncBytes; //Number of bytes for each data block's metadata. 
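+    /*
+     * The commented-out reduction above turns the relative-to-range error bound
+     * and threshold into absolute values before compression.  A minimal sketch of
+     * that step using Thrust (an illustrative sketch assuming <thrust/extrema.h>;
+     * the commented-out code uses cub::DeviceReduce instead):
+     *
+     *     thrust::device_ptr<float> p(d_oriData);
+     *     auto mm = thrust::minmax_element(p, p + nbEle);  // min/max of the input
+     *     float range = *mm.second - *mm.first;            // two scalar copies to host
+     *     absErrBound = absErrBound * range;               // r2r -> absolute bound
+     *     threshold   = threshold   * range;               // r2r -> absolute threshold
+     */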
+ size_t msz = (1+mSize) * nbBlocks * sizeof(unsigned char); + size_t mbsz = sizeof(float) * nbEle * sizeof(unsigned char); + + // These are host pointers and do not need to be allocated + + // unsigned char *meta = (unsigned char*)malloc(msz); + // short *offsets = (short*)malloc(nbBlocks*sizeof(short)); + // unsigned char *midBytes = (unsigned char*)malloc(mbsz); + + unsigned char* d_meta; + unsigned char* d_midBytes; + short* d_offsets; + + uint32_t *blk_idx, *d_blk_idx; + uint8_t *blk_sig, *d_blk_sig; + uint8_t *blk_subidx, *d_blk_subidx; + float *blk_vals, *d_blk_vals; + uint64_t *num_sig, *d_num_sig; + + checkCudaErrors(cudaMalloc((void **)&d_num_sig, sizeof(uint64_t))); + num_sig = (uint64_t *)malloc(sizeof(uint64_t)); + checkCudaErrors(cudaMalloc((void **)&d_blk_idx, nbBlocks*sizeof(uint32_t))); + // blk_idx = malloc() + checkCudaErrors(cudaMalloc((void **)&d_blk_subidx, nbEle*sizeof(uint8_t))); + + checkCudaErrors(cudaMalloc((void **)&d_blk_vals, nbEle*sizeof(float))); + + checkCudaErrors(cudaMalloc((void **)&d_blk_sig, nbBlocks*sizeof(uint8_t))); + + checkCudaErrors(cudaMalloc((void**)&d_meta, msz)); + //checkCudaErrors(cudaMemcpy(d_meta, meta, msz, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemset(d_meta, 0, msz)); + checkCudaErrors(cudaMalloc((void**)&d_offsets, nbBlocks*sizeof(short))); + checkCudaErrors(cudaMemset(d_offsets, 0, nbBlocks*sizeof(short))); + checkCudaErrors(cudaMalloc((void**)&d_midBytes, mbsz)); + checkCudaErrors(cudaMemset(d_midBytes, 0, mbsz)); + + + // apply_threshold<<<80,256>>>(d_oriData, threshold, nbEle); + // cudaDeviceSynchronize(); + dim3 dimBlock(32, blockSize/32); + dim3 dimGrid(65536, 1); + const int sMemsize = blockSize * sizeof(float) + dimBlock.y * sizeof(int); + //printf("Malloc end timestamp: %f ms\n", timer_GPU.GetCounter()); + compress_float<<>>(d_oriData, d_meta, d_offsets, d_midBytes, absErrBound, blockSize, nbBlocks, mSize, sparsity_level, d_blk_idx, d_blk_subidx,d_blk_vals, threshold, d_blk_sig); + cudaError_t err = cudaGetLastError(); // Get error code + // printf("CUDA Error: %s\n", cudaGetErrorString(err)); + //printf("GPU compression timestamp: %f ms\n", timer_GPU.GetCounter()); + cudaDeviceSynchronize(); + get_numsig<<<1,1>>>(d_num_sig); + cudaDeviceSynchronize(); + + checkCudaErrors(cudaMemcpy(num_sig, d_num_sig, sizeof(uint64_t), cudaMemcpyDeviceToHost)); + + // These are allocations and memcpys to host pointers, do not need them + + // blk_idx = (uint32_t *)malloc(nbBlocks*sizeof(uint32_t)); + // blk_vals= (float *)malloc((*num_sig)*sizeof(float)); + // blk_subidx = (uint8_t *)malloc((*num_sig)*sizeof(uint8_t)); + // blk_sig = (uint8_t *)malloc(nbBlocks*sizeof(uint8_t)); + + // checkCudaErrors(cudaMemcpy(meta, d_meta, msz, cudaMemcpyDeviceToHost)); + // checkCudaErrors(cudaMemcpy(offsets, d_offsets, nbBlocks*sizeof(short), cudaMemcpyDeviceToHost)); + // checkCudaErrors(cudaMemcpy(midBytes, d_midBytes, mbsz, cudaMemcpyDeviceToHost)); + + + // checkCudaErrors(cudaMemcpy(blk_idx, d_blk_idx, nbBlocks*sizeof(uint32_t), cudaMemcpyDeviceToHost)); + // checkCudaErrors(cudaMemcpy(blk_vals,d_blk_vals, (*num_sig)*sizeof(float), cudaMemcpyDeviceToHost)); + // checkCudaErrors(cudaMemcpy(blk_subidx,d_blk_subidx, (*num_sig)*sizeof(uint8_t), cudaMemcpyDeviceToHost)); + // checkCudaErrors(cudaMemcpy(blk_sig,d_blk_sig, (nbBlocks)*sizeof(uint8_t), cudaMemcpyDeviceToHost)); + + + size_t maxPreservedBufferSize = sizeof(float)*nbEle; + unsigned char *d_outBytes; + // unsigned char* outBytes = (unsigned char*)malloc(maxPreservedBufferSize); + // 
memset(outBytes, 0, maxPreservedBufferSize); + checkCudaErrors(cudaMalloc(&d_outBytes, maxPreservedBufferSize)); + + size_t *d_outSize; + + checkCudaErrors(cudaMalloc(&d_outSize, sizeof(size_t))); + + // device_post_proc<<<1,1>>>(d_outSize, d_oriData, d_meta, d_offsets, d_midBytes, d_outBytes, nbEle, blockSize, *num_sig, d_blk_idx, d_blk_vals, d_blk_subidx, d_blk_sig); + *outSize = better_post_proc(d_outSize, d_oriData, d_meta, d_offsets, d_midBytes, d_outBytes, nbEle, blockSize, *num_sig, d_blk_idx, d_blk_vals, d_blk_subidx, d_blk_sig); + //cudaDeviceSynchronize(); + + //checkCudaErrors(cudaMemcpy(outSize, d_outSize, sizeof(size_t), cudaMemcpyDeviceToHost)); + + // printf("completed compression\n"); + //free(blk_idx); + //free(blk_subidx); + //free(blk_vals); + // free(meta); + // free(offsets); + // free(midBytes); + checkCudaErrors(cudaFree(d_num_sig)); + checkCudaErrors(cudaFree(d_blk_idx)); + checkCudaErrors(cudaFree(d_blk_subidx)); + checkCudaErrors(cudaFree(d_blk_vals)); + checkCudaErrors(cudaFree(d_blk_sig)); + + checkCudaErrors(cudaFree(d_meta)); + checkCudaErrors(cudaFree(d_offsets)); + checkCudaErrors(cudaFree(d_midBytes)); + + unsigned char *d_newout; + + *outSize = *outSize; + size_t os = *outSize; + + checkCudaErrors(cudaMalloc(&d_newout, os)); + //fin_copy<<<40,256>>>(d_outBytes, d_newout,os); + checkCudaErrors(cudaMemcpy(d_newout, d_outBytes, os, cudaMemcpyDeviceToDevice)); + cudaDeviceSynchronize(); + + checkCudaErrors(cudaFree(d_outBytes)); + printf("Compression end timestamp: %f ms\n", timer_GPU.GetCounter()); + + err = cudaGetLastError(); // Get error code + printf("CUDA Error: %s\n", cudaGetErrorString(err)); + return d_newout; + //return d_outBytes; +} + +__device__ inline long bytesToLong_bigEndian(unsigned char* b) { + long temp = 0; + long res = 0; + + res <<= 8; + temp = b[0] & 0xff; + res |= temp; + + res <<= 8; + temp = b[1] & 0xff; + res |= temp; + + res <<= 8; + temp = b[2] & 0xff; + res |= temp; + + res <<= 8; + temp = b[3] & 0xff; + res |= temp; + + res <<= 8; + temp = b[4] & 0xff; + res |= temp; + + res <<= 8; + temp = b[5] & 0xff; + res |= temp; + + res <<= 8; + temp = b[6] & 0xff; + res |= temp; + + res <<= 8; + temp = b[7] & 0xff; + res |= temp; + + return res; +} + +__device__ inline size_t bytesToSize(unsigned char* bytes) +{ + size_t result = bytesToLong_bigEndian(bytes);//8 + return result; +} + +__device__ inline short bytesToShort(unsigned char* bytes) +{ + lint16 buf; + memcpy(buf.byte, bytes, 2); + + return buf.svalue; +} + +__global__ void decompress_get_stats(float *newData, size_t nbEle, unsigned char* cmpBytes, + size_t *numSigValues, int *bs, + size_t *numConstantBlks, size_t *numBlks, + size_t *mSizeptr, unsigned char *newCmpBytes +){ + unsigned char* r = cmpBytes; + + size_t num_sig; + r += 4; + int blockSize = (int) r[0]; //get block size + + if(blockSize == 0)blockSize = 256; + r++; + size_t nbConstantBlocks = bytesToLong_bigEndian(r); //get number of constant blocks + r += sizeof(size_t); + num_sig = bytesToSize(r); + + r += sizeof(size_t); + size_t nbBlocks = nbEle/blockSize; + size_t ncBlocks = 0; + size_t num_state2_blks = 0; + // size_t ncBlocks = nbBlocks - nbConstantBlocks; //get number of constant blocks + size_t stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; + size_t ncLeading = blockSize/4; + size_t mSize = sizeof(float)+1+ncLeading; //Number of bytes for each data block's metadata. 
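+    /*
+     * stateNBBytes above reflects the 2-bit packing of per-block states, four
+     * states per byte (see convert_out_to_state()).  For reference, block i's
+     * state is recovered as
+     *
+     *     state = (packed[i/4] >> (2*(i%4))) & 0x03;
+     *
+     * and elsewhere in this file states 0/1 are handled as constant blocks,
+     * 2 as the significant-value path, and 3 as non-constant blocks.
+     */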
+ + *mSizeptr = mSize; + + *numConstantBlks = nbConstantBlocks; + *numBlks = nbBlocks; + *numSigValues = num_sig; + *bs = blockSize; + newCmpBytes = r; + +} + + void setup_data_stateArray_better(float *newData, size_t nbEle, unsigned char* r, + size_t num_sig, int blockSize, + size_t nbConstantBlocks, size_t nbBlocks, size_t *ncBlks, + unsigned char *stateArray, unsigned char *newR +){ + + //printf("ma\n"); + // blockSize = 256; + r += 4; + r++; + r += sizeof(size_t); + r += sizeof(size_t); + int ncBlocks, *ncBlocks_d; + size_t stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; + int num_state2_blks, *num_state2_d; + checkCudaErrors(cudaMalloc((void **)&num_state2_d, sizeof(int))); + checkCudaErrors(cudaMalloc((void **)&ncBlocks_d, sizeof(int))); + checkCudaErrors(cudaMemset(num_state2_d, 0, sizeof(int))); + checkCudaErrors(cudaMemset(ncBlocks_d, 0, sizeof(int))); + + //printf("ma2\n"); +// printf("Converting state array\n"); + // printf("cmp %d\n", (int)r[0]); + // printf("state %d\n", (int)stateArray[0]); + // convert_out_to_state(nbBlocks, r, stateArray); + convert_out_to_state_kernel<<>>(nbBlocks,r,stateArray,stateNBBytes, + num_state2_d, ncBlocks_d); + // printf("state %d\n", (int)stateArray[0]); + // convertByteArray2IntArray_fast_1b_args(nbBlocks, r, stateNBBytes, stateArray); //get the stateArray + cudaDeviceSynchronize(); + + //printf("ma3\n"); + r += stateNBBytes; + newR = r; + cudaMemcpy(&ncBlocks, ncBlocks_d, sizeof(int), cudaMemcpyDeviceToHost); + + //printf("ma4\n"); + *ncBlks = ncBlocks; + + //printf("ma4\n"); + } + +__global__ void setup_data_stateArray(float *newData, size_t nbEle, unsigned char* r, + size_t num_sig, int blockSize, + size_t nbConstantBlocks, size_t nbBlocks, size_t *ncBlks, + unsigned char *stateArray, unsigned char *newR +){ + // blockSize = 256; + r += 4; + r++; + r += sizeof(size_t); + r += sizeof(size_t); + size_t ncBlocks = 0; + size_t stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; + size_t num_state2_blks = 0; +// printf("Converting state array\n"); + // printf("cmp %d\n", (int)r[0]); + // printf("state %d\n", (int)stateArray[0]); + convert_out_to_state(nbBlocks, r, stateArray); + // convert_out_to_state_kernel<<<40,256>>>(nbBlocks,r,stateArray,stateNBBytes); + // printf("state %d\n", (int)stateArray[0]); + // convertByteArray2IntArray_fast_1b_args(nbBlocks, r, stateNBBytes, stateArray); //get the stateArray + for (size_t i = 0; i < nbBlocks; i++) + { + if (stateArray[i] == 2) + { + num_state2_blks++; + }else if(stateArray[i] == 3){ + ncBlocks++; + } + } + + r += stateNBBytes; + newR = r; + *ncBlks = ncBlocks; +} + +__global__ void decomp_startup_kernel(unsigned char* r, size_t nbConstantBlocks, +unsigned char *data, int blockSize, size_t mSize, size_t ncBlocks, float *constantMedianArray, uint64_t* g_leng){ + unsigned char * fr = r; //fr is the starting address of constant median values. 
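+    /*
+     * This kernel, the thrust::exclusive_scan over g_leng in
+     * decompress_startup_better(), and decompress_ncblk_kernel() below together
+     * replace the old serial pointer walk over the variable-length non-constant
+     * blocks.  A sketch of the pattern, using the names defined in this file:
+     *
+     *     // pass 1: each thread records its block's packed length
+     *     g_leng[i] = bytesToShort(basefr + i*sizeof(short)) + mSize;
+     *     // exclusive prefix sum turns lengths into byte offsets
+     *     thrust::exclusive_scan(thrust::device, g_leng, g_leng + ncBlocks, g_leng, 0);
+     *     // pass 2: each thread copies its block from basep + g_leng[i]
+     *     memcpy(data + i*blockSize*sizeof(float), basep + g_leng[i], leng);
+     *
+     * so block start addresses no longer depend on scanning all preceding blocks.
+     */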
+ int i = 0, j = 0, k = 0; + // printf("%p\n", r); + unsigned char tmp_r[4]; + tmp_r[0]=fr[0]; + tmp_r[1]=fr[1]; + tmp_r[2]=fr[2]; + tmp_r[3]=fr[3]; + + +// printf("nbconstant: %f\n", ((float*)tmp_r)[0]); +// nbConstantBlocks + for(i = blockDim.x*blockIdx.x + threadIdx.x; i < nbConstantBlocks; i += blockDim.x*gridDim.x){ //get the median values for constant-value blocks + + tmp_r[0]=fr[4*i]; + tmp_r[1]=fr[4*i+1]; + tmp_r[2]=fr[4*i+2]; + tmp_r[3]=fr[4*i+3]; + float tmp = ((float*)tmp_r)[0]; + constantMedianArray[i] = tmp; + //printf("%d %f\n", i, tmp); + } + + +/** PROBLEM AREA, CAN FIX WITH PARALLELIZATION BUT WATCH *FR and *P **/ + + // if(threadIdx.x==0 && blockIdx.x==0){ + fr += nbConstantBlocks*sizeof(float); + unsigned char* p = fr + ncBlocks * sizeof(short); + unsigned char* basefr = fr; + unsigned char* basep = p; + for(i = blockDim.x*blockIdx.x + threadIdx.x;i < ncBlocks;i+=blockDim.x*gridDim.x){ + fr = basefr+(sizeof(short)*i); + int leng = (int)bytesToShort(fr)+mSize; + g_leng[i] = (uint64_t)leng; + // fr += sizeof(short); + if (leng > blockSize*sizeof(float)) + { + printf("Warning: compressed block is larger than the original block!\n"); + return; + // exit(0); + } + // memcpy(data+i*blockSize*sizeof(float), p, leng); + + // p += leng; + } + + // } +} + +__global__ void decompress_ncblk_kernel(unsigned char* r, size_t nbConstantBlocks, +unsigned char *data, int blockSize, size_t mSize, size_t ncBlocks, float *constantMedianArray, uint64_t* g_leng){ + unsigned char * fr = r; + fr += nbConstantBlocks*sizeof(float); + unsigned char* p = fr + ncBlocks * sizeof(short); + unsigned char* basefr = fr; + unsigned char* basep = p; + + for(int i = blockDim.x*blockIdx.x + threadIdx.x;i < ncBlocks;i+=blockDim.x*gridDim.x){ + fr = basefr+(sizeof(short)*i); + int leng = (int)bytesToShort(fr)+mSize; + + + // g_leng[i] = leng; + // // fr += sizeof(short); + // if (leng > blockSize*sizeof(float)) + // { + // printf("Warning: compressed block is larger than the original block!\n"); + // return; + // // exit(0); + // } + p = basep + g_leng[i]; + + memcpy(data+i*blockSize*sizeof(float), p, leng); + + // p += leng; + } +} + +void decompress_startup_better(float *newData, size_t nbEle, unsigned char* r, + uint32_t *blk_idx, uint8_t *blk_subidx, uint8_t *blk_sig, + float *blk_vals, size_t num_sig, int blockSize, + size_t nbConstantBlocks, size_t nbBlocks, size_t ncBlocks, + unsigned char *stateArray, float* constantMedianArray, unsigned char *data, + size_t mSize, unsigned char *newCmpBytes +){ + // blockSize = 256; + size_t nb_tmp = (int) nbEle/blockSize; + uint64_t* g_leng; + /** + * Structures to return: + * blk_idx, blk_subidx, blk_sig, blk_vals, numSigValues (pointer) + * bs (pointer to blockSize), numConstantBlks (pointer), numBlks (pointer) + * ncBlks (pointer), stateArray, constantMedianArray + */ + + + size_t stateNBBytes = nb_tmp%4==0 ? 
nb_tmp/4 : nb_tmp/4+1; + + r += 4; + r++; + r += sizeof(size_t); + r += sizeof(size_t); + + r += stateNBBytes; + + convert_out_to_block2_kernel<<>>(r, nbBlocks, (uint64_t)num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); + size_t to_add = nbBlocks*4 + num_sig*sizeof(float) + num_sig*sizeof(uint8_t) + nbBlocks*sizeof(uint8_t); + r+= to_add; + + size_t i = 0, j = 0, k = 0; //k is used to keep track of constant block index + + // printf("before mallocs in kernel\n"); + checkCudaErrors(cudaMemcpy((newData)+nbBlocks*blockSize, r, (nbEle%blockSize)*sizeof(float), cudaMemcpyDeviceToDevice)); + checkCudaErrors(cudaMalloc(&g_leng, sizeof(uint64_t)*ncBlocks)); + // memcpy((newData)+nbBlocks*blockSize, r, (nbEle%blockSize)*sizeof(float)); + + //printf("before mallocs in kernel %p\n", r); + r += (nbEle%blockSize)*sizeof(float); + //printf("r: %p\n", r); + //printf("%d, %d, %d\n",nbEle, 256, nbEle%256); + decomp_startup_kernel<<>>(r, nbConstantBlocks,data, blockSize, mSize, ncBlocks, constantMedianArray, g_leng); + cudaDeviceSynchronize(); + + thrust::exclusive_scan(thrust::device, g_leng, g_leng + ncBlocks, g_leng, 0); + + decompress_ncblk_kernel<<>>(r, nbConstantBlocks, data, blockSize, mSize, ncBlocks, constantMedianArray, g_leng); + cudaDeviceSynchronize(); + + // cudaError_t err = cudaGetLastError(); // Get error code + + // printf("CUDA Error: %s\n", cudaGetErrorString(err)); + cudaFree(g_leng); + + // err = cudaGetLastError(); // Get error code + // printf("CUDA Error: %s\n", cudaGetErrorString(err)); + r += nbConstantBlocks*sizeof(float); + + newCmpBytes = r; + +} + +__global__ void decompress_startup(float *newData, size_t nbEle, unsigned char* r, + uint32_t *blk_idx, uint8_t *blk_subidx, uint8_t *blk_sig, + float *blk_vals, size_t num_sig, int blockSize, + size_t nbConstantBlocks, size_t nbBlocks, size_t ncBlocks, + unsigned char *stateArray, float* constantMedianArray, unsigned char *data, + size_t mSize, unsigned char *newCmpBytes +){ + // blockSize = 256; + size_t nb_tmp = (int) nbEle/blockSize; + /** + * Structures to return: + * blk_idx, blk_subidx, blk_sig, blk_vals, numSigValues (pointer) + * bs (pointer to blockSize), numConstantBlks (pointer), numBlks (pointer) + * ncBlks (pointer), stateArray, constantMedianArray + */ + + // size_t ncBlocks = 0; + // size_t stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; + // size_t num_state2_blks = 0; + // printf("Converting state array\n"); + // convert_out_to_state(nbBlocks, r, stateArray); + // printf("state %d\n", (int)stateArray[0]); + // // convertByteArray2IntArray_fast_1b_args(nbBlocks, r, stateNBBytes, stateArray); //get the stateArray + // for (size_t i = 0; i < nbBlocks; i++) + // { + // if (stateArray[i] == 2) + // { + // num_state2_blks++; + // }else if(stateArray[i] == 3){ + // ncBlocks++; + // } + // } + // size_t stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; + + size_t stateNBBytes = nb_tmp%4==0 ? 
nb_tmp/4 : nb_tmp/4+1; + //printf("%p\n", r); + r += 4; + r++; + r += sizeof(size_t); + r += sizeof(size_t); + //printf("statenb %d %d\n", stateNBBytes, nb_tmp); + r += stateNBBytes; + // data = (unsigned char*)malloc(ncBlocks*blockSize*sizeof(float)); + // memset(data, 0, ncBlocks*blockSize*sizeof(float)); + // printf("converting block vals %d\n", data[0]); + size_t to_add = convert_out_to_block2(r, nbBlocks, (uint64_t)num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); + r+= to_add; + + size_t i = 0, j = 0, k = 0; //k is used to keep track of constant block index + + // printf("before mallocs in kernel\n"); + + memcpy((newData)+nbBlocks*blockSize, r, (nbEle%blockSize)*sizeof(float)); + + //printf("before mallocs in kernel %p\n", r); + r += (nbEle%blockSize)*sizeof(float); + //printf("r: %p\n", r); + //printf("%d, %d, %d\n",nbEle, 256, nbEle%256); + unsigned char * fr = r; //fr is the starting address of constant median values. + + // printf("%p\n", r); + unsigned char tmp_r[4]; + tmp_r[0]=r[0]; + tmp_r[1]=r[1]; + tmp_r[2]=r[2]; + tmp_r[3]=r[3]; + + +// printf("nbconstant: %f\n", ((float*)tmp_r)[0]); + for(i = 0;i < nbConstantBlocks;i++, j+=4){ //get the median values for constant-value blocks + + tmp_r[0]=r[j]; + tmp_r[1]=r[j+1]; + tmp_r[2]=r[j+2]; + tmp_r[3]=r[j+3]; + float tmp = ((float*)tmp_r)[0]; +// printf("median: %f\n", tmp); + constantMedianArray[i] = tmp; + + // printf("%d %f\n", i, tmp); + } + //printf("after constantmedian\n"); + r += nbConstantBlocks*sizeof(float); + unsigned char* p = r + ncBlocks * sizeof(short); + for(i = 0;i < ncBlocks;i++){ + int leng = (int)bytesToShort(r)+mSize; + r += sizeof(short); + if (leng > blockSize*sizeof(float)) + { + printf("Warning: compressed block is larger than the original block!\n"); + return; + // exit(0); + } +// printf("before memcpy\n"); + memcpy(data+i*blockSize*sizeof(float), p, leng); + // printf("after memcpy\n"); + p += leng; + } + + newCmpBytes = r; +// printf("before mallocs in kernel\n"); + + // printf("nb blocks: %d\n", nbBlocks); +} + +__global__ void cBlkCopy_decompress(int nb, float* constantMedianArray, float *newData, int blockSize, int i){ + int j; + float Median = constantMedianArray[nb]; + // j = threadIdx.x; j < blockSize; j += blockDim.x + for (j = threadIdx.x; j < blockSize; j += blockDim.x) + *((newData)+i*blockSize+j) = Median; +} + +__global__ void ncBlkCopy_decompress(int blockSize, float *newData, int nc, float *fdata, int i){ + int j; + for (j = threadIdx.x; j < blockSize; j += blockDim.x) + *((newData)+i*blockSize+j) = fdata[nc*blockSize+j]; +} + +void decompress_post_proc_better(unsigned char *data, float *newData, int blockSize, + size_t nbBlocks, size_t ncBlocks, unsigned char *stateArray, + float *constantMedianArray +){ + // checkCudaErrors(cudaMemcpy(data, d_data, ncBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); + // checkCudaErrors(cudaMemcpy(*newData, d_newdata, nbBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); + float* fdata = (float*)data; + int i,j; + int nb=0, nc=0; + //printf("h1\n"); + for (i=0;i>>(nb, constantMedianArray, newData, blockSize, i); + nb++; + }else if(state==3){ + ncBlkCopy_decompress<<<1,256>>>(blockSize, newData, nc, fdata, i); + nc++; + } + } + cudaDeviceSynchronize(); + //for(int k = 0; k < nbBlocks*blockSize;k++){ +// printf("%f\n", newData[k]); + // } +} + +__global__ void print_newdata(float *newData, size_t nbBlocks, int blockSize){ + for (size_t i = 0; i < nbBlocks*blockSize; i++) + { + printf("%f\n", newData[i]); + } + +} + +__global__ void 
generateNbNc(size_t nbBlocks, size_t ncBlocks, unsigned char *stateArray, uint64_t* nbs, uint64_t* ncs){ + for(int i = blockDim.x*blockIdx.x + threadIdx.x;i < nbBlocks;i+=blockDim.x*gridDim.x){ + unsigned char state = stateArray[i]; + if(state==0||state==1){ + nbs[i] = 1; + ncs[i] = 0; + }else if(state==3){ + nbs[i] = 0; + ncs[i] = 1; + }else{ + nbs[i] = 0; + ncs[i] = 0; + } + } +} + +__global__ void decompress_final_set(unsigned char *data, float *newData, int blockSize, + size_t nbBlocks, size_t ncBlocks, unsigned char *stateArray, + float *constantMedianArray, uint64_t* nb, uint64_t* nc){ + float* fdata = (float*)data; + for (int i = blockIdx.x;i < nbBlocks;i+=gridDim.x){ + if (stateArray[i]==0 || stateArray[i]==1){ + float Median = constantMedianArray[nb[i]]; + // if (Median>1) printf("data%i:%f\n",i, Median); + for (int j = threadIdx.x; j < blockSize; j += blockDim.x) + *((newData)+i*blockSize+j) = Median; + // nb++; + }else if(stateArray[i]==3){ + for (int j = threadIdx.x; j < blockSize; j += blockDim.x) + *((newData)+i*blockSize+j) = fdata[nc[i]*blockSize+j]; + // nc++; + } + __syncthreads(); + } +} + +void decompress_post_proc_fast(unsigned char *data, float *newData, int blockSize, + size_t nbBlocks, size_t ncBlocks, unsigned char *stateArray, + float *constantMedianArray +){ + // checkCudaErrors(cudaMemcpy(data, d_data, ncBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); + // checkCudaErrors(cudaMemcpy(*newData, d_newdata, nbBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); + + int i,j; + uint64_t *nb, *nc; + checkCudaErrors(cudaMalloc(&nb, sizeof(uint64_t)*nbBlocks)); + checkCudaErrors(cudaMalloc(&nc, sizeof(uint64_t)*nbBlocks)); + + generateNbNc<<>>(nbBlocks, ncBlocks, stateArray, nb,nc); + cudaDeviceSynchronize(); + thrust::exclusive_scan(thrust::device, nb, nb + nbBlocks, nb, 0); + thrust::exclusive_scan(thrust::device, nc, nc + nbBlocks, nc, 0); + + decompress_final_set<<>>(data, newData, blockSize,nbBlocks, ncBlocks, stateArray,constantMedianArray, nb, nc); + cudaDeviceSynchronize(); + cudaFree(nb); + cudaFree(nc); +} + +__global__ void decompress_post_proc(unsigned char *data, float *newData, int blockSize, + size_t nbBlocks, size_t ncBlocks, unsigned char *stateArray, + float *constantMedianArray +){ + // checkCudaErrors(cudaMemcpy(data, d_data, ncBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); + // checkCudaErrors(cudaMemcpy(*newData, d_newdata, nbBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); + float* fdata = (float*)data; + int i,j; + int nb=0, nc=0; + // if (blockIdx.x == 0) + // { + // for (i=0;i1) printf("data%i:%f\n",i, Median); + // for (j = threadIdx.x; j < blockSize; j += blockDim.x) + // *((newData)+i*blockSize+j) = Median; + // nb++; + // } + // } + // }else{ + // for (i=0;i1) printf("data%i:%f\n",i, Median); + for (j = threadIdx.x; j < blockSize; j += blockDim.x) + *((newData)+i*blockSize+j) = Median; + nb++; + }else if(stateArray[i]==3){ + for (j = threadIdx.x; j < blockSize; j += blockDim.x) + *((newData)+i*blockSize+j) = fdata[nc*blockSize+j]; + nc++; + } + } + + //for(int k = 0; k < nbBlocks*blockSize;k++){ +// printf("%f\n", newData[k]); + // } +} + +float* device_ptr_cuSZx_decompress_float(size_t nbEle, unsigned char* cmpBytes) +{ + /** + * Assume the following are device pointers + * + * unsigned char* cmpBytes + * float** newData + * + */ + + uint32_t *blk_idx; + uint8_t *blk_subidx; + uint8_t *blk_sig; + float *blk_vals, *constantMedianArray; + size_t *num_sig, *mSize, mSize_h, num_sig_h; + int *blockSize, bs; + 
size_t *nbConstantBlocks, *nbBlocks, *ncBlocks, nbBlocks_h, ncBlocks_h, nbConstantBlocks_h; + unsigned char *stateArray, *data; + float *newData; + timer_GPU.StartCounter(); + unsigned char *oldCmpBytes = cmpBytes; + //*newData = (float*)malloc(sizeof(float)*nbEle); +// printf("cmpbytes check %d\n", (int)cmpBytes[0]); +// printf("new check %f\n", *newData[0]); + // printf("malloc\n"); + checkCudaErrors(cudaMalloc((void**)&num_sig, sizeof(size_t))); + checkCudaErrors(cudaMalloc((void**)&blockSize, sizeof(int))); + checkCudaErrors(cudaMalloc((void**)&nbConstantBlocks, sizeof(size_t))); + checkCudaErrors(cudaMalloc((void**)&nbBlocks, sizeof(size_t))); + checkCudaErrors(cudaMalloc((void**)&ncBlocks, sizeof(size_t))); + checkCudaErrors(cudaMalloc((void**)&mSize, sizeof(size_t))); + checkCudaErrors(cudaMalloc((void**)&newData, sizeof(float)*nbEle)); + + decompress_get_stats<<<1,1>>>(newData, nbEle, cmpBytes, + num_sig, blockSize, + nbConstantBlocks, nbBlocks, + mSize, cmpBytes + ); + cudaDeviceSynchronize(); + + cudaError_t err = cudaGetLastError(); // Get error code + //printf("CUDA Error: %s\n", cudaGetErrorString(err)); + checkCudaErrors(cudaMemcpy(&nbBlocks_h, nbBlocks, sizeof(size_t), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(&nbConstantBlocks_h, nbConstantBlocks, sizeof(size_t), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(&bs, blockSize, sizeof(int), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(&mSize_h, mSize, sizeof(size_t), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(&num_sig_h, num_sig, sizeof(size_t), cudaMemcpyDeviceToHost)); + + + checkCudaErrors(cudaMalloc((void**)&stateArray, nbBlocks_h)); + checkCudaErrors(cudaMalloc((void**)&constantMedianArray, nbConstantBlocks_h*sizeof(float))); + + checkCudaErrors(cudaMalloc((void**)&blk_idx, nbBlocks_h*sizeof(uint32_t))); + checkCudaErrors(cudaMalloc((void**)&blk_vals, num_sig_h*sizeof(float))); + checkCudaErrors(cudaMalloc((void**)&blk_subidx, num_sig_h*sizeof(uint8_t))); + checkCudaErrors(cudaMalloc((void**)&blk_sig, nbBlocks_h*sizeof(uint8_t))); + + unsigned char* tmp_r = cmpBytes; + unsigned char* newR; + setup_data_stateArray_better(newData, nbEle, tmp_r, + num_sig_h, bs, + nbConstantBlocks_h, nbBlocks_h, &ncBlocks_h, + stateArray, newR); + + + + // setup_data_stateArray<<<1,1>>>(newData, nbEle, cmpBytes, + // num_sig_h, bs, + // nbConstantBlocks_h, nbBlocks_h, ncBlocks, + // stateArray, cmpBytes + // ); + // cudaDeviceSynchronize(); + + // printf("%s\n", cudaGetErrorString(cudaGetLastError())); + // checkCudaErrors(cudaMemcpy(&ncBlocks_h, ncBlocks, sizeof(size_t), cudaMemcpyDeviceToHost)); + + checkCudaErrors(cudaMalloc((void**)&data, ncBlocks_h*bs*sizeof(float))); + + // err = cudaGetLastError(); // Get error code + // printf("CUDA start Error: %s\n", cudaGetErrorString(err)); + // cmpBytes = newCmpBytes; + // data = (unsigned char*)malloc(ncBlocks*blockSize*sizeof(float)); + // memset(data, 0, ncBlocks*blockSize*sizeof(float)); + // stateArray = (unsigned char*)malloc(nbBlocks); + + // // unsigned char* d_stateArray; + // // cudaMalloc(&d_stateArray, nbBlocks); + // constantMedianArray = (float*)malloc(nbConstantBlocks*sizeof(float)); + + // blk_idx = (uint32_t *)malloc(nbBlocks*sizeof(uint32_t)); + // blk_vals= (float *)malloc((num_sig)*sizeof(float)); + // blk_subidx = (uint8_t *)malloc((num_sig)*sizeof(uint8_t)); + // blk_sig = (uint8_t *)malloc(nbBlocks*sizeof(uint8_t)); + + //printf("%s\n", cudaGetErrorString(cudaGetLastError())); + //test_nbBlks = (size_t *)malloc(sizeof(size_t)); + 
// printf("malloc\n"); + + + tmp_r = cmpBytes; + decompress_startup_better(newData, nbEle, tmp_r, + blk_idx, blk_subidx, blk_sig, + blk_vals, num_sig_h, bs, + nbConstantBlocks_h, nbBlocks_h, ncBlocks_h, + stateArray, constantMedianArray, data, + mSize_h, newR); + + + // err = cudaGetLastError(); // Get error code + // printf("CUDA start Error: %s\n", cudaGetErrorString(err)); + //decompress_startup<<<1,1>>>(newData, nbEle, cmpBytes, + // blk_idx, blk_subidx, blk_sig, + // blk_vals, num_sig_h, bs, + // nbConstantBlocks_h, nbBlocks_h, ncBlocks_h, + // stateArray, constantMedianArray, data, mSize_h, cmpBytes); + //cudaDeviceSynchronize(); + // cmpBytes = newCmpBytes; + + //printf("%s\n", cudaGetErrorString(cudaGetLastError())); + + // unsigned char* d_data; + float *d_newdata; + // checkCudaErrors(cudaMalloc((void**)&d_data, ncBlocks*blockSize*sizeof(float))); + // checkCudaErrors(cudaMemcpy(d_data, data, ncBlocks*blockSize*sizeof(float), cudaMemcpyHostToDevice)); + // printf("nblocks: %d bs: %d ncblock %d\n", nbBlocks_h, bs, ncBlocks_h); + checkCudaErrors(cudaMalloc(&d_newdata, nbBlocks_h*bs*sizeof(float))); + + // err = cudaGetLastError(); // Get error code + // printf("CUDA dec main Error: %s\n", cudaGetErrorString(err)); + + dim3 dimBlock(32, bs/32); + dim3 dimGrid(65536, 1); + const int sMemsize = bs * sizeof(float) + dimBlock.y * sizeof(int); + decompress_state2<<>>(d_newdata, stateArray,blk_idx, blk_vals, blk_subidx, bs, blk_sig); + cudaDeviceSynchronize(); + + // err = cudaGetLastError(); // Get error code + // printf("CUDA dec main Error: %s\n", cudaGetErrorString(err)); + decompress_float<<>>(data, bs, ncBlocks_h, mSize_h); + //printf("GPU decompression timing: %f ms\n", timer_GPU.GetCounter()); + cudaDeviceSynchronize(); + + // err = cudaGetLastError(); // Get error code + // printf("CUDA dec main Error: %s\n", cudaGetErrorString(err)); + + // checkCudaErrors(cudaMemcpy(data, d_data, ncBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(newData, d_newdata, nbBlocks_h*bs*sizeof(float), cudaMemcpyDeviceToDevice)); + cudaFree(d_newdata); + + // decompress_post_proc<<<1,1>>>(data, newData, bs, + // nbBlocks_h, ncBlocks_h, stateArray, + // constantMedianArray); + // cudaDeviceSynchronize(); + decompress_post_proc_fast(data, newData, bs, + nbBlocks_h, ncBlocks_h, stateArray, + constantMedianArray); + err = cudaGetLastError(); // Get error code + printf("CUDA Error: %s\n", cudaGetErrorString(err)); + printf("GPU decompression timing: %f ms\n", timer_GPU.GetCounter()); + // print_newdata<<<1,1>>>(newData, nbBlocks_h, bs); + cudaFree(stateArray); + cudaFree(constantMedianArray); + cudaFree(data); + cudaFree(blk_idx); + cudaFree(blk_subidx); + cudaFree(blk_vals); + cudaFree(blk_sig); + return newData; + +} + From c66753960c0a8ce3d674999645ce65a3948834a0 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Sat, 16 Dec 2023 00:14:16 -0600 Subject: [PATCH 115/126] minor torch compressor refactor --- qtensor/compression/Compressor.py | 59 +++------------------ qtensor/contraction_backends/compression.py | 33 +----------- 2 files changed, 8 insertions(+), 84 deletions(-) diff --git a/qtensor/compression/Compressor.py b/qtensor/compression/Compressor.py index ea342c25..34a5de53 100644 --- a/qtensor/compression/Compressor.py +++ b/qtensor/compression/Compressor.py @@ -150,77 +150,30 @@ def free_decompressed(self): self.decompressed_own = [] def free_compressed(self, ptr): - import ctypes, cupy - cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = ptr - 
p_decompressed_ptr = ctypes.addressof(cmp_bytes[0]) - # cast to int64 pointer - # (effectively converting pointer to pointer to addr to pointer to int64) - p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) - decompressed_int = p_decompressed_int.contents - cupy.cuda.runtime.free(decompressed_int.value) + cmp_bytes, num_elements_eff, shape, dtype, _ = ptr + del cmp_bytes def compress(self, data): isCupy, num_elements_eff = _get_data_info(data) dtype = data.dtype - cmp_bytes, outSize_ptr = self.cuszx_compress(isCuPy, data, num_elements_eff, self.r2r_error, self.r2r_threshold) - return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr) + cmp_bytes, outSize_ptr = quant_device_compress(data, num_elements_eff, CUSZX_BLOCKSIZE, self.r2r_threshold) + return (cmp_bytes, num_elements_eff, data.shape, dtype, outSize_ptr) # return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr.contents.value) def compress_size(self, ptr): - return ptr[5] + return ptr[4] def decompress(self, obj): import cupy - import ctypes cmp_bytes, num_elements_eff, isCuPy, shape, dtype, cmpsize = obj - decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, cmpsize, num_elements_eff, self, dtype) + decompressed_ptr = quant_device_decompress(num_elements_eff, cmp_bytes, self, dtype) arr_cp = decompressed_ptr[0] arr = cupy.reshape(arr_cp, shape) self.decompressed_own.append(arr) return arr - ### Compression API with cuSZx ### - # Parameters: - # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array - # - data = Numpy or Cupy ndarray, assumed to be 1-D, np.float32 type - # - num_elements = Number of floating point elements in data - # - r2r_error = relative-to-value-range error bound for lossy compression - # - r2r_threshold = relative-to-value-range threshold to floor values to zero - # Returns: - # - cmp_bytes = Unsigned char pointer to compressed bytes - # - outSize_ptr = Pointer to size_t representing length in bytes of cmp_bytes - def cuszx_compress(self, isCuPy, data, num_elements, r2r_error, r2r_threshold): - - if not isCuPy: - cmp_bytes, outSize_ptr = cuszx_host_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) - else: - #cmp_bytes, outSize_ptr = cuszp_device_compress(data, r2r_error, num_elements, r2r_threshold) - - cmp_bytes, outSize_ptr = quant_device_compress(data, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) - del data - torch.cuda.empty_cache() - return cmp_bytes, outSize_ptr - - ### Decompression API with cuSZx ### - # Parameters: - # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array - # - cmp_bytes = Unsigned char pointer to compressed bytes - # - num_elements = Number of floating point elements in original data - # Returns: - # - decompressed_data = Float32 pointer to decompressed data - # - # Notes: Use ctypes to cast decompressed data to Numpy or CuPy type - - def cuszx_decompress(self, isCuPy, cmp_bytes, cmpsize, num_elements, owner, dtype): - if not isCuPy: - decompressed_data = cuszx_host_decompress(num_elements, cmp_bytes) - else: - #decompressed_data = cuszp_device_decompress(num_elements, cmp_bytes, cmpsize, owner,dtype) -# oriData, absErrBound, nbEle, blockSize,threshold - decompressed_data = quant_device_decompress(num_elements, cmp_bytes, owner,dtype) - return decompressed_data class NEWSZCompressor(Compressor): def __init__(self, r2r_error=1e-3, r2r_threshold=1e-3): diff --git a/qtensor/contraction_backends/compression.py 
b/qtensor/contraction_backends/compression.py index 994c1699..172bc3d6 100644 --- a/qtensor/contraction_backends/compression.py +++ b/qtensor/contraction_backends/compression.py @@ -67,24 +67,7 @@ def process_bucket(self, bucket, no_sum=False): import cupy for t in [accum, t]: if isinstance(t, CompressedTensor): - for c in t.data: - if len(c)==6: - cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = c - del cmp_bytes - # import ctypes - # p_decompressed_ptr = ctypes.addressof(cmp_bytes[0]) - # # cast to int64 pointer - # # (effectively converting pointer to pointer to addr to pointer to int64) - # p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) - # decompressed_int = p_decompressed_int.contents - # print("Freeing mem", decompressed_int.value) - # cupy.cuda.runtime.free(decompressed_int.value) - t.compressor.compressor.free_decompressed() - #raise ValueError("Done") - else: - #print("PTR", t.data.data.ptr) - #cupy.cuda.runtime.free(t.data.data.ptr) - pass + t.compressor.free_decompressed() accum = accum_new @@ -95,19 +78,7 @@ def process_bucket(self, bucket, no_sum=False): indices = (accum.indices[-1], ) res = compressed_sum(accum, indices, self.compressor, self.max_tw, **ctr_kw) if isinstance(accum, CompressedTensor): - import cupy - for c in accum.data: - cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = c - del cmp_bytes - #import ctypes - #p_decompressed_ptr = ctypes.addressof(cmp_bytes[0]) - # cast to int64 pointer - # (effectively converting pointer to pointer to addr to pointer to int64) - #p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) - #decompressed_int = p_decompressed_int.contents - #print("Freeing mem", decompressed_int.value) - #cupy.cuda.runtime.free(decompressed_int.value) - accum.compressor.compressor.free_decompressed() + accum.compressor.free_decompressed() return res def get_sliced_buckets(self, buckets, data_dict, slice_dict): From b52d487051727918de085d98beed7a77abdf49d3 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Sat, 16 Dec 2023 00:18:35 -0600 Subject: [PATCH 116/126] torch compressor fix --- qtensor/compression/Compressor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qtensor/compression/Compressor.py b/qtensor/compression/Compressor.py index 8747a7fe..18126bbf 100644 --- a/qtensor/compression/Compressor.py +++ b/qtensor/compression/Compressor.py @@ -175,7 +175,7 @@ def compress_size(self, ptr): def decompress(self, obj): import cupy - cmp_bytes, num_elements_eff, isCuPy, shape, dtype, cmpsize = obj + cmp_bytes, num_elements_eff, shape, dtype, cmpsize = obj decompressed_ptr = quant_device_decompress(num_elements_eff, cmp_bytes, self, dtype) arr_cp = decompressed_ptr[0] From a600c9952c551f4d6118d130357db8e53ec975ba Mon Sep 17 00:00:00 2001 From: Milan Kartik Shah Date: Fri, 9 Feb 2024 14:55:44 -0500 Subject: [PATCH 117/126] Added cuSZp compressor --- bench/qc_simulation/src/simulators/qtensor.py | 6 +- qtensor/compression/Compressor.py | 123 ++++--- qtensor/compression/cuszp/cuSZp | 1 + qtensor/compression/cuszp/cuSZp_interface.cpp | 137 +++++++ qtensor/compression/cuszp/cuszp_wrapper.py | 113 ++++++ qtensor/compression/cuszp/gnncuszp.py | 347 ++++++++++++++++++ qtensor/compression/cuszp/setup.py | 28 ++ 7 files changed, 697 insertions(+), 58 deletions(-) create mode 160000 qtensor/compression/cuszp/cuSZp create mode 100644 qtensor/compression/cuszp/cuSZp_interface.cpp create mode 100644 qtensor/compression/cuszp/cuszp_wrapper.py create mode 100644 
qtensor/compression/cuszp/gnncuszp.py create mode 100644 qtensor/compression/cuszp/setup.py diff --git a/bench/qc_simulation/src/simulators/qtensor.py b/bench/qc_simulation/src/simulators/qtensor.py index e206feb6..20660ac9 100644 --- a/bench/qc_simulation/src/simulators/qtensor.py +++ b/bench/qc_simulation/src/simulators/qtensor.py @@ -175,7 +175,7 @@ def simulate(in_file, out_file, """ import time from qtensor.contraction_algos import bucket_elimination - from qtensor.compression.Compressor import CUSZCompressor, CUSZXCompressor, TorchCompressor, NEWSZCompressor + from qtensor.compression.Compressor import CUSZCompressor, CUSZXCompressor, TorchCompressor, NEWSZCompressor, CUSZPCompressor from qtensor.compression.Compressor import WriteToDiskCompressor import cupy cupy.cuda.profiler.start() @@ -203,6 +203,10 @@ def simulate(in_file, out_file, elif compress == 'disk': compressor = WriteToDiskCompressor(f'/grand/QTensor/compression/data/tensors_compressed_M{M}/') compressor = qtensor.compression.ProfileCompressor(compressor) + elif compress == "cuszp": + print(f"{r2r_error=} {r2r_threshold=}") + compressor = CUSZPCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) + compressor = qtensor.compression.ProfileCompressor(compressor) else: raise ValueError(f"Unknown compression algorithm: {compress}") backend = qtensor.contraction_backends.CompressionBackend(backend, compressor, M) diff --git a/qtensor/compression/Compressor.py b/qtensor/compression/Compressor.py index 02a5b4da..9e58ffb2 100644 --- a/qtensor/compression/Compressor.py +++ b/qtensor/compression/Compressor.py @@ -14,15 +14,18 @@ sys.path.append('./torch_quant') sys.path.append(str(Path(__file__).parent/'newsz')) sys.path.append('./newsz') +sys.path.append(str(Path(__file__).parent/'cuszp')) +sys.path.append('./cuszp') import torch try: from cuszx_wrapper import cuszx_host_compress, cuszx_host_decompress, cuszx_device_compress, cuszx_device_decompress - # from cuSZp_wrapper import cuszp_device_compress, cuszp_device_decompress - from cusz_wrapper import cusz_device_compress, cusz_device_decompress +# from cuSZp_wrapper import cuszp_device_compress, cuszp_device_decompress +# from cusz_wrapper import cusz_device_compress, cusz_device_decompress from torch_quant_perchannel import quant_device_compress, quant_device_decompress - from newsz_wrapper import newsz_device_compress, newsz_device_decompress +# from newsz_wrapper import newsz_device_compress, newsz_device_decompress + from cuszp_wrapper import cuszp_device_compress, cuszp_device_decompress except: print("import failed") # Silently fail on missing build of cuszx @@ -118,6 +121,15 @@ def get_profile_stats(self): compress_ratios = np.mean([x.size_in/x.size_out for x in compress]) compress_size = sum([x.size_out for x in compress]) return compress_time, decompress_time, compress_size, compress_ratios + + def compress_size(self, ptr): + return self.compressor.compress_size(ptr) + + def free_decompressed(self): + self.compressor.free_decompressed() + + def free_compressed(self, ptr): + self.compressor.free_compressed(ptr) # -- class NumpyCompressor(Compressor): @@ -150,77 +162,71 @@ def free_decompressed(self): self.decompressed_own = [] def free_compressed(self, ptr): - import ctypes, cupy - cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = ptr - p_decompressed_ptr = ctypes.addressof(cmp_bytes[0]) - # cast to int64 pointer - # (effectively converting pointer to pointer to addr to pointer to int64) - p_decompressed_int= ctypes.cast(p_decompressed_ptr, 
ctypes.POINTER(ctypes.c_uint64)) - decompressed_int = p_decompressed_int.contents - cupy.cuda.runtime.free(decompressed_int.value) + cmp_bytes, num_elements_eff, shape, dtype, _ = ptr + del cmp_bytes def compress(self, data): isCupy, num_elements_eff = _get_data_info(data) dtype = data.dtype - cmp_bytes, outSize_ptr = self.cuszx_compress(isCuPy, data, num_elements_eff, self.r2r_error, self.r2r_threshold) - return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr) + cmp_bytes, outSize_ptr = quant_device_compress(data, num_elements_eff, CUSZX_BLOCKSIZE, self.r2r_threshold) + return (cmp_bytes, num_elements_eff, data.shape, dtype, outSize_ptr) # return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr.contents.value) def compress_size(self, ptr): - return ptr[5] + return ptr[4] def decompress(self, obj): import cupy - import ctypes - cmp_bytes, num_elements_eff, isCuPy, shape, dtype, cmpsize = obj - decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, cmpsize, num_elements_eff, self, dtype) + cmp_bytes, num_elements_eff, shape, dtype, cmpsize = obj + decompressed_ptr = quant_device_decompress(num_elements_eff, cmp_bytes, self, dtype) arr_cp = decompressed_ptr[0] arr = cupy.reshape(arr_cp, shape) self.decompressed_own.append(arr) return arr - ### Compression API with cuSZx ### - # Parameters: - # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array - # - data = Numpy or Cupy ndarray, assumed to be 1-D, np.float32 type - # - num_elements = Number of floating point elements in data - # - r2r_error = relative-to-value-range error bound for lossy compression - # - r2r_threshold = relative-to-value-range threshold to floor values to zero - # Returns: - # - cmp_bytes = Unsigned char pointer to compressed bytes - # - outSize_ptr = Pointer to size_t representing length in bytes of cmp_bytes - def cuszx_compress(self, isCuPy, data, num_elements, r2r_error, r2r_threshold): - - if not isCuPy: - cmp_bytes, outSize_ptr = cuszx_host_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) - else: - #cmp_bytes, outSize_ptr = cuszp_device_compress(data, r2r_error, num_elements, r2r_threshold) +class CUSZPCompressor(Compressor): + def __init__(self, r2r_error=1e-3, r2r_threshold=1e-3): + self.r2r_error = r2r_error + self.r2r_threshold = r2r_threshold + self.decompressed_own = [] - cmp_bytes, outSize_ptr = quant_device_compress(data, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) - del data - torch.cuda.empty_cache() - return cmp_bytes, outSize_ptr + def free_decompressed(self): + import cupy + print("Cleanup", len(self.decompressed_own)) + for x in self.decompressed_own: + del x + cupy.get_default_memory_pool().free_all_blocks() + cupy.get_default_pinned_memory_pool().free_all_blocks() + torch.cuda.empty_cache() + self.decompressed_own = [] - ### Decompression API with cuSZx ### - # Parameters: - # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array - # - cmp_bytes = Unsigned char pointer to compressed bytes - # - num_elements = Number of floating point elements in original data - # Returns: - # - decompressed_data = Float32 pointer to decompressed data - # - # Notes: Use ctypes to cast decompressed data to Numpy or CuPy type + def free_compressed(self, ptr): + cmp_bytes, num_elements_eff, shape, dtype, _ = ptr + del cmp_bytes - def cuszx_decompress(self, isCuPy, cmp_bytes, cmpsize, num_elements, owner, dtype): - if not isCuPy: - decompressed_data = cuszx_host_decompress(num_elements, cmp_bytes) - else: - 
#decompressed_data = cuszp_device_decompress(num_elements, cmp_bytes, cmpsize, owner,dtype) -# oriData, absErrBound, nbEle, blockSize,threshold - decompressed_data = quant_device_decompress(num_elements, cmp_bytes, owner,dtype) - return decompressed_data + def compress(self, data): + isCupy, num_elements_eff = _get_data_info(data) + dtype = data.dtype + cmp_bytes, outSize_ptr = cuszp_device_compress(data, self.r2r_error,self.r2r_threshold) + return (cmp_bytes, num_elements_eff, data.shape, dtype, outSize_ptr) + + # return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr.contents.value) + + def compress_size(self, ptr): + return ptr[4] + + def decompress(self, obj): + import cupy + cmp_bytes, num_elements_eff, shape, dtype, cmpsize = obj + decompressed_ptr = cuszp_device_decompress(num_elements_eff, cmp_bytes) + arr_cp = decompressed_ptr[0] + + arr = cupy.reshape(arr_cp, shape) + self.decompressed_own.append(arr) + return arr + class NEWSZCompressor(Compressor): def __init__(self, r2r_error=1e-3, r2r_threshold=1e-3): @@ -347,9 +353,9 @@ def free_decompressed(self): #print("CUDA Free", x) cupy.cuda.runtime.free(x) # del x - # cupy.get_default_memory_pool().free_all_blocks() - # cupy.get_default_pinned_memory_pool().free_all_blocks() - # torch.cuda.empty_cache() + cupy.get_default_memory_pool().free_all_blocks() + #cupy.get_default_pinned_memory_pool().free_all_blocks() + #torch.cuda.empty_cache() self.decompressed_own = [] def free_compressed(self, ptr): @@ -361,6 +367,9 @@ def free_compressed(self, ptr): p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) decompressed_int = p_decompressed_int.contents cupy.cuda.runtime.free(decompressed_int.value) + cupy.get_default_memory_pool().free_all_blocks() + #cupy.get_default_pinned_memory_pool().free_all_blocks() + #torch.cuda.empty_cache() def compress(self, data): isCuPy, num_elements_eff = _get_data_info(data) diff --git a/qtensor/compression/cuszp/cuSZp b/qtensor/compression/cuszp/cuSZp new file mode 160000 index 00000000..f47064f4 --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp @@ -0,0 +1 @@ +Subproject commit f47064f4edbc00aceb36692232ac7eef3fefaf2b diff --git a/qtensor/compression/cuszp/cuSZp_interface.cpp b/qtensor/compression/cuszp/cuSZp_interface.cpp new file mode 100644 index 00000000..e46d7e04 --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp_interface.cpp @@ -0,0 +1,137 @@ +#include +#include +// #include +// #include +// #include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CHECK_CUDA(x) \ + TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) \ + TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") +#define CHECK_INPUT(x) \ + CHECK_CUDA(x); \ + CHECK_CONTIGUOUS(x) + +torch::Tensor compress(torch::Tensor input, float error_bound, + std::string mode) { + CHECK_INPUT(input); + // Get the input tensor's data pointer and size + float *d_input_data = input.data_ptr(); + int64_t num_elements = input.numel(); + size_t compressed_size = 0; + + // Cuda allocate memory for the compressed output + unsigned char *d_compressed_data; + cudaMalloc((void **)&d_compressed_data, num_elements * sizeof(float)); + cudaMemset(d_compressed_data, 0, num_elements * sizeof(float)); + printf("f ptr %p\n", d_input_data); + // Initializing CUDA Stream. + cudaStream_t stream; + cudaStreamCreate(&stream); + + // Just a warmup. 
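+  // Single compression pass (the "warmup" note above is a leftover from the cuSZp
+  // examples): reads num_elements floats from d_input_data, writes the encoded
+  // stream into d_compressed_data, and returns its byte length via compressed_size.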
+ SZp_compress_deviceptr_f32(d_input_data, d_compressed_data, num_elements, + &compressed_size, error_bound, stream); + // Ensure on a 4096 boundary + // compressed_size = (compressed_size + 4095) / 4096 * 4096; + // Create a new tensor on the GPU from the compressed output + + cudaStreamSynchronize(stream); + + cudaError_t err = cudaGetLastError(); + printf("after comp\n"); + if (err != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } + + + // torch::Tensor test_t = torch::zeros(5); + err = cudaGetLastError(); + printf("after comp\n"); + if (err != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } + + + torch::Tensor output = torch::empty( + {compressed_size}, torch::TensorOptions() + .dtype(torch::kUInt8) + .device(torch::kCUDA) + .layout(at::kStrided) + .memory_format(torch::MemoryFormat::Contiguous)); + // write from d_compressed_data + cudaMemcpy(output.data_ptr(), d_compressed_data, + compressed_size, cudaMemcpyDeviceToDevice); + // Sync free + cudaStreamSynchronize(stream); + + printf("after comp2\n"); + err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } + + // cudaMemGetInfo(&free_byte, &total_byte); + // printf("GPU memory usage before output: used = %f, free = %f MB, total = %f + // MB\n", + // (double)(total_byte - free_byte) / 1024.0 / 1024.0, (double)free_byte + // / 1024.0 / 1024.0, (double)total_byte / 1024.0 / 1024.0); + cudaFree(d_compressed_data); + cudaStreamDestroy(stream); + CHECK_INPUT(output); + return output; +} + +torch::Tensor decompress(torch::Tensor compressed_data, int64_t num_elements, + size_t compressed_size, float error_bound, + std::string mode) { + CHECK_INPUT(compressed_data); + // Get the input tensor's data pointer and size + unsigned char *d_compressed_data = compressed_data.data_ptr(); + + // torch::Tensor decompressed_data = torch::empty( + // , torch::TensorOptions() + // .dtype(torch::kFloat32) + // .device(torch::kCUDA) + // .memory_format(torch::MemoryFormat::Contiguous)); + torch::Tensor decompressed_data = torch::zeros( + {num_elements}, torch::TensorOptions() + .dtype(torch::kFloat32) + .device(torch::kCUDA) + .memory_format(torch::MemoryFormat::Contiguous)); + float *d_decompressed_data = decompressed_data.data_ptr(); + + // Initializing CUDA Stream. 
+ cudaStream_t stream; + cudaStreamCreate(&stream); + + SZp_decompress_deviceptr_f32(d_decompressed_data, d_compressed_data, + num_elements, compressed_size, error_bound, + stream); + cudaStreamSynchronize(stream); + // Check cuda errors + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } + cudaStreamDestroy(stream); + CHECK_INPUT(decompressed_data); + return decompressed_data; +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("compress", &compress, "Compress a PyTorch tensor using cuSZp"); + m.def("decompress", &decompress, "Decompress a PyTorch tensor using cuSZp"); +} diff --git a/qtensor/compression/cuszp/cuszp_wrapper.py b/qtensor/compression/cuszp/cuszp_wrapper.py new file mode 100644 index 00000000..54d04549 --- /dev/null +++ b/qtensor/compression/cuszp/cuszp_wrapper.py @@ -0,0 +1,113 @@ +import numpy as np +import ctypes +from ctypes import * +import random +#from qtensor.tools.lazy_import import cupy as cp +import cupy as cp +import time +import torch + +import cuszp + +from pathlib import Path + +def cuszp_device_compress(oriData, absErrBound,threshold): + + oriData = oriData.flatten() + x = torch.as_tensor(oriData, device='cuda') + + ori_real = x.real + ori_imag = x.imag + x = x.contiguous() + x = torch.cat((ori_real, ori_imag)) + x = torch.flatten(x) + bitmap = None + d = torch.max(x) - torch.min(x) + d = d.item() + absErrBound = float(absErrBound*(d)) + threshold = threshold*(d) + truth_values = torch.abs(x)<=threshold + x[truth_values] = 0.0 + + o_bytes = cuszp.compress(x, absErrBound, "rel") + outSize = o_bytes.numel()*o_bytes.element_size() + + return (o_bytes,bitmap, absErrBound), outSize + + +def cuszp_device_decompress(nbEle, cmpBytes): + + (cmpBytes, bitmap, absErrBound) = cmpBytes + + newData = cuszp.decompress( + cmpBytes, + nbEle, + cmpBytes.numel()*cmpBytes.element_size(), + absErrBound, + "rel", + ) + + arr = cp.asarray(newData) + res = arr + c_res = cp.zeros(int(nbEle/2), np.complex64) + c_res.real = res[0:int(nbEle/2)] + c_res.imag = res[int(nbEle/2):] + + return (c_res, None) + +### Example of device compress/decompress wrapper usage +class Comp(): + def __init__(self): + self.name = "dummy" + +def free_compressed(ptr): + p_ptr = ctypes.addressof(ptr) + p_int = ctypes.cast(p_ptr, ctypes.POINTER(ctypes.c_uint64)) + decomp_int = p_int.contents + #cp.cuda.runtime.free(decomp_int.value) + + +if __name__ == "__main__": + + DATA_SIZE = int(1024*64) + MAX_D = 10.0 + MIN_D = -10.0 + RANGE = MAX_D - MIN_D + r2r_threshold = 0.01 + r2r_error = 0.01 + ranga_vr = RANGE + in_vector = np.zeros((DATA_SIZE,)) + for i in range(0,int(DATA_SIZE/4)): + in_vector[i] = 0.0 + for i in range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): + in_vector[i] = 5.0 + for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): + in_vector[i] = random.uniform(MIN_D, MAX_D) + for i in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): + in_vector[i] = -7.0 + for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): + in_vector[i] = 0.001 + + print(DATA_SIZE) + in_vector = in_vector.astype('complex64') + in_vector_gpu = cp.asarray(in_vector) + + #in_vector_gpu = cp.asarray(in_vector) + # variable = ctypes.c_size_t(0) + # outSize = ctypes.pointer(variable) + for i in range(2): + s_time = time.time() + o_bytes, outSize = cuszp_device_compress(in_vector_gpu, r2r_error, r2r_threshold) + print("Time python: "+str(time.time()-s_time)) + print(outSize) + print("Compress Success...starting decompress ") + comp = Comp() + + 
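+        # Note: the decompressor takes the flattened float count (real and imag
+        # halves concatenated), i.e. twice the number of complex input elements,
+        # which is why DATA_SIZE*2 is passed below.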
s_time = time.time() + (d_bytes,ptr )= cuszp_device_decompress(DATA_SIZE*2, o_bytes) + #free_compressed(o_bytes[0]) + #cp.cuda.runtime.free(d_bytes) + print("Time python: "+str(time.time()-s_time)) + #for i in d_bytes: + # print(i) + print("Decompress Success") diff --git a/qtensor/compression/cuszp/gnncuszp.py b/qtensor/compression/cuszp/gnncuszp.py new file mode 100644 index 00000000..381a1989 --- /dev/null +++ b/qtensor/compression/cuszp/gnncuszp.py @@ -0,0 +1,347 @@ +import cuszp +import torch +from statcollector import StatCollector +# Create a class that performs compression and decompression on a tensor + + +class Compressor(torch.nn.Module): + def __init__(self, err_mode, err_bound, device, num_nodes,statcollector:StatCollector): + super(Compressor, self).__init__() + self.err_mode = err_mode + self.err_bound = err_bound + self.device = device + self.compressor = cuszp + self.num_nodes = num_nodes + self.sc = statcollector + + def compress(self, x): + # Ensure float32 type + if not x.dtype == torch.float32: + raise TypeError("x must be of type torch.float32") + x = x.contiguous() + if self.err_mode == "rel" or self.err_mode == "relative": + # Value-range error bound + x_max = torch.max(x) + x_min = torch.min(x) + # Compute the err_bound + err_bound = (x_max - x_min) * self.err_bound + # print("min =", x_min, "max =", x_max, "err_bound =", err_bound) + self.sc.add_tensor_stat("Min Value", x_min.item()) + self.sc.add_tensor_stat("Max Value", x_max.item()) + + elif self.err_mode == "abs" or self.err_mode == "absolute": + err_bound = self.err_bound + else: + raise ValueError("err_mode must be 'rel / relative' or 'abs / absolute'") + self.sc.add_tensor_stat("Absolute Error Bound", err_bound.item()) + + return CompressedElement(x, self.compressor.compress(x, err_bound, self.err_mode), err_bound, self.device) + + def decompress(self, comp_element): + if not isinstance(comp_element, CompressedElement): + raise TypeError("comp_element must be an instance of CompressedElement") + compressed_size = ( + comp_element.compressed_data.numel() + * comp_element.compressed_data.element_size() + ) + decompressed = self.compressor.decompress( + comp_element.compressed_data, + comp_element.uncompressed_elements, + compressed_size, + comp_element.err_bound, + self.err_mode, + ) + # Reshape decompressed to match original shape + decompressed = decompressed.reshape(comp_element.original_shape) + return decompressed + + def pack_hook(self, x): + if ( + x.dtype == torch.float32 + and x.requires_grad + and not x.is_sparse + and isinstance(x, torch.Tensor) + and x.shape[0] == self.num_nodes + ): + # print("Packing", x.shape) + t0 = self.sc.new_clock() + self.sc.sync_start_time(t0) + + compressed = self.compress(x) + + self.sc.sync_end_time(t0) + self.sc.increment_epoch_stat("Total Compression Time (s)",self.sc.get_elapsed_time(t0)) + + # print("Uncompressed size =", (x.numel() * x.element_size()) / 1024 / 1024) + # print( + # "Compressed size =", + # ( + # compressed.compressed_data.numel() + # * compressed.compressed_data.element_size() + # ) + # / 1024 + # / 1024, + # ) + # print( + # "Compression Ratio = ", + # (x.numel() * x.element_size()) + # / ( + # compressed.compressed_data.numel() + # * compressed.compressed_data.element_size() + # ), + # ) + csize = compressed.compressed_data.numel()*compressed.compressed_data.element_size() + osize = x.numel() * x.element_size() + self.sc.add_tensor_stat("Uncompressed Size (bytes)", osize) + self.sc.add_tensor_stat("Compressed Size (bytes)", csize) + 
self.sc.increment_epoch_stat("Average CR", osize/csize) + self.sc.increment_epoch_stat("Aggregate Uncompressed Tensor Size (bytes)", osize) + self.sc.increment_epoch_stat("Aggregate Compressed Tensor Size (bytes)", csize) + # print( "Data Saved", ((x.numel() * x.element_size()) - (compressed.compressed_data.numel() * compressed.compressed_data.element_size()))/1024/1024) + # print("Testing decompress,", decompressed) + # print("Compressed data", compressed.compressed_data) + # print("Decompressed shape =", decompressed.shape) + # print("X shape = ", x.shape) + # abs_error = torch.abs(x - decompressed) + # max_error = torch.max(abs_error) + # if max_error > self.err_bound * 1.1: + # # Print the location of the max error and the values + # print("Max error location =", torch.argmax(torch.abs(x - decompressed))) + # print("Max error value =", max_error) + # location = torch.argmax(torch.abs(x - decompressed)) + # # Print row and column of max error + # print("Row =", int(location / x.shape[1])) + # print("Column =", location % x.shape[1]) + # # Count the number of elements that are > self.err_bound * 1.1 + # bound_err_cnt = torch.sum(abs_error > self.err_bound * 1.1) + # print("Number of elements > err_bound * 1.1 =", bound_err_cnt) + # print("X value =", x[int(location / x.shape[1])][location % x.shape[1]]) + # print( + # "Decompressed value =", + # decompressed[int(location / x.shape[1])][location % x.shape[1]], + # ) + # raise ValueError( + # "Error bound exceeded! Max error = ", max_error + # ) + # # Ensure max_error <= err_bound + + # print("Max error =", max_error) + # Ensure x is freed + # delete x + self.sc.increment_epoch_stat("Compressed Tensor Count",1) + self.sc.register_tensor_row_and_update() + + + del x + # empty cache + torch.cuda.empty_cache() + return compressed + else: + return x + + def unpack_hook(self, x): + if isinstance(x, CompressedElement): + # print("Unpacking", x.name) + # print("Unpacking") + t0 = self.sc.new_clock() + self.sc.sync_start_time(t0) + + decompressed = self.decompress(x) + + self.sc.sync_end_time(t0) + self.sc.increment_epoch_stat("Total Decompression Time (s)",self.sc.get_elapsed_time(t0)) + + # print("Unpacked") + # print("Unpacked to", decompressed) + return decompressed + else: + return x + + +# Create class for a compressed element that is used by the Compressor class + + +class CompressedElement(torch.nn.Module): + def __init__(self, x, compressed, err_bound, device): + super(CompressedElement, self).__init__() + self.device = device + # self.compressor = cuszp + self.compressed_data = compressed + self.uncompressed_elements = x.numel() + self.original_shape = x.shape + self.err_bound = err_bound + +import numpy as np +import ctypes +from ctypes import * +import random +from qtensor.tools.lazy_import import cupy as cp +import time +import torch + +from pathlib import Path + + + +def quant_device_compress(oriData, nbEle, blockSize,threshold): + #print(nbEle) + ori_nbEle = nbEle + variable = ctypes.c_size_t(0) + outSize = ctypes.pointer(variable) + + oriData = oriData.flatten() + ori_real = oriData.real + ori_imag = oriData.imag + oriData = cp.concatenate((ori_real, ori_imag)) + sample = oriData[::2] + max_val = cp.amax(oriData).get() + min_val = cp.amin(oriData).get() + d = max_val - min_val + if d.dtype == np.complex64: + d = d.real + threshold = threshold*(d) + s_1 = time.time() + truth_values = abs(oriData)<=threshold + oriData[truth_values] = 0.0 + truth_values = cp.invert(truth_values) + ori_len = oriData.shape[0] + nonzero_percent = 
cp.count_nonzero(oriData)/oriData.shape[0] + print("Percent nonzero: "+str(nonzero_percent)) + + isGrouped = False + if nonzero_percent<=0.5: + isGrouped=True + oriData = oriData[truth_values] + + nbEle = oriData.shape[0] + + # oriData = cp.reshape(oriData, (-1, blockSize)) # Reshape to blocksize + tensor = torch.as_tensor(oriData, device='cuda') + # print("Min val: "+str(cp.amin(oriData).get())+" range: "+str(d)) +# scale = d/255.0 +# zero_point = -1*round(min_val*scale) - 128 + + scale = d/((2**8) - 1) + #zero_point = -1*round(min_val*scale) + zero_point = -1*round(min_val*scale)+32 +# q_tensor = torch.quantize_per_tensor(tensor, scale, zero_point, dtype=torch.qint8) + + q_tensor = torch.quantize_per_tensor(tensor, scale, zero_point, dtype=torch.qint8) + del tensor + torch.cuda.empty_cache() + if isGrouped: + bitmap = cp.packbits(truth_values) + else: + bitmap = None + del truth_values + #q_ten2 = torch.dequantize(q_tensor) + #print(tensor) + #print(q_ten2) + #print("Max PW error") + #print(torch.max(torch.div(torch.abs(torch.sub(tensor[tensor!=0.0],q_ten2[tensor!=0.0])),tensor[tensor!=0.0]))) + return (q_tensor, bitmap, isGrouped), (nbEle/4)+(ori_len/8) + + +def quant_device_decompress(nbEle, cmpBytes, owner, dtype): + (q_tensor, bitmap, isGrouped) = cmpBytes + if isGrouped: + bitmap = cp.unpackbits(bitmap) + restored = torch.dequantize(q_tensor) + arr = cp.asarray(restored) + # uint8_t* cmpbytes, size_t len, size_t compressed_len, float r2r_error + + # decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, num_elements_eff) + # -- Workaround to convert GPU pointer to int + # p_decompressed_ptr = ctypes.addressof(newData) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + # p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + # decompressed_int = p_decompressed_int.contents + # # -- + # pointer_for_free = decompressed_int.value + # # self.decompressed_own.append(decompressed_int.value) + # mem = cp.cuda.UnownedMemory(decompressed_int.value, nbEle*4, owner, device_id=0) + # mem_ptr = cp.cuda.memory.MemoryPointer(mem, 0) + #print("mem ptr") + #print(mem_ptr) + # arr = cp.ndarray(shape=(nbEle,), dtype=np.float32, memptr=mem_ptr) + #print(nbEle) + if isGrouped: + res = cp.zeros((nbEle,)) + # ## need to convert newData to cupy + cp.place(res,bitmap,arr) + + c_res = cp.zeros(int(nbEle/2), np.complex64) + #c_res.real = arr[0:int(nbEle/2)] + #c_res.imag = arr[int(nbEle/2):] + + c_res.real = res[0:int(nbEle/2)] + c_res.imag = res[int(nbEle/2):] + else: + c_res = cp.zeros(int(nbEle/2), np.complex64) + c_res.real = arr[0:int(nbEle/2)] + c_res.imag = arr[int(nbEle/2):] + return (c_res, None) + +### Example of device compress/decompress wrapper usage +class Comp(): + def __init__(self): + self.name = "dummy" + +def free_compressed(ptr): + p_ptr = ctypes.addressof(ptr) + p_int = ctypes.cast(p_ptr, ctypes.POINTER(ctypes.c_uint64)) + decomp_int = p_int.contents + cp.cuda.runtime.free(decomp_int.value) + + +if __name__ == "__main__": + + DATA_SIZE = int(1024) + MAX_D = 10.0 + MIN_D = -10.0 + RANGE = MAX_D - MIN_D + r2r_threshold = 0.002 + r2r_error = 0.0001 + + in_vector = np.fromfile("all_sample.bin", dtype=np.complex64) + #print(np.max(in_vector)) + DATA_SIZE = len(in_vector) + #range_vr = np.max(in_vector)-np.min(in_vector) + #r2r_threshold = r2r_threshold*range_vr + #r2r_error = r2r_error*range_vr + #in_vector = np.zeros((DATA_SIZE,)) + #for i in range(0,int(DATA_SIZE/4)): + # in_vector[i] = 0.0 + #for i in 
range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): + # in_vector[i] = 5.0 + #for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): + # in_vector[i] = random.uniform(MIN_D, MAX_D) + #for i in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): + # in_vector[i] = -7.0 + #for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): + # in_vector[i] = 0.001 + + print(DATA_SIZE) + #in_vector = in_vector.astype('float32') + in_vector_gpu = cp.asarray(in_vector) + + # variable = ctypes.c_size_t(0) + # outSize = ctypes.pointer(variable) + for i in range(200): + s_time = time.time() + o_bytes, outSize = quant_device_compress(in_vector_gpu, DATA_SIZE, 256, r2r_threshold) + print("Time python: "+str(time.time()-s_time)) + # print(outSize[0]) + print("Compress Success...starting decompress ") + comp = Comp() + + s_time = time.time() + (d_bytes,ptr )= quant_device_decompress(DATA_SIZE*2, o_bytes, comp, in_vector_gpu.dtype) + + # free_compressed(o_bytes[0]) + # cp.cuda.runtime.free(ptr) + print("Time python: "+str(time.time()-s_time)) + #for i in d_bytes: + # print(i) + print("Decompress Success") \ No newline at end of file diff --git a/qtensor/compression/cuszp/setup.py b/qtensor/compression/cuszp/setup.py new file mode 100644 index 00000000..3bc77e8f --- /dev/null +++ b/qtensor/compression/cuszp/setup.py @@ -0,0 +1,28 @@ +from setuptools import setup, Extension +from torch.utils import cpp_extension +import os + +cuSZp_install = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'cuSZp') +cuSZp_include = os.path.join(cuSZp_install, 'include') +cuSZp_src = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'cuSZp', 'src') +# Retrieve list of source files +cuSZp_src_files = [] +for root, dirs, files in os.walk(cuSZp_src): + for file in files: + if file.endswith('.cu'): + cuSZp_src_files.append(os.path.join(root, file)) +cuSZp_src_files.append('cuSZp_interface.cpp') + +# define the extension module +cuSZp_extension = cpp_extension.CUDAExtension( + name='cuszp', + sources=cuSZp_src_files, + include_dirs=[cuSZp_include], +) + +# build the extension module +setup( + name='cuszp', + ext_modules=[cuSZp_extension], + cmdclass={'build_ext': cpp_extension.BuildExtension} +) From 9bc3d9cc3461a22e8f487bdb82d3879fd3a88706 Mon Sep 17 00:00:00 2001 From: Milan Kartik Shah Date: Thu, 15 Feb 2024 14:45:53 -0500 Subject: [PATCH 118/126] Remove empty directory --- qtensor/compression/cuszp/cuSZp | 1 - 1 file changed, 1 deletion(-) delete mode 160000 qtensor/compression/cuszp/cuSZp diff --git a/qtensor/compression/cuszp/cuSZp b/qtensor/compression/cuszp/cuSZp deleted file mode 160000 index f47064f4..00000000 --- a/qtensor/compression/cuszp/cuSZp +++ /dev/null @@ -1 +0,0 @@ -Subproject commit f47064f4edbc00aceb36692232ac7eef3fefaf2b From 92bf98bc59ce133c5a010fa5b24abd13cd0c063c Mon Sep 17 00:00:00 2001 From: Milan Kartik Shah Date: Thu, 15 Feb 2024 14:49:59 -0500 Subject: [PATCH 119/126] Added cuSZp src code --- .../compression/cuszp/cuSZp/CMakeLists.txt | 79 +++ .../compression/cuszp/cuSZp/Config.cmake.in | 5 + qtensor/compression/cuszp/cuSZp/LICENSE | 30 + qtensor/compression/cuszp/cuSZp/README.md | 106 +++ .../cuszp/cuSZp/cmake/Installing.cmake | 67 ++ .../cuszp/cuSZp/examples/CMakeLists.txt | 45 ++ .../cuSZp/examples/cuSZp_cpu_f32_api.cpp | 83 +++ .../cuSZp/examples/cuSZp_cpu_f64_api.cpp | 83 +++ .../cuSZp/examples/cuSZp_gpu_f32_api.cpp | 119 ++++ .../cuSZp/examples/cuSZp_gpu_f64_api.cpp | 120 ++++ .../cuszp/cuSZp/include/cuSZp_entry_f32.h | 11 + .../cuszp/cuSZp/include/cuSZp_entry_f64.h | 11 + 
.../cuszp/cuSZp/include/cuSZp_f32.h | 12 + .../cuszp/cuSZp/include/cuSZp_f64.h | 12 + .../cuszp/cuSZp/include/cuSZp_timer.h | 31 + .../cuszp/cuSZp/include/cuSZp_utility.h | 18 + .../cuszp/cuSZp/src/cuSZp_entry_f32.cu | 149 +++++ .../cuszp/cuSZp/src/cuSZp_entry_f64.cu | 149 +++++ .../compression/cuszp/cuSZp/src/cuSZp_f32.cu | 335 ++++++++++ .../compression/cuszp/cuSZp/src/cuSZp_f64.cu | 333 ++++++++++ .../cuszp/cuSZp/src/cuSZp_timer.cu | 31 + .../cuszp/cuSZp/src/cuSZp_utility.cu | 614 ++++++++++++++++++ 22 files changed, 2443 insertions(+) create mode 100644 qtensor/compression/cuszp/cuSZp/CMakeLists.txt create mode 100644 qtensor/compression/cuszp/cuSZp/Config.cmake.in create mode 100644 qtensor/compression/cuszp/cuSZp/LICENSE create mode 100644 qtensor/compression/cuszp/cuSZp/README.md create mode 100644 qtensor/compression/cuszp/cuSZp/cmake/Installing.cmake create mode 100644 qtensor/compression/cuszp/cuSZp/examples/CMakeLists.txt create mode 100644 qtensor/compression/cuszp/cuSZp/examples/cuSZp_cpu_f32_api.cpp create mode 100644 qtensor/compression/cuszp/cuSZp/examples/cuSZp_cpu_f64_api.cpp create mode 100644 qtensor/compression/cuszp/cuSZp/examples/cuSZp_gpu_f32_api.cpp create mode 100644 qtensor/compression/cuszp/cuSZp/examples/cuSZp_gpu_f64_api.cpp create mode 100644 qtensor/compression/cuszp/cuSZp/include/cuSZp_entry_f32.h create mode 100644 qtensor/compression/cuszp/cuSZp/include/cuSZp_entry_f64.h create mode 100644 qtensor/compression/cuszp/cuSZp/include/cuSZp_f32.h create mode 100644 qtensor/compression/cuszp/cuSZp/include/cuSZp_f64.h create mode 100644 qtensor/compression/cuszp/cuSZp/include/cuSZp_timer.h create mode 100644 qtensor/compression/cuszp/cuSZp/include/cuSZp_utility.h create mode 100644 qtensor/compression/cuszp/cuSZp/src/cuSZp_entry_f32.cu create mode 100644 qtensor/compression/cuszp/cuSZp/src/cuSZp_entry_f64.cu create mode 100644 qtensor/compression/cuszp/cuSZp/src/cuSZp_f32.cu create mode 100644 qtensor/compression/cuszp/cuSZp/src/cuSZp_f64.cu create mode 100644 qtensor/compression/cuszp/cuSZp/src/cuSZp_timer.cu create mode 100644 qtensor/compression/cuszp/cuSZp/src/cuSZp_utility.cu diff --git a/qtensor/compression/cuszp/cuSZp/CMakeLists.txt b/qtensor/compression/cuszp/cuSZp/CMakeLists.txt new file mode 100644 index 00000000..d3c752ba --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/CMakeLists.txt @@ -0,0 +1,79 @@ +# Specify the minimum version of CMake required to build the project +cmake_minimum_required(VERSION 3.21) + +project(cuSZp + VERSION 0.0.2 + DESCRIPTION "Error-bounded GPU lossy compression library" + ) +set(namespace "cuSZp") +enable_language(CXX) +enable_language(CUDA) + +find_package(CUDAToolkit REQUIRED) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +#set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -debug -Wall -diag-disable=10441") +#set(CMAKE_CXX_FLAGS_RELEASE "-diag-disable=10441 -g -ftz -fma -O2 -fp-model precise -prec-div -Wall") + +#set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -ftz=true -G -allow-unsupported-compiler") +#set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -allow-unsupported-compiler") + +set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER}) +set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) +set(CMAKE_CUDA_STANDARD "17") +set(CMAKE_CXX_STANDARD_REQUIRED ON) +#set(CMAKE_CUDA_FLAGS_INIT "-std=c++17 -allow-unsupported-compiler") +set(CMAKE_CUDA_ARCHITECTURES 60 61 62 70 75) +set(CUDA_PROPAGATE_HOST_FLAGS ON) +set(CUDA_LIBRARY CUDA::cudart) + +if(NOT 
CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) + set_property(CACHE CMAKE_BUILD_TYPE PROPERTY VALUE Release) +endif() + +add_library(${PROJECT_NAME} STATIC) + +target_sources(${PROJECT_NAME} + PRIVATE + src/cuSZp_f32.cu + src/cuSZp_f64.cu + src/cuSZp_utility.cu + src/cuSZp_timer.cu + src/cuSZp_entry_f32.cu + src/cuSZp_entry_f64.cu + ) + +target_include_directories(${PROJECT_NAME} + PRIVATE + # where the library itself will look for its internal headers + ${CMAKE_CURRENT_SOURCE_DIR}/src + PUBLIC + # where top-level project will look for the library's public headers + $ + # where external projects will look for the library's public headers + $ + ) + +#target_include_directories(${PROJECT_NAME} PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + +target_link_libraries(${PROJECT_NAME} PRIVATE CUDA::cudart) + +set(public_headers + include/cuSZp_f32.h + include/cuSZp_f64.h + include/cuSZp_utility.h + include/cuSZp_timer.h + include/cuSZp_entry_f32.h + include/cuSZp_entry_f64.h + ) + +set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") +include(Installing) + +option(CUSZP_BUILD_EXAMPLES "Option to enable building example programs" ON) +if (CUSZP_BUILD_EXAMPLES) + add_subdirectory(examples) +endif () \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/Config.cmake.in b/qtensor/compression/cuszp/cuSZp/Config.cmake.in new file mode 100644 index 00000000..97b7684e --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/Config.cmake.in @@ -0,0 +1,5 @@ +@PACKAGE_INIT@ + +include("${CMAKE_CURRENT_LIST_DIR}/@PROJECT_NAME@Targets.cmake") + +check_required_components(@PROJECT_NAME@) diff --git a/qtensor/compression/cuszp/cuSZp/LICENSE b/qtensor/compression/cuszp/cuSZp/LICENSE new file mode 100644 index 00000000..d4fb7dda --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/LICENSE @@ -0,0 +1,30 @@ +Copyright © 2023, UChicago Argonne and University of Iowa + +All Rights Reserved + +Software Name: cuSZp: An Ultra-fast GPU Error-bounded Lossy Compressor with Optimized End-to-End Performance + +By: Argonne National Laboratory, University of Iowa + +OPEN SOURCE LICENSE + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. +3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +****************************************************************************************************** + DISCLAIMER + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR +TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +****************************************************************************************************** + +Contact: SZ Team (szlossycompressor@gmail.com) \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/README.md b/qtensor/compression/cuszp/cuSZp/README.md new file mode 100644 index 00000000..14454bd0 --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/README.md @@ -0,0 +1,106 @@ +# cuSZp + + +cuSZp is a user-friendly error-bounded lossy compression tool specifically designed for the compression of single- and double-precision floating-point data using NVIDIA GPUs. +This tool fuses all compression or decompression computations into one single kernel, achieving ultra fast end-to-end throughput. +Specifically, the cuSZp framework is structured around four pivotal stages: Quantization and Prediction, Fixed-length Encoding, Global Synchronization, and Block Bit-shuffling. +Noting that ongoing optimization efforts are being devoted to cuSZp, aimed at further improving its end-to-end performance. + +- Developer: Yafan Huang +- Contributors: Sheng Di, Xiaodong Yu, Guanpeng Li, and Franck Cappello + +## Environment Requirements +- Linux OS with NVIDIA GPUs +- Git >= 2.15 +- CMake >= 3.21 +- Cuda Toolkit >= 11.0 +- GCC >= 7.3.0 + +## Compile and Run cuSZp Prepared Executable Binary +You can compile and install cuSZp with following commands: +```shell +$ git clone https://github.com/szcompressor/cuSZp.git +$ cd cuSZp +$ mkdir build && cd build +$ cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=../install/ .. +$ make -j +$ make install +``` +After compilation, you will see a list of executable binaries ```cuSZp/install/bin/```: +- ```cuSZp_cpu_f32_api```: single-precision, host pointers (i.e. on CPU). +- ```cuSZp_gpu_f32_api```: single-precision, device pointers (i.e. on GPU). +- ```cuSZp_cpu_f64_api```: double-precision, host pointers (i.e. on CPU). +- ```cuSZp_gpu_f64_api```: double-precision, device pointers (i.e. on GPU). + +To use those binaries, try following commands. +We here use RTM pressure_2000 dataset (1.4 GB, 1008x1008x352) for single-precision example, and NWChem acd-tst.bin.d64 (6.0 GB) for double-precision example. +```shell +# Example for single-precision API +# ./cuSZp_gpu_f32_api TARGET_HPC_DATASET ERROR_MODE ERROR_BOUND +# ABS or REL +$ ./cuSZp_gpu_f32_api ./pressure_2000 REL 1e-4 +cuSZp finished! +cuSZp compression end-to-end speed: 151.564649 GB/s +cuSZp decompression end-to-end speed: 232.503219 GB/s +cuSZp compression ratio: 13.003452 + +Pass error check! +$ +# Example for double-precision API +# ./cuSZp_gpu_f64_api TARGET_HPC_DATASET ERROR_MODE ERROR_BOUND +# ABS or REL +$ ./cuSZp_gpu_f64_api ./acd-tst.bin.d64 ABS 1E-8 +cuSZp finished! +cuSZp compression end-to-end speed: 110.117965 GB/s +cuSZp decompression end-to-end speed: 222.743097 GB/s +cuSZp compression ratio: 3.990585 + +Pass error check! +``` +More HPC dataset can be downloaded from [SDRBench](https://sdrbench.github.io/). 
+ +## Using cuSZp as an Internal API +This repository provides several examples for using cuSZp compression and decompression for different scenarios (device pointer? host pointer? f32 or f64?). +The examples can be found in ```cuSZp/examples/```. +Assuming your original data, compressed data, and reconstructed data are all device pointers (allocated on GPU), and the data type is single-precision. The compression and decompression APIs can be called as below: +```C++ +// For measuring the end-to-end throughput. +TimingGPU timer_GPU; + +// cuSZp compression. +timer_GPU.StartCounter(); // set timer +SZp_compress_deviceptr_f32(d_oriData, d_cmpBytes, nbEle, &cmpSize, errorBound, stream); +float cmpTime = timer_GPU.GetCounter(); + +// cuSZp decompression. +timer_GPU.StartCounter(); // set timer +SZp_decompress_deviceptr_f32(d_decData, d_cmpBytes, nbEle, cmpSize, errorBound, stream); +float decTime = timer_GPU.GetCounter(); +``` +More details can be checked in: +- **f32-hostptr**: ```cuSZp/examples/cuSZp_cpu_f32_api.cpp```. +- **f32-deviceptr**: ```cuSZp/examples/cuSZp_gpu_f32_api.cpp```. +- **f64-hostptr**: ```cuSZp/examples/cuSZp_cpu_f64_api.cpp```. +- **f64-deviceptr**: ```cuSZp/examples/cuSZp_gpu_f64_api.cpp```. + +## Citation +```bibtex +@inproceedings{cuSZp2023huang, + title = {cuSZp: An Ultra-Fast GPU Error-Bounded Lossy Compression Framework with Optimized End-to-End Performance} + author = {Huang, Yafan and Di, Sheng and Yu, Xiaodong and Li, Guanpeng and Cappello, Franck}, + year = {2023}, + isbn = {979-8-4007-0109-2/23/11}, + publisher = {Association for Computing Machinery}, + address = {Denver, CO, USA}, + doi = {10.1145/3581784.3607048}, + booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis}, + keywords = {Lossy compression; parallel computing; HPC; GPU}, + series = {SC'23} +} +``` + +## Copyright +(C) 2023 by Argonne National Laboratory and University of Iowa. More details see [COPYRIGHT](https://github.com/szcompressor/cuSZp/blob/master/LICENSE). + +## Acknowledgement +This research was supported by the Exascale Computing Project (ECP), Project Number: 17-SC-20-SC, a collaborative effort of two DOE organizations – the Office of Science and the National Nuclear Security Administration, responsible for the planning and preparation of a capable exascale ecosystem, including software, applications, hardware, advanced system engineering and early testbed platforms, to support the nation’s exascale computing imperative. The material was supported by the U.S. Department of Energy, Office of Science, Advanced Scientific Computing Research (ASCR), under contract DE-AC02-06CH11357, and supported by the National Science Foundation under Grant OAC-2003709 and OAC-2104023. We acknowledge the computing resources provided on Bebop (operated by Laboratory Computing Resource Center at Argonne) and on Theta and JLSE (operated by Argonne Leadership Computing Facility). We acknowledge the support of ARAMCO. 
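On the qtensor side of this patch, the new `CUSZPCompressor` wraps these kernels behind the same `compress` / `decompress` / `free_*` interface as the other compressors. A minimal round-trip sketch (illustrative only, not part of the patch) could look as follows; it assumes the `cuszp` torch extension under `qtensor/compression/cuszp` has been built with the bundled `setup.py`, that a CUDA device is available, and that the input is the flat complex64 CuPy data the wrappers expect.

```python
# Illustrative sketch: round-trip a CuPy array through the new CUSZPCompressor.
# Assumes the cuszp torch extension has been built (see setup.py in this directory).
import cupy as cp
from qtensor.compression.Compressor import CUSZPCompressor

compressor = CUSZPCompressor(r2r_error=1e-3, r2r_threshold=1e-3)

# qtensor tensor data is complex64 on the GPU.
n = 2 ** 20
data = (cp.random.rand(n) + 1j * cp.random.rand(n)).astype(cp.complex64)

ptr = compressor.compress(data)          # handle: (bytes, n_eff, shape, dtype, size)
print("compressed size (bytes):", compressor.compress_size(ptr))

restored = compressor.decompress(ptr)    # CuPy array reshaped to data.shape
print("max abs error:", float(cp.max(cp.abs(restored - data))))

# The compressor owns the decompressed buffers; release them explicitly,
# as CompressionBackend.process_bucket does after each contraction.
compressor.free_decompressed()
compressor.free_compressed(ptr)
```

In `bench/qc_simulation/src/simulators/qtensor.py` the same object is wrapped in `ProfileCompressor` and passed to `CompressionBackend`, so compression and decompression timings and ratios are collected during simulation.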
diff --git a/qtensor/compression/cuszp/cuSZp/cmake/Installing.cmake b/qtensor/compression/cuszp/cuSZp/cmake/Installing.cmake new file mode 100644 index 00000000..8a635ca6 --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/cmake/Installing.cmake @@ -0,0 +1,67 @@ +include(GNUInstallDirs) + +if(DEFINED CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) + message( + STATUS + "CMAKE_INSTALL_PREFIX is not set\n" + "Default value: ${CMAKE_INSTALL_PREFIX}\n" + "Will set it to ${CMAKE_SOURCE_DIR}/install" + ) + set(CMAKE_INSTALL_PREFIX + "${CMAKE_SOURCE_DIR}/install" + CACHE PATH "Where the library will be installed to" FORCE + ) +else() + message( + STATUS + "CMAKE_INSTALL_PREFIX was already set\n" + "Current value: ${CMAKE_INSTALL_PREFIX}" + ) +endif() + +set_target_properties(${PROJECT_NAME} PROPERTIES PUBLIC_HEADER "${public_headers}") + +set_target_properties(${PROJECT_NAME} PROPERTIES DEBUG_POSTFIX "d") + +install(TARGETS ${PROJECT_NAME} + EXPORT "${PROJECT_NAME}Targets" + # these get default values from GNUInstallDirs, no need to set them + #RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} # bin + #LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} # lib + #ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} # lib + # except for public headers, as we want them to be inside a library folder + PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME} # include/SomeProject + INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} # include + ) + +# generate and install export file +install(EXPORT "${PROJECT_NAME}Targets" + FILE "${PROJECT_NAME}Targets.cmake" + NAMESPACE ${namespace}:: + DESTINATION cmake + ) + +include(CMakePackageConfigHelpers) + +# generate the version file for the config file +write_basic_package_version_file( + "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake" + VERSION "${version}" + COMPATIBILITY AnyNewerVersion +) +# create config file +configure_package_config_file(${CMAKE_CURRENT_SOURCE_DIR}/Config.cmake.in + "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake" + INSTALL_DESTINATION cmake + ) +# install config files +install(FILES + "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake" + "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake" + DESTINATION cmake + ) +# generate the export targets for the build tree +export(EXPORT "${PROJECT_NAME}Targets" + FILE "${CMAKE_CURRENT_BINARY_DIR}/cmake/${PROJECT_NAME}Targets.cmake" + NAMESPACE ${namespace}:: + ) diff --git a/qtensor/compression/cuszp/cuSZp/examples/CMakeLists.txt b/qtensor/compression/cuszp/cuSZp/examples/CMakeLists.txt new file mode 100644 index 00000000..e5484362 --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/examples/CMakeLists.txt @@ -0,0 +1,45 @@ +# Find CUDA package +find_package(CUDA REQUIRED) + +set(install_dir ${PROJECT_BINARY_DIR}/examples/bin) +set(execName_gpu_f32 "cuSZp_gpu_f32_api") +set(execName_cpu_f32 "cuSZp_cpu_f32_api") +set(execName_gpu_f64 "cuSZp_gpu_f64_api") +set(execName_cpu_f64 "cuSZp_cpu_f64_api") +set(SRC_DIR ${PROJECT_SOURCE_DIR}/src) +set(INCLUDE_DIR ${PROJECT_SOURCE_DIR}/include) + +# Add include and library directories +include_directories(${INCLUDE_DIR}) + +# Compile headers as a library +cuda_add_library(cuSZp_libs STATIC ${SRC_DIR}/cuSZp_f32.cu + ${SRC_DIR}/cuSZp_f64.cu + ${SRC_DIR}/cuSZp_utility.cu + ${SRC_DIR}/cuSZp_timer.cu + ${SRC_DIR}/cuSZp_entry_f32.cu + ${SRC_DIR}/cuSZp_entry_f64.cu) + +# Compile executable binary +cuda_add_executable(${execName_gpu_f32} cuSZp_gpu_f32_api.cpp) +cuda_add_executable(${execName_cpu_f32} cuSZp_cpu_f32_api.cpp) 
+cuda_add_executable(${execName_gpu_f64} cuSZp_gpu_f64_api.cpp) +cuda_add_executable(${execName_cpu_f64} cuSZp_cpu_f64_api.cpp) + +# Link with headers +target_link_libraries(${execName_gpu_f32} cuSZp_libs) +target_link_libraries(${execName_cpu_f32} cuSZp_libs) +target_link_libraries(${execName_gpu_f64} cuSZp_libs) +target_link_libraries(${execName_cpu_f64} cuSZp_libs) + +# Set output paths for the compiled binary +set_target_properties(${execName_gpu_f32} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${install_dir}) +set_target_properties(${execName_cpu_f32} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${install_dir}) +set_target_properties(${execName_gpu_f64} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${install_dir}) +set_target_properties(${execName_cpu_f64} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${install_dir}) + +# Set installation paths for the compiled binary. +install(TARGETS ${execName_gpu_f32} DESTINATION bin) +install(TARGETS ${execName_cpu_f32} DESTINATION bin) +install(TARGETS ${execName_gpu_f64} DESTINATION bin) +install(TARGETS ${execName_cpu_f64} DESTINATION bin) \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/examples/cuSZp_cpu_f32_api.cpp b/qtensor/compression/cuszp/cuSZp/examples/cuSZp_cpu_f32_api.cpp new file mode 100644 index 00000000..f543f59c --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/examples/cuSZp_cpu_f32_api.cpp @@ -0,0 +1,83 @@ +#include +#include +#include +#include +#include +#include + +int main(int argc, char* argv[]) +{ + // Read input information. + char oriFilePath[640]; + char errorMode[20]; + int status=0; + if(argc != 4) + { + printf("Usage: cuSZp_cpu_f32_api [srcFilePath] [errorMode] [errBound] # errorMode can only be ABS or REL\n"); + printf("Example: cuSZp_cpu_f32_api testfloat_8_8_128.dat ABS 1E-2 # compress dataset with absolute 1E-2 error bound\n"); + printf(" cuSZp_cpu_f32_api testfloat_8_8_128.dat REL 1e-3 # compress dataset with relative 1E-3 error bound\n"); + exit(0); + } + sprintf(oriFilePath, "%s", argv[1]); + sprintf(errorMode, "%s", argv[2]); + float errorBound = atof(argv[3]); + + // Input data preparation. + float* oriData = NULL; + float* decData = NULL; + unsigned char* cmpBytes = NULL; + size_t nbEle = 0; + size_t cmpSize = 0; + oriData = readFloatData_Yafan(oriFilePath, &nbEle, &status); + decData = (float*)malloc(nbEle*sizeof(float)); + cmpBytes = (unsigned char*)malloc(nbEle*sizeof(float)); + + // Generating error bounds. + if(strcmp(errorMode, "REL")==0) + { + float max_val = oriData[0]; + float min_val = oriData[0]; + for(size_t i=0; imax_val) + max_val = oriData[i]; + else if(oriData[i] errorBound*1.1) + { + not_bound++; + // printf("not bound: %zu oriData: %f, decData: %f, errors: %f, bound: %f\n", i, oriData[i], decData[i], abs(oriData[i]-decData[i]), errBound); + } + } + if(!not_bound) printf("\033[0;32mPass error check!\033[0m\n"); + else printf("\033[0;31mFail error check!\033[0m\n"); + + // Free allocated data. + free(oriData); + free(decData); + free(cmpBytes); + return 0; +} \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/examples/cuSZp_cpu_f64_api.cpp b/qtensor/compression/cuszp/cuSZp/examples/cuSZp_cpu_f64_api.cpp new file mode 100644 index 00000000..6ed6adb1 --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/examples/cuSZp_cpu_f64_api.cpp @@ -0,0 +1,83 @@ +#include +#include +#include +#include +#include +#include + +int main(int argc, char* argv[]) +{ + // Read input information. 
+ char oriFilePath[640]; + char errorMode[20]; + int status=0; + if(argc != 4) + { + printf("Usage: cuSZp_cpu_f64_api [srcFilePath] [errorMode] [errBound] # errorMode can only be ABS or REL\n"); + printf("Example: cuSZp_cpu_f64_api testdouble_8_8_128.dat ABS 1E-2 # compress dataset with absolute 1E-2 error bound\n"); + printf(" cuSZp_cpu_f64_api testdouble_8_8_128.dat REL 1e-3 # compress dataset with relative 1E-3 error bound\n"); + exit(0); + } + sprintf(oriFilePath, "%s", argv[1]); + sprintf(errorMode, "%s", argv[2]); + double errorBound = atof(argv[3]); + + // Input data preparation. + double* oriData = NULL; + double* decData = NULL; + unsigned char* cmpBytes = NULL; + size_t nbEle = 0; + size_t cmpSize = 0; + oriData = readDoubleData_Yafan(oriFilePath, &nbEle, &status); + decData = (double*)malloc(nbEle*sizeof(double)); + cmpBytes = (unsigned char*)malloc(nbEle*sizeof(double)); + + // Generating error bounds. + if(strcmp(errorMode, "REL")==0) + { + double max_val = oriData[0]; + double min_val = oriData[0]; + for(size_t i=0; imax_val) + max_val = oriData[i]; + else if(oriData[i] errorBound*1.1) + { + not_bound++; + // printf("not bound: %zu oriData: %f, decData: %f, errors: %f, bound: %f\n", i, oriData[i], decData[i], abs(oriData[i]-decData[i]), errBound); + } + } + if(!not_bound) printf("\033[0;32mPass error check!\033[0m\n"); + else printf("\033[0;31mFail error check!\033[0m\n"); + + // Free allocated data. + free(oriData); + free(decData); + free(cmpBytes); + return 0; +} \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/examples/cuSZp_gpu_f32_api.cpp b/qtensor/compression/cuszp/cuSZp/examples/cuSZp_gpu_f32_api.cpp new file mode 100644 index 00000000..7c54199d --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/examples/cuSZp_gpu_f32_api.cpp @@ -0,0 +1,119 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +int main(int argc, char* argv[]) +{ + // Read input information. + char oriFilePath[640]; + char errorMode[20]; + int status=0; + if(argc != 4) + { + printf("Usage: cuSZp_gpu_f32_api [srcFilePath] [errorMode] [errBound] # errorMode can only be ABS or REL\n"); + printf("Example: cuSZp_gpu_f32_api testfloat_8_8_128.dat ABS 1E-2 # compress dataset with absolute 1E-2 error bound\n"); + printf(" cuSZp_gpu_f32_api testfloat_8_8_128.dat REL 1e-3 # compress dataset with relative 1E-3 error bound\n"); + exit(0); + } + sprintf(oriFilePath, "%s", argv[1]); + sprintf(errorMode, "%s", argv[2]); + float errorBound = atof(argv[3]); + + // For measuring the end-to-end throughput. + TimingGPU timer_GPU; + + // Input data preparation on CPU. + float* oriData = NULL; + float* decData = NULL; + unsigned char* cmpBytes = NULL; + size_t nbEle = 0; + size_t cmpSize = 0; + oriData = readFloatData_Yafan(oriFilePath, &nbEle, &status); + decData = (float*)malloc(nbEle*sizeof(float)); + cmpBytes = (unsigned char*)malloc(nbEle*sizeof(float)); + + // Generating error bounds. + if(strcmp(errorMode, "REL")==0) + { + float max_val = oriData[0]; + float min_val = oriData[0]; + for(size_t i=0; imax_val) + max_val = oriData[i]; + else if(oriData[i] errorBound*1.1) + { + not_bound++; + // printf("not bound: %zu oriData: %f, decData: %f, errors: %f, bound: %f\n", i, oriData[i], decData[i], abs(oriData[i]-decData[i]), errBound); + } + } + if(!not_bound) printf("\033[0;32mPass error check!\033[0m\n"); + else printf("\033[0;31mFail error check!\033[0m\n"); + + // Free allocated data. 
+ free(oriData); + free(decData); + free(cmpBytes); + cudaFree(d_oriData); + cudaFree(d_decData); + cudaFree(d_cmpBytes); + cudaStreamDestroy(stream); + return 0; +} \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/examples/cuSZp_gpu_f64_api.cpp b/qtensor/compression/cuszp/cuSZp/examples/cuSZp_gpu_f64_api.cpp new file mode 100644 index 00000000..3c03df17 --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/examples/cuSZp_gpu_f64_api.cpp @@ -0,0 +1,120 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +int main(int argc, char* argv[]) +{ + // Read input information. + char oriFilePath[640]; + char errorMode[20]; + int status=0; + if(argc != 4) + { + printf("Usage: cuSZp_gpu_f64_api [srcFilePath] [errorMode] [errBound] # errorMode can only be ABS or REL\n"); + printf("Example: cuSZp_gpu_f64_api testdouble_8_8_128.dat ABS 1E-2 # compress dataset with absolute 1E-2 error bound\n"); + printf(" cuSZp_gpu_f64_api testdouble_8_8_128.dat REL 1e-3 # compress dataset with relative 1E-3 error bound\n"); + exit(0); + } + sprintf(oriFilePath, "%s", argv[1]); + sprintf(errorMode, "%s", argv[2]); + double errorBound = atof(argv[3]); + + // For measuring the end-to-end throughput. + TimingGPU timer_GPU; + + // Input data preparation on CPU. + double* oriData = NULL; + double* decData = NULL; + unsigned char* cmpBytes = NULL; + size_t nbEle = 0; + size_t cmpSize = 0; + oriData = readDoubleData_Yafan(oriFilePath, &nbEle, &status); + decData = (double*)malloc(nbEle*sizeof(double)); + cmpBytes = (unsigned char*)malloc(nbEle*sizeof(double)); + + // Generating error bounds. + if(strcmp(errorMode, "REL")==0) + { + double max_val = oriData[0]; + double min_val = oriData[0]; + for(size_t i=0; imax_val) + max_val = oriData[i]; + else if(oriData[i] errorBound*1.1) + { + not_bound++; + // printf("not bound: %zu oriData: %f, decData: %f, errors: %f, bound: %f\n", i, oriData[i], decData[i], abs(oriData[i]-decData[i]), errBound); + } + } + if(!not_bound) printf("\033[0;32mPass error check!\033[0m\n"); + else printf("\033[0;31mFail error check!\033[0m\n"); + + // Free allocated data. 
+ free(oriData); + free(decData); + free(cmpBytes); + cudaFree(d_oriData); + cudaFree(d_decData); + cudaFree(d_cmpBytes); + cudaStreamDestroy(stream); + + return 0; +} \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/include/cuSZp_entry_f32.h b/qtensor/compression/cuszp/cuSZp/include/cuSZp_entry_f32.h new file mode 100644 index 00000000..4300d26c --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/include/cuSZp_entry_f32.h @@ -0,0 +1,11 @@ +#ifndef CUSZP_INCLUDE_CUSZP_ENTRY_F32_H +#define CUSZP_INCLUDE_CUSZP_ENTRY_F32_H + +#include + +void SZp_compress_hostptr_f32(float* oriData, unsigned char* cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound); +void SZp_decompress_hostptr_f32(float* decData, unsigned char* cmpBytes, size_t nbEle, size_t cmpSize, float errorBound); +void SZp_compress_deviceptr_f32(float* d_oriData, unsigned char* d_cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound, cudaStream_t stream = 0); +void SZp_decompress_deviceptr_f32(float* d_decData, unsigned char* d_cmpBytes, size_t nbEle, size_t cmpSize, float errorBound, cudaStream_t stream = 0); + +#endif // CUSZP_INCLUDE_CUSZP_ENTRY_F32_H \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/include/cuSZp_entry_f64.h b/qtensor/compression/cuszp/cuSZp/include/cuSZp_entry_f64.h new file mode 100644 index 00000000..29837263 --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/include/cuSZp_entry_f64.h @@ -0,0 +1,11 @@ +#ifndef CUSZP_INCLUDE_CUSZP_ENTRY_F64_H +#define CUSZP_INCLUDE_CUSZP_ENTRY_F64_H + +#include + +void SZp_compress_hostptr_f64(double* oriData, unsigned char* cmpBytes, size_t nbEle, size_t* cmpSize, double errorBound); +void SZp_decompress_hostptr_f64(double* decData, unsigned char* cmpBytes, size_t nbEle, size_t cmpSize, double errorBound); +void SZp_compress_deviceptr_f64(double* d_oriData, unsigned char* d_cmpBytes, size_t nbEle, size_t* cmpSize, double errorBound, cudaStream_t stream = 0); +void SZp_decompress_deviceptr_f64(double* d_decData, unsigned char* d_cmpBytes, size_t nbEle, size_t cmpSize, double errorBound, cudaStream_t stream = 0); + +#endif // CUSZP_INCLUDE_CUSZP_ENTRY_F64_H \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/include/cuSZp_f32.h b/qtensor/compression/cuszp/cuSZp/include/cuSZp_f32.h new file mode 100644 index 00000000..fa91cf50 --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/include/cuSZp_f32.h @@ -0,0 +1,12 @@ +#ifndef CUSZP_INCLUDE_CUSZP_F32_H +#define CUSZP_INCLUDE_CUSZP_F32_H + +static const int cmp_tblock_size_f32 = 32; +static const int dec_tblock_size_f32 = 32; +static const int cmp_chunk_f32 = 256; +static const int dec_chunk_f32 = 256; + +__global__ void SZp_compress_kernel_f32(const float* const __restrict__ oriData, unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const float eb, const size_t nbEle); +__global__ void SZp_decompress_kernel_f32(float* const __restrict__ decData, const unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const float eb, const size_t nbEle); + +#endif // CUSZP_INCLUDE_CUSZP_F32_H \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/include/cuSZp_f64.h b/qtensor/compression/cuszp/cuSZp/include/cuSZp_f64.h new file mode 100644 index 00000000..c26f5ee6 --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/include/cuSZp_f64.h @@ -0,0 +1,12 @@ +#ifndef CUSZP_INCLUDE_CUSZP_F64_H 
+#define CUSZP_INCLUDE_CUSZP_F64_H + +static const int cmp_tblock_size_f64 = 32; +static const int dec_tblock_size_f64 = 32; +static const int cmp_chunk_f64 = 8192; +static const int dec_chunk_f64 = 8192; + +__global__ void SZp_compress_kernel_f64(const double* const __restrict__ oriData, unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const double eb, const size_t nbEle); +__global__ void SZp_decompress_kernel_f64(double* const __restrict__ decData, const unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const double eb, const size_t nbEle); + +#endif // CUSZP_INCLUDE_CUSZP_F64_H \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/include/cuSZp_timer.h b/qtensor/compression/cuszp/cuSZp/include/cuSZp_timer.h new file mode 100644 index 00000000..faca61c3 --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/include/cuSZp_timer.h @@ -0,0 +1,31 @@ +#ifndef CUSZP_INCLUDE_CUSZP_TIMER_H +#define CUSZP_INCLUDE_CUSZP_TIMER_H + +#include +#include + +struct PrivateTimingGPU { + cudaEvent_t start; + cudaEvent_t stop; +}; + +class TimingGPU +{ + private: + PrivateTimingGPU *privateTimingGPU; + + public: + + TimingGPU(); + + ~TimingGPU(); + + void StartCounter(); + + void StartCounterFlags(); + + float GetCounter(); + +}; + +#endif // CUSZP_INCLUDE_CUSZP_TIMER_H \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/include/cuSZp_utility.h b/qtensor/compression/cuszp/cuSZp/include/cuSZp_utility.h new file mode 100644 index 00000000..1e29f134 --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/include/cuSZp_utility.h @@ -0,0 +1,18 @@ +#ifndef CUSZP_INCLUDE_CUSZP_UTILITY_H +#define CUSZP_INCLUDE_CUSZP_UTILITY_H + +void symTransForm_4Bytes(unsigned char data[4]); +void symTransform_8bytes(unsigned char data[8]); +unsigned char *readByteData_Yafan(char *srcFilePath, size_t *byteLength, int *status); +float *readFloatData_systemEndian_Yafan(char *srcFilePath, size_t *nbEle, int *status); +float *readFloatData_Yafan(char *srcFilePath, size_t *nbEle, int *status); +double *readDoubleData_systemEndian_Yafan(char *srcFilePath, size_t *nbEle, int *status); +double *readDoubleData_Yafan(char *srcFilePath, size_t *nbEle, int *status); +void writeByteData_Yafan(unsigned char *bytes, size_t byteLength, char *tgtFilePath, int *status); +void writeFloatData_inBytes_Yafan(float *data, size_t nbEle, char* tgtFilePath, int *status); +void writeDoubleData_inBytes_Yafan(double *data, size_t nbEle, char* tgtFilePath, int *status); +double SSIM_3d_calcWindow_float(float* data, float* other, size_t size1, size_t size0, int offset0, int offset1, int offset2, int windowSize0, int windowSize1, int windowSize2); +double computeSSIM(float* oriData, float* decData, size_t size2, size_t size1, size_t size0); +double *computePSNR(size_t nbEle, float *ori_data, float *data); + +#endif // CUSZP_INCLUDE_CUSZP_UTILITY_H \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/src/cuSZp_entry_f32.cu b/qtensor/compression/cuszp/cuSZp/src/cuSZp_entry_f32.cu new file mode 100644 index 00000000..6a29939d --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/src/cuSZp_entry_f32.cu @@ -0,0 +1,149 @@ +#include "cuSZp_entry_f32.h" +#include "cuSZp_f32.h" +#include + +void SZp_compress_hostptr_f32(float* oriData, unsigned char* cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound) +{ + // Data blocking. 
+ int bsize = cmp_tblock_size_f32; + int gsize = (nbEle + bsize * cmp_chunk_f32 - 1) / (bsize * cmp_chunk_f32); + int cmpOffSize = gsize + 1; + int pad_nbEle = gsize * bsize * cmp_chunk_f32; + + // Initializing global memory for GPU compression. + float* d_oriData; + unsigned char* d_cmpData; + unsigned int* d_cmpOffset; + int* d_flag; + cudaMalloc((void**)&d_oriData, sizeof(float)*pad_nbEle); + cudaMemcpy(d_oriData, oriData, sizeof(float)*pad_nbEle, cudaMemcpyHostToDevice); + cudaMalloc((void**)&d_cmpData, sizeof(float)*pad_nbEle); + cudaMallocManaged((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); + cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); + cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); + cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); + cudaMemset(d_oriData + nbEle, 0, (pad_nbEle - nbEle) * sizeof(float)); + + // Initializing CUDA Stream. + cudaStream_t stream; + cudaStreamCreate(&stream); + + // cuSZp GPU compression. + dim3 blockSize(bsize); + dim3 gridSize(gsize); + SZp_compress_kernel_f32<<>>(d_oriData, d_cmpData, d_cmpOffset, d_flag, errorBound, nbEle); + cudaDeviceSynchronize(); + + // Obtain compression ratio and move data back to CPU. + *cmpSize = (size_t)d_cmpOffset[cmpOffSize-1] + (nbEle+31)/32; + cudaMemcpy(cmpBytes, d_cmpData, *cmpSize*sizeof(unsigned char), cudaMemcpyDeviceToHost); + + // Free memory that is used. + cudaFree(d_oriData); + cudaFree(d_cmpData); + cudaFree(d_cmpOffset); + cudaFree(d_flag); + cudaStreamDestroy(stream); +} + + +void SZp_decompress_hostptr_f32(float* decData, unsigned char* cmpBytes, size_t nbEle, size_t cmpSize, float errorBound) +{ + // Data blocking. + int bsize = dec_tblock_size_f32; + int gsize = (nbEle + bsize * dec_chunk_f32 - 1) / (bsize * dec_chunk_f32); + int cmpOffSize = gsize + 1; + int pad_nbEle = gsize * bsize * dec_chunk_f32; + + // Initializing global memory for GPU compression. + float* d_decData; + unsigned char* d_cmpData; + unsigned int* d_cmpOffset; + int* d_flag; + cudaMalloc((void**)&d_decData, sizeof(float)*pad_nbEle); + cudaMemset(d_decData, 0, sizeof(float)*pad_nbEle); + cudaMalloc((void**)&d_cmpData, sizeof(float)*pad_nbEle); + cudaMemcpy(d_cmpData, cmpBytes, sizeof(unsigned char)*cmpSize, cudaMemcpyHostToDevice); + cudaMalloc((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); + cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); + cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); + cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); + + // Initializing CUDA Stream. + cudaStream_t stream; + cudaStreamCreate(&stream); + + // cuSZp GPU compression. + dim3 blockSize(bsize); + dim3 gridSize(gsize); + SZp_decompress_kernel_f32<<>>(d_decData, d_cmpData, d_cmpOffset, d_flag, errorBound, nbEle); + cudaDeviceSynchronize(); + + // Move data back to CPU. + cudaMemcpy(decData, d_decData, sizeof(float)*nbEle, cudaMemcpyDeviceToHost); + + // Free memoy that is used. + cudaFree(d_decData); + cudaFree(d_cmpData); + cudaFree(d_cmpOffset); + cudaFree(d_flag); + cudaStreamDestroy(stream); +} + + +void SZp_compress_deviceptr_f32(float* d_oriData, unsigned char* d_cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound, cudaStream_t stream) +{ + int bsize = cmp_tblock_size_f32; + int gsize = (nbEle + bsize * cmp_chunk_f32 - 1) / (bsize * cmp_chunk_f32); + int cmpOffSize = gsize + 1; + int pad_nbEle = gsize * bsize * cmp_chunk_f32; + + // Initializing global memory for GPU compression. 
+ unsigned int* d_cmpOffset; + int* d_flag; + cudaMallocManaged((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); + cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); + cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); + cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); + // cudaMemset(d_oriData + nbEle, 0, (pad_nbEle - nbEle) * sizeof(float)); + + // cuSZp GPU compression. + dim3 blockSize(bsize); + dim3 gridSize(gsize); + SZp_compress_kernel_f32<<>>(d_oriData, d_cmpBytes, d_cmpOffset, d_flag, errorBound, nbEle); + cudaDeviceSynchronize(); + + // Obtain compression ratio and move data back to CPU. + *cmpSize = (size_t)d_cmpOffset[cmpOffSize-1] + (nbEle+31)/32; + + // Free memory that is used. + cudaFree(d_cmpOffset); + cudaFree(d_flag); +} + + +void SZp_decompress_deviceptr_f32(float* d_decData, unsigned char* d_cmpBytes, size_t nbEle, size_t cmpSize, float errorBound, cudaStream_t stream) +{ + // Data blocking. + int bsize = dec_tblock_size_f32; + int gsize = (nbEle + bsize * dec_chunk_f32 - 1) / (bsize * dec_chunk_f32); + int cmpOffSize = gsize + 1; + + // Initializing global memory for GPU compression. + unsigned int* d_cmpOffset; + int* d_flag; + cudaMalloc((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); + cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); + cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); + cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); + + // cuSZp GPU compression. + dim3 blockSize(bsize); + dim3 gridSize(gsize); + SZp_decompress_kernel_f32<<>>(d_decData, d_cmpBytes, d_cmpOffset, d_flag, errorBound, nbEle); + cudaDeviceSynchronize(); + + // Free memoy that is used. + cudaFree(d_cmpOffset); + cudaFree(d_flag); +} \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/src/cuSZp_entry_f64.cu b/qtensor/compression/cuszp/cuSZp/src/cuSZp_entry_f64.cu new file mode 100644 index 00000000..8bd1e76f --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/src/cuSZp_entry_f64.cu @@ -0,0 +1,149 @@ +#include "cuSZp_entry_f64.h" +#include "cuSZp_f64.h" + +void SZp_compress_hostptr_f64(double* oriData, unsigned char* cmpBytes, size_t nbEle, size_t* cmpSize, double errorBound) +{ + // Data blocking. + int bsize = cmp_tblock_size_f64; + int gsize = (nbEle + bsize * cmp_chunk_f64 - 1) / (bsize * cmp_chunk_f64); + int cmpOffSize = gsize + 1; + int pad_nbEle = gsize * bsize * cmp_chunk_f64; + + // Initializing global memory for GPU compression. + double* d_oriData; + unsigned char* d_cmpData; + unsigned int* d_cmpOffset; + int* d_flag; + cudaMalloc((void**)&d_oriData, sizeof(double)*pad_nbEle); + cudaMemcpy(d_oriData, oriData, sizeof(double)*pad_nbEle, cudaMemcpyHostToDevice); + cudaMalloc((void**)&d_cmpData, sizeof(double)*pad_nbEle); + cudaMallocManaged((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); + cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); + cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); + cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); + cudaMemset(d_oriData + nbEle, 0, (pad_nbEle - nbEle) * sizeof(double)); + + // Initializing CUDA Stream. + cudaStream_t stream; + cudaStreamCreate(&stream); + + // cuSZp GPU compression. + dim3 blockSize(bsize); + dim3 gridSize(gsize); + SZp_compress_kernel_f64<<>>(d_oriData, d_cmpData, d_cmpOffset, d_flag, errorBound, nbEle); + cudaDeviceSynchronize(); + + // Obtain compression ratio and move data back to CPU. 
+ *cmpSize = (size_t)d_cmpOffset[cmpOffSize-1] + (nbEle+31)/32; + cudaMemcpy(cmpBytes, d_cmpData, *cmpSize*sizeof(unsigned char), cudaMemcpyDeviceToHost); + + // Free memory that is used. + cudaFree(d_oriData); + cudaFree(d_cmpData); + cudaFree(d_cmpOffset); + cudaFree(d_flag); + cudaStreamDestroy(stream); +} + + +void SZp_decompress_hostptr_f64(double* decData, unsigned char* cmpBytes, size_t nbEle, size_t cmpSize, double errorBound) +{ + // Data blocking. + int bsize = dec_tblock_size_f64; + int gsize = (nbEle + bsize * dec_chunk_f64 - 1) / (bsize * dec_chunk_f64); + int cmpOffSize = gsize + 1; + int pad_nbEle = gsize * bsize * dec_chunk_f64; + + // Initializing global memory for GPU compression. + double* d_decData; + unsigned char* d_cmpData; + unsigned int* d_cmpOffset; + int* d_flag; + cudaMalloc((void**)&d_decData, sizeof(double)*pad_nbEle); + cudaMemset(d_decData, 0, sizeof(double)*pad_nbEle); + cudaMalloc((void**)&d_cmpData, sizeof(double)*pad_nbEle); + cudaMemcpy(d_cmpData, cmpBytes, sizeof(unsigned char)*cmpSize, cudaMemcpyHostToDevice); + cudaMalloc((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); + cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); + cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); + cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); + + // Initializing CUDA Stream. + cudaStream_t stream; + cudaStreamCreate(&stream); + + // cuSZp GPU compression. + dim3 blockSize(bsize); + dim3 gridSize(gsize); + SZp_decompress_kernel_f64<<>>(d_decData, d_cmpData, d_cmpOffset, d_flag, errorBound, nbEle); + cudaDeviceSynchronize(); + + // Move data back to CPU. + cudaMemcpy(decData, d_decData, sizeof(double)*nbEle, cudaMemcpyDeviceToHost); + + // Free memoy that is used. + cudaFree(d_decData); + cudaFree(d_cmpData); + cudaFree(d_cmpOffset); + cudaFree(d_flag); + cudaStreamDestroy(stream); +} + + +void SZp_compress_deviceptr_f64(double* d_oriData, unsigned char* d_cmpBytes, size_t nbEle, size_t* cmpSize, double errorBound, cudaStream_t stream) +{ + // Data blocking. + int bsize = cmp_tblock_size_f64; + int gsize = (nbEle + bsize * cmp_chunk_f64 - 1) / (bsize * cmp_chunk_f64); + int cmpOffSize = gsize + 1; + int pad_nbEle = gsize * bsize * cmp_chunk_f64; + + // Initializing global memory for GPU compression. + unsigned int* d_cmpOffset; + int* d_flag; + cudaMallocManaged((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); + cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); + cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); + cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); + cudaMemset(d_oriData + nbEle, 0, (pad_nbEle - nbEle) * sizeof(double)); + + // cuSZp GPU compression. + dim3 blockSize(bsize); + dim3 gridSize(gsize); + SZp_compress_kernel_f64<<>>(d_oriData, d_cmpBytes, d_cmpOffset, d_flag, errorBound, nbEle); + cudaDeviceSynchronize(); + + // Obtain compression ratio and move data back to CPU. + *cmpSize = (size_t)d_cmpOffset[cmpOffSize-1] + (nbEle+31)/32; + + // Free memory that is used. + cudaFree(d_cmpOffset); + cudaFree(d_flag); +} + + +void SZp_decompress_deviceptr_f64(double* d_decData, unsigned char* d_cmpBytes, size_t nbEle, size_t cmpSize, double errorBound, cudaStream_t stream) +{ + // Data blocking. + int bsize = dec_tblock_size_f64; + int gsize = (nbEle + bsize * dec_chunk_f64 - 1) / (bsize * dec_chunk_f64); + int cmpOffSize = gsize + 1; + + // Initializing global memory for GPU compression. 
+ unsigned int* d_cmpOffset; + int* d_flag; + cudaMalloc((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); + cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); + cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); + cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); + + // cuSZp GPU compression. + dim3 blockSize(bsize); + dim3 gridSize(gsize); + SZp_decompress_kernel_f64<<>>(d_decData, d_cmpBytes, d_cmpOffset, d_flag, errorBound, nbEle); + cudaDeviceSynchronize(); + + // Free memoy that is used. + cudaFree(d_cmpOffset); + cudaFree(d_flag); +} \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/src/cuSZp_f32.cu b/qtensor/compression/cuszp/cuSZp/src/cuSZp_f32.cu new file mode 100644 index 00000000..90c2c45d --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/src/cuSZp_f32.cu @@ -0,0 +1,335 @@ +#include "cuSZp_f32.h" + +__device__ inline int quantization_f32(float data, float recipPrecision) +{ + float dataRecip = data*recipPrecision; + int s = dataRecip>=-0.5f?0:1; + return (int)(dataRecip+0.5f) - s; +} + + +__device__ inline int get_bit_num(unsigned int x) +{ + return (sizeof(unsigned int)*8) - __clz(x); +} + + +__global__ void SZp_compress_kernel_f32(const float* const __restrict__ oriData, unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const float eb, const size_t nbEle) +{ + __shared__ unsigned int base_idx; + + const int tid = threadIdx.x; + const int idx = blockIdx.x * blockDim.x + tid; + const int lane = idx & 31; + const int warp = idx >> 5; + const int block_num = cmp_chunk_f32/32; + const int start_idx = idx * cmp_chunk_f32; + const int start_block_idx = start_idx/32; + const int rate_ofs = (nbEle+31)/32; + const float recipPrecision = 0.5f/eb; + + int temp_start_idx, temp_end_idx; + int quant_chunk_idx; + int block_idx; + int currQuant, lorenQuant, prevQuant, maxQuant; + int absQuant[cmp_chunk_f32]; + unsigned int sign_flag[block_num]; + int sign_ofs; + int fixed_rate[block_num]; + unsigned int thread_ofs = 0; + + for(int j=0; j nbEle ? 0 : quantization_f32(oriData[i], recipPrecision); + lorenQuant = currQuant - prevQuant; + prevQuant = currQuant; + sign_ofs = i % 32; + sign_flag[j] |= (lorenQuant < 0) << (31 - sign_ofs); + absQuant[quant_chunk_idx] = abs(lorenQuant); + maxQuant = maxQuant > absQuant[quant_chunk_idx] ? maxQuant : absQuant[quant_chunk_idx]; + } + + fixed_rate[j] = get_bit_num(maxQuant); + thread_ofs += (fixed_rate[j]) ? 
(32+fixed_rate[j]*32) : 0; + if(block_idx= i) thread_ofs += tmp; + } + __syncthreads(); + + if(lane==31) + { + cmpOffset[warp+1] = (thread_ofs+7)/8; + __threadfence(); + if(warp==0) + { + flag[1] = 2; + __threadfence(); + } + else + { + flag[warp+1] = 1; + __threadfence(); + } + } + __syncthreads(); + + if(warp>0) + { + if(!lane) + { + int temp_flag = 1; + while(temp_flag!=2) temp_flag = flag[warp]; + __threadfence(); + cmpOffset[warp] += cmpOffset[warp-1]; + if(warp==gridDim.x-1) cmpOffset[warp+1] += cmpOffset[warp]; + __threadfence(); + flag[warp+1] = 2; + } + + } + __syncthreads(); + + if(!lane) base_idx = cmpOffset[warp] + rate_ofs; + __syncthreads(); + + unsigned int prev_thread = __shfl_up_sync(0xffffffff, thread_ofs, 1); + unsigned int cmp_byte_ofs; + if(!lane) cmp_byte_ofs = base_idx; + else cmp_byte_ofs = base_idx + prev_thread / 8; + + for(int j=0; j> 24); + cmpData[cmp_byte_ofs++] = 0xff & (sign_flag[j] >> 16); + cmpData[cmp_byte_ofs++] = 0xff & (sign_flag[j] >> 8); + cmpData[cmp_byte_ofs++] = 0xff & sign_flag[j]; + + unsigned char tmp_char0, tmp_char1, tmp_char2, tmp_char3; + int mask = 1; + for(int i=0; i> i) << 7) | + (((absQuant[chunk_idx_start+1] & mask) >> i) << 6) | + (((absQuant[chunk_idx_start+2] & mask) >> i) << 5) | + (((absQuant[chunk_idx_start+3] & mask) >> i) << 4) | + (((absQuant[chunk_idx_start+4] & mask) >> i) << 3) | + (((absQuant[chunk_idx_start+5] & mask) >> i) << 2) | + (((absQuant[chunk_idx_start+6] & mask) >> i) << 1) | + (((absQuant[chunk_idx_start+7] & mask) >> i) << 0); + + tmp_char1 = (((absQuant[chunk_idx_start+8] & mask) >> i) << 7) | + (((absQuant[chunk_idx_start+9] & mask) >> i) << 6) | + (((absQuant[chunk_idx_start+10] & mask) >> i) << 5) | + (((absQuant[chunk_idx_start+11] & mask) >> i) << 4) | + (((absQuant[chunk_idx_start+12] & mask) >> i) << 3) | + (((absQuant[chunk_idx_start+13] & mask) >> i) << 2) | + (((absQuant[chunk_idx_start+14] & mask) >> i) << 1) | + (((absQuant[chunk_idx_start+15] & mask) >> i) << 0); + + tmp_char2 = (((absQuant[chunk_idx_start+16] & mask) >> i) << 7) | + (((absQuant[chunk_idx_start+17] & mask) >> i) << 6) | + (((absQuant[chunk_idx_start+18] & mask) >> i) << 5) | + (((absQuant[chunk_idx_start+19] & mask) >> i) << 4) | + (((absQuant[chunk_idx_start+20] & mask) >> i) << 3) | + (((absQuant[chunk_idx_start+21] & mask) >> i) << 2) | + (((absQuant[chunk_idx_start+22] & mask) >> i) << 1) | + (((absQuant[chunk_idx_start+23] & mask) >> i) << 0); + + tmp_char3 = (((absQuant[chunk_idx_start+24] & mask) >> i) << 7) | + (((absQuant[chunk_idx_start+25] & mask) >> i) << 6) | + (((absQuant[chunk_idx_start+26] & mask) >> i) << 5) | + (((absQuant[chunk_idx_start+27] & mask) >> i) << 4) | + (((absQuant[chunk_idx_start+28] & mask) >> i) << 3) | + (((absQuant[chunk_idx_start+29] & mask) >> i) << 2) | + (((absQuant[chunk_idx_start+30] & mask) >> i) << 1) | + (((absQuant[chunk_idx_start+31] & mask) >> i) << 0); + + // Move data to global memory. 
+ cmpData[cmp_byte_ofs++] = tmp_char0; + cmpData[cmp_byte_ofs++] = tmp_char1; + cmpData[cmp_byte_ofs++] = tmp_char2; + cmpData[cmp_byte_ofs++] = tmp_char3; + mask <<= 1; + } + } + } +} + + +__global__ void SZp_decompress_kernel_f32(float* const __restrict__ decData, const unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const float eb, const size_t nbEle) +{ + __shared__ unsigned int base_idx; + + const int tid = threadIdx.x; + const int idx = blockIdx.x * blockDim.x + tid; + const int lane = idx & 31; + const int warp = idx >> 5; + const int block_num = dec_chunk_f32/32; + const int start_idx = idx * dec_chunk_f32; + const int start_block_idx = start_idx/32; + const int rate_ofs = (nbEle+31)/32; + + int temp_start_idx; + int block_idx; + int absQuant[32]; + int currQuant, lorenQuant, prevQuant; + int sign_ofs; + int fixed_rate[block_num]; + unsigned int thread_ofs = 0; + + for(int j=0; j= i) thread_ofs += tmp; + } + __syncthreads(); + + if(lane==31) + { + cmpOffset[warp+1] = (thread_ofs+7)/8; + __threadfence(); + if(warp==0) + { + flag[1] = 2; + __threadfence(); + } + else + { + flag[warp+1] = 1; + __threadfence(); + } + } + __syncthreads(); + + if(warp>0) + { + if(!lane) + { + int temp_flag = 1; + while(temp_flag!=2) temp_flag = flag[warp]; + __threadfence(); + cmpOffset[warp] += cmpOffset[warp-1]; + __threadfence(); + flag[warp+1] = 2; + } + } + __syncthreads(); + + if(!lane) base_idx = cmpOffset[warp] + rate_ofs; + __syncthreads(); + + unsigned int prev_thread = __shfl_up_sync(0xffffffff, thread_ofs, 1); + unsigned int cmp_byte_ofs; + if(!lane) cmp_byte_ofs = base_idx; + else cmp_byte_ofs = base_idx + prev_thread / 8; + + for(int j=0; j> 7) & 0x00000001) << i; + absQuant[1] |= ((tmp_char0 >> 6) & 0x00000001) << i; + absQuant[2] |= ((tmp_char0 >> 5) & 0x00000001) << i; + absQuant[3] |= ((tmp_char0 >> 4) & 0x00000001) << i; + absQuant[4] |= ((tmp_char0 >> 3) & 0x00000001) << i; + absQuant[5] |= ((tmp_char0 >> 2) & 0x00000001) << i; + absQuant[6] |= ((tmp_char0 >> 1) & 0x00000001) << i; + absQuant[7] |= ((tmp_char0 >> 0) & 0x00000001) << i; + + absQuant[8] |= ((tmp_char1 >> 7) & 0x00000001) << i; + absQuant[9] |= ((tmp_char1 >> 6) & 0x00000001) << i; + absQuant[10] |= ((tmp_char1 >> 5) & 0x00000001) << i; + absQuant[11] |= ((tmp_char1 >> 4) & 0x00000001) << i; + absQuant[12] |= ((tmp_char1 >> 3) & 0x00000001) << i; + absQuant[13] |= ((tmp_char1 >> 2) & 0x00000001) << i; + absQuant[14] |= ((tmp_char1 >> 1) & 0x00000001) << i; + absQuant[15] |= ((tmp_char1 >> 0) & 0x00000001) << i; + + absQuant[16] |= ((tmp_char2 >> 7) & 0x00000001) << i; + absQuant[17] |= ((tmp_char2 >> 6) & 0x00000001) << i; + absQuant[18] |= ((tmp_char2 >> 5) & 0x00000001) << i; + absQuant[19] |= ((tmp_char2 >> 4) & 0x00000001) << i; + absQuant[20] |= ((tmp_char2 >> 3) & 0x00000001) << i; + absQuant[21] |= ((tmp_char2 >> 2) & 0x00000001) << i; + absQuant[22] |= ((tmp_char2 >> 1) & 0x00000001) << i; + absQuant[23] |= ((tmp_char2 >> 0) & 0x00000001) << i; + + absQuant[24] |= ((tmp_char3 >> 7) & 0x00000001) << i; + absQuant[25] |= ((tmp_char3 >> 6) & 0x00000001) << i; + absQuant[26] |= ((tmp_char3 >> 5) & 0x00000001) << i; + absQuant[27] |= ((tmp_char3 >> 4) & 0x00000001) << i; + absQuant[28] |= ((tmp_char3 >> 3) & 0x00000001) << i; + absQuant[29] |= ((tmp_char3 >> 2) & 0x00000001) << i; + absQuant[30] |= ((tmp_char3 >> 1) & 0x00000001) << i; + absQuant[31] |= ((tmp_char3 >> 0) & 0x00000001) << i; + } + prevQuant = 0; + for(int i=0; i<32; 
i++) + { + sign_ofs = i % 32; + if(sign_flag & (1 << (31 - sign_ofs))) + lorenQuant = absQuant[i] * -1; + else + lorenQuant = absQuant[i]; + currQuant = lorenQuant + prevQuant; + if(temp_start_idx+i < nbEle){ + decData[temp_start_idx+i] = currQuant * eb * 2; + } + prevQuant = currQuant; + } + } + } +} \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/src/cuSZp_f64.cu b/qtensor/compression/cuszp/cuSZp/src/cuSZp_f64.cu new file mode 100644 index 00000000..c92dacba --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/src/cuSZp_f64.cu @@ -0,0 +1,333 @@ +#include "cuSZp_f64.h" + +__device__ inline int quantization_f64(double data, double recipPrecision) +{ + double dataRecip = data*recipPrecision; + int s = dataRecip>=-0.5?0:1; + return (int)(dataRecip+0.5) - s; +} + + +__device__ inline int get_bit_num(unsigned int x) +{ + return (sizeof(unsigned int)*8) - __clz(x); +} + + +__global__ void SZp_compress_kernel_f64(const double* const __restrict__ oriData, unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const double eb, const size_t nbEle) +{ + __shared__ unsigned int base_idx; + + const int tid = threadIdx.x; + const int idx = blockIdx.x * blockDim.x + tid; + const int lane = idx & 31; + const int warp = idx >> 5; + const int block_num = cmp_chunk_f64/32; + const int start_idx = idx * cmp_chunk_f64; + const int start_block_idx = start_idx/32; + const int rate_ofs = (nbEle+31)/32; + const double recipPrecision = 0.5/eb; + + int temp_start_idx, temp_end_idx; + int quant_chunk_idx; + int block_idx; + int currQuant, lorenQuant, prevQuant, maxQuant; + int absQuant[cmp_chunk_f64]; + unsigned int sign_flag[block_num]; + int sign_ofs; + int fixed_rate[block_num]; + unsigned int thread_ofs = 0; + + for(int j=0; j absQuant[quant_chunk_idx] ? maxQuant : absQuant[quant_chunk_idx]; + } + + fixed_rate[j] = get_bit_num(maxQuant); + thread_ofs += (fixed_rate[j]) ? 
(32+fixed_rate[j]*32) : 0; + if(block_idx= i) thread_ofs += tmp; + } + __syncthreads(); + + if(lane==31) + { + cmpOffset[warp+1] = (thread_ofs+7)/8; + __threadfence(); + if(warp==0) + { + flag[1] = 2; + __threadfence(); + } + else + { + flag[warp+1] = 1; + __threadfence(); + } + } + __syncthreads(); + + if(warp>0) + { + if(!lane) + { + int temp_flag = 1; + while(temp_flag!=2) temp_flag = flag[warp]; + __threadfence(); + cmpOffset[warp] += cmpOffset[warp-1]; + if(warp==gridDim.x-1) cmpOffset[warp+1] += cmpOffset[warp]; + __threadfence(); + flag[warp+1] = 2; + } + + } + __syncthreads(); + + if(!lane) base_idx = cmpOffset[warp] + rate_ofs; + __syncthreads(); + + unsigned int prev_thread = __shfl_up_sync(0xffffffff, thread_ofs, 1); + unsigned int cmp_byte_ofs; + if(!lane) cmp_byte_ofs = base_idx; + else cmp_byte_ofs = base_idx + prev_thread / 8; + + for(int j=0; j> 24); + cmpData[cmp_byte_ofs++] = 0xff & (sign_flag[j] >> 16); + cmpData[cmp_byte_ofs++] = 0xff & (sign_flag[j] >> 8); + cmpData[cmp_byte_ofs++] = 0xff & sign_flag[j]; + + unsigned char tmp_char0, tmp_char1, tmp_char2, tmp_char3; + int mask = 1; + for(int i=0; i> i) << 7) | + (((absQuant[chunk_idx_start+1] & mask) >> i) << 6) | + (((absQuant[chunk_idx_start+2] & mask) >> i) << 5) | + (((absQuant[chunk_idx_start+3] & mask) >> i) << 4) | + (((absQuant[chunk_idx_start+4] & mask) >> i) << 3) | + (((absQuant[chunk_idx_start+5] & mask) >> i) << 2) | + (((absQuant[chunk_idx_start+6] & mask) >> i) << 1) | + (((absQuant[chunk_idx_start+7] & mask) >> i) << 0); + + tmp_char1 = (((absQuant[chunk_idx_start+8] & mask) >> i) << 7) | + (((absQuant[chunk_idx_start+9] & mask) >> i) << 6) | + (((absQuant[chunk_idx_start+10] & mask) >> i) << 5) | + (((absQuant[chunk_idx_start+11] & mask) >> i) << 4) | + (((absQuant[chunk_idx_start+12] & mask) >> i) << 3) | + (((absQuant[chunk_idx_start+13] & mask) >> i) << 2) | + (((absQuant[chunk_idx_start+14] & mask) >> i) << 1) | + (((absQuant[chunk_idx_start+15] & mask) >> i) << 0); + + tmp_char2 = (((absQuant[chunk_idx_start+16] & mask) >> i) << 7) | + (((absQuant[chunk_idx_start+17] & mask) >> i) << 6) | + (((absQuant[chunk_idx_start+18] & mask) >> i) << 5) | + (((absQuant[chunk_idx_start+19] & mask) >> i) << 4) | + (((absQuant[chunk_idx_start+20] & mask) >> i) << 3) | + (((absQuant[chunk_idx_start+21] & mask) >> i) << 2) | + (((absQuant[chunk_idx_start+22] & mask) >> i) << 1) | + (((absQuant[chunk_idx_start+23] & mask) >> i) << 0); + + tmp_char3 = (((absQuant[chunk_idx_start+24] & mask) >> i) << 7) | + (((absQuant[chunk_idx_start+25] & mask) >> i) << 6) | + (((absQuant[chunk_idx_start+26] & mask) >> i) << 5) | + (((absQuant[chunk_idx_start+27] & mask) >> i) << 4) | + (((absQuant[chunk_idx_start+28] & mask) >> i) << 3) | + (((absQuant[chunk_idx_start+29] & mask) >> i) << 2) | + (((absQuant[chunk_idx_start+30] & mask) >> i) << 1) | + (((absQuant[chunk_idx_start+31] & mask) >> i) << 0); + + // Move data to global memory. 
+ cmpData[cmp_byte_ofs++] = tmp_char0; + cmpData[cmp_byte_ofs++] = tmp_char1; + cmpData[cmp_byte_ofs++] = tmp_char2; + cmpData[cmp_byte_ofs++] = tmp_char3; + mask <<= 1; + } + } + } +} + + +__global__ void SZp_decompress_kernel_f64(double* const __restrict__ decData, const unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const double eb, const size_t nbEle) +{ + __shared__ unsigned int base_idx; + + const int tid = threadIdx.x; + const int idx = blockIdx.x * blockDim.x + tid; + const int lane = idx & 31; + const int warp = idx >> 5; + const int block_num = dec_chunk_f64/32; + const int start_idx = idx * dec_chunk_f64; + const int start_block_idx = start_idx/32; + const int rate_ofs = (nbEle+31)/32; + + int temp_start_idx; + int block_idx; + int absQuant[32]; + int currQuant, lorenQuant, prevQuant; + int sign_ofs; + int fixed_rate[block_num]; + unsigned int thread_ofs = 0; + + for(int j=0; j= i) thread_ofs += tmp; + } + __syncthreads(); + + if(lane==31) + { + cmpOffset[warp+1] = (thread_ofs+7)/8; + __threadfence(); + if(warp==0) + { + flag[1] = 2; + __threadfence(); + } + else + { + flag[warp+1] = 1; + __threadfence(); + } + } + __syncthreads(); + + if(warp>0) + { + if(!lane) + { + int temp_flag = 1; + while(temp_flag!=2) temp_flag = flag[warp]; + __threadfence(); + cmpOffset[warp] += cmpOffset[warp-1]; + __threadfence(); + flag[warp+1] = 2; + } + } + __syncthreads(); + + if(!lane) base_idx = cmpOffset[warp] + rate_ofs; + __syncthreads(); + + unsigned int prev_thread = __shfl_up_sync(0xffffffff, thread_ofs, 1); + unsigned int cmp_byte_ofs; + if(!lane) cmp_byte_ofs = base_idx; + else cmp_byte_ofs = base_idx + prev_thread / 8; + + for(int j=0; j> 7) & 0x00000001) << i; + absQuant[1] |= ((tmp_char0 >> 6) & 0x00000001) << i; + absQuant[2] |= ((tmp_char0 >> 5) & 0x00000001) << i; + absQuant[3] |= ((tmp_char0 >> 4) & 0x00000001) << i; + absQuant[4] |= ((tmp_char0 >> 3) & 0x00000001) << i; + absQuant[5] |= ((tmp_char0 >> 2) & 0x00000001) << i; + absQuant[6] |= ((tmp_char0 >> 1) & 0x00000001) << i; + absQuant[7] |= ((tmp_char0 >> 0) & 0x00000001) << i; + + absQuant[8] |= ((tmp_char1 >> 7) & 0x00000001) << i; + absQuant[9] |= ((tmp_char1 >> 6) & 0x00000001) << i; + absQuant[10] |= ((tmp_char1 >> 5) & 0x00000001) << i; + absQuant[11] |= ((tmp_char1 >> 4) & 0x00000001) << i; + absQuant[12] |= ((tmp_char1 >> 3) & 0x00000001) << i; + absQuant[13] |= ((tmp_char1 >> 2) & 0x00000001) << i; + absQuant[14] |= ((tmp_char1 >> 1) & 0x00000001) << i; + absQuant[15] |= ((tmp_char1 >> 0) & 0x00000001) << i; + + absQuant[16] |= ((tmp_char2 >> 7) & 0x00000001) << i; + absQuant[17] |= ((tmp_char2 >> 6) & 0x00000001) << i; + absQuant[18] |= ((tmp_char2 >> 5) & 0x00000001) << i; + absQuant[19] |= ((tmp_char2 >> 4) & 0x00000001) << i; + absQuant[20] |= ((tmp_char2 >> 3) & 0x00000001) << i; + absQuant[21] |= ((tmp_char2 >> 2) & 0x00000001) << i; + absQuant[22] |= ((tmp_char2 >> 1) & 0x00000001) << i; + absQuant[23] |= ((tmp_char2 >> 0) & 0x00000001) << i; + + absQuant[24] |= ((tmp_char3 >> 7) & 0x00000001) << i; + absQuant[25] |= ((tmp_char3 >> 6) & 0x00000001) << i; + absQuant[26] |= ((tmp_char3 >> 5) & 0x00000001) << i; + absQuant[27] |= ((tmp_char3 >> 4) & 0x00000001) << i; + absQuant[28] |= ((tmp_char3 >> 3) & 0x00000001) << i; + absQuant[29] |= ((tmp_char3 >> 2) & 0x00000001) << i; + absQuant[30] |= ((tmp_char3 >> 1) & 0x00000001) << i; + absQuant[31] |= ((tmp_char3 >> 0) & 0x00000001) << i; + } + prevQuant = 0; + for(int i=0; 
i<32; i++) + { + sign_ofs = i % 32; + if(sign_flag & (1 << (31 - sign_ofs))) + lorenQuant = absQuant[i] * -1; + else + lorenQuant = absQuant[i]; + currQuant = lorenQuant + prevQuant; + decData[temp_start_idx+i] = currQuant * eb * 2; + prevQuant = currQuant; + } + } + } +} diff --git a/qtensor/compression/cuszp/cuSZp/src/cuSZp_timer.cu b/qtensor/compression/cuszp/cuSZp/src/cuSZp_timer.cu new file mode 100644 index 00000000..74c81c30 --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/src/cuSZp_timer.cu @@ -0,0 +1,31 @@ +#include "cuSZp_timer.h" + +TimingGPU::TimingGPU() { privateTimingGPU = new PrivateTimingGPU; } + +TimingGPU::~TimingGPU() { } + +void TimingGPU::StartCounter() +{ + cudaEventCreate(&((*privateTimingGPU).start)); + cudaEventCreate(&((*privateTimingGPU).stop)); + cudaEventRecord((*privateTimingGPU).start,0); +} + +void TimingGPU::StartCounterFlags() +{ + int eventflags = cudaEventBlockingSync; + + cudaEventCreateWithFlags(&((*privateTimingGPU).start),eventflags); + cudaEventCreateWithFlags(&((*privateTimingGPU).stop),eventflags); + cudaEventRecord((*privateTimingGPU).start,0); +} + +// Gets the counter in ms +float TimingGPU::GetCounter() +{ + float time; + cudaEventRecord((*privateTimingGPU).stop, 0); + cudaEventSynchronize((*privateTimingGPU).stop); + cudaEventElapsedTime(&time,(*privateTimingGPU).start,(*privateTimingGPU).stop); + return time; +} diff --git a/qtensor/compression/cuszp/cuSZp/src/cuSZp_utility.cu b/qtensor/compression/cuszp/cuSZp/src/cuSZp_utility.cu new file mode 100644 index 00000000..077951f8 --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/src/cuSZp_utility.cu @@ -0,0 +1,614 @@ +// +// Created by Yafan Huang on 5/31/22. +// Copied from SZ2, QCAT, and SZx. +// +#include +#include +#include +#include +#include +#include "cuSZp_utility.h" + +/*Macro Definition for Processing Data*/ +#define SZ_SCES 0 //successful +#define SZ_NSCS -1 //Not successful +#define SZ_FERR -2 //Failed to open input file +#define SZ_TERR -3 //wrong data type (should be only float or double) +#define RW_SCES 0 +#define RW_FERR 1 +#define RW_TERR 2 +#define LITTLE_ENDIAN_SYSTEM 0 +#define QCAT_BUFS 64 + + +/*Global Varaibles for Processing Data*/ +int dataEndianType_Yafan = 0; +int sysEndianType_Yafan = 0; //0 means little endian, 1 means big endian + + +typedef union llfloat +{ + float value; + unsigned int ivalue; + unsigned char byte[4]; +} llfloat; + + +typedef union lldouble +{ + double value; + uint64_t lvalue; + unsigned char byte[8]; +} lldouble; + + +/** ************************************************************************ + * @brief Reverse 4-bit-length unsigned char array. + * + * @param data[4] 4-bit-length unsigned char array. + * *********************************************************************** */ +void symTransForm_4Bytes(unsigned char data[4]) +{ + unsigned char tmp = data[0]; + data[0] = data[3]; + data[3] = tmp; + + tmp = data[1]; + data[1] = data[2]; + data[2] = tmp; +} + + +/** ************************************************************************ + * @brief Reverse 8-bit-length unsigned char array. + * + * @param data[8] 8-bit-length unsigned char array. 
+ * *********************************************************************** */ +void symTransform_8bytes(unsigned char data[8]) +{ + unsigned char tmp = data[0]; + data[0] = data[7]; + data[7] = tmp; + + tmp = data[1]; + data[1] = data[6]; + data[6] = tmp; + + tmp = data[2]; + data[2] = data[5]; + data[5] = tmp; + + tmp = data[3]; + data[3] = data[4]; + data[4] = tmp; +} + + +/** ************************************************************************ + * @brief Read byte data from path to source binary format file. + * Usually used for decompressing data from input file. + * Variables byteLength and status can be obtained through this function. + * + * @param srcFilePath input source file path + * @param byteLength the length of byte array + * @param status data processing states (macro definitions) + * + * @return byteBuf unsigned char array with length byteLength + * *********************************************************************** */ +unsigned char *readByteData_Yafan(char *srcFilePath, size_t *byteLength, int *status) +{ + FILE *pFile = fopen(srcFilePath, "rb"); + if (pFile == NULL) + { + printf("Failed to open input file. 1\n"); + *status = RW_FERR; + return 0; + } + fseek(pFile, 0, SEEK_END); + *byteLength = ftell(pFile); + fclose(pFile); + + unsigned char *byteBuf = ( unsigned char *)malloc((*byteLength)*sizeof(unsigned char)); //sizeof(char)==1 + + pFile = fopen(srcFilePath, "rb"); + if (pFile == NULL) + { + printf("Failed to open input file. 2\n"); + *status = RW_FERR; + return 0; + } + fread(byteBuf, 1, *byteLength, pFile); + fclose(pFile); + *status = RW_SCES; + return byteBuf; +} + + +/** ************************************************************************ + * @brief Read float data from path to source binary format file in endian systems. + * Usually used for compressing data from input file. + * Variables nbEle and status can be obtained through this function. + * + * @param srcFilePath input source file path + * @param nbEle the length of float array + * @param status data processing states (macro definitions) + * + * @return daBuf float array with length nbEle + * *********************************************************************** */ +float *readFloatData_systemEndian_Yafan(char *srcFilePath, size_t *nbEle, int *status) +{ + size_t inSize; + FILE *pFile = fopen(srcFilePath, "rb"); + if (pFile == NULL) + { + printf("Failed to open input file. 1\n"); + *status = RW_FERR; + return NULL; + } + fseek(pFile, 0, SEEK_END); + inSize = ftell(pFile); + *nbEle = inSize/4; + fclose(pFile); + + if(inSize<=0) + { + printf("Error: input file is wrong!\n"); + *status = RW_FERR; + } + + float *daBuf = (float *)malloc(inSize); + + pFile = fopen(srcFilePath, "rb"); + if (pFile == NULL) + { + printf("Failed to open input file. 2\n"); + *status = RW_FERR; + return NULL; + } + fread(daBuf, 4, *nbEle, pFile); + fclose(pFile); + *status = RW_SCES; + return daBuf; +} + + +/** ************************************************************************ + * @brief Read float data from path to source binary format file. + * Usually used for compressing data from input file. + * Variables nbEle and status can be obtained through this function. 
+ * + * @param srcFilePath input source file path + * @param nbEle the length of float array + * @param status data processing states (macro definitions) + * + * @return daBuf float array with length nbEle + * *********************************************************************** */ +float *readFloatData_Yafan(char *srcFilePath, size_t *nbEle, int *status) +{ + int state = RW_SCES; + if(dataEndianType_Yafan==sysEndianType_Yafan) + { + float *daBuf = readFloatData_systemEndian_Yafan(srcFilePath, nbEle, &state); + *status = state; + return daBuf; + } + else + { + size_t i,j; + + size_t byteLength; + unsigned char* bytes = readByteData_Yafan(srcFilePath, &byteLength, &state); + if(state == RW_FERR) + { + *status = RW_FERR; + return NULL; + } + float *daBuf = (float *)malloc(byteLength); + *nbEle = byteLength/4; + + llfloat buf; + for(i = 0;i<*nbEle;i++) + { + j = i*4; + memcpy(buf.byte, bytes+j, 4); + symTransForm_4Bytes(buf.byte); + daBuf[i] = buf.value; + } + free(bytes); + return daBuf; + } +} + +/** ************************************************************************ + * @brief Read double data from path to source binary format file in endian systems. + * Usually used for compressing data from input file. + * Variables nbEle and status can be obtained through this function. + * + * @param srcFilePath input source file path + * @param nbEle the length of double array + * @param status data processing states (macro definitions) + * + * @return daBuf double array with length nbEle + * *********************************************************************** */ +double *readDoubleData_systemEndian_Yafan(char *srcFilePath, size_t *nbEle, int *status) +{ + size_t inSize; + FILE *pFile = fopen(srcFilePath, "rb"); + if (pFile == NULL) + { + printf("Failed to open input file. 1\n"); + *status = SZ_FERR; + return NULL; + } + fseek(pFile, 0, SEEK_END); + inSize = ftell(pFile); + *nbEle = inSize/8; //only support double in this version + fclose(pFile); + + double *daBuf = (double *)malloc(inSize); + + pFile = fopen(srcFilePath, "rb"); + if (pFile == NULL) + { + printf("Failed to open input file. 2\n"); + *status = SZ_FERR; + return NULL; + } + fread(daBuf, 8, *nbEle, pFile); + fclose(pFile); + *status = SZ_SCES; + return daBuf; +} + + +/** ************************************************************************ + * @brief Read double data from path to source binary format file. + * Usually used for compressing data from input file. + * Variables nbEle and status can be obtained through this function. 
+ * + * @param srcFilePath input source file path + * @param nbEle the length of double array + * @param status data processing states (macro definitions) + * + * @return daBuf double array with length nbEle + * *********************************************************************** */ +double *readDoubleData_Yafan(char *srcFilePath, size_t *nbEle, int *status) +{ + int state = SZ_SCES; + if(dataEndianType_Yafan==sysEndianType_Yafan) + { + double *daBuf = readDoubleData_systemEndian_Yafan(srcFilePath, nbEle,&state); + *status = state; + return daBuf; + } + else + { + size_t i,j; + + size_t byteLength; + unsigned char* bytes = readByteData_Yafan(srcFilePath, &byteLength, &state); + if(state==SZ_FERR) + { + *status = SZ_FERR; + return NULL; + } + double *daBuf = (double *)malloc(byteLength); + *nbEle = byteLength/8; + + lldouble buf; + for(i = 0;i<*nbEle;i++) + { + j = i*8; + memcpy(buf.byte, bytes+j, 8); + symTransform_8bytes(buf.byte); + daBuf[i] = buf.value; + } + free(bytes); + return daBuf; + } +} + + +/** ************************************************************************ + * @brief Write byte data to binary format file. + * Usually used for writing compressed data. + * Variable status can be obtained/switched through this function. + * + * @param bytes unsigned char array (compressed data) + * @param byteLength the length of unsigned char array + * @param tgtFilePath output file path + * @param status data processing states (macro definitions) + * *********************************************************************** */ +void writeByteData_Yafan(unsigned char *bytes, size_t byteLength, char *tgtFilePath, int *status) +{ + FILE *pFile = fopen(tgtFilePath, "wb"); + if (pFile == NULL) + { + printf("Failed to open input file. 3\n"); + *status = RW_FERR; + return; + } + + fwrite(bytes, 1, byteLength, pFile); //write outSize bytes + fclose(pFile); + *status = RW_SCES; +} + + +/** ************************************************************************ + * @brief Write float data to binary format file. + * Usually used for writing decompressed (reconstructed) data. + * Variable status can be obtained/switched through this function. 
+ * + * @param bytes unsigned char array (compressed data) + * @param nbEle the length of float array + * @param tgtFilePath output file path + * @param status data processing states (macro definitions) + * *********************************************************************** */ +void writeFloatData_inBytes_Yafan(float *data, size_t nbEle, char* tgtFilePath, int *status) +{ + size_t i = 0; + int state = RW_SCES; + llfloat buf; + unsigned char* bytes = (unsigned char*)malloc(nbEle*sizeof(float)); + for(i=0;idata[index]) + xMin=data[index]; + if(xMaxother[index]) + yMin=other[index]; + if(yMaxsize0) { + printf("ERROR: windowSize0 = %d > %zu\n", windowSize0, size0); + } + if(windowSize1>size1) { + printf("ERROR: windowSize1 = %d > %zu\n", windowSize1, size1); + } + if(windowSize2>size2) { + printf("ERROR: windowSize2 = %d > %zu\n", windowSize2, size2); + } + //offsetInc0=windowSize0/2; + //offsetInc1=windowSize1/2; + //offsetInc2=windowSize2/2; + offsetInc0=windowShift0; + offsetInc1=windowShift1; + offsetInc2=windowShift2; + for(offset2=0; offset2+windowSize2<=size2; offset2+=offsetInc2) { //MOVING WINDOW + for(offset1=0; offset1+windowSize1<=size1; offset1+=offsetInc1) { //MOVING WINDOW + for(offset0=0; offset0+windowSize0<=size0; offset0+=offsetInc0) { //MOVING WINDOW + nw++; + ssimSum+=SSIM_3d_calcWindow_float(oriData, decData, size1, size0, offset0, offset1, offset2, windowSize0, windowSize1, windowSize2); + } + } + } + return ssimSum/nw; +} + +/** ************************************************************************ + * @brief Calculate PSNR between 3D original and decompressed (reconstructed) data. + * API for computing PSNR. + * + * @param nbEle the length of float array + * @param ori_data original float array + * @param dec_data decompressed (reconstructed) float array + * + * @return result 6-length double array, which contains: + * 0. *Mean Square Error (MSE)* + * 1. *Value Range (Max-Min)* + * 2. *Peak Signal-to-noise Ratio (PSNR)* + * 3. Squared Error + * 4. Normalized Squared Error + * 5. Normalized Squared MSE + * *********************************************************************** */ +double *computePSNR(size_t nbEle, float *ori_data, float *data) { + size_t i = 0; + double Max = 0, Min = 0, diffMax = 0; + Max = ori_data[0]; + Min = ori_data[0]; + diffMax = data[0] > ori_data[0] ? 
data[0] - ori_data[0] : ori_data[0] - data[0]; + + //diffMax = fabs(data[0] - ori_data[0]); + double sum1 = 0, sum2 = 0, sum22 = 0; + + for (i = 0; i < nbEle; i++) { + sum1 += ori_data[i]; + sum2 += data[i]; + sum22 += data[i] * data[i]; + } + double mean1 = sum1 / nbEle; + double mean2 = sum2 / nbEle; + + double sum3 = 0, sum4 = 0; + double sum = 0, prodSum = 0, relerr = 0; + + double maxpw_relerr = 0; + for (i = 0; i < nbEle; i++) { + if (Max < ori_data[i]) Max = ori_data[i]; + if (Min > ori_data[i]) Min = ori_data[i]; + + float err = fabs(data[i] - ori_data[i]); + if (ori_data[i] != 0) { + relerr = err / fabs(ori_data[i]); + if (maxpw_relerr < relerr) + maxpw_relerr = relerr; + } + + if (diffMax < err) + diffMax = err; + prodSum += (ori_data[i] - mean1) * (data[i] - mean2); + sum3 += (ori_data[i] - mean1) * (ori_data[i] - mean1); + sum4 += (data[i] - mean2) * (data[i] - mean2); + sum += err * err; + } + double std1 = sqrt(sum3 / nbEle); + double std2 = sqrt(sum4 / nbEle); + double ee = prodSum / nbEle; + double acEff = ee / std1 / std2; + + double mse = sum / nbEle; + double range = Max - Min; + double psnr = 20 * log10(range) - 10 * log10(mse); + double normErr = sqrt(sum); + double normErr_norm = normErr / sqrt(sum22); + double nrmse = sqrt(mse) / range; + double *result = (double *) malloc(sizeof(double) * 6); + result[0] = mse; + result[1] = range; + result[2] = psnr; + result[3] = normErr; + result[4] = normErr_norm; + result[5] = nrmse; + + return result; +} \ No newline at end of file From 019b506d05048ca773ca50de246a99448b7f6f83 Mon Sep 17 00:00:00 2001 From: Danil Date: Thu, 15 Feb 2024 20:00:56 +0000 Subject: [PATCH 120/126] small fix to Compressor api --- qtensor/compression/Compressor.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/qtensor/compression/Compressor.py b/qtensor/compression/Compressor.py index 18126bbf..2b4f7afe 100644 --- a/qtensor/compression/Compressor.py +++ b/qtensor/compression/Compressor.py @@ -142,6 +142,13 @@ def decompress(self, ptr): ptr.seek(0) return np.load(ptr)['arr_0'] + def free_compressed(self, ptr): + del ptr + return + + def free_decompressed(self): + return + class TorchCompressor(Compressor): def __init__(self, r2r_error=1e-3, r2r_threshold=1e-3): self.r2r_error = r2r_error From a61a658991ea740d011d5d1def0e73be48c5a337 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Fri, 15 Mar 2024 11:14:43 -0500 Subject: [PATCH 121/126] replace lineend characters --- .../compression/cusz/include/cli/analyzer.hh | 556 ++-- .../compression/cusz/include/cli/document.hh | 544 ++-- .../cusz/include/cli/quality_viewer.hh | 326 +-- qtensor/compression/cusz/include/cli/query.hh | 142 +- .../compression/cusz/include/cli/query_dev.hh | 136 +- .../cusz/include/cli/timerecord_viewer.hh | 218 +- .../compression/cusz/include/cli/verify.hh | 174 +- qtensor/compression/cusz/include/common.hh | 36 +- .../cusz/include/common/capsule.hh | 804 +++--- .../cusz/include/common/configs.hh | 708 ++--- .../cusz/include/common/definition.hh | 132 +- .../cusz/include/common/type_traits.hh | 216 +- .../compression/cusz/include/compaction.hh | 36 +- qtensor/compression/cusz/include/component.hh | 36 +- .../cusz/include/component/glue.cuh | 240 +- .../component/pred_boilerplate_deprecated.hh | 420 +-- .../cusz/include/component/prediction.inl | 386 +-- .../cusz/include/component/spcodec.inl | 436 +-- .../compression/cusz/include/compressor.hh | 330 +-- qtensor/compression/cusz/include/context.hh | 502 ++-- qtensor/compression/cusz/include/cusz.h | 120 +- 
.../compression/cusz/include/cusz/custom.h | 52 +- qtensor/compression/cusz/include/cusz/it.hh | 154 +- qtensor/compression/cusz/include/cusz/nd.h | 28 +- qtensor/compression/cusz/include/cusz/pn.hh | 98 +- .../compression/cusz/include/cusz/record.h | 76 +- qtensor/compression/cusz/include/cusz/type.h | 438 +-- qtensor/compression/cusz/include/framework.hh | 124 +- qtensor/compression/cusz/include/header.h | 222 +- qtensor/compression/cusz/include/hf/hf.hh | 340 +-- .../compression/cusz/include/hf/hf_bookg.hh | 90 +- .../compression/cusz/include/hf/hf_codecg.hh | 164 +- .../compression/cusz/include/hf/hf_struct.h | 106 +- .../cusz/include/kernel/claunch_cuda.h | 98 +- .../cusz/include/kernel/cpplaunch_cuda.hh | 102 +- .../cusz/include/kernel/dryrun.cuh | 92 +- .../cusz/include/kernel/launch_spm.cuh | 696 ++--- .../cusz/include/kernel/lorenzo_all.h | 88 +- .../cusz/include/kernel/lorenzo_all.hh | 192 +- .../compression/cusz/include/kernel/spv_gpu.h | 84 +- .../cusz/include/kernel/spv_gpu.hh | 66 +- .../cusz/include/kernel/v2_lorenzo.hh | 64 +- .../cusz/include/pipeline/compaction_g.inl | 146 +- .../cusz/include/pipeline/v2_compressor.hh | 292 +- .../compression/cusz/include/stat/compare.h | 114 +- .../cusz/include/stat/compare_cpu.hh | 124 +- .../cusz/include/stat/compare_gpu.hh | 66 +- qtensor/compression/cusz/include/stat/stat.h | 58 +- qtensor/compression/cusz/include/stat/stat.hh | 30 +- .../compression/cusz/include/stat/stat_g.hh | 88 +- qtensor/compression/cusz/include/utils.hh | 40 +- .../cusz/include/utils/cuda_err.cuh | 370 +-- .../cusz/include/utils/cuda_mem.cuh | 200 +- .../cusz/include/utils/cusparse_err.cuh | 120 +- .../compression/cusz/include/utils/format.hh | 114 +- qtensor/compression/cusz/include/utils/io.hh | 118 +- .../cusz/include/utils/print_gpu.h | 90 +- .../cusz/include/utils/print_gpu.hh | 42 +- .../cusz/include/utils/strhelper.hh | 288 +- .../compression/cusz/include/utils/timer.h | 184 +- .../compression/cusz/include/utils/timer.hh | 306 +- qtensor/compression/cusz/src/cli/cli.cu | 28 +- qtensor/compression/cusz/src/cli/cli.cuh | 390 +-- .../compression/cusz/src/cli/dryrun_part.cu | 34 +- .../compression/cusz/src/cli/dryrun_part.cuh | 392 +-- qtensor/compression/cusz/src/cli_bin.cu | 54 +- qtensor/compression/cusz/src/compressor.cc | 298 +- qtensor/compression/cusz/src/context.cc | 986 +++---- qtensor/compression/cusz/src/cusz/custom.cc | 68 +- qtensor/compression/cusz/src/cusz_lib.cc | 228 +- .../compression/cusz/src/cusz_version.h.in | 6 +- qtensor/compression/cusz/src/cusz_wrapper.cu | 308 +- qtensor/compression/cusz/src/cusz_wrapper.py | 346 +-- .../cusz/src/detail/compare_cpu.inl | 218 +- .../cusz/src/detail/compare_gpu.inl | 386 +-- .../cusz/src/detail/compressor_impl.cu | 36 +- .../cusz/src/detail/compressor_impl.inl | 958 +++---- qtensor/compression/cusz/src/detail/spmat.cu | 28 +- .../compression/cusz/src/detail/spv_gpu.inl | 154 +- qtensor/compression/cusz/src/detail/spvec.cu | 36 +- .../cusz/src/experimental/Makefile | 14 +- .../src/experimental/dpcpp_demo_lorenzo.cu | 240 +- .../cusz/src/hf/detail/hf_bookg.inl | 1484 +++++----- .../cusz/src/hf/detail/hf_codecg.inl | 592 ++-- .../cusz/src/hf/detail/hf_pimpl.inl | 728 ++--- .../cusz/src/hf/detail/par_merge.inl | 888 +++--- qtensor/compression/cusz/src/hf/hf.cc | 218 +- qtensor/compression/cusz/src/hf/hf_bookg.cu | 66 +- qtensor/compression/cusz/src/hf/hf_codecg.cu | 538 ++-- qtensor/compression/cusz/src/hf/hf_pimpl.cu | 62 +- .../cusz/src/kernel/claunch_cuda.cu | 152 +- 
.../cusz/src/kernel/detail/hist.inl | 200 +- .../cusz/src/kernel/detail/lorenzo.inl | 1632 +++++------ .../cusz/src/kernel/detail/lorenzo23.inl | 2474 ++++++++--------- .../cusz/src/kernel/detail/lorenzo_proto.inl | 428 +-- .../cusz/src/kernel/detail/lorenzo_serial.inl | 652 ++--- .../cusz/src/kernel/detail/lorenzo_var.inl | 1060 +++---- .../cusz/src/kernel/detail/spline3.inl | 1492 +++++----- .../cusz/src/kernel/detail/subroutine.inl | 2148 +++++++------- .../cusz/src/kernel/detail/subsub.inl | 184 +- .../compression/cusz/src/kernel/lorenzo.cu | 418 +-- .../cusz/src/kernel/lorenzo_proto.cu | 352 +-- .../cusz/src/kernel/lorenzo_serial.cc | 236 +- .../cusz/src/kernel/lorenzo_var.cu | 412 +-- .../cusz/src/kernel/preprocess.cuh | 130 +- qtensor/compression/cusz/src/kernel/rle.cuh | 148 +- .../compression/cusz/src/kernel/spv_gpu.cu | 120 +- .../compression/cusz/src/kernel/v2_lorenzo.cu | 236 +- .../cusz/src/pipeline/v2_compressor.cc | 222 +- .../cusz/src/pipeline/v2_compressor_impl.cu | 28 +- .../cusz/src/pipeline/v2_compressor_impl.inl | 478 ++-- qtensor/compression/cusz/src/stat/cmpg1_1.cu | 60 +- qtensor/compression/cusz/src/stat/cmpg1_2.cu | 58 +- qtensor/compression/cusz/src/stat/cmpg1_3.cu | 58 +- qtensor/compression/cusz/src/stat/cmpg1_4.cu | 58 +- qtensor/compression/cusz/src/stat/cmpg1_5.cu | 58 +- qtensor/compression/cusz/src/stat/cmpg2.cu | 68 +- qtensor/compression/cusz/src/stat/cmpg3.cu | 64 +- qtensor/compression/cusz/src/stat/cmpg4_1.cu | 48 +- qtensor/compression/cusz/src/stat/cmpg4_2.cu | 50 +- qtensor/compression/cusz/src/stat/cmpg4_3.cu | 46 +- qtensor/compression/cusz/src/stat/cmpg4_4.cu | 48 +- .../compression/cusz/src/stat/compare_cpu.cc | 86 +- qtensor/compression/cusz/src/stat/stat_g.cu | 190 +- .../compression/cusz/src/utils/dbg_print.cuh | 262 +- .../compression/cusz/src/utils/print_gpu.cu | 242 +- .../compression/cusz/src/utils/timer_cpu.cc | 60 +- .../compression/cusz/src/utils/timer_gpu.cu | 164 +- .../compression/cusz/src/utils/vis_stat.hh | 274 +- .../compression/cuszp/cuSZp/CMakeLists.txt | 156 +- .../compression/cuszp/cuSZp/Config.cmake.in | 10 +- qtensor/compression/cuszp/cuSZp/LICENSE | 58 +- qtensor/compression/cuszp/cuSZp/README.md | 212 +- .../cuszp/cuSZp/cmake/Installing.cmake | 134 +- .../cuszp/cuSZp/examples/CMakeLists.txt | 88 +- .../cuSZp/examples/cuSZp_cpu_f32_api.cpp | 164 +- .../cuSZp/examples/cuSZp_cpu_f64_api.cpp | 164 +- .../cuSZp/examples/cuSZp_gpu_f32_api.cpp | 236 +- .../cuSZp/examples/cuSZp_gpu_f64_api.cpp | 238 +- .../cuszp/cuSZp/include/cuSZp_entry_f32.h | 20 +- .../cuszp/cuSZp/include/cuSZp_entry_f64.h | 20 +- .../cuszp/cuSZp/include/cuSZp_f32.h | 22 +- .../cuszp/cuSZp/include/cuSZp_f64.h | 22 +- .../cuszp/cuSZp/include/cuSZp_timer.h | 60 +- .../cuszp/cuSZp/include/cuSZp_utility.h | 34 +- .../cuszp/cuSZp/src/cuSZp_entry_f32.cu | 296 +- .../cuszp/cuSZp/src/cuSZp_entry_f64.cu | 296 +- .../compression/cuszp/cuSZp/src/cuSZp_f32.cu | 668 ++--- .../compression/cuszp/cuSZp/src/cuSZp_f64.cu | 666 ++--- .../cuszp/cuSZp/src/cuSZp_timer.cu | 62 +- .../cuszp/cuSZp/src/cuSZp_utility.cu | 1226 ++++---- qtensor/compression/cuszp/cuSZp_interface.cpp | 274 +- qtensor/compression/cuszp/cuszp_wrapper.py | 226 +- qtensor/compression/cuszp/gnncuszp.py | 692 ++--- qtensor/compression/cuszp/setup.py | 56 +- qtensor/compression/newsz/newsz.cu | 496 ++-- qtensor/compression/newsz/newsz.h | 6 +- qtensor/compression/newsz/newsz_wrapper.cu | 42 +- qtensor/compression/newsz/newsz_wrapper.py | 322 +-- qtensor/compression/szp/include/cuSZp.h | 22 +- 
qtensor/compression/szp/include/cuSZp_entry.h | 22 +- qtensor/compression/szp/include/cuSZp_timer.h | 60 +- .../compression/szp/include/cuSZp_utility.h | 26 +- qtensor/compression/szp/src/cuSZp.cu | 784 +++--- qtensor/compression/szp/src/cuSZp_entry.cu | 294 +- qtensor/compression/szp/src/cuSZp_timer.cu | 62 +- qtensor/compression/szp/src/cuSZp_utility.cu | 984 +++---- qtensor/compression/szp/src/cuSZp_wrapper.cu | 74 +- qtensor/compression/szp/src/cuSZp_wrapper.py | 380 +-- .../compression/torch_quant/torch_quant.py | 348 +-- .../torch_quant/torch_quant_perchannel.py | 406 +-- 171 files changed, 23985 insertions(+), 23985 deletions(-) diff --git a/qtensor/compression/cusz/include/cli/analyzer.hh b/qtensor/compression/cusz/include/cli/analyzer.hh index 7ff4b37d..8c58a71c 100644 --- a/qtensor/compression/cusz/include/cli/analyzer.hh +++ b/qtensor/compression/cusz/include/cli/analyzer.hh @@ -1,278 +1,278 @@ -/** - * @file analyzer.hh - * @author Jiannan Tian - * @brief - * @version 0.2 - * @date 2021-03-26 - * - * (C) 2021 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef ANALYSIS_ANALYZER_HH -#define ANALYSIS_ANALYZER_HH - -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include - -#include "../hf/hf_bookg.hh" -#include "../hf/hf_codecg.hh" -#include "../kernel/cpplaunch_cuda.hh" -#include "../utils/timer.hh" - -using std::cout; - -#if __cplusplus >= 201703L -#define CONSTEXPR constexpr -#else -#define CONSTEXPR -#endif - -enum class ExecutionPolicy { host, cuda_device }; -enum class AnalyzerMethod { thrust, cuda_native, stl }; - -class Analyzer { - typedef struct ExtremaResult { - double max_val, min_val, rng; - double seconds; - } extrema_result_t; - - typedef struct Compressibility { - size_t len; - struct { - double entropy; - unsigned int top1_freq; - double top1_prob; - double dropout_equiv_bitlen_2x() const { return 64 * (1 - top1_prob); } - double dropout_equiv_bitlen_1_5x() const { return 48 * (1 - top1_prob); } - } hist; - struct { - double r_lowerbound; - double avgb_lowerbound; - double r_upperbound; - double avgb_upperbound; - } huffman_theory; - struct { - double min_bitlen; - double avgb; - } huffman_stat; - } theory_t; - - theory_t theory; - - public: - Analyzer() = default; - ~Analyzer() = default; - - // TODO execution policy - template - static std::vector percentile100(T* in, size_t len) - { - std::vector res; - auto step = int(ceil(len / 100)); - - if CONSTEXPR (policy == ExecutionPolicy::cuda_device) { - // caveat: no residence check - thrust::sort(thrust::device, in, in + len); - T* htmp; - cudaMallocHost(&htmp, sizeof(T) * len); - cudaMemcpy(htmp, in, sizeof(T) * len, cudaMemcpyDeviceToHost); - for (auto i = 0; i < len; i += step) { // - res.push_back(htmp[i]); - } - res.push_back(htmp[len - 1]); - cudaFreeHost(htmp); - } - else { // fallback - std::sort(in, in + len); - for (auto i = 0; i < len; i += step) { // - res.push_back(in[i]); - } - res.push_back(in[len - 1]); - } - - return res; - } - - template - static extrema_result_t get_maxmin_rng(Data* d_data, size_t len) - { - if CONSTEXPR (policy == ExecutionPolicy::cuda_device and method == AnalyzerMethod::thrust) { - auto t0 = hires::now(); - // ------------------------------------------------------------ - thrust::device_ptr g_ptr = thrust::device_pointer_cast(d_data); - - auto max_el_loc = thrust::max_element(g_ptr, g_ptr + len); // excluding padded - auto min_el_loc = thrust::min_element(g_ptr, g_ptr + len); // excluding padded - - 
double max_val = *max_el_loc; - double min_val = *min_el_loc; - double rng = max_val - min_val; - // ------------------------------------------------------------ - auto t1 = hires::now(); - - return extrema_result_t{max_val, min_val, rng, static_cast(t1 - t0).count()}; - } - else { - throw std::runtime_error("Analyzer::get_maxmin_rng() Other policy and method not implemented."); - } - } - - template - static void get_histogram(UInt* data, size_t data_len, unsigned int* freq, size_t num_bins) - { - // TODO static check UInt - if CONSTEXPR (policy == ExecutionPolicy::cuda_device and method == AnalyzerMethod::cuda_native) { - float dummy; - launch_histogram(data, data_len, freq, num_bins, dummy); - } - else { - // TODO static check - throw std::runtime_error("Analyzer::get_histogram() using other policy or method not implemented."); - } - } - - Analyzer& estimate_compressibility_from_histogram(unsigned int* h_freq, size_t dict_size) - { - auto len = std::accumulate(h_freq, h_freq + dict_size, 0u); // excluding outlier - auto top1_freq = *std::max_element(h_freq, h_freq + dict_size); - double top1_prob = (1.0 * top1_freq) / (1.0 * len); - double entropy = 0.0; - for (auto i = 0; i < dict_size; i++) { - double p = h_freq[i] / (1.0 * len); - if (p != 0) entropy += -std::log2(p) * p; - } - double r_lowerbound = 1 - (-std::log2(top1_prob) * top1_prob - std::log2(1 - top1_prob) * (1 - top1_prob)); - double r_upperbound = top1_prob + 0.086; // [Gallager 78] - double avgb_lowerbound = entropy + r_lowerbound; - double avgb_upperbound = entropy + r_upperbound; - - // dropout - // auto equiv_bitlen_dropout_2x = 64 * (1 - top1_prob); - // auto equiv_bitlen_dropout_1_5x = 48 * (1 - top1_prob); - - // record - theory.len = len; - theory.hist.entropy = entropy; - theory.hist.top1_freq = top1_freq; - theory.hist.top1_prob = top1_prob; - theory.huffman_theory.r_lowerbound = r_lowerbound; - theory.huffman_theory.r_upperbound = r_upperbound; - theory.huffman_theory.avgb_lowerbound = avgb_lowerbound; - theory.huffman_theory.avgb_upperbound = avgb_upperbound; - - return *this; - }; - - template - Analyzer& - get_stat_from_huffman_book(const unsigned int* h_freq, const Huff* h_codebook, size_t len, size_t num_bins) - { - // real-bitlen, for reference only, not part of workflow - std::vector v_canon_cb(h_codebook, h_codebook + num_bins); - std::vector v_freq(h_freq, h_freq + num_bins); - - // TODO somewhere explicitly state that null codeword is of length 0xff - std::sort(v_canon_cb.begin(), v_canon_cb.end(), [](Huff& a, Huff& b) { - auto a_bits = reinterpret_cast*>(&a)->bits; - auto b_bits = reinterpret_cast*>(&b)->bits; - return a_bits < b_bits; - }); - std::sort(v_freq.begin(), v_freq.end(), std::greater()); - - double real_avgb = 0.0; - for (auto i = 0; i < num_bins; i++) { - if (v_freq[i] != 0) { - auto bits = reinterpret_cast*>(&v_canon_cb[i])->bits; - real_avgb += v_freq[i] * bits; - } - } - real_avgb /= len; - - theory.huffman_stat.avgb = real_avgb; - theory.huffman_stat.min_bitlen = - reinterpret_cast*>(&v_canon_cb.at(0))->bits; - - return *this; - } - - Analyzer& - print_compressibility(bool print_huffman_stat = false, bool print_dropout = false, double equiv_origin_bitlen = 32) - { - cout << "\n\e[31m"; // extra linebreak on start - - cout << "* Derived from histogram:" << '\n'; - cout << " - len (freq sum):\t" << theory.len << '\n'; - cout << " - entropy H(X):\t" << theory.hist.entropy << '\n'; - cout << " - most likely freq:\t" << theory.hist.top1_freq << '\n'; - cout << " - most likely prob (p1):\t" 
<< theory.hist.top1_prob << '\n'; - cout << '\n'; - - if (theory.hist.top1_prob < 0.4) { - cout << "* The probability of the most likely symbol < 0.4, go recoding (Huffman)." << '\n'; - cout << "* Compressibility lower bound is for reference only." << '\n'; - cout << " - est. redundancy upper bound (arbitrary p1):\t" << theory.huffman_theory.r_upperbound << '\n'; - cout << " - est. avg.bitlen upper bound (arbitrary p1):\t" << theory.huffman_theory.avgb_upperbound - << '\n'; - cout << " - est. CR lower bound (arbitrary p1):\t" - << equiv_origin_bitlen / theory.huffman_theory.avgb_upperbound << '\n'; - cout << '\n'; - } - else { - cout << "* Compressibility upper bound is determined by the lower bound of average bitlength." << '\n'; - cout << " - est. redundancy lower bound (p1 > 0.4):\t" << theory.huffman_theory.r_lowerbound << '\n'; - cout << " - est. avg.bitlen lower bound (p1 > 0.4):\t" << theory.huffman_theory.avgb_lowerbound << '\n'; - cout << " - est. CR upper bound (arbitrary p1):\t" - << equiv_origin_bitlen / theory.huffman_theory.avgb_lowerbound << '\n'; - cout << '\n'; - - cout << "* Compressibility lower bound is for reference only." << '\n'; - cout << " - est. redundancy upper bound (arbitrary p1):\t" << theory.huffman_theory.r_upperbound << '\n'; - cout << " - est. avg.bitlen upper bound (arbitrary p1):\t" << theory.huffman_theory.avgb_upperbound - << '\n'; - cout << " - est. CR lower bound (arbitrary p1):\t" - << equiv_origin_bitlen / theory.huffman_theory.avgb_upperbound << '\n'; - cout << '\n'; - - if (print_dropout) { - auto dropout_equiv_bitlen_2x = theory.hist.dropout_equiv_bitlen_2x(); - auto dropout_equiv_bitlen_1_5x = theory.hist.dropout_equiv_bitlen_1_5x(); - // TODO determine path, print log - cout << "* Considering dropout:" << '\n'; - cout << " - dropout at 1.0x metadata overhead" << '\n'; - cout << " | equiv.bitlen:\t" << dropout_equiv_bitlen_2x << '\n'; - cout << " | reduction rate:\t" << (equiv_origin_bitlen / dropout_equiv_bitlen_2x) << '\n'; - cout << " | bitlen_dropout <= bitlen_enc?\t" - << (dropout_equiv_bitlen_2x <= theory.huffman_theory.avgb_lowerbound) << '\n'; - cout << " - dropout at 0.5x metadata overhead" << '\n'; - cout << " | equiv.bitlen:\t" << dropout_equiv_bitlen_1_5x << '\n'; - cout << " | reduction rate (fp32):\t" << (equiv_origin_bitlen / dropout_equiv_bitlen_1_5x) << '\n'; - cout << " | bitlen_dropout <= bitlen_enc?\t" - << (dropout_equiv_bitlen_1_5x <= theory.huffman_theory.avgb_lowerbound) << '\n'; - cout << '\n'; - } - } - - if (print_huffman_stat) { - cout << "* From Huffman codebook:" << '\n'; - cout << " - avg. 
bitlen:\t" << theory.huffman_stat.avgb << '\n'; - cout << " - shortest bitlen:\t" << theory.huffman_stat.min_bitlen << '\n'; - cout << '\n'; - } - cout << "\e[0m"; - - return *this; - } -}; - -#endif +/** + * @file analyzer.hh + * @author Jiannan Tian + * @brief + * @version 0.2 + * @date 2021-03-26 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef ANALYSIS_ANALYZER_HH +#define ANALYSIS_ANALYZER_HH + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include "../hf/hf_bookg.hh" +#include "../hf/hf_codecg.hh" +#include "../kernel/cpplaunch_cuda.hh" +#include "../utils/timer.hh" + +using std::cout; + +#if __cplusplus >= 201703L +#define CONSTEXPR constexpr +#else +#define CONSTEXPR +#endif + +enum class ExecutionPolicy { host, cuda_device }; +enum class AnalyzerMethod { thrust, cuda_native, stl }; + +class Analyzer { + typedef struct ExtremaResult { + double max_val, min_val, rng; + double seconds; + } extrema_result_t; + + typedef struct Compressibility { + size_t len; + struct { + double entropy; + unsigned int top1_freq; + double top1_prob; + double dropout_equiv_bitlen_2x() const { return 64 * (1 - top1_prob); } + double dropout_equiv_bitlen_1_5x() const { return 48 * (1 - top1_prob); } + } hist; + struct { + double r_lowerbound; + double avgb_lowerbound; + double r_upperbound; + double avgb_upperbound; + } huffman_theory; + struct { + double min_bitlen; + double avgb; + } huffman_stat; + } theory_t; + + theory_t theory; + + public: + Analyzer() = default; + ~Analyzer() = default; + + // TODO execution policy + template + static std::vector percentile100(T* in, size_t len) + { + std::vector res; + auto step = int(ceil(len / 100)); + + if CONSTEXPR (policy == ExecutionPolicy::cuda_device) { + // caveat: no residence check + thrust::sort(thrust::device, in, in + len); + T* htmp; + cudaMallocHost(&htmp, sizeof(T) * len); + cudaMemcpy(htmp, in, sizeof(T) * len, cudaMemcpyDeviceToHost); + for (auto i = 0; i < len; i += step) { // + res.push_back(htmp[i]); + } + res.push_back(htmp[len - 1]); + cudaFreeHost(htmp); + } + else { // fallback + std::sort(in, in + len); + for (auto i = 0; i < len; i += step) { // + res.push_back(in[i]); + } + res.push_back(in[len - 1]); + } + + return res; + } + + template + static extrema_result_t get_maxmin_rng(Data* d_data, size_t len) + { + if CONSTEXPR (policy == ExecutionPolicy::cuda_device and method == AnalyzerMethod::thrust) { + auto t0 = hires::now(); + // ------------------------------------------------------------ + thrust::device_ptr g_ptr = thrust::device_pointer_cast(d_data); + + auto max_el_loc = thrust::max_element(g_ptr, g_ptr + len); // excluding padded + auto min_el_loc = thrust::min_element(g_ptr, g_ptr + len); // excluding padded + + double max_val = *max_el_loc; + double min_val = *min_el_loc; + double rng = max_val - min_val; + // ------------------------------------------------------------ + auto t1 = hires::now(); + + return extrema_result_t{max_val, min_val, rng, static_cast(t1 - t0).count()}; + } + else { + throw std::runtime_error("Analyzer::get_maxmin_rng() Other policy and method not implemented."); + } + } + + template + static void get_histogram(UInt* data, size_t data_len, unsigned int* freq, size_t num_bins) + { + // TODO static check UInt + if CONSTEXPR (policy == ExecutionPolicy::cuda_device and method == AnalyzerMethod::cuda_native) { + float dummy; + launch_histogram(data, data_len, freq, num_bins, dummy); + } + else { + // 
TODO static check + throw std::runtime_error("Analyzer::get_histogram() using other policy or method not implemented."); + } + } + + Analyzer& estimate_compressibility_from_histogram(unsigned int* h_freq, size_t dict_size) + { + auto len = std::accumulate(h_freq, h_freq + dict_size, 0u); // excluding outlier + auto top1_freq = *std::max_element(h_freq, h_freq + dict_size); + double top1_prob = (1.0 * top1_freq) / (1.0 * len); + double entropy = 0.0; + for (auto i = 0; i < dict_size; i++) { + double p = h_freq[i] / (1.0 * len); + if (p != 0) entropy += -std::log2(p) * p; + } + double r_lowerbound = 1 - (-std::log2(top1_prob) * top1_prob - std::log2(1 - top1_prob) * (1 - top1_prob)); + double r_upperbound = top1_prob + 0.086; // [Gallager 78] + double avgb_lowerbound = entropy + r_lowerbound; + double avgb_upperbound = entropy + r_upperbound; + + // dropout + // auto equiv_bitlen_dropout_2x = 64 * (1 - top1_prob); + // auto equiv_bitlen_dropout_1_5x = 48 * (1 - top1_prob); + + // record + theory.len = len; + theory.hist.entropy = entropy; + theory.hist.top1_freq = top1_freq; + theory.hist.top1_prob = top1_prob; + theory.huffman_theory.r_lowerbound = r_lowerbound; + theory.huffman_theory.r_upperbound = r_upperbound; + theory.huffman_theory.avgb_lowerbound = avgb_lowerbound; + theory.huffman_theory.avgb_upperbound = avgb_upperbound; + + return *this; + }; + + template + Analyzer& + get_stat_from_huffman_book(const unsigned int* h_freq, const Huff* h_codebook, size_t len, size_t num_bins) + { + // real-bitlen, for reference only, not part of workflow + std::vector v_canon_cb(h_codebook, h_codebook + num_bins); + std::vector v_freq(h_freq, h_freq + num_bins); + + // TODO somewhere explicitly state that null codeword is of length 0xff + std::sort(v_canon_cb.begin(), v_canon_cb.end(), [](Huff& a, Huff& b) { + auto a_bits = reinterpret_cast*>(&a)->bits; + auto b_bits = reinterpret_cast*>(&b)->bits; + return a_bits < b_bits; + }); + std::sort(v_freq.begin(), v_freq.end(), std::greater()); + + double real_avgb = 0.0; + for (auto i = 0; i < num_bins; i++) { + if (v_freq[i] != 0) { + auto bits = reinterpret_cast*>(&v_canon_cb[i])->bits; + real_avgb += v_freq[i] * bits; + } + } + real_avgb /= len; + + theory.huffman_stat.avgb = real_avgb; + theory.huffman_stat.min_bitlen = + reinterpret_cast*>(&v_canon_cb.at(0))->bits; + + return *this; + } + + Analyzer& + print_compressibility(bool print_huffman_stat = false, bool print_dropout = false, double equiv_origin_bitlen = 32) + { + cout << "\n\e[31m"; // extra linebreak on start + + cout << "* Derived from histogram:" << '\n'; + cout << " - len (freq sum):\t" << theory.len << '\n'; + cout << " - entropy H(X):\t" << theory.hist.entropy << '\n'; + cout << " - most likely freq:\t" << theory.hist.top1_freq << '\n'; + cout << " - most likely prob (p1):\t" << theory.hist.top1_prob << '\n'; + cout << '\n'; + + if (theory.hist.top1_prob < 0.4) { + cout << "* The probability of the most likely symbol < 0.4, go recoding (Huffman)." << '\n'; + cout << "* Compressibility lower bound is for reference only." << '\n'; + cout << " - est. redundancy upper bound (arbitrary p1):\t" << theory.huffman_theory.r_upperbound << '\n'; + cout << " - est. avg.bitlen upper bound (arbitrary p1):\t" << theory.huffman_theory.avgb_upperbound + << '\n'; + cout << " - est. 
CR lower bound (arbitrary p1):\t" + << equiv_origin_bitlen / theory.huffman_theory.avgb_upperbound << '\n'; + cout << '\n'; + } + else { + cout << "* Compressibility upper bound is determined by the lower bound of average bitlength." << '\n'; + cout << " - est. redundancy lower bound (p1 > 0.4):\t" << theory.huffman_theory.r_lowerbound << '\n'; + cout << " - est. avg.bitlen lower bound (p1 > 0.4):\t" << theory.huffman_theory.avgb_lowerbound << '\n'; + cout << " - est. CR upper bound (arbitrary p1):\t" + << equiv_origin_bitlen / theory.huffman_theory.avgb_lowerbound << '\n'; + cout << '\n'; + + cout << "* Compressibility lower bound is for reference only." << '\n'; + cout << " - est. redundancy upper bound (arbitrary p1):\t" << theory.huffman_theory.r_upperbound << '\n'; + cout << " - est. avg.bitlen upper bound (arbitrary p1):\t" << theory.huffman_theory.avgb_upperbound + << '\n'; + cout << " - est. CR lower bound (arbitrary p1):\t" + << equiv_origin_bitlen / theory.huffman_theory.avgb_upperbound << '\n'; + cout << '\n'; + + if (print_dropout) { + auto dropout_equiv_bitlen_2x = theory.hist.dropout_equiv_bitlen_2x(); + auto dropout_equiv_bitlen_1_5x = theory.hist.dropout_equiv_bitlen_1_5x(); + // TODO determine path, print log + cout << "* Considering dropout:" << '\n'; + cout << " - dropout at 1.0x metadata overhead" << '\n'; + cout << " | equiv.bitlen:\t" << dropout_equiv_bitlen_2x << '\n'; + cout << " | reduction rate:\t" << (equiv_origin_bitlen / dropout_equiv_bitlen_2x) << '\n'; + cout << " | bitlen_dropout <= bitlen_enc?\t" + << (dropout_equiv_bitlen_2x <= theory.huffman_theory.avgb_lowerbound) << '\n'; + cout << " - dropout at 0.5x metadata overhead" << '\n'; + cout << " | equiv.bitlen:\t" << dropout_equiv_bitlen_1_5x << '\n'; + cout << " | reduction rate (fp32):\t" << (equiv_origin_bitlen / dropout_equiv_bitlen_1_5x) << '\n'; + cout << " | bitlen_dropout <= bitlen_enc?\t" + << (dropout_equiv_bitlen_1_5x <= theory.huffman_theory.avgb_lowerbound) << '\n'; + cout << '\n'; + } + } + + if (print_huffman_stat) { + cout << "* From Huffman codebook:" << '\n'; + cout << " - avg. 
bitlen:\t" << theory.huffman_stat.avgb << '\n'; + cout << " - shortest bitlen:\t" << theory.huffman_stat.min_bitlen << '\n'; + cout << '\n'; + } + cout << "\e[0m"; + + return *this; + } +}; + +#endif diff --git a/qtensor/compression/cusz/include/cli/document.hh b/qtensor/compression/cusz/include/cli/document.hh index 240de036..ed68bdf5 100644 --- a/qtensor/compression/cusz/include/cli/document.hh +++ b/qtensor/compression/cusz/include/cli/document.hh @@ -1,272 +1,272 @@ -/** - * @file document.hh - * @author Jiannan Tian - * @brief - * @version 0.1.1 - * @date 2020-09-22 - * - * @copyright (C) 2020 by Washington State University, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#ifndef ARGUMENT_PARSER_DOCUMENT_HH -#define ARGUMENT_PARSER_DOCUMENT_HH - -#include -#include - - -const std::string fmt_b("\e[1m"); -const std::string fmt_0("\e[0m"); - -const std::regex bful("@(.*?)@"); -const std::string bful_text("\e[1m\e[4m$1\e[0m"); -const std::regex bf("\\*(.*?)\\*"); -const std::string bf_text("\e[1m$1\e[0m"); -const std::regex ul(R"(_((\w|-|\d|\.)+?)_)"); -const std::string ul_text("\e[4m$1\e[0m"); -const std::regex red(R"(\^\^(.*?)\^\^)"); -const std::string red_text("\e[31m$1\e[0m"); - -std::string // -Format(const std::string& s) -{ - auto a = std::regex_replace(s, bful, bful_text); - auto b = std::regex_replace(a, bf, bf_text); - auto c = std::regex_replace(b, ul, ul_text); - auto d = std::regex_replace(c, red, red_text); - return d; -} - -static const char cusz_short_doc[] = - // "cusz, version [placeholder]\n" - "\n" - "usage: cusz [-zxrh] [-i file] [-t dtype] [-m mode] [-e eb] [-l x,y,z] " - "...\n" - "\n" - " z : zip/compress\n" - " x : unzip/decompress\n" - " r : dryrun\n" - " h : print full-length help document\n" - "\n" - " i file : path to input datum\n" - " t dtype : f32 or fp4 (to be updated)\n" - " m mode : compression mode; abs, r2r\n" - " e eb : error bound; default 1e-4\n" - " l size : \"-l x\" for 1D; \"-l [X]x[Y]\" for 2D; \"-l [X]x[Y]x[Z]\" for 3D\n" - // " p pred : select predictor from \"lorenzo\" and \"spline3d\"\n" - "\n" - " config list:\n" - " syntax: opt=v, \"kw1=val1,kw1=val2[,...]\"\n" - " + eb error bound\n" - " + radius The number of quant-codes is 2x radius.\n" - " + demo load predefined lengths for demo datasets\n" - " - skipping \"-l x[,y[,z]]\"\n" - " - (1D) hacc hacc1b (2D) cesm exafel\n" - " - (3D) hurricane nyx-s nyx-m qmc qmcpre rtm parihaka\n" - " + anchor (on|off)\n" - // " + pipeline auto, binary, radius\n" - " example: \"--config demo=cesm,radius=512\"\n" - " report list: \n" - " syntax: opt[=v], \"kw1[=(on|off)],kw2[=(on|off)]\n" - " keyworkds: time, quality\n" - " example: \"--report time\", \"--report time=off\"\n" - "\n" - "example:\n" - " CESM=./data/cesm-CLDHGH-3600x1800\n" - " cusz -t f32 -m r2r -e 1e-4 -i ${CESM} -l 3600x1800 -z --report time\n" - " cusz -i ${CESM}.cusza -x --report time --compare ${CESM}\n" - "\n" - "\"cusz -h\" for details.\n"; - -static const char cusz_full_doc[] = - "*NAME*\n" - " cuSZ: CUDA-Based Error-Bounded Lossy Compressor for Scientific Data\n" - " Lowercased \"*cusz*\" is the command." 
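The analyzer shown above turns a quant-code histogram into an estimated Huffman average bit length: entropy plus a redundancy term, where the upper bound p1 + 0.086 is Gallager's result and the lower bound holds once the most likely symbol exceeds probability 0.4. A toy, self-contained sketch of that arithmetic follows; the histogram values are invented, and only the formulas are taken from the header above.

    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main()
    {
        std::vector<unsigned int> freq = {900, 50, 30, 20};  // made-up histogram
        double       len  = 0;
        unsigned int top1 = 0;
        for (auto f : freq) { len += f; if (f > top1) top1 = f; }

        double p1      = top1 / len;
        double entropy = 0.0;
        for (auto f : freq) {
            double p = f / len;
            if (p != 0) entropy += -std::log2(p) * p;
        }
        // Redundancy bounds: the lower bound applies when p1 > 0.4,
        // the upper bound p1 + 0.086 is Gallager's.
        double r_lower    = 1 - (-std::log2(p1) * p1 - std::log2(1 - p1) * (1 - p1));
        double r_upper    = p1 + 0.086;
        double avgb_lower = entropy + r_lower;
        double avgb_upper = entropy + r_upper;

        double origin_bitlen = 32;  // fp32 input
        std::printf("entropy=%.4f  avg.bitlen in [%.4f, %.4f]\n", entropy, avgb_lower, avgb_upper);
        std::printf("est. CR in [%.2f, %.2f]\n",
                    origin_bitlen / avgb_upper, origin_bitlen / avgb_lower);
        return 0;
    }

This is also why print_compressibility reports both a CR upper and a lower bound: they are the two ends of the same average-bit-length interval.
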
- "\n" - "*SYNOPSIS*\n" - " The basic use is listed below,\n" - " *cusz* *-t* f32 *-m* r2r *-e* 1.0e-4.0 *-i* ./data/cesm-CLDHGH-3600x1800 *-l* 3600,1800 *-z* *--report* " - "time\n" - // cusz -t f32 -m r2r -e 1.0e-4.0 -i ./data/cesm-CLDHGH-3600x1800 -l 3600x1800 -z --report time\n - " ^^------ ------ ----------- ------------------------------- ------------ | ^^\n" - " ^^ dtype mode error bound input file low-to-high zip ^^\n" - "\n" - " *cusz* *-i* ./data/cesm-CLDHGH-3600x1800.cusza *-x* *--compare* ./data/cesm-CLDHGH-3600x1800 *--report* " - "time\n" - // cusz -i ./data/cesm-CLDHGH-3600x1800.cusza -x --compare ./data/cesm-CLDHGH-3600x1800 --report - // time\n" - " ^^------------------------------------- | ^^\n" - " ^^ compressed file unzip ^^\n" - "\n" - " *cusz* *-t* f32|64 *-m* [eb mode] *-e* [eb] *-i* [datum file] *-l* [x[,y[,z]]] *-z*\n" - " *cusz* *-i* [basename].cusza *-x*\n" - "\n" - "*OPTIONS*\n" - " *Mandatory* (zip and dryrun)\n" - " *-z* or *--compress* or *--*@z@*ip*\n" - " *-r* or *--dry-*@r@*un*\n" - " No lossless Huffman codec. Only to get data quality summary.\n" - " In addition, quant. rep. and dict. size are retained\n" - "\n" - " *-m* or *--*@m@*ode* \n" - " Specify error-controlling mode. Supported modes include:\n" - " _abs_: absolute mode, eb = input eb\n" - " _r2r_: relative-to-value-range mode, eb = input eb x value range\n" - "\n" - " *-e* or *--eb* or *--error-bound* [num]\n" - " Specify error bound. e.g., _1.23_, _1e-4_, _1.23e-4.56_\n" - "\n" - " *-i* or *--*@i@*nput* [file]\n" - "\n" - " *-d* or *--dict-size* [256|512|1024|...]\n" - " Specify dictionary size/quantization bin number.\n" - " Should be a power-of-2.\n" - "\n" - " *-l* [x[,y[,z]]] Specify (1|2|3)D data size, with dimensions from low to high.\n" - "\n" - " *Mandatory* (unzip)\n" - " *-x* or *--e*@x@*tract* or *--decompress* or *--unzip*\n" - "\n" - " *-i* or *--*@i@*nput* [corresponding datum basename (w/o extension)]\n" - "\n" - " *Additional*\n" - " *-p* or *--*@p@*redictor*\n" - " Select predictor from \"lorenzo\" (default) or \"spline3d\" (3D only).\n" - " *--origin* or *--compare* /path/to/origin-datum\n" - " For verification & get data quality evaluation.\n" - " *--opath* /path/to\n" - " Specify alternative output path.\n" - "\n" - " *Modules*\n" - " *--skip* _module-1_,_module-2_,...,_module-n_,\n" - " Disable functionality modules. Supported module(s) include:\n" - " _huffman_ Huffman codec after prediction+quantization (p+q) and before reversed p+q.\n" - " _write2disk_ Skip write decompression data.\n" - // "\n" - // " *-p* or *--pre* _method-1_,_method-2_,...,_method-n_\n" - // " Enable preprocessing. Supported preprocessing method(s) include:\n" - // " _binning_ Downsampling datum by 2x2 to 1.\n" - "\n" - " *Print Report to stdout*\n" - " *--report* (option=on/off)-list\n" - " Syntax: opt[=v], \"kw1[=(on|off)],kw2=[=(on|off)]\n" - " Keyworkds: time quality compressibility\n" - " Example: \"--report time\", \"--report time=off\"\n" - "\n" - " *Demonstration*\n" - " *-h* or *--help*\n" - " Get help documentation.\n" - "\n" - // " *-V* or *--verbose*\n" - // " Print host and device information for diagnostics.\n" - // "\n" - // " *-M* or *--meta*\n" - // " Get archive metadata. (TODO)\n" - "\n" - " *Advanced Runtime Configuration*\n" - " *--demo* [demo-dataset]\n" - " Use demo dataset, will omit given dimension(s). 
Supported datasets include:\n" - " 1D: _hacc_ _hacc1b_ 2D: _cesm_ _exafel_\n" - " 3D: _hurricane_ _nyx-s_ _nyx-m_ _qmc_ _qmcpre_ _rtm_ _parihaka_\n" - "\n" - " *-c* or *--config* (option=value)-list\n" - " Syntax: opt=v, \"kw1=val1,kw1=val2[,...]\"\n" - " + *eb*= error bound\n" - " + *cap*= capacity, number of quant-codes\n" - " + *demo*= skip length input (\"-l x[,y[,z]]\"), alternative to \"--demo dataset\"\n" - "\n" - " Other internal parameters:\n" - " + *quantbyte*=<1|2>\n" - " Specify quantization code representation.\n" - " Options _1_, _2_ are for *1-* and *2-*byte, respectively. (default: 2)\n" - " ^^Manually specifying this may not result in optimal memory footprint.^^\n" - " + *huffbyte*=<4|8>\n" - " Specify Huffman codeword representation.\n" - " Options _4_, _8_ are for *4-* and *8-*byte, respectively. (default: 4)\n" - " ^^Manually specifying this may not result in optimal memory footprint.^^\n" - " + *huffchunk*=[256|512|1024|...]\n" - " Manually specify chunk size for Huffman codec, overriding autotuning.\n" - " Should be a power-of-2 that is sufficiently large.\n" - " ^^This affects Huffman decoding performance significantly.^^\n" - "\n" - "*EXAMPLES*\n" - " *Demo Datasets*\n" - " Set a *shell variable*:\n" - " export PATH=$(pwd)/bin:$PATH\n" - " CESM=./data/cesm-CLDHGH-3600x1800\n" - " HURR=./data/hurr-CLOUDf48-500x500x100\n" - "\n" - " *CESM* example:\n" - " cusz -t f32 -m r2r -e 1e-4 -i ${CESM} -l 3600x1800 -z --report time\n" - " cusz -t f32 -m r2r -e 1e-4 -i ${CESM} -l 3600x1800 -r\n" - " cusz -i ${CESM}.cusza -x --report time --compare ${CESM} --skip write2disk\n" - "\n" - " *CESM* example with specified output path:\n" - " mkdir data2 data3\n" - " ^^# zip, output to `data2`^^\n" - " cusz -t f32 -m r2r -e 1e-4 -i ${CESM} -l 3600x1800 -z --opath data2\n" - " ^^# unzip, in situ^^\n" - " cusz -i ${CESM}.cusza -x && ls data2\n" - " ^^# unzip, output to `data3`^^\n" - " cusz -i ${CESM}.cusza -x --opath data3 && ls data3\n" - " ^^# unzip, output to `data3`, compare to the original datum^^\n" - " cusz -i ${CESM}.cusza -x --opath data3 --compare ${CESM} && ls data3\n" - "\n" - " *Hurricane Isabel* example:\n" - " cusz -t f32 -m r2r -e 1e-4 -i ${HURR} -l 500x500x100 -z\n" - " cusz -t f32 -m r2r -e 1e-4 -i ${HURR} -l 500x500x100 -r\n" - " cusz -i ${HURR}.cusza -x\n" - "\n"; - -// TODO -// " *EXAFEL* example:\n" -// " cusz -t f32 -m r2r -e 1e-4 -i ./data/exafel-59200x388 --demo exafeldemo -z -x --pre binning\n" -// " cusz -t f32 -m r2r -e 1e-4 -i ./data/exafel-59200x388 --demo exafeldemo -z -x --pre binning " -// "--skip huffman\n" -// " cusz -i ./data/exafel-59200x388.BN.cusza -x\n"; - -static const char huff_re_short_doc[] = - "\n" - "OVERVIEW: Huffman submodule as standalone program\n" // TODO from this line on - "\n" - "USAGE:\n" - " The basic use with demo datum is listed below,\n" - " ./huff --encode --decode --verify --input ./baryon_density.dat.b16 \\\n" - " -3 512 512 512 --input-rep 16 --huffman-rep 32 --huffman-chunk 2048 --dict-size 1024\n" - " or shorter\n" - " ./huff -e -d -V -i ./baryon_density.dat.b16 -3 512 512 512 -R 16 -H 32 -C 2048 -c 1024\n" - " ^ ^ ^ --------------------------- -------------- ----- ----- ------- -------\n" - " | | | input datum file dimension input Huff. Huff. codebook\n" - " enc dec verify rep. rep. 
chunk size\n" - "\n" - "EXAMPLES\n" - " Essential:\n" - " ./bin/huff -e -d -i ./baryon_density.dat.b16 -3 512 512 512 -R 16 -c 1024\n" - " have to input dimension, and higher dimension for a multiplication of each dim.,\n" - " as default values input-rep=16 (bits), huff-rep=32 (bits), codebook-size=1024 (symbols)\n" - "\n"; - -static const char doc_dim_order[] = - "\n" - " Input dimension follows low-to-high (e.g., x-y-z) order.\n" - " Taking 2D CESM-ATM as an example, \n" - "\n" - " |<------------------------- x 3600 --------------------------->| \n" - " +--------------------------------------------------------------+ - \n" - " | | ^ \n" - " | | | \n" - " | CESM-ATM: 1800x3600 (y-x order) | | \n" - " | datum name: _1800_3600 | y \n" - " | | 1800 \n" - " | input: -l 3600,1800 | | \n" - " | input order: -l [x,y] | | \n" - " | | | \n" - " | | v \n" - " +--------------------------------------------------------------+ - \n" - "\n" - " Taking 3D Hurricane as another example, whose dimensions are\n" - " 100x500x500, the input is \"-l 500,500,100\".\n"; - -#endif +/** + * @file document.hh + * @author Jiannan Tian + * @brief + * @version 0.1.1 + * @date 2020-09-22 + * + * @copyright (C) 2020 by Washington State University, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef ARGUMENT_PARSER_DOCUMENT_HH +#define ARGUMENT_PARSER_DOCUMENT_HH + +#include +#include + + +const std::string fmt_b("\e[1m"); +const std::string fmt_0("\e[0m"); + +const std::regex bful("@(.*?)@"); +const std::string bful_text("\e[1m\e[4m$1\e[0m"); +const std::regex bf("\\*(.*?)\\*"); +const std::string bf_text("\e[1m$1\e[0m"); +const std::regex ul(R"(_((\w|-|\d|\.)+?)_)"); +const std::string ul_text("\e[4m$1\e[0m"); +const std::regex red(R"(\^\^(.*?)\^\^)"); +const std::string red_text("\e[31m$1\e[0m"); + +std::string // +Format(const std::string& s) +{ + auto a = std::regex_replace(s, bful, bful_text); + auto b = std::regex_replace(a, bf, bf_text); + auto c = std::regex_replace(b, ul, ul_text); + auto d = std::regex_replace(c, red, red_text); + return d; +} + +static const char cusz_short_doc[] = + // "cusz, version [placeholder]\n" + "\n" + "usage: cusz [-zxrh] [-i file] [-t dtype] [-m mode] [-e eb] [-l x,y,z] " + "...\n" + "\n" + " z : zip/compress\n" + " x : unzip/decompress\n" + " r : dryrun\n" + " h : print full-length help document\n" + "\n" + " i file : path to input datum\n" + " t dtype : f32 or fp4 (to be updated)\n" + " m mode : compression mode; abs, r2r\n" + " e eb : error bound; default 1e-4\n" + " l size : \"-l x\" for 1D; \"-l [X]x[Y]\" for 2D; \"-l [X]x[Y]x[Z]\" for 3D\n" + // " p pred : select predictor from \"lorenzo\" and \"spline3d\"\n" + "\n" + " config list:\n" + " syntax: opt=v, \"kw1=val1,kw1=val2[,...]\"\n" + " + eb error bound\n" + " + radius The number of quant-codes is 2x radius.\n" + " + demo load predefined lengths for demo datasets\n" + " - skipping \"-l x[,y[,z]]\"\n" + " - (1D) hacc hacc1b (2D) cesm exafel\n" + " - (3D) hurricane nyx-s nyx-m qmc qmcpre rtm parihaka\n" + " + anchor (on|off)\n" + // " + pipeline auto, binary, radius\n" + " example: \"--config demo=cesm,radius=512\"\n" + " report list: \n" + " syntax: opt[=v], \"kw1[=(on|off)],kw2[=(on|off)]\n" + " keyworkds: time, quality\n" + " example: \"--report time\", \"--report time=off\"\n" + "\n" + "example:\n" + " CESM=./data/cesm-CLDHGH-3600x1800\n" + " cusz -t f32 -m r2r -e 1e-4 -i ${CESM} -l 3600x1800 -z --report time\n" + " cusz -i ${CESM}.cusza -x --report time --compare ${CESM}\n" + "\n" + 
"\"cusz -h\" for details.\n"; + +static const char cusz_full_doc[] = + "*NAME*\n" + " cuSZ: CUDA-Based Error-Bounded Lossy Compressor for Scientific Data\n" + " Lowercased \"*cusz*\" is the command." + "\n" + "*SYNOPSIS*\n" + " The basic use is listed below,\n" + " *cusz* *-t* f32 *-m* r2r *-e* 1.0e-4.0 *-i* ./data/cesm-CLDHGH-3600x1800 *-l* 3600,1800 *-z* *--report* " + "time\n" + // cusz -t f32 -m r2r -e 1.0e-4.0 -i ./data/cesm-CLDHGH-3600x1800 -l 3600x1800 -z --report time\n + " ^^------ ------ ----------- ------------------------------- ------------ | ^^\n" + " ^^ dtype mode error bound input file low-to-high zip ^^\n" + "\n" + " *cusz* *-i* ./data/cesm-CLDHGH-3600x1800.cusza *-x* *--compare* ./data/cesm-CLDHGH-3600x1800 *--report* " + "time\n" + // cusz -i ./data/cesm-CLDHGH-3600x1800.cusza -x --compare ./data/cesm-CLDHGH-3600x1800 --report + // time\n" + " ^^------------------------------------- | ^^\n" + " ^^ compressed file unzip ^^\n" + "\n" + " *cusz* *-t* f32|64 *-m* [eb mode] *-e* [eb] *-i* [datum file] *-l* [x[,y[,z]]] *-z*\n" + " *cusz* *-i* [basename].cusza *-x*\n" + "\n" + "*OPTIONS*\n" + " *Mandatory* (zip and dryrun)\n" + " *-z* or *--compress* or *--*@z@*ip*\n" + " *-r* or *--dry-*@r@*un*\n" + " No lossless Huffman codec. Only to get data quality summary.\n" + " In addition, quant. rep. and dict. size are retained\n" + "\n" + " *-m* or *--*@m@*ode* \n" + " Specify error-controlling mode. Supported modes include:\n" + " _abs_: absolute mode, eb = input eb\n" + " _r2r_: relative-to-value-range mode, eb = input eb x value range\n" + "\n" + " *-e* or *--eb* or *--error-bound* [num]\n" + " Specify error bound. e.g., _1.23_, _1e-4_, _1.23e-4.56_\n" + "\n" + " *-i* or *--*@i@*nput* [file]\n" + "\n" + " *-d* or *--dict-size* [256|512|1024|...]\n" + " Specify dictionary size/quantization bin number.\n" + " Should be a power-of-2.\n" + "\n" + " *-l* [x[,y[,z]]] Specify (1|2|3)D data size, with dimensions from low to high.\n" + "\n" + " *Mandatory* (unzip)\n" + " *-x* or *--e*@x@*tract* or *--decompress* or *--unzip*\n" + "\n" + " *-i* or *--*@i@*nput* [corresponding datum basename (w/o extension)]\n" + "\n" + " *Additional*\n" + " *-p* or *--*@p@*redictor*\n" + " Select predictor from \"lorenzo\" (default) or \"spline3d\" (3D only).\n" + " *--origin* or *--compare* /path/to/origin-datum\n" + " For verification & get data quality evaluation.\n" + " *--opath* /path/to\n" + " Specify alternative output path.\n" + "\n" + " *Modules*\n" + " *--skip* _module-1_,_module-2_,...,_module-n_,\n" + " Disable functionality modules. Supported module(s) include:\n" + " _huffman_ Huffman codec after prediction+quantization (p+q) and before reversed p+q.\n" + " _write2disk_ Skip write decompression data.\n" + // "\n" + // " *-p* or *--pre* _method-1_,_method-2_,...,_method-n_\n" + // " Enable preprocessing. Supported preprocessing method(s) include:\n" + // " _binning_ Downsampling datum by 2x2 to 1.\n" + "\n" + " *Print Report to stdout*\n" + " *--report* (option=on/off)-list\n" + " Syntax: opt[=v], \"kw1[=(on|off)],kw2=[=(on|off)]\n" + " Keyworkds: time quality compressibility\n" + " Example: \"--report time\", \"--report time=off\"\n" + "\n" + " *Demonstration*\n" + " *-h* or *--help*\n" + " Get help documentation.\n" + "\n" + // " *-V* or *--verbose*\n" + // " Print host and device information for diagnostics.\n" + // "\n" + // " *-M* or *--meta*\n" + // " Get archive metadata. 
(TODO)\n" + "\n" + " *Advanced Runtime Configuration*\n" + " *--demo* [demo-dataset]\n" + " Use demo dataset, will omit given dimension(s). Supported datasets include:\n" + " 1D: _hacc_ _hacc1b_ 2D: _cesm_ _exafel_\n" + " 3D: _hurricane_ _nyx-s_ _nyx-m_ _qmc_ _qmcpre_ _rtm_ _parihaka_\n" + "\n" + " *-c* or *--config* (option=value)-list\n" + " Syntax: opt=v, \"kw1=val1,kw1=val2[,...]\"\n" + " + *eb*= error bound\n" + " + *cap*= capacity, number of quant-codes\n" + " + *demo*= skip length input (\"-l x[,y[,z]]\"), alternative to \"--demo dataset\"\n" + "\n" + " Other internal parameters:\n" + " + *quantbyte*=<1|2>\n" + " Specify quantization code representation.\n" + " Options _1_, _2_ are for *1-* and *2-*byte, respectively. (default: 2)\n" + " ^^Manually specifying this may not result in optimal memory footprint.^^\n" + " + *huffbyte*=<4|8>\n" + " Specify Huffman codeword representation.\n" + " Options _4_, _8_ are for *4-* and *8-*byte, respectively. (default: 4)\n" + " ^^Manually specifying this may not result in optimal memory footprint.^^\n" + " + *huffchunk*=[256|512|1024|...]\n" + " Manually specify chunk size for Huffman codec, overriding autotuning.\n" + " Should be a power-of-2 that is sufficiently large.\n" + " ^^This affects Huffman decoding performance significantly.^^\n" + "\n" + "*EXAMPLES*\n" + " *Demo Datasets*\n" + " Set a *shell variable*:\n" + " export PATH=$(pwd)/bin:$PATH\n" + " CESM=./data/cesm-CLDHGH-3600x1800\n" + " HURR=./data/hurr-CLOUDf48-500x500x100\n" + "\n" + " *CESM* example:\n" + " cusz -t f32 -m r2r -e 1e-4 -i ${CESM} -l 3600x1800 -z --report time\n" + " cusz -t f32 -m r2r -e 1e-4 -i ${CESM} -l 3600x1800 -r\n" + " cusz -i ${CESM}.cusza -x --report time --compare ${CESM} --skip write2disk\n" + "\n" + " *CESM* example with specified output path:\n" + " mkdir data2 data3\n" + " ^^# zip, output to `data2`^^\n" + " cusz -t f32 -m r2r -e 1e-4 -i ${CESM} -l 3600x1800 -z --opath data2\n" + " ^^# unzip, in situ^^\n" + " cusz -i ${CESM}.cusza -x && ls data2\n" + " ^^# unzip, output to `data3`^^\n" + " cusz -i ${CESM}.cusza -x --opath data3 && ls data3\n" + " ^^# unzip, output to `data3`, compare to the original datum^^\n" + " cusz -i ${CESM}.cusza -x --opath data3 --compare ${CESM} && ls data3\n" + "\n" + " *Hurricane Isabel* example:\n" + " cusz -t f32 -m r2r -e 1e-4 -i ${HURR} -l 500x500x100 -z\n" + " cusz -t f32 -m r2r -e 1e-4 -i ${HURR} -l 500x500x100 -r\n" + " cusz -i ${HURR}.cusza -x\n" + "\n"; + +// TODO +// " *EXAFEL* example:\n" +// " cusz -t f32 -m r2r -e 1e-4 -i ./data/exafel-59200x388 --demo exafeldemo -z -x --pre binning\n" +// " cusz -t f32 -m r2r -e 1e-4 -i ./data/exafel-59200x388 --demo exafeldemo -z -x --pre binning " +// "--skip huffman\n" +// " cusz -i ./data/exafel-59200x388.BN.cusza -x\n"; + +static const char huff_re_short_doc[] = + "\n" + "OVERVIEW: Huffman submodule as standalone program\n" // TODO from this line on + "\n" + "USAGE:\n" + " The basic use with demo datum is listed below,\n" + " ./huff --encode --decode --verify --input ./baryon_density.dat.b16 \\\n" + " -3 512 512 512 --input-rep 16 --huffman-rep 32 --huffman-chunk 2048 --dict-size 1024\n" + " or shorter\n" + " ./huff -e -d -V -i ./baryon_density.dat.b16 -3 512 512 512 -R 16 -H 32 -C 2048 -c 1024\n" + " ^ ^ ^ --------------------------- -------------- ----- ----- ------- -------\n" + " | | | input datum file dimension input Huff. Huff. codebook\n" + " enc dec verify rep. rep. 
chunk size\n" + "\n" + "EXAMPLES\n" + " Essential:\n" + " ./bin/huff -e -d -i ./baryon_density.dat.b16 -3 512 512 512 -R 16 -c 1024\n" + " have to input dimension, and higher dimension for a multiplication of each dim.,\n" + " as default values input-rep=16 (bits), huff-rep=32 (bits), codebook-size=1024 (symbols)\n" + "\n"; + +static const char doc_dim_order[] = + "\n" + " Input dimension follows low-to-high (e.g., x-y-z) order.\n" + " Taking 2D CESM-ATM as an example, \n" + "\n" + " |<------------------------- x 3600 --------------------------->| \n" + " +--------------------------------------------------------------+ - \n" + " | | ^ \n" + " | | | \n" + " | CESM-ATM: 1800x3600 (y-x order) | | \n" + " | datum name: _1800_3600 | y \n" + " | | 1800 \n" + " | input: -l 3600,1800 | | \n" + " | input order: -l [x,y] | | \n" + " | | | \n" + " | | v \n" + " +--------------------------------------------------------------+ - \n" + "\n" + " Taking 3D Hurricane as another example, whose dimensions are\n" + " 100x500x500, the input is \"-l 500,500,100\".\n"; + +#endif diff --git a/qtensor/compression/cusz/include/cli/quality_viewer.hh b/qtensor/compression/cusz/include/cli/quality_viewer.hh index 0a5e9eed..eb8a27c2 100644 --- a/qtensor/compression/cusz/include/cli/quality_viewer.hh +++ b/qtensor/compression/cusz/include/cli/quality_viewer.hh @@ -1,163 +1,163 @@ -/** - * @file quality_viewer.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-04-09 - * @deprecated 0.3.2 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef QUALITY_VIEWER_HH -#define QUALITY_VIEWER_HH - -// 22-11-20 would fail in cxxapi.cu if deleted -#include - -#include "../common/capsule.hh" -#include "../common/definition.hh" -#include "../header.h" -#include "../stat/compare_gpu.hh" -#include "verify.hh" - -namespace cusz { - -const static auto HOST = cusz::LOC::HOST; -const static auto DEVICE = cusz::LOC::DEVICE; -const static auto HOST_DEVICE = cusz::LOC::HOST_DEVICE; - -struct QualityViewer { - template - static void print_metrics_cross(cusz_stats* s, size_t compressed_bytes = 0, bool gpu_checker = false) - { - auto checker = (not gpu_checker) ? string("(using CPU checker)") : string("(using GPU checker)"); - auto bytes = (s->len * sizeof(Data) * 1.0); - - auto println = [](const char* s, double n1, double n2, double n3, double n4) { - printf(" %-10s %16.8g %16.8g %16.8g %16.8g\n", s, n1, n2, n3, n4); - }; - auto printhead = [](const char* s1, const char* s2, const char* s3, const char* s4, const char* s5) { - printf(" \e[1m\e[31m%-10s %16s %16s %16s %16s\e[0m\n", s1, s2, s3, s4, s5); - }; - - auto is_fp = std::is_same::value or std::is_same::value ? 
const_cast("yes") - : const_cast("no"); - printf("\nquality metrics %s:\n", checker.c_str()); - - printhead("", "data-len", "data-byte", "fp-type?", ""); - printf(" %-10s %16zu %16lu %16s\n", "", s->len, sizeof(Data), is_fp); - - printhead("", "min", "max", "rng", "std"); - println("origin", s->odata.min, s->odata.max, s->odata.rng, s->odata.std); - println("eb-lossy", s->xdata.min, s->xdata.max, s->xdata.rng, s->xdata.std); - - printhead("", "abs-val", "abs-idx", "pw-rel", "VS-RNG"); - println("max-error", s->max_err.abs, s->max_err.idx, s->max_err.pwrrel, s->max_err.rel); - - printhead("", "CR", "NRMSE", "cross-cor", "PSNR"); - println("metrics", bytes / compressed_bytes, s->reduced.NRMSE, s->reduced.coeff, s->reduced.PSNR); - - // printf("\n"); - }; - - static void print_metrics_auto(double* lag1_cor, double* lag2_cor) - { - auto printhead = [](const char* s1, const char* s2, const char* s3, const char* s4, const char* s5) { - printf(" \e[1m\e[31m%-10s %16s %16s %16s %16s\e[0m\n", s1, s2, s3, s4, s5); - }; - - printhead("", "lag1-cor", "lag2-cor", "", ""); - printf(" %-10s %16lf %16lf\n", "auto", *lag1_cor, *lag2_cor); - printf("\n"); - }; - - template - static void echo_metric_gpu(T* reconstructed, T* origin, size_t len, size_t compressed_bytes = 0) - { - // cross - auto stat_x = new cusz_stats; - psz::thrustgpu_assess_quality(stat_x, reconstructed, origin, len); - print_metrics_cross(stat_x, compressed_bytes, true); - - auto stat_auto_lag1 = new cusz_stats; - psz::thrustgpu_assess_quality(stat_auto_lag1, origin, origin + 1, len - 1); - auto stat_auto_lag2 = new cusz_stats; - psz::thrustgpu_assess_quality(stat_auto_lag2, origin, origin + 2, len - 2); - - print_metrics_auto(&stat_auto_lag1->reduced.coeff, &stat_auto_lag2->reduced.coeff); - } - - template - static void echo_metric_cpu(T* _d1, T* _d2, size_t len, size_t compressed_bytes = 0, bool from_device = true) - { - auto stat = new cusz_stats; - T* reconstructed; - T* origin; - if (not from_device) { - reconstructed = _d1; - origin = _d2; - } - else { - printf("allocating tmp space for CPU verification\n"); - auto bytes = sizeof(T) * len; - cudaMallocHost(&reconstructed, bytes); - cudaMallocHost(&origin, bytes); - cudaMemcpy(reconstructed, _d1, bytes, cudaMemcpyDeviceToHost); - cudaMemcpy(origin, _d2, bytes, cudaMemcpyDeviceToHost); - } - cusz::verify_data(stat, reconstructed, origin, len); - print_metrics_cross(stat, compressed_bytes, false); - - auto stat_auto_lag1 = new cusz_stats; - verify_data(stat_auto_lag1, origin, origin + 1, len - 1); - auto stat_auto_lag2 = new cusz_stats; - verify_data(stat_auto_lag2, origin, origin + 2, len - 2); - - print_metrics_auto(&stat_auto_lag1->reduced.coeff, &stat_auto_lag2->reduced.coeff); - - if (from_device) { - if (reconstructed) cudaFreeHost(reconstructed); - if (origin) cudaFreeHost(origin); - } - } - - template - static void load_origin(string const& fname, Capsule& origin) - { - origin.mallochost().malloc().fromfile(fname); - } - - template - static void view(header_t header, Capsule& xdata, Capsule& cmp, string const& compare) - { - auto len = ConfigHelper::get_uncompressed_len(header); - auto compressd_bytes = ConfigHelper::get_filesize(header); - - auto compare_on_gpu = [&]() { - cmp.mallochost().malloc().fromfile(compare).host2device(); - echo_metric_gpu(xdata.dptr(), cmp.dptr(), len, compressd_bytes); - cmp.freehost().free(); - }; - - auto compare_on_cpu = [&]() { - cmp.mallochost().fromfile(compare); - xdata.device2host(); - echo_metric_cpu(xdata.hptr(), cmp.hptr(), len, 
compressd_bytes); - cmp.freehost(); - }; - - if (compare != "") { - auto gb = 1.0 * sizeof(T) * len / 1e9; - if (gb < 0.8) - compare_on_gpu(); - else - compare_on_cpu(); - } - } -}; - -} // namespace cusz - -#endif +/** + * @file quality_viewer.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-09 + * @deprecated 0.3.2 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef QUALITY_VIEWER_HH +#define QUALITY_VIEWER_HH + +// 22-11-20 would fail in cxxapi.cu if deleted +#include + +#include "../common/capsule.hh" +#include "../common/definition.hh" +#include "../header.h" +#include "../stat/compare_gpu.hh" +#include "verify.hh" + +namespace cusz { + +const static auto HOST = cusz::LOC::HOST; +const static auto DEVICE = cusz::LOC::DEVICE; +const static auto HOST_DEVICE = cusz::LOC::HOST_DEVICE; + +struct QualityViewer { + template + static void print_metrics_cross(cusz_stats* s, size_t compressed_bytes = 0, bool gpu_checker = false) + { + auto checker = (not gpu_checker) ? string("(using CPU checker)") : string("(using GPU checker)"); + auto bytes = (s->len * sizeof(Data) * 1.0); + + auto println = [](const char* s, double n1, double n2, double n3, double n4) { + printf(" %-10s %16.8g %16.8g %16.8g %16.8g\n", s, n1, n2, n3, n4); + }; + auto printhead = [](const char* s1, const char* s2, const char* s3, const char* s4, const char* s5) { + printf(" \e[1m\e[31m%-10s %16s %16s %16s %16s\e[0m\n", s1, s2, s3, s4, s5); + }; + + auto is_fp = std::is_same::value or std::is_same::value ? const_cast("yes") + : const_cast("no"); + printf("\nquality metrics %s:\n", checker.c_str()); + + printhead("", "data-len", "data-byte", "fp-type?", ""); + printf(" %-10s %16zu %16lu %16s\n", "", s->len, sizeof(Data), is_fp); + + printhead("", "min", "max", "rng", "std"); + println("origin", s->odata.min, s->odata.max, s->odata.rng, s->odata.std); + println("eb-lossy", s->xdata.min, s->xdata.max, s->xdata.rng, s->xdata.std); + + printhead("", "abs-val", "abs-idx", "pw-rel", "VS-RNG"); + println("max-error", s->max_err.abs, s->max_err.idx, s->max_err.pwrrel, s->max_err.rel); + + printhead("", "CR", "NRMSE", "cross-cor", "PSNR"); + println("metrics", bytes / compressed_bytes, s->reduced.NRMSE, s->reduced.coeff, s->reduced.PSNR); + + // printf("\n"); + }; + + static void print_metrics_auto(double* lag1_cor, double* lag2_cor) + { + auto printhead = [](const char* s1, const char* s2, const char* s3, const char* s4, const char* s5) { + printf(" \e[1m\e[31m%-10s %16s %16s %16s %16s\e[0m\n", s1, s2, s3, s4, s5); + }; + + printhead("", "lag1-cor", "lag2-cor", "", ""); + printf(" %-10s %16lf %16lf\n", "auto", *lag1_cor, *lag2_cor); + printf("\n"); + }; + + template + static void echo_metric_gpu(T* reconstructed, T* origin, size_t len, size_t compressed_bytes = 0) + { + // cross + auto stat_x = new cusz_stats; + psz::thrustgpu_assess_quality(stat_x, reconstructed, origin, len); + print_metrics_cross(stat_x, compressed_bytes, true); + + auto stat_auto_lag1 = new cusz_stats; + psz::thrustgpu_assess_quality(stat_auto_lag1, origin, origin + 1, len - 1); + auto stat_auto_lag2 = new cusz_stats; + psz::thrustgpu_assess_quality(stat_auto_lag2, origin, origin + 2, len - 2); + + print_metrics_auto(&stat_auto_lag1->reduced.coeff, &stat_auto_lag2->reduced.coeff); + } + + template + static void echo_metric_cpu(T* _d1, T* _d2, size_t len, size_t compressed_bytes = 0, bool from_device = true) + { + auto stat = new cusz_stats; + T* reconstructed; + T* origin; + if (not from_device) 
{ + reconstructed = _d1; + origin = _d2; + } + else { + printf("allocating tmp space for CPU verification\n"); + auto bytes = sizeof(T) * len; + cudaMallocHost(&reconstructed, bytes); + cudaMallocHost(&origin, bytes); + cudaMemcpy(reconstructed, _d1, bytes, cudaMemcpyDeviceToHost); + cudaMemcpy(origin, _d2, bytes, cudaMemcpyDeviceToHost); + } + cusz::verify_data(stat, reconstructed, origin, len); + print_metrics_cross(stat, compressed_bytes, false); + + auto stat_auto_lag1 = new cusz_stats; + verify_data(stat_auto_lag1, origin, origin + 1, len - 1); + auto stat_auto_lag2 = new cusz_stats; + verify_data(stat_auto_lag2, origin, origin + 2, len - 2); + + print_metrics_auto(&stat_auto_lag1->reduced.coeff, &stat_auto_lag2->reduced.coeff); + + if (from_device) { + if (reconstructed) cudaFreeHost(reconstructed); + if (origin) cudaFreeHost(origin); + } + } + + template + static void load_origin(string const& fname, Capsule& origin) + { + origin.mallochost().malloc().fromfile(fname); + } + + template + static void view(header_t header, Capsule& xdata, Capsule& cmp, string const& compare) + { + auto len = ConfigHelper::get_uncompressed_len(header); + auto compressd_bytes = ConfigHelper::get_filesize(header); + + auto compare_on_gpu = [&]() { + cmp.mallochost().malloc().fromfile(compare).host2device(); + echo_metric_gpu(xdata.dptr(), cmp.dptr(), len, compressd_bytes); + cmp.freehost().free(); + }; + + auto compare_on_cpu = [&]() { + cmp.mallochost().fromfile(compare); + xdata.device2host(); + echo_metric_cpu(xdata.hptr(), cmp.hptr(), len, compressd_bytes); + cmp.freehost(); + }; + + if (compare != "") { + auto gb = 1.0 * sizeof(T) * len / 1e9; + if (gb < 0.8) + compare_on_gpu(); + else + compare_on_cpu(); + } + } +}; + +} // namespace cusz + +#endif diff --git a/qtensor/compression/cusz/include/cli/query.hh b/qtensor/compression/cusz/include/cli/query.hh index 91fcf65d..c09326c8 100644 --- a/qtensor/compression/cusz/include/cli/query.hh +++ b/qtensor/compression/cusz/include/cli/query.hh @@ -1,71 +1,71 @@ -/** - * @file query.hh - * @author Jiannan Tian - * @brief query machine information - * @version 0.1.3 - * @date 2020-10-05 - * - * @copyright (C) 2020 by Washington State University, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#ifndef QUERY_HH -#define QUERY_HH - -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "query_dev.hh" - -struct Diagnostics { - static std::string ExecShellCommand(const char* cmd) - { - std::array buffer; - std::string result; - std::unique_ptr pipe(popen(cmd, "r"), pclose); - if (!pipe) { throw std::runtime_error("popen() failed!"); } - while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) { result += buffer.data(); } - return result; - } - - static void GetMachineProperties() - { - std::vector v; - std::cout << "host information: " << std::endl; - - auto cpuinfo = ExecShellCommand( // - std::string("cat /proc/cpuinfo " - "| grep \"model name\" " - "| head -n 1 " - "| awk -F': ' '{print $NF}'") - .c_str()); - std::cout << " cpu model\t" << cpuinfo; - - auto meminfo = ExecShellCommand( // - std::string("cat /proc/meminfo" - "| grep \"MemTotal\" " - "| awk -F' ' '{print $2\" \"$3}'") - .c_str()); - - std::cout << " memory size\t" << meminfo; - - auto endianness = ExecShellCommand( // - std::string("lscpu " - "| grep Endian " - "| awk -F' ' '{print $NF}'") - .c_str()); - - std::cout << " byte order\t" << endianness; - printf("\n"); - } -}; - -#endif +/** + * @file query.hh + * @author 
Jiannan Tian + * @brief query machine information + * @version 0.1.3 + * @date 2020-10-05 + * + * @copyright (C) 2020 by Washington State University, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef QUERY_HH +#define QUERY_HH + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "query_dev.hh" + +struct Diagnostics { + static std::string ExecShellCommand(const char* cmd) + { + std::array buffer; + std::string result; + std::unique_ptr pipe(popen(cmd, "r"), pclose); + if (!pipe) { throw std::runtime_error("popen() failed!"); } + while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) { result += buffer.data(); } + return result; + } + + static void GetMachineProperties() + { + std::vector v; + std::cout << "host information: " << std::endl; + + auto cpuinfo = ExecShellCommand( // + std::string("cat /proc/cpuinfo " + "| grep \"model name\" " + "| head -n 1 " + "| awk -F': ' '{print $NF}'") + .c_str()); + std::cout << " cpu model\t" << cpuinfo; + + auto meminfo = ExecShellCommand( // + std::string("cat /proc/meminfo" + "| grep \"MemTotal\" " + "| awk -F' ' '{print $2\" \"$3}'") + .c_str()); + + std::cout << " memory size\t" << meminfo; + + auto endianness = ExecShellCommand( // + std::string("lscpu " + "| grep Endian " + "| awk -F' ' '{print $NF}'") + .c_str()); + + std::cout << " byte order\t" << endianness; + printf("\n"); + } +}; + +#endif diff --git a/qtensor/compression/cusz/include/cli/query_dev.hh b/qtensor/compression/cusz/include/cli/query_dev.hh index c2eb37aa..34a429ea 100644 --- a/qtensor/compression/cusz/include/cli/query_dev.hh +++ b/qtensor/compression/cusz/include/cli/query_dev.hh @@ -1,69 +1,69 @@ -/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of NVIDIA CORPORATION nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* This sample queries the properties of the CUDA devices present in the system - * via CUDA Runtime API. 
*/ - -/** - * @brief Get the Device Property object - * modified from `cuda-samples/Samples/deviceQuery/deviceQuery.cpp` - */ - -struct GpuDiagnostics { - static void GetDeviceProperty() - { - int num_dev = 0; - cudaError_t error_id = cudaGetDeviceCount(&num_dev); - - if (error_id != cudaSuccess) { - printf("cudaGetDeviceCount returned %d\n-> %s\n", static_cast(error_id), cudaGetErrorString(error_id)); - exit(EXIT_FAILURE); - } - if (num_dev == 0) { printf("NO CUDA device detected.\n"); } - int dev, driver_ver = 0, runtime_ver = 0; - - for (dev = 0; dev < num_dev; ++dev) { - cudaSetDevice(dev); - cudaDeviceProp dev_prop; - cudaGetDeviceProperties(&dev_prop, dev); - printf("device #%d, %s: \n", dev, dev_prop.name); - - cudaDriverGetVersion(&driver_ver); - cudaRuntimeGetVersion(&runtime_ver); - printf( - " driver/runtime\t%d.%d/%d.%d\n", driver_ver / 1000, (driver_ver % 100) / 10, runtime_ver / 1000, - (runtime_ver % 100) / 10); - printf(" compute capability:\t%d.%d\n", dev_prop.major, dev_prop.minor); - printf(" global memory:\t%.0f MiB\n", static_cast(dev_prop.totalGlobalMem / 1048576.0f)); - printf(" constant memory:\t%zu bytes\n", dev_prop.totalConstMem); - printf(" shared mem per block:\t%zu bytes\n", dev_prop.sharedMemPerBlock); - printf(" shared mem per SM:\t%zu bytes\n", dev_prop.sharedMemPerMultiprocessor); - printf(" registers per block:\t%d\n", dev_prop.regsPerBlock); - } - printf("\n"); - } +/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* This sample queries the properties of the CUDA devices present in the system + * via CUDA Runtime API. 
*/ + +/** + * @brief Get the Device Property object + * modified from `cuda-samples/Samples/deviceQuery/deviceQuery.cpp` + */ + +struct GpuDiagnostics { + static void GetDeviceProperty() + { + int num_dev = 0; + cudaError_t error_id = cudaGetDeviceCount(&num_dev); + + if (error_id != cudaSuccess) { + printf("cudaGetDeviceCount returned %d\n-> %s\n", static_cast(error_id), cudaGetErrorString(error_id)); + exit(EXIT_FAILURE); + } + if (num_dev == 0) { printf("NO CUDA device detected.\n"); } + int dev, driver_ver = 0, runtime_ver = 0; + + for (dev = 0; dev < num_dev; ++dev) { + cudaSetDevice(dev); + cudaDeviceProp dev_prop; + cudaGetDeviceProperties(&dev_prop, dev); + printf("device #%d, %s: \n", dev, dev_prop.name); + + cudaDriverGetVersion(&driver_ver); + cudaRuntimeGetVersion(&runtime_ver); + printf( + " driver/runtime\t%d.%d/%d.%d\n", driver_ver / 1000, (driver_ver % 100) / 10, runtime_ver / 1000, + (runtime_ver % 100) / 10); + printf(" compute capability:\t%d.%d\n", dev_prop.major, dev_prop.minor); + printf(" global memory:\t%.0f MiB\n", static_cast(dev_prop.totalGlobalMem / 1048576.0f)); + printf(" constant memory:\t%zu bytes\n", dev_prop.totalConstMem); + printf(" shared mem per block:\t%zu bytes\n", dev_prop.sharedMemPerBlock); + printf(" shared mem per SM:\t%zu bytes\n", dev_prop.sharedMemPerMultiprocessor); + printf(" registers per block:\t%d\n", dev_prop.regsPerBlock); + } + printf("\n"); + } }; \ No newline at end of file diff --git a/qtensor/compression/cusz/include/cli/timerecord_viewer.hh b/qtensor/compression/cusz/include/cli/timerecord_viewer.hh index 9e245073..52baac95 100644 --- a/qtensor/compression/cusz/include/cli/timerecord_viewer.hh +++ b/qtensor/compression/cusz/include/cli/timerecord_viewer.hh @@ -1,109 +1,109 @@ -/** - * @file timerecord_viewer.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-04-09 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef CLI_TIMERECORD_VIEWER_HH -#define CLI_TIMERECORD_VIEWER_HH - -#include -#include "../common/definition.hh" - -namespace cusz { - -struct TimeRecordViewer { - static float get_throughput(float milliseconds, size_t bytes) - { - auto GiB = 1.0 * 1024 * 1024 * 1024; - auto seconds = milliseconds * 1e-3; - return bytes / GiB / seconds; - } - - static void println_throughput(const char* s, float timer, size_t bytes) - { - if (timer == 0.0) return; - - auto t = get_throughput(timer, bytes); - printf(" %-12s %'12f %'10.2f\n", s, timer, t); - }; - - static void println_throughput_tablehead() - { - printf( - "\n \e[1m\e[31m%-12s %12s %10s\e[0m\n", // - const_cast("kernel"), // - const_cast("time, ms"), // - const_cast("GiB/s") // - ); - } - - static double get_total_time(timerecord_t r) - { - double total = 0.0; - std::for_each(r->begin(), r->end(), [&](TimeRecordTuple t) { return total += std::get<1>(t); }); - return total; - } - static void view_compression(timerecord_t r, size_t bytes, size_t compressed_bytes = 0) - { - auto report_cr = [&]() { - auto cr = 1.0 * bytes / compressed_bytes; - if (compressed_bytes != 0) printf(" %-*s %.2f\n", 20, "compression ratio", cr); - }; - - TimeRecord reflow; - - { // reflow - TimeRecordTuple book_tuple; - - auto total_time = get_total_time(r); - auto subtotal_time = total_time; - - for (auto& i : *r) { - auto item = std::string(std::get<0>(i)); - if (item == "book") { - book_tuple = i; - subtotal_time -= std::get<1>(i); - } - else { - reflow.push_back(i); - } - } - reflow.push_back({const_cast("(subtotal)"), subtotal_time}); 
- printf("\e[2m"); - reflow.push_back(book_tuple); - reflow.push_back({const_cast("(total)"), total_time}); - printf("\e[0m"); - } - - printf("\n(c) COMPRESSION REPORT\n"); - report_cr(); - - ReportHelper::println_throughput_tablehead(); - for (auto& i : reflow) ReportHelper::println_throughput(std::get<0>(i), std::get<1>(i), bytes); - - printf("\n"); - } - - static void view_decompression(timerecord_t r, size_t bytes) - { - printf("\n(d) deCOMPRESSION REPORT\n"); - - auto total_time = get_total_time(r); - (*r).push_back({const_cast("(total)"), total_time}); - - ReportHelper::println_throughput_tablehead(); - for (auto& i : *r) ReportHelper::println_throughput(std::get<0>(i), std::get<1>(i), bytes); - - printf("\n"); - } -}; - -} // namespace cusz - -#endif +/** + * @file timerecord_viewer.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-09 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CLI_TIMERECORD_VIEWER_HH +#define CLI_TIMERECORD_VIEWER_HH + +#include +#include "../common/definition.hh" + +namespace cusz { + +struct TimeRecordViewer { + static float get_throughput(float milliseconds, size_t bytes) + { + auto GiB = 1.0 * 1024 * 1024 * 1024; + auto seconds = milliseconds * 1e-3; + return bytes / GiB / seconds; + } + + static void println_throughput(const char* s, float timer, size_t bytes) + { + if (timer == 0.0) return; + + auto t = get_throughput(timer, bytes); + printf(" %-12s %'12f %'10.2f\n", s, timer, t); + }; + + static void println_throughput_tablehead() + { + printf( + "\n \e[1m\e[31m%-12s %12s %10s\e[0m\n", // + const_cast("kernel"), // + const_cast("time, ms"), // + const_cast("GiB/s") // + ); + } + + static double get_total_time(timerecord_t r) + { + double total = 0.0; + std::for_each(r->begin(), r->end(), [&](TimeRecordTuple t) { return total += std::get<1>(t); }); + return total; + } + static void view_compression(timerecord_t r, size_t bytes, size_t compressed_bytes = 0) + { + auto report_cr = [&]() { + auto cr = 1.0 * bytes / compressed_bytes; + if (compressed_bytes != 0) printf(" %-*s %.2f\n", 20, "compression ratio", cr); + }; + + TimeRecord reflow; + + { // reflow + TimeRecordTuple book_tuple; + + auto total_time = get_total_time(r); + auto subtotal_time = total_time; + + for (auto& i : *r) { + auto item = std::string(std::get<0>(i)); + if (item == "book") { + book_tuple = i; + subtotal_time -= std::get<1>(i); + } + else { + reflow.push_back(i); + } + } + reflow.push_back({const_cast("(subtotal)"), subtotal_time}); + printf("\e[2m"); + reflow.push_back(book_tuple); + reflow.push_back({const_cast("(total)"), total_time}); + printf("\e[0m"); + } + + printf("\n(c) COMPRESSION REPORT\n"); + report_cr(); + + ReportHelper::println_throughput_tablehead(); + for (auto& i : reflow) ReportHelper::println_throughput(std::get<0>(i), std::get<1>(i), bytes); + + printf("\n"); + } + + static void view_decompression(timerecord_t r, size_t bytes) + { + printf("\n(d) deCOMPRESSION REPORT\n"); + + auto total_time = get_total_time(r); + (*r).push_back({const_cast("(total)"), total_time}); + + ReportHelper::println_throughput_tablehead(); + for (auto& i : *r) ReportHelper::println_throughput(std::get<0>(i), std::get<1>(i), bytes); + + printf("\n"); + } +}; + +} // namespace cusz + +#endif diff --git a/qtensor/compression/cusz/include/cli/verify.hh b/qtensor/compression/cusz/include/cli/verify.hh index 1e856021..621a0077 100644 --- a/qtensor/compression/cusz/include/cli/verify.hh +++ 
b/qtensor/compression/cusz/include/cli/verify.hh @@ -1,87 +1,87 @@ -#ifndef ANALYSIS_VERIFY_HH -#define ANALYSIS_VERIFY_HH - -/** - * @file verify.hh - * @author Jiannan Tian - * @brief Verification of decompressed data. - * @version 0.2 - * @date 2020-09-20 - * Created on: 2019-09-30 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#include "../common.hh" -#include "../cusz/type.h" - -using namespace std; - -namespace cusz { - -template -void verify_data(cusz_stats* s, T* xdata, T* odata, size_t len) -{ - double max_odata = odata[0], min_odata = odata[0]; - double max_xdata = xdata[0], min_xdata = xdata[0]; - double max_abserr = max_abserr = fabs(xdata[0] - odata[0]); - - double sum_0 = 0, sum_x = 0; - for (size_t i = 0; i < len; i++) sum_0 += odata[i], sum_x += xdata[i]; - - double mean_odata = sum_0 / len, mean_xdata = sum_x / len; - double sum_var_odata = 0, sum_var_xdata = 0, sum_err2 = 0, sum_corr = 0, rel_abserr = 0; - - double max_pwrrel_abserr = 0; - size_t max_abserr_index = 0; - for (size_t i = 0; i < len; i++) { - max_odata = max_odata < odata[i] ? odata[i] : max_odata; - min_odata = min_odata > odata[i] ? odata[i] : min_odata; - - max_xdata = max_xdata < odata[i] ? odata[i] : max_xdata; - min_xdata = min_xdata > xdata[i] ? xdata[i] : min_xdata; - - float abserr = fabs(xdata[i] - odata[i]); - if (odata[i] != 0) { - rel_abserr = abserr / fabs(odata[i]); - max_pwrrel_abserr = max_pwrrel_abserr < rel_abserr ? rel_abserr : max_pwrrel_abserr; - } - max_abserr_index = max_abserr < abserr ? i : max_abserr_index; - max_abserr = max_abserr < abserr ? abserr : max_abserr; - sum_corr += (odata[i] - mean_odata) * (xdata[i] - mean_xdata); - sum_var_odata += (odata[i] - mean_odata) * (odata[i] - mean_odata); - sum_var_xdata += (xdata[i] - mean_xdata) * (xdata[i] - mean_xdata); - sum_err2 += abserr * abserr; - } - double std_odata = sqrt(sum_var_odata / len); - double std_xdata = sqrt(sum_var_xdata / len); - double ee = sum_corr / len; - - s->len = len; - - s->odata.max = max_odata; - s->odata.min = min_odata; - s->odata.rng = max_odata - min_odata; - s->odata.std = std_odata; - - s->xdata.max = max_xdata; - s->xdata.min = min_xdata; - s->xdata.rng = max_xdata - min_xdata; - s->xdata.std = std_xdata; - - s->max_err.idx = max_abserr_index; - s->max_err.abs = max_abserr; - s->max_err.rel = max_abserr / s->odata.rng; - s->max_err.pwrrel = max_pwrrel_abserr; - - s->reduced.coeff = ee / std_odata / std_xdata; - s->reduced.MSE = sum_err2 / len; - s->reduced.NRMSE = sqrt(s->reduced.MSE) / s->odata.rng; - s->reduced.PSNR = 20 * log10(s->odata.rng) - 10 * log10(s->reduced.MSE); -} - -} // namespace cusz - -#endif +#ifndef ANALYSIS_VERIFY_HH +#define ANALYSIS_VERIFY_HH + +/** + * @file verify.hh + * @author Jiannan Tian + * @brief Verification of decompressed data. 
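+ *
+ * In outline, verify_data fills a cusz_stats record with per-array min/max/range/std
+ * for the original (odata) and decompressed (xdata) series, the largest absolute,
+ * range-relative and pointwise-relative errors (plus the index of the largest absolute
+ * error), the Pearson correlation coefficient, and the reduced metrics
+ *     NRMSE = sqrt(MSE) / rng(odata)
+ *     PSNR  = 20 * log10(rng(odata)) - 10 * log10(MSE)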
+ * @version 0.2 + * @date 2020-09-20 + * Created on: 2019-09-30 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include "../common.hh" +#include "../cusz/type.h" + +using namespace std; + +namespace cusz { + +template +void verify_data(cusz_stats* s, T* xdata, T* odata, size_t len) +{ + double max_odata = odata[0], min_odata = odata[0]; + double max_xdata = xdata[0], min_xdata = xdata[0]; + double max_abserr = max_abserr = fabs(xdata[0] - odata[0]); + + double sum_0 = 0, sum_x = 0; + for (size_t i = 0; i < len; i++) sum_0 += odata[i], sum_x += xdata[i]; + + double mean_odata = sum_0 / len, mean_xdata = sum_x / len; + double sum_var_odata = 0, sum_var_xdata = 0, sum_err2 = 0, sum_corr = 0, rel_abserr = 0; + + double max_pwrrel_abserr = 0; + size_t max_abserr_index = 0; + for (size_t i = 0; i < len; i++) { + max_odata = max_odata < odata[i] ? odata[i] : max_odata; + min_odata = min_odata > odata[i] ? odata[i] : min_odata; + + max_xdata = max_xdata < odata[i] ? odata[i] : max_xdata; + min_xdata = min_xdata > xdata[i] ? xdata[i] : min_xdata; + + float abserr = fabs(xdata[i] - odata[i]); + if (odata[i] != 0) { + rel_abserr = abserr / fabs(odata[i]); + max_pwrrel_abserr = max_pwrrel_abserr < rel_abserr ? rel_abserr : max_pwrrel_abserr; + } + max_abserr_index = max_abserr < abserr ? i : max_abserr_index; + max_abserr = max_abserr < abserr ? abserr : max_abserr; + sum_corr += (odata[i] - mean_odata) * (xdata[i] - mean_xdata); + sum_var_odata += (odata[i] - mean_odata) * (odata[i] - mean_odata); + sum_var_xdata += (xdata[i] - mean_xdata) * (xdata[i] - mean_xdata); + sum_err2 += abserr * abserr; + } + double std_odata = sqrt(sum_var_odata / len); + double std_xdata = sqrt(sum_var_xdata / len); + double ee = sum_corr / len; + + s->len = len; + + s->odata.max = max_odata; + s->odata.min = min_odata; + s->odata.rng = max_odata - min_odata; + s->odata.std = std_odata; + + s->xdata.max = max_xdata; + s->xdata.min = min_xdata; + s->xdata.rng = max_xdata - min_xdata; + s->xdata.std = std_xdata; + + s->max_err.idx = max_abserr_index; + s->max_err.abs = max_abserr; + s->max_err.rel = max_abserr / s->odata.rng; + s->max_err.pwrrel = max_pwrrel_abserr; + + s->reduced.coeff = ee / std_odata / std_xdata; + s->reduced.MSE = sum_err2 / len; + s->reduced.NRMSE = sqrt(s->reduced.MSE) / s->odata.rng; + s->reduced.PSNR = 20 * log10(s->odata.rng) - 10 * log10(s->reduced.MSE); +} + +} // namespace cusz + +#endif diff --git a/qtensor/compression/cusz/include/common.hh b/qtensor/compression/cusz/include/common.hh index 5d2bf33e..b2741954 100644 --- a/qtensor/compression/cusz/include/common.hh +++ b/qtensor/compression/cusz/include/common.hh @@ -1,19 +1,19 @@ -/** - * @file common.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2021-09-26 - * - * (C) 2021 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef CUSZ_COMMON_HH -#define CUSZ_COMMON_HH - -#include "common/configs.hh" -#include "common/definition.hh" -#include "common/type_traits.hh" - +/** + * @file common.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2021-09-26 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_COMMON_HH +#define CUSZ_COMMON_HH + +#include "common/configs.hh" +#include "common/definition.hh" +#include "common/type_traits.hh" + #endif \ No newline at end of file diff --git 
a/qtensor/compression/cusz/include/common/capsule.hh b/qtensor/compression/cusz/include/common/capsule.hh index 05d8ebf6..be1f1f1b 100644 --- a/qtensor/compression/cusz/include/common/capsule.hh +++ b/qtensor/compression/cusz/include/common/capsule.hh @@ -1,402 +1,402 @@ -/** - * @file capsule.hh - * @author Jiannan Tian - * @brief Simple data analysis (header) - * @version 0.2.3 - * @date 2020-11-03 - * (create) 2020-11-03 (rev1) 2021-03-24 (rev2) 2021-09-08 - * @deprecated 0.3.2 - * - * @copyright (C) 2020 by Washington State University, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#ifndef CAPSULE_HH -#define CAPSULE_HH - -#if __cplusplus >= 201703L -#define CONSTEXPR constexpr -#else -#define CONSTEXPR -#endif - -#include -#include - -#include -#include -#include -#include -#include - -#include "../stat/compare_gpu.hh" -// #include "../utils/io.hh" -#include "../utils/timer.hh" -#include "definition.hh" - -template -class Capsule { - private: - // variables - struct { - bool hptr{false}, dptr{false}, uniptr{false}; - } alloc_status; - - T *_dptr{nullptr}, *_hptr{nullptr}, *_uniptr{nullptr}; - - uint32_t _len{0}; - dim3 _len3{1, 1, 1}, _stride3{1, 1, 1}; - - std::string name; - - // logging setup; standalone - const std::string LOG_NULL = " "; - const std::string LOG_INFO = " :: "; - const std::string LOG_ERR = " ERR "; - const std::string LOG_WARN = "WARN "; - const std::string LOG_DBG = " dbg "; - const std::string LOG_EXCEPTION = " !! "; - - // https://stackoverflow.com/a/26080768/8740097 CC BY-SA 3.0 - template - void build_string(std::ostream& o, S t) - { - o << t << " "; - } - - template - void build_string(std::ostream& o, S t, Args... args) // recursive variadic function - { - build_string(o, t); - build_string(o, args...); - } - - template - void LOGGING(const std::string& log_head, Args... 
args) - { - std::ostringstream oss; - oss << log_head; - build_string(oss, args...); - - oss.seekp(0, std::ios::end); - std::stringstream::pos_type offset = oss.tellp(); - if (log_head == LOG_DBG) { std::cout << "\e[2m"; } // dbg - std::cout << oss.str() << std::endl; // print content - if (log_head == LOG_DBG) std::cout << "\e[0m"; // finish printing dbg - } - - // IO - int fs2mem(const char* fname, void* array, size_t num_els) - { - auto bytes = sizeof(T) * num_els; - - std::ifstream ifs(fname, std::ios::binary | std::ios::in); - if (not ifs.is_open()) { - std::cerr << "fail to open " << fname << std::endl; - return -1; - } - ifs.read(reinterpret_cast(array), std::streamsize(bytes)); - ifs.close(); - - return 0; - } - - int mem2fs(const char* fname, void* array, size_t num_els) - { - auto bytes = sizeof(type) * num_els; - - std::ofstream ofs(fname, std::ios::binary | std::ios::out); - if (not ofs.is_open()) { - std::cerr << "fail to open " << fname << std::endl; - return -1; - } - - ofs.write(reinterpret_cast(array), std::streamsize(bytes)); - ofs.close(); - - return 0; - } - - std::string ERRSTR_BUILDER(std::string func, std::string msg) - { - return "[Capsule(\"" + name + "\")::" + func + "] " + msg; - } - - void check_len(std::string funcname) - { - if (_len == 0) throw std::runtime_error("[Capsule(\"" + name + "\")::" + funcname + "] " + "len == 0"); - } - - std::string ERROR_UNDEFINED_BEHAVIOR(std::string func, std::string msg = "undefined behavior") - { // - return ERRSTR_BUILDER(func, "undefined behavior"); - } - - public: - using type = T; - - // TODO rule of n - // constructor - Capsule() = default; - Capsule(const std::string _str) : name(_str){}; - Capsule(uint32_t len, const std::string _str = std::string("")) : _len(len), name(_str) {} - Capsule(uint32_t x, uint32_t y, uint32_t z, const std::string _str = std::string("")) : name(_str) - { - _len3 = dim3(x, y, z); - _len = x * y * z; - } - - ~Capsule() - { - // Becasue _hptr can be obtained externally, and could be non-pinned, cudaFreeHost may not work properly. 
- // if (alloc_status.hptr) cudaFreeHost(_hptr); - - if (alloc_status.dptr) cudaFree(_dptr); - if (alloc_status.uniptr) cudaFree(_uniptr); - } - - // getter start -------------------- - T*& dptr() { return _dptr; } - T*& hptr() { return _hptr; } - T*& uniptr() { return _uniptr; } - - uint32_t len() const { return _len; } - dim3 len3() const { return _len3; } - dim3 stride3() const { return _stride3; } - // 1D - T& dptr(uint32_t i) { return _dptr[i]; } - T& hptr(uint32_t i) { return _hptr[i]; } - T& uniptr(uint32_t i) { return _uniptr[i]; } - // 2D - T& dptr(uint32_t x, uint32_t y) { return _dptr[x + y * _stride3.y]; } - T& hptr(uint32_t x, uint32_t y) { return _hptr[x + y * _stride3.y]; } - T& uniptr(uint32_t x, uint32_t y) { return _uniptr[x + y * _stride3.y]; } - // 3D - T& dptr(uint32_t x, uint32_t y, uint32_t z) { return _dptr[x + y * _stride3.y + z * _stride3.z]; } - T& hptr(uint32_t x, uint32_t y, uint32_t z) { return _hptr[x + y * _stride3.y + z * _stride3.z]; } - T& uniptr(uint32_t x, uint32_t y, uint32_t z) { return _uniptr[x + y * _stride3.y + z * _stride3.z]; } - // getter end ----------------------- - - // setter start --------------------- - Capsule& set_hptr(T* ptr) - { - _hptr = ptr, alloc_status.hptr = true; - return *this; - } - Capsule& set_dptr(T* ptr) - { - _dptr = ptr, alloc_status.dptr = true; - return *this; - } - Capsule& set_uniptr(T* ptr) - { - _uniptr = ptr, alloc_status.uniptr = true; - return *this; - } - - // variable len - Capsule& set_len(uint32_t len) - { - if (len <= 0) throw std::runtime_error("length must be greater than 0"); - _len = len; - return *this; - } - - Capsule& set_len3(uint32_t x, uint32_t y = 1, uint32_t z = 1) - { - if (x == 1) throw std::runtime_error("x must be > 1."); - if (x * y * z == 0) throw std::runtime_error("x, y, z must be non-zero."); - - _len3 = dim3(x, y, z); - _stride3 = dim3(1, x, x * y); - _len = x * y * z; - - return *this; - } - // setter end ---------------------- - - // debug - void debug() - { - printf("Capsule debugging information\n"); - printf(" name : %s\n", name.c_str()); - printf(" len : %u\n", len()); - printf(" hptr : %s\n", alloc_status.hptr ? "set" : "not set"); - printf(" dptr : %s\n", alloc_status.dptr ? "set" : "not set"); - printf(" uniptr : %s\n", alloc_status.uniptr ? 
"set" : "not set"); - } - - // for debugging - Capsule& set_name(std::string _str) - { - name = _str; - return *this; - } - - // IO - Capsule& fromfile(std::string fname, double* time = nullptr) - { - if (not _hptr) throw std::runtime_error(ERRSTR_BUILDER("fromfile", "_hptr not set")); - if (_len == 0) throw std::runtime_error(ERRSTR_BUILDER("fromfile", "len == 0")); - - auto a = hires::now(); - fs2mem(fname.c_str(), _hptr, _len); - auto z = hires::now(); - - if (time) *time = static_cast(z - a).count(); - - return *this; - } - - Capsule& tofile(std::string fname, double* time = nullptr) - { - if (not _hptr) { throw std::runtime_error(ERRSTR_BUILDER("tofile", "_hptr not set")); } - if (_len == 0) throw std::runtime_error(ERRSTR_BUILDER("tofile", "len == 0")); - - auto a = hires::now(); - mem2fs(fname.c_str(), _hptr, _len); - auto z = hires::now(); - - if (time) *time = static_cast(z - a).count(); - - return *this; - } - - uint32_t nbyte() const { return _len * sizeof(T); } - - // memcpy h2d, synchronous - Capsule& host2device() - { - check_len("host2device"); - - cudaMemcpy(_dptr, _hptr, nbyte(), cudaMemcpyHostToDevice); - return *this; - } - // memcpy d2h, synchronous - Capsule& device2host() - { - check_len("device2host"); - - cudaMemcpy(_hptr, _dptr, nbyte(), cudaMemcpyDeviceToHost); - return *this; - } - // memcpy h2d, asynchronous - Capsule& host2device_async(cudaStream_t stream) - { - check_len("host2device_async"); - - cudaMemcpyAsync(_dptr, _hptr, nbyte(), cudaMemcpyHostToDevice, stream); - return *this; - } - // memcpy d2h, asynchronous - Capsule& device2host_async(cudaStream_t stream) - { - check_len("device2host_async"); - - cudaMemcpyAsync(_hptr, _dptr, nbyte(), cudaMemcpyDeviceToHost, stream); - return *this; - } - // shorthand - Capsule& h2d() { return host2device(); } - Capsule& d2h() { return device2host(); } - Capsule& async_h2d(cudaStream_t stream) { return host2device_async(stream); } - Capsule& async_d2h(cudaStream_t stream) { return device2host_async(stream); } - - // cudaMalloc wrapper - Capsule& malloc(bool do_memset = true, uint8_t memset_val = 0) - { - check_len("malloc"); - - if (alloc_status.dptr) - LOGGING(LOG_WARN, "already allocated on device"); - else { - cudaMalloc(&_dptr, nbyte()); - cudaMemset(_dptr, memset_val, nbyte()); - alloc_status.dptr = true; - } - return *this; - } - // cudaMallocHost wrapper, pinned - Capsule& mallochost(bool do_memset = true, uint8_t memset_val = 0) - { - check_len("mallochost"); - - if (alloc_status.hptr) - LOGGING(LOG_WARN, "already allocated on host"); - else { - cudaMallocHost(&_hptr, nbyte()); - memset(_hptr, memset_val, nbyte()); - alloc_status.hptr = true; - } - return *this; - } - // cudaMallocManaged wrapper - Capsule& mallocmanaged(bool do_memset = true, uint8_t memset_val = 0) - { - check_len("mallocmanaged"); - - if (alloc_status.uniptr) - LOGGING(LOG_WARN, "already allocated as unified"); - else { - cudaMallocManaged(&_uniptr, nbyte()); - cudaMemset(_uniptr, memset_val, nbyte()); - alloc_status.uniptr = true; - } - return *this; - } - // cudaFree wrapper - Capsule& free() - { - if (not _dptr) throw std::runtime_error(ERRSTR_BUILDER("free", "_dptr is null")); - cudaFree(_dptr); - alloc_status.dptr = false; - return *this; - } - // cudaFreeHost wrapper - Capsule& freehost() - { - if (not _hptr) throw std::runtime_error(ERRSTR_BUILDER("free", "_hptr is null")); - cudaFreeHost(_hptr); - alloc_status.hptr = false; - return *this; - } - // cudaFree wrapper, but for unified memory - Capsule& freemanaged() - { - if (not 
_uniptr) throw std::runtime_error(ERRSTR_BUILDER("free", "_uniptr is null")); - cudaFree(_uniptr); - alloc_status.uniptr = false; - return *this; - } - - private: - double maxval, minval, rng; - - public: - double get_maxval() { return maxval; } - double get_minval() { return minval; } - double get_rng() { return rng; } - - // data scan - Capsule& prescan(double& max_value, double& min_value, double& rng) - { - // may not work for _uniptr - T result[4]; - psz::thrustgpu_get_extrema_rawptr(_dptr, _len, result); - - min_value = result[0]; - max_value = result[1]; - rng = max_value - min_value; - - return *this; - } - // data scan - Capsule& prescan() - { - prescan(maxval, minval, rng); - return *this; - } -}; - -#endif +/** + * @file capsule.hh + * @author Jiannan Tian + * @brief Simple data analysis (header) + * @version 0.2.3 + * @date 2020-11-03 + * (create) 2020-11-03 (rev1) 2021-03-24 (rev2) 2021-09-08 + * @deprecated 0.3.2 + * + * @copyright (C) 2020 by Washington State University, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef CAPSULE_HH +#define CAPSULE_HH + +#if __cplusplus >= 201703L +#define CONSTEXPR constexpr +#else +#define CONSTEXPR +#endif + +#include +#include + +#include +#include +#include +#include +#include + +#include "../stat/compare_gpu.hh" +// #include "../utils/io.hh" +#include "../utils/timer.hh" +#include "definition.hh" + +template +class Capsule { + private: + // variables + struct { + bool hptr{false}, dptr{false}, uniptr{false}; + } alloc_status; + + T *_dptr{nullptr}, *_hptr{nullptr}, *_uniptr{nullptr}; + + uint32_t _len{0}; + dim3 _len3{1, 1, 1}, _stride3{1, 1, 1}; + + std::string name; + + // logging setup; standalone + const std::string LOG_NULL = " "; + const std::string LOG_INFO = " :: "; + const std::string LOG_ERR = " ERR "; + const std::string LOG_WARN = "WARN "; + const std::string LOG_DBG = " dbg "; + const std::string LOG_EXCEPTION = " !! "; + + // https://stackoverflow.com/a/26080768/8740097 CC BY-SA 3.0 + template + void build_string(std::ostream& o, S t) + { + o << t << " "; + } + + template + void build_string(std::ostream& o, S t, Args... args) // recursive variadic function + { + build_string(o, t); + build_string(o, args...); + } + + template + void LOGGING(const std::string& log_head, Args... 
args) + { + std::ostringstream oss; + oss << log_head; + build_string(oss, args...); + + oss.seekp(0, std::ios::end); + std::stringstream::pos_type offset = oss.tellp(); + if (log_head == LOG_DBG) { std::cout << "\e[2m"; } // dbg + std::cout << oss.str() << std::endl; // print content + if (log_head == LOG_DBG) std::cout << "\e[0m"; // finish printing dbg + } + + // IO + int fs2mem(const char* fname, void* array, size_t num_els) + { + auto bytes = sizeof(T) * num_els; + + std::ifstream ifs(fname, std::ios::binary | std::ios::in); + if (not ifs.is_open()) { + std::cerr << "fail to open " << fname << std::endl; + return -1; + } + ifs.read(reinterpret_cast(array), std::streamsize(bytes)); + ifs.close(); + + return 0; + } + + int mem2fs(const char* fname, void* array, size_t num_els) + { + auto bytes = sizeof(type) * num_els; + + std::ofstream ofs(fname, std::ios::binary | std::ios::out); + if (not ofs.is_open()) { + std::cerr << "fail to open " << fname << std::endl; + return -1; + } + + ofs.write(reinterpret_cast(array), std::streamsize(bytes)); + ofs.close(); + + return 0; + } + + std::string ERRSTR_BUILDER(std::string func, std::string msg) + { + return "[Capsule(\"" + name + "\")::" + func + "] " + msg; + } + + void check_len(std::string funcname) + { + if (_len == 0) throw std::runtime_error("[Capsule(\"" + name + "\")::" + funcname + "] " + "len == 0"); + } + + std::string ERROR_UNDEFINED_BEHAVIOR(std::string func, std::string msg = "undefined behavior") + { // + return ERRSTR_BUILDER(func, "undefined behavior"); + } + + public: + using type = T; + + // TODO rule of n + // constructor + Capsule() = default; + Capsule(const std::string _str) : name(_str){}; + Capsule(uint32_t len, const std::string _str = std::string("")) : _len(len), name(_str) {} + Capsule(uint32_t x, uint32_t y, uint32_t z, const std::string _str = std::string("")) : name(_str) + { + _len3 = dim3(x, y, z); + _len = x * y * z; + } + + ~Capsule() + { + // Becasue _hptr can be obtained externally, and could be non-pinned, cudaFreeHost may not work properly. 
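+        // In other words, the destructor only releases buffers this Capsule allocated on
+        // its own (dptr/uniptr). A host pointer may have been injected via set_hptr() and
+        // may be pageable rather than pinned, so callers that used mallochost() are
+        // expected to release it explicitly with freehost() (as QualityViewer::view does).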
+ // if (alloc_status.hptr) cudaFreeHost(_hptr); + + if (alloc_status.dptr) cudaFree(_dptr); + if (alloc_status.uniptr) cudaFree(_uniptr); + } + + // getter start -------------------- + T*& dptr() { return _dptr; } + T*& hptr() { return _hptr; } + T*& uniptr() { return _uniptr; } + + uint32_t len() const { return _len; } + dim3 len3() const { return _len3; } + dim3 stride3() const { return _stride3; } + // 1D + T& dptr(uint32_t i) { return _dptr[i]; } + T& hptr(uint32_t i) { return _hptr[i]; } + T& uniptr(uint32_t i) { return _uniptr[i]; } + // 2D + T& dptr(uint32_t x, uint32_t y) { return _dptr[x + y * _stride3.y]; } + T& hptr(uint32_t x, uint32_t y) { return _hptr[x + y * _stride3.y]; } + T& uniptr(uint32_t x, uint32_t y) { return _uniptr[x + y * _stride3.y]; } + // 3D + T& dptr(uint32_t x, uint32_t y, uint32_t z) { return _dptr[x + y * _stride3.y + z * _stride3.z]; } + T& hptr(uint32_t x, uint32_t y, uint32_t z) { return _hptr[x + y * _stride3.y + z * _stride3.z]; } + T& uniptr(uint32_t x, uint32_t y, uint32_t z) { return _uniptr[x + y * _stride3.y + z * _stride3.z]; } + // getter end ----------------------- + + // setter start --------------------- + Capsule& set_hptr(T* ptr) + { + _hptr = ptr, alloc_status.hptr = true; + return *this; + } + Capsule& set_dptr(T* ptr) + { + _dptr = ptr, alloc_status.dptr = true; + return *this; + } + Capsule& set_uniptr(T* ptr) + { + _uniptr = ptr, alloc_status.uniptr = true; + return *this; + } + + // variable len + Capsule& set_len(uint32_t len) + { + if (len <= 0) throw std::runtime_error("length must be greater than 0"); + _len = len; + return *this; + } + + Capsule& set_len3(uint32_t x, uint32_t y = 1, uint32_t z = 1) + { + if (x == 1) throw std::runtime_error("x must be > 1."); + if (x * y * z == 0) throw std::runtime_error("x, y, z must be non-zero."); + + _len3 = dim3(x, y, z); + _stride3 = dim3(1, x, x * y); + _len = x * y * z; + + return *this; + } + // setter end ---------------------- + + // debug + void debug() + { + printf("Capsule debugging information\n"); + printf(" name : %s\n", name.c_str()); + printf(" len : %u\n", len()); + printf(" hptr : %s\n", alloc_status.hptr ? "set" : "not set"); + printf(" dptr : %s\n", alloc_status.dptr ? "set" : "not set"); + printf(" uniptr : %s\n", alloc_status.uniptr ? 
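+        // Layout note: the 1D/2D/3D accessors above treat x as the fastest-varying
+        // dimension; set_len3() fixes _stride3 = (1, x, x*y), so the flat index is
+        // x + y * _stride3.y + z * _stride3.z.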
"set" : "not set"); + } + + // for debugging + Capsule& set_name(std::string _str) + { + name = _str; + return *this; + } + + // IO + Capsule& fromfile(std::string fname, double* time = nullptr) + { + if (not _hptr) throw std::runtime_error(ERRSTR_BUILDER("fromfile", "_hptr not set")); + if (_len == 0) throw std::runtime_error(ERRSTR_BUILDER("fromfile", "len == 0")); + + auto a = hires::now(); + fs2mem(fname.c_str(), _hptr, _len); + auto z = hires::now(); + + if (time) *time = static_cast(z - a).count(); + + return *this; + } + + Capsule& tofile(std::string fname, double* time = nullptr) + { + if (not _hptr) { throw std::runtime_error(ERRSTR_BUILDER("tofile", "_hptr not set")); } + if (_len == 0) throw std::runtime_error(ERRSTR_BUILDER("tofile", "len == 0")); + + auto a = hires::now(); + mem2fs(fname.c_str(), _hptr, _len); + auto z = hires::now(); + + if (time) *time = static_cast(z - a).count(); + + return *this; + } + + uint32_t nbyte() const { return _len * sizeof(T); } + + // memcpy h2d, synchronous + Capsule& host2device() + { + check_len("host2device"); + + cudaMemcpy(_dptr, _hptr, nbyte(), cudaMemcpyHostToDevice); + return *this; + } + // memcpy d2h, synchronous + Capsule& device2host() + { + check_len("device2host"); + + cudaMemcpy(_hptr, _dptr, nbyte(), cudaMemcpyDeviceToHost); + return *this; + } + // memcpy h2d, asynchronous + Capsule& host2device_async(cudaStream_t stream) + { + check_len("host2device_async"); + + cudaMemcpyAsync(_dptr, _hptr, nbyte(), cudaMemcpyHostToDevice, stream); + return *this; + } + // memcpy d2h, asynchronous + Capsule& device2host_async(cudaStream_t stream) + { + check_len("device2host_async"); + + cudaMemcpyAsync(_hptr, _dptr, nbyte(), cudaMemcpyDeviceToHost, stream); + return *this; + } + // shorthand + Capsule& h2d() { return host2device(); } + Capsule& d2h() { return device2host(); } + Capsule& async_h2d(cudaStream_t stream) { return host2device_async(stream); } + Capsule& async_d2h(cudaStream_t stream) { return device2host_async(stream); } + + // cudaMalloc wrapper + Capsule& malloc(bool do_memset = true, uint8_t memset_val = 0) + { + check_len("malloc"); + + if (alloc_status.dptr) + LOGGING(LOG_WARN, "already allocated on device"); + else { + cudaMalloc(&_dptr, nbyte()); + cudaMemset(_dptr, memset_val, nbyte()); + alloc_status.dptr = true; + } + return *this; + } + // cudaMallocHost wrapper, pinned + Capsule& mallochost(bool do_memset = true, uint8_t memset_val = 0) + { + check_len("mallochost"); + + if (alloc_status.hptr) + LOGGING(LOG_WARN, "already allocated on host"); + else { + cudaMallocHost(&_hptr, nbyte()); + memset(_hptr, memset_val, nbyte()); + alloc_status.hptr = true; + } + return *this; + } + // cudaMallocManaged wrapper + Capsule& mallocmanaged(bool do_memset = true, uint8_t memset_val = 0) + { + check_len("mallocmanaged"); + + if (alloc_status.uniptr) + LOGGING(LOG_WARN, "already allocated as unified"); + else { + cudaMallocManaged(&_uniptr, nbyte()); + cudaMemset(_uniptr, memset_val, nbyte()); + alloc_status.uniptr = true; + } + return *this; + } + // cudaFree wrapper + Capsule& free() + { + if (not _dptr) throw std::runtime_error(ERRSTR_BUILDER("free", "_dptr is null")); + cudaFree(_dptr); + alloc_status.dptr = false; + return *this; + } + // cudaFreeHost wrapper + Capsule& freehost() + { + if (not _hptr) throw std::runtime_error(ERRSTR_BUILDER("free", "_hptr is null")); + cudaFreeHost(_hptr); + alloc_status.hptr = false; + return *this; + } + // cudaFree wrapper, but for unified memory + Capsule& freemanaged() + { + if (not 
_uniptr) throw std::runtime_error(ERRSTR_BUILDER("free", "_uniptr is null")); + cudaFree(_uniptr); + alloc_status.uniptr = false; + return *this; + } + + private: + double maxval, minval, rng; + + public: + double get_maxval() { return maxval; } + double get_minval() { return minval; } + double get_rng() { return rng; } + + // data scan + Capsule& prescan(double& max_value, double& min_value, double& rng) + { + // may not work for _uniptr + T result[4]; + psz::thrustgpu_get_extrema_rawptr(_dptr, _len, result); + + min_value = result[0]; + max_value = result[1]; + rng = max_value - min_value; + + return *this; + } + // data scan + Capsule& prescan() + { + prescan(maxval, minval, rng); + return *this; + } +}; + +#endif diff --git a/qtensor/compression/cusz/include/common/configs.hh b/qtensor/compression/cusz/include/common/configs.hh index 7c1e0654..d9a0bd39 100644 --- a/qtensor/compression/cusz/include/common/configs.hh +++ b/qtensor/compression/cusz/include/common/configs.hh @@ -1,354 +1,354 @@ -/** - * @file configs.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2021-09-26 - * - * (C) 2021 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef CUSZ_COMMON_CONFIGS_HH -#define CUSZ_COMMON_CONFIGS_HH - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../header.h" -#include "definition.hh" - -#if __cplusplus >= 201703L -#define CONSTEXPR constexpr -#else -#define CONSTEXPR -#endif - -struct Reinterpret1DTo2D { - template - static T get_square_size(T len) - { - return static_cast(ceil(sqrt(len))); - } -}; - -struct Align { - template - static size_t get_aligned_datalen(size_t len) - { - if CONSTEXPR (ad == cusz::ALIGNDATA::NONE) return len; - if CONSTEXPR (ad == cusz::ALIGNDATA::SQUARE_MATRIX) { - auto m = Reinterpret1DTo2D::get_square_size(len); - return m * m; - } - } - - static const int DEFAULT_ALIGN_NBYTE = 128; - - template - static inline bool is_aligned_at(const void* ptr) - { // - return reinterpret_cast(ptr) % NUM == 0; - }; - - template - static size_t get_aligned_nbyte(size_t len) - { - return ((sizeof(T) * len - 1) / NUM + 1) * NUM; - } -}; - -// sparsity rate is less that 5% -struct SparseMethodSetup { - // "Density" denotes the degree of non-zeros (nz). 
- static constexpr float default_density = 0.25; // ratio of nonzeros (R_nz) - static constexpr float default_sparsity = 1 - default_density; // ratio of zeros, 1 - R_nz - - static constexpr int default_density_factor = 4; // ratio of nonzeros (R_nz) - - template - static uint32_t get_csr_nbyte(uint32_t len, uint32_t nnz) - { - auto m = Reinterpret1DTo2D::get_square_size(len); - auto nbyte = sizeof(M) * (m + 1) + sizeof(M) * nnz + sizeof(T) * nnz; - return nbyte; - } -}; - -struct HuffmanHelper { - // deprecated - // template - // static uint32_t get_revbook_nbyte(int dict_size) - // { - // constexpr auto TYPE_BITCOUNT = sizeof(BOOK) * 8; - // return sizeof(BOOK) * (2 * TYPE_BITCOUNT) + sizeof(SYM) * dict_size; - // } - - static const int BLOCK_DIM_ENCODE = 256; - static const int BLOCK_DIM_DEFLATE = 256; - - static const int ENC_SEQUENTIALITY = 4; // empirical - static const int DEFLATE_CONSTANT = 4; // TODO -> deflate_chunk_constant -}; - -struct StringHelper { - static std::string nnz_percentage(uint32_t nnz, uint32_t data_len) - { - return "(" + std::to_string(nnz / 1.0 / data_len * 100) + "%)"; - } -}; - -struct ConfigHelper { - static uint32_t predictor_lookup(std::string name) - { - const std::unordered_map lut = { - {"lorenzo", 0}, {"lorenzoii", 1}, {"spline3", 2} // - }; - if (lut.find(name) != lut.end()) throw std::runtime_error("no such predictor as " + name); - return lut.at(name); - } - - static uint32_t codec_lookup(std::string name) - { - const std::unordered_map lut = { - {"huffman-coarse", 0} // - }; - if (lut.find(name) != lut.end()) throw std::runtime_error("no such codec as " + name); - return lut.at(name); - } - - static uint32_t spcodec_lookup(std::string name) - { - const std::unordered_map lut = { - {"spmat", 0}, {"spvec", 1} // - }; - if (lut.find(name) != lut.end()) throw std::runtime_error("no such codec as " + name); - return lut.at(name); - } - - static std::string get_default_predictor() { return "lorenzo"; } - static std::string get_default_spcodec() { return "csr11"; } - static std::string get_default_codec() { return "huffman-coarse"; } - static std::string get_default_cuszmode() { return "r2r"; } - static std::string get_default_dtype() { return "f32"; } - - static bool check_predictor(const std::string& val, bool fatal = false) - { - auto legal = (val == "lorenzo") or (val == "spline3"); - if (not legal) { - if (fatal) - throw std::runtime_error("`predictor` must be \"lorenzo\" or \"spline3\"."); - else - printf("fallback to the default \"%s\".", get_default_predictor().c_str()); - } - return legal; - } - - static bool check_codec(const std::string& val, bool fatal = false) - { - auto legal = (val == "huffman-coarse"); - if (not legal) { - if (fatal) - throw std::runtime_error("`codec` must be \"huffman-coarse\"."); - else - printf("fallback to the default \"%s\".", get_default_codec().c_str()); - } - return legal; - } - - static bool check_spcodec(const std::string& val, bool fatal = false) - { - auto legal = (val == "csr11") or (val == "rle"); - if (not legal) { - if (fatal) - throw std::runtime_error("`codec` must be \"csr11\" or \"rle\"."); - else - printf("fallback to the default \"%s\".", get_default_codec().c_str()); - } - return legal; - } - - static bool check_cuszmode(const std::string& val, bool fatal = false) - { - auto legal = (val == "r2r") or (val == "abs"); - if (not legal) { - if (fatal) - throw std::runtime_error("`mode` must be \"r2r\" or \"abs\"."); - else - printf("fallback to the default \"%s\".", get_default_cuszmode().c_str()); - } 
- return legal; - } - - static bool check_dtype(const std::string& val, bool fatal = false) - { - auto legal = (val == "f32"); - // auto legal = (val == "f32") or (val == "f64"); - if (not legal) { - if (fatal) - throw std::runtime_error("`dtype` must be \"f32\"."); - else - printf("fallback to the default \"%s\".", get_default_dtype().c_str()); - } - return legal; - } - - static bool check_opt_in_list(std::string const& opt, std::vector vs) - { - for (auto& i : vs) { - if (opt == i) return true; - } - return false; - } - - static void parse_length_literal(const char* str, std::vector& dims) - { - std::stringstream data_len_ss(str); - auto data_len_literal = data_len_ss.str(); - char delimiter = 'x'; - - while (data_len_ss.good()) { - std::string substr; - std::getline(data_len_ss, substr, delimiter); - dims.push_back(substr); - } - } - - static size_t get_filesize(std::string fname) - { - std::ifstream in(fname.c_str(), std::ifstream::ate | std::ifstream::binary); - return in.tellg(); - } - - static size_t get_filesize(cusz_header* h) - { - auto END = sizeof(h->entry) / sizeof(h->entry[0]); - return h->entry[END - 1]; - } - - static size_t get_uncompressed_len(cusz_header* h) { return h->x * h->y * h->z; } - - template - static size_t get_npart(T1 size, T2 subsize) - { - static_assert( - std::numeric_limits::is_integer and std::numeric_limits::is_integer, - "[get_npart] must be plain interger types."); - - return (size + subsize - 1) / subsize; - } - - // #ifdef __CUDACC__ - static int get_ndim(dim3 len3) - { - auto ndim = 3; - if (len3.z == 1) ndim = 2; - if (len3.z == 1 and len3.y == 1) ndim = 1; - return ndim; - } - - static dim3 get_pardeg3(dim3 len3, dim3 sublen3) - { - return dim3( - get_npart(len3.x, sublen3.x), // - get_npart(len3.y, sublen3.y), // - get_npart(len3.z, sublen3.z)); - } - - template - static dim3 get_pardeg3(dim3 len3, T sublen3[3]) - { - return dim3( - get_npart(len3.x, sublen3[0]), // - get_npart(len3.y, sublen3[1]), // - get_npart(len3.z, sublen3[2])); - } - - template - static dim3 multiply_dim3(dim3 a, T b[3]) - { - return dim3(a.x * b[0], a.y * b[1], a.z * b[2]); - } - - static dim3 multiply_dim3(dim3 a, dim3 b) - { // - return dim3(a.x * b.x, a.y * b.y, a.z * b.z); - } - - static size_t get_serialized_len(dim3 a) { return a.x * a.y * a.z; } - - static dim3 get_leap(dim3 len3) { return dim3(1, len3.x, len3.x * len3.y); } - - // #endif - - template - static size_t get_serialized_len(T a[3]) - { // - return a[0] * a[1] * a[2]; - } -}; - -struct CompareHelper { - template - static bool eq(TRIO a, TRIO b) - { - return (a.x == b.x) and (a.y == b.y) and (a.z == b.z); - }; -}; - -struct ReportHelper { - static float get_throughput(float milliseconds, size_t nbyte) - { - auto GiB = 1.0 * 1024 * 1024 * 1024; - auto seconds = milliseconds * 1e-3; - return nbyte / GiB / seconds; - } - - static void println_throughput(const char* s, float timer, size_t _nbyte) - { - if (timer == 0.0) return; - auto t = get_throughput(timer, _nbyte); - printf(" %-12s %'12f %'10.2f\n", s, timer, t); - }; - - static void println_throughput_tablehead() - { - printf( - "\n \e[1m\e[31m%-12s %12s %10s\e[0m\n", // - const_cast("kernel"), // - const_cast("time, ms"), // - const_cast("GiB/s") // - ); - } - - static void print_datasegment_tablehead() - { - printf( - "\ndata segments:\n \e[1m\e[31m%-18s\t%12s\t%15s\t%15s\e[0m\n", // - const_cast("name"), // - const_cast("nbyte"), // - const_cast("start"), // - const_cast("end")); - } - - static std::string demangle(const char* name) - { - int status 
= -4; - char* res = abi::__cxa_demangle(name, nullptr, nullptr, &status); - - const char* const demangled_name = (status == 0) ? res : name; - std::string ret_val(demangled_name); - free(res); - return ret_val; - }; -}; - -#endif +/** + * @file configs.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2021-09-26 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_COMMON_CONFIGS_HH +#define CUSZ_COMMON_CONFIGS_HH + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../header.h" +#include "definition.hh" + +#if __cplusplus >= 201703L +#define CONSTEXPR constexpr +#else +#define CONSTEXPR +#endif + +struct Reinterpret1DTo2D { + template + static T get_square_size(T len) + { + return static_cast(ceil(sqrt(len))); + } +}; + +struct Align { + template + static size_t get_aligned_datalen(size_t len) + { + if CONSTEXPR (ad == cusz::ALIGNDATA::NONE) return len; + if CONSTEXPR (ad == cusz::ALIGNDATA::SQUARE_MATRIX) { + auto m = Reinterpret1DTo2D::get_square_size(len); + return m * m; + } + } + + static const int DEFAULT_ALIGN_NBYTE = 128; + + template + static inline bool is_aligned_at(const void* ptr) + { // + return reinterpret_cast(ptr) % NUM == 0; + }; + + template + static size_t get_aligned_nbyte(size_t len) + { + return ((sizeof(T) * len - 1) / NUM + 1) * NUM; + } +}; + +// sparsity rate is less that 5% +struct SparseMethodSetup { + // "Density" denotes the degree of non-zeros (nz). + static constexpr float default_density = 0.25; // ratio of nonzeros (R_nz) + static constexpr float default_sparsity = 1 - default_density; // ratio of zeros, 1 - R_nz + + static constexpr int default_density_factor = 4; // ratio of nonzeros (R_nz) + + template + static uint32_t get_csr_nbyte(uint32_t len, uint32_t nnz) + { + auto m = Reinterpret1DTo2D::get_square_size(len); + auto nbyte = sizeof(M) * (m + 1) + sizeof(M) * nnz + sizeof(T) * nnz; + return nbyte; + } +}; + +struct HuffmanHelper { + // deprecated + // template + // static uint32_t get_revbook_nbyte(int dict_size) + // { + // constexpr auto TYPE_BITCOUNT = sizeof(BOOK) * 8; + // return sizeof(BOOK) * (2 * TYPE_BITCOUNT) + sizeof(SYM) * dict_size; + // } + + static const int BLOCK_DIM_ENCODE = 256; + static const int BLOCK_DIM_DEFLATE = 256; + + static const int ENC_SEQUENTIALITY = 4; // empirical + static const int DEFLATE_CONSTANT = 4; // TODO -> deflate_chunk_constant +}; + +struct StringHelper { + static std::string nnz_percentage(uint32_t nnz, uint32_t data_len) + { + return "(" + std::to_string(nnz / 1.0 / data_len * 100) + "%)"; + } +}; + +struct ConfigHelper { + static uint32_t predictor_lookup(std::string name) + { + const std::unordered_map lut = { + {"lorenzo", 0}, {"lorenzoii", 1}, {"spline3", 2} // + }; + if (lut.find(name) != lut.end()) throw std::runtime_error("no such predictor as " + name); + return lut.at(name); + } + + static uint32_t codec_lookup(std::string name) + { + const std::unordered_map lut = { + {"huffman-coarse", 0} // + }; + if (lut.find(name) != lut.end()) throw std::runtime_error("no such codec as " + name); + return lut.at(name); + } + + static uint32_t spcodec_lookup(std::string name) + { + const std::unordered_map lut = { + {"spmat", 0}, {"spvec", 1} // + }; + if (lut.find(name) != lut.end()) throw std::runtime_error("no such codec as " + name); + return lut.at(name); + } + + static std::string get_default_predictor() { return "lorenzo"; } + static std::string get_default_spcodec() { 
return "csr11"; } + static std::string get_default_codec() { return "huffman-coarse"; } + static std::string get_default_cuszmode() { return "r2r"; } + static std::string get_default_dtype() { return "f32"; } + + static bool check_predictor(const std::string& val, bool fatal = false) + { + auto legal = (val == "lorenzo") or (val == "spline3"); + if (not legal) { + if (fatal) + throw std::runtime_error("`predictor` must be \"lorenzo\" or \"spline3\"."); + else + printf("fallback to the default \"%s\".", get_default_predictor().c_str()); + } + return legal; + } + + static bool check_codec(const std::string& val, bool fatal = false) + { + auto legal = (val == "huffman-coarse"); + if (not legal) { + if (fatal) + throw std::runtime_error("`codec` must be \"huffman-coarse\"."); + else + printf("fallback to the default \"%s\".", get_default_codec().c_str()); + } + return legal; + } + + static bool check_spcodec(const std::string& val, bool fatal = false) + { + auto legal = (val == "csr11") or (val == "rle"); + if (not legal) { + if (fatal) + throw std::runtime_error("`codec` must be \"csr11\" or \"rle\"."); + else + printf("fallback to the default \"%s\".", get_default_codec().c_str()); + } + return legal; + } + + static bool check_cuszmode(const std::string& val, bool fatal = false) + { + auto legal = (val == "r2r") or (val == "abs"); + if (not legal) { + if (fatal) + throw std::runtime_error("`mode` must be \"r2r\" or \"abs\"."); + else + printf("fallback to the default \"%s\".", get_default_cuszmode().c_str()); + } + return legal; + } + + static bool check_dtype(const std::string& val, bool fatal = false) + { + auto legal = (val == "f32"); + // auto legal = (val == "f32") or (val == "f64"); + if (not legal) { + if (fatal) + throw std::runtime_error("`dtype` must be \"f32\"."); + else + printf("fallback to the default \"%s\".", get_default_dtype().c_str()); + } + return legal; + } + + static bool check_opt_in_list(std::string const& opt, std::vector vs) + { + for (auto& i : vs) { + if (opt == i) return true; + } + return false; + } + + static void parse_length_literal(const char* str, std::vector& dims) + { + std::stringstream data_len_ss(str); + auto data_len_literal = data_len_ss.str(); + char delimiter = 'x'; + + while (data_len_ss.good()) { + std::string substr; + std::getline(data_len_ss, substr, delimiter); + dims.push_back(substr); + } + } + + static size_t get_filesize(std::string fname) + { + std::ifstream in(fname.c_str(), std::ifstream::ate | std::ifstream::binary); + return in.tellg(); + } + + static size_t get_filesize(cusz_header* h) + { + auto END = sizeof(h->entry) / sizeof(h->entry[0]); + return h->entry[END - 1]; + } + + static size_t get_uncompressed_len(cusz_header* h) { return h->x * h->y * h->z; } + + template + static size_t get_npart(T1 size, T2 subsize) + { + static_assert( + std::numeric_limits::is_integer and std::numeric_limits::is_integer, + "[get_npart] must be plain interger types."); + + return (size + subsize - 1) / subsize; + } + + // #ifdef __CUDACC__ + static int get_ndim(dim3 len3) + { + auto ndim = 3; + if (len3.z == 1) ndim = 2; + if (len3.z == 1 and len3.y == 1) ndim = 1; + return ndim; + } + + static dim3 get_pardeg3(dim3 len3, dim3 sublen3) + { + return dim3( + get_npart(len3.x, sublen3.x), // + get_npart(len3.y, sublen3.y), // + get_npart(len3.z, sublen3.z)); + } + + template + static dim3 get_pardeg3(dim3 len3, T sublen3[3]) + { + return dim3( + get_npart(len3.x, sublen3[0]), // + get_npart(len3.y, sublen3[1]), // + get_npart(len3.z, sublen3[2])); + } 
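+
+    // Illustrative examples of the two helpers above: get_npart is a ceiling division,
+    // so get_npart(1000, 256) == 4, and for a 100x20x1 grid tiled by 16x16x1 sub-blocks
+    // get_pardeg3(dim3(100, 20, 1), dim3(16, 16, 1)) yields dim3(7, 2, 1).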
+ + template + static dim3 multiply_dim3(dim3 a, T b[3]) + { + return dim3(a.x * b[0], a.y * b[1], a.z * b[2]); + } + + static dim3 multiply_dim3(dim3 a, dim3 b) + { // + return dim3(a.x * b.x, a.y * b.y, a.z * b.z); + } + + static size_t get_serialized_len(dim3 a) { return a.x * a.y * a.z; } + + static dim3 get_leap(dim3 len3) { return dim3(1, len3.x, len3.x * len3.y); } + + // #endif + + template + static size_t get_serialized_len(T a[3]) + { // + return a[0] * a[1] * a[2]; + } +}; + +struct CompareHelper { + template + static bool eq(TRIO a, TRIO b) + { + return (a.x == b.x) and (a.y == b.y) and (a.z == b.z); + }; +}; + +struct ReportHelper { + static float get_throughput(float milliseconds, size_t nbyte) + { + auto GiB = 1.0 * 1024 * 1024 * 1024; + auto seconds = milliseconds * 1e-3; + return nbyte / GiB / seconds; + } + + static void println_throughput(const char* s, float timer, size_t _nbyte) + { + if (timer == 0.0) return; + auto t = get_throughput(timer, _nbyte); + printf(" %-12s %'12f %'10.2f\n", s, timer, t); + }; + + static void println_throughput_tablehead() + { + printf( + "\n \e[1m\e[31m%-12s %12s %10s\e[0m\n", // + const_cast("kernel"), // + const_cast("time, ms"), // + const_cast("GiB/s") // + ); + } + + static void print_datasegment_tablehead() + { + printf( + "\ndata segments:\n \e[1m\e[31m%-18s\t%12s\t%15s\t%15s\e[0m\n", // + const_cast("name"), // + const_cast("nbyte"), // + const_cast("start"), // + const_cast("end")); + } + + static std::string demangle(const char* name) + { + int status = -4; + char* res = abi::__cxa_demangle(name, nullptr, nullptr, &status); + + const char* const demangled_name = (status == 0) ? res : name; + std::string ret_val(demangled_name); + free(res); + return ret_val; + }; +}; + +#endif diff --git a/qtensor/compression/cusz/include/common/definition.hh b/qtensor/compression/cusz/include/common/definition.hh index c7c328ef..af30239b 100644 --- a/qtensor/compression/cusz/include/common/definition.hh +++ b/qtensor/compression/cusz/include/common/definition.hh @@ -1,66 +1,66 @@ -/** - * @file definition.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2021-09-20 - * - * (C) 2021 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef CUSZ_COMMON_DEFINITION_HH -#define CUSZ_COMMON_DEFINITION_HH - -#include -#include -#include - -namespace cusz { - -enum class TASK { COMPRESS, DECOMPRESS, EXPERIMENT, COMPRESS_DRYRUN }; -enum class DEV { TEST, DEV, RELEASE }; -enum class LOC { HOST, DEVICE, HOST_DEVICE, UNIFIED, FS, NONE, __BUFFER }; -enum class WHEN { COMPRESS, DECOMPRESS, EXPERIMENT, COMPRESS_DRYRUN }; -enum class ALIGNDATA { NONE, SQUARE_MATRIX, POWEROF2, NEXT_EVEN }; -enum class ALIGNMEM { NONE, WARP32B, WARP64B, WARP128B }; - -// TODO when to use ADDR8? 
-// TODO change to `enum class` -enum class SEG { HEADER, BOOK, QUANT, REVBOOK, ANCHOR, SPFMT, HUFF_META, HUFF_DATA }; - -enum class execution { cuda, serial }; -enum class method { native, thrust }; - -struct OK { - template - static void ALLOC() - { - static_assert( - m == cusz::DEV::TEST or m == cusz::DEV::DEV, // - "muse be cusz::DEV::TEST or cusz::DEV::DEV; use with caution"); - } - - template - static void FREE() - { - static_assert( - m == cusz::DEV::TEST or m == cusz::DEV::DEV, // - "muse be cusz::DEV::TEST or cusz::DEV::DEV; use with caution"); - } -}; - -using ADDR4 = uint32_t; -using ADDR8 = size_t; - -using FREQ = uint32_t; - -using TimeRecordTuple = std::tuple; -using TimeRecord = std::vector; -using timerecord_t = TimeRecord*; - -using BYTE = uint8_t; - -}; // namespace cusz - -#endif +/** + * @file definition.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2021-09-20 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_COMMON_DEFINITION_HH +#define CUSZ_COMMON_DEFINITION_HH + +#include +#include +#include + +namespace cusz { + +enum class TASK { COMPRESS, DECOMPRESS, EXPERIMENT, COMPRESS_DRYRUN }; +enum class DEV { TEST, DEV, RELEASE }; +enum class LOC { HOST, DEVICE, HOST_DEVICE, UNIFIED, FS, NONE, __BUFFER }; +enum class WHEN { COMPRESS, DECOMPRESS, EXPERIMENT, COMPRESS_DRYRUN }; +enum class ALIGNDATA { NONE, SQUARE_MATRIX, POWEROF2, NEXT_EVEN }; +enum class ALIGNMEM { NONE, WARP32B, WARP64B, WARP128B }; + +// TODO when to use ADDR8? +// TODO change to `enum class` +enum class SEG { HEADER, BOOK, QUANT, REVBOOK, ANCHOR, SPFMT, HUFF_META, HUFF_DATA }; + +enum class execution { cuda, serial }; +enum class method { native, thrust }; + +struct OK { + template + static void ALLOC() + { + static_assert( + m == cusz::DEV::TEST or m == cusz::DEV::DEV, // + "muse be cusz::DEV::TEST or cusz::DEV::DEV; use with caution"); + } + + template + static void FREE() + { + static_assert( + m == cusz::DEV::TEST or m == cusz::DEV::DEV, // + "muse be cusz::DEV::TEST or cusz::DEV::DEV; use with caution"); + } +}; + +using ADDR4 = uint32_t; +using ADDR8 = size_t; + +using FREQ = uint32_t; + +using TimeRecordTuple = std::tuple; +using TimeRecord = std::vector; +using timerecord_t = TimeRecord*; + +using BYTE = uint8_t; + +}; // namespace cusz + +#endif diff --git a/qtensor/compression/cusz/include/common/type_traits.hh b/qtensor/compression/cusz/include/common/type_traits.hh index a77c2738..3d623beb 100644 --- a/qtensor/compression/cusz/include/common/type_traits.hh +++ b/qtensor/compression/cusz/include/common/type_traits.hh @@ -1,108 +1,108 @@ -/** - * @file type_traits.hh - * @author Jiannan Tian - * @brief - * @version 0.1.1 - * @date 2020-09-23 - * (create) 2020-09-23, (rev) 2021-09-17 - * - * @copyright (C) 2020 by Washington State University, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#ifndef TYPE_TRAITS_HH -#define TYPE_TRAITS_HH - -#include -#include - -#include "cusz/type.h" -#include "definition.hh" - -template -cusz_datatype cusz_typeof() -{ - if (std::is_same::value) - return FP32; - else if (std::is_same::value) - return FP64; - else - throw std::runtime_error("Type not supported."); -} - -// clang-format off - -/** - * @brief CUDA API does not accept uint64_t (understandable by literal), but instead, - * `unsigned long long`, which is ambiguous anyway. 
- */ -template struct cuszCOMPAT; -template <> struct cuszCOMPAT { using type = uint32_t; }; -template <> struct cuszCOMPAT { using type = unsigned long long; }; - -template struct DataTrait; -template <> struct DataTrait<4, true> { typedef float type; }; -template <> struct DataTrait<8, true> { typedef double type; }; -template <> struct DataTrait<1, false> { typedef int8_t type; }; // future use -template <> struct DataTrait<2, false> { typedef int16_t type; }; // future use -template <> struct DataTrait<4, false> { typedef int32_t type; }; // future use -template <> struct DataTrait<8, false> { typedef int64_t type; }; // future use - -template struct ChunkingTrait; -template <> struct ChunkingTrait<1> { static const int BLOCK = 256; static const int SEQ = 8; }; -template <> struct ChunkingTrait<0x101> { static const int BLOCK = 128; }; -template <> struct ChunkingTrait<0x201> { static const int BLOCK = 64; }; -template <> struct ChunkingTrait<2> { static const int BLOCK = 16; static const int YSEQ = 8; }; -template <> struct ChunkingTrait<3> { static const int BLOCK = 8; static const int YSEQ = 8; }; - -// template struct QuantTrait; -// template <> struct QuantTrait<1> { typedef uint8_t type; }; -// template <> struct QuantTrait<2> { typedef uint16_t type; }; -// template <> struct QuantTrait<4> { typedef uint32_t type; }; - -template struct ErrCtrlTrait; -template <> struct ErrCtrlTrait<1, false> { typedef uint8_t type; }; -template <> struct ErrCtrlTrait<2, false> { typedef uint16_t type; }; -template <> struct ErrCtrlTrait<4, false> { typedef uint32_t type; }; -template <> struct ErrCtrlTrait<4, true> { typedef float type; }; -template <> struct ErrCtrlTrait<8, true> { typedef double type; }; - -template struct HuffTrait; -template <> struct HuffTrait<4> { typedef cuszCOMPAT::type type; }; -template <> struct HuffTrait<8> { typedef cuszCOMPAT::type type; }; - -template struct ReducerTrait; -template <> struct ReducerTrait<4> { typedef uint32_t type; }; -template <> struct ReducerTrait<8> { typedef uint64_t type; }; - -template struct MetadataTrait; -template <> struct MetadataTrait<4> { typedef uint32_t type; }; -template <> struct MetadataTrait<8> { typedef uint64_t type; }; // size_t is problematic; do not use - -template struct LargeInputTrait; -template <> struct LargeInputTrait { using type = MetadataTrait<4>::type; }; -template <> struct LargeInputTrait { using type = MetadataTrait<8>::type; }; - -template struct FastLowPrecisionTrait; -template <> struct FastLowPrecisionTrait { typedef float type; }; -template <> struct FastLowPrecisionTrait { typedef double type; }; - -// template struct cuszCUSPARSE; -// template <> struct cuszCUSPARSE { const static cudaDataType type = CUDA_R_32F; }; -// template <> struct cuszCUSPARSE { const static cudaDataType type = CUDA_R_64F; }; - -#ifdef __CUDACC__ -#include - -template struct CopyDirection; -template <> struct CopyDirection { static const cudaMemcpyKind direction = cudaMemcpyHostToHost; }; -template <> struct CopyDirection { static const cudaMemcpyKind direction = cudaMemcpyHostToDevice; }; -template <> struct CopyDirection { static const cudaMemcpyKind direction = cudaMemcpyDeviceToHost; }; -template <> struct CopyDirection { static const cudaMemcpyKind direction = cudaMemcpyDeviceToDevice; }; - -#endif - -// clang-format on - -#endif +/** + * @file type_traits.hh + * @author Jiannan Tian + * @brief + * @version 0.1.1 + * @date 2020-09-23 + * (create) 2020-09-23, (rev) 2021-09-17 + * + * @copyright (C) 2020 by Washington State 
University, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef TYPE_TRAITS_HH +#define TYPE_TRAITS_HH + +#include +#include + +#include "cusz/type.h" +#include "definition.hh" + +template +cusz_datatype cusz_typeof() +{ + if (std::is_same::value) + return FP32; + else if (std::is_same::value) + return FP64; + else + throw std::runtime_error("Type not supported."); +} + +// clang-format off + +/** + * @brief CUDA API does not accept uint64_t (understandable by literal), but instead, + * `unsigned long long`, which is ambiguous anyway. + */ +template struct cuszCOMPAT; +template <> struct cuszCOMPAT { using type = uint32_t; }; +template <> struct cuszCOMPAT { using type = unsigned long long; }; + +template struct DataTrait; +template <> struct DataTrait<4, true> { typedef float type; }; +template <> struct DataTrait<8, true> { typedef double type; }; +template <> struct DataTrait<1, false> { typedef int8_t type; }; // future use +template <> struct DataTrait<2, false> { typedef int16_t type; }; // future use +template <> struct DataTrait<4, false> { typedef int32_t type; }; // future use +template <> struct DataTrait<8, false> { typedef int64_t type; }; // future use + +template struct ChunkingTrait; +template <> struct ChunkingTrait<1> { static const int BLOCK = 256; static const int SEQ = 8; }; +template <> struct ChunkingTrait<0x101> { static const int BLOCK = 128; }; +template <> struct ChunkingTrait<0x201> { static const int BLOCK = 64; }; +template <> struct ChunkingTrait<2> { static const int BLOCK = 16; static const int YSEQ = 8; }; +template <> struct ChunkingTrait<3> { static const int BLOCK = 8; static const int YSEQ = 8; }; + +// template struct QuantTrait; +// template <> struct QuantTrait<1> { typedef uint8_t type; }; +// template <> struct QuantTrait<2> { typedef uint16_t type; }; +// template <> struct QuantTrait<4> { typedef uint32_t type; }; + +template struct ErrCtrlTrait; +template <> struct ErrCtrlTrait<1, false> { typedef uint8_t type; }; +template <> struct ErrCtrlTrait<2, false> { typedef uint16_t type; }; +template <> struct ErrCtrlTrait<4, false> { typedef uint32_t type; }; +template <> struct ErrCtrlTrait<4, true> { typedef float type; }; +template <> struct ErrCtrlTrait<8, true> { typedef double type; }; + +template struct HuffTrait; +template <> struct HuffTrait<4> { typedef cuszCOMPAT::type type; }; +template <> struct HuffTrait<8> { typedef cuszCOMPAT::type type; }; + +template struct ReducerTrait; +template <> struct ReducerTrait<4> { typedef uint32_t type; }; +template <> struct ReducerTrait<8> { typedef uint64_t type; }; + +template struct MetadataTrait; +template <> struct MetadataTrait<4> { typedef uint32_t type; }; +template <> struct MetadataTrait<8> { typedef uint64_t type; }; // size_t is problematic; do not use + +template struct LargeInputTrait; +template <> struct LargeInputTrait { using type = MetadataTrait<4>::type; }; +template <> struct LargeInputTrait { using type = MetadataTrait<8>::type; }; + +template struct FastLowPrecisionTrait; +template <> struct FastLowPrecisionTrait { typedef float type; }; +template <> struct FastLowPrecisionTrait { typedef double type; }; + +// template struct cuszCUSPARSE; +// template <> struct cuszCUSPARSE { const static cudaDataType type = CUDA_R_32F; }; +// template <> struct cuszCUSPARSE { const static cudaDataType type = CUDA_R_64F; }; + +#ifdef __CUDACC__ +#include + +template struct CopyDirection; +template <> struct CopyDirection { static const cudaMemcpyKind direction = 
cudaMemcpyHostToHost; }; +template <> struct CopyDirection { static const cudaMemcpyKind direction = cudaMemcpyHostToDevice; }; +template <> struct CopyDirection { static const cudaMemcpyKind direction = cudaMemcpyDeviceToHost; }; +template <> struct CopyDirection { static const cudaMemcpyKind direction = cudaMemcpyDeviceToDevice; }; + +#endif + +// clang-format on + +#endif diff --git a/qtensor/compression/cusz/include/compaction.hh b/qtensor/compression/cusz/include/compaction.hh index bd2a27eb..4a21f571 100644 --- a/qtensor/compression/cusz/include/compaction.hh +++ b/qtensor/compression/cusz/include/compaction.hh @@ -1,18 +1,18 @@ -/** - * @file compaction.hh - * @author Jiannan Tian - * @brief - * @version 0.4 - * @date 2023-01-23 - * - * (C) 2023 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef DAB40B13_9236_42A9_8047_49CD896671C9 -#define DAB40B13_9236_42A9_8047_49CD896671C9 - -template -struct CompactionDRAM; - -#endif /* DAB40B13_9236_42A9_8047_49CD896671C9 */ +/** + * @file compaction.hh + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-01-23 + * + * (C) 2023 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef DAB40B13_9236_42A9_8047_49CD896671C9 +#define DAB40B13_9236_42A9_8047_49CD896671C9 + +template +struct CompactionDRAM; + +#endif /* DAB40B13_9236_42A9_8047_49CD896671C9 */ diff --git a/qtensor/compression/cusz/include/component.hh b/qtensor/compression/cusz/include/component.hh index ec5c08a6..34fb8e00 100644 --- a/qtensor/compression/cusz/include/component.hh +++ b/qtensor/compression/cusz/include/component.hh @@ -1,19 +1,19 @@ -/** - * @file componment.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2021-10-06 - * - * (C) 2021 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef CUSZ_COMPONENT_HH -#define CUSZ_COMPONENT_HH - -#include "component/prediction.inl" -#include "component/spcodec.inl" -#include "hf/hf.hh" - +/** + * @file componment.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2021-10-06 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_COMPONENT_HH +#define CUSZ_COMPONENT_HH + +#include "component/prediction.inl" +#include "component/spcodec.inl" +#include "hf/hf.hh" + #endif \ No newline at end of file diff --git a/qtensor/compression/cusz/include/component/glue.cuh b/qtensor/compression/cusz/include/component/glue.cuh index c4d69141..cdcc8ff0 100644 --- a/qtensor/compression/cusz/include/component/glue.cuh +++ b/qtensor/compression/cusz/include/component/glue.cuh @@ -1,120 +1,120 @@ -/** - * @file glue.cuh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-03-01 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef WRAPPER_GLUE_CUH -#define WRAPPER_GLUE_CUH - -#include -#include -#include -#include "spcodec.hh" - -// when using nvcc, functors must be defined outside a (__host__) function -template -struct cleanup : public thrust::unary_function { - int radius; - cleanup(int radius) : radius(radius) {} - __host__ __device__ E operator()(const E e) const { return e; } -}; - -template -void split_by_radius( - E* in_errctrl, - size_t in_len, - int const radius, - IDX* out_idx, - E* out_val, - int& out_nnz, - cudaStream_t stream = nullptr, - Policy policy = thrust::device) -{ - using thrust::placeholders::_1; - - thrust::cuda::par.on(stream); - thrust::counting_iterator zero(0); - - // find out the indices - out_nnz = 
thrust::copy_if(policy, zero, zero + in_len, in_errctrl, out_idx, _1 >= 2 * radius or _1 <= 0) - out_idx; - - // fetch corresponding values - thrust::copy( - policy, thrust::make_permutation_iterator(in_errctrl, out_idx), - thrust::make_permutation_iterator(in_errctrl + out_nnz, out_idx + out_nnz), out_val); - - // clear up - cleanup functor(radius); - thrust::transform( - policy, // - thrust::make_permutation_iterator(in_errctrl, out_idx), // - thrust::make_permutation_iterator(in_errctrl + out_nnz, out_idx + out_nnz), // - thrust::make_permutation_iterator(in_errctrl, out_idx), // - functor); -} - -template -void split_by_binary_twopass( - E* in_errctrl, - size_t in_len, - int const radius, - IDX* out_idx, - E* out_val, - int& out_nnz, - cudaStream_t stream = nullptr, - Policy policy = thrust::device) -{ - using thrust::placeholders::_1; - - thrust::cuda::par.on(stream); - thrust::counting_iterator zero(0); - - // find out the indices - out_nnz = thrust::copy_if(policy, zero, zero + in_len, in_errctrl, out_idx, _1 != radius) - out_idx; - - // fetch corresponding values - thrust::copy( - policy, thrust::make_permutation_iterator(in_errctrl, out_idx), - thrust::make_permutation_iterator(in_errctrl + out_nnz, out_idx + out_nnz), out_val); -} - -// when using nvcc, functors must be defined outside a (__host__) function -template -struct is_outlier { - int radius; - is_outlier(int radius) : radius(radius) {} - __host__ __device__ bool operator()(const Tuple t) const { return thrust::get<1>(t) != radius; } -}; - -template -void split_by_binary_onepass( - E* in_errctrl, - size_t in_len, - int const radius, - IDX* out_idx, - E* out_val, - int& out_nnz, - cudaStream_t stream = nullptr, - Policy policy = thrust::device) -{ - thrust::cuda::par.on(stream); - using Tuple = thrust::tuple; - thrust::counting_iterator zero(0); - - auto in = thrust::make_zip_iterator(thrust::make_tuple(zero, in_errctrl)); - auto in_last = thrust::make_zip_iterator(thrust::make_tuple(zero + in_len, in_errctrl + in_len)); - auto out = thrust::make_zip_iterator(thrust::make_tuple(out_idx, out_val)); - - is_outlier functor(radius); - out_nnz = thrust::copy_if(policy, in, in_last, out, functor) - out; -} - -enum class GlueMethod { SPLIT_BY_RADIUS, SPLIT_01_ONEPASS, SPLIT_01_TWOPASS }; - -#endif +/** + * @file glue.cuh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-03-01 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef WRAPPER_GLUE_CUH +#define WRAPPER_GLUE_CUH + +#include +#include +#include +#include "spcodec.hh" + +// when using nvcc, functors must be defined outside a (__host__) function +template +struct cleanup : public thrust::unary_function { + int radius; + cleanup(int radius) : radius(radius) {} + __host__ __device__ E operator()(const E e) const { return e; } +}; + +template +void split_by_radius( + E* in_errctrl, + size_t in_len, + int const radius, + IDX* out_idx, + E* out_val, + int& out_nnz, + cudaStream_t stream = nullptr, + Policy policy = thrust::device) +{ + using thrust::placeholders::_1; + + thrust::cuda::par.on(stream); + thrust::counting_iterator zero(0); + + // find out the indices + out_nnz = thrust::copy_if(policy, zero, zero + in_len, in_errctrl, out_idx, _1 >= 2 * radius or _1 <= 0) - out_idx; + + // fetch corresponding values + thrust::copy( + policy, thrust::make_permutation_iterator(in_errctrl, out_idx), + thrust::make_permutation_iterator(in_errctrl + out_nnz, out_idx + out_nnz), out_val); + + // clear up + cleanup 
functor(radius); + thrust::transform( + policy, // + thrust::make_permutation_iterator(in_errctrl, out_idx), // + thrust::make_permutation_iterator(in_errctrl + out_nnz, out_idx + out_nnz), // + thrust::make_permutation_iterator(in_errctrl, out_idx), // + functor); +} + +template +void split_by_binary_twopass( + E* in_errctrl, + size_t in_len, + int const radius, + IDX* out_idx, + E* out_val, + int& out_nnz, + cudaStream_t stream = nullptr, + Policy policy = thrust::device) +{ + using thrust::placeholders::_1; + + thrust::cuda::par.on(stream); + thrust::counting_iterator zero(0); + + // find out the indices + out_nnz = thrust::copy_if(policy, zero, zero + in_len, in_errctrl, out_idx, _1 != radius) - out_idx; + + // fetch corresponding values + thrust::copy( + policy, thrust::make_permutation_iterator(in_errctrl, out_idx), + thrust::make_permutation_iterator(in_errctrl + out_nnz, out_idx + out_nnz), out_val); +} + +// when using nvcc, functors must be defined outside a (__host__) function +template +struct is_outlier { + int radius; + is_outlier(int radius) : radius(radius) {} + __host__ __device__ bool operator()(const Tuple t) const { return thrust::get<1>(t) != radius; } +}; + +template +void split_by_binary_onepass( + E* in_errctrl, + size_t in_len, + int const radius, + IDX* out_idx, + E* out_val, + int& out_nnz, + cudaStream_t stream = nullptr, + Policy policy = thrust::device) +{ + thrust::cuda::par.on(stream); + using Tuple = thrust::tuple; + thrust::counting_iterator zero(0); + + auto in = thrust::make_zip_iterator(thrust::make_tuple(zero, in_errctrl)); + auto in_last = thrust::make_zip_iterator(thrust::make_tuple(zero + in_len, in_errctrl + in_len)); + auto out = thrust::make_zip_iterator(thrust::make_tuple(out_idx, out_val)); + + is_outlier functor(radius); + out_nnz = thrust::copy_if(policy, in, in_last, out, functor) - out; +} + +enum class GlueMethod { SPLIT_BY_RADIUS, SPLIT_01_ONEPASS, SPLIT_01_TWOPASS }; + +#endif diff --git a/qtensor/compression/cusz/include/component/pred_boilerplate_deprecated.hh b/qtensor/compression/cusz/include/component/pred_boilerplate_deprecated.hh index bb7a0584..f83c25cd 100644 --- a/qtensor/compression/cusz/include/component/pred_boilerplate_deprecated.hh +++ b/qtensor/compression/cusz/include/component/pred_boilerplate_deprecated.hh @@ -1,210 +1,210 @@ -/** - * @file predictor_boilerplate.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2021-09-15 - * - * (C) 2021 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef CUSZ_INCLUDE_PREDICTOR_HH -#define CUSZ_INCLUDE_PREDICTOR_HH - -#include -#include -#include - -#include "../common/configs.hh" -#include "../cusz/type.h" - -namespace cusz { - -class PredictorBoilerplate { - protected: - struct DerivedLengths { - struct Interpretion3D { - dim3 len3, leap; - size_t serialized; - - void set_leap() { leap = ConfigHelper::get_leap(len3); } - void set_serialized() { serialized = ConfigHelper::get_serialized_len(len3); } - }; - - struct Interpretion3D base, anchor, aligned; - - dim3 nblock; - int ndim; - - struct { - size_t data, quant, outlier, anchor; - } assigned; - - dim3 get_len3() const { return base.len3; } - dim3 get_leap() const { return base.leap; } - }; - - template - void __derive_len(dim3 base, DERIVED& derived) - { - int sublen[3] = {1, 1, 1}; - int anchor_step[3] = {1, 1, 1}; - __derive_len(base, derived, sublen, anchor_step, false); - } - - template - void - __derive_len(dim3 base, DERIVED& derived, int const sublen3[3], int const 
anchor_step3[3], bool use_anchor = false) - { - derived.base.len3 = base; - derived.base.set_leap(); - derived.base.set_serialized(); - derived.ndim = ConfigHelper::get_ndim(base); - - if (not use_anchor) { - derived.assigned.data = derived.base.serialized; - derived.assigned.quant = derived.base.serialized; - derived.assigned.outlier = derived.base.serialized; - derived.assigned.anchor = 0; - } - else { - derived.nblock = ConfigHelper::get_pardeg3(base, sublen3); - - derived.aligned.len3 = ConfigHelper::multiply_dim3(derived.nblock, sublen3); - derived.aligned.set_leap(); - derived.aligned.set_serialized(); - - derived.anchor.len3 = ConfigHelper::get_pardeg3(base, anchor_step3); - derived.anchor.set_leap(); - derived.anchor.set_serialized(); - - derived.assigned.data = derived.base.serialized; - derived.assigned.quant = derived.aligned.serialized; - derived.assigned.outlier = std::max(derived.base.serialized, derived.aligned.serialized); // TODO - derived.assigned.anchor = derived.anchor.serialized; - } - } - - template - void __debug_list_derived(DERIVED const& derived, bool use_anchor = false) - { - auto base = derived.base; - auto aligned = derived.aligned; - auto anchor = derived.anchor; - auto nblock = derived.nblock; - - printf("%-*s: (%u, %u, %u)\n", 16, "sizeof.{T,E,FP}", (int)sizeof(T), (int)sizeof(E), (int)sizeof(FP)); - printf("%-*s: (%u, %u, %u)\n", 16, "base.len3", base.len3.x, base.len3.y, base.len3.z); - printf("%-*s: (%u, %u, %u)\n", 16, "base.leap", base.leap.x, base.leap.y, base.leap.z); - printf("%-*s: %'zu\n", 16, "base.serial", base.serialized); - - if (use_anchor) { - printf("%-*s: (%u, %u, %u)\n", 16, "nblock", nblock.x, nblock.y, nblock.z); - - printf("%-*s: (%u, %u, %u)\n", 16, "aligned.len3", aligned.len3.x, aligned.len3.y, aligned.len3.z); - printf("%-*s: (%u, %u, %u)\n", 16, "aligned.leap", aligned.leap.x, aligned.leap.y, aligned.leap.z); - printf("%-*s: %'zu\n", 16, "aligned.serial", aligned.serialized); - - printf("%-*s: (%u, %u, %u)\n", 16, "anchor.len3", anchor.len3.x, anchor.len3.y, anchor.len3.z); - printf("%-*s: (%u, %u, %u)\n", 16, "anchor.leap", anchor.leap.x, anchor.leap.y, anchor.leap.z); - printf("%-*s: %'zu\n", 16, "anchor.serial", anchor.serialized); - } - - printf("%-*s: %'zu\n", 16, "len.data", derived.assigned.data); - printf("%-*s: %'zu\n", 16, "len.quant", derived.assigned.quant); - printf("%-*s: %'zu\n", 16, "len.outlier", derived.assigned.outlier); - printf("%-*s: %'zu\n", 16, "len.anchor", derived.assigned.anchor); - } - - void check_rtlen() - { - auto rtlen3 = rtlen.get_len3(); - auto alloclen3 = alloclen.get_len3(); - - if (rtlen3.x > alloclen3.x or rtlen3.y > alloclen3.y or rtlen3.z > alloclen3.z or - rtlen.base.serialized > alloclen.base.serialized) - throw std::runtime_error("Predictor: the runtime lengths cannot be greater than the allocation lengths."); - } - - template - void debug_list_alloclen(bool use_anchor = false) - { - printf("\ndebugging, listing allocation lengths:\n"); - __debug_list_derived(alloclen, use_anchor); - } - - template - void debug_list_rtlen(bool use_anchor = false) - { - printf("\ndebugging, listing runtime lengths:\n"); - __debug_list_derived(rtlen, use_anchor); - } - - protected: - struct DerivedLengths alloclen, rtlen; - - float time_elapsed; - - // ----------------------------------------------------------------------------- - // accessor - // ----------------------------------------------------------------------------- - public: - // helper - size_t get_alloclen_data() const { return 
alloclen.assigned.data; } - size_t get_alloclen_anchor() const { return alloclen.assigned.anchor; } - size_t get_alloclen_quant() const { return alloclen.assigned.quant; } - size_t get_alloclen_outlier() const { return alloclen.assigned.outlier; } - - dim3 get_len3() const { return rtlen.base.len3; } - dim3 get_leap3() const { return rtlen.base.leap; } - size_t get_len_data() const { return rtlen.assigned.data; } - size_t get_len_anchor() const { return rtlen.assigned.anchor; } - size_t get_len_quant() const { return rtlen.assigned.quant; } - size_t get_len_outlier() const { return rtlen.assigned.outlier; } - - float get_time_elapsed() const { return time_elapsed; } - - size_t get_x() const { return this->rtlen.get_len3().x; } - size_t get_y() const { return this->rtlen.get_len3().y; } - size_t get_z() const { return this->rtlen.get_len3().z; } - - dim3 get_leap() const { return this->rtlen.get_leap(); } - int get_ndim() const { return this->rtlen.ndim; } - - void derive_alloclen(cusz_predictortype predictor, dim3 base) - { - if (predictor == LorenzoI) { - // normal - this->__derive_len(base, this->alloclen); - } - - else if (predictor == Spline3) { - // maximum possible - int sublen[3] = {32, 8, 8}; - int anchor_step[3] = {8, 8, 8}; - this->__derive_len(base, this->alloclen, sublen, anchor_step, true); - } - } - - void derive_rtlen(cusz_predictortype predictor, dim3 base) - { - if (predictor == LorenzoI) { - // normal - this->__derive_len(base, this->rtlen); - } - else if (predictor == Spline3) { - // maximum possible - int sublen[3] = {32, 8, 8}; - int anchor_step[3] = {8, 8, 8}; - this->__derive_len(base, this->rtlen, sublen, anchor_step, true); - } - } - - // "real" methods - virtual ~PredictorBoilerplate() = default; -}; - -} // namespace cusz - -#endif +/** + * @file predictor_boilerplate.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2021-09-15 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_INCLUDE_PREDICTOR_HH +#define CUSZ_INCLUDE_PREDICTOR_HH + +#include +#include +#include + +#include "../common/configs.hh" +#include "../cusz/type.h" + +namespace cusz { + +class PredictorBoilerplate { + protected: + struct DerivedLengths { + struct Interpretion3D { + dim3 len3, leap; + size_t serialized; + + void set_leap() { leap = ConfigHelper::get_leap(len3); } + void set_serialized() { serialized = ConfigHelper::get_serialized_len(len3); } + }; + + struct Interpretion3D base, anchor, aligned; + + dim3 nblock; + int ndim; + + struct { + size_t data, quant, outlier, anchor; + } assigned; + + dim3 get_len3() const { return base.len3; } + dim3 get_leap() const { return base.leap; } + }; + + template + void __derive_len(dim3 base, DERIVED& derived) + { + int sublen[3] = {1, 1, 1}; + int anchor_step[3] = {1, 1, 1}; + __derive_len(base, derived, sublen, anchor_step, false); + } + + template + void + __derive_len(dim3 base, DERIVED& derived, int const sublen3[3], int const anchor_step3[3], bool use_anchor = false) + { + derived.base.len3 = base; + derived.base.set_leap(); + derived.base.set_serialized(); + derived.ndim = ConfigHelper::get_ndim(base); + + if (not use_anchor) { + derived.assigned.data = derived.base.serialized; + derived.assigned.quant = derived.base.serialized; + derived.assigned.outlier = derived.base.serialized; + derived.assigned.anchor = 0; + } + else { + derived.nblock = ConfigHelper::get_pardeg3(base, sublen3); + + derived.aligned.len3 = ConfigHelper::multiply_dim3(derived.nblock, sublen3); + 
derived.aligned.set_leap(); + derived.aligned.set_serialized(); + + derived.anchor.len3 = ConfigHelper::get_pardeg3(base, anchor_step3); + derived.anchor.set_leap(); + derived.anchor.set_serialized(); + + derived.assigned.data = derived.base.serialized; + derived.assigned.quant = derived.aligned.serialized; + derived.assigned.outlier = std::max(derived.base.serialized, derived.aligned.serialized); // TODO + derived.assigned.anchor = derived.anchor.serialized; + } + } + + template + void __debug_list_derived(DERIVED const& derived, bool use_anchor = false) + { + auto base = derived.base; + auto aligned = derived.aligned; + auto anchor = derived.anchor; + auto nblock = derived.nblock; + + printf("%-*s: (%u, %u, %u)\n", 16, "sizeof.{T,E,FP}", (int)sizeof(T), (int)sizeof(E), (int)sizeof(FP)); + printf("%-*s: (%u, %u, %u)\n", 16, "base.len3", base.len3.x, base.len3.y, base.len3.z); + printf("%-*s: (%u, %u, %u)\n", 16, "base.leap", base.leap.x, base.leap.y, base.leap.z); + printf("%-*s: %'zu\n", 16, "base.serial", base.serialized); + + if (use_anchor) { + printf("%-*s: (%u, %u, %u)\n", 16, "nblock", nblock.x, nblock.y, nblock.z); + + printf("%-*s: (%u, %u, %u)\n", 16, "aligned.len3", aligned.len3.x, aligned.len3.y, aligned.len3.z); + printf("%-*s: (%u, %u, %u)\n", 16, "aligned.leap", aligned.leap.x, aligned.leap.y, aligned.leap.z); + printf("%-*s: %'zu\n", 16, "aligned.serial", aligned.serialized); + + printf("%-*s: (%u, %u, %u)\n", 16, "anchor.len3", anchor.len3.x, anchor.len3.y, anchor.len3.z); + printf("%-*s: (%u, %u, %u)\n", 16, "anchor.leap", anchor.leap.x, anchor.leap.y, anchor.leap.z); + printf("%-*s: %'zu\n", 16, "anchor.serial", anchor.serialized); + } + + printf("%-*s: %'zu\n", 16, "len.data", derived.assigned.data); + printf("%-*s: %'zu\n", 16, "len.quant", derived.assigned.quant); + printf("%-*s: %'zu\n", 16, "len.outlier", derived.assigned.outlier); + printf("%-*s: %'zu\n", 16, "len.anchor", derived.assigned.anchor); + } + + void check_rtlen() + { + auto rtlen3 = rtlen.get_len3(); + auto alloclen3 = alloclen.get_len3(); + + if (rtlen3.x > alloclen3.x or rtlen3.y > alloclen3.y or rtlen3.z > alloclen3.z or + rtlen.base.serialized > alloclen.base.serialized) + throw std::runtime_error("Predictor: the runtime lengths cannot be greater than the allocation lengths."); + } + + template + void debug_list_alloclen(bool use_anchor = false) + { + printf("\ndebugging, listing allocation lengths:\n"); + __debug_list_derived(alloclen, use_anchor); + } + + template + void debug_list_rtlen(bool use_anchor = false) + { + printf("\ndebugging, listing runtime lengths:\n"); + __debug_list_derived(rtlen, use_anchor); + } + + protected: + struct DerivedLengths alloclen, rtlen; + + float time_elapsed; + + // ----------------------------------------------------------------------------- + // accessor + // ----------------------------------------------------------------------------- + public: + // helper + size_t get_alloclen_data() const { return alloclen.assigned.data; } + size_t get_alloclen_anchor() const { return alloclen.assigned.anchor; } + size_t get_alloclen_quant() const { return alloclen.assigned.quant; } + size_t get_alloclen_outlier() const { return alloclen.assigned.outlier; } + + dim3 get_len3() const { return rtlen.base.len3; } + dim3 get_leap3() const { return rtlen.base.leap; } + size_t get_len_data() const { return rtlen.assigned.data; } + size_t get_len_anchor() const { return rtlen.assigned.anchor; } + size_t get_len_quant() const { return rtlen.assigned.quant; } + size_t 
get_len_outlier() const { return rtlen.assigned.outlier; } + + float get_time_elapsed() const { return time_elapsed; } + + size_t get_x() const { return this->rtlen.get_len3().x; } + size_t get_y() const { return this->rtlen.get_len3().y; } + size_t get_z() const { return this->rtlen.get_len3().z; } + + dim3 get_leap() const { return this->rtlen.get_leap(); } + int get_ndim() const { return this->rtlen.ndim; } + + void derive_alloclen(cusz_predictortype predictor, dim3 base) + { + if (predictor == LorenzoI) { + // normal + this->__derive_len(base, this->alloclen); + } + + else if (predictor == Spline3) { + // maximum possible + int sublen[3] = {32, 8, 8}; + int anchor_step[3] = {8, 8, 8}; + this->__derive_len(base, this->alloclen, sublen, anchor_step, true); + } + } + + void derive_rtlen(cusz_predictortype predictor, dim3 base) + { + if (predictor == LorenzoI) { + // normal + this->__derive_len(base, this->rtlen); + } + else if (predictor == Spline3) { + // maximum possible + int sublen[3] = {32, 8, 8}; + int anchor_step[3] = {8, 8, 8}; + this->__derive_len(base, this->rtlen, sublen, anchor_step, true); + } + } + + // "real" methods + virtual ~PredictorBoilerplate() = default; +}; + +} // namespace cusz + +#endif diff --git a/qtensor/compression/cusz/include/component/prediction.inl b/qtensor/compression/cusz/include/component/prediction.inl index 941f2592..50091ae1 100644 --- a/qtensor/compression/cusz/include/component/prediction.inl +++ b/qtensor/compression/cusz/include/component/prediction.inl @@ -1,193 +1,193 @@ -/** - * @file prediction.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-04-23 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef FB315D3E_6B96_4F5D_9975_F35702205BC1 -#define FB315D3E_6B96_4F5D_9975_F35702205BC1 - -#include -#include -#include -#include "../common.hh" -#include "../kernel/cpplaunch_cuda.hh" -#include "../kernel/lorenzo_all.hh" -#include "../utils.hh" - -#include "cusz/type.h" -#include "pred_boilerplate_deprecated.hh" - -#define DEFINE_ARRAY(VAR, TYPE) TYPE* d_##VAR{nullptr}; - -#define ALLOCDEV(VAR, SYM, NBYTE) \ - if (NBYTE != 0) { \ - CHECK_CUDA(cudaMalloc(&d_##VAR, NBYTE)); \ - CHECK_CUDA(cudaMemset(d_##VAR, 0x0, NBYTE)); \ - } - -#define ALLOCDEV2(VAR, TYPE, LEN) \ - if (LEN != 0) { \ - CHECK_CUDA(cudaMalloc(&d_##VAR, sizeof(TYPE) * LEN)); \ - CHECK_CUDA(cudaMemset(d_##VAR, 0x0, sizeof(TYPE) * LEN)); \ - } - -#define FREE_DEV_ARRAY(VAR) \ - if (d_##VAR) { \ - CHECK_CUDA(cudaFree(d_##VAR)); \ - d_##VAR = nullptr; \ - } - -namespace cusz { - -template -class PredictionUnified : public PredictorBoilerplate { - public: - using Origin = T; - using Anchor = T; - using ErrCtrl = E; - using Precision = FP; - - public: - ~PredictionUnified() - { // dtor - FREE_DEV_ARRAY(anchor); - FREE_DEV_ARRAY(errctrl); - FREE_DEV_ARRAY(outlier); - } - PredictionUnified() {} // ctor - PredictionUnified(const PredictionUnified&); // copy ctor - PredictionUnified& operator=(const PredictionUnified&); // copy assign - PredictionUnified(PredictionUnified&&); // move ctor - PredictionUnified& operator=(PredictionUnified&&); // move assign - - void init(cusz_predictortype predictor, size_t x, size_t y, size_t z, bool dbg_print = false) - { - auto len3 = dim3(x, y, z); - init(predictor, len3, dbg_print); - } - void init(cusz_predictortype predictor, dim3 xyz, bool dbg_print = false) - { - this->derive_alloclen(predictor, xyz); - - // allocate - ALLOCDEV2(anchor, T, this->alloclen.assigned.anchor); - 
ALLOCDEV2(errctrl, E, this->alloclen.assigned.quant); - ALLOCDEV2(outlier, T, this->alloclen.assigned.outlier); - - if (dbg_print) this->debug_list_alloclen(); - } - - void construct( - cusz_predictortype predictor, - dim3 const len3, - T* data, - T** ptr_anchor, - E** ptr_errctrl, - T** ptr_outlier, - double const eb, - int const radius, - cudaStream_t stream) - { - *ptr_anchor = d_anchor; - *ptr_errctrl = d_errctrl; - *ptr_outlier = d_outlier; - - if (predictor == LorenzoI) { - derive_rtlen(LorenzoI, len3); - this->check_rtlen(); - - // ad hoc placeholder - // auto anchor_len3 = dim3(0, 0, 0); - // auto errctrl_len3 = dim3(0, 0, 0); - uint32_t* outlier_idx = nullptr; - - compress_predict_lorenzo_i( - data, len3, eb, radius, // - d_errctrl, d_outlier, outlier_idx, nullptr, // - &time_elapsed, stream); - } - else if (predictor == Spline3) { - this->derive_rtlen(Spline3, len3); - this->check_rtlen(); - - cusz::cpplaunch_construct_Spline3( - true, // - data, len3, d_anchor, this->rtlen.anchor.len3, d_errctrl, this->rtlen.aligned.len3, eb, radius, - &time_elapsed, stream); - } - } - - void reconstruct( - cusz_predictortype predictor, - dim3 len3, - T* outlier_xdata, - T* anchor, - E* errctrl, - double const eb, - int const radius, - cudaStream_t stream) - { - if (predictor == LorenzoI) { - this->derive_rtlen(LorenzoI, len3); - this->check_rtlen(); - - // ad hoc placeholder - // auto anchor_len3 = dim3(0, 0, 0); - // auto errctrl_len3 = dim3(0, 0, 0); - auto xdata = outlier_xdata; - auto outlier = outlier_xdata; - uint32_t* outlier_idx = nullptr; - - auto xdata_len3 = len3; - - decompress_predict_lorenzo_i( - errctrl, xdata_len3, outlier, outlier_idx, 0, eb, radius, // - xdata, // - &time_elapsed, stream); - } - else if (predictor == Spline3) { - this->derive_rtlen(Spline3, len3); - this->check_rtlen(); - // this->debug_list_rtlen(true); - - // launch_reconstruct_Spline3( - cusz::cpplaunch_reconstruct_Spline3( - outlier_xdata, len3, anchor, this->rtlen.anchor.len3, errctrl, this->rtlen.aligned.len3, eb, radius, - &time_elapsed, stream); - } - } - - void clear_buffer() { cudaMemset(d_errctrl, 0x0, sizeof(E) * this->rtlen.assigned.quant); } - - float get_time_elapsed() const { return time_elapsed; } - // size_t get_alloclen_data() const; - // size_t get_alloclen_quant() const; - // size_t get_len_data() const; - // size_t get_len_quant() const; - // size_t get_len_anchor() const; - - E* expose_quant() const { return d_errctrl; } - E* expose_errctrl() const { return d_errctrl; } - T* expose_anchor() const { return d_anchor; } - T* expose_outlier() const { return d_outlier; } - - public: - // data - DEFINE_ARRAY(anchor, T); - DEFINE_ARRAY(errctrl, E); - DEFINE_ARRAY(outlier, T); -}; - -} // namespace cusz - -#undef ALLOCDEV -#undef FREE_DEV_ARRAY -#undef DEFINE_ARRAY - -#endif /* FB315D3E_6B96_4F5D_9975_F35702205BC1 */ +/** + * @file prediction.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-23 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef FB315D3E_6B96_4F5D_9975_F35702205BC1 +#define FB315D3E_6B96_4F5D_9975_F35702205BC1 + +#include +#include +#include +#include "../common.hh" +#include "../kernel/cpplaunch_cuda.hh" +#include "../kernel/lorenzo_all.hh" +#include "../utils.hh" + +#include "cusz/type.h" +#include "pred_boilerplate_deprecated.hh" + +#define DEFINE_ARRAY(VAR, TYPE) TYPE* d_##VAR{nullptr}; + +#define ALLOCDEV(VAR, SYM, NBYTE) \ + if (NBYTE != 0) { \ + CHECK_CUDA(cudaMalloc(&d_##VAR, NBYTE)); \ + 
CHECK_CUDA(cudaMemset(d_##VAR, 0x0, NBYTE)); \ + } + +#define ALLOCDEV2(VAR, TYPE, LEN) \ + if (LEN != 0) { \ + CHECK_CUDA(cudaMalloc(&d_##VAR, sizeof(TYPE) * LEN)); \ + CHECK_CUDA(cudaMemset(d_##VAR, 0x0, sizeof(TYPE) * LEN)); \ + } + +#define FREE_DEV_ARRAY(VAR) \ + if (d_##VAR) { \ + CHECK_CUDA(cudaFree(d_##VAR)); \ + d_##VAR = nullptr; \ + } + +namespace cusz { + +template +class PredictionUnified : public PredictorBoilerplate { + public: + using Origin = T; + using Anchor = T; + using ErrCtrl = E; + using Precision = FP; + + public: + ~PredictionUnified() + { // dtor + FREE_DEV_ARRAY(anchor); + FREE_DEV_ARRAY(errctrl); + FREE_DEV_ARRAY(outlier); + } + PredictionUnified() {} // ctor + PredictionUnified(const PredictionUnified&); // copy ctor + PredictionUnified& operator=(const PredictionUnified&); // copy assign + PredictionUnified(PredictionUnified&&); // move ctor + PredictionUnified& operator=(PredictionUnified&&); // move assign + + void init(cusz_predictortype predictor, size_t x, size_t y, size_t z, bool dbg_print = false) + { + auto len3 = dim3(x, y, z); + init(predictor, len3, dbg_print); + } + void init(cusz_predictortype predictor, dim3 xyz, bool dbg_print = false) + { + this->derive_alloclen(predictor, xyz); + + // allocate + ALLOCDEV2(anchor, T, this->alloclen.assigned.anchor); + ALLOCDEV2(errctrl, E, this->alloclen.assigned.quant); + ALLOCDEV2(outlier, T, this->alloclen.assigned.outlier); + + if (dbg_print) this->debug_list_alloclen(); + } + + void construct( + cusz_predictortype predictor, + dim3 const len3, + T* data, + T** ptr_anchor, + E** ptr_errctrl, + T** ptr_outlier, + double const eb, + int const radius, + cudaStream_t stream) + { + *ptr_anchor = d_anchor; + *ptr_errctrl = d_errctrl; + *ptr_outlier = d_outlier; + + if (predictor == LorenzoI) { + derive_rtlen(LorenzoI, len3); + this->check_rtlen(); + + // ad hoc placeholder + // auto anchor_len3 = dim3(0, 0, 0); + // auto errctrl_len3 = dim3(0, 0, 0); + uint32_t* outlier_idx = nullptr; + + compress_predict_lorenzo_i( + data, len3, eb, radius, // + d_errctrl, d_outlier, outlier_idx, nullptr, // + &time_elapsed, stream); + } + else if (predictor == Spline3) { + this->derive_rtlen(Spline3, len3); + this->check_rtlen(); + + cusz::cpplaunch_construct_Spline3( + true, // + data, len3, d_anchor, this->rtlen.anchor.len3, d_errctrl, this->rtlen.aligned.len3, eb, radius, + &time_elapsed, stream); + } + } + + void reconstruct( + cusz_predictortype predictor, + dim3 len3, + T* outlier_xdata, + T* anchor, + E* errctrl, + double const eb, + int const radius, + cudaStream_t stream) + { + if (predictor == LorenzoI) { + this->derive_rtlen(LorenzoI, len3); + this->check_rtlen(); + + // ad hoc placeholder + // auto anchor_len3 = dim3(0, 0, 0); + // auto errctrl_len3 = dim3(0, 0, 0); + auto xdata = outlier_xdata; + auto outlier = outlier_xdata; + uint32_t* outlier_idx = nullptr; + + auto xdata_len3 = len3; + + decompress_predict_lorenzo_i( + errctrl, xdata_len3, outlier, outlier_idx, 0, eb, radius, // + xdata, // + &time_elapsed, stream); + } + else if (predictor == Spline3) { + this->derive_rtlen(Spline3, len3); + this->check_rtlen(); + // this->debug_list_rtlen(true); + + // launch_reconstruct_Spline3( + cusz::cpplaunch_reconstruct_Spline3( + outlier_xdata, len3, anchor, this->rtlen.anchor.len3, errctrl, this->rtlen.aligned.len3, eb, radius, + &time_elapsed, stream); + } + } + + void clear_buffer() { cudaMemset(d_errctrl, 0x0, sizeof(E) * this->rtlen.assigned.quant); } + + float get_time_elapsed() const { return time_elapsed; } + 
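A minimal usage sketch of the predictor above (an editorial aside, not part of this patch): it assumes template parameters of the form <data T, error-control E, precision FP>, e.g. <float, uint16_t, float>, a device-resident input buffer, and that <cstdio> and the CUDA runtime are already available; demo_lorenzo_roundtrip and the eb/radius values are hypothetical.

// Aside: LorenzoI construct/reconstruct round trip with PredictionUnified.
void demo_lorenzo_roundtrip(float* d_data, dim3 len3, cudaStream_t stream)
{
    cusz::PredictionUnified<float, uint16_t, float> pred;
    pred.init(LorenzoI, len3);        // derives runtime lengths, allocates device buffers

    float*    d_anchor  = nullptr;    // unused by LorenzoI
    uint16_t* d_errctrl = nullptr;    // quantization codes
    float*    d_outlier = nullptr;    // outliers, scattered back before reconstruction
    double    eb        = 1e-3;       // error bound (hypothetical value)
    int       radius    = 512;        // quant-code radius (hypothetical value)

    pred.construct(LorenzoI, len3, d_data, &d_anchor, &d_errctrl, &d_outlier,
                   eb, radius, stream);

    // reconstruct() treats the outlier buffer and the decompressed output as aliases,
    // matching how the member function above uses outlier_xdata internally.
    pred.reconstruct(LorenzoI, len3, d_outlier, d_anchor, d_errctrl, eb, radius, stream);

    printf("prediction kernel time: %.3f ms\n", pred.get_time_elapsed());
}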
// size_t get_alloclen_data() const; + // size_t get_alloclen_quant() const; + // size_t get_len_data() const; + // size_t get_len_quant() const; + // size_t get_len_anchor() const; + + E* expose_quant() const { return d_errctrl; } + E* expose_errctrl() const { return d_errctrl; } + T* expose_anchor() const { return d_anchor; } + T* expose_outlier() const { return d_outlier; } + + public: + // data + DEFINE_ARRAY(anchor, T); + DEFINE_ARRAY(errctrl, E); + DEFINE_ARRAY(outlier, T); +}; + +} // namespace cusz + +#undef ALLOCDEV +#undef FREE_DEV_ARRAY +#undef DEFINE_ARRAY + +#endif /* FB315D3E_6B96_4F5D_9975_F35702205BC1 */ diff --git a/qtensor/compression/cusz/include/component/spcodec.inl b/qtensor/compression/cusz/include/component/spcodec.inl index 32c91ab0..2a57f2f1 100644 --- a/qtensor/compression/cusz/include/component/spcodec.inl +++ b/qtensor/compression/cusz/include/component/spcodec.inl @@ -1,218 +1,218 @@ -/** - * @file spcodec_vec.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-08-22 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef CF358238_3946_4FFC_B5E6_45C12F0C0B44 -#define CF358238_3946_4FFC_B5E6_45C12F0C0B44 - -#include -#include -#include - -#include -#include -#include - -#include "../common.hh" -#include "../kernel/spv_gpu.hh" -#include "utils/cuda_err.cuh" - -#define DEFINE_ARRAY(VAR, TYPE) TYPE* d_##VAR{nullptr}; - -#define SPVEC_ALLOCDEV(VAR, SYM) \ - CHECK_CUDA(cudaMalloc(&d_##VAR, rte.nbyte[RTE::SYM])); \ - CHECK_CUDA(cudaMemset(d_##VAR, 0x0, rte.nbyte[RTE::SYM])); - -#define SPVEC_FREEDEV(VAR) \ - if (d_##VAR) { \ - CHECK_CUDA(cudaFree(d_##VAR)); \ - d_##VAR = nullptr; \ - } - -#define SPVEC_D2DCPY(VAR, FIELD) \ - { \ - auto dst = d_spfmt + header.entry[Header::FIELD]; \ - auto src = reinterpret_cast(d_##VAR); \ - CHECK_CUDA(cudaMemcpyAsync(dst, src, nbyte[Header::FIELD], cudaMemcpyDeviceToDevice, stream)); \ - } - -namespace cusz { - -/******************************************************************************* - * sparsity-aware coder/decoder, vector - *******************************************************************************/ - -template -class SpcodecVec { - public: - using Origin = T; - using BYTE = uint8_t; - using MetadataT = M; - - struct alignas(128) Header { - static const int HEADER = 0; - static const int IDX = 1; - static const int VAL = 2; - static const int END = 3; - - int self_bytes : 16; - size_t uncompressed_len; - int nnz; - MetadataT entry[END + 1]; - - MetadataT subfile_size() const { return entry[END]; } - }; - - struct runtime_encode_helper { - static const int SPFMT = 0; - static const int IDX = 1; - static const int VAL = 2; - static const int END = 3; - - uint32_t nbyte[END]; - int nnz{0}; - }; - - private: - DEFINE_ARRAY(spfmt, BYTE); - DEFINE_ARRAY(idx, M); - DEFINE_ARRAY(val, T); - - using RTE = runtime_encode_helper; - - float milliseconds{0.0}; - - RTE rte; - - public: - ~SpcodecVec() - { - SPVEC_FREEDEV(spfmt); - SPVEC_FREEDEV(idx); - SPVEC_FREEDEV(val); - } // dtor - SpcodecVec() {} // ctor - SpcodecVec(const SpcodecVec&); // copy ctor - SpcodecVec& operator=(const SpcodecVec&); // copy assign - SpcodecVec(SpcodecVec&&); // move ctor - SpcodecVec& operator=(SpcodecVec&&); // move assign - - void init(size_t const len, int density_factor = 4, bool dbg_print = false) - { - auto max_bytes = [&]() { return len / density_factor * sizeof(T); }; - auto init_nnz = [&]() { return len / density_factor; }; - - memset(rte.nbyte, 0, sizeof(uint32_t) * RTE::END); - rte.nnz = 
init_nnz(); - - rte.nbyte[RTE::SPFMT] = max_bytes(); - rte.nbyte[RTE::IDX] = rte.nnz * sizeof(int); - rte.nbyte[RTE::VAL] = rte.nnz * sizeof(T); - - SPVEC_ALLOCDEV(spfmt, SPFMT); - SPVEC_ALLOCDEV(idx, IDX); - SPVEC_ALLOCDEV(val, VAL); - - // if (dbg_print) debug(); - } - - void encode( - T* in, - size_t const in_len, - BYTE*& out, - size_t& out_len, - cudaStream_t stream = nullptr, - bool dbg_print = false) - { - Header header; - - psz::spv_gather(in, in_len, this->d_val, this->d_idx, &rte.nnz, &milliseconds, stream); - - subfile_collect(header, in_len, stream, dbg_print); - out = d_spfmt; - out_len = header.subfile_size(); - } - - void decode(BYTE* coded, T* decoded, cudaStream_t stream = nullptr) - { - Header header; - CHECK_CUDA(cudaMemcpyAsync(&header, coded, sizeof(header), cudaMemcpyDeviceToHost, stream)); - -#define ACCESSOR(SYM, TYPE) reinterpret_cast(coded + header.entry[Header::SYM]) - auto d_idx = ACCESSOR(IDX, uint32_t); - auto d_val = ACCESSOR(VAL, T); -#undef ACCESSOR - - psz::spv_scatter(d_val, d_idx, header.nnz, decoded, &milliseconds, stream); - } - - void clear_buffer() - { - cudaMemset(d_spfmt, 0x0, rte.nbyte[RTE::SPFMT]); - cudaMemset(d_idx, 0x0, rte.nbyte[RTE::IDX]); - cudaMemset(d_val, 0x0, rte.nbyte[RTE::VAL]); - } - - float get_time_elapsed() const { return milliseconds; } - - void subfile_collect(Header& header, size_t len, cudaStream_t stream, bool dbg_print) - { - header.self_bytes = sizeof(Header); - header.uncompressed_len = len; - header.nnz = rte.nnz; - - // update (redundant here) - rte.nbyte[RTE::IDX] = sizeof(int) * rte.nnz; - rte.nbyte[RTE::VAL] = sizeof(T) * rte.nnz; - - MetadataT nbyte[Header::END]; - nbyte[Header::HEADER] = 128; - nbyte[Header::IDX] = rte.nbyte[RTE::IDX]; - nbyte[Header::VAL] = rte.nbyte[RTE::VAL]; - - header.entry[0] = 0; - // *.END + 1; need to knwo the ending position - for (auto i = 1; i < Header::END + 1; i++) { header.entry[i] = nbyte[i - 1]; } - for (auto i = 1; i < Header::END + 1; i++) { header.entry[i] += header.entry[i - 1]; } - - auto debug_header_entry = [&]() { - printf("\nCSR11::subfile_collect() debugging:\n"); - printf("%-*s: %'10ld\n", 16, "final.nnz", rte.nnz); - printf(" ENTRIES\n"); - -#define PRINT_ENTRY(VAR) printf("%d %-*s: %'10u\n", (int)Header::VAR, 14, #VAR, header.entry[Header::VAR]); - PRINT_ENTRY(HEADER); - PRINT_ENTRY(IDX); - PRINT_ENTRY(VAL); - PRINT_ENTRY(END); - printf("\n"); -#undef PRINT_ENTRY - }; - if (dbg_print) debug_header_entry(); - - CHECK_CUDA(cudaMemcpyAsync(d_spfmt, &header, sizeof(header), cudaMemcpyHostToDevice, stream)); - - /* debug */ CHECK_CUDA(cudaStreamSynchronize(stream)); - - SPVEC_D2DCPY(idx, IDX) - SPVEC_D2DCPY(val, VAL) - - /* debug */ CHECK_CUDA(cudaStreamSynchronize(stream)); - } -}; - -} // namespace cusz - -#undef DEFINE_ARRAY -#undef SPVEC_ALLOCDEV -#undef SPVEC_FREEDEV -#undef SPVEC_D2DCPY - -#endif /* CF358238_3946_4FFC_B5E6_45C12F0C0B44 */ +/** + * @file spcodec_vec.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-08-22 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef CF358238_3946_4FFC_B5E6_45C12F0C0B44 +#define CF358238_3946_4FFC_B5E6_45C12F0C0B44 + +#include +#include +#include + +#include +#include +#include + +#include "../common.hh" +#include "../kernel/spv_gpu.hh" +#include "utils/cuda_err.cuh" + +#define DEFINE_ARRAY(VAR, TYPE) TYPE* d_##VAR{nullptr}; + +#define SPVEC_ALLOCDEV(VAR, SYM) \ + CHECK_CUDA(cudaMalloc(&d_##VAR, rte.nbyte[RTE::SYM])); \ + CHECK_CUDA(cudaMemset(d_##VAR, 0x0, 
rte.nbyte[RTE::SYM])); + +#define SPVEC_FREEDEV(VAR) \ + if (d_##VAR) { \ + CHECK_CUDA(cudaFree(d_##VAR)); \ + d_##VAR = nullptr; \ + } + +#define SPVEC_D2DCPY(VAR, FIELD) \ + { \ + auto dst = d_spfmt + header.entry[Header::FIELD]; \ + auto src = reinterpret_cast(d_##VAR); \ + CHECK_CUDA(cudaMemcpyAsync(dst, src, nbyte[Header::FIELD], cudaMemcpyDeviceToDevice, stream)); \ + } + +namespace cusz { + +/******************************************************************************* + * sparsity-aware coder/decoder, vector + *******************************************************************************/ + +template +class SpcodecVec { + public: + using Origin = T; + using BYTE = uint8_t; + using MetadataT = M; + + struct alignas(128) Header { + static const int HEADER = 0; + static const int IDX = 1; + static const int VAL = 2; + static const int END = 3; + + int self_bytes : 16; + size_t uncompressed_len; + int nnz; + MetadataT entry[END + 1]; + + MetadataT subfile_size() const { return entry[END]; } + }; + + struct runtime_encode_helper { + static const int SPFMT = 0; + static const int IDX = 1; + static const int VAL = 2; + static const int END = 3; + + uint32_t nbyte[END]; + int nnz{0}; + }; + + private: + DEFINE_ARRAY(spfmt, BYTE); + DEFINE_ARRAY(idx, M); + DEFINE_ARRAY(val, T); + + using RTE = runtime_encode_helper; + + float milliseconds{0.0}; + + RTE rte; + + public: + ~SpcodecVec() + { + SPVEC_FREEDEV(spfmt); + SPVEC_FREEDEV(idx); + SPVEC_FREEDEV(val); + } // dtor + SpcodecVec() {} // ctor + SpcodecVec(const SpcodecVec&); // copy ctor + SpcodecVec& operator=(const SpcodecVec&); // copy assign + SpcodecVec(SpcodecVec&&); // move ctor + SpcodecVec& operator=(SpcodecVec&&); // move assign + + void init(size_t const len, int density_factor = 4, bool dbg_print = false) + { + auto max_bytes = [&]() { return len / density_factor * sizeof(T); }; + auto init_nnz = [&]() { return len / density_factor; }; + + memset(rte.nbyte, 0, sizeof(uint32_t) * RTE::END); + rte.nnz = init_nnz(); + + rte.nbyte[RTE::SPFMT] = max_bytes(); + rte.nbyte[RTE::IDX] = rte.nnz * sizeof(int); + rte.nbyte[RTE::VAL] = rte.nnz * sizeof(T); + + SPVEC_ALLOCDEV(spfmt, SPFMT); + SPVEC_ALLOCDEV(idx, IDX); + SPVEC_ALLOCDEV(val, VAL); + + // if (dbg_print) debug(); + } + + void encode( + T* in, + size_t const in_len, + BYTE*& out, + size_t& out_len, + cudaStream_t stream = nullptr, + bool dbg_print = false) + { + Header header; + + psz::spv_gather(in, in_len, this->d_val, this->d_idx, &rte.nnz, &milliseconds, stream); + + subfile_collect(header, in_len, stream, dbg_print); + out = d_spfmt; + out_len = header.subfile_size(); + } + + void decode(BYTE* coded, T* decoded, cudaStream_t stream = nullptr) + { + Header header; + CHECK_CUDA(cudaMemcpyAsync(&header, coded, sizeof(header), cudaMemcpyDeviceToHost, stream)); + +#define ACCESSOR(SYM, TYPE) reinterpret_cast(coded + header.entry[Header::SYM]) + auto d_idx = ACCESSOR(IDX, uint32_t); + auto d_val = ACCESSOR(VAL, T); +#undef ACCESSOR + + psz::spv_scatter(d_val, d_idx, header.nnz, decoded, &milliseconds, stream); + } + + void clear_buffer() + { + cudaMemset(d_spfmt, 0x0, rte.nbyte[RTE::SPFMT]); + cudaMemset(d_idx, 0x0, rte.nbyte[RTE::IDX]); + cudaMemset(d_val, 0x0, rte.nbyte[RTE::VAL]); + } + + float get_time_elapsed() const { return milliseconds; } + + void subfile_collect(Header& header, size_t len, cudaStream_t stream, bool dbg_print) + { + header.self_bytes = sizeof(Header); + header.uncompressed_len = len; + header.nnz = rte.nnz; + + // update (redundant here) + 
rte.nbyte[RTE::IDX] = sizeof(int) * rte.nnz; + rte.nbyte[RTE::VAL] = sizeof(T) * rte.nnz; + + MetadataT nbyte[Header::END]; + nbyte[Header::HEADER] = 128; + nbyte[Header::IDX] = rte.nbyte[RTE::IDX]; + nbyte[Header::VAL] = rte.nbyte[RTE::VAL]; + + header.entry[0] = 0; + // *.END + 1; need to knwo the ending position + for (auto i = 1; i < Header::END + 1; i++) { header.entry[i] = nbyte[i - 1]; } + for (auto i = 1; i < Header::END + 1; i++) { header.entry[i] += header.entry[i - 1]; } + + auto debug_header_entry = [&]() { + printf("\nCSR11::subfile_collect() debugging:\n"); + printf("%-*s: %'10ld\n", 16, "final.nnz", rte.nnz); + printf(" ENTRIES\n"); + +#define PRINT_ENTRY(VAR) printf("%d %-*s: %'10u\n", (int)Header::VAR, 14, #VAR, header.entry[Header::VAR]); + PRINT_ENTRY(HEADER); + PRINT_ENTRY(IDX); + PRINT_ENTRY(VAL); + PRINT_ENTRY(END); + printf("\n"); +#undef PRINT_ENTRY + }; + if (dbg_print) debug_header_entry(); + + CHECK_CUDA(cudaMemcpyAsync(d_spfmt, &header, sizeof(header), cudaMemcpyHostToDevice, stream)); + + /* debug */ CHECK_CUDA(cudaStreamSynchronize(stream)); + + SPVEC_D2DCPY(idx, IDX) + SPVEC_D2DCPY(val, VAL) + + /* debug */ CHECK_CUDA(cudaStreamSynchronize(stream)); + } +}; + +} // namespace cusz + +#undef DEFINE_ARRAY +#undef SPVEC_ALLOCDEV +#undef SPVEC_FREEDEV +#undef SPVEC_D2DCPY + +#endif /* CF358238_3946_4FFC_B5E6_45C12F0C0B44 */ diff --git a/qtensor/compression/cusz/include/compressor.hh b/qtensor/compression/cusz/include/compressor.hh index 7ea8c0ab..adea8f57 100644 --- a/qtensor/compression/cusz/include/compressor.hh +++ b/qtensor/compression/cusz/include/compressor.hh @@ -1,165 +1,165 @@ -/** - * @file compressor.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-04-23 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef CUSZ_COMPRESSOR_HH -#define CUSZ_COMPRESSOR_HH - -#include -#include - -#include "common/type_traits.hh" -#include "compaction.hh" -#include "component.hh" -#include "context.hh" -#include "header.h" - -#define PUBLIC_TYPES \ - using Predictor = typename BINDING::Predictor; \ - using Spcodec = typename BINDING::Spcodec; \ - using Codec = typename BINDING::Codec; \ - using FallbackCodec = typename BINDING::FallbackCodec; \ - using BYTE = uint8_t; \ - \ - using T = typename BINDING::DATA; \ - using FP = typename BINDING::FP; \ - using E = typename BINDING::ERRCTRL; \ - using H = typename Codec::Encoded; \ - using M = typename Codec::MetadataT; \ - using H_FB = typename FallbackCodec::Encoded; \ - \ - using TimeRecord = std::vector>; \ - using timerecord_t = TimeRecord*; - -namespace cusz { - -// extra helper -struct CompressorHelper { - static int autotune_coarse_parvle(Context* ctx); -}; - -template -class Compressor { - public: - using Predictor = typename BINDING::Predictor; - using Spcodec = typename BINDING::Spcodec; - using Codec = typename BINDING::Codec; - using FallbackCodec = typename BINDING::FallbackCodec; - using BYTE = uint8_t; - - using T = typename Predictor::Origin; - using FP = typename Predictor::Precision; - using E = typename Predictor::ErrCtrl; - using H = typename Codec::Encoded; - using M = typename Codec::MetadataT; - using H_FB = typename FallbackCodec::Encoded; - - using TimeRecord = std::vector>; - using timerecord_t = TimeRecord*; - - private: - class impl; - std::unique_ptr pimpl; - - public: - ~Compressor(); - Compressor(); - Compressor(const Compressor&); - Compressor& operator=(const Compressor&); - Compressor(Compressor&&); - Compressor& 
operator=(Compressor&&); - - // methods - void init(Context*, bool dbg_print = false); - void init(Header*, bool dbg_print = false); - void destroy(); - void compress(Context*, T*, BYTE*&, size_t&, cudaStream_t = nullptr, bool = false); - void decompress(Header*, BYTE*, T*, cudaStream_t = nullptr, bool = true); - void clear_buffer(); - // getter - void export_header(Header&); - void export_header(Header*); - void export_timerecord(TimeRecord*); -}; - -template -class Compressor::impl { - public: - using Predictor = typename BINDING::Predictor; - using Spcodec = typename BINDING::Spcodec; - using Codec = typename BINDING::Codec; - using FallbackCodec = typename BINDING::FallbackCodec; - using BYTE = uint8_t; - - using T = typename Predictor::Origin; - using FP = typename Predictor::Precision; - using E = typename Predictor::ErrCtrl; - using H = typename Codec::Encoded; - using M = typename Codec::MetadataT; - using H_FB = typename FallbackCodec::Encoded; - - using TimeRecord = std::vector>; - using timerecord_t = TimeRecord*; - - private: - // state - bool use_fallback_codec{false}; - bool fallback_codec_allocated{false}; - BYTE* d_reserved_compressed{nullptr}; - // profiling - TimeRecord timerecord; - // header - Header header; - // components - - Predictor* predictor; - Spcodec* spcodec; - Codec* codec; - FallbackCodec* fb_codec; - // variables - uint32_t* d_freq; - float time_hist; - dim3 data_len3; - - public: - ~impl(); - impl(); - - // public methods - void init(Context* config, bool dbg_print = false); - void init(Header* config, bool dbg_print = false); - void compress(Context*, T*, BYTE*&, size_t&, cudaStream_t = nullptr, bool = false); - void decompress(Header*, BYTE*, T*, cudaStream_t = nullptr, bool = true); - void clear_buffer(); - - // getter - void export_header(Header&); - void export_header(Header*); - void export_timerecord(TimeRecord*); - uint32_t get_len_data(); - - private: - // helper - template - void init_detail(CONFIG*, bool); - void init_codec(size_t, unsigned int, int, int, bool); - void collect_compress_timerecord(); - void collect_decompress_timerecord(); - void encode_with_exception(E*, size_t, uint32_t*, int, int, int, bool, BYTE*&, size_t&, cudaStream_t, bool); - void subfile_collect(T*, size_t, BYTE*, size_t, BYTE*, size_t, cudaStream_t, bool); - void destroy(); - // getter -}; - -} // namespace cusz - -#undef PUBLIC_TYPES - -#endif +/** + * @file compressor.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-23 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_COMPRESSOR_HH +#define CUSZ_COMPRESSOR_HH + +#include +#include + +#include "common/type_traits.hh" +#include "compaction.hh" +#include "component.hh" +#include "context.hh" +#include "header.h" + +#define PUBLIC_TYPES \ + using Predictor = typename BINDING::Predictor; \ + using Spcodec = typename BINDING::Spcodec; \ + using Codec = typename BINDING::Codec; \ + using FallbackCodec = typename BINDING::FallbackCodec; \ + using BYTE = uint8_t; \ + \ + using T = typename BINDING::DATA; \ + using FP = typename BINDING::FP; \ + using E = typename BINDING::ERRCTRL; \ + using H = typename Codec::Encoded; \ + using M = typename Codec::MetadataT; \ + using H_FB = typename FallbackCodec::Encoded; \ + \ + using TimeRecord = std::vector>; \ + using timerecord_t = TimeRecord*; + +namespace cusz { + +// extra helper +struct CompressorHelper { + static int autotune_coarse_parvle(Context* ctx); +}; + +template +class Compressor { + public: + 
using Predictor = typename BINDING::Predictor; + using Spcodec = typename BINDING::Spcodec; + using Codec = typename BINDING::Codec; + using FallbackCodec = typename BINDING::FallbackCodec; + using BYTE = uint8_t; + + using T = typename Predictor::Origin; + using FP = typename Predictor::Precision; + using E = typename Predictor::ErrCtrl; + using H = typename Codec::Encoded; + using M = typename Codec::MetadataT; + using H_FB = typename FallbackCodec::Encoded; + + using TimeRecord = std::vector>; + using timerecord_t = TimeRecord*; + + private: + class impl; + std::unique_ptr pimpl; + + public: + ~Compressor(); + Compressor(); + Compressor(const Compressor&); + Compressor& operator=(const Compressor&); + Compressor(Compressor&&); + Compressor& operator=(Compressor&&); + + // methods + void init(Context*, bool dbg_print = false); + void init(Header*, bool dbg_print = false); + void destroy(); + void compress(Context*, T*, BYTE*&, size_t&, cudaStream_t = nullptr, bool = false); + void decompress(Header*, BYTE*, T*, cudaStream_t = nullptr, bool = true); + void clear_buffer(); + // getter + void export_header(Header&); + void export_header(Header*); + void export_timerecord(TimeRecord*); +}; + +template +class Compressor::impl { + public: + using Predictor = typename BINDING::Predictor; + using Spcodec = typename BINDING::Spcodec; + using Codec = typename BINDING::Codec; + using FallbackCodec = typename BINDING::FallbackCodec; + using BYTE = uint8_t; + + using T = typename Predictor::Origin; + using FP = typename Predictor::Precision; + using E = typename Predictor::ErrCtrl; + using H = typename Codec::Encoded; + using M = typename Codec::MetadataT; + using H_FB = typename FallbackCodec::Encoded; + + using TimeRecord = std::vector>; + using timerecord_t = TimeRecord*; + + private: + // state + bool use_fallback_codec{false}; + bool fallback_codec_allocated{false}; + BYTE* d_reserved_compressed{nullptr}; + // profiling + TimeRecord timerecord; + // header + Header header; + // components + + Predictor* predictor; + Spcodec* spcodec; + Codec* codec; + FallbackCodec* fb_codec; + // variables + uint32_t* d_freq; + float time_hist; + dim3 data_len3; + + public: + ~impl(); + impl(); + + // public methods + void init(Context* config, bool dbg_print = false); + void init(Header* config, bool dbg_print = false); + void compress(Context*, T*, BYTE*&, size_t&, cudaStream_t = nullptr, bool = false); + void decompress(Header*, BYTE*, T*, cudaStream_t = nullptr, bool = true); + void clear_buffer(); + + // getter + void export_header(Header&); + void export_header(Header*); + void export_timerecord(TimeRecord*); + uint32_t get_len_data(); + + private: + // helper + template + void init_detail(CONFIG*, bool); + void init_codec(size_t, unsigned int, int, int, bool); + void collect_compress_timerecord(); + void collect_decompress_timerecord(); + void encode_with_exception(E*, size_t, uint32_t*, int, int, int, bool, BYTE*&, size_t&, cudaStream_t, bool); + void subfile_collect(T*, size_t, BYTE*, size_t, BYTE*, size_t, cudaStream_t, bool); + void destroy(); + // getter +}; + +} // namespace cusz + +#undef PUBLIC_TYPES + +#endif diff --git a/qtensor/compression/cusz/include/context.hh b/qtensor/compression/cusz/include/context.hh index 36cbae57..d177fb8f 100644 --- a/qtensor/compression/cusz/include/context.hh +++ b/qtensor/compression/cusz/include/context.hh @@ -1,251 +1,251 @@ -#ifndef ARGPARSE_HH -#define ARGPARSE_HH - -/** - * @file argparse.hh - * @author Jiannan Tian - * @brief Argument parser (header). 
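The pimpl-based Compressor facade shown above (compressor.hh) is driven through init/compress/export_header on the way in and decompress on the way out. A minimal sketch of that sequence, assuming CompressorFP32 (the float binding declared later in framework.hh) and placeholder ctx, device pointers, and stream, none of which come from this patch:

    cusz::CompressorFP32 cor;
    cusz::Header header;                 // cusz_header, declared in header.h below
    uint8_t*     d_compressed{nullptr};
    size_t       compressed_len{0};

    cor.init(ctx);                       // ctx: cusz::Context* built elsewhere
    cor.compress(ctx, d_data, d_compressed, compressed_len, stream);
    cor.export_header(header);
    // ... persist or transmit header + d_compressed ...
    cor.decompress(&header, d_compressed, d_xdata, stream);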
- * @version 0.1 - * @date 2020-09-20 - * Created on: 20-04-24 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#include -#include -#include -#include - -#include "common/configs.hh" -#include "common/definition.hh" -#include "utils/format.hh" -#include "utils/strhelper.hh" - -namespace cusz { - -extern const char* VERSION_TEXT; -extern const int version; -extern const int compatibility; - -} // namespace cusz - -struct cuszCTX { - public: - // on-off's - struct { - bool construct{false}, reconstruct{false}, dryrun{false}; - bool experiment{false}; - bool gtest{false}; - } cli_task; - - struct { - bool binning{false}, logtransform{false}, prescan{false}; - } preprocess; - struct { - bool gpu_nvcomp_cascade{false}, cpu_gzip{false}; - } postcompress; - - struct { - bool predefined_demo{false}, release_input{false}; - bool anchor{false}, autotune_vle_pardeg{true}, gpu_verify{false}; - } use; - - struct { - bool book{false}, quant{false}; - } export_raw; - - struct { - bool write2disk{false}, huffman{false}; - } skip; - struct { - bool time{false}, cr{false}, compressibility{false}; - } report; - - // filenames - struct { - std::string fname, origin_cmp, path_basename, basename, compress_output; - } fname; - - bool verbose{false}; - - // Stat stat; - - int read_args_status{0}; - - std::string opath; - - std::string demo_dataset; - std::string dtype = ConfigHelper::get_default_dtype(); // "f32" - std::string mode = ConfigHelper::get_default_cuszmode(); // "r2r" - std::string predictor = ConfigHelper::get_default_predictor(); // "lorenzo" - std::string codec = ConfigHelper::get_default_codec(); // "huffman-coarse" - std::string spcodec = ConfigHelper::get_default_spcodec(); // "cusparse-csr" - std::string pipeline = "auto"; - - // sparsity related: init_nnz when setting up Spcodec - float nz_density{SparseMethodSetup::default_density}; - float nz_density_factor{SparseMethodSetup::default_density_factor}; - - uint32_t codecs_in_use{0b01}; - - uint32_t quant_bytewidth{2}, huff_bytewidth{4}; - - bool codec_force_fallback() const { return huff_bytewidth == 8; } - - size_t huffman_num_uints, huffman_num_bits; - int vle_sublen{512}, vle_pardeg{-1}; - - unsigned int x{1}, y{1}, z{1}, w{1}; - - struct { - // size_t x, y, z, w; - size_t len; - } alloclen; - - size_t data_len{1}, quant_len{1}, anchor_len{1}; - int ndim{-1}; - - size_t get_len() const { return data_len; } - - double eb{0.0}; - int dict_size{1024}, radius{512}; - - void load_demo_sizes(); - - /******************************************************************************* - * another configuration method, alternative to - *******************************************************************************/ - public: - // for configuration - cuszCTX& set_eb(double _) - { - eb = _; - return *this; - } - - cuszCTX& set_radius(int _) - { - radius = _; - dict_size = radius * 2; - return *this; - } - - cuszCTX& set_huffbyte(int _) - { - huff_bytewidth = _; - codecs_in_use = codec_force_fallback() ? 0b11 /*use both*/ : 0b01 /*use 4-byte*/; - return *this; - } - - cuszCTX& set_huffchunk(int _) - { - vle_sublen = _; - use.autotune_vle_pardeg = false; - return *this; - } - - cuszCTX& set_spcodec_densityfactor(int _) - { - if (_ <= 1) - throw std::runtime_error( - "Density factor for Spcodec must be >1. 
For example, setting the factor as 4 indicates the density " - "(the portion of nonzeros) is 25% in an array."); - nz_density_factor = _; - nz_density = 1.0 / _; - return *this; - } - - cuszCTX& enable_anchor(bool _) - { - use.anchor = true; - return *this; - } - cuszCTX& enable_input_nondestructive(bool _) - { - // placeholder - return *this; - } - - cuszCTX& enable_failfast(bool _) - { - // placeholder - return *this; - } - - cuszCTX& set_alloclen(size_t _) - { - alloclen.len = _; - return *this; - } - - cuszCTX& set_control_string(const char* in_str); - - cuszCTX& use_anchor(size_t _) - { - use.anchor = true; - return *this; - } - - // set x, y, z, w, ndim, data_len - cuszCTX& set_len(size_t _x, size_t _y = 1, size_t _z = 1, size_t _w = 1) - { - x = _x, y = _y, z = _z, w = _w; - - ndim = 4; - if (w == 1) ndim = 3; - if (z == 1) ndim = 2; - if (y == 1) ndim = 1; - - data_len = x * y * z * w; - - if (data_len == 1) throw std::runtime_error("Input data length cannot be 1 (in 1-D view)."); - if (data_len == 0) throw std::runtime_error("Input data length cannot be 0 (in 1-D view)."); - - return *this; - } - - private: - void derive_fnames(); - - void validate(); - - public: - void trap(int _status); - - static void print_doc(bool full = false); - - public: - static void parse_input_length(const char* lenstr, cuszCTX* ctx) - { - std::vector dims; - ConfigHelper::parse_length_literal(lenstr, dims); - ctx->ndim = dims.size(); - ctx->y = ctx->z = ctx->w = 1; - ctx->x = StrHelper::str2int(dims[0]); - if (ctx->ndim >= 2) ctx->y = StrHelper::str2int(dims[1]); - if (ctx->ndim >= 3) ctx->z = StrHelper::str2int(dims[2]); - if (ctx->ndim >= 4) ctx->w = StrHelper::str2int(dims[3]); - ctx->data_len = ctx->x * ctx->y * ctx->z * ctx->w; - } - - public: - cuszCTX() = default; - - cuszCTX(int argc, char** argv); - - cuszCTX(const char*, bool dbg_print = false); -}; - -typedef struct cuszCTX cusz_context; - -namespace cusz { - -using Context = cusz_context; -using context_t = cusz_context*; - -} // namespace cusz - -#endif // ARGPARSE_HH +#ifndef ARGPARSE_HH +#define ARGPARSE_HH + +/** + * @file argparse.hh + * @author Jiannan Tian + * @brief Argument parser (header). 
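The cuszCTX struct above doubles as a fluent builder: set_len() derives ndim from the trailing size-1 dimensions and data_len as their product, set_radius() keeps dict_size at twice the radius, and set_spcodec_densityfactor(f) presumes a nonzero density of 1/f. A short sketch of that chaining with example sizes, not values taken from this patch:

    cuszCTX ctx;
    ctx.set_len(3600, 1800)                // ndim = 2, data_len = 6,480,000
       .set_eb(1e-4)                       // error bound
       .set_radius(512)                    // dict_size becomes 1024
       .set_spcodec_densityfactor(4);      // presumed nonzero density = 25%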
+ * @version 0.1 + * @date 2020-09-20 + * Created on: 20-04-24 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include +#include +#include +#include + +#include "common/configs.hh" +#include "common/definition.hh" +#include "utils/format.hh" +#include "utils/strhelper.hh" + +namespace cusz { + +extern const char* VERSION_TEXT; +extern const int version; +extern const int compatibility; + +} // namespace cusz + +struct cuszCTX { + public: + // on-off's + struct { + bool construct{false}, reconstruct{false}, dryrun{false}; + bool experiment{false}; + bool gtest{false}; + } cli_task; + + struct { + bool binning{false}, logtransform{false}, prescan{false}; + } preprocess; + struct { + bool gpu_nvcomp_cascade{false}, cpu_gzip{false}; + } postcompress; + + struct { + bool predefined_demo{false}, release_input{false}; + bool anchor{false}, autotune_vle_pardeg{true}, gpu_verify{false}; + } use; + + struct { + bool book{false}, quant{false}; + } export_raw; + + struct { + bool write2disk{false}, huffman{false}; + } skip; + struct { + bool time{false}, cr{false}, compressibility{false}; + } report; + + // filenames + struct { + std::string fname, origin_cmp, path_basename, basename, compress_output; + } fname; + + bool verbose{false}; + + // Stat stat; + + int read_args_status{0}; + + std::string opath; + + std::string demo_dataset; + std::string dtype = ConfigHelper::get_default_dtype(); // "f32" + std::string mode = ConfigHelper::get_default_cuszmode(); // "r2r" + std::string predictor = ConfigHelper::get_default_predictor(); // "lorenzo" + std::string codec = ConfigHelper::get_default_codec(); // "huffman-coarse" + std::string spcodec = ConfigHelper::get_default_spcodec(); // "cusparse-csr" + std::string pipeline = "auto"; + + // sparsity related: init_nnz when setting up Spcodec + float nz_density{SparseMethodSetup::default_density}; + float nz_density_factor{SparseMethodSetup::default_density_factor}; + + uint32_t codecs_in_use{0b01}; + + uint32_t quant_bytewidth{2}, huff_bytewidth{4}; + + bool codec_force_fallback() const { return huff_bytewidth == 8; } + + size_t huffman_num_uints, huffman_num_bits; + int vle_sublen{512}, vle_pardeg{-1}; + + unsigned int x{1}, y{1}, z{1}, w{1}; + + struct { + // size_t x, y, z, w; + size_t len; + } alloclen; + + size_t data_len{1}, quant_len{1}, anchor_len{1}; + int ndim{-1}; + + size_t get_len() const { return data_len; } + + double eb{0.0}; + int dict_size{1024}, radius{512}; + + void load_demo_sizes(); + + /******************************************************************************* + * another configuration method, alternative to + *******************************************************************************/ + public: + // for configuration + cuszCTX& set_eb(double _) + { + eb = _; + return *this; + } + + cuszCTX& set_radius(int _) + { + radius = _; + dict_size = radius * 2; + return *this; + } + + cuszCTX& set_huffbyte(int _) + { + huff_bytewidth = _; + codecs_in_use = codec_force_fallback() ? 0b11 /*use both*/ : 0b01 /*use 4-byte*/; + return *this; + } + + cuszCTX& set_huffchunk(int _) + { + vle_sublen = _; + use.autotune_vle_pardeg = false; + return *this; + } + + cuszCTX& set_spcodec_densityfactor(int _) + { + if (_ <= 1) + throw std::runtime_error( + "Density factor for Spcodec must be >1. 
For example, setting the factor as 4 indicates the density " + "(the portion of nonzeros) is 25% in an array."); + nz_density_factor = _; + nz_density = 1.0 / _; + return *this; + } + + cuszCTX& enable_anchor(bool _) + { + use.anchor = true; + return *this; + } + cuszCTX& enable_input_nondestructive(bool _) + { + // placeholder + return *this; + } + + cuszCTX& enable_failfast(bool _) + { + // placeholder + return *this; + } + + cuszCTX& set_alloclen(size_t _) + { + alloclen.len = _; + return *this; + } + + cuszCTX& set_control_string(const char* in_str); + + cuszCTX& use_anchor(size_t _) + { + use.anchor = true; + return *this; + } + + // set x, y, z, w, ndim, data_len + cuszCTX& set_len(size_t _x, size_t _y = 1, size_t _z = 1, size_t _w = 1) + { + x = _x, y = _y, z = _z, w = _w; + + ndim = 4; + if (w == 1) ndim = 3; + if (z == 1) ndim = 2; + if (y == 1) ndim = 1; + + data_len = x * y * z * w; + + if (data_len == 1) throw std::runtime_error("Input data length cannot be 1 (in 1-D view)."); + if (data_len == 0) throw std::runtime_error("Input data length cannot be 0 (in 1-D view)."); + + return *this; + } + + private: + void derive_fnames(); + + void validate(); + + public: + void trap(int _status); + + static void print_doc(bool full = false); + + public: + static void parse_input_length(const char* lenstr, cuszCTX* ctx) + { + std::vector dims; + ConfigHelper::parse_length_literal(lenstr, dims); + ctx->ndim = dims.size(); + ctx->y = ctx->z = ctx->w = 1; + ctx->x = StrHelper::str2int(dims[0]); + if (ctx->ndim >= 2) ctx->y = StrHelper::str2int(dims[1]); + if (ctx->ndim >= 3) ctx->z = StrHelper::str2int(dims[2]); + if (ctx->ndim >= 4) ctx->w = StrHelper::str2int(dims[3]); + ctx->data_len = ctx->x * ctx->y * ctx->z * ctx->w; + } + + public: + cuszCTX() = default; + + cuszCTX(int argc, char** argv); + + cuszCTX(const char*, bool dbg_print = false); +}; + +typedef struct cuszCTX cusz_context; + +namespace cusz { + +using Context = cusz_context; +using context_t = cusz_context*; + +} // namespace cusz + +#endif // ARGPARSE_HH diff --git a/qtensor/compression/cusz/include/cusz.h b/qtensor/compression/cusz/include/cusz.h index 694d315c..420999cc 100644 --- a/qtensor/compression/cusz/include/cusz.h +++ b/qtensor/compression/cusz/include/cusz.h @@ -1,60 +1,60 @@ -/** - * @file cusz.h - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-04-29 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#include -//#define __cplusplus -//#ifdef __cplusplus -extern "C" { -//#endif - -#ifndef CUSZ_H -#define CUSZ_H - -#include - -#include "cusz/custom.h" -#include "cusz/record.h" -#include "cusz/type.h" -#include "header.h" - -#pragma link C++ all function -#pragma link C++ all class - -cusz_compressor* cusz_create(cusz_framework* framework, cusz_datatype const type); - -cusz_error_status cusz_release(cusz_compressor* comp); - -cusz_error_status cusz_compress( - cusz_compressor* comp, - cusz_config* config, - void* uncompressed, - cusz_len const uncomp_len, - uint8_t** compressed, - size_t* comp_bytes, - cusz_header* header, - void* record, - cudaStream_t stream); - -cusz_error_status cusz_decompress( - cusz_compressor* comp, - cusz_header* header, - uint8_t* compressed, - size_t const comp_len, - void* decompressed, - cusz_len const decomp_len, - void* record, - cudaStream_t stream); - -#endif - -//#ifdef __cplusplus -} -//#endif +/** + * @file cusz.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-29 + * + * (C) 2022 by Washington State 
University, Argonne National Laboratory + * + */ + +#include +//#define __cplusplus +//#ifdef __cplusplus +extern "C" { +//#endif + +#ifndef CUSZ_H +#define CUSZ_H + +#include + +#include "cusz/custom.h" +#include "cusz/record.h" +#include "cusz/type.h" +#include "header.h" + +#pragma link C++ all function +#pragma link C++ all class + +cusz_compressor* cusz_create(cusz_framework* framework, cusz_datatype const type); + +cusz_error_status cusz_release(cusz_compressor* comp); + +cusz_error_status cusz_compress( + cusz_compressor* comp, + cusz_config* config, + void* uncompressed, + cusz_len const uncomp_len, + uint8_t** compressed, + size_t* comp_bytes, + cusz_header* header, + void* record, + cudaStream_t stream); + +cusz_error_status cusz_decompress( + cusz_compressor* comp, + cusz_header* header, + uint8_t* compressed, + size_t const comp_len, + void* decompressed, + cusz_len const decomp_len, + void* record, + cudaStream_t stream); + +#endif + +//#ifdef __cplusplus +} +//#endif diff --git a/qtensor/compression/cusz/include/cusz/custom.h b/qtensor/compression/cusz/include/cusz/custom.h index c44682be..2ab7706d 100644 --- a/qtensor/compression/cusz/include/cusz/custom.h +++ b/qtensor/compression/cusz/include/cusz/custom.h @@ -1,26 +1,26 @@ -/** - * @file compress.h - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-04-30 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#ifdef __cplusplus -extern "C" { -#endif - -#include "type.h" - -cusz_custom_predictor cusz_default_predictor(); -cusz_custom_codec cusz_default_codec(); -cusz_custom_huffman_codec cusz_default_huffman_codec(); -cusz_custom_spcodec cusz_default_spcodec(); -cusz_custom_framework* cusz_default_framework(); - -#ifdef __cplusplus -} -#endif +/** + * @file compress.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-30 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "type.h" + +cusz_custom_predictor cusz_default_predictor(); +cusz_custom_codec cusz_default_codec(); +cusz_custom_huffman_codec cusz_default_huffman_codec(); +cusz_custom_spcodec cusz_default_spcodec(); +cusz_custom_framework* cusz_default_framework(); + +#ifdef __cplusplus +} +#endif diff --git a/qtensor/compression/cusz/include/cusz/it.hh b/qtensor/compression/cusz/include/cusz/it.hh index 1e8daa34..5334acde 100644 --- a/qtensor/compression/cusz/include/cusz/it.hh +++ b/qtensor/compression/cusz/include/cusz/it.hh @@ -1,78 +1,78 @@ -/** - * @file it.hh - * @author Jiannan Tian - * @brief - * @version 0.4 - * @date 2023-03-13 - * - * (C) 2023 by Indiana University, Argonne National Laboratory - * - */ - -#include -#include -#include -#include -#include - -template -struct psz_buf { - private: - T* _buf; - size_t _len{1}; - static const int stridey{BLOCK}; - static const int stridez{BLOCK * BLOCK}; - - public: - psz_buf(bool do_memset = true) - { - if (DIM == 1) _len = BLOCK; - if (DIM == 2) _len = BLOCK * BLOCK; - if (DIM == 3) _len = BLOCK * BLOCK * BLOCK; - _buf = new T[_len]; - if (do_memset) memset(_buf, 0x0, sizeof(T) * _len); - } - - ~psz_buf() { delete[] _buf; } - - T*& buf() { return _buf; } - - T& operator()(int x) { return _buf[x]; } - T& operator()(int x, int y) { return _buf[x + y * stridey]; } - T& operator()(int x, int y, int z) { return _buf[x + y * stridey + z * stridez]; } -}; - -template -struct psz_outlier_serial { - private: - T* _data; - IDX* _idx; - uint32_t _count{0}; - 
uint32_t _cap; - - public: - psz_outlier_serial(size_t cap) : _cap(cap) - { - _data = new T[cap + 1]; - _idx = new IDX[cap + 1]; - memset(_data, 0x0, sizeof(T) * cap); - } - - ~psz_outlier_serial() - { - delete[] _data; - delete[] _idx; - } - - T*& val() { return _data; } - IDX*& idx() { return _idx; } - uint32_t const count() { return _count; } - - void record(T data, IDX idx) - { - if (_count > _cap) throw std::runtime_error("Outlier overflows."); - _data[_count] = data; - _idx[_count] = idx; - ++_count; - } +/** + * @file it.hh + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-03-13 + * + * (C) 2023 by Indiana University, Argonne National Laboratory + * + */ + +#include +#include +#include +#include +#include + +template +struct psz_buf { + private: + T* _buf; + size_t _len{1}; + static const int stridey{BLOCK}; + static const int stridez{BLOCK * BLOCK}; + + public: + psz_buf(bool do_memset = true) + { + if (DIM == 1) _len = BLOCK; + if (DIM == 2) _len = BLOCK * BLOCK; + if (DIM == 3) _len = BLOCK * BLOCK * BLOCK; + _buf = new T[_len]; + if (do_memset) memset(_buf, 0x0, sizeof(T) * _len); + } + + ~psz_buf() { delete[] _buf; } + + T*& buf() { return _buf; } + + T& operator()(int x) { return _buf[x]; } + T& operator()(int x, int y) { return _buf[x + y * stridey]; } + T& operator()(int x, int y, int z) { return _buf[x + y * stridey + z * stridez]; } +}; + +template +struct psz_outlier_serial { + private: + T* _data; + IDX* _idx; + uint32_t _count{0}; + uint32_t _cap; + + public: + psz_outlier_serial(size_t cap) : _cap(cap) + { + _data = new T[cap + 1]; + _idx = new IDX[cap + 1]; + memset(_data, 0x0, sizeof(T) * cap); + } + + ~psz_outlier_serial() + { + delete[] _data; + delete[] _idx; + } + + T*& val() { return _data; } + IDX*& idx() { return _idx; } + uint32_t const count() { return _count; } + + void record(T data, IDX idx) + { + if (_count > _cap) throw std::runtime_error("Outlier overflows."); + _data[_count] = data; + _idx[_count] = idx; + ++_count; + } }; \ No newline at end of file diff --git a/qtensor/compression/cusz/include/cusz/nd.h b/qtensor/compression/cusz/include/cusz/nd.h index 007dfd7d..2c4443bc 100644 --- a/qtensor/compression/cusz/include/cusz/nd.h +++ b/qtensor/compression/cusz/include/cusz/nd.h @@ -1,15 +1,15 @@ - -#ifdef __cplusplus -extern "C" { -#endif - -#include -#include - -typedef struct psz_dim3 { - uint32_t x, y, z; -} psz_dim3; - -#ifdef __cplusplus -} + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +typedef struct psz_dim3 { + uint32_t x, y, z; +} psz_dim3; + +#ifdef __cplusplus +} #endif \ No newline at end of file diff --git a/qtensor/compression/cusz/include/cusz/pn.hh b/qtensor/compression/cusz/include/cusz/pn.hh index 1c1bb472..9c0f78bf 100644 --- a/qtensor/compression/cusz/include/cusz/pn.hh +++ b/qtensor/compression/cusz/include/cusz/pn.hh @@ -1,49 +1,49 @@ -/** - * @file pn.hh - * @author Jiannan Tian - * @brief - * @version 0.4 - * @date 2023-01-05 - * - * (C) 2023 by Indiana University, Argonne National Laboratory - * - */ - -#include -#include - -// TODO typing should be more applicable - -namespace psz { -namespace typing { - -// clang-format off -template struct Int; -template <> struct Int<1> { typedef int8_t T; }; -template <> struct Int<2> { typedef int16_t T; }; -template <> struct Int<4> { typedef int32_t T; }; -template <> struct Int<8> { typedef int64_t T; }; - -template struct UInt; -template <> struct UInt<1> { typedef uint8_t T; }; -template <> struct UInt<2> { typedef uint16_t T; }; -template <> 
struct UInt<4> { typedef uint32_t T; }; -template <> struct UInt<8> { typedef uint64_t T; }; -// clang-format on - -} // namespace typing -} // namespace psz - -// TODO forward definition in another file -template -struct PN { - using UI = typename psz::typing::UInt::T; - using I = typename psz::typing::Int::T; - - // reference: https://lemire.me/blog/2022/11/25/making-all-your-integers-positive-with-zigzag-encoding/ - - static UI encode(I* x) { return (2 * (*x)) ^ ((*x) >> (BYTEWIDTH * 8 - 1)); } - static UI encode(I x) { return (2 * x) ^ (x >> (BYTEWIDTH * 8 - 1)); } - static I decode(UI* x) { return ((*x) >> 1) ^ (-((*x) & 1)); } - static I decode(UI x) { return (x >> 1) ^ (-(x & 1)); } -}; +/** + * @file pn.hh + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-01-05 + * + * (C) 2023 by Indiana University, Argonne National Laboratory + * + */ + +#include +#include + +// TODO typing should be more applicable + +namespace psz { +namespace typing { + +// clang-format off +template struct Int; +template <> struct Int<1> { typedef int8_t T; }; +template <> struct Int<2> { typedef int16_t T; }; +template <> struct Int<4> { typedef int32_t T; }; +template <> struct Int<8> { typedef int64_t T; }; + +template struct UInt; +template <> struct UInt<1> { typedef uint8_t T; }; +template <> struct UInt<2> { typedef uint16_t T; }; +template <> struct UInt<4> { typedef uint32_t T; }; +template <> struct UInt<8> { typedef uint64_t T; }; +// clang-format on + +} // namespace typing +} // namespace psz + +// TODO forward definition in another file +template +struct PN { + using UI = typename psz::typing::UInt::T; + using I = typename psz::typing::Int::T; + + // reference: https://lemire.me/blog/2022/11/25/making-all-your-integers-positive-with-zigzag-encoding/ + + static UI encode(I* x) { return (2 * (*x)) ^ ((*x) >> (BYTEWIDTH * 8 - 1)); } + static UI encode(I x) { return (2 * x) ^ (x >> (BYTEWIDTH * 8 - 1)); } + static I decode(UI* x) { return ((*x) >> 1) ^ (-((*x) & 1)); } + static I decode(UI x) { return (x >> 1) ^ (-(x & 1)); } +}; diff --git a/qtensor/compression/cusz/include/cusz/record.h b/qtensor/compression/cusz/include/cusz/record.h index d285f1b1..3c9be515 100644 --- a/qtensor/compression/cusz/include/cusz/record.h +++ b/qtensor/compression/cusz/include/cusz/record.h @@ -1,38 +1,38 @@ -/** - * @file record.h - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-04-30 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef CUSZ_RECORD_H -#define CUSZ_RECORD_H - -#ifdef __cplusplus -extern "C" { -#endif - -struct cusz_record_entry; - -struct cusz_record_entry { - const char* name; - double time; - - struct cusz_record_entry* next; -}; - -typedef struct cusz_record { - int n; - - struct cusz_record_entry* head; -} cusz_record; - -#ifdef __cplusplus -} -#endif - -#endif +/** + * @file record.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-30 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_RECORD_H +#define CUSZ_RECORD_H + +#ifdef __cplusplus +extern "C" { +#endif + +struct cusz_record_entry; + +struct cusz_record_entry { + const char* name; + double time; + + struct cusz_record_entry* next; +}; + +typedef struct cusz_record { + int n; + + struct cusz_record_entry* head; +} cusz_record; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/qtensor/compression/cusz/include/cusz/type.h b/qtensor/compression/cusz/include/cusz/type.h index 
b5f2d750..73e66086 100644 --- a/qtensor/compression/cusz/include/cusz/type.h +++ b/qtensor/compression/cusz/include/cusz/type.h @@ -1,219 +1,219 @@ -/** - * @file type.h - * @author Jiannan Tian - * @brief C-complient type definitions; no methods in this header. - * @version 0.3 - * @date 2022-04-29 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#ifdef __cplusplus -extern "C" { -#endif - -#ifndef CUSZ_TYPE_H -#define CUSZ_TYPE_H - -#include "stddef.h" - -enum cusz_execution_policy { CPU, CUDA }; -typedef enum cusz_execution_policy cusz_execution_policy; -typedef enum cusz_execution_policy cusz_policy; -typedef enum cusz_execution_policy asz_policy; - -//////// state enumeration - -typedef enum cusz_error_status { // - CUSZ_SUCCESS = 0x00, - CUSZ_FAIL_ONDISK_FILE_ERROR = 0x01, - CUSZ_FAIL_DATA_NOT_READY = 0x02, - // specify error when calling CUDA API - CUSZ_FAIL_GPU_MALLOC, - CUSZ_FAIL_GPU_MEMCPY, - CUSZ_FAIL_GPU_ILLEGAL_ACCESS, - // specify error related to our own memory manager - CUSZ_FAIL_GPU_OUT_OF_MEMORY, - // when compression is useless - CUSZ_FAIL_INCOMPRESSIABLE, - // TODO component related error - CUSZ_FAIL_UNSUPPORTED_DATATYPE, - CUSZ_FAIL_UNSUPPORTED_QUANTTYPE, - CUSZ_FAIL_UNSUPPORTED_PRECISION, - CUSZ_FAIL_UNSUPPORTED_PIPELINE, - // not-implemented error - CUSZ_NOT_IMPLEMENTED = 0x0100, -} cusz_error_status; - -typedef struct cusz_fixedlen_internal { /* all nullable */ - void* encoding; -} cusz_fixedlen_internal; -typedef struct cusz_varlen_internal { /* all nullable */ - void* huffman; - void* outlier; -} cusz_varlen_internal; - -typedef enum cusz_datatype // -{ FP32 = 0, - FP64 = 1, - UINT8 = 10, - UINT16 = 11, - UINT32 = 12, - UINT64 = 13 } cusz_datatype; - -typedef enum cusz_executiontype // -{ Device = 0, - Host = 1, - None = 2 } cusz_executiontype; - -typedef enum cusz_mode // -{ Abs = 0, - Rel = 1 } cusz_mode; - -typedef enum cusz_pipelinetype // -{ Auto = 0, - Dense = 1, - Sparse = 2 } cusz_pipelinetype; - -typedef enum cusz_predictortype // -{ Lorenzo0 = 0, - LorenzoI = 1, - LorenzoII = 2, - Spline3 = 3 } cusz_predictortype; - -typedef enum cusz_preprocessingtype // -{ FP64toFP32 = 0, - LogTransform, - ShiftedLogTransform, - Binning2x2, - Binning2x1, - Binning1x2, -} cusz_preprocessingtype; - -typedef enum cusz_codectype // -{ Huffman = 0, - RunLength, - NvcompCascade, - NvcompLz4, - NvcompSnappy, -} cusz_codectype; - -typedef enum cusz_spcodectype // -{ SparseMat = 0, - SparseVec = 1 } cusz_spcodectype; - -typedef enum cusz_huffman_booktype // -{ Tree = 0, - Canonical = 1 } cusz_huffman_booktype; - -typedef enum cusz_huffman_codingtype // -{ Coarse = 0, - Fine = 1 } cusz_huffman_codingtype; - -//////// configuration template -typedef struct cusz_custom_len { - // clang-format off - union { size_t x0, x; }; - union { size_t x1, y; }; - union { size_t x2, z; }; - union { size_t x3, w; }; - // double factor; - // clang-format on -} cusz_custom_len; -typedef cusz_custom_len cusz_len; - -typedef struct cusz_custom_preprocessing { - cusz_custom_len before; - cusz_custom_len after; - cusz_preprocessingtype* list; - int nstep; - -} cusz_custom_preprocessing; - -typedef struct cusz_custom_predictor { - cusz_predictortype type; - - bool anchor; - bool nondestructive; -} cusz_custom_predictor; - -typedef struct cusz_custom_quantization { - int radius; - bool delayed; -} cusz_custom_quantization; - -typedef struct cusz_custom_codec { - cusz_codectype type; - - bool variable_length; - float presumed_density; -} cusz_custom_codec; - 
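The PN<> helper in pn.hh above is zigzag coding: it folds signed values onto small unsigned codes (0, -1, 1, -2, 2, ... become 0, 1, 2, 3, 4, ...) so that downstream entropy coding sees a compact alphabet. A self-contained round-trip check of the same mapping, written with standalone names rather than the patch's template, and assuming arithmetic right shift of signed values as the original does:

    #include <cassert>
    #include <cstdint>

    static uint32_t zz_encode(int32_t x) { return (2 * x) ^ (x >> 31); }
    static int32_t  zz_decode(uint32_t u) { return (u >> 1) ^ (-(u & 1)); }

    int main() {
        // small magnitudes land on small codes
        assert(zz_encode(0) == 0 && zz_encode(-1) == 1 && zz_encode(1) == 2);
        assert(zz_encode(-2) == 3 && zz_encode(2) == 4);
        // encode/decode are inverses over a modest range
        for (int32_t v = -1000; v <= 1000; ++v) assert(zz_decode(zz_encode(v)) == v);
        return 0;
    }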
-typedef struct cusz_custom_huffman_codec { - cusz_huffman_booktype book; - cusz_executiontype book_policy; - cusz_huffman_codingtype coding; - - int booklen; - int coarse_pardeg; -} cusz_custom_huffman_codec; - -typedef struct cusz_custom_spcodec { - cusz_spcodectype type; - float presumed_density; -} cusz_custom_spcodec; - -////// wrap-up - -/** - * @deprecated The framework could be simplifed & unified. - */ -typedef struct cusz_custom_framework { - cusz_datatype datatype; - cusz_pipelinetype pipeline; - - cusz_custom_predictor predictor; - cusz_custom_quantization quantization; - cusz_custom_codec codec; - // cusz_custom_spcodec spcodec; - - cusz_custom_huffman_codec huffman; -} cusz_custom_framework; - -typedef cusz_custom_framework cusz_framework; - -typedef struct cusz_compressor_redundancy_compat_purpose { - void* compressor; - cusz_framework* framework; - cusz_datatype type; -} cusz_compressor_compat; - -typedef cusz_compressor_compat cusz_compressor; - -typedef struct cusz_runtime_config { - double eb; - cusz_mode mode; -} cusz_runtime_config; -typedef cusz_runtime_config cusz_config; - -typedef struct Res { - double min, max, rng, std; -} Res; - -typedef struct cusz_stats { - // clang-format off - Res odata, xdata; - struct { double PSNR, MSE, NRMSE, coeff; } reduced; - struct { double abs, rel, pwrrel; size_t idx; } max_err; - struct { double lag_one, lag_two; } autocor; - double user_eb; - size_t len; - // clang-format on -} cusz_stats; - -#endif - -#ifdef __cplusplus -} -#endif +/** + * @file type.h + * @author Jiannan Tian + * @brief C-complient type definitions; no methods in this header. + * @version 0.3 + * @date 2022-04-29 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef CUSZ_TYPE_H +#define CUSZ_TYPE_H + +#include "stddef.h" + +enum cusz_execution_policy { CPU, CUDA }; +typedef enum cusz_execution_policy cusz_execution_policy; +typedef enum cusz_execution_policy cusz_policy; +typedef enum cusz_execution_policy asz_policy; + +//////// state enumeration + +typedef enum cusz_error_status { // + CUSZ_SUCCESS = 0x00, + CUSZ_FAIL_ONDISK_FILE_ERROR = 0x01, + CUSZ_FAIL_DATA_NOT_READY = 0x02, + // specify error when calling CUDA API + CUSZ_FAIL_GPU_MALLOC, + CUSZ_FAIL_GPU_MEMCPY, + CUSZ_FAIL_GPU_ILLEGAL_ACCESS, + // specify error related to our own memory manager + CUSZ_FAIL_GPU_OUT_OF_MEMORY, + // when compression is useless + CUSZ_FAIL_INCOMPRESSIABLE, + // TODO component related error + CUSZ_FAIL_UNSUPPORTED_DATATYPE, + CUSZ_FAIL_UNSUPPORTED_QUANTTYPE, + CUSZ_FAIL_UNSUPPORTED_PRECISION, + CUSZ_FAIL_UNSUPPORTED_PIPELINE, + // not-implemented error + CUSZ_NOT_IMPLEMENTED = 0x0100, +} cusz_error_status; + +typedef struct cusz_fixedlen_internal { /* all nullable */ + void* encoding; +} cusz_fixedlen_internal; +typedef struct cusz_varlen_internal { /* all nullable */ + void* huffman; + void* outlier; +} cusz_varlen_internal; + +typedef enum cusz_datatype // +{ FP32 = 0, + FP64 = 1, + UINT8 = 10, + UINT16 = 11, + UINT32 = 12, + UINT64 = 13 } cusz_datatype; + +typedef enum cusz_executiontype // +{ Device = 0, + Host = 1, + None = 2 } cusz_executiontype; + +typedef enum cusz_mode // +{ Abs = 0, + Rel = 1 } cusz_mode; + +typedef enum cusz_pipelinetype // +{ Auto = 0, + Dense = 1, + Sparse = 2 } cusz_pipelinetype; + +typedef enum cusz_predictortype // +{ Lorenzo0 = 0, + LorenzoI = 1, + LorenzoII = 2, + Spline3 = 3 } cusz_predictortype; + +typedef enum cusz_preprocessingtype // +{ FP64toFP32 = 0, 
+ LogTransform, + ShiftedLogTransform, + Binning2x2, + Binning2x1, + Binning1x2, +} cusz_preprocessingtype; + +typedef enum cusz_codectype // +{ Huffman = 0, + RunLength, + NvcompCascade, + NvcompLz4, + NvcompSnappy, +} cusz_codectype; + +typedef enum cusz_spcodectype // +{ SparseMat = 0, + SparseVec = 1 } cusz_spcodectype; + +typedef enum cusz_huffman_booktype // +{ Tree = 0, + Canonical = 1 } cusz_huffman_booktype; + +typedef enum cusz_huffman_codingtype // +{ Coarse = 0, + Fine = 1 } cusz_huffman_codingtype; + +//////// configuration template +typedef struct cusz_custom_len { + // clang-format off + union { size_t x0, x; }; + union { size_t x1, y; }; + union { size_t x2, z; }; + union { size_t x3, w; }; + // double factor; + // clang-format on +} cusz_custom_len; +typedef cusz_custom_len cusz_len; + +typedef struct cusz_custom_preprocessing { + cusz_custom_len before; + cusz_custom_len after; + cusz_preprocessingtype* list; + int nstep; + +} cusz_custom_preprocessing; + +typedef struct cusz_custom_predictor { + cusz_predictortype type; + + bool anchor; + bool nondestructive; +} cusz_custom_predictor; + +typedef struct cusz_custom_quantization { + int radius; + bool delayed; +} cusz_custom_quantization; + +typedef struct cusz_custom_codec { + cusz_codectype type; + + bool variable_length; + float presumed_density; +} cusz_custom_codec; + +typedef struct cusz_custom_huffman_codec { + cusz_huffman_booktype book; + cusz_executiontype book_policy; + cusz_huffman_codingtype coding; + + int booklen; + int coarse_pardeg; +} cusz_custom_huffman_codec; + +typedef struct cusz_custom_spcodec { + cusz_spcodectype type; + float presumed_density; +} cusz_custom_spcodec; + +////// wrap-up + +/** + * @deprecated The framework could be simplifed & unified. + */ +typedef struct cusz_custom_framework { + cusz_datatype datatype; + cusz_pipelinetype pipeline; + + cusz_custom_predictor predictor; + cusz_custom_quantization quantization; + cusz_custom_codec codec; + // cusz_custom_spcodec spcodec; + + cusz_custom_huffman_codec huffman; +} cusz_custom_framework; + +typedef cusz_custom_framework cusz_framework; + +typedef struct cusz_compressor_redundancy_compat_purpose { + void* compressor; + cusz_framework* framework; + cusz_datatype type; +} cusz_compressor_compat; + +typedef cusz_compressor_compat cusz_compressor; + +typedef struct cusz_runtime_config { + double eb; + cusz_mode mode; +} cusz_runtime_config; +typedef cusz_runtime_config cusz_config; + +typedef struct Res { + double min, max, rng, std; +} Res; + +typedef struct cusz_stats { + // clang-format off + Res odata, xdata; + struct { double PSNR, MSE, NRMSE, coeff; } reduced; + struct { double abs, rel, pwrrel; size_t idx; } max_err; + struct { double lag_one, lag_two; } autocor; + double user_eb; + size_t len; + // clang-format on +} cusz_stats; + +#endif + +#ifdef __cplusplus +} +#endif diff --git a/qtensor/compression/cusz/include/framework.hh b/qtensor/compression/cusz/include/framework.hh index b0e99960..9655fe25 100644 --- a/qtensor/compression/cusz/include/framework.hh +++ b/qtensor/compression/cusz/include/framework.hh @@ -1,62 +1,62 @@ -/** - * @file framework.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-04-23 - * (create) 2021-10-06 (rev) 2022-04-23 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef CUSZ_FRAMEWORK -#define CUSZ_FRAMEWORK - -#include "component.hh" -#include "compressor.hh" - -namespace cusz { - -template -struct Framework { - public: - /** - * - * 
Predictor - * | | ^ - * v | | - * Spcodec | +---- default "fast-lowlowprecision" - * v - * Encoder - */ - - using DATA = InputDataType; - using ERRCTRL = ErrCtrlTrait<4, false>::type; // predefined for mem. overlapping - using FP = typename FastLowPrecisionTrait::type; - using Huff4 = HuffTrait<4>::type; - using Huff8 = HuffTrait<8>::type; - using Meta4 = MetadataTrait<4>::type; - - template - struct CompressorTemplate; - - /* Predictor */ - using CompatPurposePredictor = typename cusz::PredictionUnified; - using Predictor = CompatPurposePredictor; - - using CompatPurposeSpcodec = typename cusz::SpcodecVec; - using Spcodec = CompatPurposeSpcodec; - - /* Lossless Codec*/ - using CodecHuffman32 = cusz::LosslessCodec; - using CodecHuffman64 = cusz::LosslessCodec; - using Codec = CodecHuffman32; - using FallbackCodec = CodecHuffman64; -}; - -using CompressorFP32 = cusz::Compressor>; - -} // namespace cusz - -#endif +/** + * @file framework.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-23 + * (create) 2021-10-06 (rev) 2022-04-23 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_FRAMEWORK +#define CUSZ_FRAMEWORK + +#include "component.hh" +#include "compressor.hh" + +namespace cusz { + +template +struct Framework { + public: + /** + * + * Predictor + * | | ^ + * v | | + * Spcodec | +---- default "fast-lowlowprecision" + * v + * Encoder + */ + + using DATA = InputDataType; + using ERRCTRL = ErrCtrlTrait<4, false>::type; // predefined for mem. overlapping + using FP = typename FastLowPrecisionTrait::type; + using Huff4 = HuffTrait<4>::type; + using Huff8 = HuffTrait<8>::type; + using Meta4 = MetadataTrait<4>::type; + + template + struct CompressorTemplate; + + /* Predictor */ + using CompatPurposePredictor = typename cusz::PredictionUnified; + using Predictor = CompatPurposePredictor; + + using CompatPurposeSpcodec = typename cusz::SpcodecVec; + using Spcodec = CompatPurposeSpcodec; + + /* Lossless Codec*/ + using CodecHuffman32 = cusz::LosslessCodec; + using CodecHuffman64 = cusz::LosslessCodec; + using Codec = CodecHuffman32; + using FallbackCodec = CodecHuffman64; +}; + +using CompressorFP32 = cusz::Compressor>; + +} // namespace cusz + +#endif diff --git a/qtensor/compression/cusz/include/header.h b/qtensor/compression/cusz/include/header.h index c0fd67d8..05287edc 100644 --- a/qtensor/compression/cusz/include/header.h +++ b/qtensor/compression/cusz/include/header.h @@ -1,111 +1,111 @@ -#ifndef CUSZ_HEADER_H -#define CUSZ_HEADER_H - -/** - * @file header.h - * @author Jiannan Tian - * @brief - * @version 0.2 - * @date 2021-01-22 - * (created) 2020-09-25, (rev.1) 2021-01-22 (rev.2) 2021-09-08 (rev.3) 2022-02-26 - * - * @copyright (C) 2020 by Washington State University, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#ifdef __cplusplus -extern "C" { -#endif - -#include -#include -#include - -typedef struct alignas(128) cusz_header { - static const int HEADER = 0; - static const int ANCHOR = 1; - static const int VLE = 2; - static const int SPFMT = 3; - - static const int END = 4; - - uint32_t self_bytes : 16; - uint32_t fp : 1; - uint32_t byte_vle : 4; // 4, 8 - uint32_t nz_density_factor : 8; - uint32_t codecs_in_use : 2; - uint32_t vle_pardeg; - uint32_t x, y, z, w; - double eb; - uint32_t radius : 16; - - uint32_t entry[END + 1]; - - // uint32_t byte_uncompressed : 4; // T; 1, 2, 4, 8 - // uint32_t byte_errctrl : 3; // 1, 2, 4 - // uint32_t byte_meta : 4; // 4, 8 - // uint32_t ndim 
: 3; // 1,2,3,4 - // size_t data_len; - // size_t errctrl_len; - -} cusz_header; - -typedef cusz_header cuszHEADER; - -typedef struct alignas(128) v2_cusz_header { - // data segments - static const int HEADER = 0; - static const int ANCHOR = 1; - static const int SP_IDX = 2; - static const int SP_VAL = 3; - static const int HF = 4; - static const int END = 5; - uint32_t entry[END + 1]; - - struct { - uint32_t precision : 1; - } data; - - uint32_t x, y, z, w; - - // struct { - // uint32_t codecs_in_use : 2; - double eb; - uint32_t radius : 16; - // } config; - - struct { - uint32_t factor : 8; // density = 1/factor - uint32_t count; - } sp; - - struct { - uint32_t rep_bytes : 4; // 4, 8 - uint32_t sublen : 28; - uint32_t pardeg; - } hf; - - // TODO replace the following with hf.VAR - uint32_t vle_pardeg; - -} psz_header; - -#ifdef __cplusplus -} -#endif - -namespace cusz { - -using Header = cusz_header; -using header_t = cusz_header*; - -} // namespace cusz - -namespace psz { - -using v2_header = v2_cusz_header; - -} - -#endif +#ifndef CUSZ_HEADER_H +#define CUSZ_HEADER_H + +/** + * @file header.h + * @author Jiannan Tian + * @brief + * @version 0.2 + * @date 2021-01-22 + * (created) 2020-09-25, (rev.1) 2021-01-22 (rev.2) 2021-09-08 (rev.3) 2022-02-26 + * + * @copyright (C) 2020 by Washington State University, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +typedef struct alignas(128) cusz_header { + static const int HEADER = 0; + static const int ANCHOR = 1; + static const int VLE = 2; + static const int SPFMT = 3; + + static const int END = 4; + + uint32_t self_bytes : 16; + uint32_t fp : 1; + uint32_t byte_vle : 4; // 4, 8 + uint32_t nz_density_factor : 8; + uint32_t codecs_in_use : 2; + uint32_t vle_pardeg; + uint32_t x, y, z, w; + double eb; + uint32_t radius : 16; + + uint32_t entry[END + 1]; + + // uint32_t byte_uncompressed : 4; // T; 1, 2, 4, 8 + // uint32_t byte_errctrl : 3; // 1, 2, 4 + // uint32_t byte_meta : 4; // 4, 8 + // uint32_t ndim : 3; // 1,2,3,4 + // size_t data_len; + // size_t errctrl_len; + +} cusz_header; + +typedef cusz_header cuszHEADER; + +typedef struct alignas(128) v2_cusz_header { + // data segments + static const int HEADER = 0; + static const int ANCHOR = 1; + static const int SP_IDX = 2; + static const int SP_VAL = 3; + static const int HF = 4; + static const int END = 5; + uint32_t entry[END + 1]; + + struct { + uint32_t precision : 1; + } data; + + uint32_t x, y, z, w; + + // struct { + // uint32_t codecs_in_use : 2; + double eb; + uint32_t radius : 16; + // } config; + + struct { + uint32_t factor : 8; // density = 1/factor + uint32_t count; + } sp; + + struct { + uint32_t rep_bytes : 4; // 4, 8 + uint32_t sublen : 28; + uint32_t pardeg; + } hf; + + // TODO replace the following with hf.VAR + uint32_t vle_pardeg; + +} psz_header; + +#ifdef __cplusplus +} +#endif + +namespace cusz { + +using Header = cusz_header; +using header_t = cusz_header*; + +} // namespace cusz + +namespace psz { + +using v2_header = v2_cusz_header; + +} + +#endif diff --git a/qtensor/compression/cusz/include/hf/hf.hh b/qtensor/compression/cusz/include/hf/hf.hh index 692d0ea0..37438abb 100644 --- a/qtensor/compression/cusz/include/hf/hf.hh +++ b/qtensor/compression/cusz/include/hf/hf.hh @@ -1,170 +1,170 @@ -/** - * @file codec.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-04-23 - * - * (C) 2022 by Washington State University, Argonne National Laboratory 
- * - */ - -#ifndef CUSZ_COMPONENT_CODECS_HH -#define CUSZ_COMPONENT_CODECS_HH - -#include -#include -#include - -#include "hf/hf_struct.h" - -#define DEFINE_ARRAY(VAR, TYPE) \ - TYPE* d_##VAR{nullptr}; \ - TYPE* h_##VAR{nullptr}; - -namespace cusz { - -template -class LosslessCodec -// : CodecInterface -{ - public: - using Origin = T; - using Encoded = H; - using MetadataT = M; - using FreqT = uint32_t; - using BYTE = uint8_t; - - private: - class impl; - std::unique_ptr pimpl; - - public: - ~LosslessCodec(); // dtor - LosslessCodec(); // ctor - LosslessCodec(const LosslessCodec&); // copy ctor - LosslessCodec& operator=(const LosslessCodec&); // copy assign - LosslessCodec(LosslessCodec&&); // move ctor - LosslessCodec& operator=(LosslessCodec&&); // move assign - - void init(size_t const, int const, int const, bool dbg_print = false); - void build_codebook(uint32_t*, int const, cudaStream_t = nullptr); - void encode(T*, size_t const, BYTE*&, size_t&, cudaStream_t = nullptr); - void decode(BYTE*, T*, cudaStream_t = nullptr, bool = true); - void clear_buffer(); - - float get_time_elapsed() const; - float get_time_book() const; - float get_time_lossless() const; -}; - -template -class LosslessCodec::impl { - public: - using Origin = T; - using Encoded = H; - using MetadataT = M; - using FreqT = uint32_t; - using BYTE = uint8_t; - - private: - using BOOK = H; - using SYM = T; - - // TODO shared header - struct alignas(128) Header { - static const int HEADER = 0; - static const int REVBOOK = 1; - static const int PAR_NBIT = 2; - static const int PAR_ENTRY = 3; - static const int BITSTREAM = 4; - static const int END = 5; - - int self_bytes : 16; - int booklen : 16; - int sublen; - int pardeg; - size_t uncompressed_len; - size_t total_nbit; - size_t total_ncell; // TODO change to uint32_t - MetadataT entry[END + 1]; - - MetadataT subfile_size() const { return entry[END]; } - }; - - struct runtime_encode_helper { - static const int TMP = 0; - static const int FREQ = 1; - static const int BOOK = 2; - static const int REVBOOK = 3; - static const int PAR_NBIT = 4; - static const int PAR_NCELL = 5; - static const int PAR_ENTRY = 6; - static const int BITSTREAM = 7; - static const int END = 8; - - uint32_t nbyte[END]; - }; - - using RTE = runtime_encode_helper; - using Header = struct Header; - - private: - // array - DEFINE_ARRAY(tmp, H); - DEFINE_ARRAY(compressed, BYTE); // alias in address - DEFINE_ARRAY(book, H); - DEFINE_ARRAY(revbook, BYTE); - - DEFINE_ARRAY(par_metadata, M); - DEFINE_ARRAY(par_nbit, M); - DEFINE_ARRAY(par_ncell, M); - DEFINE_ARRAY(par_entry, M); - - DEFINE_ARRAY(bitstream, H); - // helper - RTE rte; - // memory - static const int CELL_BITWIDTH = sizeof(H) * 8; - // timer - float milliseconds{0.0}; - float time_hist{0.0}, time_book{0.0}, time_lossless{0.0}; - - hf_book* book_desc; - hf_chunk* chunk_desc_d; - hf_chunk* chunk_desc_h; - hf_bitstream* bitstream_desc; - - public: - ~impl(); // dtor - impl(); // ctor - - // getter - float get_time_elapsed() const; - float get_time_book() const; - float get_time_lossless() const; - size_t get_workspace_nbyte(size_t) const; - size_t get_max_output_nbyte(size_t len) const; - static size_t get_revbook_nbyte(int); - // getter for internal array - H* expose_book() const; - BYTE* expose_revbook() const; - // compile-time - constexpr bool can_overlap_input_and_firstphase_encode(); - // public methods - void init(size_t const, int const, int const, bool dbg_print = false); - void build_codebook(uint32_t*, int const, cudaStream_t = nullptr); 
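A minimal sketch of driving the LosslessCodec facade above (its encode/decode members are declared just below), assuming the T/H/M parameter order implied by Origin/Encoded/MetadataT; the lengths, histogram, and device pointers are placeholders, not values from this patch:

    cusz::LosslessCodec<uint16_t, uint32_t, uint32_t> codec;
    uint8_t* d_encoded{nullptr};
    size_t   encoded_len{0};

    codec.init(len, booklen, pardeg);                 // reserve workspaces once
    codec.build_codebook(d_freq, booklen, stream);    // d_freq: histogram of the quant codes
    codec.encode(d_quant, len, d_encoded, encoded_len, stream);
    // ...
    codec.decode(d_encoded, d_quant_out, stream);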
- void encode(T*, size_t const, BYTE*&, size_t&, cudaStream_t = nullptr); - void decode(BYTE*, T*, cudaStream_t = nullptr, bool = true); - void clear_buffer(); - - private: - void subfile_collect(Header&, size_t const, int const, int const, int const, cudaStream_t stream = nullptr); - void dbg_println(const std::string, void*, int); -}; - -} // namespace cusz - -#undef DEFINE_ARRAY - -#endif +/** + * @file codec.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-23 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_COMPONENT_CODECS_HH +#define CUSZ_COMPONENT_CODECS_HH + +#include +#include +#include + +#include "hf/hf_struct.h" + +#define DEFINE_ARRAY(VAR, TYPE) \ + TYPE* d_##VAR{nullptr}; \ + TYPE* h_##VAR{nullptr}; + +namespace cusz { + +template +class LosslessCodec +// : CodecInterface +{ + public: + using Origin = T; + using Encoded = H; + using MetadataT = M; + using FreqT = uint32_t; + using BYTE = uint8_t; + + private: + class impl; + std::unique_ptr pimpl; + + public: + ~LosslessCodec(); // dtor + LosslessCodec(); // ctor + LosslessCodec(const LosslessCodec&); // copy ctor + LosslessCodec& operator=(const LosslessCodec&); // copy assign + LosslessCodec(LosslessCodec&&); // move ctor + LosslessCodec& operator=(LosslessCodec&&); // move assign + + void init(size_t const, int const, int const, bool dbg_print = false); + void build_codebook(uint32_t*, int const, cudaStream_t = nullptr); + void encode(T*, size_t const, BYTE*&, size_t&, cudaStream_t = nullptr); + void decode(BYTE*, T*, cudaStream_t = nullptr, bool = true); + void clear_buffer(); + + float get_time_elapsed() const; + float get_time_book() const; + float get_time_lossless() const; +}; + +template +class LosslessCodec::impl { + public: + using Origin = T; + using Encoded = H; + using MetadataT = M; + using FreqT = uint32_t; + using BYTE = uint8_t; + + private: + using BOOK = H; + using SYM = T; + + // TODO shared header + struct alignas(128) Header { + static const int HEADER = 0; + static const int REVBOOK = 1; + static const int PAR_NBIT = 2; + static const int PAR_ENTRY = 3; + static const int BITSTREAM = 4; + static const int END = 5; + + int self_bytes : 16; + int booklen : 16; + int sublen; + int pardeg; + size_t uncompressed_len; + size_t total_nbit; + size_t total_ncell; // TODO change to uint32_t + MetadataT entry[END + 1]; + + MetadataT subfile_size() const { return entry[END]; } + }; + + struct runtime_encode_helper { + static const int TMP = 0; + static const int FREQ = 1; + static const int BOOK = 2; + static const int REVBOOK = 3; + static const int PAR_NBIT = 4; + static const int PAR_NCELL = 5; + static const int PAR_ENTRY = 6; + static const int BITSTREAM = 7; + static const int END = 8; + + uint32_t nbyte[END]; + }; + + using RTE = runtime_encode_helper; + using Header = struct Header; + + private: + // array + DEFINE_ARRAY(tmp, H); + DEFINE_ARRAY(compressed, BYTE); // alias in address + DEFINE_ARRAY(book, H); + DEFINE_ARRAY(revbook, BYTE); + + DEFINE_ARRAY(par_metadata, M); + DEFINE_ARRAY(par_nbit, M); + DEFINE_ARRAY(par_ncell, M); + DEFINE_ARRAY(par_entry, M); + + DEFINE_ARRAY(bitstream, H); + // helper + RTE rte; + // memory + static const int CELL_BITWIDTH = sizeof(H) * 8; + // timer + float milliseconds{0.0}; + float time_hist{0.0}, time_book{0.0}, time_lossless{0.0}; + + hf_book* book_desc; + hf_chunk* chunk_desc_d; + hf_chunk* chunk_desc_h; + hf_bitstream* bitstream_desc; + + public: + ~impl(); // dtor + impl(); // ctor + + 
// getter + float get_time_elapsed() const; + float get_time_book() const; + float get_time_lossless() const; + size_t get_workspace_nbyte(size_t) const; + size_t get_max_output_nbyte(size_t len) const; + static size_t get_revbook_nbyte(int); + // getter for internal array + H* expose_book() const; + BYTE* expose_revbook() const; + // compile-time + constexpr bool can_overlap_input_and_firstphase_encode(); + // public methods + void init(size_t const, int const, int const, bool dbg_print = false); + void build_codebook(uint32_t*, int const, cudaStream_t = nullptr); + void encode(T*, size_t const, BYTE*&, size_t&, cudaStream_t = nullptr); + void decode(BYTE*, T*, cudaStream_t = nullptr, bool = true); + void clear_buffer(); + + private: + void subfile_collect(Header&, size_t const, int const, int const, int const, cudaStream_t stream = nullptr); + void dbg_println(const std::string, void*, int); +}; + +} // namespace cusz + +#undef DEFINE_ARRAY + +#endif diff --git a/qtensor/compression/cusz/include/hf/hf_bookg.hh b/qtensor/compression/cusz/include/hf/hf_bookg.hh index 3d406f0f..f6187164 100644 --- a/qtensor/compression/cusz/include/hf/hf_bookg.hh +++ b/qtensor/compression/cusz/include/hf/hf_bookg.hh @@ -1,45 +1,45 @@ -/** - * @file huffman_parbook.cuh - * @author Cody Rivera (cjrivera1@crimson.ua.edu) - * @brief Parallel Huffman Construction to generates canonical forward codebook (header). - * Based on [Ostadzadeh et al. 2007] (https://dblp.org/rec/conf/pdpta/OstadzadehEZMB07.bib) - * "A Two-phase Practical Parallel Algorithm for Construction of Huffman Codes". - * @version 0.1 - * @date 2020-09-20 - * Created on: 2020-06 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#ifndef PAR_HUFFMAN_H -#define PAR_HUFFMAN_H - -// Parallel huffman global memory and kernels -namespace asz { - -/** - * @brief get codebook and reverse codebook in parallel - * - * @tparam T input type - * @tparam H codebook type - * @param freq input device array; frequency - * @param codebook output device array; codebook for encoding - * @param dict_size dictionary size; len of freq or codebook - * @param reverse_codebook output device array; reverse codebook for decoding - * @param time_book the returned time - */ -template -void hf_buildbook_g( - uint32_t* freq, - int const booksize, - H* codebook, - uint8_t* reverse_codebook, - int const revbook_nbyte, - float* time_book, - cudaStream_t = nullptr); - -} // namespace asz - -#endif +/** + * @file huffman_parbook.cuh + * @author Cody Rivera (cjrivera1@crimson.ua.edu) + * @brief Parallel Huffman Construction to generates canonical forward codebook (header). + * Based on [Ostadzadeh et al. 2007] (https://dblp.org/rec/conf/pdpta/OstadzadehEZMB07.bib) + * "A Two-phase Practical Parallel Algorithm for Construction of Huffman Codes". 
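hf_buildbook_g above turns a device-side symbol histogram into a canonical forward codebook plus a serialized reverse codebook for the decoder, and reports the kernel time through time_book. A hedged call sketch, assuming the <T, H> order of the two documented template parameters and placeholder device buffers:

    float time_book = 0.f;
    asz::hf_buildbook_g<uint16_t, uint32_t>(
        d_freq, booklen,              // histogram over `booklen` symbols
        d_book,                       // out: one codeword per symbol
        d_revbook, revbook_nbyte,     // out: reverse codebook blob for decoding
        &time_book, stream);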
+ * @version 0.1 + * @date 2020-09-20 + * Created on: 2020-06 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef PAR_HUFFMAN_H +#define PAR_HUFFMAN_H + +// Parallel huffman global memory and kernels +namespace asz { + +/** + * @brief get codebook and reverse codebook in parallel + * + * @tparam T input type + * @tparam H codebook type + * @param freq input device array; frequency + * @param codebook output device array; codebook for encoding + * @param dict_size dictionary size; len of freq or codebook + * @param reverse_codebook output device array; reverse codebook for decoding + * @param time_book the returned time + */ +template +void hf_buildbook_g( + uint32_t* freq, + int const booksize, + H* codebook, + uint8_t* reverse_codebook, + int const revbook_nbyte, + float* time_book, + cudaStream_t = nullptr); + +} // namespace asz + +#endif diff --git a/qtensor/compression/cusz/include/hf/hf_codecg.hh b/qtensor/compression/cusz/include/hf/hf_codecg.hh index 10cb1570..faad837a 100644 --- a/qtensor/compression/cusz/include/hf/hf_codecg.hh +++ b/qtensor/compression/cusz/include/hf/hf_codecg.hh @@ -1,82 +1,82 @@ -/** - * @file launch_lossless.cuh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-06-13 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef ABAACE49_2C9E_4E3C_AEFF_B016276142E1 -#define ABAACE49_2C9E_4E3C_AEFF_B016276142E1 - -#include -#include - -#include "hf_struct.h" - -template -struct PackedWordByWidth; - -template <> -struct PackedWordByWidth<4> { - uint32_t word : 24; - uint32_t bits : 8; -}; - -template <> -struct PackedWordByWidth<8> { - uint64_t word : 56; - uint64_t bits : 8; -}; - -namespace asz { - -template -void hf_encode_coarse( - T* uncompressed, - H* d_internal_coded, - size_t const len, - uint32_t* d_freq, - H* d_book, - int const booklen, - H* d_bitstream, - M* d_par_metadata, - M* h_par_metadata, - int const sublen, - int const pardeg, - int numSMs, - uint8_t*& out_compressed, - size_t& out_compressed_len, - float& time_lossless, - cudaStream_t stream); - -template -void hf_encode_coarse_rev1( - T* uncompressed, - size_t const len, - hf_book* book_desc, - hf_bitstream* bitstream_desc, - uint8_t*& out_compressed, // 22-10-12 buggy - size_t& out_compressed_len, // 22-10-12 buggy - float& time_lossless, - cudaStream_t stream); - -template -void hf_decode_coarse( - H* d_bitstream, - uint8_t* d_revbook, - int const revbook_nbyte, - M* d_par_nbit, - M* d_par_entry, - int const sublen, - int const pardeg, - T* out_decompressed, - float& time_lossless, - cudaStream_t stream); - -} // namespace asz - -#endif /* ABAACE49_2C9E_4E3C_AEFF_B016276142E1 */ +/** + * @file launch_lossless.cuh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-06-13 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef ABAACE49_2C9E_4E3C_AEFF_B016276142E1 +#define ABAACE49_2C9E_4E3C_AEFF_B016276142E1 + +#include +#include + +#include "hf_struct.h" + +template +struct PackedWordByWidth; + +template <> +struct PackedWordByWidth<4> { + uint32_t word : 24; + uint32_t bits : 8; +}; + +template <> +struct PackedWordByWidth<8> { + uint64_t word : 56; + uint64_t bits : 8; +}; + +namespace asz { + +template +void hf_encode_coarse( + T* uncompressed, + H* d_internal_coded, + size_t const len, + uint32_t* d_freq, + H* d_book, + int const booklen, + H* d_bitstream, + 
M* d_par_metadata, + M* h_par_metadata, + int const sublen, + int const pardeg, + int numSMs, + uint8_t*& out_compressed, + size_t& out_compressed_len, + float& time_lossless, + cudaStream_t stream); + +template +void hf_encode_coarse_rev1( + T* uncompressed, + size_t const len, + hf_book* book_desc, + hf_bitstream* bitstream_desc, + uint8_t*& out_compressed, // 22-10-12 buggy + size_t& out_compressed_len, // 22-10-12 buggy + float& time_lossless, + cudaStream_t stream); + +template +void hf_decode_coarse( + H* d_bitstream, + uint8_t* d_revbook, + int const revbook_nbyte, + M* d_par_nbit, + M* d_par_entry, + int const sublen, + int const pardeg, + T* out_decompressed, + float& time_lossless, + cudaStream_t stream); + +} // namespace asz + +#endif /* ABAACE49_2C9E_4E3C_AEFF_B016276142E1 */ diff --git a/qtensor/compression/cusz/include/hf/hf_struct.h b/qtensor/compression/cusz/include/hf/hf_struct.h index c289a795..20ccf206 100644 --- a/qtensor/compression/cusz/include/hf/hf_struct.h +++ b/qtensor/compression/cusz/include/hf/hf_struct.h @@ -1,53 +1,53 @@ -/** - * @file hf_struct.h - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-09-14 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef DA6883A3_A70F_4690_A4FA_56644987725A -#define DA6883A3_A70F_4690_A4FA_56644987725A - -#ifdef __cplusplus -extern "C" { -#endif - -#include -#include - -// raw pointer array; regardless of being on host or device -typedef struct hf_book { - uint32_t* freq; - // undertermined on definition; could be uint32_t* and uint64_t* - void* book; - int booklen; -} hf_book; - -// typedef struct hf_revbook { -// } hf_revbook; - -typedef struct hf_chunk { - void* bits; // how many bits each chunk - void* cells; // how many cells each chunk - void* entries; // jump to the chunk -} hf_chunk; - -typedef struct hf_bitstream { - void* buffer; - void* bitstream; - hf_chunk* d_metadata; - hf_chunk* h_metadata; - int sublen; // data chunksize - int pardeg; // runtime paralleism degree - int numSMs; // number of streaming multiprocessor -} hf_bitstream; - -#ifdef __cplusplus -} -#endif - -#endif /* DA6883A3_A70F_4690_A4FA_56644987725A */ +/** + * @file hf_struct.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-09-14 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef DA6883A3_A70F_4690_A4FA_56644987725A +#define DA6883A3_A70F_4690_A4FA_56644987725A + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +// raw pointer array; regardless of being on host or device +typedef struct hf_book { + uint32_t* freq; + // undertermined on definition; could be uint32_t* and uint64_t* + void* book; + int booklen; +} hf_book; + +// typedef struct hf_revbook { +// } hf_revbook; + +typedef struct hf_chunk { + void* bits; // how many bits each chunk + void* cells; // how many cells each chunk + void* entries; // jump to the chunk +} hf_chunk; + +typedef struct hf_bitstream { + void* buffer; + void* bitstream; + hf_chunk* d_metadata; + hf_chunk* h_metadata; + int sublen; // data chunksize + int pardeg; // runtime paralleism degree + int numSMs; // number of streaming multiprocessor +} hf_bitstream; + +#ifdef __cplusplus +} +#endif + +#endif /* DA6883A3_A70F_4690_A4FA_56644987725A */ diff --git a/qtensor/compression/cusz/include/kernel/claunch_cuda.h b/qtensor/compression/cusz/include/kernel/claunch_cuda.h index f19943c1..f160b5a3 100644 --- a/qtensor/compression/cusz/include/kernel/claunch_cuda.h +++ 
b/qtensor/compression/cusz/include/kernel/claunch_cuda.h @@ -1,49 +1,49 @@ -/** - * @file claunch_cuda.h - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-07-24 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef KERNEL_CUDA_H -#define KERNEL_CUDA_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#include -#include - -#include "../cusz/type.h" -// #include "../hf/hf_struct.h" - -#define C_SPLINE3(Tliteral, Eliteral, FPliteral, T, E, FP) \ - cusz_error_status claunch_construct_Spline3_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ - bool NO_R_SEPARATE, T* data, dim3 const len3, T* anchor, dim3 const an_len3, E* errctrl, dim3 const ec_len3, \ - double const eb, int const radius, float* time_elapsed, cudaStream_t stream); \ - \ - cusz_error_status claunch_reconstruct_Spline3_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ - T* xdata, dim3 const len3, T* anchor, dim3 const an_len3, E* errctrl, dim3 const ec_len3, double const eb, \ - int const radius, float* time_elapsed, cudaStream_t stream); - -C_SPLINE3(fp32, ui8, fp32, float, uint8_t, float); -C_SPLINE3(fp32, ui16, fp32, float, uint16_t, float); -C_SPLINE3(fp32, ui32, fp32, float, uint32_t, float); -C_SPLINE3(fp32, fp32, fp32, float, float, float); - -#undef C_SPLINE3 - -#undef C_COARSE_HUFFMAN_DECODE - -#ifdef __cplusplus -} -#endif - -#endif +/** + * @file claunch_cuda.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-07-24 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef KERNEL_CUDA_H +#define KERNEL_CUDA_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +#include "../cusz/type.h" +// #include "../hf/hf_struct.h" + +#define C_SPLINE3(Tliteral, Eliteral, FPliteral, T, E, FP) \ + cusz_error_status claunch_construct_Spline3_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + bool NO_R_SEPARATE, T* data, dim3 const len3, T* anchor, dim3 const an_len3, E* errctrl, dim3 const ec_len3, \ + double const eb, int const radius, float* time_elapsed, cudaStream_t stream); \ + \ + cusz_error_status claunch_reconstruct_Spline3_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + T* xdata, dim3 const len3, T* anchor, dim3 const an_len3, E* errctrl, dim3 const ec_len3, double const eb, \ + int const radius, float* time_elapsed, cudaStream_t stream); + +C_SPLINE3(fp32, ui8, fp32, float, uint8_t, float); +C_SPLINE3(fp32, ui16, fp32, float, uint16_t, float); +C_SPLINE3(fp32, ui32, fp32, float, uint32_t, float); +C_SPLINE3(fp32, fp32, fp32, float, float, float); + +#undef C_SPLINE3 + +#undef C_COARSE_HUFFMAN_DECODE + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/qtensor/compression/cusz/include/kernel/cpplaunch_cuda.hh b/qtensor/compression/cusz/include/kernel/cpplaunch_cuda.hh index 5c8ee08d..7d35d59e 100644 --- a/qtensor/compression/cusz/include/kernel/cpplaunch_cuda.hh +++ b/qtensor/compression/cusz/include/kernel/cpplaunch_cuda.hh @@ -1,51 +1,51 @@ -/** - * @file cpplaunch_cuda.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-07-27 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef COMPONENT_CALL_KERNEL_HH -#define COMPONENT_CALL_KERNEL_HH - -#include "../cusz/type.h" -#include "../hf/hf_struct.h" - -namespace cusz { - -// 22-10-27 revise later -template -cusz_error_status cpplaunch_construct_Spline3( - bool NO_R_SEPARATE, - T* data, - dim3 const len3, - T* anchor, - dim3 const an_len3, - E* eq, - dim3 const ec_len3, - 
double const eb, - int const radius, - float* time_elapsed, - cudaStream_t stream); - -// 22-10-27 revise later -template -cusz_error_status cpplaunch_reconstruct_Spline3( - T* xdata, - dim3 const len3, - T* anchor, - dim3 const an_len3, - E* eq, - dim3 const ec_len3, - double const eb, - int const radius, - float* time_elapsed, - cudaStream_t stream); - -} // namespace cusz - -#endif +/** + * @file cpplaunch_cuda.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-07-27 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef COMPONENT_CALL_KERNEL_HH +#define COMPONENT_CALL_KERNEL_HH + +#include "../cusz/type.h" +#include "../hf/hf_struct.h" + +namespace cusz { + +// 22-10-27 revise later +template +cusz_error_status cpplaunch_construct_Spline3( + bool NO_R_SEPARATE, + T* data, + dim3 const len3, + T* anchor, + dim3 const an_len3, + E* eq, + dim3 const ec_len3, + double const eb, + int const radius, + float* time_elapsed, + cudaStream_t stream); + +// 22-10-27 revise later +template +cusz_error_status cpplaunch_reconstruct_Spline3( + T* xdata, + dim3 const len3, + T* anchor, + dim3 const an_len3, + E* eq, + dim3 const ec_len3, + double const eb, + int const radius, + float* time_elapsed, + cudaStream_t stream); + +} // namespace cusz + +#endif diff --git a/qtensor/compression/cusz/include/kernel/dryrun.cuh b/qtensor/compression/cusz/include/kernel/dryrun.cuh index e96b3b96..d32800c1 100644 --- a/qtensor/compression/cusz/include/kernel/dryrun.cuh +++ b/qtensor/compression/cusz/include/kernel/dryrun.cuh @@ -1,47 +1,47 @@ -/** - * @file dryrun.cuh - * @author Jiannan Tian - * @brief cuSZ dryrun mode, checking data quality from lossy compression. - * @version 0.3 - * @date 2020-09-20 - * (create) 2020-05-14, (release) 2020-09-20, (rev1) 2021-01-25, (rev2) 2021-06-21 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#ifndef CUSZ_KERNEL_DRYRUN_CUH -#define CUSZ_KERNEL_DRYRUN_CUH - -namespace cusz { - -template -// template -__global__ void dualquant_dryrun_kernel(Data* in_data, Data* out_xdata, size_t len, FP ebx2_r, FP ebx2) -{ - { - constexpr auto NTHREAD = BLOCK / SEQ; - __shared__ Data shmem[BLOCK]; - auto id_base = blockIdx.x * BLOCK; - -#pragma unroll - for (auto i = 0; i < SEQ; i++) { - auto id = id_base + threadIdx.x + i * NTHREAD; - if (id < len) { - shmem[threadIdx.x + i * NTHREAD] = round(in_data[id] * ebx2_r) * ebx2; - out_xdata[id] = shmem[threadIdx.x + i * NTHREAD]; - } - } - } - - // simplistic - // { - // auto id = blockIdx.x * blockDim.x + threadIdx.x; - // if (id < len) out_xdata[id] = round(in_data[id] * ebx2_r) * ebx2; - // } -} - -} // namespace cusz - +/** + * @file dryrun.cuh + * @author Jiannan Tian + * @brief cuSZ dryrun mode, checking data quality from lossy compression. 
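The dryrun kernel shown above amounts to a quantize-then-dequantize round trip: each value is snapped to the nearest multiple of the quantization step and written straight back, so the reconstruction error stays within the error bound while no encoding actually happens. The host-side reference below spells out that arithmetic; the ebx2 = 2*eb relationship is an assumption based on the parameter names (ebx2, ebx2_r), not something this header states.

    #include <cmath>
    #include <cstddef>

    void dryrun_reference(const float* in, float* out, std::size_t len, double eb)
    {
        const double ebx2   = eb * 2.0;   // assumed: quantization step = 2 * error bound
        const double ebx2_r = 1.0 / ebx2; // its reciprocal, as passed to the kernel
        for (std::size_t i = 0; i < len; ++i)
            out[i] = static_cast<float>(std::round(in[i] * ebx2_r) * ebx2);
        // example with eb = 1e-3: 0.12345 -> round(61.725) * 0.002 = 0.124, error 0.00055 <= eb
    }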
+ * @version 0.3 + * @date 2020-09-20 + * (create) 2020-05-14, (release) 2020-09-20, (rev1) 2021-01-25, (rev2) 2021-06-21 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef CUSZ_KERNEL_DRYRUN_CUH +#define CUSZ_KERNEL_DRYRUN_CUH + +namespace cusz { + +template +// template +__global__ void dualquant_dryrun_kernel(Data* in_data, Data* out_xdata, size_t len, FP ebx2_r, FP ebx2) +{ + { + constexpr auto NTHREAD = BLOCK / SEQ; + __shared__ Data shmem[BLOCK]; + auto id_base = blockIdx.x * BLOCK; + +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto id = id_base + threadIdx.x + i * NTHREAD; + if (id < len) { + shmem[threadIdx.x + i * NTHREAD] = round(in_data[id] * ebx2_r) * ebx2; + out_xdata[id] = shmem[threadIdx.x + i * NTHREAD]; + } + } + } + + // simplistic + // { + // auto id = blockIdx.x * blockDim.x + threadIdx.x; + // if (id < len) out_xdata[id] = round(in_data[id] * ebx2_r) * ebx2; + // } +} + +} // namespace cusz + #endif \ No newline at end of file diff --git a/qtensor/compression/cusz/include/kernel/launch_spm.cuh b/qtensor/compression/cusz/include/kernel/launch_spm.cuh index fe4cfaae..4f0bcdd9 100644 --- a/qtensor/compression/cusz/include/kernel/launch_spm.cuh +++ b/qtensor/compression/cusz/include/kernel/launch_spm.cuh @@ -1,348 +1,348 @@ -/** - * @file launch_sparse_method.cuh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-06-13 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef CUSZ_LAUNCH_SPARSE_METHOD_CUH -#define CUSZ_LAUNCH_SPARSE_METHOD_CUH - -#include -#include - -#include "../common.hh" -#include "../utils.hh" -#include "../utils/cusparse_err.cuh" - -// #if CUDART_VERSION >= 11020 - -template -void launch_cusparse_gather_cuda11200_onward( - cusparseHandle_t handle, - T* in_dense, - uint32_t const num_rows, - uint32_t const num_cols, - cusparseDnMatDescr_t dnmat, - cusparseSpMatDescr_t spmat, - void* d_buffer, - size_t& d_buffer_size, - M* d_rowptr, - M* d_colidx, - T* d_val, - int64_t& nnz, - float& milliseconds, - cudaStream_t stream) -{ - auto ld = num_rows; - - auto gather11_init_mat = [&]() { - // create dense matrix wrapper - CHECK_CUSPARSE( - cusparseCreateDnMat(&dnmat, num_rows, num_cols, ld, in_dense, cuszCUSPARSE::type, CUSPARSE_ORDER_ROW)); - - // create CSR wrapper - CHECK_CUSPARSE(cusparseCreateCsr( - &spmat, num_rows, num_cols, 0, d_rowptr, nullptr, nullptr, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, cuszCUSPARSE::type)); - }; - - auto gather11_init_buffer = [&]() { - { // allocate an external buffer if needed - cuda_timer_t t; - t.timer_start(stream); - - CHECK_CUSPARSE(cusparseDenseToSparse_bufferSize( - handle, dnmat, spmat, CUSPARSE_DENSETOSPARSE_ALG_DEFAULT, &d_buffer_size)); - - t.timer_end(stream); - milliseconds += t.get_time_elapsed(); - - CHECK_CUDA(cudaMalloc(&d_buffer, d_buffer_size)); - } - }; - - auto gather11_analysis = [&]() { - cuda_timer_t t; - t.timer_start(stream); - - CHECK_CUSPARSE( - cusparseDenseToSparse_analysis(handle, dnmat, spmat, CUSPARSE_DENSETOSPARSE_ALG_DEFAULT, d_buffer)); - - t.timer_end(stream); - milliseconds += t.get_time_elapsed(); - }; - - int64_t num_rows_tmp, num_cols_tmp; - - auto gather11_get_nnz = [&]() { - // get number of non-zero elements - CHECK_CUSPARSE(cusparseSpMatGetSize(spmat, &num_rows_tmp, &num_cols_tmp, &nnz)); - }; - - auto gather11_get_rowptr = [&]() { - // reset offsets, column indices, 
and values pointers - CHECK_CUSPARSE(cusparseCsrSetPointers(spmat, d_rowptr, d_colidx, d_val)); - }; - - auto gather11_dn2csr = [&]() { - cuda_timer_t t; - t.timer_start(stream); - - CHECK_CUSPARSE( - cusparseDenseToSparse_convert(handle, dnmat, spmat, CUSPARSE_DENSETOSPARSE_ALG_DEFAULT, d_buffer)); - - t.timer_end(stream); - milliseconds += t.get_time_elapsed(); - }; - - /********************************************************************************/ - milliseconds = 0; - - CHECK_CUSPARSE(cusparseCreate(&handle)); - if (stream) CHECK_CUSPARSE(cusparseSetStream(handle, stream)); // TODO move out - - gather11_init_mat(); - gather11_init_buffer(); - gather11_analysis(); - gather11_get_nnz(); - gather11_get_rowptr(); - gather11_dn2csr(); - - // destroy matrix/vector descriptors - CHECK_CUSPARSE(cusparseDestroyDnMat(dnmat)); - CHECK_CUSPARSE(cusparseDestroySpMat(spmat)); - CHECK_CUSPARSE(cusparseDestroy(handle)); -} - -// void SpcodecCSR::impl::scatter_CUDA_11020(BYTE* in_csr, T* out_dense, cudaStream_t stream, bool -// header_on_device) - -template -void launch_cusparse_scatter_cuda11200_onward( - cusparseHandle_t handle, - int* d_rowptr, - int* d_colidx, - T* d_val, - int const num_rows, - int const num_cols, - int const nnz, - cusparseDnMatDescr_t dnmat, - cusparseSpMatDescr_t spmat, - void* d_buffer, - size_t& d_buffer_size, - T* out_dense, - float& milliseconds, - cudaStream_t stream) -{ - auto ld = num_rows; - - auto scatter11_init_mat = [&]() { - CHECK_CUSPARSE(cusparseCreateCsr( - &spmat, num_rows, num_cols, nnz, d_rowptr, d_colidx, d_val, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, cuszCUSPARSE::type)); - - CHECK_CUSPARSE( - cusparseCreateDnMat(&dnmat, num_rows, num_cols, ld, out_dense, cuszCUSPARSE::type, CUSPARSE_ORDER_ROW)); - }; - - auto scatter11_init_buffer = [&]() { - cuda_timer_t t; - t.timer_start(stream); - - // allocate an external buffer if needed - CHECK_CUSPARSE( - cusparseSparseToDense_bufferSize(handle, spmat, dnmat, CUSPARSE_SPARSETODENSE_ALG_DEFAULT, &d_buffer_size)); - - t.timer_end(stream); - milliseconds += t.get_time_elapsed(); - - CHECK_CUDA(cudaMalloc(&d_buffer, d_buffer_size)); - }; - - auto scatter11_csr2dn = [&]() { - cuda_timer_t t; - t.timer_start(stream); - - CHECK_CUSPARSE(cusparseSparseToDense(handle, spmat, dnmat, CUSPARSE_SPARSETODENSE_ALG_DEFAULT, d_buffer)); - - t.timer_end(stream); - milliseconds += t.get_time_elapsed(); - }; - - /******************************************************************************/ - milliseconds = 0; - - CHECK_CUSPARSE(cusparseCreate(&handle)); - if (stream) CHECK_CUSPARSE(cusparseSetStream(handle, stream)); - - scatter11_init_mat(); - scatter11_init_buffer(); - scatter11_csr2dn(); - - // destroy matrix/vector descriptors - CHECK_CUSPARSE(cusparseDestroySpMat(spmat)); - CHECK_CUSPARSE(cusparseDestroyDnMat(dnmat)); - CHECK_CUSPARSE(cusparseDestroy(handle)); -} - -// #elif CUDART_VERSION >= 10000 - -template -void launch_cusparse_gather_before_cuda11200( - cusparseHandle_t handle, - T* in_dense, - uint32_t const num_rows, - uint32_t const num_cols, - cusparseMatDescr_t mat_desc, - void* d_work, - size_t& lwork_in_bytes, - M* d_rowptr, - M* d_colidx, - T* d_val, - int& nnz, // int is for compatibility; cuSPARSE of CUDA 11 changed data type - float& milliseconds, - cudaStream_t stream) -{ - auto ld = num_rows; - - float threshold{0}; - auto has_ext_stream{false}; - - /******************************************************************************/ - - auto gather10_init_and_probe = [&]() { - { // 
init - - CHECK_CUSPARSE(cusparseCreateMatDescr(&mat_desc)); // 4. create rte.mat_desc - CHECK_CUSPARSE(cusparseSetMatIndexBase(mat_desc, CUSPARSE_INDEX_BASE_ZERO)); // zero based - CHECK_CUSPARSE(cusparseSetMatType(mat_desc, CUSPARSE_MATRIX_TYPE_GENERAL)); // type - } - - { // probe - cuda_timer_t t; - t.timer_start(stream); - - CHECK_CUSPARSE(cusparseSpruneDense2csr_bufferSizeExt( - handle, num_rows, num_cols, in_dense, ld, &threshold, mat_desc, d_val, d_rowptr, d_colidx, - &lwork_in_bytes)); - - t.timer_end(stream); - milliseconds += t.get_time_elapsed(); - } - - if (nullptr != d_work) cudaFree(d_work); - CHECK_CUDA(cudaMalloc((void**)&d_work, lwork_in_bytes)); // TODO where to release d_work? - }; - - auto gather10_compute_rowptr_and_nnz = [&]() { // step 4 - cuda_timer_t t; - t.timer_start(stream); - - CHECK_CUSPARSE(cusparseSpruneDense2csrNnz( - handle, num_rows, num_cols, in_dense, ld, &threshold, mat_desc, d_rowptr, &nnz, d_work)); - - t.timer_end(stream); - milliseconds += t.get_time_elapsed(); - CHECK_CUDA(cudaStreamSynchronize(stream)); - - }; - - auto gather10_compute_colidx_and_val = [&]() { // step 5 - cuda_timer_t t; - t.timer_start(stream); - - CHECK_CUSPARSE(cusparseSpruneDense2csr( // - handle, num_rows, num_cols, in_dense, ld, &threshold, mat_desc, d_val, d_rowptr, d_colidx, d_work)); - - t.timer_end(stream); - milliseconds += t.get_time_elapsed(); - CHECK_CUDA(cudaStreamSynchronize(stream)); - }; - - /********************************************************************************/ - milliseconds = 0; - - if (stream) - has_ext_stream = true; - else - CHECK_CUDA(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); // 1. create stream - CHECK_CUSPARSE(cusparseCreate(&handle)); // 2. create handle - CHECK_CUSPARSE(cusparseSetStream(handle, stream)); // 3. bind stream - - gather10_init_and_probe(); - gather10_compute_rowptr_and_nnz(); - if (nnz == 0) { return; } - gather10_compute_colidx_and_val(); - - // TODO no need to destroy? - if (handle) cusparseDestroy(handle); - if (mat_desc) cusparseDestroyMatDescr(mat_desc); - if ((not has_ext_stream) and stream) cudaStreamDestroy(stream); - /********************************************************************************/ -} - -// #endif - -template -void launch_cusparse_scatter_before_cuda11200( - cusparseHandle_t handle, - int* d_rowptr, - int* d_colidx, - T* d_val, - int const num_rows, - int const num_cols, - int const nnz, - cusparseMatDescr_t mat_desc, - void* d_buffer, - size_t& d_buffer_size, - T* out_dense, - float& milliseconds, - cudaStream_t stream) -{ - auto ld = num_rows; - - auto has_external_stream = false; - - /******************************************************************************/ - - auto scatter10_init = [&]() { - CHECK_CUSPARSE(cusparseCreateMatDescr(&mat_desc)); // 4. 
create descr - CHECK_CUSPARSE(cusparseSetMatIndexBase(mat_desc, CUSPARSE_INDEX_BASE_ZERO)); // zero based - CHECK_CUSPARSE(cusparseSetMatType(mat_desc, CUSPARSE_MATRIX_TYPE_GENERAL)); // type - }; - - auto scatter10_sparse2dense = [&]() { - cuda_timer_t t; - t.timer_start(stream); - - CHECK_CUSPARSE( - cusparseScsr2dense(handle, num_rows, num_cols, mat_desc, d_val, d_rowptr, d_colidx, out_dense, ld)); - - t.timer_end(); - milliseconds += t.get_time_elapsed(); - CHECK_CUDA(cudaStreamSynchronize(stream)); - }; - - /******************************************************************************/ - if (stream) - has_external_stream = true; - else - CHECK_CUDA(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - CHECK_CUSPARSE(cusparseCreate(&handle)); - CHECK_CUSPARSE(cusparseSetStream(handle, stream)); - - scatter10_init(); - scatter10_sparse2dense(); - - if (handle) cusparseDestroy(handle); - if (mat_desc) cusparseDestroyMatDescr(mat_desc); - if ((not has_external_stream) and stream) cudaStreamDestroy(stream); - /******************************************************************************/ -} - -#endif +/** + * @file launch_sparse_method.cuh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-06-13 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_LAUNCH_SPARSE_METHOD_CUH +#define CUSZ_LAUNCH_SPARSE_METHOD_CUH + +#include +#include + +#include "../common.hh" +#include "../utils.hh" +#include "../utils/cusparse_err.cuh" + +// #if CUDART_VERSION >= 11020 + +template +void launch_cusparse_gather_cuda11200_onward( + cusparseHandle_t handle, + T* in_dense, + uint32_t const num_rows, + uint32_t const num_cols, + cusparseDnMatDescr_t dnmat, + cusparseSpMatDescr_t spmat, + void* d_buffer, + size_t& d_buffer_size, + M* d_rowptr, + M* d_colidx, + T* d_val, + int64_t& nnz, + float& milliseconds, + cudaStream_t stream) +{ + auto ld = num_rows; + + auto gather11_init_mat = [&]() { + // create dense matrix wrapper + CHECK_CUSPARSE( + cusparseCreateDnMat(&dnmat, num_rows, num_cols, ld, in_dense, cuszCUSPARSE::type, CUSPARSE_ORDER_ROW)); + + // create CSR wrapper + CHECK_CUSPARSE(cusparseCreateCsr( + &spmat, num_rows, num_cols, 0, d_rowptr, nullptr, nullptr, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, cuszCUSPARSE::type)); + }; + + auto gather11_init_buffer = [&]() { + { // allocate an external buffer if needed + cuda_timer_t t; + t.timer_start(stream); + + CHECK_CUSPARSE(cusparseDenseToSparse_bufferSize( + handle, dnmat, spmat, CUSPARSE_DENSETOSPARSE_ALG_DEFAULT, &d_buffer_size)); + + t.timer_end(stream); + milliseconds += t.get_time_elapsed(); + + CHECK_CUDA(cudaMalloc(&d_buffer, d_buffer_size)); + } + }; + + auto gather11_analysis = [&]() { + cuda_timer_t t; + t.timer_start(stream); + + CHECK_CUSPARSE( + cusparseDenseToSparse_analysis(handle, dnmat, spmat, CUSPARSE_DENSETOSPARSE_ALG_DEFAULT, d_buffer)); + + t.timer_end(stream); + milliseconds += t.get_time_elapsed(); + }; + + int64_t num_rows_tmp, num_cols_tmp; + + auto gather11_get_nnz = [&]() { + // get number of non-zero elements + CHECK_CUSPARSE(cusparseSpMatGetSize(spmat, &num_rows_tmp, &num_cols_tmp, &nnz)); + }; + + auto gather11_get_rowptr = [&]() { + // reset offsets, column indices, and values pointers + CHECK_CUSPARSE(cusparseCsrSetPointers(spmat, d_rowptr, d_colidx, d_val)); + }; + + auto gather11_dn2csr = [&]() { + cuda_timer_t t; + t.timer_start(stream); + + CHECK_CUSPARSE( + cusparseDenseToSparse_convert(handle, dnmat, spmat, 
CUSPARSE_DENSETOSPARSE_ALG_DEFAULT, d_buffer)); + + t.timer_end(stream); + milliseconds += t.get_time_elapsed(); + }; + + /********************************************************************************/ + milliseconds = 0; + + CHECK_CUSPARSE(cusparseCreate(&handle)); + if (stream) CHECK_CUSPARSE(cusparseSetStream(handle, stream)); // TODO move out + + gather11_init_mat(); + gather11_init_buffer(); + gather11_analysis(); + gather11_get_nnz(); + gather11_get_rowptr(); + gather11_dn2csr(); + + // destroy matrix/vector descriptors + CHECK_CUSPARSE(cusparseDestroyDnMat(dnmat)); + CHECK_CUSPARSE(cusparseDestroySpMat(spmat)); + CHECK_CUSPARSE(cusparseDestroy(handle)); +} + +// void SpcodecCSR::impl::scatter_CUDA_11020(BYTE* in_csr, T* out_dense, cudaStream_t stream, bool +// header_on_device) + +template +void launch_cusparse_scatter_cuda11200_onward( + cusparseHandle_t handle, + int* d_rowptr, + int* d_colidx, + T* d_val, + int const num_rows, + int const num_cols, + int const nnz, + cusparseDnMatDescr_t dnmat, + cusparseSpMatDescr_t spmat, + void* d_buffer, + size_t& d_buffer_size, + T* out_dense, + float& milliseconds, + cudaStream_t stream) +{ + auto ld = num_rows; + + auto scatter11_init_mat = [&]() { + CHECK_CUSPARSE(cusparseCreateCsr( + &spmat, num_rows, num_cols, nnz, d_rowptr, d_colidx, d_val, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, cuszCUSPARSE::type)); + + CHECK_CUSPARSE( + cusparseCreateDnMat(&dnmat, num_rows, num_cols, ld, out_dense, cuszCUSPARSE::type, CUSPARSE_ORDER_ROW)); + }; + + auto scatter11_init_buffer = [&]() { + cuda_timer_t t; + t.timer_start(stream); + + // allocate an external buffer if needed + CHECK_CUSPARSE( + cusparseSparseToDense_bufferSize(handle, spmat, dnmat, CUSPARSE_SPARSETODENSE_ALG_DEFAULT, &d_buffer_size)); + + t.timer_end(stream); + milliseconds += t.get_time_elapsed(); + + CHECK_CUDA(cudaMalloc(&d_buffer, d_buffer_size)); + }; + + auto scatter11_csr2dn = [&]() { + cuda_timer_t t; + t.timer_start(stream); + + CHECK_CUSPARSE(cusparseSparseToDense(handle, spmat, dnmat, CUSPARSE_SPARSETODENSE_ALG_DEFAULT, d_buffer)); + + t.timer_end(stream); + milliseconds += t.get_time_elapsed(); + }; + + /******************************************************************************/ + milliseconds = 0; + + CHECK_CUSPARSE(cusparseCreate(&handle)); + if (stream) CHECK_CUSPARSE(cusparseSetStream(handle, stream)); + + scatter11_init_mat(); + scatter11_init_buffer(); + scatter11_csr2dn(); + + // destroy matrix/vector descriptors + CHECK_CUSPARSE(cusparseDestroySpMat(spmat)); + CHECK_CUSPARSE(cusparseDestroyDnMat(dnmat)); + CHECK_CUSPARSE(cusparseDestroy(handle)); +} + +// #elif CUDART_VERSION >= 10000 + +template +void launch_cusparse_gather_before_cuda11200( + cusparseHandle_t handle, + T* in_dense, + uint32_t const num_rows, + uint32_t const num_cols, + cusparseMatDescr_t mat_desc, + void* d_work, + size_t& lwork_in_bytes, + M* d_rowptr, + M* d_colidx, + T* d_val, + int& nnz, // int is for compatibility; cuSPARSE of CUDA 11 changed data type + float& milliseconds, + cudaStream_t stream) +{ + auto ld = num_rows; + + float threshold{0}; + auto has_ext_stream{false}; + + /******************************************************************************/ + + auto gather10_init_and_probe = [&]() { + { // init + + CHECK_CUSPARSE(cusparseCreateMatDescr(&mat_desc)); // 4. 
create rte.mat_desc + CHECK_CUSPARSE(cusparseSetMatIndexBase(mat_desc, CUSPARSE_INDEX_BASE_ZERO)); // zero based + CHECK_CUSPARSE(cusparseSetMatType(mat_desc, CUSPARSE_MATRIX_TYPE_GENERAL)); // type + } + + { // probe + cuda_timer_t t; + t.timer_start(stream); + + CHECK_CUSPARSE(cusparseSpruneDense2csr_bufferSizeExt( + handle, num_rows, num_cols, in_dense, ld, &threshold, mat_desc, d_val, d_rowptr, d_colidx, + &lwork_in_bytes)); + + t.timer_end(stream); + milliseconds += t.get_time_elapsed(); + } + + if (nullptr != d_work) cudaFree(d_work); + CHECK_CUDA(cudaMalloc((void**)&d_work, lwork_in_bytes)); // TODO where to release d_work? + }; + + auto gather10_compute_rowptr_and_nnz = [&]() { // step 4 + cuda_timer_t t; + t.timer_start(stream); + + CHECK_CUSPARSE(cusparseSpruneDense2csrNnz( + handle, num_rows, num_cols, in_dense, ld, &threshold, mat_desc, d_rowptr, &nnz, d_work)); + + t.timer_end(stream); + milliseconds += t.get_time_elapsed(); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + }; + + auto gather10_compute_colidx_and_val = [&]() { // step 5 + cuda_timer_t t; + t.timer_start(stream); + + CHECK_CUSPARSE(cusparseSpruneDense2csr( // + handle, num_rows, num_cols, in_dense, ld, &threshold, mat_desc, d_val, d_rowptr, d_colidx, d_work)); + + t.timer_end(stream); + milliseconds += t.get_time_elapsed(); + CHECK_CUDA(cudaStreamSynchronize(stream)); + }; + + /********************************************************************************/ + milliseconds = 0; + + if (stream) + has_ext_stream = true; + else + CHECK_CUDA(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); // 1. create stream + CHECK_CUSPARSE(cusparseCreate(&handle)); // 2. create handle + CHECK_CUSPARSE(cusparseSetStream(handle, stream)); // 3. bind stream + + gather10_init_and_probe(); + gather10_compute_rowptr_and_nnz(); + if (nnz == 0) { return; } + gather10_compute_colidx_and_val(); + + // TODO no need to destroy? + if (handle) cusparseDestroy(handle); + if (mat_desc) cusparseDestroyMatDescr(mat_desc); + if ((not has_ext_stream) and stream) cudaStreamDestroy(stream); + /********************************************************************************/ +} + +// #endif + +template +void launch_cusparse_scatter_before_cuda11200( + cusparseHandle_t handle, + int* d_rowptr, + int* d_colidx, + T* d_val, + int const num_rows, + int const num_cols, + int const nnz, + cusparseMatDescr_t mat_desc, + void* d_buffer, + size_t& d_buffer_size, + T* out_dense, + float& milliseconds, + cudaStream_t stream) +{ + auto ld = num_rows; + + auto has_external_stream = false; + + /******************************************************************************/ + + auto scatter10_init = [&]() { + CHECK_CUSPARSE(cusparseCreateMatDescr(&mat_desc)); // 4. 
create descr + CHECK_CUSPARSE(cusparseSetMatIndexBase(mat_desc, CUSPARSE_INDEX_BASE_ZERO)); // zero based + CHECK_CUSPARSE(cusparseSetMatType(mat_desc, CUSPARSE_MATRIX_TYPE_GENERAL)); // type + }; + + auto scatter10_sparse2dense = [&]() { + cuda_timer_t t; + t.timer_start(stream); + + CHECK_CUSPARSE( + cusparseScsr2dense(handle, num_rows, num_cols, mat_desc, d_val, d_rowptr, d_colidx, out_dense, ld)); + + t.timer_end(); + milliseconds += t.get_time_elapsed(); + CHECK_CUDA(cudaStreamSynchronize(stream)); + }; + + /******************************************************************************/ + if (stream) + has_external_stream = true; + else + CHECK_CUDA(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + CHECK_CUSPARSE(cusparseCreate(&handle)); + CHECK_CUSPARSE(cusparseSetStream(handle, stream)); + + scatter10_init(); + scatter10_sparse2dense(); + + if (handle) cusparseDestroy(handle); + if (mat_desc) cusparseDestroyMatDescr(mat_desc); + if ((not has_external_stream) and stream) cudaStreamDestroy(stream); + /******************************************************************************/ +} + +#endif diff --git a/qtensor/compression/cusz/include/kernel/lorenzo_all.h b/qtensor/compression/cusz/include/kernel/lorenzo_all.h index 89f6f38f..de9f087e 100644 --- a/qtensor/compression/cusz/include/kernel/lorenzo_all.h +++ b/qtensor/compression/cusz/include/kernel/lorenzo_all.h @@ -1,44 +1,44 @@ -/** - * @file kernel_cuda.h - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-11-02 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef BD8A19DE_E881_4A26_9464_C51DAC6B14E1 -#define BD8A19DE_E881_4A26_9464_C51DAC6B14E1 - -#ifdef __cplusplus -extern "C" { -#endif - -#include "cusz/type.h" - -#define C_LORENZOI(Tliteral, Eliteral, FPliteral, T, E, FP) \ - cusz_error_status compress_predict_lorenzo_ivar_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ - T* const data, dim3 const len3, double const eb, E* delta, bool* signum, float* time_elapsed, \ - cudaStream_t stream); \ - cusz_error_status decompress_predict_lorenzo_ivar_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ - E* delta, bool* signum, dim3 const len3, double const eb, T* xdata, float* time_elapsed, cudaStream_t stream); - -C_LORENZOI(fp32, ui8, fp32, float, uint8_t, float); -C_LORENZOI(fp32, ui16, fp32, float, uint16_t, float); -C_LORENZOI(fp32, ui32, fp32, float, uint32_t, float); -C_LORENZOI(fp32, fp32, fp32, float, float, float); - -C_LORENZOI(fp64, ui8, fp64, double, uint8_t, double); -C_LORENZOI(fp64, ui16, fp64, double, uint16_t, double); -C_LORENZOI(fp64, ui32, fp64, double, uint32_t, double); -C_LORENZOI(fp64, fp32, fp64, double, float, double); - -#undef C_LORENZOI - -#ifdef __cplusplus -} -#endif - -#endif /* BD8A19DE_E881_4A26_9464_C51DAC6B14E1 */ +/** + * @file kernel_cuda.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-02 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef BD8A19DE_E881_4A26_9464_C51DAC6B14E1 +#define BD8A19DE_E881_4A26_9464_C51DAC6B14E1 + +#ifdef __cplusplus +extern "C" { +#endif + +#include "cusz/type.h" + +#define C_LORENZOI(Tliteral, Eliteral, FPliteral, T, E, FP) \ + cusz_error_status compress_predict_lorenzo_ivar_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + T* const data, dim3 const len3, double const eb, E* delta, bool* signum, float* time_elapsed, \ + cudaStream_t stream); \ + cusz_error_status decompress_predict_lorenzo_ivar_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + E* delta, bool* 
signum, dim3 const len3, double const eb, T* xdata, float* time_elapsed, cudaStream_t stream); + +C_LORENZOI(fp32, ui8, fp32, float, uint8_t, float); +C_LORENZOI(fp32, ui16, fp32, float, uint16_t, float); +C_LORENZOI(fp32, ui32, fp32, float, uint32_t, float); +C_LORENZOI(fp32, fp32, fp32, float, float, float); + +C_LORENZOI(fp64, ui8, fp64, double, uint8_t, double); +C_LORENZOI(fp64, ui16, fp64, double, uint16_t, double); +C_LORENZOI(fp64, ui32, fp64, double, uint32_t, double); +C_LORENZOI(fp64, fp32, fp64, double, float, double); + +#undef C_LORENZOI + +#ifdef __cplusplus +} +#endif + +#endif /* BD8A19DE_E881_4A26_9464_C51DAC6B14E1 */ diff --git a/qtensor/compression/cusz/include/kernel/lorenzo_all.hh b/qtensor/compression/cusz/include/kernel/lorenzo_all.hh index f7308fe1..d87baffa 100644 --- a/qtensor/compression/cusz/include/kernel/lorenzo_all.hh +++ b/qtensor/compression/cusz/include/kernel/lorenzo_all.hh @@ -1,96 +1,96 @@ -/** - * @file kernel_cuda.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-11-01 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef C8C37773_7EF2_439B_B0EF_14D0058DC714 -#define C8C37773_7EF2_439B_B0EF_14D0058DC714 - -#include -#include "cusz/type.h" - -template -cusz_error_status compress_predict_lorenzo_i( - T* const data, // input - dim3 const len3, // - double const eb, // input (config) - int const radius, // - EQ* const eq, // output - T* outlier, // - uint32_t* outlier_idx, // - uint32_t* num_outliers, // - float* time_elapsed, // optional - cudaStream_t stream); // - -template -cusz_error_status decompress_predict_lorenzo_i( - EQ* eq, // input - dim3 const len3, // - T* outlier, // - uint32_t* outlier_idx, // - uint32_t const num_outliers, // - double const eb, // input (config) - int const radius, // - T* xdata, // output - float* time_elapsed, // optional - cudaStream_t stream); - -namespace asz { -namespace experimental { - -template -cusz_error_status compress_predict_lorenzo_ivar( - T* data, - dim3 const len3, - double const eb, - DeltaT* delta, - bool* signum, - float* time_elapsed, - cudaStream_t stream); - -template -cusz_error_status decompress_predict_lorenzo_ivar( - DeltaT* delta, - bool* signum, - dim3 const len3, - double const eb, - T* xdata, - float* time_elapsed, - cudaStream_t stream); - -} // namespace experimental -} // namespace asz - -template -cusz_error_status compress_predict_lorenzo_iproto( - T* const data, // input - dim3 const len3, // - double const eb, // input (config) - int const radius, // - EQ* const eq, // output - T* outlier, // - uint32_t* outlier_idx, // - uint32_t* num_outliers, // - float* time_elapsed, // optional - cudaStream_t stream); // - -template -cusz_error_status decompress_predict_lorenzo_iproto( - EQ* eq, // input - dim3 const len3, // - T* outlier, // - uint32_t* outlier_idx, // - uint32_t const num_outliers, // - double const eb, // input (config) - int const radius, // - T* xdata, // output - float* time_elapsed, // optional - cudaStream_t stream); - -#endif /* C8C37773_7EF2_439B_B0EF_14D0058DC714 */ +/** + * @file kernel_cuda.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-01 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef C8C37773_7EF2_439B_B0EF_14D0058DC714 +#define C8C37773_7EF2_439B_B0EF_14D0058DC714 + +#include +#include "cusz/type.h" + +template +cusz_error_status compress_predict_lorenzo_i( + T* const data, // input + dim3 const len3, // + double const eb, // input (config) + 
int const radius, // + EQ* const eq, // output + T* outlier, // + uint32_t* outlier_idx, // + uint32_t* num_outliers, // + float* time_elapsed, // optional + cudaStream_t stream); // + +template +cusz_error_status decompress_predict_lorenzo_i( + EQ* eq, // input + dim3 const len3, // + T* outlier, // + uint32_t* outlier_idx, // + uint32_t const num_outliers, // + double const eb, // input (config) + int const radius, // + T* xdata, // output + float* time_elapsed, // optional + cudaStream_t stream); + +namespace asz { +namespace experimental { + +template +cusz_error_status compress_predict_lorenzo_ivar( + T* data, + dim3 const len3, + double const eb, + DeltaT* delta, + bool* signum, + float* time_elapsed, + cudaStream_t stream); + +template +cusz_error_status decompress_predict_lorenzo_ivar( + DeltaT* delta, + bool* signum, + dim3 const len3, + double const eb, + T* xdata, + float* time_elapsed, + cudaStream_t stream); + +} // namespace experimental +} // namespace asz + +template +cusz_error_status compress_predict_lorenzo_iproto( + T* const data, // input + dim3 const len3, // + double const eb, // input (config) + int const radius, // + EQ* const eq, // output + T* outlier, // + uint32_t* outlier_idx, // + uint32_t* num_outliers, // + float* time_elapsed, // optional + cudaStream_t stream); // + +template +cusz_error_status decompress_predict_lorenzo_iproto( + EQ* eq, // input + dim3 const len3, // + T* outlier, // + uint32_t* outlier_idx, // + uint32_t const num_outliers, // + double const eb, // input (config) + int const radius, // + T* xdata, // output + float* time_elapsed, // optional + cudaStream_t stream); + +#endif /* C8C37773_7EF2_439B_B0EF_14D0058DC714 */ diff --git a/qtensor/compression/cusz/include/kernel/spv_gpu.h b/qtensor/compression/cusz/include/kernel/spv_gpu.h index fb50119c..496dd4eb 100644 --- a/qtensor/compression/cusz/include/kernel/spv_gpu.h +++ b/qtensor/compression/cusz/include/kernel/spv_gpu.h @@ -1,42 +1,42 @@ -/** - * @file spv_gpu.h - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-10-29 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef B1B21251_C3C3_4BC1_B4E0_75D9D86EE7F3 -#define B1B21251_C3C3_4BC1_B4E0_75D9D86EE7F3 - -#ifdef __cplusplus -extern "C" { -#endif - -#include -#include - -#define SPV(Tliteral, Mliteral, T, M) \ - void spv_gather_T##Tliteral##_M##Mliteral( \ - T* in, size_t const in_len, T* d_val, uint32_t* d_idx, int* nnz, float* milliseconds, cudaStream_t stream); \ - \ - void spv_scatter_T##Tliteral##_M##Mliteral( \ - T* d_val, uint32_t* d_idx, int const nnz, T* decoded, float* milliseconds, cudaStream_t stream); - -SPV(ui8, ui32, uint8_t, uint32_t) -SPV(ui16, ui32, uint16_t, uint32_t) -SPV(ui32, ui32, uint32_t, uint32_t) -SPV(ui64, ui32, uint64_t, uint32_t) -SPV(fp32, ui32, float, uint32_t) -SPV(fp64, ui32, double, uint32_t) - -#undef SPV - -#ifdef __cplusplus -} -#endif - -#endif /* B1B21251_C3C3_4BC1_B4E0_75D9D86EE7F3 */ +/** + * @file spv_gpu.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-29 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef B1B21251_C3C3_4BC1_B4E0_75D9D86EE7F3 +#define B1B21251_C3C3_4BC1_B4E0_75D9D86EE7F3 + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +#define SPV(Tliteral, Mliteral, T, M) \ + void spv_gather_T##Tliteral##_M##Mliteral( \ + T* in, size_t const in_len, T* d_val, uint32_t* d_idx, int* nnz, float* milliseconds, cudaStream_t stream); \ + \ + void 
spv_scatter_T##Tliteral##_M##Mliteral( \ + T* d_val, uint32_t* d_idx, int const nnz, T* decoded, float* milliseconds, cudaStream_t stream); + +SPV(ui8, ui32, uint8_t, uint32_t) +SPV(ui16, ui32, uint16_t, uint32_t) +SPV(ui32, ui32, uint32_t, uint32_t) +SPV(ui64, ui32, uint64_t, uint32_t) +SPV(fp32, ui32, float, uint32_t) +SPV(fp64, ui32, double, uint32_t) + +#undef SPV + +#ifdef __cplusplus +} +#endif + +#endif /* B1B21251_C3C3_4BC1_B4E0_75D9D86EE7F3 */ diff --git a/qtensor/compression/cusz/include/kernel/spv_gpu.hh b/qtensor/compression/cusz/include/kernel/spv_gpu.hh index 6b978abc..c2f021df 100644 --- a/qtensor/compression/cusz/include/kernel/spv_gpu.hh +++ b/qtensor/compression/cusz/include/kernel/spv_gpu.hh @@ -1,33 +1,33 @@ -/** - * @file spv_gpu.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-10-29 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef A54D2009_1D4F_4113_9E26_9695A3669224 -#define A54D2009_1D4F_4113_9E26_9695A3669224 -#include - -namespace psz { - -template -void spv_gather( - T* in, - size_t const in_len, - T* d_val, - uint32_t* d_idx, - int* nnz, - float* milliseconds, - cudaStream_t stream); - -template -void spv_scatter(T* d_val, uint32_t* d_idx, int const nnz, T* decoded, float* milliseconds, cudaStream_t stream); - -} // namespace psz - -#endif /* A54D2009_1D4F_4113_9E26_9695A3669224 */ +/** + * @file spv_gpu.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-29 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef A54D2009_1D4F_4113_9E26_9695A3669224 +#define A54D2009_1D4F_4113_9E26_9695A3669224 +#include + +namespace psz { + +template +void spv_gather( + T* in, + size_t const in_len, + T* d_val, + uint32_t* d_idx, + int* nnz, + float* milliseconds, + cudaStream_t stream); + +template +void spv_scatter(T* d_val, uint32_t* d_idx, int const nnz, T* decoded, float* milliseconds, cudaStream_t stream); + +} // namespace psz + +#endif /* A54D2009_1D4F_4113_9E26_9695A3669224 */ diff --git a/qtensor/compression/cusz/include/kernel/v2_lorenzo.hh b/qtensor/compression/cusz/include/kernel/v2_lorenzo.hh index 861a2e2c..7c8d4ce0 100644 --- a/qtensor/compression/cusz/include/kernel/v2_lorenzo.hh +++ b/qtensor/compression/cusz/include/kernel/v2_lorenzo.hh @@ -1,32 +1,32 @@ -/** - * @file v2_lorenzo.hh - * @author Jiannan Tian - * @brief - * @version 0.4 - * @date 2023-01-23 - * - * (C) 2023 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef CD52BDA6_9376_43FF_BFDA_693204FA8762 -#define CD52BDA6_9376_43FF_BFDA_693204FA8762 - -#include "compaction.hh" -#include "cusz/type.h" - -template -cusz_error_status v2_compress_predict_lorenzo_i( - T* const data, // input - dim3 const data_len3, // - double const eb, // input (config) - int const radius, // - E* const eq, // output - dim3 const eq_len3, // - T* const anchor, // - dim3 const anchor_len3, // - CompactionDRAM outlier, // - float* time_elapsed, // optional - cudaStream_t stream); // - -#endif /* CD52BDA6_9376_43FF_BFDA_693204FA8762 */ +/** + * @file v2_lorenzo.hh + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-01-23 + * + * (C) 2023 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef CD52BDA6_9376_43FF_BFDA_693204FA8762 +#define CD52BDA6_9376_43FF_BFDA_693204FA8762 + +#include "compaction.hh" +#include "cusz/type.h" + +template +cusz_error_status v2_compress_predict_lorenzo_i( + T* const data, // input + dim3 const data_len3, // + double const eb, // input 
(config) + int const radius, // + E* const eq, // output + dim3 const eq_len3, // + T* const anchor, // + dim3 const anchor_len3, // + CompactionDRAM outlier, // + float* time_elapsed, // optional + cudaStream_t stream); // + +#endif /* CD52BDA6_9376_43FF_BFDA_693204FA8762 */ diff --git a/qtensor/compression/cusz/include/pipeline/compaction_g.inl b/qtensor/compression/cusz/include/pipeline/compaction_g.inl index fd312c82..7a854101 100644 --- a/qtensor/compression/cusz/include/pipeline/compaction_g.inl +++ b/qtensor/compression/cusz/include/pipeline/compaction_g.inl @@ -1,73 +1,73 @@ -/** - * @file compaction_g.inl - * @author Jiannan Tian - * @brief - * @version 0.4 - * @date 2022-12-22 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef F712F74C_7488_4445_83EE_EE7F88A64BBA -#define F712F74C_7488_4445_83EE_EE7F88A64BBA - -#include -#include -#include "compaction.hh" - -#include -#include - -// TODO filename -> `compaction` -template -struct CompactionDRAM { - using type = T; - T* val; - uint32_t* idx; - uint32_t* count; - uint32_t* h_count; - - void allocate(size_t len, bool device = true) - { - if (device) { - cudaMalloc(&idx, sizeof(uint32_t) * len); - cudaMalloc(&val, sizeof(T) * len); - cudaMalloc(&count, sizeof(T) * 1); - cudaMallocHost(&h_count, sizeof(T) * 1); - } - else { - cudaMallocHost(&idx, sizeof(uint32_t) * len); - cudaMallocHost(&val, sizeof(T) * len); - cudaMallocHost(&count, sizeof(T) * 1); - - memset(count, 0x0, sizeof(T) * 1); - } - } - - void make_count_host_accessible(cudaStream_t stream) - { - cudaMemcpyAsync(h_count, count, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream); - } - - uint32_t access_count_on_host() { return *h_count; } - - void allocate_managed(size_t len) - { - cudaMallocManaged(&idx, sizeof(uint32_t) * len); - cudaMallocManaged(&val, sizeof(T) * len); - cudaMallocManaged(&count, sizeof(T) * 1); - - cudaMemset(count, 0x0, sizeof(T) * 1); - } - - void destroy() - { - if (h_count) cudaFreeHost(h_count); - cudaFree(idx); - cudaFree(val); - cudaFree(count); - } -}; - -#endif /* F712F74C_7488_4445_83EE_EE7F88A64BBA */ +/** + * @file compaction_g.inl + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2022-12-22 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef F712F74C_7488_4445_83EE_EE7F88A64BBA +#define F712F74C_7488_4445_83EE_EE7F88A64BBA + +#include +#include +#include "compaction.hh" + +#include +#include + +// TODO filename -> `compaction` +template +struct CompactionDRAM { + using type = T; + T* val; + uint32_t* idx; + uint32_t* count; + uint32_t* h_count; + + void allocate(size_t len, bool device = true) + { + if (device) { + cudaMalloc(&idx, sizeof(uint32_t) * len); + cudaMalloc(&val, sizeof(T) * len); + cudaMalloc(&count, sizeof(T) * 1); + cudaMallocHost(&h_count, sizeof(T) * 1); + } + else { + cudaMallocHost(&idx, sizeof(uint32_t) * len); + cudaMallocHost(&val, sizeof(T) * len); + cudaMallocHost(&count, sizeof(T) * 1); + + memset(count, 0x0, sizeof(T) * 1); + } + } + + void make_count_host_accessible(cudaStream_t stream) + { + cudaMemcpyAsync(h_count, count, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream); + } + + uint32_t access_count_on_host() { return *h_count; } + + void allocate_managed(size_t len) + { + cudaMallocManaged(&idx, sizeof(uint32_t) * len); + cudaMallocManaged(&val, sizeof(T) * len); + cudaMallocManaged(&count, sizeof(T) * 1); + + cudaMemset(count, 0x0, sizeof(T) * 1); + } + + void destroy() + { + if (h_count) cudaFreeHost(h_count); + 
cudaFree(idx); + cudaFree(val); + cudaFree(count); + } +}; + +#endif /* F712F74C_7488_4445_83EE_EE7F88A64BBA */ diff --git a/qtensor/compression/cusz/include/pipeline/v2_compressor.hh b/qtensor/compression/cusz/include/pipeline/v2_compressor.hh index fa843f5f..5e0c8a83 100644 --- a/qtensor/compression/cusz/include/pipeline/v2_compressor.hh +++ b/qtensor/compression/cusz/include/pipeline/v2_compressor.hh @@ -1,146 +1,146 @@ -/** - * @file v2_compressor.hh - * @author Jiannan Tian - * @brief - * @version 0.4 - * @date 2023-01-29 - * - * (C) 2023 by Indiana University, Argonne National Laboratory - * - */ - -#include -#include - -#include "common/type_traits.hh" -#include "compaction.hh" -#include "component.hh" -#include "context.hh" -#include "header.h" - -// TODO move outward -#include "compaction_g.inl" - -using Context = cusz::Context; - -namespace psz { - -template -class v2_Compressor { - public: - using BYTE = uint8_t; - - using T = typename CONFIG::Predictor::Origin; - using FP = typename CONFIG::Predictor::Precision; - using E = typename CONFIG::Predictor::ErrCtrl; - using H = typename CONFIG::Codec::Encoded; - using M = typename CONFIG::Codec::MetadataT; - using H_FB = typename CONFIG::FallbackCodec::Encoded; - - using TimeRecord = std::vector>; - using timerecord_t = TimeRecord*; - - private: - class impl; - std::unique_ptr pimpl; - - public: - ~v2_Compressor(); - v2_Compressor(); - v2_Compressor(const v2_Compressor&); - v2_Compressor& operator=(const v2_Compressor&); - v2_Compressor(v2_Compressor&&); - v2_Compressor& operator=(v2_Compressor&&); - - // methods - void init(Context*); - void init(v2_header*); - void destroy(); - void compress(Context*, T*, BYTE*&, size_t&, cudaStream_t = nullptr, bool = false); - void decompress(v2_header*, BYTE*, T*, cudaStream_t = nullptr, bool = true); - void clear_buffer(); - // getter - void export_header(v2_header&); - void export_header(v2_header*); - void export_timerecord(TimeRecord*); -}; - -template -class v2_Compressor::impl { - public: - using Codec = typename CONFIG::Codec; - using BYTE = uint8_t; - using T = typename CONFIG::Predictor::Origin; - using FP = typename CONFIG::Predictor::Precision; - using EQ = uint32_t; - using H = typename CONFIG::Codec::Encoded; - using M = uint32_t; - using IDX = uint32_t; - using H_FB = typename CONFIG::FallbackCodec::Encoded; - - using TimeRecord = std::vector>; - using timerecord_t = TimeRecord*; - - private: - // state - // bool use_fallback_codec{false}; - // bool fallback_codec_allocated{false}; - - BYTE* d_reserved_for_archive{nullptr}; - - // profiling - // TimeRecord timerecord; - // header - v2_header header; - // components - - Codec* codec; - - // arrays - T* d_anchor; - uint32_t* d_errctrl; - uint32_t* d_freq; - CompactionDRAM outlier; - - int sp_factor{20}; - - struct { - float construct, hist, encode; - } comp_time; - - struct { - float scatter, decode, reconstruct; - } decomp_time; - - dim3 data_len3; - size_t data_len; - - public: - ~impl(); - impl(); - - // public methods - void init(Context* config); - void init(v2_header* config); - - void compress(Context*, T*, BYTE*&, size_t&, cudaStream_t = nullptr, bool = false); - void decompress(v2_header*, BYTE*, T*, cudaStream_t = nullptr, bool = true); - - // getter - void export_header(v2_header&); - void export_header(v2_header*); - // void export_timerecord(TimeRecord*); - BYTE* var_archive() { return d_reserved_for_archive; }; - - private: - // helper - template - void __init(ContextOrHeader*); - - // void 
collect_compress_timerecord(); - // void collect_decompress_timerecord(); - void destroy(); - // getter -}; - -} // namespace psz +/** + * @file v2_compressor.hh + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-01-29 + * + * (C) 2023 by Indiana University, Argonne National Laboratory + * + */ + +#include +#include + +#include "common/type_traits.hh" +#include "compaction.hh" +#include "component.hh" +#include "context.hh" +#include "header.h" + +// TODO move outward +#include "compaction_g.inl" + +using Context = cusz::Context; + +namespace psz { + +template +class v2_Compressor { + public: + using BYTE = uint8_t; + + using T = typename CONFIG::Predictor::Origin; + using FP = typename CONFIG::Predictor::Precision; + using E = typename CONFIG::Predictor::ErrCtrl; + using H = typename CONFIG::Codec::Encoded; + using M = typename CONFIG::Codec::MetadataT; + using H_FB = typename CONFIG::FallbackCodec::Encoded; + + using TimeRecord = std::vector>; + using timerecord_t = TimeRecord*; + + private: + class impl; + std::unique_ptr pimpl; + + public: + ~v2_Compressor(); + v2_Compressor(); + v2_Compressor(const v2_Compressor&); + v2_Compressor& operator=(const v2_Compressor&); + v2_Compressor(v2_Compressor&&); + v2_Compressor& operator=(v2_Compressor&&); + + // methods + void init(Context*); + void init(v2_header*); + void destroy(); + void compress(Context*, T*, BYTE*&, size_t&, cudaStream_t = nullptr, bool = false); + void decompress(v2_header*, BYTE*, T*, cudaStream_t = nullptr, bool = true); + void clear_buffer(); + // getter + void export_header(v2_header&); + void export_header(v2_header*); + void export_timerecord(TimeRecord*); +}; + +template +class v2_Compressor::impl { + public: + using Codec = typename CONFIG::Codec; + using BYTE = uint8_t; + using T = typename CONFIG::Predictor::Origin; + using FP = typename CONFIG::Predictor::Precision; + using EQ = uint32_t; + using H = typename CONFIG::Codec::Encoded; + using M = uint32_t; + using IDX = uint32_t; + using H_FB = typename CONFIG::FallbackCodec::Encoded; + + using TimeRecord = std::vector>; + using timerecord_t = TimeRecord*; + + private: + // state + // bool use_fallback_codec{false}; + // bool fallback_codec_allocated{false}; + + BYTE* d_reserved_for_archive{nullptr}; + + // profiling + // TimeRecord timerecord; + // header + v2_header header; + // components + + Codec* codec; + + // arrays + T* d_anchor; + uint32_t* d_errctrl; + uint32_t* d_freq; + CompactionDRAM outlier; + + int sp_factor{20}; + + struct { + float construct, hist, encode; + } comp_time; + + struct { + float scatter, decode, reconstruct; + } decomp_time; + + dim3 data_len3; + size_t data_len; + + public: + ~impl(); + impl(); + + // public methods + void init(Context* config); + void init(v2_header* config); + + void compress(Context*, T*, BYTE*&, size_t&, cudaStream_t = nullptr, bool = false); + void decompress(v2_header*, BYTE*, T*, cudaStream_t = nullptr, bool = true); + + // getter + void export_header(v2_header&); + void export_header(v2_header*); + // void export_timerecord(TimeRecord*); + BYTE* var_archive() { return d_reserved_for_archive; }; + + private: + // helper + template + void __init(ContextOrHeader*); + + // void collect_compress_timerecord(); + // void collect_decompress_timerecord(); + void destroy(); + // getter +}; + +} // namespace psz diff --git a/qtensor/compression/cusz/include/stat/compare.h b/qtensor/compression/cusz/include/stat/compare.h index 9575d72a..bc60fb0b 100644 --- a/qtensor/compression/cusz/include/stat/compare.h 
+++ b/qtensor/compression/cusz/include/stat/compare.h @@ -1,57 +1,57 @@ -/** - * @file compare.h - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-10-09 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef CE05A256_23CB_4243_8839_B1FDA9C540D2 -#define CE05A256_23CB_4243_8839_B1FDA9C540D2 - -#ifdef __cplus_plus -extern "C" { -#endif - -#include -#include -#include "../cusz/type.h" - -#define DESCRIPTION(Tliteral, T) void thrustgpu_get_extrema_rawptr_T##Tliteral(T* d_ptr, size_t len, T res[4]); - -#define COMPARE_LOSSLESS(Tliteral, T) \ - bool cppstd_identical_T##Tliteral(T* d1, T* d2, size_t const len); \ - bool thrustgpu_identical_T##Tliteral(T* d1, T* d2, size_t const len); - -#define COMPARE_LOSSY(Tliteral, T) \ - bool cppstd_error_bounded_T##Tliteral(T* a, T* b, size_t const len, double const eb, size_t* first_faulty_idx); \ - void cppstd_assess_quality_T##Tliteral(cusz_stats* s, T* xdata, T* odata, size_t const len); \ - \ - bool thrustgpu_error_bounded_T##Tliteral(T* a, T* b, size_t const len, double const eb, size_t* first_faulty_idx); \ - void thrustgpu_assess_quality_T##Tliteral(cusz_stats* s, T* xdata, T* odata, size_t const len); - -DESCRIPTION(ui8, uint8_t) -DESCRIPTION(ui16, uint16_t) -DESCRIPTION(ui32, uint32_t) -DESCRIPTION(fp32, float) -DESCRIPTION(fp64, double) - -COMPARE_LOSSLESS(fp32, float) -COMPARE_LOSSLESS(fp64, double) -COMPARE_LOSSLESS(ui8, uint8_t) -COMPARE_LOSSLESS(ui16, uint16_t) -COMPARE_LOSSLESS(ui32, uint32_t) - -COMPARE_LOSSY(fp32, float) -COMPARE_LOSSY(fp64, double) - -#undef CPPSTD_COMPARE - -#ifdef __cplus_plus -} -#endif - -#endif /* CE05A256_23CB_4243_8839_B1FDA9C540D2 */ +/** + * @file compare.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-09 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef CE05A256_23CB_4243_8839_B1FDA9C540D2 +#define CE05A256_23CB_4243_8839_B1FDA9C540D2 + +#ifdef __cplus_plus +extern "C" { +#endif + +#include +#include +#include "../cusz/type.h" + +#define DESCRIPTION(Tliteral, T) void thrustgpu_get_extrema_rawptr_T##Tliteral(T* d_ptr, size_t len, T res[4]); + +#define COMPARE_LOSSLESS(Tliteral, T) \ + bool cppstd_identical_T##Tliteral(T* d1, T* d2, size_t const len); \ + bool thrustgpu_identical_T##Tliteral(T* d1, T* d2, size_t const len); + +#define COMPARE_LOSSY(Tliteral, T) \ + bool cppstd_error_bounded_T##Tliteral(T* a, T* b, size_t const len, double const eb, size_t* first_faulty_idx); \ + void cppstd_assess_quality_T##Tliteral(cusz_stats* s, T* xdata, T* odata, size_t const len); \ + \ + bool thrustgpu_error_bounded_T##Tliteral(T* a, T* b, size_t const len, double const eb, size_t* first_faulty_idx); \ + void thrustgpu_assess_quality_T##Tliteral(cusz_stats* s, T* xdata, T* odata, size_t const len); + +DESCRIPTION(ui8, uint8_t) +DESCRIPTION(ui16, uint16_t) +DESCRIPTION(ui32, uint32_t) +DESCRIPTION(fp32, float) +DESCRIPTION(fp64, double) + +COMPARE_LOSSLESS(fp32, float) +COMPARE_LOSSLESS(fp64, double) +COMPARE_LOSSLESS(ui8, uint8_t) +COMPARE_LOSSLESS(ui16, uint16_t) +COMPARE_LOSSLESS(ui32, uint32_t) + +COMPARE_LOSSY(fp32, float) +COMPARE_LOSSY(fp64, double) + +#undef CPPSTD_COMPARE + +#ifdef __cplus_plus +} +#endif + +#endif /* CE05A256_23CB_4243_8839_B1FDA9C540D2 */ diff --git a/qtensor/compression/cusz/include/stat/compare_cpu.hh b/qtensor/compression/cusz/include/stat/compare_cpu.hh index 19846adc..3cd6c421 100644 --- a/qtensor/compression/cusz/include/stat/compare_cpu.hh +++ 
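Two small observations on compare.h as rewritten above: the extern "C" guard is spelled __cplus_plus (not __cplusplus), so the C linkage block is never active when the header is included from C++, and the trailing #undef CPPSTD_COMPARE removes a macro this header never defines while DESCRIPTION/COMPARE_LOSSLESS/COMPARE_LOSSY stay defined. A sketch of calling the macro-generated fp32 entry points follows; which argument is the reference and which is the reconstruction is an assumption here.

#include <cstddef>
#include <cstdio>
// #include "stat/compare.h"

void gpu_check_fp32(float* d_origin, float* d_reconstructed, size_t len, double eb)
{
    float extrema[4];
    thrustgpu_get_extrema_rawptr_Tfp32(d_origin, len, extrema);  // extrema of the device array

    size_t first_bad = 0;
    if (!thrustgpu_error_bounded_Tfp32(d_reconstructed, d_origin, len, eb, &first_bad))
        std::printf("error bound violated, first index: %zu\n", first_bad);
}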
b/qtensor/compression/cusz/include/stat/compare_cpu.hh @@ -1,62 +1,62 @@ -/** - * @file compare_cpu.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-10-09 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef C93C3857_8821_4988_B6F0_4E885060F642 -#define C93C3857_8821_4988_B6F0_4E885060F642 - -#include "compare.h" - -namespace psz { - -template -bool cppstd_identical(T* d1, T* d2, size_t const len); - -template -bool cppstd_error_bounded(T* a, T* b, size_t const len, double const eb, size_t* first_faulty_idx); - -template -void cppstd_assess_quality(cusz_stats* s, T* xdata, T* odata, size_t const len); - -} // namespace psz - -#define CPPSTD_COMPARE_LOSSLESS(Tliteral, T) \ - template <> \ - bool psz::cppstd_identical(T * d1, T * d2, size_t const len) \ - { \ - return cppstd_identical_T##Tliteral(d1, d2, len); \ - } - -#define CPPSTD_COMPARE_LOSSY(Tliteral, T) \ - template <> \ - bool psz::cppstd_error_bounded(T * a, T * b, size_t const len, double const eb, size_t* first_faulty_idx) \ - { \ - return cppstd_error_bounded_T##Tliteral(a, b, len, eb, first_faulty_idx); \ - } \ - \ - template <> \ - void psz::cppstd_assess_quality(cusz_stats * s, T * xdata, T * odata, size_t const len) \ - { \ - cppstd_assess_quality_T##Tliteral(s, xdata, odata, len); \ - } - -CPPSTD_COMPARE_LOSSLESS(fp32, float) -CPPSTD_COMPARE_LOSSLESS(fp64, double) -CPPSTD_COMPARE_LOSSLESS(ui8, uint8_t) -CPPSTD_COMPARE_LOSSLESS(ui16, uint16_t) -CPPSTD_COMPARE_LOSSLESS(ui32, uint32_t) - -CPPSTD_COMPARE_LOSSY(fp32, float); -CPPSTD_COMPARE_LOSSY(fp64, double); - -#undef CPPSTD_COMPARE_LOSSLESS -#undef CPPSTD_COMPARE_LOSSY - -#endif /* C93C3857_8821_4988_B6F0_4E885060F642 */ +/** + * @file compare_cpu.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-09 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef C93C3857_8821_4988_B6F0_4E885060F642 +#define C93C3857_8821_4988_B6F0_4E885060F642 + +#include "compare.h" + +namespace psz { + +template +bool cppstd_identical(T* d1, T* d2, size_t const len); + +template +bool cppstd_error_bounded(T* a, T* b, size_t const len, double const eb, size_t* first_faulty_idx); + +template +void cppstd_assess_quality(cusz_stats* s, T* xdata, T* odata, size_t const len); + +} // namespace psz + +#define CPPSTD_COMPARE_LOSSLESS(Tliteral, T) \ + template <> \ + bool psz::cppstd_identical(T * d1, T * d2, size_t const len) \ + { \ + return cppstd_identical_T##Tliteral(d1, d2, len); \ + } + +#define CPPSTD_COMPARE_LOSSY(Tliteral, T) \ + template <> \ + bool psz::cppstd_error_bounded(T * a, T * b, size_t const len, double const eb, size_t* first_faulty_idx) \ + { \ + return cppstd_error_bounded_T##Tliteral(a, b, len, eb, first_faulty_idx); \ + } \ + \ + template <> \ + void psz::cppstd_assess_quality(cusz_stats * s, T * xdata, T * odata, size_t const len) \ + { \ + cppstd_assess_quality_T##Tliteral(s, xdata, odata, len); \ + } + +CPPSTD_COMPARE_LOSSLESS(fp32, float) +CPPSTD_COMPARE_LOSSLESS(fp64, double) +CPPSTD_COMPARE_LOSSLESS(ui8, uint8_t) +CPPSTD_COMPARE_LOSSLESS(ui16, uint16_t) +CPPSTD_COMPARE_LOSSLESS(ui32, uint32_t) + +CPPSTD_COMPARE_LOSSY(fp32, float); +CPPSTD_COMPARE_LOSSY(fp64, double); + +#undef CPPSTD_COMPARE_LOSSLESS +#undef CPPSTD_COMPARE_LOSSY + +#endif /* C93C3857_8821_4988_B6F0_4E885060F642 */ diff --git a/qtensor/compression/cusz/include/stat/compare_gpu.hh b/qtensor/compression/cusz/include/stat/compare_gpu.hh index 482c2fab..78013ca7 100644 --- 
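The compare_cpu.hh specializations above simply forward the templated psz::cppstd_* calls to the C entry points; a host-side sketch, assuming cusz_stats (from cusz/type.h) is an out-parameter for the quality statistics.

#include <cstddef>
// #include "stat/compare_cpu.hh"

bool cpu_check(float* xdata, float* odata, size_t len, double eb, cusz_stats* stats)
{
    size_t first_bad = 0;
    bool within_eb = psz::cppstd_error_bounded<float>(xdata, odata, len, eb, &first_bad);
    psz::cppstd_assess_quality<float>(stats, xdata, odata, len);  // fills the statistics struct
    return within_eb;
}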
a/qtensor/compression/cusz/include/stat/compare_gpu.hh +++ b/qtensor/compression/cusz/include/stat/compare_gpu.hh @@ -1,33 +1,33 @@ -/** - * @file compare_gpu.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-10-09 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef B0EE0E82_B3AA_4946_A589_A3A6A83DD862 -#define B0EE0E82_B3AA_4946_A589_A3A6A83DD862 - -#include "compare.h" - -namespace psz { - -template -void thrustgpu_get_extrema_rawptr(T* d_ptr, size_t len, T res[4]); - -template -bool thrustgpu_identical(T* d1, T* d2, size_t const len); - -template -bool thrustgpu_error_bounded(T* a, T* b, size_t const len, double const eb, size_t* first_faulty_idx); - -template -void thrustgpu_assess_quality(cusz_stats* s, T* xdata, T* odata, size_t const len); - -} // namespace psz - -#endif /* B0EE0E82_B3AA_4946_A589_A3A6A83DD862 */ +/** + * @file compare_gpu.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-09 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef B0EE0E82_B3AA_4946_A589_A3A6A83DD862 +#define B0EE0E82_B3AA_4946_A589_A3A6A83DD862 + +#include "compare.h" + +namespace psz { + +template +void thrustgpu_get_extrema_rawptr(T* d_ptr, size_t len, T res[4]); + +template +bool thrustgpu_identical(T* d1, T* d2, size_t const len); + +template +bool thrustgpu_error_bounded(T* a, T* b, size_t const len, double const eb, size_t* first_faulty_idx); + +template +void thrustgpu_assess_quality(cusz_stats* s, T* xdata, T* odata, size_t const len); + +} // namespace psz + +#endif /* B0EE0E82_B3AA_4946_A589_A3A6A83DD862 */ diff --git a/qtensor/compression/cusz/include/stat/stat.h b/qtensor/compression/cusz/include/stat/stat.h index 971d94bc..ade8deea 100644 --- a/qtensor/compression/cusz/include/stat/stat.h +++ b/qtensor/compression/cusz/include/stat/stat.h @@ -1,29 +1,29 @@ -/** - * @file stat.h - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-11-02 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef BBBB5712_FF60_4262_B927_85B113FD26BA -#define BBBB5712_FF60_4262_B927_85B113FD26BA - -#include "cusz/type.h" - -#define HIST_C(Tname, T) \ - cusz_error_status histogram_T##Tname( \ - T* in_data, size_t const in_len, uint32_t* out_freq, int const num_buckets, float* milliseconds, \ - cudaStream_t stream); - -HIST_C(ui8, uint8_t) -HIST_C(ui16, uint16_t) -HIST_C(ui32, uint32_t) -HIST_C(ui64, uint64_t) - -#undef HIST_C - -#endif /* BBBB5712_FF60_4262_B927_85B113FD26BA */ +/** + * @file stat.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-02 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef BBBB5712_FF60_4262_B927_85B113FD26BA +#define BBBB5712_FF60_4262_B927_85B113FD26BA + +#include "cusz/type.h" + +#define HIST_C(Tname, T) \ + cusz_error_status histogram_T##Tname( \ + T* in_data, size_t const in_len, uint32_t* out_freq, int const num_buckets, float* milliseconds, \ + cudaStream_t stream); + +HIST_C(ui8, uint8_t) +HIST_C(ui16, uint16_t) +HIST_C(ui32, uint32_t) +HIST_C(ui64, uint64_t) + +#undef HIST_C + +#endif /* BBBB5712_FF60_4262_B927_85B113FD26BA */ diff --git a/qtensor/compression/cusz/include/stat/stat.hh b/qtensor/compression/cusz/include/stat/stat.hh index 636192a4..fedf6417 100644 --- a/qtensor/compression/cusz/include/stat/stat.hh +++ b/qtensor/compression/cusz/include/stat/stat.hh @@ -1,15 +1,15 @@ -/** - * @file stat.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - 
* @date 2022-11-02 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef B005D07B_D92D_4DF0_90D0_87A7B7C310C9 -#define B005D07B_D92D_4DF0_90D0_87A7B7C310C9 - -#endif /* B005D07B_D92D_4DF0_90D0_87A7B7C310C9 */ +/** + * @file stat.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-02 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef B005D07B_D92D_4DF0_90D0_87A7B7C310C9 +#define B005D07B_D92D_4DF0_90D0_87A7B7C310C9 + +#endif /* B005D07B_D92D_4DF0_90D0_87A7B7C310C9 */ diff --git a/qtensor/compression/cusz/include/stat/stat_g.hh b/qtensor/compression/cusz/include/stat/stat_g.hh index a76ea6f9..45f2f84d 100644 --- a/qtensor/compression/cusz/include/stat/stat_g.hh +++ b/qtensor/compression/cusz/include/stat/stat_g.hh @@ -1,44 +1,44 @@ -/** - * @file stat_g.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-11-02 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef D8B68EB9_A86B_4AEA_AD4C_3DF22827E7C3 -#define D8B68EB9_A86B_4AEA_AD4C_3DF22827E7C3 - -#include -#include "cusz/type.h" - -namespace asz { -namespace stat { - -/** - * @brief Get frequency: a kernel wrapper - * - * @tparam T input type - * @param in_data input device array - * @param in_len input host var; len of in_data - * @param out_freq output device array - * @param nbin input host var; len of out_freq - * @param milliseconds output time elapsed - * @param stream optional stream - */ -template -cusz_error_status histogram( - T* in_data, - size_t const in_len, - uint32_t* out_freq, - int const nbin, - float* milliseconds, - cudaStream_t stream = nullptr); - -} // namespace stat -} // namespace asz - -#endif /* D8B68EB9_A86B_4AEA_AD4C_3DF22827E7C3 */ +/** + * @file stat_g.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-02 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef D8B68EB9_A86B_4AEA_AD4C_3DF22827E7C3 +#define D8B68EB9_A86B_4AEA_AD4C_3DF22827E7C3 + +#include +#include "cusz/type.h" + +namespace asz { +namespace stat { + +/** + * @brief Get frequency: a kernel wrapper + * + * @tparam T input type + * @param in_data input device array + * @param in_len input host var; len of in_data + * @param out_freq output device array + * @param nbin input host var; len of out_freq + * @param milliseconds output time elapsed + * @param stream optional stream + */ +template +cusz_error_status histogram( + T* in_data, + size_t const in_len, + uint32_t* out_freq, + int const nbin, + float* milliseconds, + cudaStream_t stream = nullptr); + +} // namespace stat +} // namespace asz + +#endif /* D8B68EB9_A86B_4AEA_AD4C_3DF22827E7C3 */ diff --git a/qtensor/compression/cusz/include/utils.hh b/qtensor/compression/cusz/include/utils.hh index 68ec1d2b..fd15517c 100644 --- a/qtensor/compression/cusz/include/utils.hh +++ b/qtensor/compression/cusz/include/utils.hh @@ -1,21 +1,21 @@ -/** - * @file utils.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2021-07-12 - * - * (C) 2021 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef UTILS_HH -#define UTILS_HH - -#include "utils/cuda_err.cuh" -#include "utils/cuda_mem.cuh" -#include "utils/format.hh" -#include "utils/io.hh" -#include "utils/strhelper.hh" - +/** + * @file utils.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2021-07-12 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef UTILS_HH 
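A sketch of the histogram wrapper declared in stat_g.hh above (the C entry points in stat.h expose the same functionality per type); the device buffers and the bin count are assumed to be prepared by the caller.

#include <cstddef>
#include <cstdint>
#include <cuda_runtime.h>
// #include "stat/stat_g.hh"

cusz_error_status frequencies_of(uint16_t* d_errctrl, size_t len,
                                 uint32_t* d_freq, int nbin, cudaStream_t stream)
{
    float ms = 0.f;  // kernel time, reported back by the wrapper
    return asz::stat::histogram<uint16_t>(d_errctrl, len, d_freq, nbin, &ms, stream);
}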
+#define UTILS_HH + +#include "utils/cuda_err.cuh" +#include "utils/cuda_mem.cuh" +#include "utils/format.hh" +#include "utils/io.hh" +#include "utils/strhelper.hh" + #endif \ No newline at end of file diff --git a/qtensor/compression/cusz/include/utils/cuda_err.cuh b/qtensor/compression/cusz/include/utils/cuda_err.cuh index 0812c60e..5b80b04b 100644 --- a/qtensor/compression/cusz/include/utils/cuda_err.cuh +++ b/qtensor/compression/cusz/include/utils/cuda_err.cuh @@ -1,185 +1,185 @@ -#ifndef CUDA_ERR_CUH -#define CUDA_ERR_CUH - -/** - * @file cuda_err.cuh - * @author Jiannan Tian - * @brief CUDA runtime error handling macros. - * @version 0.2 - * @date 2020-09-20 - * Created on: 2019-10-08 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#include -#include -#include -#include -#include - -struct cusz_cuda_exception : public std::exception { - cusz_cuda_exception(const char* err, int err_code, const char* file, int line) { - std::stringstream ss; - ss << "CUDA API failed at \e[31m\e[1m" << file << ':' << line << "\e[0m with error: " << err << '(' << err_code << ')'; - err_msg = ss.str(); - } - const char* what() const noexcept { - return err_msg.c_str(); - } - std::string err_msg; -}; - -// back compatibility start -static void HandleError(cudaError_t err, const char* file, int line) -{ - if (err != cudaSuccess) { - throw cusz_cuda_exception(cudaGetErrorString(err), err, file, line); - } -} -#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__)) -// back compatibility end - -static void check_cuda_error(cudaError_t status, const char* file, int line) -{ - if (cudaSuccess != status) { - /* - printf("\nCUDA error/status reference (as of CUDA 11):\n"); - printf("cudaSuccess -> %d\n", cudaSuccess); - printf("cudaErrorInvalidValue -> %d\n", cudaErrorInvalidValue); - printf("cudaErrorMemoryAllocation -> %d\n", cudaErrorMemoryAllocation); - printf("cudaErrorInitializationError -> %d\n", cudaErrorInitializationError); - printf("cudaErrorCudartUnloading -> %d\n", cudaErrorCudartUnloading); - printf("cudaErrorProfilerDisabled -> %d\n", cudaErrorProfilerDisabled); - printf("cudaErrorProfilerNotInitialized (Deprecated)-> %d\n", cudaErrorProfilerNotInitialized); - printf("cudaErrorProfilerAlreadyStarted (Deprecated)-> %d\n", cudaErrorProfilerAlreadyStarted); - printf("cudaErrorProfilerAlreadyStopped (Deprecated)-> %d\n", cudaErrorProfilerAlreadyStopped); - printf("cudaErrorInvalidConfiguration -> %d\n", cudaErrorInvalidConfiguration); - printf("cudaErrorInvalidPitchValue -> %d\n", cudaErrorInvalidPitchValue); - printf("cudaErrorInvalidSymbol -> %d\n", cudaErrorInvalidSymbol); - printf("cudaErrorInvalidHostPointer (Deprecated)-> %d\n", cudaErrorInvalidHostPointer); - printf("cudaErrorInvalidDevicePointer (Deprecated)-> %d\n", cudaErrorInvalidDevicePointer); - printf("cudaErrorInvalidTexture -> %d\n", cudaErrorInvalidTexture); - printf("cudaErrorInvalidTextureBinding -> %d\n", cudaErrorInvalidTextureBinding); - printf("cudaErrorInvalidChannelDescriptor -> %d\n", cudaErrorInvalidChannelDescriptor); - printf("cudaErrorInvalidMemcpyDirection -> %d\n", cudaErrorInvalidMemcpyDirection); - printf("cudaErrorAddressOfConstant (Deprecated)-> %d\n", cudaErrorAddressOfConstant); - printf("cudaErrorTextureFetchFailed (Deprecated)-> %d\n", cudaErrorTextureFetchFailed); - printf("cudaErrorTextureNotBound (Deprecated)-> %d\n", cudaErrorTextureNotBound); - printf("cudaErrorSynchronizationError 
(Deprecated)-> %d\n", cudaErrorSynchronizationError); - printf("cudaErrorInvalidFilterSetting -> %d\n", cudaErrorInvalidFilterSetting); - printf("cudaErrorInvalidNormSetting -> %d\n", cudaErrorInvalidNormSetting); - printf("cudaErrorMixedDeviceExecution (Deprecated)-> %d\n", cudaErrorMixedDeviceExecution); - printf("cudaErrorNotYetImplemented (Deprecated)-> %d\n", cudaErrorNotYetImplemented); - printf("cudaErrorMemoryValueTooLarge (Deprecated)-> %d\n", cudaErrorMemoryValueTooLarge); - printf("cudaErrorInsufficientDriver -> %d\n", cudaErrorInsufficientDriver); - printf("cudaErrorInvalidSurface -> %d\n", cudaErrorInvalidSurface); - printf("cudaErrorDuplicateVariableName -> %d\n", cudaErrorDuplicateVariableName); - printf("cudaErrorDuplicateTextureName -> %d\n", cudaErrorDuplicateTextureName); - printf("cudaErrorDuplicateSurfaceName -> %d\n", cudaErrorDuplicateSurfaceName); - printf("cudaErrorDevicesUnavailable -> %d\n", cudaErrorDevicesUnavailable); - printf("cudaErrorIncompatibleDriverContext -> %d\n", cudaErrorIncompatibleDriverContext); - printf("cudaErrorMissingConfiguration -> %d\n", cudaErrorMissingConfiguration); - printf("cudaErrorPriorLaunchFailure (Deprecated)-> %d\n", cudaErrorPriorLaunchFailure); - printf("cudaErrorLaunchMaxDepthExceeded -> %d\n", cudaErrorLaunchMaxDepthExceeded); - printf("cudaErrorLaunchFileScopedTex -> %d\n", cudaErrorLaunchFileScopedTex); - printf("cudaErrorLaunchFileScopedSurf -> %d\n", cudaErrorLaunchFileScopedSurf); - printf("cudaErrorSyncDepthExceeded -> %d\n", cudaErrorSyncDepthExceeded); - printf("cudaErrorLaunchPendingCountExceeded -> %d\n", cudaErrorLaunchPendingCountExceeded); - printf("cudaErrorInvalidDeviceFunction -> %d\n", cudaErrorInvalidDeviceFunction); - printf("cudaErrorNoDevice -> %d\n", cudaErrorNoDevice); - printf("cudaErrorInvalidDevice -> %d\n", cudaErrorInvalidDevice); - printf("cudaErrorStartupFailure -> %d\n", cudaErrorStartupFailure); - printf("cudaErrorInvalidKernelImage -> %d\n", cudaErrorInvalidKernelImage); - #if (CUDART_VERSION == 1100) - printf("cudaErrorDeviceUninitialized -> %d\n", cudaErrorDeviceUninitialized); - #endif - printf("cudaErrorMapBufferObjectFailed -> %d\n", cudaErrorMapBufferObjectFailed); - printf("cudaErrorUnmapBufferObjectFailed -> %d\n", cudaErrorUnmapBufferObjectFailed); - #if (CUDART_VERSION == 1010) - printf("cudaErrorArrayIsMapped -> %d\n", cudaErrorArrayIsMapped); - printf("cudaErrorAlreadyMapped -> %d\n", cudaErrorAlreadyMapped); - #endif - printf("cudaErrorNoKernelImageForDevice -> %d\n", cudaErrorNoKernelImageForDevice); - #if (CUDART_VERSION == 1010) - printf("cudaErrorAlreadyAcquired -> %d\n", cudaErrorAlreadyAcquired); - printf("cudaErrorNotMapped -> %d\n", cudaErrorNotMapped); - printf("cudaErrorNotMappedAsArray -> %d\n", cudaErrorNotMappedAsArray); - printf("cudaErrorNotMappedAsPointer -> %d\n", cudaErrorNotMappedAsPointer); - #endif - printf("cudaErrorECCUncorrectable -> %d\n", cudaErrorECCUncorrectable); - printf("cudaErrorUnsupportedLimit -> %d\n", cudaErrorUnsupportedLimit); - printf("cudaErrorDeviceAlreadyInUse -> %d\n", cudaErrorDeviceAlreadyInUse); - printf("cudaErrorPeerAccessUnsupported -> %d\n", cudaErrorPeerAccessUnsupported); - printf("cudaErrorInvalidPtx -> %d\n", cudaErrorInvalidPtx); - printf("cudaErrorInvalidGraphicsContext -> %d\n", cudaErrorInvalidGraphicsContext); - printf("cudaErrorNvlinkUncorrectable -> %d\n", cudaErrorNvlinkUncorrectable); - printf("cudaErrorJitCompilerNotFound -> %d\n", cudaErrorJitCompilerNotFound); - #if (CUDART_VERSION == 1010) - 
printf("cudaErrorInvalidSource -> %d\n", cudaErrorInvalidSource); - printf("cudaErrorFileNotFound -> %d\n", cudaErrorFileNotFound); - #endif - printf("cudaErrorSharedObjectSymbolNotFound -> %d\n", cudaErrorSharedObjectSymbolNotFound); - printf("cudaErrorSharedObjectInitFailed -> %d\n", cudaErrorSharedObjectInitFailed); - printf("cudaErrorOperatingSystem -> %d\n", cudaErrorOperatingSystem); - printf("cudaErrorInvalidResourceHandle -> %d\n", cudaErrorInvalidResourceHandle); - #if (CUDART_VERSION == 1010) - printf("cudaErrorIllegalState -> %d\n", cudaErrorIllegalState); - printf("cudaErrorSymbolNotFound -> %d\n", cudaErrorSymbolNotFound); - #endif - printf("cudaErrorNotReady -> %d\n", cudaErrorNotReady); - printf("cudaErrorIllegalAddress -> %d\n", cudaErrorIllegalAddress); - printf("cudaErrorLaunchOutOfResources -> %d\n", cudaErrorLaunchOutOfResources); - printf("cudaErrorLaunchTimeout -> %d\n", cudaErrorLaunchTimeout); - #if (CUDART_VERSION == 1010) - printf("cudaErrorLaunchIncompatibleTexturing-> %d\n", cudaErrorLaunchIncompatibleTexturing); - #endif - printf("cudaErrorPeerAccessAlreadyEnabled -> %d\n", cudaErrorPeerAccessAlreadyEnabled); - printf("cudaErrorPeerAccessNotEnabled -> %d\n", cudaErrorPeerAccessNotEnabled); - printf("cudaErrorSetOnActiveProcess -> %d\n", cudaErrorSetOnActiveProcess); - #if (CUDART_VERSION == 1010) - printf("cudaErrorContextIsDestroyed -> %d\n", cudaErrorContextIsDestroyed); - #endif - printf("cudaErrorAssert -> %d\n", cudaErrorAssert); - printf("cudaErrorTooManyPeers -> %d\n", cudaErrorTooManyPeers); - printf("cudaErrorHostMemoryAlreadyRegistered-> %d\n", cudaErrorHostMemoryAlreadyRegistered); - printf("cudaErrorHostMemoryNotRegistered -> %d\n", cudaErrorHostMemoryNotRegistered); - printf("cudaErrorHardwareStackError -> %d\n", cudaErrorHardwareStackError); - printf("cudaErrorIllegalInstruction -> %d\n", cudaErrorIllegalInstruction); - printf("cudaErrorMisalignedAddress -> %d\n", cudaErrorMisalignedAddress); - printf("cudaErrorInvalidAddressSpace -> %d\n", cudaErrorInvalidAddressSpace); - printf("cudaErrorInvalidPc -> %d\n", cudaErrorInvalidPc); - printf("cudaErrorLaunchFailure -> %d\n", cudaErrorLaunchFailure); - printf("cudaErrorCooperativeLaunchTooLarge -> %d\n", cudaErrorCooperativeLaunchTooLarge); - printf("cudaErrorNotPermitted -> %d\n", cudaErrorNotPermitted); - printf("cudaErrorNotSupported -> %d\n", cudaErrorNotSupported); - #if (CUDART_VERSION == 1010) - printf("cudaErrorSystemNotReady -> %d\n", cudaErrorSystemNotReady); - printf("cudaErrorSystemDriverMismatch -> %d\n", cudaErrorSystemDriverMismatch); - printf("cudaErrorCompatNotSupportedOnDevice -> %d\n", cudaErrorCompatNotSupportedOnDevice); - printf("cudaErrorStreamCaptureUnsupported -> %d\n", cudaErrorStreamCaptureUnsupported); - printf("cudaErrorStreamCaptureInvalidated -> %d\n", cudaErrorStreamCaptureInvalidated); - printf("cudaErrorStreamCaptureMerge -> %d\n", cudaErrorStreamCaptureMerge); - printf("cudaErrorStreamCaptureUnmatched -> %d\n", cudaErrorStreamCaptureUnmatched); - printf("cudaErrorStreamCaptureUnjoined -> %d\n", cudaErrorStreamCaptureUnjoined); - printf("cudaErrorStreamCaptureIsolation -> %d\n", cudaErrorStreamCaptureIsolation); - printf("cudaErrorStreamCaptureImplicit -> %d\n", cudaErrorStreamCaptureImplicit); - printf("cudaErrorCapturedEvent -> %d\n", cudaErrorCapturedEvent); - printf("cudaErrorStreamCaptureWrongThread -> %d\n", cudaErrorStreamCaptureWrongThread); - #endif - #if (CUDART_VERSION == 1100) - printf("cudaErrorTimeout -> %d\n", cudaErrorTimeout); - 
printf("cudaErrorGraphExecUpdateFailure -> %d\n", cudaErrorGraphExecUpdateFailure); - #endif - printf("cudaErrorUnknown -> %d\n", cudaErrorUnknown); - printf("cudaErrorApiFailureBase (Deprecated)-> %d\n", cudaErrorApiFailureBase); - */ - throw cusz_cuda_exception(cudaGetErrorString(status), status, file, line); - } -} - -#define CHECK_CUDA(err) (check_cuda_error(err, __FILE__, __LINE__)) - -#endif +#ifndef CUDA_ERR_CUH +#define CUDA_ERR_CUH + +/** + * @file cuda_err.cuh + * @author Jiannan Tian + * @brief CUDA runtime error handling macros. + * @version 0.2 + * @date 2020-09-20 + * Created on: 2019-10-08 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include +#include +#include +#include +#include + +struct cusz_cuda_exception : public std::exception { + cusz_cuda_exception(const char* err, int err_code, const char* file, int line) { + std::stringstream ss; + ss << "CUDA API failed at \e[31m\e[1m" << file << ':' << line << "\e[0m with error: " << err << '(' << err_code << ')'; + err_msg = ss.str(); + } + const char* what() const noexcept { + return err_msg.c_str(); + } + std::string err_msg; +}; + +// back compatibility start +static void HandleError(cudaError_t err, const char* file, int line) +{ + if (err != cudaSuccess) { + throw cusz_cuda_exception(cudaGetErrorString(err), err, file, line); + } +} +#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__)) +// back compatibility end + +static void check_cuda_error(cudaError_t status, const char* file, int line) +{ + if (cudaSuccess != status) { + /* + printf("\nCUDA error/status reference (as of CUDA 11):\n"); + printf("cudaSuccess -> %d\n", cudaSuccess); + printf("cudaErrorInvalidValue -> %d\n", cudaErrorInvalidValue); + printf("cudaErrorMemoryAllocation -> %d\n", cudaErrorMemoryAllocation); + printf("cudaErrorInitializationError -> %d\n", cudaErrorInitializationError); + printf("cudaErrorCudartUnloading -> %d\n", cudaErrorCudartUnloading); + printf("cudaErrorProfilerDisabled -> %d\n", cudaErrorProfilerDisabled); + printf("cudaErrorProfilerNotInitialized (Deprecated)-> %d\n", cudaErrorProfilerNotInitialized); + printf("cudaErrorProfilerAlreadyStarted (Deprecated)-> %d\n", cudaErrorProfilerAlreadyStarted); + printf("cudaErrorProfilerAlreadyStopped (Deprecated)-> %d\n", cudaErrorProfilerAlreadyStopped); + printf("cudaErrorInvalidConfiguration -> %d\n", cudaErrorInvalidConfiguration); + printf("cudaErrorInvalidPitchValue -> %d\n", cudaErrorInvalidPitchValue); + printf("cudaErrorInvalidSymbol -> %d\n", cudaErrorInvalidSymbol); + printf("cudaErrorInvalidHostPointer (Deprecated)-> %d\n", cudaErrorInvalidHostPointer); + printf("cudaErrorInvalidDevicePointer (Deprecated)-> %d\n", cudaErrorInvalidDevicePointer); + printf("cudaErrorInvalidTexture -> %d\n", cudaErrorInvalidTexture); + printf("cudaErrorInvalidTextureBinding -> %d\n", cudaErrorInvalidTextureBinding); + printf("cudaErrorInvalidChannelDescriptor -> %d\n", cudaErrorInvalidChannelDescriptor); + printf("cudaErrorInvalidMemcpyDirection -> %d\n", cudaErrorInvalidMemcpyDirection); + printf("cudaErrorAddressOfConstant (Deprecated)-> %d\n", cudaErrorAddressOfConstant); + printf("cudaErrorTextureFetchFailed (Deprecated)-> %d\n", cudaErrorTextureFetchFailed); + printf("cudaErrorTextureNotBound (Deprecated)-> %d\n", cudaErrorTextureNotBound); + printf("cudaErrorSynchronizationError (Deprecated)-> %d\n", cudaErrorSynchronizationError); + 
printf("cudaErrorInvalidFilterSetting -> %d\n", cudaErrorInvalidFilterSetting); + printf("cudaErrorInvalidNormSetting -> %d\n", cudaErrorInvalidNormSetting); + printf("cudaErrorMixedDeviceExecution (Deprecated)-> %d\n", cudaErrorMixedDeviceExecution); + printf("cudaErrorNotYetImplemented (Deprecated)-> %d\n", cudaErrorNotYetImplemented); + printf("cudaErrorMemoryValueTooLarge (Deprecated)-> %d\n", cudaErrorMemoryValueTooLarge); + printf("cudaErrorInsufficientDriver -> %d\n", cudaErrorInsufficientDriver); + printf("cudaErrorInvalidSurface -> %d\n", cudaErrorInvalidSurface); + printf("cudaErrorDuplicateVariableName -> %d\n", cudaErrorDuplicateVariableName); + printf("cudaErrorDuplicateTextureName -> %d\n", cudaErrorDuplicateTextureName); + printf("cudaErrorDuplicateSurfaceName -> %d\n", cudaErrorDuplicateSurfaceName); + printf("cudaErrorDevicesUnavailable -> %d\n", cudaErrorDevicesUnavailable); + printf("cudaErrorIncompatibleDriverContext -> %d\n", cudaErrorIncompatibleDriverContext); + printf("cudaErrorMissingConfiguration -> %d\n", cudaErrorMissingConfiguration); + printf("cudaErrorPriorLaunchFailure (Deprecated)-> %d\n", cudaErrorPriorLaunchFailure); + printf("cudaErrorLaunchMaxDepthExceeded -> %d\n", cudaErrorLaunchMaxDepthExceeded); + printf("cudaErrorLaunchFileScopedTex -> %d\n", cudaErrorLaunchFileScopedTex); + printf("cudaErrorLaunchFileScopedSurf -> %d\n", cudaErrorLaunchFileScopedSurf); + printf("cudaErrorSyncDepthExceeded -> %d\n", cudaErrorSyncDepthExceeded); + printf("cudaErrorLaunchPendingCountExceeded -> %d\n", cudaErrorLaunchPendingCountExceeded); + printf("cudaErrorInvalidDeviceFunction -> %d\n", cudaErrorInvalidDeviceFunction); + printf("cudaErrorNoDevice -> %d\n", cudaErrorNoDevice); + printf("cudaErrorInvalidDevice -> %d\n", cudaErrorInvalidDevice); + printf("cudaErrorStartupFailure -> %d\n", cudaErrorStartupFailure); + printf("cudaErrorInvalidKernelImage -> %d\n", cudaErrorInvalidKernelImage); + #if (CUDART_VERSION == 1100) + printf("cudaErrorDeviceUninitialized -> %d\n", cudaErrorDeviceUninitialized); + #endif + printf("cudaErrorMapBufferObjectFailed -> %d\n", cudaErrorMapBufferObjectFailed); + printf("cudaErrorUnmapBufferObjectFailed -> %d\n", cudaErrorUnmapBufferObjectFailed); + #if (CUDART_VERSION == 1010) + printf("cudaErrorArrayIsMapped -> %d\n", cudaErrorArrayIsMapped); + printf("cudaErrorAlreadyMapped -> %d\n", cudaErrorAlreadyMapped); + #endif + printf("cudaErrorNoKernelImageForDevice -> %d\n", cudaErrorNoKernelImageForDevice); + #if (CUDART_VERSION == 1010) + printf("cudaErrorAlreadyAcquired -> %d\n", cudaErrorAlreadyAcquired); + printf("cudaErrorNotMapped -> %d\n", cudaErrorNotMapped); + printf("cudaErrorNotMappedAsArray -> %d\n", cudaErrorNotMappedAsArray); + printf("cudaErrorNotMappedAsPointer -> %d\n", cudaErrorNotMappedAsPointer); + #endif + printf("cudaErrorECCUncorrectable -> %d\n", cudaErrorECCUncorrectable); + printf("cudaErrorUnsupportedLimit -> %d\n", cudaErrorUnsupportedLimit); + printf("cudaErrorDeviceAlreadyInUse -> %d\n", cudaErrorDeviceAlreadyInUse); + printf("cudaErrorPeerAccessUnsupported -> %d\n", cudaErrorPeerAccessUnsupported); + printf("cudaErrorInvalidPtx -> %d\n", cudaErrorInvalidPtx); + printf("cudaErrorInvalidGraphicsContext -> %d\n", cudaErrorInvalidGraphicsContext); + printf("cudaErrorNvlinkUncorrectable -> %d\n", cudaErrorNvlinkUncorrectable); + printf("cudaErrorJitCompilerNotFound -> %d\n", cudaErrorJitCompilerNotFound); + #if (CUDART_VERSION == 1010) + printf("cudaErrorInvalidSource -> %d\n", cudaErrorInvalidSource); + 
printf("cudaErrorFileNotFound -> %d\n", cudaErrorFileNotFound); + #endif + printf("cudaErrorSharedObjectSymbolNotFound -> %d\n", cudaErrorSharedObjectSymbolNotFound); + printf("cudaErrorSharedObjectInitFailed -> %d\n", cudaErrorSharedObjectInitFailed); + printf("cudaErrorOperatingSystem -> %d\n", cudaErrorOperatingSystem); + printf("cudaErrorInvalidResourceHandle -> %d\n", cudaErrorInvalidResourceHandle); + #if (CUDART_VERSION == 1010) + printf("cudaErrorIllegalState -> %d\n", cudaErrorIllegalState); + printf("cudaErrorSymbolNotFound -> %d\n", cudaErrorSymbolNotFound); + #endif + printf("cudaErrorNotReady -> %d\n", cudaErrorNotReady); + printf("cudaErrorIllegalAddress -> %d\n", cudaErrorIllegalAddress); + printf("cudaErrorLaunchOutOfResources -> %d\n", cudaErrorLaunchOutOfResources); + printf("cudaErrorLaunchTimeout -> %d\n", cudaErrorLaunchTimeout); + #if (CUDART_VERSION == 1010) + printf("cudaErrorLaunchIncompatibleTexturing-> %d\n", cudaErrorLaunchIncompatibleTexturing); + #endif + printf("cudaErrorPeerAccessAlreadyEnabled -> %d\n", cudaErrorPeerAccessAlreadyEnabled); + printf("cudaErrorPeerAccessNotEnabled -> %d\n", cudaErrorPeerAccessNotEnabled); + printf("cudaErrorSetOnActiveProcess -> %d\n", cudaErrorSetOnActiveProcess); + #if (CUDART_VERSION == 1010) + printf("cudaErrorContextIsDestroyed -> %d\n", cudaErrorContextIsDestroyed); + #endif + printf("cudaErrorAssert -> %d\n", cudaErrorAssert); + printf("cudaErrorTooManyPeers -> %d\n", cudaErrorTooManyPeers); + printf("cudaErrorHostMemoryAlreadyRegistered-> %d\n", cudaErrorHostMemoryAlreadyRegistered); + printf("cudaErrorHostMemoryNotRegistered -> %d\n", cudaErrorHostMemoryNotRegistered); + printf("cudaErrorHardwareStackError -> %d\n", cudaErrorHardwareStackError); + printf("cudaErrorIllegalInstruction -> %d\n", cudaErrorIllegalInstruction); + printf("cudaErrorMisalignedAddress -> %d\n", cudaErrorMisalignedAddress); + printf("cudaErrorInvalidAddressSpace -> %d\n", cudaErrorInvalidAddressSpace); + printf("cudaErrorInvalidPc -> %d\n", cudaErrorInvalidPc); + printf("cudaErrorLaunchFailure -> %d\n", cudaErrorLaunchFailure); + printf("cudaErrorCooperativeLaunchTooLarge -> %d\n", cudaErrorCooperativeLaunchTooLarge); + printf("cudaErrorNotPermitted -> %d\n", cudaErrorNotPermitted); + printf("cudaErrorNotSupported -> %d\n", cudaErrorNotSupported); + #if (CUDART_VERSION == 1010) + printf("cudaErrorSystemNotReady -> %d\n", cudaErrorSystemNotReady); + printf("cudaErrorSystemDriverMismatch -> %d\n", cudaErrorSystemDriverMismatch); + printf("cudaErrorCompatNotSupportedOnDevice -> %d\n", cudaErrorCompatNotSupportedOnDevice); + printf("cudaErrorStreamCaptureUnsupported -> %d\n", cudaErrorStreamCaptureUnsupported); + printf("cudaErrorStreamCaptureInvalidated -> %d\n", cudaErrorStreamCaptureInvalidated); + printf("cudaErrorStreamCaptureMerge -> %d\n", cudaErrorStreamCaptureMerge); + printf("cudaErrorStreamCaptureUnmatched -> %d\n", cudaErrorStreamCaptureUnmatched); + printf("cudaErrorStreamCaptureUnjoined -> %d\n", cudaErrorStreamCaptureUnjoined); + printf("cudaErrorStreamCaptureIsolation -> %d\n", cudaErrorStreamCaptureIsolation); + printf("cudaErrorStreamCaptureImplicit -> %d\n", cudaErrorStreamCaptureImplicit); + printf("cudaErrorCapturedEvent -> %d\n", cudaErrorCapturedEvent); + printf("cudaErrorStreamCaptureWrongThread -> %d\n", cudaErrorStreamCaptureWrongThread); + #endif + #if (CUDART_VERSION == 1100) + printf("cudaErrorTimeout -> %d\n", cudaErrorTimeout); + printf("cudaErrorGraphExecUpdateFailure -> %d\n", cudaErrorGraphExecUpdateFailure); + 
#endif + printf("cudaErrorUnknown -> %d\n", cudaErrorUnknown); + printf("cudaErrorApiFailureBase (Deprecated)-> %d\n", cudaErrorApiFailureBase); + */ + throw cusz_cuda_exception(cudaGetErrorString(status), status, file, line); + } +} + +#define CHECK_CUDA(err) (check_cuda_error(err, __FILE__, __LINE__)) + +#endif diff --git a/qtensor/compression/cusz/include/utils/cuda_mem.cuh b/qtensor/compression/cusz/include/utils/cuda_mem.cuh index 723028ab..46e52e33 100644 --- a/qtensor/compression/cusz/include/utils/cuda_mem.cuh +++ b/qtensor/compression/cusz/include/utils/cuda_mem.cuh @@ -1,100 +1,100 @@ -#ifndef UTILS_CUDA_MEM_CUH -#define UTILS_CUDA_MEM_CUH - -/** - * @file cuda_mem.cuh - * @author Jiannan Tian - * @brief CUDA memory operation wrappers. - * @version 0.2 - * @date 2020-09-20 - * Created on 2020-04-30 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#include -#include -#include -#include -#include -#include - -template -static inline bool __is_aligned_at(const void* ptr) -{ // - return reinterpret_cast(ptr) % NUM == 0; -}; - -template -static size_t __cusz_get_alignable_len(size_t len) -{ - return ((sizeof(T) * len - 1) / NUM + 1) * NUM; -} - -static const int CUSZ_ALIGN_NUM = 128; - -/** - * @brief when using memory pool, alignment at 128 is necessary - * - * @tparam SRC - * @tparam DST - * @param src - * @return DST* - */ -template -DST* designate(SRC* src) -{ - // TODO check alignment - auto aligned = __is_aligned_at(src); - if (not aligned) throw std::runtime_error("not aligned at " + std::to_string(CUSZ_ALIGN_NUM) + " bytes"); - - return reinterpret_cast(src); -} - -template -DST* free_repurpose(SRC* src) -{ - // aligning at 4 byte; does not raise misalignment - // may not result in optimal performance considering coalescing - auto aligned = __is_aligned_at<4>(src); - if (not aligned) throw std::runtime_error("not aligned at 4 bytes"); - - return reinterpret_cast(src); -} - -namespace mem { - -enum MemcpyDirection { h2d, d2h }; - -template -inline T* create_CUDA_space(size_t len, uint8_t filling_val = 0x00) -{ - T* d_var; - cudaMalloc(&d_var, len * sizeof(T)); - cudaMemset(d_var, filling_val, len * sizeof(T)); - return d_var; -} - -template -inline T* create_devspace_memcpy_h2d(T* var, size_t l) -{ - T* d_var; - cudaMalloc(&d_var, l * sizeof(T)); - cudaMemcpy(d_var, var, l * sizeof(T), cudaMemcpyHostToDevice); - return d_var; -} -template -inline T* create_devspace_memcpy_d2h(T* d_var, size_t l) -{ - // auto var = new T[l]; - T* var; - cudaMallocHost(&var, l * sizeof(T)); - cudaMemcpy(var, d_var, l * sizeof(T), cudaMemcpyDeviceToHost); - return var; -} - -} // namespace mem - -#endif +#ifndef UTILS_CUDA_MEM_CUH +#define UTILS_CUDA_MEM_CUH + +/** + * @file cuda_mem.cuh + * @author Jiannan Tian + * @brief CUDA memory operation wrappers. 
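A usage sketch for the CHECK_CUDA macro and cusz_cuda_exception defined in cuda_err.cuh above; the allocation size and the fallback behaviour are illustrative.

#include <cstddef>
#include <cstdio>
#include <cuda_runtime.h>
// #include "utils/cuda_err.cuh"

float* checked_alloc(size_t len)
{
    float* d_buf = nullptr;
    try {
        CHECK_CUDA(cudaMalloc(&d_buf, len * sizeof(float)));
        CHECK_CUDA(cudaMemset(d_buf, 0, len * sizeof(float)));
    }
    catch (const cusz_cuda_exception& e) {
        std::printf("%s\n", e.what());  // file:line plus the CUDA error string
        return nullptr;
    }
    return d_buf;
}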
+ * @version 0.2 + * @date 2020-09-20 + * Created on 2020-04-30 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include +#include +#include +#include +#include +#include + +template +static inline bool __is_aligned_at(const void* ptr) +{ // + return reinterpret_cast(ptr) % NUM == 0; +}; + +template +static size_t __cusz_get_alignable_len(size_t len) +{ + return ((sizeof(T) * len - 1) / NUM + 1) * NUM; +} + +static const int CUSZ_ALIGN_NUM = 128; + +/** + * @brief when using memory pool, alignment at 128 is necessary + * + * @tparam SRC + * @tparam DST + * @param src + * @return DST* + */ +template +DST* designate(SRC* src) +{ + // TODO check alignment + auto aligned = __is_aligned_at(src); + if (not aligned) throw std::runtime_error("not aligned at " + std::to_string(CUSZ_ALIGN_NUM) + " bytes"); + + return reinterpret_cast(src); +} + +template +DST* free_repurpose(SRC* src) +{ + // aligning at 4 byte; does not raise misalignment + // may not result in optimal performance considering coalescing + auto aligned = __is_aligned_at<4>(src); + if (not aligned) throw std::runtime_error("not aligned at 4 bytes"); + + return reinterpret_cast(src); +} + +namespace mem { + +enum MemcpyDirection { h2d, d2h }; + +template +inline T* create_CUDA_space(size_t len, uint8_t filling_val = 0x00) +{ + T* d_var; + cudaMalloc(&d_var, len * sizeof(T)); + cudaMemset(d_var, filling_val, len * sizeof(T)); + return d_var; +} + +template +inline T* create_devspace_memcpy_h2d(T* var, size_t l) +{ + T* d_var; + cudaMalloc(&d_var, l * sizeof(T)); + cudaMemcpy(d_var, var, l * sizeof(T), cudaMemcpyHostToDevice); + return d_var; +} +template +inline T* create_devspace_memcpy_d2h(T* d_var, size_t l) +{ + // auto var = new T[l]; + T* var; + cudaMallocHost(&var, l * sizeof(T)); + cudaMemcpy(var, d_var, l * sizeof(T), cudaMemcpyDeviceToHost); + return var; +} + +} // namespace mem + +#endif diff --git a/qtensor/compression/cusz/include/utils/cusparse_err.cuh b/qtensor/compression/cusz/include/utils/cusparse_err.cuh index 2086ca44..e2f77bb6 100644 --- a/qtensor/compression/cusz/include/utils/cusparse_err.cuh +++ b/qtensor/compression/cusz/include/utils/cusparse_err.cuh @@ -1,60 +1,60 @@ -#ifndef UTILS_CUSPARSE_ERR_CUH -#define UTILS_CUSPARSE_ERR_CUH - -/** - * @file cuda_err.cuh - * @author Jiannan Tian - * @brief CUDA runtime error handling macros. 
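A sketch of the cuda_mem.cuh helpers above for staging an array on the device and copying results back; n and the kernel step are placeholders.

#include <cstddef>
#include <cuda_runtime.h>
// #include "utils/cuda_mem.cuh"

void staging_demo(float* h_input, size_t n)
{
    float* d_scratch = mem::create_CUDA_space<float>(n);             // cudaMalloc + cudaMemset to 0
    float* d_input   = mem::create_devspace_memcpy_h2d(h_input, n);  // cudaMalloc + H2D copy

    // ... launch kernels on d_input / d_scratch here ...

    float* h_copy = mem::create_devspace_memcpy_d2h(d_input, n);     // pinned host buffer + D2H copy

    cudaFree(d_scratch);
    cudaFree(d_input);
    cudaFreeHost(h_copy);  // allocated with cudaMallocHost inside the helper
}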
- * @version 0.2 - * @date 2020-09-20 - * Created on: 2019-10-08 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#include -#include -#include - -// block cusparse for generic testing -#ifndef noCUSPARSE - -static void check_cusparse_error(cusparseStatus_t status, const char* file, int line) -{ - if (CUSPARSE_STATUS_SUCCESS != status) { - printf("\nCUSPARSE status reference (as of CUDA 11):\n"); - printf("CUSPARSE_STATUS_SUCCESS -> %d\n", CUSPARSE_STATUS_SUCCESS); - printf("CUSPARSE_STATUS_NOT_INITIALIZED -> %d\n", CUSPARSE_STATUS_NOT_INITIALIZED); - printf("CUSPARSE_STATUS_ALLOC_FAILED -> %d\n", CUSPARSE_STATUS_ALLOC_FAILED); - printf("CUSPARSE_STATUS_INVALID_VALUE -> %d\n", CUSPARSE_STATUS_INVALID_VALUE); - printf("CUSPARSE_STATUS_ARCH_MISMATCH -> %d\n", CUSPARSE_STATUS_ARCH_MISMATCH); - printf("CUSPARSE_STATUS_EXECUTION_FAILED -> %d\n", CUSPARSE_STATUS_EXECUTION_FAILED); - printf("CUSPARSE_STATUS_INTERNAL_ERROR -> %d\n", CUSPARSE_STATUS_INTERNAL_ERROR); - printf("CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED -> %d\n", CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED); -#if (CUDART_VERSION == 1010) - printf("CUSPARSE_STATUS_NOT_SUPPORTED -> %d\n", CUSPARSE_STATUS_NOT_SUPPORTED); -#endif -#if (CUDART_VERSION == 1100) - printf("CUSPARSE_STATUS_INSUFFICIENT_RESOURCES -> %d\n", CUSPARSE_STATUS_INSUFFICIENT_RESOURCES); -#endif -#if (CUDART_VERSION == 1100) - printf("CUSPARSE_STATUS_INSUFFICIENT_RESOURCES -> %d\n", CUSPARSE_STATUS_INSUFFICIENT_RESOURCES); -#endif - printf("\n"); - -#if (CUDART_VERSION >= 1010) - printf( - "CUSPARSE API failed at \e[31m\e[1m%s:%d\e[0m with error: %s (%d)\n", file, line, - cusparseGetErrorString(status), status); -#endif - exit(EXIT_FAILURE); - } -} - -#define CHECK_CUSPARSE(err) (check_cusparse_error(err, __FILE__, __LINE__)) - -#endif - -#endif +#ifndef UTILS_CUSPARSE_ERR_CUH +#define UTILS_CUSPARSE_ERR_CUH + +/** + * @file cuda_err.cuh + * @author Jiannan Tian + * @brief CUDA runtime error handling macros. 
+ * @version 0.2 + * @date 2020-09-20 + * Created on: 2019-10-08 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include +#include +#include + +// block cusparse for generic testing +#ifndef noCUSPARSE + +static void check_cusparse_error(cusparseStatus_t status, const char* file, int line) +{ + if (CUSPARSE_STATUS_SUCCESS != status) { + printf("\nCUSPARSE status reference (as of CUDA 11):\n"); + printf("CUSPARSE_STATUS_SUCCESS -> %d\n", CUSPARSE_STATUS_SUCCESS); + printf("CUSPARSE_STATUS_NOT_INITIALIZED -> %d\n", CUSPARSE_STATUS_NOT_INITIALIZED); + printf("CUSPARSE_STATUS_ALLOC_FAILED -> %d\n", CUSPARSE_STATUS_ALLOC_FAILED); + printf("CUSPARSE_STATUS_INVALID_VALUE -> %d\n", CUSPARSE_STATUS_INVALID_VALUE); + printf("CUSPARSE_STATUS_ARCH_MISMATCH -> %d\n", CUSPARSE_STATUS_ARCH_MISMATCH); + printf("CUSPARSE_STATUS_EXECUTION_FAILED -> %d\n", CUSPARSE_STATUS_EXECUTION_FAILED); + printf("CUSPARSE_STATUS_INTERNAL_ERROR -> %d\n", CUSPARSE_STATUS_INTERNAL_ERROR); + printf("CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED -> %d\n", CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED); +#if (CUDART_VERSION == 1010) + printf("CUSPARSE_STATUS_NOT_SUPPORTED -> %d\n", CUSPARSE_STATUS_NOT_SUPPORTED); +#endif +#if (CUDART_VERSION == 1100) + printf("CUSPARSE_STATUS_INSUFFICIENT_RESOURCES -> %d\n", CUSPARSE_STATUS_INSUFFICIENT_RESOURCES); +#endif +#if (CUDART_VERSION == 1100) + printf("CUSPARSE_STATUS_INSUFFICIENT_RESOURCES -> %d\n", CUSPARSE_STATUS_INSUFFICIENT_RESOURCES); +#endif + printf("\n"); + +#if (CUDART_VERSION >= 1010) + printf( + "CUSPARSE API failed at \e[31m\e[1m%s:%d\e[0m with error: %s (%d)\n", file, line, + cusparseGetErrorString(status), status); +#endif + exit(EXIT_FAILURE); + } +} + +#define CHECK_CUSPARSE(err) (check_cusparse_error(err, __FILE__, __LINE__)) + +#endif + +#endif diff --git a/qtensor/compression/cusz/include/utils/format.hh b/qtensor/compression/cusz/include/utils/format.hh index 196f7248..ae1d6079 100644 --- a/qtensor/compression/cusz/include/utils/format.hh +++ b/qtensor/compression/cusz/include/utils/format.hh @@ -1,57 +1,57 @@ -#ifndef UTILS_FORMAT_HH -#define UTILS_FORMAT_HH - -/** - * @file format.hh - * @author Jiannan Tian - * @brief Formatting for log print (header). - * @version 0.2 - * @date 2020-09-20 - * Created on 2020-04-27 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#include -#include -#include - - -const std::string LOG_NULL = " "; -const std::string LOG_INFO = " :: "; -const std::string LOG_ERR = " ERR "; -const std::string LOG_WARN = "WARN "; -const std::string LOG_DBG = " dbg "; -const std::string LOG_EXCEPTION = " !! "; - -// https://stackoverflow.com/a/26080768/8740097 CC BY-SA 3.0 -template -void build(std::ostream& o, T t) -{ - o << t << " "; -} - -template -void build(std::ostream& o, T t, Args... args) // recursive variadic function -{ - build(o, t); - build(o, args...); -} - -template -void LOGGING(const std::string& log_head, Args... 
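CHECK_CUSPARSE in cusparse_err.cuh mirrors CHECK_CUDA but prints the status reference table and exits rather than throwing; a minimal sketch.

#include <cusparse.h>
// #include "utils/cusparse_err.cuh"

cusparseHandle_t make_handle()
{
    cusparseHandle_t handle = nullptr;
    CHECK_CUSPARSE(cusparseCreate(&handle));  // exits with a decoded status on failure
    return handle;
}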
args) -{ - std::ostringstream oss; - oss << log_head; - build(oss, args...); - - oss.seekp(0, std::ios::end); - std::stringstream::pos_type offset = oss.tellp(); - if (log_head == LOG_DBG) { std::cout << "\e[2m"; } // dbg - std::cout << oss.str() << std::endl; // print content - if (log_head == LOG_DBG) std::cout << "\e[0m"; // finish printing dbg -} - -#endif // FORMAT_HH +#ifndef UTILS_FORMAT_HH +#define UTILS_FORMAT_HH + +/** + * @file format.hh + * @author Jiannan Tian + * @brief Formatting for log print (header). + * @version 0.2 + * @date 2020-09-20 + * Created on 2020-04-27 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include +#include +#include + + +const std::string LOG_NULL = " "; +const std::string LOG_INFO = " :: "; +const std::string LOG_ERR = " ERR "; +const std::string LOG_WARN = "WARN "; +const std::string LOG_DBG = " dbg "; +const std::string LOG_EXCEPTION = " !! "; + +// https://stackoverflow.com/a/26080768/8740097 CC BY-SA 3.0 +template +void build(std::ostream& o, T t) +{ + o << t << " "; +} + +template +void build(std::ostream& o, T t, Args... args) // recursive variadic function +{ + build(o, t); + build(o, args...); +} + +template +void LOGGING(const std::string& log_head, Args... args) +{ + std::ostringstream oss; + oss << log_head; + build(oss, args...); + + oss.seekp(0, std::ios::end); + std::stringstream::pos_type offset = oss.tellp(); + if (log_head == LOG_DBG) { std::cout << "\e[2m"; } // dbg + std::cout << oss.str() << std::endl; // print content + if (log_head == LOG_DBG) std::cout << "\e[0m"; // finish printing dbg +} + +#endif // FORMAT_HH diff --git a/qtensor/compression/cusz/include/utils/io.hh b/qtensor/compression/cusz/include/utils/io.hh index de71334d..574432ef 100644 --- a/qtensor/compression/cusz/include/utils/io.hh +++ b/qtensor/compression/cusz/include/utils/io.hh @@ -1,59 +1,59 @@ -#ifndef UTILS_IO_HH -#define UTILS_IO_HH - -/** - * @file io.hh - * @author Jiannan Tian - * @brief Read and write binary. 
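The LOGGING helper in format.hh above is a recursive variadic printer keyed by the LOG_* prefix strings; a short sketch with illustrative messages.

#include <cstddef>
#include <string>
// #include "utils/format.hh"

void report(const std::string& fname, size_t len, double seconds)
{
    LOGGING(LOG_INFO, "compressing", fname, "len =", len);
    LOGGING(LOG_DBG, "kernel time (s):", seconds);  // LOG_DBG lines are dimmed via the \e[2m escape
    if (seconds < 0) LOGGING(LOG_ERR, "negative timing reported");
}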
- * @version 0.2 - * @date 2020-09-20 - * Created on 2019-08-27 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#include -#include - -namespace io { - -template -T* read_binary_to_new_array(const std::string& fname, size_t dtype_len) -{ - std::ifstream ifs(fname.c_str(), std::ios::binary | std::ios::in); - if (not ifs.is_open()) { - std::cerr << "fail to open " << fname << std::endl; - exit(1); - } - auto _a = new T[dtype_len](); - ifs.read(reinterpret_cast(_a), std::streamsize(dtype_len * sizeof(T))); - ifs.close(); - return _a; -} - -template -void read_binary_to_array(const std::string& fname, T* _a, size_t dtype_len) -{ - std::ifstream ifs(fname.c_str(), std::ios::binary | std::ios::in); - if (not ifs.is_open()) { - std::cerr << "fail to open " << fname << std::endl; - exit(1); - } - ifs.read(reinterpret_cast(_a), std::streamsize(dtype_len * sizeof(T))); - ifs.close(); -} - -template -void write_array_to_binary(const std::string& fname, T* const _a, size_t const dtype_len) -{ - std::ofstream ofs(fname.c_str(), std::ios::binary | std::ios::out); - if (not ofs.is_open()) return; - ofs.write(reinterpret_cast(_a), std::streamsize(dtype_len * sizeof(T))); - ofs.close(); -} - -} // namespace io - -#endif // IO_HH +#ifndef UTILS_IO_HH +#define UTILS_IO_HH + +/** + * @file io.hh + * @author Jiannan Tian + * @brief Read and write binary. + * @version 0.2 + * @date 2020-09-20 + * Created on 2019-08-27 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include +#include + +namespace io { + +template +T* read_binary_to_new_array(const std::string& fname, size_t dtype_len) +{ + std::ifstream ifs(fname.c_str(), std::ios::binary | std::ios::in); + if (not ifs.is_open()) { + std::cerr << "fail to open " << fname << std::endl; + exit(1); + } + auto _a = new T[dtype_len](); + ifs.read(reinterpret_cast(_a), std::streamsize(dtype_len * sizeof(T))); + ifs.close(); + return _a; +} + +template +void read_binary_to_array(const std::string& fname, T* _a, size_t dtype_len) +{ + std::ifstream ifs(fname.c_str(), std::ios::binary | std::ios::in); + if (not ifs.is_open()) { + std::cerr << "fail to open " << fname << std::endl; + exit(1); + } + ifs.read(reinterpret_cast(_a), std::streamsize(dtype_len * sizeof(T))); + ifs.close(); +} + +template +void write_array_to_binary(const std::string& fname, T* const _a, size_t const dtype_len) +{ + std::ofstream ofs(fname.c_str(), std::ios::binary | std::ios::out); + if (not ofs.is_open()) return; + ofs.write(reinterpret_cast(_a), std::streamsize(dtype_len * sizeof(T))); + ofs.close(); +} + +} // namespace io + +#endif // IO_HH diff --git a/qtensor/compression/cusz/include/utils/print_gpu.h b/qtensor/compression/cusz/include/utils/print_gpu.h index 67dcc30a..d4cded5e 100644 --- a/qtensor/compression/cusz/include/utils/print_gpu.h +++ b/qtensor/compression/cusz/include/utils/print_gpu.h @@ -1,45 +1,45 @@ -/** - * @file print.h - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-10-28 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef E02AE628_9C8A_4100_8C73_A3B74B7128F6 -#define E02AE628_9C8A_4100_8C73_A3B74B7128F6 - -#ifdef __cplusplus -extern "C" { -#endif - -#define PRINT_INT_LESS_THAN_64(Tliteral, T) void peek_device_data_T##Tliteral(T* d_arr, size_t num, size_t offset); - 
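A sketch of the io.hh binary helpers above used as a round trip; the path and element count are placeholders, and the returned buffer is new[]-allocated so the caller must delete[] it.

#include <cstddef>
#include <string>
// #include "utils/io.hh"

void io_roundtrip(const std::string& path, size_t len)
{
    float* data = io::read_binary_to_new_array<float>(path, len);
    // ... use or modify data ...
    io::write_array_to_binary(path + ".out", data, len);
    delete[] data;
}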
-PRINT_INT_LESS_THAN_64(i8, int8_t) -PRINT_INT_LESS_THAN_64(i16, int16_t) -PRINT_INT_LESS_THAN_64(i32, int32_t) - -void peek_device_data_Ti64(int64_t* d_arr, size_t num, size_t offset); - -#define PRINT_UINT_LESS_THAN_64(Tliteral, T) void peek_device_data_T##Tliteral(T* d_arr, size_t num, size_t offset); - -PRINT_UINT_LESS_THAN_64(ui8, uint8_t) -PRINT_UINT_LESS_THAN_64(ui16, uint16_t) -PRINT_UINT_LESS_THAN_64(ui32, uint32_t) - -void peek_device_data_Tui64(uint64_t* d_arr, size_t num, size_t offset); - -void peek_device_data_Tfp32(float* d_arr, size_t num, size_t offset); -void peek_device_data_Tfp64(double* d_arr, size_t num, size_t offset); - -#undef PRINT_INT_LESS_THAN_64 -#undef PRINT_UINT_LESS_THAN_64 - -#ifdef __cplusplus -} -#endif - -#endif /* E02AE628_9C8A_4100_8C73_A3B74B7128F6 */ +/** + * @file print.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-28 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef E02AE628_9C8A_4100_8C73_A3B74B7128F6 +#define E02AE628_9C8A_4100_8C73_A3B74B7128F6 + +#ifdef __cplusplus +extern "C" { +#endif + +#define PRINT_INT_LESS_THAN_64(Tliteral, T) void peek_device_data_T##Tliteral(T* d_arr, size_t num, size_t offset); + +PRINT_INT_LESS_THAN_64(i8, int8_t) +PRINT_INT_LESS_THAN_64(i16, int16_t) +PRINT_INT_LESS_THAN_64(i32, int32_t) + +void peek_device_data_Ti64(int64_t* d_arr, size_t num, size_t offset); + +#define PRINT_UINT_LESS_THAN_64(Tliteral, T) void peek_device_data_T##Tliteral(T* d_arr, size_t num, size_t offset); + +PRINT_UINT_LESS_THAN_64(ui8, uint8_t) +PRINT_UINT_LESS_THAN_64(ui16, uint16_t) +PRINT_UINT_LESS_THAN_64(ui32, uint32_t) + +void peek_device_data_Tui64(uint64_t* d_arr, size_t num, size_t offset); + +void peek_device_data_Tfp32(float* d_arr, size_t num, size_t offset); +void peek_device_data_Tfp64(double* d_arr, size_t num, size_t offset); + +#undef PRINT_INT_LESS_THAN_64 +#undef PRINT_UINT_LESS_THAN_64 + +#ifdef __cplusplus +} +#endif + +#endif /* E02AE628_9C8A_4100_8C73_A3B74B7128F6 */ diff --git a/qtensor/compression/cusz/include/utils/print_gpu.hh b/qtensor/compression/cusz/include/utils/print_gpu.hh index cffcbf22..c3236f62 100644 --- a/qtensor/compression/cusz/include/utils/print_gpu.hh +++ b/qtensor/compression/cusz/include/utils/print_gpu.hh @@ -1,21 +1,21 @@ -/** - * @file print_gpu.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-10-29 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "print_gpu.h" - -namespace psz { - -template -void peek_device_data(T* d_arr, size_t num, size_t offset = 0); - -} // namespace psz - -#undef PEEK_DEVICE_DATA +/** + * @file print_gpu.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-29 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "print_gpu.h" + +namespace psz { + +template +void peek_device_data(T* d_arr, size_t num, size_t offset = 0); + +} // namespace psz + +#undef PEEK_DEVICE_DATA diff --git a/qtensor/compression/cusz/include/utils/strhelper.hh b/qtensor/compression/cusz/include/utils/strhelper.hh index 6768edeb..a95dc96f 100644 --- a/qtensor/compression/cusz/include/utils/strhelper.hh +++ b/qtensor/compression/cusz/include/utils/strhelper.hh @@ -1,144 +1,144 @@ -/** - * @file strhelper.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2021-09-19 - * - * (C) 2021 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef CUSZ_UTILS_STRHELPER_HH -#define 
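The print_gpu entry points peek at device arrays from the host; a sketch of the fp32 C function and the templated psz::peek_device_data wrapper declared in print_gpu.hh (its instantiations are assumed to be compiled elsewhere in the library).

#include <cstddef>
#include <cstdint>
// #include "utils/print_gpu.hh"

void inspect(float* d_values, uint32_t* d_codes)
{
    peek_device_data_Tfp32(d_values, 16, 0);      // first 16 elements, no offset
    psz::peek_device_data<uint32_t>(d_codes, 8);  // offset defaults to 0
}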
CUSZ_UTILS_STRHELPER_HH - -#include -#include -#include -#include -#include -#include -#include "format.hh" - -using std::cerr; -using std::endl; - -using ss_t = std::stringstream; -using map_t = std::unordered_map; -using str_list = std::vector; - -struct StrHelper { - static unsigned int str2int(const char* s) - { - char* end; - auto res = std::strtol(s, &end, 10); - if (*end) { - const char* notif = "invalid option value, non-convertible part: "; - cerr << LOG_ERR << notif << "\e[1m" << s << "\e[0m" << endl; - } - return res; - } - - static unsigned int str2int(std::string s) { return str2int(s.c_str()); } - - static double str2fp(const char* s) - { - char* end; - auto res = std::strtod(s, &end); - if (*end) { - const char* notif = "invalid option value, non-convertible part: "; - cerr << LOG_ERR << notif << "\e[1m" << end << "\e[0m" << endl; - } - return res; - } - - static double str2fp(std::string s) { return str2fp(s.c_str()); } - - static bool is_kv_pair(std::string s) { return s.find("=") != std::string::npos; } - - static std::pair separate_kv(std::string& s) - { - std::string delimiter = "="; - - if (s.find(delimiter) == std::string::npos) - throw std::runtime_error("\e[1mnot a correct key-value syntax, must be \"opt=value\"\e[0m"); - - std::string k = s.substr(0, s.find(delimiter)); - std::string v = s.substr(s.find(delimiter) + delimiter.length(), std::string::npos); - - return std::make_pair(k, v); - } - - static void parse_strlist_as_kv(const char* in_str, map_t& kv_list) - { - ss_t ss(in_str); - while (ss.good()) { - std::string tmp; - std::getline(ss, tmp, ','); - kv_list.insert(separate_kv(tmp)); - } - } - - static void parse_strlist(const char* in_str, str_list& list) - { - ss_t ss(in_str); - while (ss.good()) { - std::string tmp; - std::getline(ss, tmp, ','); - list.push_back(tmp); - } - } - - static std::pair parse_kv_onoff(std::string in_str) - { - auto kv_literal = "(.*?)=(on|ON|off|OFF)"; - std::regex kv_pattern(kv_literal); - std::regex onoff_pattern("on|ON|off|OFF"); - - bool onoff = false; - std::string k, v; - - std::smatch kv_match; - if (std::regex_match(in_str, kv_match, kv_pattern)) { - // the 1st match: whole string - // the 2nd: k, the 3rd: v - if (kv_match.size() == 3) { - k = kv_match[1].str(), v = kv_match[2].str(); - - std::smatch v_match; - if (std::regex_match(v, v_match, onoff_pattern)) { // - onoff = (v == "on") or (v == "ON"); - } - else { - throw std::runtime_error("not legal (k=v)-syntax"); - } - } - } - return std::make_pair(k, onoff); - } - - static std::string doc_format(const std::string& s) - { - std::regex gray("%(.*?)%"); - std::string gray_text("\e[37m$1\e[0m"); - - std::regex bful("@(.*?)@"); - std::string bful_text("\e[1m\e[4m$1\e[0m"); - std::regex bf("\\*(.*?)\\*"); - std::string bf_text("\e[1m$1\e[0m"); - std::regex ul(R"(_((\w|-|\d|\.)+?)_)"); - std::string ul_text("\e[4m$1\e[0m"); - std::regex red(R"(\^\^(.*?)\^\^)"); - std::string red_text("\e[31m$1\e[0m"); - - auto a = std::regex_replace(s, bful, bful_text); - auto b = std::regex_replace(a, bf, bf_text); - auto c = std::regex_replace(b, ul, ul_text); - auto d = std::regex_replace(c, red, red_text); - auto e = std::regex_replace(d, gray, gray_text); - - return e; - } -}; - -#endif +/** + * @file strhelper.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2021-09-19 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_UTILS_STRHELPER_HH +#define CUSZ_UTILS_STRHELPER_HH + +#include +#include +#include +#include 
+#include +#include +#include "format.hh" + +using std::cerr; +using std::endl; + +using ss_t = std::stringstream; +using map_t = std::unordered_map; +using str_list = std::vector; + +struct StrHelper { + static unsigned int str2int(const char* s) + { + char* end; + auto res = std::strtol(s, &end, 10); + if (*end) { + const char* notif = "invalid option value, non-convertible part: "; + cerr << LOG_ERR << notif << "\e[1m" << s << "\e[0m" << endl; + } + return res; + } + + static unsigned int str2int(std::string s) { return str2int(s.c_str()); } + + static double str2fp(const char* s) + { + char* end; + auto res = std::strtod(s, &end); + if (*end) { + const char* notif = "invalid option value, non-convertible part: "; + cerr << LOG_ERR << notif << "\e[1m" << end << "\e[0m" << endl; + } + return res; + } + + static double str2fp(std::string s) { return str2fp(s.c_str()); } + + static bool is_kv_pair(std::string s) { return s.find("=") != std::string::npos; } + + static std::pair separate_kv(std::string& s) + { + std::string delimiter = "="; + + if (s.find(delimiter) == std::string::npos) + throw std::runtime_error("\e[1mnot a correct key-value syntax, must be \"opt=value\"\e[0m"); + + std::string k = s.substr(0, s.find(delimiter)); + std::string v = s.substr(s.find(delimiter) + delimiter.length(), std::string::npos); + + return std::make_pair(k, v); + } + + static void parse_strlist_as_kv(const char* in_str, map_t& kv_list) + { + ss_t ss(in_str); + while (ss.good()) { + std::string tmp; + std::getline(ss, tmp, ','); + kv_list.insert(separate_kv(tmp)); + } + } + + static void parse_strlist(const char* in_str, str_list& list) + { + ss_t ss(in_str); + while (ss.good()) { + std::string tmp; + std::getline(ss, tmp, ','); + list.push_back(tmp); + } + } + + static std::pair parse_kv_onoff(std::string in_str) + { + auto kv_literal = "(.*?)=(on|ON|off|OFF)"; + std::regex kv_pattern(kv_literal); + std::regex onoff_pattern("on|ON|off|OFF"); + + bool onoff = false; + std::string k, v; + + std::smatch kv_match; + if (std::regex_match(in_str, kv_match, kv_pattern)) { + // the 1st match: whole string + // the 2nd: k, the 3rd: v + if (kv_match.size() == 3) { + k = kv_match[1].str(), v = kv_match[2].str(); + + std::smatch v_match; + if (std::regex_match(v, v_match, onoff_pattern)) { // + onoff = (v == "on") or (v == "ON"); + } + else { + throw std::runtime_error("not legal (k=v)-syntax"); + } + } + } + return std::make_pair(k, onoff); + } + + static std::string doc_format(const std::string& s) + { + std::regex gray("%(.*?)%"); + std::string gray_text("\e[37m$1\e[0m"); + + std::regex bful("@(.*?)@"); + std::string bful_text("\e[1m\e[4m$1\e[0m"); + std::regex bf("\\*(.*?)\\*"); + std::string bf_text("\e[1m$1\e[0m"); + std::regex ul(R"(_((\w|-|\d|\.)+?)_)"); + std::string ul_text("\e[4m$1\e[0m"); + std::regex red(R"(\^\^(.*?)\^\^)"); + std::string red_text("\e[31m$1\e[0m"); + + auto a = std::regex_replace(s, bful, bful_text); + auto b = std::regex_replace(a, bf, bf_text); + auto c = std::regex_replace(b, ul, ul_text); + auto d = std::regex_replace(c, red, red_text); + auto e = std::regex_replace(d, gray, gray_text); + + return e; + } +}; + +#endif diff --git a/qtensor/compression/cusz/include/utils/timer.h b/qtensor/compression/cusz/include/utils/timer.h index c38cb0dd..41efb730 100644 --- a/qtensor/compression/cusz/include/utils/timer.h +++ b/qtensor/compression/cusz/include/utils/timer.h @@ -1,92 +1,92 @@ -/** - * @file timer.h - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-10-31 - * - * 
(C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef B36B7228_E9EC_4E61_A1DC_19A4352C4EB3 -#define B36B7228_E9EC_4E61_A1DC_19A4352C4EB3 - -#ifdef __cplusplus -extern "C" { -#endif - -#include "../cusz/type.h" - -struct asz_timer; -typedef struct asz_timer asz_timer; -typedef struct asz_timer asz_cputimer; - -struct asz_cudatimer; -typedef struct asz_cudatimer asz_cudatimer; - -// top-level/dispatcher -// asz_timer* asz_timer_create(asz_policy const p, void* stream); -// void asz_timer_destroy(asz_timer* t); -// void asz_timer_start(asz_timer* t); -// void asz_timer_end(asz_timer* t); -// double asz_time_elapsed(asz_timer* t); - -asz_timer* asz_cputimer_create(); -void asz_cputimer_destroy(asz_timer* t); -void asz_cputimer_start(asz_timer* t); -void asz_cputimer_end(asz_timer* t); -double asz_cputime_elapsed(asz_timer* t); - -// 22-11-01 adding wrapper incurs unexpeted overhead in timing -asz_cudatimer* asz_cudatimer_create(); -void asz_cudatimer_destroy(asz_cudatimer* t); -void asz_cudatimer_start(asz_cudatimer* t); -void asz_cudatimer_end(asz_cudatimer* t); -double asz_cudatime_elapsed(asz_cudatimer* t); - -asz_cudatimer* asz_cudastreamtimer_create(void* stream); -void asz_cudastreamtimer_destroy(asz_cudatimer* t); -void asz_cudastreamtimer_start(asz_cudatimer* t); -void asz_cudastreamtimer_end(asz_cudatimer* t); -double asz_cudastreamtime_elapsed(asz_cudatimer* t); - -// 22-11-01 CUDA timing snippet instead -#define CREATE_CUDAEVENT_PAIR \ - cudaEvent_t a, b; \ - cudaEventCreate(&a); \ - cudaEventCreate(&b); - -#define DESTROY_CUDAEVENT_PAIR \ - cudaEventDestroy(a); \ - cudaEventDestroy(b); - -#define START_CUDAEVENT_RECORDING(STREAM) cudaEventRecord(a, STREAM); -#define STOP_CUDAEVENT_RECORDING(STREAM) \ - cudaEventRecord(b, STREAM); \ - cudaEventSynchronize(b); - -#define TIME_ELAPSED_CUDAEVENT(PTR_MILLISEC) cudaEventElapsedTime(PTR_MILLISEC, a, b); - -// 22-11-01 HIP timing snippet instead -#define CREATE_HIPEVENT_PAIR \ - hipEvent_t a, b; \ - hipEventCreate(&a); \ - hipEventCreate(&b); - -#define DESTROY_HIPEVENT_PAIR \ - hipEventDestroy(a); \ - hipEventDestroy(b); - -#define START_HIPEVENT_RECORDING(STREAM) hipEventRecord(a, STREAM); -#define STOP_HIPEVENT_RECORDING(STREAM) \ - hipEventRecord(b, STREAM); \ - hipEventSynchronize(b); - -#define TIME_ELAPSED_HIPEVENT(PTR_MILLISEC) hipEventElapsedTime(PTR_MILLISEC, a, b); - -#ifdef __cplusplus -} -#endif - -#endif /* B36B7228_E9EC_4E61_A1DC_19A4352C4EB3 */ +/** + * @file timer.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-31 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef B36B7228_E9EC_4E61_A1DC_19A4352C4EB3 +#define B36B7228_E9EC_4E61_A1DC_19A4352C4EB3 + +#ifdef __cplusplus +extern "C" { +#endif + +#include "../cusz/type.h" + +struct asz_timer; +typedef struct asz_timer asz_timer; +typedef struct asz_timer asz_cputimer; + +struct asz_cudatimer; +typedef struct asz_cudatimer asz_cudatimer; + +// top-level/dispatcher +// asz_timer* asz_timer_create(asz_policy const p, void* stream); +// void asz_timer_destroy(asz_timer* t); +// void asz_timer_start(asz_timer* t); +// void asz_timer_end(asz_timer* t); +// double asz_time_elapsed(asz_timer* t); + +asz_timer* asz_cputimer_create(); +void asz_cputimer_destroy(asz_timer* t); +void asz_cputimer_start(asz_timer* t); +void asz_cputimer_end(asz_timer* t); +double asz_cputime_elapsed(asz_timer* t); + +// 22-11-01 adding wrapper incurs unexpeted overhead in timing +asz_cudatimer* asz_cudatimer_create(); 
+void asz_cudatimer_destroy(asz_cudatimer* t); +void asz_cudatimer_start(asz_cudatimer* t); +void asz_cudatimer_end(asz_cudatimer* t); +double asz_cudatime_elapsed(asz_cudatimer* t); + +asz_cudatimer* asz_cudastreamtimer_create(void* stream); +void asz_cudastreamtimer_destroy(asz_cudatimer* t); +void asz_cudastreamtimer_start(asz_cudatimer* t); +void asz_cudastreamtimer_end(asz_cudatimer* t); +double asz_cudastreamtime_elapsed(asz_cudatimer* t); + +// 22-11-01 CUDA timing snippet instead +#define CREATE_CUDAEVENT_PAIR \ + cudaEvent_t a, b; \ + cudaEventCreate(&a); \ + cudaEventCreate(&b); + +#define DESTROY_CUDAEVENT_PAIR \ + cudaEventDestroy(a); \ + cudaEventDestroy(b); + +#define START_CUDAEVENT_RECORDING(STREAM) cudaEventRecord(a, STREAM); +#define STOP_CUDAEVENT_RECORDING(STREAM) \ + cudaEventRecord(b, STREAM); \ + cudaEventSynchronize(b); + +#define TIME_ELAPSED_CUDAEVENT(PTR_MILLISEC) cudaEventElapsedTime(PTR_MILLISEC, a, b); + +// 22-11-01 HIP timing snippet instead +#define CREATE_HIPEVENT_PAIR \ + hipEvent_t a, b; \ + hipEventCreate(&a); \ + hipEventCreate(&b); + +#define DESTROY_HIPEVENT_PAIR \ + hipEventDestroy(a); \ + hipEventDestroy(b); + +#define START_HIPEVENT_RECORDING(STREAM) hipEventRecord(a, STREAM); +#define STOP_HIPEVENT_RECORDING(STREAM) \ + hipEventRecord(b, STREAM); \ + hipEventSynchronize(b); + +#define TIME_ELAPSED_HIPEVENT(PTR_MILLISEC) hipEventElapsedTime(PTR_MILLISEC, a, b); + +#ifdef __cplusplus +} +#endif + +#endif /* B36B7228_E9EC_4E61_A1DC_19A4352C4EB3 */ diff --git a/qtensor/compression/cusz/include/utils/timer.hh b/qtensor/compression/cusz/include/utils/timer.hh index 6ba7d35b..c820d451 100644 --- a/qtensor/compression/cusz/include/utils/timer.hh +++ b/qtensor/compression/cusz/include/utils/timer.hh @@ -1,153 +1,153 @@ -/** - * @file timer.hh - * @author Jiannan Tian - * @brief High-resolution timer wrapper from and util functions for timing both CPU and CUDA function - * @version 0.2 - * @date 2021-01-05 - * (created) 2019-08-26 (rev) 2021-12-23 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#ifndef UTILS_TIMER_HH -#define UTILS_TIMER_HH - -#include -#include - -using hires = std::chrono::high_resolution_clock; -using duration_t = std::chrono::duration; -using hires_clock_t = std::chrono::time_point; - -typedef struct Timer { - hires_clock_t start, end; - - void timer_start() { start = hires::now(); } - void timer_end() { end = hires::now(); } - double get_time_elapsed() { return static_cast(end - start).count(); } - -} host_timer_t; - -#ifdef __CUDACC__ - -/** - * @brief CUDA event based timer. 
Synopsis: - * cuda_timer_t t; - * t.timer_start(); - * kernel<<>>(...); - * t.timer_end(); - * cudaStreamSynchronize(stream); - * auto ms = t.get_time_elapsed(); - * - */ -typedef struct CUDATimer { - cudaEvent_t start, stop; - float milliseconds; - - // stream not involved - void timer_start() - { - cudaEventCreate(&start); - cudaEventCreate(&stop); - cudaEventRecord(start); - } - - void timer_end() - { - cudaEventRecord(stop); - cudaEventSynchronize(stop); - } - - // stream involved - void timer_start(cudaStream_t stream) - { - cudaEventCreate(&start); - cudaEventCreate(&stop); - - cudaEventRecord(start, stream); // set event as not occurred - } - - void timer_end(cudaStream_t stream) - { - cudaEventRecord(stop, stream); - cudaEventSynchronize(stop); // block host until `stream` meets `stop` - } - - // get time - float get_time_elapsed() - { - cudaEventElapsedTime(&milliseconds, start, stop); - return milliseconds; - } - -} cuda_timer_t; - -#endif - -// TODO handle return; testing -/** - * @brief A timer wrapper for arbitrary function (no handling return for now); - * Adapted from https://stackoverflow.com/a/33900479/8740097 (CC BY-SA 3.0) - * - * @tparam F auto function type - * @tparam Args variadic function argument type - * @param func non-return function to be timed - * @param args variadic function arguments - * @return double time in seconds - */ -template -double TimeThisRoutine(F func, Args&&... args) -{ - auto t0 = hires::now(); - func(std::forward(args)...); - return static_cast(hires::now() - t0).count(); -} - -#ifdef __CUDACC__ -typedef struct CUDAKernelConfig { - dim3 dim_grid; - dim3 dim_block; - size_t shmem_nbyte{0}; - cudaStream_t stream; - -} kernelcfg; - -// TODO use cudaEvent -/** - * @brief A timer wrapper for arbitrary CUDA function - * - * @tparam F auto function type - * @tparam Args variadic function argument type - * @param func CUDA kernel function to be time - * @param cfg CUDA kernel config - * @param args variadic function arguments - * @return double time in seconds - */ -template -float TimeThisCUDARoutine(F func, kernelcfg cfg, Args&&... args) -{ - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - - cudaEventRecord(start); - func<<>>( // - args... - // std::forward(args)... // also works - ); - cudaEventRecord(stop); - cudaEventSynchronize(stop); - - cudaStreamSynchronize(cfg.stream); - - float milliseconds; - cudaEventElapsedTime(&milliseconds, start, stop); - - return milliseconds; -} - -#endif - -#endif // UTILS_TIMER_HH +/** + * @file timer.hh + * @author Jiannan Tian + * @brief High-resolution timer wrapper from and util functions for timing both CPU and CUDA function + * @version 0.2 + * @date 2021-01-05 + * (created) 2019-08-26 (rev) 2021-12-23 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef UTILS_TIMER_HH +#define UTILS_TIMER_HH + +#include +#include + +using hires = std::chrono::high_resolution_clock; +using duration_t = std::chrono::duration; +using hires_clock_t = std::chrono::time_point; + +typedef struct Timer { + hires_clock_t start, end; + + void timer_start() { start = hires::now(); } + void timer_end() { end = hires::now(); } + double get_time_elapsed() { return static_cast(end - start).count(); } + +} host_timer_t; + +#ifdef __CUDACC__ + +/** + * @brief CUDA event based timer. 
Synopsis: + * cuda_timer_t t; + * t.timer_start(); + * kernel<<>>(...); + * t.timer_end(); + * cudaStreamSynchronize(stream); + * auto ms = t.get_time_elapsed(); + * + */ +typedef struct CUDATimer { + cudaEvent_t start, stop; + float milliseconds; + + // stream not involved + void timer_start() + { + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); + } + + void timer_end() + { + cudaEventRecord(stop); + cudaEventSynchronize(stop); + } + + // stream involved + void timer_start(cudaStream_t stream) + { + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start, stream); // set event as not occurred + } + + void timer_end(cudaStream_t stream) + { + cudaEventRecord(stop, stream); + cudaEventSynchronize(stop); // block host until `stream` meets `stop` + } + + // get time + float get_time_elapsed() + { + cudaEventElapsedTime(&milliseconds, start, stop); + return milliseconds; + } + +} cuda_timer_t; + +#endif + +// TODO handle return; testing +/** + * @brief A timer wrapper for arbitrary function (no handling return for now); + * Adapted from https://stackoverflow.com/a/33900479/8740097 (CC BY-SA 3.0) + * + * @tparam F auto function type + * @tparam Args variadic function argument type + * @param func non-return function to be timed + * @param args variadic function arguments + * @return double time in seconds + */ +template +double TimeThisRoutine(F func, Args&&... args) +{ + auto t0 = hires::now(); + func(std::forward(args)...); + return static_cast(hires::now() - t0).count(); +} + +#ifdef __CUDACC__ +typedef struct CUDAKernelConfig { + dim3 dim_grid; + dim3 dim_block; + size_t shmem_nbyte{0}; + cudaStream_t stream; + +} kernelcfg; + +// TODO use cudaEvent +/** + * @brief A timer wrapper for arbitrary CUDA function + * + * @tparam F auto function type + * @tparam Args variadic function argument type + * @param func CUDA kernel function to be time + * @param cfg CUDA kernel config + * @param args variadic function arguments + * @return double time in seconds + */ +template +float TimeThisCUDARoutine(F func, kernelcfg cfg, Args&&... args) +{ + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start); + func<<>>( // + args... + // std::forward(args)... 
// also works + ); + cudaEventRecord(stop); + cudaEventSynchronize(stop); + + cudaStreamSynchronize(cfg.stream); + + float milliseconds; + cudaEventElapsedTime(&milliseconds, start, stop); + + return milliseconds; +} + +#endif + +#endif // UTILS_TIMER_HH diff --git a/qtensor/compression/cusz/src/cli/cli.cu b/qtensor/compression/cusz/src/cli/cli.cu index 01c61565..64084cba 100644 --- a/qtensor/compression/cusz/src/cli/cli.cu +++ b/qtensor/compression/cusz/src/cli/cli.cu @@ -1,14 +1,14 @@ -/** - * @file cli.cu - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-03-07 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#include "cli.cuh" - -template class cusz::CLI; +/** + * @file cli.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-03-07 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#include "cli.cuh" + +template class cusz::CLI; diff --git a/qtensor/compression/cusz/src/cli/cli.cuh b/qtensor/compression/cusz/src/cli/cli.cuh index da94a347..14a9103d 100644 --- a/qtensor/compression/cusz/src/cli/cli.cuh +++ b/qtensor/compression/cusz/src/cli/cli.cuh @@ -1,195 +1,195 @@ -/** - * @file cli.cuh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-02-20 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef CLI_CUH -#define CLI_CUH - -#include -#include - -#include "cli/analyzer.hh" -#include "cli/dryrun_part.cuh" -#include "cli/query.hh" -#include "cli/timerecord_viewer.hh" -#include "cusz.h" -#include "framework.hh" - -namespace cusz { - -template -class CLI { - private: - using Header = cuszHEADER; - using T = Data; - - const static auto HOST = cusz::LOC::HOST; - const static auto DEVICE = cusz::LOC::DEVICE; - const static auto HOST_DEVICE = cusz::LOC::HOST_DEVICE; - - using context_t = cuszCTX*; - using header_t = cuszHEADER*; - - public: - CLI() = default; - - template - static void cli_dryrun(context_t ctx, bool dualquant = true) - { - BaseCompressor analysis; - - uint3 xyz{ctx->x, ctx->y, ctx->z}; - cudaStream_t stream; - cudaStreamCreate(&stream); - - if (not dualquant) { - analysis.init_dualquant_dryrun(xyz); - analysis.dualquant_dryrun(ctx->fname.fname, ctx->eb, ctx->mode == "r2r", stream); - analysis.destroy_dualquant_dryrun(); - } - else { - analysis.init_generic_dryrun(xyz); - analysis.generic_dryrun(ctx->fname.fname, ctx->eb, 512, ctx->mode == "r2r", stream); - analysis.destroy_generic_dryrun(); - } - cudaStreamDestroy(stream); - } - - private: - void write_compressed_to_disk(std::string compressed_name, BYTE* compressed, size_t compressed_len) - { - Capsule file("cusza"); - file.set_len(compressed_len) - .set_dptr(compressed) - .mallochost() - .device2host() - .tofile(compressed_name) - .freehost() - .free(); - } - - void try_write_decompressed_to_disk(Capsule& xdata, std::string basename, bool skip_write) - { - if (not skip_write) xdata.device2host().tofile(basename + ".cuszx"); - } - - // template - void cli_construct(context_t ctx, cusz_compressor* compressor, cudaStream_t stream) - { - Capsule input("uncompressed"); - BYTE* compressed; - size_t compressed_len; - Header header; - auto len = ctx->get_len(); - auto basename = ctx->fname.fname; - - auto load_uncompressed = [&](std::string fname) { - input - .set_len(len) // - .mallochost() - .malloc() - .fromfile(fname) - .host2device(); - }; - - auto adjust_eb = [&]() { - if (ctx->mode == "r2r") ctx->eb *= input.prescan().get_rng(); - }; - - 
/******************************************************************************/ - - load_uncompressed(basename); - adjust_eb(); - - TimeRecord timerecord; - - cusz_config* config = new cusz_config{.eb = ctx->eb, .mode = Rel}; - cusz_len uncomp_len = cusz_len{ctx->x, ctx->y, ctx->z, 1}; - - cusz_compress( - compressor, config, input.dptr(), uncomp_len, &compressed, &compressed_len, &header, (void*)&timerecord, - stream); - - if (ctx->report.time) TimeRecordViewer::view_compression(&timerecord, input.nbyte(), compressed_len); - write_compressed_to_disk(basename + ".cusza", compressed, compressed_len); - } - - // template - void cli_reconstruct(context_t ctx, cusz_compressor* compressor, cudaStream_t stream) - { - Capsule compressed("compressed"); - Capsule decompressed("decompressed"), original("cmp"); - auto header = new Header; - auto basename = (*ctx).fname.fname; - - auto load_compressed = [&](std::string compressed_name) { - auto compressed_len = ConfigHelper::get_filesize(compressed_name); - compressed - .set_len(compressed_len) // - .mallochost() - .malloc() - .fromfile(compressed_name) - .host2device(); - }; - - /******************************************************************************/ - - load_compressed(basename + ".cusza"); - memcpy(header, compressed.hptr(), sizeof(Header)); - auto len = ConfigHelper::get_uncompressed_len(header); - - decompressed // - .set_len(len) - .mallochost() - .malloc(); - original.set_len(len); - - TimeRecord timerecord; - - cusz_len decomp_len = cusz_len{header->x, header->y, header->z, 1}; - - cusz_decompress( - compressor, header, compressed.dptr(), ConfigHelper::get_filesize(header), decompressed.dptr(), decomp_len, - (void*)&timerecord, stream); - - if (ctx->report.time) TimeRecordViewer::view_decompression(&timerecord, decompressed.nbyte()); - QualityViewer::view(header, decompressed, original, (*ctx).fname.origin_cmp); - try_write_decompressed_to_disk(decompressed, basename, (*ctx).skip.write2disk); - - decompressed.freehost().free(); - } - - public: - // TODO determine dtype & predictor in here - void dispatch(context_t ctx) - { - // TODO disable predictor selection; to specify in another way - // auto predictor = (*ctx).predictor; - - cusz_framework* framework = cusz_default_framework(); - cusz_compressor* compressor = cusz_create(framework, FP32); - - cudaStream_t stream; - CHECK_CUDA(cudaStreamCreate(&stream)); - - // TODO hardcoded predictor type - if ((*ctx).cli_task.dryrun) cli_dryrun::Predictor>(ctx); - - if ((*ctx).cli_task.construct) cli_construct(ctx, compressor, stream); - - if ((*ctx).cli_task.reconstruct) cli_reconstruct(ctx, compressor, stream); - - if (stream) cudaStreamDestroy(stream); - } -}; - -} // namespace cusz - -#endif +/** + * @file cli.cuh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-02-20 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CLI_CUH +#define CLI_CUH + +#include +#include + +#include "cli/analyzer.hh" +#include "cli/dryrun_part.cuh" +#include "cli/query.hh" +#include "cli/timerecord_viewer.hh" +#include "cusz.h" +#include "framework.hh" + +namespace cusz { + +template +class CLI { + private: + using Header = cuszHEADER; + using T = Data; + + const static auto HOST = cusz::LOC::HOST; + const static auto DEVICE = cusz::LOC::DEVICE; + const static auto HOST_DEVICE = cusz::LOC::HOST_DEVICE; + + using context_t = cuszCTX*; + using header_t = cuszHEADER*; + + public: + CLI() = default; + + template + static void cli_dryrun(context_t ctx, 
bool dualquant = true) + { + BaseCompressor analysis; + + uint3 xyz{ctx->x, ctx->y, ctx->z}; + cudaStream_t stream; + cudaStreamCreate(&stream); + + if (not dualquant) { + analysis.init_dualquant_dryrun(xyz); + analysis.dualquant_dryrun(ctx->fname.fname, ctx->eb, ctx->mode == "r2r", stream); + analysis.destroy_dualquant_dryrun(); + } + else { + analysis.init_generic_dryrun(xyz); + analysis.generic_dryrun(ctx->fname.fname, ctx->eb, 512, ctx->mode == "r2r", stream); + analysis.destroy_generic_dryrun(); + } + cudaStreamDestroy(stream); + } + + private: + void write_compressed_to_disk(std::string compressed_name, BYTE* compressed, size_t compressed_len) + { + Capsule file("cusza"); + file.set_len(compressed_len) + .set_dptr(compressed) + .mallochost() + .device2host() + .tofile(compressed_name) + .freehost() + .free(); + } + + void try_write_decompressed_to_disk(Capsule& xdata, std::string basename, bool skip_write) + { + if (not skip_write) xdata.device2host().tofile(basename + ".cuszx"); + } + + // template + void cli_construct(context_t ctx, cusz_compressor* compressor, cudaStream_t stream) + { + Capsule input("uncompressed"); + BYTE* compressed; + size_t compressed_len; + Header header; + auto len = ctx->get_len(); + auto basename = ctx->fname.fname; + + auto load_uncompressed = [&](std::string fname) { + input + .set_len(len) // + .mallochost() + .malloc() + .fromfile(fname) + .host2device(); + }; + + auto adjust_eb = [&]() { + if (ctx->mode == "r2r") ctx->eb *= input.prescan().get_rng(); + }; + + /******************************************************************************/ + + load_uncompressed(basename); + adjust_eb(); + + TimeRecord timerecord; + + cusz_config* config = new cusz_config{.eb = ctx->eb, .mode = Rel}; + cusz_len uncomp_len = cusz_len{ctx->x, ctx->y, ctx->z, 1}; + + cusz_compress( + compressor, config, input.dptr(), uncomp_len, &compressed, &compressed_len, &header, (void*)&timerecord, + stream); + + if (ctx->report.time) TimeRecordViewer::view_compression(&timerecord, input.nbyte(), compressed_len); + write_compressed_to_disk(basename + ".cusza", compressed, compressed_len); + } + + // template + void cli_reconstruct(context_t ctx, cusz_compressor* compressor, cudaStream_t stream) + { + Capsule compressed("compressed"); + Capsule decompressed("decompressed"), original("cmp"); + auto header = new Header; + auto basename = (*ctx).fname.fname; + + auto load_compressed = [&](std::string compressed_name) { + auto compressed_len = ConfigHelper::get_filesize(compressed_name); + compressed + .set_len(compressed_len) // + .mallochost() + .malloc() + .fromfile(compressed_name) + .host2device(); + }; + + /******************************************************************************/ + + load_compressed(basename + ".cusza"); + memcpy(header, compressed.hptr(), sizeof(Header)); + auto len = ConfigHelper::get_uncompressed_len(header); + + decompressed // + .set_len(len) + .mallochost() + .malloc(); + original.set_len(len); + + TimeRecord timerecord; + + cusz_len decomp_len = cusz_len{header->x, header->y, header->z, 1}; + + cusz_decompress( + compressor, header, compressed.dptr(), ConfigHelper::get_filesize(header), decompressed.dptr(), decomp_len, + (void*)&timerecord, stream); + + if (ctx->report.time) TimeRecordViewer::view_decompression(&timerecord, decompressed.nbyte()); + QualityViewer::view(header, decompressed, original, (*ctx).fname.origin_cmp); + try_write_decompressed_to_disk(decompressed, basename, (*ctx).skip.write2disk); + + decompressed.freehost().free(); + } + + 
public: + // TODO determine dtype & predictor in here + void dispatch(context_t ctx) + { + // TODO disable predictor selection; to specify in another way + // auto predictor = (*ctx).predictor; + + cusz_framework* framework = cusz_default_framework(); + cusz_compressor* compressor = cusz_create(framework, FP32); + + cudaStream_t stream; + CHECK_CUDA(cudaStreamCreate(&stream)); + + // TODO hardcoded predictor type + if ((*ctx).cli_task.dryrun) cli_dryrun::Predictor>(ctx); + + if ((*ctx).cli_task.construct) cli_construct(ctx, compressor, stream); + + if ((*ctx).cli_task.reconstruct) cli_reconstruct(ctx, compressor, stream); + + if (stream) cudaStreamDestroy(stream); + } +}; + +} // namespace cusz + +#endif diff --git a/qtensor/compression/cusz/src/cli/dryrun_part.cu b/qtensor/compression/cusz/src/cli/dryrun_part.cu index 41311b6b..c3a8a1c4 100644 --- a/qtensor/compression/cusz/src/cli/dryrun_part.cu +++ b/qtensor/compression/cusz/src/cli/dryrun_part.cu @@ -1,17 +1,17 @@ -/** - * @file base_compressor.cu - * @author Jiannan Tian - * @brief Predictor-only Base Compressor; can also be used for dryrun. - * @version 0.3 - * @date 2021-10-05 - * - * (C) 2021 by Washington State University, Argonne National Laboratory - * - */ - -#include "dryrun_part.cuh" - -template class cusz::BaseCompressor::type, - ErrCtrlTrait<2>::type, - FastLowPrecisionTrait::type>>; +/** + * @file base_compressor.cu + * @author Jiannan Tian + * @brief Predictor-only Base Compressor; can also be used for dryrun. + * @version 0.3 + * @date 2021-10-05 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#include "dryrun_part.cuh" + +template class cusz::BaseCompressor::type, + ErrCtrlTrait<2>::type, + FastLowPrecisionTrait::type>>; diff --git a/qtensor/compression/cusz/src/cli/dryrun_part.cuh b/qtensor/compression/cusz/src/cli/dryrun_part.cuh index e6fd4579..0013e790 100644 --- a/qtensor/compression/cusz/src/cli/dryrun_part.cuh +++ b/qtensor/compression/cusz/src/cli/dryrun_part.cuh @@ -1,196 +1,196 @@ -/** - * @file base_compressor.cuh - * @author Jiannan Tian - * @brief Predictor-only Base Compressor; can also be used for dryrun. 
- * @version 0.3 - * @date 2021-10-05 - * - * (C) 2021 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef BASE_COMPRESSOR_CUH -#define BASE_COMPRESSOR_CUH - -#include "cli/analyzer.hh" -#include "cli/quality_viewer.hh" -#include "cli/verify.hh" -#include "common.hh" -#include "component.hh" -#include "context.hh" -#include "kernel/dryrun.cuh" -#include "stat/compare_gpu.hh" -#include "utils.hh" - -/** - * @brief bare metal, can run predictor to check data quality and compressibility - * - * @tparam T for data type - * @tparam E for error control type - */ - -namespace cusz { - -template -class BaseCompressor { - public: - using BYTE = uint8_t; - using T = typename Predictor::Origin; - using FP = typename Predictor::Precision; - using E = typename Predictor::ErrCtrl; - - private: - struct NonCritical { - Predictor* p; - Capsule original; - Capsule errctrl; // TODO change to 4-byte - Capsule outlier; - Capsule anchor; - Capsule reconst; - - NonCritical(dim3 size) { p = new Predictor; } - }; - - struct NonCritical* nc; - - protected: - cuszCTX* ctx; - - int dict_size; - double eb; - - dim3 xyz; - - public: - /** - * @brief Generic dryrun; performing predictor.construct() and .reconstruct() - * - * @param fname filename - * @param eb (host variable) error bound; future: absolute error bound only - * @param radius (host variable) limiting radius - * @param r2r if relative-to-value-range - * @param stream CUDA stream - * @return BaseCompressor& this object instance - */ - BaseCompressor& generic_dryrun(const std::string fname, double eb, int radius, bool r2r, cudaStream_t stream) - { - if (not nc) throw std::runtime_error("NonCritical struct has no instance."); - - // LOGGING(LOG_INFO, "invoke dry-run"); - - nc->original.fromfile(fname).host2device_async(stream); - CHECK_CUDA(cudaStreamSynchronize(stream)); - - if (r2r) { - double max, min, rng; - nc->original.prescan(max, min, rng); - eb *= rng; - } - - auto xyz = dim3(ctx->x, ctx->y, ctx->z); - - // nc->p->construct( - // LorenzoI, xyz, nc->original.dptr, nc->anchor.dptr, nc->errctrl.dptr, nc->outlier.dptr, eb, radius, - // stream); - // nc->p->reconstruct( - // LorenzoI, xyz, nc->outlier.dptr, nc->anchor.dptr, nc->errctrl.dptr, nc->reconst.dptr, eb, radius, - // stream); - - nc->reconst.device2host_async(stream); - CHECK_CUDA(cudaStreamSynchronize(stream)); - - cusz_stats stat; - psz::thrustgpu_assess_quality(&stat, nc->reconst.hptr(), nc->original.hptr(), nc->p->get_len_data()); - cusz::QualityViewer::print_metrics_cross(&stat, 0, true); - - return *this; - } - - /** - * @brief Dual-quant dryrun; performing integerization & its reverse procedure - * - * @param eb (host variable) error bound; future: absolute error bound only - * @param r2r if relative-to-value-range - * @param stream CUDA stream - * @return BaseCompressor& this object instance - */ - BaseCompressor& dualquant_dryrun(const std::string fname, double eb, bool r2r, cudaStream_t stream) - { - auto len = nc->original.len(); - - nc->original.fromfile(fname).host2device_async(stream); - CHECK_CUDA(cudaStreamSynchronize(stream)); - - if (r2r) { - double max, min, rng; - nc->original.prescan(max, min, rng); - eb *= rng; - } - - auto ebx2_r = 1 / (eb * 2); - auto ebx2 = eb * 2; - - cusz::dualquant_dryrun_kernel // - <<>> // - (nc->original.dptr(), nc->reconst.dptr(), len, ebx2_r, ebx2); - - nc->reconst.device2host_async(stream); - CHECK_CUDA(cudaStreamSynchronize(stream)); - - cusz_stats stat; - psz::thrustgpu_assess_quality(&stat, nc->reconst.hptr(), 
nc->original.hptr(), len); - cusz::QualityViewer::print_metrics_cross(&stat, 0, true); - - return *this; - } - - public: - BaseCompressor() = default; - - ~BaseCompressor() {} - - public: - // dry run - void init_generic_dryrun(dim3 size) - { // - auto len = size.x * size.y * size.z; - nc = new struct NonCritical(size); - - nc->original.set_len(len).mallochost().malloc(); - nc->outlier.set_len(len).mallochost().malloc(); - nc->errctrl.set_len(len).mallochost().malloc(); - nc->anchor.set_len(nc->p->get_len_anchor()).mallochost().malloc(); - nc->reconst.set_len(len).mallochost().malloc(); - } - - void destroy_generic_dryrun() - { - delete nc->p; - nc->original.freehost().free(); - nc->outlier.freehost().free(); - nc->errctrl.freehost().free(); - nc->anchor.freehost().free(); - nc->reconst.freehost().free(); - delete nc; - } - - void init_dualquant_dryrun(dim3 size) - { - auto len = size.x * size.y * size.z; - nc = new struct NonCritical(size); - nc->original.set_len(len).mallochost().malloc(); - nc->reconst.set_len(len).mallochost().malloc(); - } - - void destroy_dualquant_dryrun() - { - nc->original.freehost().free(); - nc->reconst.freehost().free(); - - delete nc; - } -}; - -} // namespace cusz - -#endif +/** + * @file base_compressor.cuh + * @author Jiannan Tian + * @brief Predictor-only Base Compressor; can also be used for dryrun. + * @version 0.3 + * @date 2021-10-05 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef BASE_COMPRESSOR_CUH +#define BASE_COMPRESSOR_CUH + +#include "cli/analyzer.hh" +#include "cli/quality_viewer.hh" +#include "cli/verify.hh" +#include "common.hh" +#include "component.hh" +#include "context.hh" +#include "kernel/dryrun.cuh" +#include "stat/compare_gpu.hh" +#include "utils.hh" + +/** + * @brief bare metal, can run predictor to check data quality and compressibility + * + * @tparam T for data type + * @tparam E for error control type + */ + +namespace cusz { + +template +class BaseCompressor { + public: + using BYTE = uint8_t; + using T = typename Predictor::Origin; + using FP = typename Predictor::Precision; + using E = typename Predictor::ErrCtrl; + + private: + struct NonCritical { + Predictor* p; + Capsule original; + Capsule errctrl; // TODO change to 4-byte + Capsule outlier; + Capsule anchor; + Capsule reconst; + + NonCritical(dim3 size) { p = new Predictor; } + }; + + struct NonCritical* nc; + + protected: + cuszCTX* ctx; + + int dict_size; + double eb; + + dim3 xyz; + + public: + /** + * @brief Generic dryrun; performing predictor.construct() and .reconstruct() + * + * @param fname filename + * @param eb (host variable) error bound; future: absolute error bound only + * @param radius (host variable) limiting radius + * @param r2r if relative-to-value-range + * @param stream CUDA stream + * @return BaseCompressor& this object instance + */ + BaseCompressor& generic_dryrun(const std::string fname, double eb, int radius, bool r2r, cudaStream_t stream) + { + if (not nc) throw std::runtime_error("NonCritical struct has no instance."); + + // LOGGING(LOG_INFO, "invoke dry-run"); + + nc->original.fromfile(fname).host2device_async(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + if (r2r) { + double max, min, rng; + nc->original.prescan(max, min, rng); + eb *= rng; + } + + auto xyz = dim3(ctx->x, ctx->y, ctx->z); + + // nc->p->construct( + // LorenzoI, xyz, nc->original.dptr, nc->anchor.dptr, nc->errctrl.dptr, nc->outlier.dptr, eb, radius, + // stream); + // nc->p->reconstruct( + // LorenzoI, xyz, 
nc->outlier.dptr, nc->anchor.dptr, nc->errctrl.dptr, nc->reconst.dptr, eb, radius, + // stream); + + nc->reconst.device2host_async(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + cusz_stats stat; + psz::thrustgpu_assess_quality(&stat, nc->reconst.hptr(), nc->original.hptr(), nc->p->get_len_data()); + cusz::QualityViewer::print_metrics_cross(&stat, 0, true); + + return *this; + } + + /** + * @brief Dual-quant dryrun; performing integerization & its reverse procedure + * + * @param eb (host variable) error bound; future: absolute error bound only + * @param r2r if relative-to-value-range + * @param stream CUDA stream + * @return BaseCompressor& this object instance + */ + BaseCompressor& dualquant_dryrun(const std::string fname, double eb, bool r2r, cudaStream_t stream) + { + auto len = nc->original.len(); + + nc->original.fromfile(fname).host2device_async(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + if (r2r) { + double max, min, rng; + nc->original.prescan(max, min, rng); + eb *= rng; + } + + auto ebx2_r = 1 / (eb * 2); + auto ebx2 = eb * 2; + + cusz::dualquant_dryrun_kernel // + <<>> // + (nc->original.dptr(), nc->reconst.dptr(), len, ebx2_r, ebx2); + + nc->reconst.device2host_async(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + cusz_stats stat; + psz::thrustgpu_assess_quality(&stat, nc->reconst.hptr(), nc->original.hptr(), len); + cusz::QualityViewer::print_metrics_cross(&stat, 0, true); + + return *this; + } + + public: + BaseCompressor() = default; + + ~BaseCompressor() {} + + public: + // dry run + void init_generic_dryrun(dim3 size) + { // + auto len = size.x * size.y * size.z; + nc = new struct NonCritical(size); + + nc->original.set_len(len).mallochost().malloc(); + nc->outlier.set_len(len).mallochost().malloc(); + nc->errctrl.set_len(len).mallochost().malloc(); + nc->anchor.set_len(nc->p->get_len_anchor()).mallochost().malloc(); + nc->reconst.set_len(len).mallochost().malloc(); + } + + void destroy_generic_dryrun() + { + delete nc->p; + nc->original.freehost().free(); + nc->outlier.freehost().free(); + nc->errctrl.freehost().free(); + nc->anchor.freehost().free(); + nc->reconst.freehost().free(); + delete nc; + } + + void init_dualquant_dryrun(dim3 size) + { + auto len = size.x * size.y * size.z; + nc = new struct NonCritical(size); + nc->original.set_len(len).mallochost().malloc(); + nc->reconst.set_len(len).mallochost().malloc(); + } + + void destroy_dualquant_dryrun() + { + nc->original.freehost().free(); + nc->reconst.freehost().free(); + + delete nc; + } +}; + +} // namespace cusz + +#endif diff --git a/qtensor/compression/cusz/src/cli_bin.cu b/qtensor/compression/cusz/src/cli_bin.cu index f3e50d64..c59c00f9 100644 --- a/qtensor/compression/cusz/src/cli_bin.cu +++ b/qtensor/compression/cusz/src/cli_bin.cu @@ -1,27 +1,27 @@ -/** - * @file cusz-cli.cu - * @author Jiannan Tian - * @brief Driver program of cuSZ. - * @version 0.1 - * @date 2020-09-20 - * (created) 2019-12-30 (rev) 2022-02-20 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#include "cli/cli.cuh" - -int main(int argc, char** argv) -{ - auto ctx = new cuszCTX(argc, argv); - - if (ctx->verbose) { - Diagnostics::GetMachineProperties(); - GpuDiagnostics::GetDeviceProperty(); - } - - cusz::CLI cusz_cli; - cusz_cli.dispatch(ctx); -} +/** + * @file cusz-cli.cu + * @author Jiannan Tian + * @brief Driver program of cuSZ. 
+ * @version 0.1 + * @date 2020-09-20 + * (created) 2019-12-30 (rev) 2022-02-20 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include "cli/cli.cuh" + +int main(int argc, char** argv) +{ + auto ctx = new cuszCTX(argc, argv); + + if (ctx->verbose) { + Diagnostics::GetMachineProperties(); + GpuDiagnostics::GetDeviceProperty(); + } + + cusz::CLI cusz_cli; + cusz_cli.dispatch(ctx); +} diff --git a/qtensor/compression/cusz/src/compressor.cc b/qtensor/compression/cusz/src/compressor.cc index 7b62db5a..7482293b 100644 --- a/qtensor/compression/cusz/src/compressor.cc +++ b/qtensor/compression/cusz/src/compressor.cc @@ -1,149 +1,149 @@ -/** - * @file compressor.cc - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-04-23 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#include "compressor.hh" -#include "common/configs.hh" -#include "framework.hh" - -namespace cusz { - -template -Compressor::~Compressor() -{ - pimpl.reset(); -} - -template -Compressor::Compressor() : pimpl{std::make_unique()} -{ -} - -template -Compressor::Compressor(const Compressor& old) : pimpl{std::make_unique(*old.pimpl)} -{ -} - -template -Compressor& Compressor::operator=(const Compressor& old) -{ - *pimpl = *old.pimpl; - return *this; -} - -template -Compressor::Compressor(Compressor&&) = default; - -template -Compressor& Compressor::operator=(Compressor&&) = default; - -//------------------------------------------------------------------------------ - -template -void Compressor::init(Context* config, bool dbg_print) -{ - pimpl->init(config, dbg_print); -} - -template -void Compressor::init(Header* config, bool dbg_print) -{ - pimpl->init(config, dbg_print); -} - -template -void Compressor::compress( - Context* config, - Compressor::T* uncompressed, - BYTE*& compressed, - size_t& compressed_len, - cudaStream_t stream, - bool dbg_print) -{ - pimpl->compress(config, uncompressed, compressed, compressed_len, stream, dbg_print); -} - -template -void Compressor::decompress( - Header* config, - BYTE* compressed, - Compressor::T* decompressed, - cudaStream_t stream, - bool dbg_print) -{ - pimpl->decompress(config, compressed, decompressed, stream, dbg_print); -} - -template -void Compressor::clear_buffer() -{ - pimpl->clear_buffer(); -} - -// getter - -template -void Compressor::export_header(Header& header) -{ - pimpl->export_header(header); -} - -template -void Compressor::export_header(Header* header) -{ - pimpl->export_header(header); -} - -template -void Compressor::export_timerecord(TimeRecord* ext_timerecord) -{ - pimpl->export_timerecord(ext_timerecord); -} - -} // namespace cusz - -// extra helper -namespace cusz { - -int CompressorHelper::autotune_coarse_parvle(Context* ctx) -{ - auto tune_coarse_huffman_sublen = [](size_t len) { - int current_dev = 0; - cudaSetDevice(current_dev); - cudaDeviceProp dev_prop{}; - cudaGetDeviceProperties(&dev_prop, current_dev); - - auto nSM = dev_prop.multiProcessorCount; - auto allowed_block_dim = dev_prop.maxThreadsPerBlock; - auto deflate_nthread = allowed_block_dim * nSM / HuffmanHelper::DEFLATE_CONSTANT; - auto optimal_sublen = ConfigHelper::get_npart(len, deflate_nthread); - optimal_sublen = ConfigHelper::get_npart(optimal_sublen, HuffmanHelper::BLOCK_DIM_DEFLATE) * - HuffmanHelper::BLOCK_DIM_DEFLATE; - - return optimal_sublen; - }; - - auto get_coarse_pardeg = [&](size_t len, int& sublen, int& pardeg) { - 
sublen = tune_coarse_huffman_sublen(len); - pardeg = ConfigHelper::get_npart(len, sublen); - }; - - // TODO should be move to somewhere else, e.g., cusz::par_optmizer - if (ctx->use.autotune_vle_pardeg) - get_coarse_pardeg(ctx->data_len, ctx->vle_sublen, ctx->vle_pardeg); - else - ctx->vle_pardeg = ConfigHelper::get_npart(ctx->data_len, ctx->vle_sublen); - - return ctx->vle_pardeg; -} - -} // namespace cusz - -template class cusz::Compressor>; +/** + * @file compressor.cc + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-23 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#include "compressor.hh" +#include "common/configs.hh" +#include "framework.hh" + +namespace cusz { + +template +Compressor::~Compressor() +{ + pimpl.reset(); +} + +template +Compressor::Compressor() : pimpl{std::make_unique()} +{ +} + +template +Compressor::Compressor(const Compressor& old) : pimpl{std::make_unique(*old.pimpl)} +{ +} + +template +Compressor& Compressor::operator=(const Compressor& old) +{ + *pimpl = *old.pimpl; + return *this; +} + +template +Compressor::Compressor(Compressor&&) = default; + +template +Compressor& Compressor::operator=(Compressor&&) = default; + +//------------------------------------------------------------------------------ + +template +void Compressor::init(Context* config, bool dbg_print) +{ + pimpl->init(config, dbg_print); +} + +template +void Compressor::init(Header* config, bool dbg_print) +{ + pimpl->init(config, dbg_print); +} + +template +void Compressor::compress( + Context* config, + Compressor::T* uncompressed, + BYTE*& compressed, + size_t& compressed_len, + cudaStream_t stream, + bool dbg_print) +{ + pimpl->compress(config, uncompressed, compressed, compressed_len, stream, dbg_print); +} + +template +void Compressor::decompress( + Header* config, + BYTE* compressed, + Compressor::T* decompressed, + cudaStream_t stream, + bool dbg_print) +{ + pimpl->decompress(config, compressed, decompressed, stream, dbg_print); +} + +template +void Compressor::clear_buffer() +{ + pimpl->clear_buffer(); +} + +// getter + +template +void Compressor::export_header(Header& header) +{ + pimpl->export_header(header); +} + +template +void Compressor::export_header(Header* header) +{ + pimpl->export_header(header); +} + +template +void Compressor::export_timerecord(TimeRecord* ext_timerecord) +{ + pimpl->export_timerecord(ext_timerecord); +} + +} // namespace cusz + +// extra helper +namespace cusz { + +int CompressorHelper::autotune_coarse_parvle(Context* ctx) +{ + auto tune_coarse_huffman_sublen = [](size_t len) { + int current_dev = 0; + cudaSetDevice(current_dev); + cudaDeviceProp dev_prop{}; + cudaGetDeviceProperties(&dev_prop, current_dev); + + auto nSM = dev_prop.multiProcessorCount; + auto allowed_block_dim = dev_prop.maxThreadsPerBlock; + auto deflate_nthread = allowed_block_dim * nSM / HuffmanHelper::DEFLATE_CONSTANT; + auto optimal_sublen = ConfigHelper::get_npart(len, deflate_nthread); + optimal_sublen = ConfigHelper::get_npart(optimal_sublen, HuffmanHelper::BLOCK_DIM_DEFLATE) * + HuffmanHelper::BLOCK_DIM_DEFLATE; + + return optimal_sublen; + }; + + auto get_coarse_pardeg = [&](size_t len, int& sublen, int& pardeg) { + sublen = tune_coarse_huffman_sublen(len); + pardeg = ConfigHelper::get_npart(len, sublen); + }; + + // TODO should be move to somewhere else, e.g., cusz::par_optmizer + if (ctx->use.autotune_vle_pardeg) + get_coarse_pardeg(ctx->data_len, ctx->vle_sublen, ctx->vle_pardeg); + else + ctx->vle_pardeg = 
ConfigHelper::get_npart(ctx->data_len, ctx->vle_sublen); + + return ctx->vle_pardeg; +} + +} // namespace cusz + +template class cusz::Compressor>; diff --git a/qtensor/compression/cusz/src/context.cc b/qtensor/compression/cusz/src/context.cc index c85f3d24..3356323b 100644 --- a/qtensor/compression/cusz/src/context.cc +++ b/qtensor/compression/cusz/src/context.cc @@ -1,493 +1,493 @@ -/** - * @file argparse.cc - * @author Jiannan Tian - * @brief Argument parser. - * @version 0.1 - * @date 2020-09-20 - * Created on: 20-04-24 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#include -#include -#include -#include -#include -#include - -#include "cli/document.hh" -#include "context.hh" - -namespace cusz { -const char* VERSION_TEXT = "2023-01-23 (unstable; pre-0.4)"; -const int VERSION = 20230123; -const int COMPATIBILITY = 0; -} // namespace cusz - -namespace { - -void set_preprocess(cusz::context_t ctx, const char* in_str) -{ - str_list opts; - StrHelper::parse_strlist(in_str, opts); - - for (auto k : opts) { - // TODO - } -} - -void set_report(cusz::context_t ctx, const char* in_str) -{ - str_list opts; - StrHelper::parse_strlist(in_str, opts); - - for (auto o : opts) { - if (StrHelper::is_kv_pair(o)) { - auto kv = StrHelper::parse_kv_onoff(o); - - if (kv.first == "cr") - ctx->report.cr = kv.second; - else if (kv.first == "compressibility") - ctx->report.compressibility = kv.second; - else if (kv.first == "time") - ctx->report.time = kv.second; - } - else { - if (o == "cr") - ctx->report.cr = true; - else if (o == "compressibility") - ctx->report.compressibility = true; - else if (o == "time") - ctx->report.time = true; - } - } -} - -void set_config(cusz::context_t ctx, const char* in_str, bool dbg_print = false) -{ - map_t opts; - StrHelper::parse_strlist_as_kv(in_str, opts); - - if (dbg_print) { - for (auto kv : opts) printf("%-*s %-s\n", 10, kv.first.c_str(), kv.second.c_str()); - std::cout << "\n"; - } - - std::string k, v; - char* end; - - auto optmatch = [&](std::vector vs) -> bool { return ConfigHelper::check_opt_in_list(k, vs); }; - auto is_enabled = [&](auto& v) -> bool { return v == "on" or v == "ON"; }; - - for (auto kv : opts) { - k = kv.first; - v = kv.second; - - if (optmatch({"type", "dtype"})) { - ConfigHelper::check_dtype(v, false); - ctx->dtype = v; - } - else if (optmatch({"eb", "errorbound"})) { - ctx->eb = StrHelper::str2fp(v); - } - else if (optmatch({"mode"})) { - ConfigHelper::check_cuszmode(v, true); - ctx->mode = v; - } - else if (optmatch({"len", "length"})) { - cuszCTX::parse_input_length(v.c_str(), ctx); - } - else if (optmatch({"alloclen"})) { - ctx->alloclen.len = StrHelper::str2int(v); - } - else if (optmatch({"demo"})) { - ctx->use.predefined_demo = true; - ctx->demo_dataset = std::string(v); - ctx->load_demo_sizes(); - } - else if (optmatch({"cap", "booklen", "dictsize"})) { - ctx->dict_size = StrHelper::str2int(v); - ctx->radius = ctx->dict_size / 2; - } - else if (optmatch({"radius"})) { - ctx->radius = StrHelper::str2int(v); - ctx->dict_size = ctx->radius * 2; - } - else if (optmatch({"huffbyte"})) { - ctx->huff_bytewidth = StrHelper::str2int(v); - ctx->codecs_in_use = ctx->codec_force_fallback() ? 
0b11 /*use both*/ : 0b01 /*use 4-byte*/; - } - else if (optmatch({"huffchunk"})) { - ctx->vle_sublen = StrHelper::str2int(v); - ctx->use.autotune_vle_pardeg = false; - } - else if (optmatch({"predictor"})) { - ctx->predictor = std::string(v); - } - else if (optmatch({"codec"})) { - // placeholder - } - else if (optmatch({"spcodec"})) { - // placeholder - } - else if (optmatch({"anchor"}) and is_enabled(v)) { - ctx->use.anchor = true; - } - else if (optmatch({"nondestructive"}) and is_enabled(v)) { - // placeholder - } - else if (optmatch({"failfast"}) and is_enabled(v)) { - // placeholder - } - else if (optmatch({"releaseinput"}) and is_enabled(v)) { - ctx->use.release_input = true; - } - else if (optmatch({"pipeline"})) { - ctx->pipeline = v; - } - else if (optmatch({"density"})) { // refer to `SparseMethodSetup` in `config.hh` - ctx->nz_density = StrHelper::str2fp(v); - ctx->nz_density_factor = 1 / ctx->nz_density; - } - else if (optmatch({"densityfactor"})) { // refer to `SparseMethodSetup` in `config.hh` - ctx->nz_density_factor = StrHelper::str2fp(v); - ctx->nz_density = 1 / ctx->nz_density_factor; - } - else if (optmatch({"gpuverify"}) and is_enabled(v)) { - ctx->use.gpu_verify = true; - } - - // when to enable anchor - if (ctx->predictor == "spline3") { - // unconditionally use anchor when it is spline3 - ctx->use.anchor = true; - } - } -} - -void set_from_cli_input(cusz::context_t ctx, int const argc, char** const argv) -{ - int i = 1; - - auto check_next = [&]() { - if (i + 1 >= argc) throw std::runtime_error("out-of-range at" + std::string(argv[i])); - }; - - std::string opt; - auto optmatch = [&](std::vector vs) -> bool { return ConfigHelper::check_opt_in_list(opt, vs); }; - - while (i < argc) { - if (argv[i][0] == '-') { - opt = std::string(argv[i]); - - if (optmatch({"-c", "--config"})) { - check_next(); - set_config(ctx, argv[++i]); - } - else if (optmatch({"-R", "--report"})) { - check_next(); - set_report(ctx, argv[++i]); - } - else if (optmatch({"-h", "--help"})) { - cusz::Context::print_doc(true); - exit(0); - } - else if (optmatch({"-v", "--version"})) { - std::cout << ">>>> cusz build: " << cusz::VERSION_TEXT << "\n"; - exit(0); - } - else if (optmatch({"-m", "--mode"})) { - check_next(); - ctx->mode = std::string(argv[++i]); - if (ctx->mode == "r2r") ctx->preprocess.prescan = true; - } - else if (optmatch({"-e", "--eb", "--error-bound"})) { - check_next(); - char* end; - ctx->eb = std::strtod(argv[++i], &end); - } - else if (optmatch({"-p", "--predictor"})) { - check_next(); - ctx->predictor = std::string(argv[++i]); - } - else if (optmatch({"-c", "--codec"})) { - check_next(); - // placeholder - } - else if (optmatch({"-s", "--spcodec"})) { - check_next(); - // placeholder - } - else if (optmatch({"-t", "--type", "--dtype"})) { - check_next(); - std::string s = std::string(std::string(argv[++i])); - if (s == "f32" or s == "fp4") - ctx->dtype = "f32"; - else if (s == "f64" or s == "fp8") - ctx->dtype = "f64"; - } - else if (optmatch({"-i", "--input"})) { - check_next(); - ctx->fname.fname = std::string(argv[++i]); - } - else if (optmatch({"-l", "--len"})) { - check_next(); - cusz::Context::parse_input_length(argv[++i], ctx); - } - else if (optmatch({"-L", "--allocation-len"})) { - check_next(); - // placeholder - } - else if (optmatch({"-z", "--zip", "--compress"})) { - ctx->cli_task.construct = true; - } - else if (optmatch({"-x", "--unzip", "--decompress"})) { - ctx->cli_task.reconstruct = true; - } - else if (optmatch({"-r", "--dry-run"})) { - ctx->cli_task.dryrun = 
true; - } - else if (optmatch({"--anchor"})) { - ctx->use.anchor = true; - } - else if (optmatch({"--nondestructive", "--input-nondestructive"})) { - // placeholder - } - else if (optmatch({"--failfast"})) { - // placeholder - } - else if (optmatch({"-P", "--pre", "--preprocess"})) { - check_next(); - std::string pre(argv[++i]); - if (pre.find("binning") != std::string::npos) { ctx->preprocess.binning = true; } - } - else if (optmatch({"-T", "--post", "--postprocess"})) { - check_next(); - std::string post(argv[++i]); - if (post.find("gzip") != std::string::npos) { ctx->postcompress.cpu_gzip = true; } - if (post.find("nvcomp") != std::string::npos) { ctx->postcompress.gpu_nvcomp_cascade = true; } - } - else if (optmatch({"-V", "--verbose"})) { - ctx->verbose = true; - } - else if (optmatch({"--pipeline"})) { - check_next(); - ctx->pipeline = std::string(argv[++i]); - } - else if (optmatch({"--demo"})) { - check_next(); - ctx->use.predefined_demo = true; - ctx->demo_dataset = std::string(argv[++i]); - ctx->load_demo_sizes(); - } - else if (optmatch({"-S", "-X", "--skip", "--exclude"})) { - check_next(); - std::string exclude(argv[++i]); - if (exclude.find("huffman") != std::string::npos) { ctx->skip.huffman = true; } - if (exclude.find("write2disk") != std::string::npos) { ctx->skip.write2disk = true; } - } - else if (optmatch({"--opath"})) { - check_next(); - ctx->opath = std::string(argv[++i]); - } - else if (optmatch({"--origin", "--compare"})) { - check_next(); - ctx->fname.origin_cmp = std::string(argv[++i]); - } - else { - const char* notif_prefix = "invalid option value at position "; - char* notif; - int size = asprintf(¬if, "%d: %s", i, argv[i]); - cerr << LOG_ERR << notif_prefix << "\e[1m" << notif << "\e[0m" - << "\n"; - cerr << std::string(LOG_NULL.length() + strlen(notif_prefix), ' '); - cerr << "\e[1m"; - cerr << std::string(strlen(notif), '~'); - cerr << "\e[0m\n"; - - ctx->trap(-1); - } - } - else { - const char* notif_prefix = "invalid option at position "; - char* notif; - int size = asprintf(¬if, "%d: %s", i, argv[i]); - cerr << LOG_ERR << notif_prefix << "\e[1m" << notif - << "\e[0m" - "\n" - << std::string(LOG_NULL.length() + strlen(notif_prefix), ' ') // - << "\e[1m" // - << std::string(strlen(notif), '~') // - << "\e[0m\n"; - - ctx->trap(-1); - } - i++; - } -} - -} // namespace - -cuszCTX& cuszCTX::set_control_string(const char* in_str) -{ - set_config(this, in_str); - return *this; -} - -void cuszCTX::load_demo_sizes() -{ - const std::unordered_map> dataset_entries = { - {std::string("hacc"), {280953867, 1, 1, 1, 1}}, {std::string("hacc1b"), {1073726487, 1, 1, 1, 1}}, - {std::string("cesm"), {3600, 1800, 1, 1, 2}}, {std::string("hurricane"), {500, 500, 100, 1, 3}}, - {std::string("nyx-s"), {512, 512, 512, 1, 3}}, {std::string("nyx-m"), {1024, 1024, 1024, 1, 3}}, - {std::string("qmc"), {288, 69, 7935, 1, 3}}, {std::string("qmcpre"), {69, 69, 33120, 1, 3}}, - {std::string("exafel"), {388, 59200, 1, 1, 2}}, {std::string("rtm"), {235, 849, 849, 1, 3}}, - {std::string("parihaka"), {1168, 1126, 922, 1, 3}}}; - - if (not demo_dataset.empty()) { - auto f = dataset_entries.find(demo_dataset); - if (f == dataset_entries.end()) throw std::runtime_error("no such dataset as" + demo_dataset); - auto demo_xyzw = f->second; - - x = demo_xyzw[0], y = demo_xyzw[1], z = demo_xyzw[2], w = demo_xyzw[3]; - ndim = demo_xyzw[4]; - } - data_len = x * y * z * w; -} - -void cuszCTX::trap(int _status) { this->read_args_status = _status; } - -void cuszCTX::validate() -{ - bool to_abort = false; 
- if (fname.fname.empty()) { - cerr << LOG_ERR << "must specify input file" << endl; - to_abort = true; - } - - if (data_len == 1 and not use.predefined_demo) { - if (cli_task.construct or cli_task.dryrun) { - cerr << LOG_ERR << "wrong input size" << endl; - to_abort = true; - } - } - if (not cli_task.construct and not cli_task.reconstruct and not cli_task.dryrun) { - cerr << LOG_ERR << "select compress (-z), decompress (-x) or dry-run (-r)" << endl; - to_abort = true; - } - if (false == ConfigHelper::check_dtype(dtype, false)) { - if (cli_task.construct or cli_task.dryrun) { - std::cout << dtype << endl; - cerr << LOG_ERR << "must specify data type" << endl; - to_abort = true; - } - } - - if (quant_bytewidth == 1) - assert(dict_size <= 256); - else if (quant_bytewidth == 2) - assert(dict_size <= 65536); - - if (cli_task.dryrun and cli_task.construct and cli_task.reconstruct) { - cerr << LOG_WARN << "no need to dry-run, compress and decompress at the same time" << endl; - cerr << LOG_WARN << "dryrun only" << endl << endl; - cli_task.construct = false; - cli_task.reconstruct = false; - } - else if (cli_task.dryrun and cli_task.construct) { - cerr << LOG_WARN << "no need to dry-run and compress at the same time" << endl; - cerr << LOG_WARN << "dryrun only" << endl << endl; - cli_task.construct = false; - } - else if (cli_task.dryrun and cli_task.reconstruct) { - cerr << LOG_WARN << "no need to dry-run and decompress at the same time" << endl; - cerr << LOG_WARN << "will dryrun only" << endl << endl; - cli_task.reconstruct = false; - } - - if (to_abort) { - print_doc(); - exit(-1); - } -} - -cuszCTX::cuszCTX(int argc, char** const argv) -{ - std::string opt; - auto optmatch = [&](std::vector vs) -> bool { return ConfigHelper::check_opt_in_list(opt, vs); }; - - if (argc == 1) { - print_doc(); - exit(0); - } - - /******************************************************************************/ - /* phase 0: parse */ - set_from_cli_input(this, argc, argv); - - // special treatment - if (predictor == "spline3") { - // unconditionally use anchor when it is spline3 - use.anchor = true; - } - - /******************************************************************************/ - /* phase 1: check syntax */ - if (read_args_status != 0) { - std::cout << LOG_INFO << "Exiting..." 
<< endl; - // after printing ALL argument errors - exit(-1); - } - - /******************************************************************************/ - /* phase 2: check if legal */ - validate(); - - /******************************************************************************/ - /* phase 3: sort out filenames */ - derive_fnames(); -} - -cuszCTX::cuszCTX(const char* in_str, bool dbg_print) -{ - /** - ** >>> syntax - ** comma-separated key-pairs - ** "key1=val1,key2=val2[,...]" - ** - ** >>> example - ** "predictor=lorenzo,size=3600x1800" - ** - **/ - - set_config(this, in_str, dbg_print); -} - -void cuszCTX::print_doc(bool full) -{ - std::cout << "\n>>>> cusz build: " << cusz::VERSION_TEXT << "\n"; - - if (full) - std::cout << StrHelper::doc_format(cusz_full_doc) << std::endl; - else - std::cout << cusz_short_doc << std::endl; -} - -void cuszCTX::derive_fnames() -{ - // (1) "fname" -> "", "fname" - // (2) "./fname" -> "./" "fname" - // (3) "/path/to/fname" -> "/path/to", "fname" - auto input_path = fname.fname.substr(0, fname.fname.rfind('/') + 1); - if (not cli_task.construct and cli_task.reconstruct) fname.fname = fname.fname.substr(0, fname.fname.rfind('.')); - fname.basename = fname.fname.substr(fname.fname.rfind('/') + 1); - - if (opath.empty()) opath = input_path.empty() ? opath = "" : opath = input_path; - opath += "/"; - - fname.path_basename = opath + fname.basename; - fname.compress_output = fname.path_basename + ".cusza"; -} +/** + * @file argparse.cc + * @author Jiannan Tian + * @brief Argument parser. + * @version 0.1 + * @date 2020-09-20 + * Created on: 20-04-24 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include +#include +#include +#include +#include +#include + +#include "cli/document.hh" +#include "context.hh" + +namespace cusz { +const char* VERSION_TEXT = "2023-01-23 (unstable; pre-0.4)"; +const int VERSION = 20230123; +const int COMPATIBILITY = 0; +} // namespace cusz + +namespace { + +void set_preprocess(cusz::context_t ctx, const char* in_str) +{ + str_list opts; + StrHelper::parse_strlist(in_str, opts); + + for (auto k : opts) { + // TODO + } +} + +void set_report(cusz::context_t ctx, const char* in_str) +{ + str_list opts; + StrHelper::parse_strlist(in_str, opts); + + for (auto o : opts) { + if (StrHelper::is_kv_pair(o)) { + auto kv = StrHelper::parse_kv_onoff(o); + + if (kv.first == "cr") + ctx->report.cr = kv.second; + else if (kv.first == "compressibility") + ctx->report.compressibility = kv.second; + else if (kv.first == "time") + ctx->report.time = kv.second; + } + else { + if (o == "cr") + ctx->report.cr = true; + else if (o == "compressibility") + ctx->report.compressibility = true; + else if (o == "time") + ctx->report.time = true; + } + } +} + +void set_config(cusz::context_t ctx, const char* in_str, bool dbg_print = false) +{ + map_t opts; + StrHelper::parse_strlist_as_kv(in_str, opts); + + if (dbg_print) { + for (auto kv : opts) printf("%-*s %-s\n", 10, kv.first.c_str(), kv.second.c_str()); + std::cout << "\n"; + } + + std::string k, v; + char* end; + + auto optmatch = [&](std::vector vs) -> bool { return ConfigHelper::check_opt_in_list(k, vs); }; + auto is_enabled = [&](auto& v) -> bool { return v == "on" or v == "ON"; }; + + for (auto kv : opts) { + k = kv.first; + v = kv.second; + + if (optmatch({"type", "dtype"})) { + ConfigHelper::check_dtype(v, false); + ctx->dtype = v; + } + else if (optmatch({"eb", "errorbound"})) { + 
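// The error bound is parsed as a floating-point value. As an illustration only (the
// composite string here is a hypothetical example, not taken from the project docs),
// a control string such as "eb=1e-4,mode=r2r,cap=1024,predictor=lorenzo" would set
// ctx->eb, ctx->mode, ctx->dict_size (plus the derived ctx->radius) and ctx->predictor
// through the corresponding branches of this key-value dispatch.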
ctx->eb = StrHelper::str2fp(v); + } + else if (optmatch({"mode"})) { + ConfigHelper::check_cuszmode(v, true); + ctx->mode = v; + } + else if (optmatch({"len", "length"})) { + cuszCTX::parse_input_length(v.c_str(), ctx); + } + else if (optmatch({"alloclen"})) { + ctx->alloclen.len = StrHelper::str2int(v); + } + else if (optmatch({"demo"})) { + ctx->use.predefined_demo = true; + ctx->demo_dataset = std::string(v); + ctx->load_demo_sizes(); + } + else if (optmatch({"cap", "booklen", "dictsize"})) { + ctx->dict_size = StrHelper::str2int(v); + ctx->radius = ctx->dict_size / 2; + } + else if (optmatch({"radius"})) { + ctx->radius = StrHelper::str2int(v); + ctx->dict_size = ctx->radius * 2; + } + else if (optmatch({"huffbyte"})) { + ctx->huff_bytewidth = StrHelper::str2int(v); + ctx->codecs_in_use = ctx->codec_force_fallback() ? 0b11 /*use both*/ : 0b01 /*use 4-byte*/; + } + else if (optmatch({"huffchunk"})) { + ctx->vle_sublen = StrHelper::str2int(v); + ctx->use.autotune_vle_pardeg = false; + } + else if (optmatch({"predictor"})) { + ctx->predictor = std::string(v); + } + else if (optmatch({"codec"})) { + // placeholder + } + else if (optmatch({"spcodec"})) { + // placeholder + } + else if (optmatch({"anchor"}) and is_enabled(v)) { + ctx->use.anchor = true; + } + else if (optmatch({"nondestructive"}) and is_enabled(v)) { + // placeholder + } + else if (optmatch({"failfast"}) and is_enabled(v)) { + // placeholder + } + else if (optmatch({"releaseinput"}) and is_enabled(v)) { + ctx->use.release_input = true; + } + else if (optmatch({"pipeline"})) { + ctx->pipeline = v; + } + else if (optmatch({"density"})) { // refer to `SparseMethodSetup` in `config.hh` + ctx->nz_density = StrHelper::str2fp(v); + ctx->nz_density_factor = 1 / ctx->nz_density; + } + else if (optmatch({"densityfactor"})) { // refer to `SparseMethodSetup` in `config.hh` + ctx->nz_density_factor = StrHelper::str2fp(v); + ctx->nz_density = 1 / ctx->nz_density_factor; + } + else if (optmatch({"gpuverify"}) and is_enabled(v)) { + ctx->use.gpu_verify = true; + } + + // when to enable anchor + if (ctx->predictor == "spline3") { + // unconditionally use anchor when it is spline3 + ctx->use.anchor = true; + } + } +} + +void set_from_cli_input(cusz::context_t ctx, int const argc, char** const argv) +{ + int i = 1; + + auto check_next = [&]() { + if (i + 1 >= argc) throw std::runtime_error("out-of-range at" + std::string(argv[i])); + }; + + std::string opt; + auto optmatch = [&](std::vector vs) -> bool { return ConfigHelper::check_opt_in_list(opt, vs); }; + + while (i < argc) { + if (argv[i][0] == '-') { + opt = std::string(argv[i]); + + if (optmatch({"-c", "--config"})) { + check_next(); + set_config(ctx, argv[++i]); + } + else if (optmatch({"-R", "--report"})) { + check_next(); + set_report(ctx, argv[++i]); + } + else if (optmatch({"-h", "--help"})) { + cusz::Context::print_doc(true); + exit(0); + } + else if (optmatch({"-v", "--version"})) { + std::cout << ">>>> cusz build: " << cusz::VERSION_TEXT << "\n"; + exit(0); + } + else if (optmatch({"-m", "--mode"})) { + check_next(); + ctx->mode = std::string(argv[++i]); + if (ctx->mode == "r2r") ctx->preprocess.prescan = true; + } + else if (optmatch({"-e", "--eb", "--error-bound"})) { + check_next(); + char* end; + ctx->eb = std::strtod(argv[++i], &end); + } + else if (optmatch({"-p", "--predictor"})) { + check_next(); + ctx->predictor = std::string(argv[++i]); + } + else if (optmatch({"-c", "--codec"})) { + check_next(); + // placeholder + } + else if (optmatch({"-s", "--spcodec"})) { + 
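// Note on the cap/radius coupling handled in set_config above: the two keys derive each
// other, e.g. "cap=1024" yields dict_size = 1024 and radius = 1024 / 2 = 512, while
// "radius=512" yields dict_size = 2 * 512 = 1024.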
check_next(); + // placeholder + } + else if (optmatch({"-t", "--type", "--dtype"})) { + check_next(); + std::string s = std::string(std::string(argv[++i])); + if (s == "f32" or s == "fp4") + ctx->dtype = "f32"; + else if (s == "f64" or s == "fp8") + ctx->dtype = "f64"; + } + else if (optmatch({"-i", "--input"})) { + check_next(); + ctx->fname.fname = std::string(argv[++i]); + } + else if (optmatch({"-l", "--len"})) { + check_next(); + cusz::Context::parse_input_length(argv[++i], ctx); + } + else if (optmatch({"-L", "--allocation-len"})) { + check_next(); + // placeholder + } + else if (optmatch({"-z", "--zip", "--compress"})) { + ctx->cli_task.construct = true; + } + else if (optmatch({"-x", "--unzip", "--decompress"})) { + ctx->cli_task.reconstruct = true; + } + else if (optmatch({"-r", "--dry-run"})) { + ctx->cli_task.dryrun = true; + } + else if (optmatch({"--anchor"})) { + ctx->use.anchor = true; + } + else if (optmatch({"--nondestructive", "--input-nondestructive"})) { + // placeholder + } + else if (optmatch({"--failfast"})) { + // placeholder + } + else if (optmatch({"-P", "--pre", "--preprocess"})) { + check_next(); + std::string pre(argv[++i]); + if (pre.find("binning") != std::string::npos) { ctx->preprocess.binning = true; } + } + else if (optmatch({"-T", "--post", "--postprocess"})) { + check_next(); + std::string post(argv[++i]); + if (post.find("gzip") != std::string::npos) { ctx->postcompress.cpu_gzip = true; } + if (post.find("nvcomp") != std::string::npos) { ctx->postcompress.gpu_nvcomp_cascade = true; } + } + else if (optmatch({"-V", "--verbose"})) { + ctx->verbose = true; + } + else if (optmatch({"--pipeline"})) { + check_next(); + ctx->pipeline = std::string(argv[++i]); + } + else if (optmatch({"--demo"})) { + check_next(); + ctx->use.predefined_demo = true; + ctx->demo_dataset = std::string(argv[++i]); + ctx->load_demo_sizes(); + } + else if (optmatch({"-S", "-X", "--skip", "--exclude"})) { + check_next(); + std::string exclude(argv[++i]); + if (exclude.find("huffman") != std::string::npos) { ctx->skip.huffman = true; } + if (exclude.find("write2disk") != std::string::npos) { ctx->skip.write2disk = true; } + } + else if (optmatch({"--opath"})) { + check_next(); + ctx->opath = std::string(argv[++i]); + } + else if (optmatch({"--origin", "--compare"})) { + check_next(); + ctx->fname.origin_cmp = std::string(argv[++i]); + } + else { + const char* notif_prefix = "invalid option value at position "; + char* notif; + int size = asprintf(¬if, "%d: %s", i, argv[i]); + cerr << LOG_ERR << notif_prefix << "\e[1m" << notif << "\e[0m" + << "\n"; + cerr << std::string(LOG_NULL.length() + strlen(notif_prefix), ' '); + cerr << "\e[1m"; + cerr << std::string(strlen(notif), '~'); + cerr << "\e[0m\n"; + + ctx->trap(-1); + } + } + else { + const char* notif_prefix = "invalid option at position "; + char* notif; + int size = asprintf(¬if, "%d: %s", i, argv[i]); + cerr << LOG_ERR << notif_prefix << "\e[1m" << notif + << "\e[0m" + "\n" + << std::string(LOG_NULL.length() + strlen(notif_prefix), ' ') // + << "\e[1m" // + << std::string(strlen(notif), '~') // + << "\e[0m\n"; + + ctx->trap(-1); + } + i++; + } +} + +} // namespace + +cuszCTX& cuszCTX::set_control_string(const char* in_str) +{ + set_config(this, in_str); + return *this; +} + +void cuszCTX::load_demo_sizes() +{ + const std::unordered_map> dataset_entries = { + {std::string("hacc"), {280953867, 1, 1, 1, 1}}, {std::string("hacc1b"), {1073726487, 1, 1, 1, 1}}, + {std::string("cesm"), {3600, 1800, 1, 1, 2}}, {std::string("hurricane"), 
{500, 500, 100, 1, 3}}, + {std::string("nyx-s"), {512, 512, 512, 1, 3}}, {std::string("nyx-m"), {1024, 1024, 1024, 1, 3}}, + {std::string("qmc"), {288, 69, 7935, 1, 3}}, {std::string("qmcpre"), {69, 69, 33120, 1, 3}}, + {std::string("exafel"), {388, 59200, 1, 1, 2}}, {std::string("rtm"), {235, 849, 849, 1, 3}}, + {std::string("parihaka"), {1168, 1126, 922, 1, 3}}}; + + if (not demo_dataset.empty()) { + auto f = dataset_entries.find(demo_dataset); + if (f == dataset_entries.end()) throw std::runtime_error("no such dataset as" + demo_dataset); + auto demo_xyzw = f->second; + + x = demo_xyzw[0], y = demo_xyzw[1], z = demo_xyzw[2], w = demo_xyzw[3]; + ndim = demo_xyzw[4]; + } + data_len = x * y * z * w; +} + +void cuszCTX::trap(int _status) { this->read_args_status = _status; } + +void cuszCTX::validate() +{ + bool to_abort = false; + if (fname.fname.empty()) { + cerr << LOG_ERR << "must specify input file" << endl; + to_abort = true; + } + + if (data_len == 1 and not use.predefined_demo) { + if (cli_task.construct or cli_task.dryrun) { + cerr << LOG_ERR << "wrong input size" << endl; + to_abort = true; + } + } + if (not cli_task.construct and not cli_task.reconstruct and not cli_task.dryrun) { + cerr << LOG_ERR << "select compress (-z), decompress (-x) or dry-run (-r)" << endl; + to_abort = true; + } + if (false == ConfigHelper::check_dtype(dtype, false)) { + if (cli_task.construct or cli_task.dryrun) { + std::cout << dtype << endl; + cerr << LOG_ERR << "must specify data type" << endl; + to_abort = true; + } + } + + if (quant_bytewidth == 1) + assert(dict_size <= 256); + else if (quant_bytewidth == 2) + assert(dict_size <= 65536); + + if (cli_task.dryrun and cli_task.construct and cli_task.reconstruct) { + cerr << LOG_WARN << "no need to dry-run, compress and decompress at the same time" << endl; + cerr << LOG_WARN << "dryrun only" << endl << endl; + cli_task.construct = false; + cli_task.reconstruct = false; + } + else if (cli_task.dryrun and cli_task.construct) { + cerr << LOG_WARN << "no need to dry-run and compress at the same time" << endl; + cerr << LOG_WARN << "dryrun only" << endl << endl; + cli_task.construct = false; + } + else if (cli_task.dryrun and cli_task.reconstruct) { + cerr << LOG_WARN << "no need to dry-run and decompress at the same time" << endl; + cerr << LOG_WARN << "will dryrun only" << endl << endl; + cli_task.reconstruct = false; + } + + if (to_abort) { + print_doc(); + exit(-1); + } +} + +cuszCTX::cuszCTX(int argc, char** const argv) +{ + std::string opt; + auto optmatch = [&](std::vector vs) -> bool { return ConfigHelper::check_opt_in_list(opt, vs); }; + + if (argc == 1) { + print_doc(); + exit(0); + } + + /******************************************************************************/ + /* phase 0: parse */ + set_from_cli_input(this, argc, argv); + + // special treatment + if (predictor == "spline3") { + // unconditionally use anchor when it is spline3 + use.anchor = true; + } + + /******************************************************************************/ + /* phase 1: check syntax */ + if (read_args_status != 0) { + std::cout << LOG_INFO << "Exiting..." 
<< endl; + // after printing ALL argument errors + exit(-1); + } + + /******************************************************************************/ + /* phase 2: check if legal */ + validate(); + + /******************************************************************************/ + /* phase 3: sort out filenames */ + derive_fnames(); +} + +cuszCTX::cuszCTX(const char* in_str, bool dbg_print) +{ + /** + ** >>> syntax + ** comma-separated key-pairs + ** "key1=val1,key2=val2[,...]" + ** + ** >>> example + ** "predictor=lorenzo,size=3600x1800" + ** + **/ + + set_config(this, in_str, dbg_print); +} + +void cuszCTX::print_doc(bool full) +{ + std::cout << "\n>>>> cusz build: " << cusz::VERSION_TEXT << "\n"; + + if (full) + std::cout << StrHelper::doc_format(cusz_full_doc) << std::endl; + else + std::cout << cusz_short_doc << std::endl; +} + +void cuszCTX::derive_fnames() +{ + // (1) "fname" -> "", "fname" + // (2) "./fname" -> "./" "fname" + // (3) "/path/to/fname" -> "/path/to", "fname" + auto input_path = fname.fname.substr(0, fname.fname.rfind('/') + 1); + if (not cli_task.construct and cli_task.reconstruct) fname.fname = fname.fname.substr(0, fname.fname.rfind('.')); + fname.basename = fname.fname.substr(fname.fname.rfind('/') + 1); + + if (opath.empty()) opath = input_path.empty() ? opath = "" : opath = input_path; + opath += "/"; + + fname.path_basename = opath + fname.basename; + fname.compress_output = fname.path_basename + ".cusza"; +} diff --git a/qtensor/compression/cusz/src/cusz/custom.cc b/qtensor/compression/cusz/src/cusz/custom.cc index ad9eff89..6717e842 100644 --- a/qtensor/compression/cusz/src/cusz/custom.cc +++ b/qtensor/compression/cusz/src/cusz/custom.cc @@ -1,34 +1,34 @@ -/** - * @file custom.cc - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-04-30 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#include "cusz/custom.h" - -extern "C" { - -cusz_custom_predictor cusz_default_predictor() { return {LorenzoI, false, false}; } -cusz_custom_quantization cusz_default_quantization() { return {512, false}; } -cusz_custom_codec cusz_default_codec() { return {Huffman, true, 0.5}; } -cusz_custom_huffman_codec cusz_default_huffman_codec() { return {Canonical, Device, Coarse, 1024, 768}; } -cusz_custom_spcodec cusz_default_spcodec() { return {SparseMat, 0.2}; } -cusz_custom_framework* cusz_default_framework() -{ - return new cusz_custom_framework{ - FP32, // placeholder; set in another function call - Auto, cusz_default_predictor(), cusz_default_quantization(), cusz_default_codec(), - // cusz_default_spcodec(), - cusz_default_huffman_codec()}; -} - -void cusz_set_datatype(cusz_custom_framework* config, cusz_datatype datatype) { config->datatype = datatype; } -void cusz_set_pipelinetype(cusz_custom_framework* config, cusz_pipelinetype pipeline) { config->pipeline = pipeline; } - -// end of extern C -} +/** + * @file custom.cc + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-30 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#include "cusz/custom.h" + +extern "C" { + +cusz_custom_predictor cusz_default_predictor() { return {LorenzoI, false, false}; } +cusz_custom_quantization cusz_default_quantization() { return {512, false}; } +cusz_custom_codec cusz_default_codec() { return {Huffman, true, 0.5}; } +cusz_custom_huffman_codec cusz_default_huffman_codec() { return {Canonical, Device, Coarse, 1024, 768}; } +cusz_custom_spcodec cusz_default_spcodec() { return {SparseMat, 
0.2}; } +cusz_custom_framework* cusz_default_framework() +{ + return new cusz_custom_framework{ + FP32, // placeholder; set in another function call + Auto, cusz_default_predictor(), cusz_default_quantization(), cusz_default_codec(), + // cusz_default_spcodec(), + cusz_default_huffman_codec()}; +} + +void cusz_set_datatype(cusz_custom_framework* config, cusz_datatype datatype) { config->datatype = datatype; } +void cusz_set_pipelinetype(cusz_custom_framework* config, cusz_pipelinetype pipeline) { config->pipeline = pipeline; } + +// end of extern C +} diff --git a/qtensor/compression/cusz/src/cusz_lib.cc b/qtensor/compression/cusz/src/cusz_lib.cc index 723b80b1..d6bad3c6 100644 --- a/qtensor/compression/cusz/src/cusz_lib.cc +++ b/qtensor/compression/cusz/src/cusz_lib.cc @@ -1,115 +1,115 @@ -/** - * @file cusz_lib.cc - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-05-01 - * (rev.1) 2023-01-29 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#include - -#include -#include - -#include "component.hh" -#include "compressor.hh" -#include "context.hh" -#include "cusz.h" -#include "cusz/custom.h" -#include "cusz/type.h" -#include "framework.hh" - -cusz_compressor* cusz_create(cusz_framework* _framework, cusz_datatype _type) -{ - auto comp = new cusz_compressor{.framework = _framework, .type = _type}; - - if (comp->type == FP32) { - using DATA = float; - using Compressor = cusz::CompressorFP32; - - comp->compressor = new Compressor(); - } - else { - throw std::runtime_error("Type is not supported."); - } - - return comp; -} - -cusz_error_status cusz_release(cusz_compressor* comp) -{ - delete comp; - return CUSZ_SUCCESS; -} - -cusz_error_status cusz_compress( - cusz_compressor* comp, - cusz_config* config, - void* uncompressed, - cusz_len const uncomp_len, - uint8_t** compressed, - size_t* comp_bytes, - cusz_header* header, - void* record, - cudaStream_t stream) -{ - // cusz::TimeRecord cpp_record; - - auto context = new cusz_context(); - (*context) - .set_len(uncomp_len.x, uncomp_len.y, uncomp_len.z, uncomp_len.w) - .set_eb(config->eb) - .set_control_string(config->eb == Rel ? "mode=r2r" : "mode=abs"); - - // Be cautious of autotuning! The default value of pardeg is not robust. 
- cusz::CompressorHelper::autotune_coarse_parvle(static_cast(context)); - - if (comp->type == FP32) { - using DATA = float; - using Compressor = cusz::CompressorFP32; - - // TODO add memlen & datalen comparison - static_cast(comp->compressor)->init(context); - static_cast(comp->compressor) - ->compress(context, static_cast(uncompressed), *compressed, *comp_bytes, stream); - static_cast(comp->compressor)->export_header(*header); - static_cast(comp->compressor)->export_timerecord((cusz::TimeRecord*)record); - } - else { - throw std::runtime_error(std::string(__FUNCTION__) + ": Type is not supported."); - } - - return CUSZ_SUCCESS; -} - -cusz_error_status cusz_decompress( - cusz_compressor* comp, - cusz_header* header, - uint8_t* compressed, - size_t const comp_len, - void* decompressed, - cusz_len const decomp_len, - void* record, - cudaStream_t stream) -{ - // cusz::TimeRecord cpp_record; - - if (comp->type == FP32) { - using DATA = float; - using Compressor = cusz::CompressorFP32; - - static_cast(comp->compressor)->init(header); - static_cast(comp->compressor) - ->decompress(header, compressed, static_cast(decompressed), stream); - static_cast(comp->compressor)->export_timerecord((cusz::TimeRecord*)record); - } - else { - throw std::runtime_error(std::string(__FUNCTION__) + ": Type is not supported."); - } - - return CUSZ_SUCCESS; +/** + * @file cusz_lib.cc + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-05-01 + * (rev.1) 2023-01-29 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#include + +#include +#include + +#include "component.hh" +#include "compressor.hh" +#include "context.hh" +#include "cusz.h" +#include "cusz/custom.h" +#include "cusz/type.h" +#include "framework.hh" + +cusz_compressor* cusz_create(cusz_framework* _framework, cusz_datatype _type) +{ + auto comp = new cusz_compressor{.framework = _framework, .type = _type}; + + if (comp->type == FP32) { + using DATA = float; + using Compressor = cusz::CompressorFP32; + + comp->compressor = new Compressor(); + } + else { + throw std::runtime_error("Type is not supported."); + } + + return comp; +} + +cusz_error_status cusz_release(cusz_compressor* comp) +{ + delete comp; + return CUSZ_SUCCESS; +} + +cusz_error_status cusz_compress( + cusz_compressor* comp, + cusz_config* config, + void* uncompressed, + cusz_len const uncomp_len, + uint8_t** compressed, + size_t* comp_bytes, + cusz_header* header, + void* record, + cudaStream_t stream) +{ + // cusz::TimeRecord cpp_record; + + auto context = new cusz_context(); + (*context) + .set_len(uncomp_len.x, uncomp_len.y, uncomp_len.z, uncomp_len.w) + .set_eb(config->eb) + .set_control_string(config->eb == Rel ? "mode=r2r" : "mode=abs"); + + // Be cautious of autotuning! The default value of pardeg is not robust. 
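// Sketch of the flow from here on, based only on the code below: the freshly built context is
// auto-tuned first -- autotune_coarse_parvle appears to pick the coarse-grained Huffman/VLE
// parallel degree from the input length instead of keeping the default -- and, for FP32 input,
// the type-erased compressor is then driven as init(context) -> compress(...) ->
// export_header(*header) -> export_timerecord(record).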
+ cusz::CompressorHelper::autotune_coarse_parvle(static_cast(context)); + + if (comp->type == FP32) { + using DATA = float; + using Compressor = cusz::CompressorFP32; + + // TODO add memlen & datalen comparison + static_cast(comp->compressor)->init(context); + static_cast(comp->compressor) + ->compress(context, static_cast(uncompressed), *compressed, *comp_bytes, stream); + static_cast(comp->compressor)->export_header(*header); + static_cast(comp->compressor)->export_timerecord((cusz::TimeRecord*)record); + } + else { + throw std::runtime_error(std::string(__FUNCTION__) + ": Type is not supported."); + } + + return CUSZ_SUCCESS; +} + +cusz_error_status cusz_decompress( + cusz_compressor* comp, + cusz_header* header, + uint8_t* compressed, + size_t const comp_len, + void* decompressed, + cusz_len const decomp_len, + void* record, + cudaStream_t stream) +{ + // cusz::TimeRecord cpp_record; + + if (comp->type == FP32) { + using DATA = float; + using Compressor = cusz::CompressorFP32; + + static_cast(comp->compressor)->init(header); + static_cast(comp->compressor) + ->decompress(header, compressed, static_cast(decompressed), stream); + static_cast(comp->compressor)->export_timerecord((cusz::TimeRecord*)record); + } + else { + throw std::runtime_error(std::string(__FUNCTION__) + ": Type is not supported."); + } + + return CUSZ_SUCCESS; } \ No newline at end of file diff --git a/qtensor/compression/cusz/src/cusz_version.h.in b/qtensor/compression/cusz/src/cusz_version.h.in index 1bd3344f..09a2d3d7 100644 --- a/qtensor/compression/cusz/src/cusz_version.h.in +++ b/qtensor/compression/cusz/src/cusz_version.h.in @@ -1,3 +1,3 @@ -#define CUSZ_MAJOR_VERSION @PROJECT_VERSION_MAJOR@ -#define CUSZ_MINOR_VERSION @PROJECT_VERSION_MINOR@ -#define CUSZ_PATCH_VERSION @PROJECT_VERSION_PATCH@ +#define CUSZ_MAJOR_VERSION @PROJECT_VERSION_MAJOR@ +#define CUSZ_MINOR_VERSION @PROJECT_VERSION_MINOR@ +#define CUSZ_PATCH_VERSION @PROJECT_VERSION_PATCH@ diff --git a/qtensor/compression/cusz/src/cusz_wrapper.cu b/qtensor/compression/cusz/src/cusz_wrapper.cu index 2827123d..a9b1f760 100644 --- a/qtensor/compression/cusz/src/cusz_wrapper.cu +++ b/qtensor/compression/cusz/src/cusz_wrapper.cu @@ -1,154 +1,154 @@ -//#include "cuszx_entry.h" -//#include "szx_defines.h" -//#include "szx_BytesToolkit.h" -//#include "szx_TypeManager.h" -//#include "timingGPU.h" - -#include "cusz.h" -#include "cli/quality_viewer.hh" -#include "cli/timerecord_viewer.hh" -#include "utils/io.hh" -#include "utils/print_gpu.hh" - -// template -extern "C"{ -unsigned char* cusz_device_compress(float *data, float r2r_error,size_t len,size_t *outSize) -{ - /* For demo, we use 3600x1800 CESM data. */ - - cusz_header header; - uint8_t* exposed_compressed; - uint8_t* compressed; - size_t compressed_len; - - float *d_uncompressed, *h_uncompressed; - float *d_decompressed, *h_decompressed; - - d_uncompressed = data; - - cudaStream_t stream; - cudaStreamCreate(&stream); - - // using default - // cusz_framework* framework = cusz_default_framework(); - // alternatively - cusz_framework fw = cusz_framework{ - .pipeline = Auto, - .predictor = cusz_custom_predictor{.type = LorenzoI}, - .quantization = cusz_custom_quantization{.radius = 512}, - .codec = cusz_custom_codec{.type = Huffman}}; - cusz_framework* framework = &fw; - - // Brace initializing a struct pointer is not supported by all host compilers - // when nvcc forwards. 
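// Stepping back, the rest of this wrapper exercises the C API end to end:
// cusz_create(framework, FP32), cusz_compress(...) into a compressor-owned buffer
// (`exposed_compressed`), a device-to-device cudaMemcpy into caller-owned memory followed by
// freeing the exposed buffer, and stream teardown; decompression mirrors this with
// cusz_decompress(...) writing into a pre-allocated device array.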
- // cusz_framework* framework = new cusz_framework{ - // .pipeline = Auto, - // .predictor = cusz_custom_predictor{.type = LorenzoI}, - // .quantization = cusz_custom_quantization{.radius = 512}, - // .codec = cusz_custom_codec{.type = Huffman}}; - - - cusz_compressor* comp = cusz_create(framework, FP32); - cusz_config* config = new cusz_config{.eb = r2r_error, .mode = Rel}; - cusz_len uncomp_len = cusz_len{len, 1, 1, 1}; // x, y, z, w - cusz_len decomp_len = uncomp_len; - - cusz::TimeRecord compress_timerecord; - - - { - cusz_compress( - comp, config, d_uncompressed, uncomp_len, &exposed_compressed, &compressed_len, &header, - (void*)&compress_timerecord, stream); - - /* User can interpret the collected time information in other ways. */ - cusz::TimeRecordViewer::view_compression(&compress_timerecord, len * sizeof(float), compressed_len); - - /* verify header */ - printf("header.%-*s : %x\n", 12, "(addr)", &header); - printf("header.%-*s : %lu, %lu, %lu\n", 12, "{x,y,z}", header.x, header.y, header.z); - printf("header.%-*s : %lu\n", 12, "filesize", ConfigHelper::get_filesize(&header)); - } - - /* If needed, User should perform a memcopy to transfer `exposed_compressed` before `compressor` is destroyed. */ - cudaMalloc(&compressed, compressed_len); - cudaMemcpy(compressed, exposed_compressed, compressed_len, cudaMemcpyDeviceToDevice); - cudaFree(exposed_compressed); - cudaStreamDestroy(stream); - *outSize = compressed_len; - return compressed; -} - -float* cusz_device_decompress(uint8_t* cmpbytes, size_t len, size_t compressed_len, float r2r_error){ - cusz::TimeRecord decompress_timerecord; - cudaStream_t stream; - cusz_header header; - float* d_decompressed; - cudaMalloc(&d_decompressed, sizeof(float) * len); - - cusz_framework fw = cusz_framework{ - .pipeline = Auto, - .predictor = cusz_custom_predictor{.type = LorenzoI}, - .quantization = cusz_custom_quantization{.radius = 512}, - .codec = cusz_custom_codec{.type = Huffman}}; - cusz_framework* framework = &fw; - - cusz_compressor* comp = cusz_create(framework, FP32); - cusz_config* config = new cusz_config{.eb = r2r_error, .mode = Rel}; - cusz_len uncomp_len = cusz_len{len, 1, 1, 1}; // x, y, z, w - cusz_len decomp_len = uncomp_len; - - - cudaStreamCreate(&stream); - { - cusz_decompress( - comp, &header, cmpbytes, compressed_len, d_decompressed, decomp_len, - (void*)&decompress_timerecord, stream); - - cusz::TimeRecordViewer::view_decompression(&decompress_timerecord, len * sizeof(float)); - } - - - cusz_release(comp); - - // cudaFree(cmpbytes); - cudaStreamDestroy(stream); - return d_decompressed; -} - - - // unsigned char* cuSZx_integrated_compress(float *data, float r2r_threshold, float r2r_err, size_t nbEle, int blockSize, size_t *outSize){ - // float max,min; - // unsigned char* bytes; - // max = data[0]; - // min = data[0]; - // for (size_t i = 0; i < nbEle; i++) - // { - // if(data[i] > max) max = data[i]; - // if(data[i] < min) min = data[i]; - // } - - // float threshold = r2r_threshold*(max-min); - // float errBound = r2r_err*(max-min); - // bytes = cuSZx_fast_compress_args_unpredictable_blocked_float(data, outSize, errBound, nbEle, blockSize, threshold); - // // printf("outSize %p\n", bytes); - // return bytes; - // } - - // float* cuSZx_integrated_decompress(unsigned char *bytes, size_t nbEle){ - // // printf("test\n"); - // float**data; - // cuSZx_fast_decompress_args_unpredictable_blocked_float(data, nbEle, bytes); - // return *data; - // } - - // unsigned char* cuSZx_device_compress(float *oriData, size_t *outSize, 
float absErrBound, size_t nbEle, int blockSize, float threshold){ - // return device_ptr_cuSZx_compress_float(oriData, outSize, absErrBound, nbEle, blockSize, threshold); - // } - - // float* cuSZx_device_decompress(size_t nbEle, unsigned char* cmpBytes){ - // return device_ptr_cuSZx_decompress_float(nbEle, cmpBytes); - // } - - -} +//#include "cuszx_entry.h" +//#include "szx_defines.h" +//#include "szx_BytesToolkit.h" +//#include "szx_TypeManager.h" +//#include "timingGPU.h" + +#include "cusz.h" +#include "cli/quality_viewer.hh" +#include "cli/timerecord_viewer.hh" +#include "utils/io.hh" +#include "utils/print_gpu.hh" + +// template +extern "C"{ +unsigned char* cusz_device_compress(float *data, float r2r_error,size_t len,size_t *outSize) +{ + /* For demo, we use 3600x1800 CESM data. */ + + cusz_header header; + uint8_t* exposed_compressed; + uint8_t* compressed; + size_t compressed_len; + + float *d_uncompressed, *h_uncompressed; + float *d_decompressed, *h_decompressed; + + d_uncompressed = data; + + cudaStream_t stream; + cudaStreamCreate(&stream); + + // using default + // cusz_framework* framework = cusz_default_framework(); + // alternatively + cusz_framework fw = cusz_framework{ + .pipeline = Auto, + .predictor = cusz_custom_predictor{.type = LorenzoI}, + .quantization = cusz_custom_quantization{.radius = 512}, + .codec = cusz_custom_codec{.type = Huffman}}; + cusz_framework* framework = &fw; + + // Brace initializing a struct pointer is not supported by all host compilers + // when nvcc forwards. + // cusz_framework* framework = new cusz_framework{ + // .pipeline = Auto, + // .predictor = cusz_custom_predictor{.type = LorenzoI}, + // .quantization = cusz_custom_quantization{.radius = 512}, + // .codec = cusz_custom_codec{.type = Huffman}}; + + + cusz_compressor* comp = cusz_create(framework, FP32); + cusz_config* config = new cusz_config{.eb = r2r_error, .mode = Rel}; + cusz_len uncomp_len = cusz_len{len, 1, 1, 1}; // x, y, z, w + cusz_len decomp_len = uncomp_len; + + cusz::TimeRecord compress_timerecord; + + + { + cusz_compress( + comp, config, d_uncompressed, uncomp_len, &exposed_compressed, &compressed_len, &header, + (void*)&compress_timerecord, stream); + + /* User can interpret the collected time information in other ways. */ + cusz::TimeRecordViewer::view_compression(&compress_timerecord, len * sizeof(float), compressed_len); + + /* verify header */ + printf("header.%-*s : %x\n", 12, "(addr)", &header); + printf("header.%-*s : %lu, %lu, %lu\n", 12, "{x,y,z}", header.x, header.y, header.z); + printf("header.%-*s : %lu\n", 12, "filesize", ConfigHelper::get_filesize(&header)); + } + + /* If needed, User should perform a memcopy to transfer `exposed_compressed` before `compressor` is destroyed. 
*/ + cudaMalloc(&compressed, compressed_len); + cudaMemcpy(compressed, exposed_compressed, compressed_len, cudaMemcpyDeviceToDevice); + cudaFree(exposed_compressed); + cudaStreamDestroy(stream); + *outSize = compressed_len; + return compressed; +} + +float* cusz_device_decompress(uint8_t* cmpbytes, size_t len, size_t compressed_len, float r2r_error){ + cusz::TimeRecord decompress_timerecord; + cudaStream_t stream; + cusz_header header; + float* d_decompressed; + cudaMalloc(&d_decompressed, sizeof(float) * len); + + cusz_framework fw = cusz_framework{ + .pipeline = Auto, + .predictor = cusz_custom_predictor{.type = LorenzoI}, + .quantization = cusz_custom_quantization{.radius = 512}, + .codec = cusz_custom_codec{.type = Huffman}}; + cusz_framework* framework = &fw; + + cusz_compressor* comp = cusz_create(framework, FP32); + cusz_config* config = new cusz_config{.eb = r2r_error, .mode = Rel}; + cusz_len uncomp_len = cusz_len{len, 1, 1, 1}; // x, y, z, w + cusz_len decomp_len = uncomp_len; + + + cudaStreamCreate(&stream); + { + cusz_decompress( + comp, &header, cmpbytes, compressed_len, d_decompressed, decomp_len, + (void*)&decompress_timerecord, stream); + + cusz::TimeRecordViewer::view_decompression(&decompress_timerecord, len * sizeof(float)); + } + + + cusz_release(comp); + + // cudaFree(cmpbytes); + cudaStreamDestroy(stream); + return d_decompressed; +} + + + // unsigned char* cuSZx_integrated_compress(float *data, float r2r_threshold, float r2r_err, size_t nbEle, int blockSize, size_t *outSize){ + // float max,min; + // unsigned char* bytes; + // max = data[0]; + // min = data[0]; + // for (size_t i = 0; i < nbEle; i++) + // { + // if(data[i] > max) max = data[i]; + // if(data[i] < min) min = data[i]; + // } + + // float threshold = r2r_threshold*(max-min); + // float errBound = r2r_err*(max-min); + // bytes = cuSZx_fast_compress_args_unpredictable_blocked_float(data, outSize, errBound, nbEle, blockSize, threshold); + // // printf("outSize %p\n", bytes); + // return bytes; + // } + + // float* cuSZx_integrated_decompress(unsigned char *bytes, size_t nbEle){ + // // printf("test\n"); + // float**data; + // cuSZx_fast_decompress_args_unpredictable_blocked_float(data, nbEle, bytes); + // return *data; + // } + + // unsigned char* cuSZx_device_compress(float *oriData, size_t *outSize, float absErrBound, size_t nbEle, int blockSize, float threshold){ + // return device_ptr_cuSZx_compress_float(oriData, outSize, absErrBound, nbEle, blockSize, threshold); + // } + + // float* cuSZx_device_decompress(size_t nbEle, unsigned char* cmpBytes){ + // return device_ptr_cuSZx_decompress_float(nbEle, cmpBytes); + // } + + +} diff --git a/qtensor/compression/cusz/src/cusz_wrapper.py b/qtensor/compression/cusz/src/cusz_wrapper.py index e588c492..682bd3e6 100644 --- a/qtensor/compression/cusz/src/cusz_wrapper.py +++ b/qtensor/compression/cusz/src/cusz_wrapper.py @@ -1,173 +1,173 @@ -import numpy as np -import ctypes -from ctypes import * -import random -from qtensor.tools.lazy_import import cupy as cp -import time -import torch - -from pathlib import Path -LIB_PATH = str(Path(__file__).parent/'libcusz_wrapper.so') -CUSZ_PATH = str(Path(__file__).parent/'libcusz.so') -# unsigned char* cuSZx_integrated_compress(float *data, float r2r_threshold, float r2r_err, size_t nbEle, int blockSize, size_t *outSize) - -# unsigned char* cusz_device_compress(float *data, float r2r_error,size_t len,size_t *outSize) - -def get_device_compress(): - dll_base = ctypes.CDLL(CUSZ_PATH, mode=ctypes.RTLD_GLOBAL) - dll = 
ctypes.CDLL(LIB_PATH, mode=ctypes.RTLD_GLOBAL) - func = dll.cusz_device_compress - # Returns: unsigned char *bytes - # Needs: float *data, float r2r_error,size_t len,size_t *outSize - func.argtypes = [POINTER(c_float), c_float, c_size_t, POINTER(c_size_t)] - func.restype = POINTER(c_ubyte) - return func - -# float* cusz_device_decompress(uint8_t* cmpbytes, size_t len, size_t compressed_len, float r2r_error){ - -def get_device_decompress(): - - dll_base = ctypes.CDLL(CUSZ_PATH, mode=ctypes.RTLD_GLOBAL) - dll = ctypes.CDLL(LIB_PATH, mode=ctypes.RTLD_GLOBAL) - func = dll.cusz_device_decompress - # Returns: float *newData - # Needs: size_t nbEle, unsigned char *cmpBytes - func.argtypes = [POINTER(c_ubyte), c_size_t, c_size_t, c_float] - func.restype = POINTER(c_float) - return func - - -def cusz_device_compress(oriData, absErrBound, nbEle, blockSize,threshold): - __cuszx_device_compress = get_device_compress() - #print(nbEle) - ori_nbEle = nbEle - variable = ctypes.c_size_t(0) - outSize = ctypes.pointer(variable) - - oriData = oriData.flatten() - ori_real = oriData.real - ori_imag = oriData.imag - oriData = cp.concatenate((ori_real, ori_imag)) - #nbEle = len(oriData) - sample = oriData[::2] - #print(nbEle) - d = cp.amax(oriData) - cp.amin(oriData) - #print("max min time (s): " +str(time.time()-v_time)) - d = d.get() - if d.dtype == np.complex64: - #d = min(d.real, d.imag) - d = d.real - # absErrBound = absErrBound*(d) - threshold = threshold*(d) - s_1 = time.time() - #print(cp.get_array_module(oriData)) - truth_values = abs(oriData)<=threshold - oriData[truth_values] = 0.0 - - nbEle = oriData.shape[0] - - - oriData_p = ctypes.cast(oriData.data.ptr, ctypes.POINTER(c_float)) - #print("starting") - # float *data, float r2r_error,size_t len,size_t *outSize - o_bytes = __cuszx_device_compress(oriData_p,np.float32(absErrBound), np.ulonglong(nbEle), outSize) - - - return (o_bytes,outSize.contents.value, absErrBound), outSize - - -def cusz_device_decompress(nbEle, cmpBytes, owner, dtype): - __cuszx_device_decompress=get_device_decompress() - (cmpBytes, cmpsize, err_bound) = cmpBytes - - nbEle_p = ctypes.c_size_t(nbEle) - # uint8_t* cmpbytes, size_t len, size_t compressed_len, float r2r_error - newData = __cuszx_device_decompress(cmpBytes,nbEle_p, ctypes.c_size_t(cmpsize), np.float32(err_bound)) - - # decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, num_elements_eff) - # -- Workaround to convert GPU pointer to int - p_decompressed_ptr = ctypes.addressof(newData) - # cast to int64 pointer - # (effectively converting pointer to pointer to addr to pointer to int64) - p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) - decompressed_int = p_decompressed_int.contents - # -- - pointer_for_free = decompressed_int.value - # self.decompressed_own.append(decompressed_int.value) - mem = cp.cuda.UnownedMemory(decompressed_int.value, nbEle*4, owner, device_id=0) - mem_ptr = cp.cuda.memory.MemoryPointer(mem, 0) - #print("mem ptr") - #print(mem_ptr) - arr = cp.ndarray(shape=(nbEle,), dtype=np.float32, memptr=mem_ptr) - - # res = cp.zeros((nbEle,)) - # ## need to convert newData to cupy - # cp.place(res,bitmap,arr) - - c_res = cp.zeros(int(nbEle/2), np.complex64) - c_res.real = arr[0:int(nbEle/2)] - c_res.imag = arr[int(nbEle/2):] - return (c_res, pointer_for_free) - -### Example of device compress/decompress wrapper usage -class Comp(): - def __init__(self): - self.name = "dummy" - -def free_compressed(ptr): - p_ptr = ctypes.addressof(ptr) - p_int = ctypes.cast(p_ptr, 
ctypes.POINTER(ctypes.c_uint64)) - decomp_int = p_int.contents - cp.cuda.runtime.free(decomp_int.value) - - -if __name__ == "__main__": - - DATA_SIZE = int(1024) - MAX_D = 10.0 - MIN_D = -10.0 - RANGE = MAX_D - MIN_D - r2r_threshold = 0.002 - r2r_error = 0.0001 - - in_vector = np.fromfile("all_sample.bin", dtype=np.complex64) - #print(np.max(in_vector)) - DATA_SIZE = len(in_vector) - #range_vr = np.max(in_vector)-np.min(in_vector) - #r2r_threshold = r2r_threshold*range_vr - #r2r_error = r2r_error*range_vr - #in_vector = np.zeros((DATA_SIZE,)) - #for i in range(0,int(DATA_SIZE/4)): - # in_vector[i] = 0.0 - #for i in range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): - # in_vector[i] = 5.0 - #for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): - # in_vector[i] = random.uniform(MIN_D, MAX_D) - #for i in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): - # in_vector[i] = -7.0 - #for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): - # in_vector[i] = 0.001 - - print(DATA_SIZE) - #in_vector = in_vector.astype('float32') - in_vector_gpu = cp.asarray(in_vector) - - # variable = ctypes.c_size_t(0) - # outSize = ctypes.pointer(variable) - for i in range(200): - s_time = time.time() - o_bytes, outSize = cusz_device_compress(in_vector_gpu, r2r_error, DATA_SIZE, 256, r2r_threshold) - print("Time python: "+str(time.time()-s_time)) - print(outSize[0]) - print("Compress Success...starting decompress ") - comp = Comp() - - s_time = time.time() - (d_bytes,ptr )= cusz_device_decompress(DATA_SIZE*2, o_bytes, comp, in_vector_gpu.dtype) - - free_compressed(o_bytes[0]) - cp.cuda.runtime.free(ptr) - print("Time python: "+str(time.time()-s_time)) - #for i in d_bytes: - # print(i) - print("Decompress Success") +import numpy as np +import ctypes +from ctypes import * +import random +from qtensor.tools.lazy_import import cupy as cp +import time +import torch + +from pathlib import Path +LIB_PATH = str(Path(__file__).parent/'libcusz_wrapper.so') +CUSZ_PATH = str(Path(__file__).parent/'libcusz.so') +# unsigned char* cuSZx_integrated_compress(float *data, float r2r_threshold, float r2r_err, size_t nbEle, int blockSize, size_t *outSize) + +# unsigned char* cusz_device_compress(float *data, float r2r_error,size_t len,size_t *outSize) + +def get_device_compress(): + dll_base = ctypes.CDLL(CUSZ_PATH, mode=ctypes.RTLD_GLOBAL) + dll = ctypes.CDLL(LIB_PATH, mode=ctypes.RTLD_GLOBAL) + func = dll.cusz_device_compress + # Returns: unsigned char *bytes + # Needs: float *data, float r2r_error,size_t len,size_t *outSize + func.argtypes = [POINTER(c_float), c_float, c_size_t, POINTER(c_size_t)] + func.restype = POINTER(c_ubyte) + return func + +# float* cusz_device_decompress(uint8_t* cmpbytes, size_t len, size_t compressed_len, float r2r_error){ + +def get_device_decompress(): + + dll_base = ctypes.CDLL(CUSZ_PATH, mode=ctypes.RTLD_GLOBAL) + dll = ctypes.CDLL(LIB_PATH, mode=ctypes.RTLD_GLOBAL) + func = dll.cusz_device_decompress + # Returns: float *newData + # Needs: size_t nbEle, unsigned char *cmpBytes + func.argtypes = [POINTER(c_ubyte), c_size_t, c_size_t, c_float] + func.restype = POINTER(c_float) + return func + + +def cusz_device_compress(oriData, absErrBound, nbEle, blockSize,threshold): + __cuszx_device_compress = get_device_compress() + #print(nbEle) + ori_nbEle = nbEle + variable = ctypes.c_size_t(0) + outSize = ctypes.pointer(variable) + + oriData = oriData.flatten() + ori_real = oriData.real + ori_imag = oriData.imag + oriData = cp.concatenate((ori_real, ori_imag)) + #nbEle = len(oriData) + sample = oriData[::2] + 
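    # Packing note: the complex64 input is split into its real and imaginary parts and
    # concatenated into a single float32 array, so the element count doubles (an array of N
    # complex values is compressed as 2*N floats -- which is why the decompress side is driven
    # with twice the logical length). The threshold is rescaled by the data range below, and
    # entries with |x| <= threshold are zeroed before the buffer is handed to cusz.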
#print(nbEle) + d = cp.amax(oriData) - cp.amin(oriData) + #print("max min time (s): " +str(time.time()-v_time)) + d = d.get() + if d.dtype == np.complex64: + #d = min(d.real, d.imag) + d = d.real + # absErrBound = absErrBound*(d) + threshold = threshold*(d) + s_1 = time.time() + #print(cp.get_array_module(oriData)) + truth_values = abs(oriData)<=threshold + oriData[truth_values] = 0.0 + + nbEle = oriData.shape[0] + + + oriData_p = ctypes.cast(oriData.data.ptr, ctypes.POINTER(c_float)) + #print("starting") + # float *data, float r2r_error,size_t len,size_t *outSize + o_bytes = __cuszx_device_compress(oriData_p,np.float32(absErrBound), np.ulonglong(nbEle), outSize) + + + return (o_bytes,outSize.contents.value, absErrBound), outSize + + +def cusz_device_decompress(nbEle, cmpBytes, owner, dtype): + __cuszx_device_decompress=get_device_decompress() + (cmpBytes, cmpsize, err_bound) = cmpBytes + + nbEle_p = ctypes.c_size_t(nbEle) + # uint8_t* cmpbytes, size_t len, size_t compressed_len, float r2r_error + newData = __cuszx_device_decompress(cmpBytes,nbEle_p, ctypes.c_size_t(cmpsize), np.float32(err_bound)) + + # decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, num_elements_eff) + # -- Workaround to convert GPU pointer to int + p_decompressed_ptr = ctypes.addressof(newData) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + decompressed_int = p_decompressed_int.contents + # -- + pointer_for_free = decompressed_int.value + # self.decompressed_own.append(decompressed_int.value) + mem = cp.cuda.UnownedMemory(decompressed_int.value, nbEle*4, owner, device_id=0) + mem_ptr = cp.cuda.memory.MemoryPointer(mem, 0) + #print("mem ptr") + #print(mem_ptr) + arr = cp.ndarray(shape=(nbEle,), dtype=np.float32, memptr=mem_ptr) + + # res = cp.zeros((nbEle,)) + # ## need to convert newData to cupy + # cp.place(res,bitmap,arr) + + c_res = cp.zeros(int(nbEle/2), np.complex64) + c_res.real = arr[0:int(nbEle/2)] + c_res.imag = arr[int(nbEle/2):] + return (c_res, pointer_for_free) + +### Example of device compress/decompress wrapper usage +class Comp(): + def __init__(self): + self.name = "dummy" + +def free_compressed(ptr): + p_ptr = ctypes.addressof(ptr) + p_int = ctypes.cast(p_ptr, ctypes.POINTER(ctypes.c_uint64)) + decomp_int = p_int.contents + cp.cuda.runtime.free(decomp_int.value) + + +if __name__ == "__main__": + + DATA_SIZE = int(1024) + MAX_D = 10.0 + MIN_D = -10.0 + RANGE = MAX_D - MIN_D + r2r_threshold = 0.002 + r2r_error = 0.0001 + + in_vector = np.fromfile("all_sample.bin", dtype=np.complex64) + #print(np.max(in_vector)) + DATA_SIZE = len(in_vector) + #range_vr = np.max(in_vector)-np.min(in_vector) + #r2r_threshold = r2r_threshold*range_vr + #r2r_error = r2r_error*range_vr + #in_vector = np.zeros((DATA_SIZE,)) + #for i in range(0,int(DATA_SIZE/4)): + # in_vector[i] = 0.0 + #for i in range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): + # in_vector[i] = 5.0 + #for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): + # in_vector[i] = random.uniform(MIN_D, MAX_D) + #for i in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): + # in_vector[i] = -7.0 + #for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): + # in_vector[i] = 0.001 + + print(DATA_SIZE) + #in_vector = in_vector.astype('float32') + in_vector_gpu = cp.asarray(in_vector) + + # variable = ctypes.c_size_t(0) + # outSize = ctypes.pointer(variable) + for i in range(200): + s_time = time.time() + o_bytes, outSize = 
cusz_device_compress(in_vector_gpu, r2r_error, DATA_SIZE, 256, r2r_threshold) + print("Time python: "+str(time.time()-s_time)) + print(outSize[0]) + print("Compress Success...starting decompress ") + comp = Comp() + + s_time = time.time() + (d_bytes,ptr )= cusz_device_decompress(DATA_SIZE*2, o_bytes, comp, in_vector_gpu.dtype) + + free_compressed(o_bytes[0]) + cp.cuda.runtime.free(ptr) + print("Time python: "+str(time.time()-s_time)) + #for i in d_bytes: + # print(i) + print("Decompress Success") diff --git a/qtensor/compression/cusz/src/detail/compare_cpu.inl b/qtensor/compression/cusz/src/detail/compare_cpu.inl index 1617fc38..b09eb558 100644 --- a/qtensor/compression/cusz/src/detail/compare_cpu.inl +++ b/qtensor/compression/cusz/src/detail/compare_cpu.inl @@ -1,109 +1,109 @@ -/** - * @file _compare.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-10-08 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef C0E747B4_066F_4B04_A3D2_00E1A3B7D682 -#define C0E747B4_066F_4B04_A3D2_00E1A3B7D682 - -#include -#include -#include -#include -#include "cusz/type.h" - -namespace psz { -namespace detail { - -template -bool cppstd_identical(T* d1, T* d2, size_t const len) -{ - return std::equal(d1, d1 + len, d2); -} - -template -bool cppstd_error_bounded(T* a, T* b, size_t const len, double const eb, size_t* first_faulty_idx = nullptr) -{ - // debugging - - bool eb_ed = true; - for (size_t i = 0; i < len; i++) { - if (fabs(a[i] - b[i]) > 1.001 * eb) { - if (first_faulty_idx) *first_faulty_idx = i; - return false; - } - } - return true; -} - -template -void cppstd_assess_quality(cusz_stats* s, T* xdata, T* odata, size_t const len) -{ - double max_odata = odata[0], min_odata = odata[0]; - double max_xdata = xdata[0], min_xdata = xdata[0]; - double max_abserr = max_abserr = fabs(xdata[0] - odata[0]); - - double sum_0 = 0, sum_x = 0; - for (size_t i = 0; i < len; i++) sum_0 += odata[i], sum_x += xdata[i]; - - double mean_odata = sum_0 / len, mean_xdata = sum_x / len; - double sum_var_odata = 0, sum_var_xdata = 0, sum_err2 = 0, sum_corr = 0, rel_abserr = 0; - - double max_pwrrel_abserr = 0; - size_t max_abserr_index = 0; - for (size_t i = 0; i < len; i++) { - max_odata = max_odata < odata[i] ? odata[i] : max_odata; - min_odata = min_odata > odata[i] ? odata[i] : min_odata; - - max_xdata = max_xdata < odata[i] ? odata[i] : max_xdata; - min_xdata = min_xdata > xdata[i] ? xdata[i] : min_xdata; - - float abserr = fabs(xdata[i] - odata[i]); - if (odata[i] != 0) { - rel_abserr = abserr / fabs(odata[i]); - max_pwrrel_abserr = max_pwrrel_abserr < rel_abserr ? rel_abserr : max_pwrrel_abserr; - } - max_abserr_index = max_abserr < abserr ? i : max_abserr_index; - max_abserr = max_abserr < abserr ? 
abserr : max_abserr; - sum_corr += (odata[i] - mean_odata) * (xdata[i] - mean_xdata); - sum_var_odata += (odata[i] - mean_odata) * (odata[i] - mean_odata); - sum_var_xdata += (xdata[i] - mean_xdata) * (xdata[i] - mean_xdata); - sum_err2 += abserr * abserr; - } - double std_odata = sqrt(sum_var_odata / len); - double std_xdata = sqrt(sum_var_xdata / len); - double ee = sum_corr / len; - - s->len = len; - - s->odata.max = max_odata; - s->odata.min = min_odata; - s->odata.rng = max_odata - min_odata; - s->odata.std = std_odata; - - s->xdata.max = max_xdata; - s->xdata.min = min_xdata; - s->xdata.rng = max_xdata - min_xdata; - s->xdata.std = std_xdata; - - s->max_err.idx = max_abserr_index; - s->max_err.abs = max_abserr; - s->max_err.rel = max_abserr / s->odata.rng; - s->max_err.pwrrel = max_pwrrel_abserr; - - s->reduced.coeff = ee / std_odata / std_xdata; - s->reduced.MSE = sum_err2 / len; - s->reduced.NRMSE = sqrt(s->reduced.MSE) / s->odata.rng; - s->reduced.PSNR = 20 * log10(s->odata.rng) - 10 * log10(s->reduced.MSE); -} - -} // namespace detail -} // namespace psz - -#endif /* C0E747B4_066F_4B04_A3D2_00E1A3B7D682 */ +/** + * @file _compare.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-08 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef C0E747B4_066F_4B04_A3D2_00E1A3B7D682 +#define C0E747B4_066F_4B04_A3D2_00E1A3B7D682 + +#include +#include +#include +#include +#include "cusz/type.h" + +namespace psz { +namespace detail { + +template +bool cppstd_identical(T* d1, T* d2, size_t const len) +{ + return std::equal(d1, d1 + len, d2); +} + +template +bool cppstd_error_bounded(T* a, T* b, size_t const len, double const eb, size_t* first_faulty_idx = nullptr) +{ + // debugging + + bool eb_ed = true; + for (size_t i = 0; i < len; i++) { + if (fabs(a[i] - b[i]) > 1.001 * eb) { + if (first_faulty_idx) *first_faulty_idx = i; + return false; + } + } + return true; +} + +template +void cppstd_assess_quality(cusz_stats* s, T* xdata, T* odata, size_t const len) +{ + double max_odata = odata[0], min_odata = odata[0]; + double max_xdata = xdata[0], min_xdata = xdata[0]; + double max_abserr = max_abserr = fabs(xdata[0] - odata[0]); + + double sum_0 = 0, sum_x = 0; + for (size_t i = 0; i < len; i++) sum_0 += odata[i], sum_x += xdata[i]; + + double mean_odata = sum_0 / len, mean_xdata = sum_x / len; + double sum_var_odata = 0, sum_var_xdata = 0, sum_err2 = 0, sum_corr = 0, rel_abserr = 0; + + double max_pwrrel_abserr = 0; + size_t max_abserr_index = 0; + for (size_t i = 0; i < len; i++) { + max_odata = max_odata < odata[i] ? odata[i] : max_odata; + min_odata = min_odata > odata[i] ? odata[i] : min_odata; + + max_xdata = max_xdata < odata[i] ? odata[i] : max_xdata; + min_xdata = min_xdata > xdata[i] ? xdata[i] : min_xdata; + + float abserr = fabs(xdata[i] - odata[i]); + if (odata[i] != 0) { + rel_abserr = abserr / fabs(odata[i]); + max_pwrrel_abserr = max_pwrrel_abserr < rel_abserr ? rel_abserr : max_pwrrel_abserr; + } + max_abserr_index = max_abserr < abserr ? i : max_abserr_index; + max_abserr = max_abserr < abserr ? 
abserr : max_abserr; + sum_corr += (odata[i] - mean_odata) * (xdata[i] - mean_xdata); + sum_var_odata += (odata[i] - mean_odata) * (odata[i] - mean_odata); + sum_var_xdata += (xdata[i] - mean_xdata) * (xdata[i] - mean_xdata); + sum_err2 += abserr * abserr; + } + double std_odata = sqrt(sum_var_odata / len); + double std_xdata = sqrt(sum_var_xdata / len); + double ee = sum_corr / len; + + s->len = len; + + s->odata.max = max_odata; + s->odata.min = min_odata; + s->odata.rng = max_odata - min_odata; + s->odata.std = std_odata; + + s->xdata.max = max_xdata; + s->xdata.min = min_xdata; + s->xdata.rng = max_xdata - min_xdata; + s->xdata.std = std_xdata; + + s->max_err.idx = max_abserr_index; + s->max_err.abs = max_abserr; + s->max_err.rel = max_abserr / s->odata.rng; + s->max_err.pwrrel = max_pwrrel_abserr; + + s->reduced.coeff = ee / std_odata / std_xdata; + s->reduced.MSE = sum_err2 / len; + s->reduced.NRMSE = sqrt(s->reduced.MSE) / s->odata.rng; + s->reduced.PSNR = 20 * log10(s->odata.rng) - 10 * log10(s->reduced.MSE); +} + +} // namespace detail +} // namespace psz + +#endif /* C0E747B4_066F_4B04_A3D2_00E1A3B7D682 */ diff --git a/qtensor/compression/cusz/src/detail/compare_gpu.inl b/qtensor/compression/cusz/src/detail/compare_gpu.inl index 12ec3475..851fc4a2 100644 --- a/qtensor/compression/cusz/src/detail/compare_gpu.inl +++ b/qtensor/compression/cusz/src/detail/compare_gpu.inl @@ -1,193 +1,193 @@ -/** - * @file _compare.cuh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-10-08 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef F7DF2FE5_571E_48C1_965D_0B19D1CC14D4 -#define F7DF2FE5_571E_48C1_965D_0B19D1CC14D4 - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "cusz/type.h" - -namespace psz { -namespace detail { - -static const int MINVAL = 0; -static const int MAXVAL = 1; -static const int AVGVAL = 2; -static const int RNG = 3; - -template -bool thrustgpu_identical(T* d1, T* d2, size_t const len) -{ - return thrust::equal(thrust::device, d1, d1 + len, d2); -} - -template -bool thrustgpu_error_bounded(T* a, T* b, size_t const len, double eb, size_t* first_faulty_idx = nullptr) -{ - thrust::device_ptr a_ = thrust::device_pointer_cast(a); - thrust::device_ptr b_ = thrust::device_pointer_cast(b); - thrust::constant_iterator eb_(eb); - using tup = thrust::tuple; - - auto ab_begin = thrust::make_zip_iterator(thrust::make_tuple(a_, b_, eb_)); - auto ab_end = thrust::make_zip_iterator(thrust::make_tuple(a_ + len, b_ + len, eb_)); - - // Let compiler figure out the type. 
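// Both the CPU routine above (cppstd_assess_quality) and the Thrust-based GPU routine in this
// file reduce to the same summary statistics: coeff = cov(odata, xdata) / (std_odata * std_xdata),
// MSE = sum((x_i - o_i)^2) / len, NRMSE = sqrt(MSE) / rng(odata), and
// PSNR = 20 * log10(rng(odata)) - 10 * log10(MSE).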
- auto iter = thrust::find_if(thrust::device, ab_begin, ab_end, [] __device__(tup t) { - // debug use - // if (fabs(thrust::get<1>(t) - thrust::get<0>(t)) > thrust::get<2>(t)) - // printf("a: %f\tb: %f\teb: %lf\n", (float)thrust::get<1>(t), (float)thrust::get<0>(t), thrust::get<2>(t)); - - return fabs(thrust::get<1>(t) - thrust::get<0>(t)) > 1.001 * thrust::get<2>(t); - }); - - if (iter == ab_end) { return true; } - else { - // *first_faulty_idx = iter - ab_begin; - return false; - } -} - -template -void thrustgpu_get_extrema_rawptr(T* d_ptr, size_t len, T res[4]) -{ - thrust::device_ptr g_ptr = thrust::device_pointer_cast(d_ptr); - - auto minel = thrust::min_element(g_ptr, g_ptr + len) - g_ptr; - auto maxel = thrust::max_element(g_ptr, g_ptr + len) - g_ptr; - res[MINVAL] = *(g_ptr + minel); - res[MAXVAL] = *(g_ptr + maxel); - res[RNG] = res[MAXVAL] - res[MINVAL]; - - auto sum = thrust::reduce(g_ptr, g_ptr + len, (T)0.0, thrust::plus()); - res[AVGVAL] = sum / len; -} - -template -void thrustgpu_get_extrema(thrust::device_ptr g_ptr, size_t len, T res[4]) -{ - auto minel = thrust::min_element(g_ptr, g_ptr + len) - g_ptr; - auto maxel = thrust::max_element(g_ptr, g_ptr + len) - g_ptr; - res[MINVAL] = *(g_ptr + minel); - res[MAXVAL] = *(g_ptr + maxel); - res[RNG] = res[MAXVAL] - res[MINVAL]; - - auto sum = thrust::reduce(g_ptr, g_ptr + len, (T)0.0, thrust::plus()); - res[AVGVAL] = sum / len; -} - -template -void thrustgpu_get_maxerr( - T* reconstructed, // in - T* original, // in - size_t len, // in - T& maximum_val, // out - size_t& maximum_loc, // out - bool destructive = false) -{ - T* diff; - - if (destructive) { - diff = original; // aliasing - } - else { - cudaMalloc(&diff, sizeof(T) * len); - } - - auto expr = [=] __device__(T rel, T oel) { return rel - oel; }; - - // typesafe (also with exec-policy binding) - thrust::device_ptr r(reconstructed); - thrust::device_ptr o(original); - thrust::device_ptr d(diff); - - thrust::transform(r, r + len, o, d, expr); - - auto maximum_ptr = thrust::max_element(d, d + len); - maximum_val = *maximum_ptr; - maximum_loc = maximum_ptr - d; - - if (not destructive) { cudaFree(diff); } -} - -template -void thrustgpu_assess_quality(cusz_stats* s, T* xdata, T* odata, size_t len) -{ - using tup = thrust::tuple; - - thrust::device_ptr p_odata = thrust::device_pointer_cast(odata); // origin - thrust::device_ptr p_xdata = thrust::device_pointer_cast(xdata); - - T odata_res[4], xdata_res[4]; - - thrustgpu_get_extrema(p_odata, len, odata_res); - thrustgpu_get_extrema(p_xdata, len, xdata_res); - - auto begin = thrust::make_zip_iterator(thrust::make_tuple(p_odata, p_xdata)); - auto end = thrust::make_zip_iterator(thrust::make_tuple(p_odata + len, p_xdata + len)); - - // clang-format off - auto corr = [=] __host__ __device__(tup t) { return (thrust::get<0>(t) - odata[AVGVAL]) * (thrust::get<1>(t) - xdata[AVGVAL]); }; - auto err2 = [] __host__ __device__(tup t) { T f = thrust::get<0>(t) - thrust::get<1>(t); return f * f; }; - auto var_odata = [=] __host__ __device__(T a) { T f = a - odata[AVGVAL]; return f * f; }; - auto var_xdata = [=] __host__ __device__(T a) { T f = a - xdata[AVGVAL]; return f * f; }; - - auto sum_err2 = thrust::transform_reduce(begin, end, err2, 0.0f, thrust::plus()); - auto sum_corr = thrust::transform_reduce(begin, end, corr, 0.0f, thrust::plus()); - auto sum_var_odata = thrust::transform_reduce(p_odata, p_odata + len, var_odata, 0.0f, thrust::plus()); - auto sum_var_xdata = thrust::transform_reduce(p_xdata, p_xdata + len, var_xdata, 0.0f, 
thrust::plus()); - // clang-format on - - double std_odata = sqrt(sum_var_odata / len); - double std_xdata = sqrt(sum_var_xdata / len); - double ee = sum_corr / len; - - // ----------------------------------------------------------------------------- - T max_abserr{0}; - size_t max_abserr_index{0}; - thrustgpu_get_maxerr(xdata, odata, len, max_abserr, max_abserr_index, false); - // ----------------------------------------------------------------------------- - - s->len = len; - - s->odata.max = odata_res[MAXVAL]; - s->odata.min = odata_res[MINVAL]; - s->odata.rng = odata_res[MAXVAL] - odata_res[MINVAL]; - s->odata.std = std_odata; - - s->xdata.max = xdata_res[MAXVAL]; - s->xdata.min = xdata_res[MINVAL]; - s->xdata.rng = xdata_res[MAXVAL] - xdata_res[MINVAL]; - s->xdata.std = std_xdata; - - s->max_err.idx = max_abserr_index; - s->max_err.abs = max_abserr; - s->max_err.rel = max_abserr / s->odata.rng; - s->max_err.pwrrel = NAN; - - s->reduced.coeff = ee / std_odata / std_xdata; - s->reduced.MSE = sum_err2 / len; - s->reduced.NRMSE = sqrt(s->reduced.MSE) / s->odata.rng; - s->reduced.PSNR = 20 * log10(s->odata.rng) - 10 * log10(s->reduced.MSE); -} - -} // namespace detail -} // namespace psz - -#endif /* F7DF2FE5_571E_48C1_965D_0B19D1CC14D4 */ +/** + * @file _compare.cuh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-08 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef F7DF2FE5_571E_48C1_965D_0B19D1CC14D4 +#define F7DF2FE5_571E_48C1_965D_0B19D1CC14D4 + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cusz/type.h" + +namespace psz { +namespace detail { + +static const int MINVAL = 0; +static const int MAXVAL = 1; +static const int AVGVAL = 2; +static const int RNG = 3; + +template +bool thrustgpu_identical(T* d1, T* d2, size_t const len) +{ + return thrust::equal(thrust::device, d1, d1 + len, d2); +} + +template +bool thrustgpu_error_bounded(T* a, T* b, size_t const len, double eb, size_t* first_faulty_idx = nullptr) +{ + thrust::device_ptr a_ = thrust::device_pointer_cast(a); + thrust::device_ptr b_ = thrust::device_pointer_cast(b); + thrust::constant_iterator eb_(eb); + using tup = thrust::tuple; + + auto ab_begin = thrust::make_zip_iterator(thrust::make_tuple(a_, b_, eb_)); + auto ab_end = thrust::make_zip_iterator(thrust::make_tuple(a_ + len, b_ + len, eb_)); + + // Let compiler figure out the type. 
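The error-bound check that follows is logically equivalent to the host-side loop below (a reference sketch, not part of the patch); the 1.001 factor adds a small tolerance for floating-point round-off in the bound itself:

#include <cmath>
#include <cstddef>

// Host-side sketch of thrustgpu_error_bounded: returns true iff every
// reconstructed value b[i] lies within a slightly relaxed eb of a[i].
template <typename T>
bool error_bounded_host(const T* a, const T* b, std::size_t len, double eb,
                        std::size_t* first_faulty_idx = nullptr)
{
    for (std::size_t i = 0; i < len; ++i) {
        if (std::fabs(static_cast<double>(b[i]) - static_cast<double>(a[i])) > 1.001 * eb) {
            if (first_faulty_idx) *first_faulty_idx = i;
            return false;
        }
    }
    return true;
}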
+ auto iter = thrust::find_if(thrust::device, ab_begin, ab_end, [] __device__(tup t) { + // debug use + // if (fabs(thrust::get<1>(t) - thrust::get<0>(t)) > thrust::get<2>(t)) + // printf("a: %f\tb: %f\teb: %lf\n", (float)thrust::get<1>(t), (float)thrust::get<0>(t), thrust::get<2>(t)); + + return fabs(thrust::get<1>(t) - thrust::get<0>(t)) > 1.001 * thrust::get<2>(t); + }); + + if (iter == ab_end) { return true; } + else { + // *first_faulty_idx = iter - ab_begin; + return false; + } +} + +template +void thrustgpu_get_extrema_rawptr(T* d_ptr, size_t len, T res[4]) +{ + thrust::device_ptr g_ptr = thrust::device_pointer_cast(d_ptr); + + auto minel = thrust::min_element(g_ptr, g_ptr + len) - g_ptr; + auto maxel = thrust::max_element(g_ptr, g_ptr + len) - g_ptr; + res[MINVAL] = *(g_ptr + minel); + res[MAXVAL] = *(g_ptr + maxel); + res[RNG] = res[MAXVAL] - res[MINVAL]; + + auto sum = thrust::reduce(g_ptr, g_ptr + len, (T)0.0, thrust::plus()); + res[AVGVAL] = sum / len; +} + +template +void thrustgpu_get_extrema(thrust::device_ptr g_ptr, size_t len, T res[4]) +{ + auto minel = thrust::min_element(g_ptr, g_ptr + len) - g_ptr; + auto maxel = thrust::max_element(g_ptr, g_ptr + len) - g_ptr; + res[MINVAL] = *(g_ptr + minel); + res[MAXVAL] = *(g_ptr + maxel); + res[RNG] = res[MAXVAL] - res[MINVAL]; + + auto sum = thrust::reduce(g_ptr, g_ptr + len, (T)0.0, thrust::plus()); + res[AVGVAL] = sum / len; +} + +template +void thrustgpu_get_maxerr( + T* reconstructed, // in + T* original, // in + size_t len, // in + T& maximum_val, // out + size_t& maximum_loc, // out + bool destructive = false) +{ + T* diff; + + if (destructive) { + diff = original; // aliasing + } + else { + cudaMalloc(&diff, sizeof(T) * len); + } + + auto expr = [=] __device__(T rel, T oel) { return rel - oel; }; + + // typesafe (also with exec-policy binding) + thrust::device_ptr r(reconstructed); + thrust::device_ptr o(original); + thrust::device_ptr d(diff); + + thrust::transform(r, r + len, o, d, expr); + + auto maximum_ptr = thrust::max_element(d, d + len); + maximum_val = *maximum_ptr; + maximum_loc = maximum_ptr - d; + + if (not destructive) { cudaFree(diff); } +} + +template +void thrustgpu_assess_quality(cusz_stats* s, T* xdata, T* odata, size_t len) +{ + using tup = thrust::tuple; + + thrust::device_ptr p_odata = thrust::device_pointer_cast(odata); // origin + thrust::device_ptr p_xdata = thrust::device_pointer_cast(xdata); + + T odata_res[4], xdata_res[4]; + + thrustgpu_get_extrema(p_odata, len, odata_res); + thrustgpu_get_extrema(p_xdata, len, xdata_res); + + auto begin = thrust::make_zip_iterator(thrust::make_tuple(p_odata, p_xdata)); + auto end = thrust::make_zip_iterator(thrust::make_tuple(p_odata + len, p_xdata + len)); + + // clang-format off + auto corr = [=] __host__ __device__(tup t) { return (thrust::get<0>(t) - odata[AVGVAL]) * (thrust::get<1>(t) - xdata[AVGVAL]); }; + auto err2 = [] __host__ __device__(tup t) { T f = thrust::get<0>(t) - thrust::get<1>(t); return f * f; }; + auto var_odata = [=] __host__ __device__(T a) { T f = a - odata[AVGVAL]; return f * f; }; + auto var_xdata = [=] __host__ __device__(T a) { T f = a - xdata[AVGVAL]; return f * f; }; + + auto sum_err2 = thrust::transform_reduce(begin, end, err2, 0.0f, thrust::plus()); + auto sum_corr = thrust::transform_reduce(begin, end, corr, 0.0f, thrust::plus()); + auto sum_var_odata = thrust::transform_reduce(p_odata, p_odata + len, var_odata, 0.0f, thrust::plus()); + auto sum_var_xdata = thrust::transform_reduce(p_xdata, p_xdata + len, var_xdata, 0.0f, 
thrust::plus()); + // clang-format on + + double std_odata = sqrt(sum_var_odata / len); + double std_xdata = sqrt(sum_var_xdata / len); + double ee = sum_corr / len; + + // ----------------------------------------------------------------------------- + T max_abserr{0}; + size_t max_abserr_index{0}; + thrustgpu_get_maxerr(xdata, odata, len, max_abserr, max_abserr_index, false); + // ----------------------------------------------------------------------------- + + s->len = len; + + s->odata.max = odata_res[MAXVAL]; + s->odata.min = odata_res[MINVAL]; + s->odata.rng = odata_res[MAXVAL] - odata_res[MINVAL]; + s->odata.std = std_odata; + + s->xdata.max = xdata_res[MAXVAL]; + s->xdata.min = xdata_res[MINVAL]; + s->xdata.rng = xdata_res[MAXVAL] - xdata_res[MINVAL]; + s->xdata.std = std_xdata; + + s->max_err.idx = max_abserr_index; + s->max_err.abs = max_abserr; + s->max_err.rel = max_abserr / s->odata.rng; + s->max_err.pwrrel = NAN; + + s->reduced.coeff = ee / std_odata / std_xdata; + s->reduced.MSE = sum_err2 / len; + s->reduced.NRMSE = sqrt(s->reduced.MSE) / s->odata.rng; + s->reduced.PSNR = 20 * log10(s->odata.rng) - 10 * log10(s->reduced.MSE); +} + +} // namespace detail +} // namespace psz + +#endif /* F7DF2FE5_571E_48C1_965D_0B19D1CC14D4 */ diff --git a/qtensor/compression/cusz/src/detail/compressor_impl.cu b/qtensor/compression/cusz/src/detail/compressor_impl.cu index 83b819ae..3974e15b 100644 --- a/qtensor/compression/cusz/src/detail/compressor_impl.cu +++ b/qtensor/compression/cusz/src/detail/compressor_impl.cu @@ -1,18 +1,18 @@ -/** - * @file compressor.cu - * @author Jiannan Tian - * @brief cuSZ compressor of the default path - * @version 0.3 - * @date 2021-10-05 - * (create) 2020-02-12; (release) 2020-09-20; - * (rev.1) 2021-01-16; (rev.2) 2021-07-12; (rev.3) 2021-09-06; (rev.4) 2021-10-05 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#include "compressor_impl.inl" -#include "framework.hh" - -template class cusz::Compressor>::impl; +/** + * @file compressor.cu + * @author Jiannan Tian + * @brief cuSZ compressor of the default path + * @version 0.3 + * @date 2021-10-05 + * (create) 2020-02-12; (release) 2020-09-20; + * (rev.1) 2021-01-16; (rev.2) 2021-07-12; (rev.3) 2021-09-06; (rev.4) 2021-10-05 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include "compressor_impl.inl" +#include "framework.hh" + +template class cusz::Compressor>::impl; diff --git a/qtensor/compression/cusz/src/detail/compressor_impl.inl b/qtensor/compression/cusz/src/detail/compressor_impl.inl index a36f339a..46704ba6 100644 --- a/qtensor/compression/cusz/src/detail/compressor_impl.inl +++ b/qtensor/compression/cusz/src/detail/compressor_impl.inl @@ -1,479 +1,479 @@ -/** - * @file compressor_impl.cuh - * @author Jiannan Tian - * @brief cuSZ compressor of the default path - * @version 0.3 - * @date 2021-10-05 - * (create) 2020-02-12; (release) 2020-09-20; - * (rev.1) 2021-01-16; (rev.2) 2021-07-12; (rev.3) 2021-09-06; (rev.4) 2021-10-05 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#ifndef CUSZ_DEFAULT_PATH_CUH -#define CUSZ_DEFAULT_PATH_CUH - -#include -#include -#include -#include - -#include "component.hh" -#include "compressor.hh" -#include "header.h" 
-#include "kernel/cpplaunch_cuda.hh" -#include "stat/stat_g.hh" -#include "utils/cuda_err.cuh" - -#define DEFINE_DEV(VAR, TYPE) TYPE* d_##VAR{nullptr}; -#define DEFINE_HOST(VAR, TYPE) TYPE* h_##VAR{nullptr}; -#define FREEDEV(VAR) CHECK_CUDA(cudaFree(d_##VAR)); -#define FREEHOST(VAR) CHECK_CUDA(cudaFreeHost(h_##VAR)); - -#define PRINT_ENTRY(VAR) printf("%d %-*s: %'10u\n", (int)Header::VAR, 14, #VAR, header.entry[Header::VAR]); - -#define DEVICE2DEVICE_COPY(VAR, FIELD) \ - if (nbyte[Header::FIELD] != 0 and VAR != nullptr) { \ - auto dst = d_reserved_compressed + header.entry[Header::FIELD]; \ - auto src = reinterpret_cast(VAR); \ - CHECK_CUDA(cudaMemcpyAsync(dst, src, nbyte[Header::FIELD], cudaMemcpyDeviceToDevice, stream)); \ - } - -#define ACCESSOR(SYM, TYPE) reinterpret_cast(in_compressed + header->entry[Header::SYM]) - -namespace cusz { - -constexpr auto kHOST = cusz::LOC::HOST; -constexpr auto kDEVICE = cusz::LOC::DEVICE; -constexpr auto kHOST_DEVICE = cusz::LOC::HOST_DEVICE; - -#define TEMPLATE_TYPE template -#define IMPL Compressor::impl - -TEMPLATE_TYPE -uint32_t IMPL::get_len_data() { return data_len3.x * data_len3.y * data_len3.z; } - -TEMPLATE_TYPE -IMPL::impl() -{ - predictor = new Predictor; - - spcodec = new Spcodec; - codec = new Codec; - fb_codec = new FallbackCodec; -} - -TEMPLATE_TYPE -void IMPL::destroy() -{ - if (spcodec) delete spcodec; - if (codec) delete codec; - if (fb_codec) delete codec; - if (predictor) delete predictor; -} - -TEMPLATE_TYPE -IMPL::~impl() { destroy(); } - -//------------------------------------------------------------------------------ - -// TODO -TEMPLATE_TYPE -void IMPL::init(Context* config, bool dbg_print) { init_detail(config, dbg_print); } - -TEMPLATE_TYPE -void IMPL::init(Header* config, bool dbg_print) { init_detail(config, dbg_print); } - -template -void peek_devdata(T* d_arr, size_t num = 20) -{ - thrust::for_each(thrust::device, d_arr, d_arr + num, [=] __device__ __host__(const T i) { printf("%u\t", i); }); - printf("\n"); -} - -TEMPLATE_TYPE -void IMPL::compress( - Context* config, - T* uncompressed, - BYTE*& compressed, - size_t& compressed_len, - cudaStream_t stream, - bool dbg_print) -{ - auto const eb = config->eb; - auto const radius = config->radius; - auto const pardeg = config->vle_pardeg; - auto const codecs_in_use = config->codecs_in_use; - auto const nz_density_factor = config->nz_density_factor; - - if (dbg_print) { - std::cout << "eb\t" << eb << endl; - std::cout << "radius\t" << radius << endl; - std::cout << "pardeg\t" << pardeg << endl; - std::cout << "codecs_in_use\t" << codecs_in_use << endl; - std::cout << "nz_density_factor\t" << nz_density_factor << endl; - } - - data_len3 = dim3(config->x, config->y, config->z); - auto codec_force_fallback = config->codec_force_fallback(); - - header.codecs_in_use = codecs_in_use; - header.nz_density_factor = nz_density_factor; - - T* d_anchor{nullptr}; // predictor out1 - E* d_errctrl{nullptr}; // predictor out2 - T* d_outlier{nullptr}; // predictor out3 - BYTE* d_spfmt{nullptr}; - size_t spfmt_outlen{0}; - - BYTE* d_codec_out{nullptr}; - size_t codec_outlen{0}; - - size_t data_len, errctrl_len, sublen, spcodec_inlen; - auto booklen = radius * 2; - - auto derive_lengths_after_prediction = [&]() { - data_len = predictor->get_len_data(); - errctrl_len = data_len; - spcodec_inlen = data_len; - sublen = ConfigHelper::get_npart(data_len, pardeg); - - // std::cout << "datalen\t" << data_len << '\n'; - // std::cout << "errctrl_len\t" << errctrl_len << '\n'; - // std::cout << 
"spcodec_inlen\t" << spcodec_inlen << '\n'; - // std::cout << "sublen\t" << sublen << '\n'; - }; - - auto update_header = [&]() { - header.x = data_len3.x; - header.y = data_len3.y; - header.z = data_len3.z; - header.w = 1; // placeholder - header.radius = radius; - header.vle_pardeg = pardeg; - header.eb = eb; - header.byte_vle = use_fallback_codec ? 8 : 4; - }; - - /******************************************************************************/ - - // Prediction is the dependency of the rest procedures. - predictor->construct(LorenzoI, data_len3, uncompressed, &d_anchor, &d_errctrl, &d_outlier, eb, radius, stream); - // peek_devdata(d_errctrl); - - derive_lengths_after_prediction(); - /******************************************************************************/ - - asz::stat::histogram(d_errctrl, errctrl_len, d_freq, booklen, &time_hist, stream); - - /* debug */ CHECK_CUDA(cudaStreamSynchronize(stream)); - - // TODO remove duplicate get_frequency inside encode_with_exception() - encode_with_exception( - d_errctrl, errctrl_len, // input - d_freq, booklen, sublen, pardeg, codec_force_fallback, // config - d_codec_out, codec_outlen, // output - stream, dbg_print); - - (*spcodec).encode(d_outlier, spcodec_inlen, d_spfmt, spfmt_outlen, stream, dbg_print); - - /* debug */ CHECK_CUDA(cudaStreamSynchronize(stream)); - - /******************************************************************************/ - - update_header(); - subfile_collect( - d_anchor, (*predictor).get_len_anchor(), // - d_codec_out, codec_outlen, // - d_spfmt, spfmt_outlen, // - stream, dbg_print); - - // output - compressed_len = ConfigHelper::get_filesize(&header); - compressed = d_reserved_compressed; - - collect_compress_timerecord(); - - // considering that codec can be consecutively in use, and can compress data of different huff-byte - use_fallback_codec = false; -} - -TEMPLATE_TYPE -void IMPL::clear_buffer() -{ // - (*predictor).clear_buffer(); - (*codec).clear_buffer(); - (*spcodec).clear_buffer(); -} - -TEMPLATE_TYPE -void IMPL::decompress(Header* header, BYTE* in_compressed, T* out_decompressed, cudaStream_t stream, bool dbg_print) -{ - // TODO host having copy of header when compressing - if (not header) { - header = new Header; - CHECK_CUDA(cudaMemcpyAsync(header, in_compressed, sizeof(Header), cudaMemcpyDeviceToHost, stream)); - CHECK_CUDA(cudaStreamSynchronize(stream)); - } - - data_len3 = dim3(header->x, header->y, header->z); - - use_fallback_codec = header->byte_vle == 8; - double const eb = header->eb; - int const radius = header->radius; - auto const vle_pardeg = header->vle_pardeg; - - // The inputs of components are from `compressed`. 
- auto d_anchor = ACCESSOR(ANCHOR, T); - auto d_vle = ACCESSOR(VLE, BYTE); - auto d_sp = ACCESSOR(SPFMT, BYTE); - - // wire the workspace - auto d_errctrl = (*predictor).expose_quant(); // reuse space - - // wire and aliasing - auto d_outlier = out_decompressed; - auto d_outlier_xdata = out_decompressed; - - auto spcodec_do = [&]() { (*spcodec).decode(d_sp, d_outlier, stream); }; - auto decode_with_exception = [&]() { - if (not use_fallback_codec) { // - (*codec).decode(d_vle, d_errctrl); - } - else { - if (not fallback_codec_allocated) { - (*fb_codec).init((*predictor).get_len_quant(), radius * 2, vle_pardeg, /*dbg print*/ false); - fallback_codec_allocated = true; - } - (*fb_codec).decode(d_vle, d_errctrl); - } - }; - auto predictor_do = [&]() { - (*predictor).reconstruct(LorenzoI, data_len3, d_outlier_xdata, d_anchor, d_errctrl, eb, radius, stream); - }; - - // process - spcodec_do(), decode_with_exception(), predictor_do(); - - collect_decompress_timerecord(); - - // clear state for the next decompression after reporting - use_fallback_codec = false; -} - -// public getter -TEMPLATE_TYPE -void IMPL::export_header(Header& ext_header) { ext_header = header; } - -TEMPLATE_TYPE -void IMPL::export_header(Header* ext_header) { *ext_header = header; } - -TEMPLATE_TYPE -void IMPL::export_timerecord(TimeRecord* ext_timerecord) -{ - if (ext_timerecord) *ext_timerecord = timerecord; -} - -// helper -TEMPLATE_TYPE -void IMPL::init_codec(size_t codec_in_len, unsigned int codec_config, int max_booklen, int pardeg, bool dbg_print) -{ - if (codec_config == 0b00) throw std::runtime_error("Argument codec_config must have set bit(s)."); - if (codec_config bitand 0b01) { - if (dbg_print) LOGGING(LOG_INFO, "allocated 4-byte codec"); - (*codec).init(codec_in_len, max_booklen, pardeg, dbg_print); - } - if (codec_config bitand 0b10) { - if (dbg_print) LOGGING(LOG_INFO, "allocated 8-byte (fallback) codec"); - (*fb_codec).init(codec_in_len, max_booklen, pardeg, dbg_print); - fallback_codec_allocated = true; - } -}; - -TEMPLATE_TYPE -template -void IMPL::init_detail(CONFIG* config, bool dbg_print) -{ - const auto cfg_radius = config->radius; - const auto cfg_pardeg = config->vle_pardeg; - const auto density_factor = config->nz_density_factor; - const auto codec_config = config->codecs_in_use; - const auto cfg_max_booklen = cfg_radius * 2; - const auto x = config->x; - const auto y = config->y; - const auto z = config->z; - - size_t spcodec_in_len, codec_in_len; - - (*predictor).init(LorenzoI, x, y, z, dbg_print); - - spcodec_in_len = (*predictor).get_alloclen_data(); - codec_in_len = (*predictor).get_alloclen_quant(); - - (*spcodec).init(spcodec_in_len, density_factor, dbg_print); - - { - auto bytes = sizeof(cusz::FREQ) * cfg_max_booklen; - cudaMalloc(&d_freq, bytes); - cudaMemset(d_freq, 0x0, bytes); - - // cudaMalloc(&d_freq_another, bytes); - // cudaMemset(d_freq_another, 0x0, bytes); - } - - init_codec(codec_in_len, codec_config, cfg_max_booklen, cfg_pardeg, dbg_print); - - CHECK_CUDA(cudaMalloc(&d_reserved_compressed, (*predictor).get_alloclen_data() * sizeof(T) / 2)); -} - -TEMPLATE_TYPE -void IMPL::collect_compress_timerecord() -{ -#define COLLECT_TIME(NAME, TIME) timerecord.push_back({const_cast(NAME), TIME}); - - if (not timerecord.empty()) timerecord.clear(); - - COLLECT_TIME("predict", (*predictor).get_time_elapsed()); - COLLECT_TIME("histogram", time_hist); - - if (not use_fallback_codec) { - COLLECT_TIME("book", (*codec).get_time_book()); - COLLECT_TIME("huff-enc", (*codec).get_time_lossless()); - } 
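As the code above shows, compress() runs Lorenzo prediction, error-code histogramming, Huffman (VLE) encoding with an 8-byte fallback, and sparse encoding of outliers in that order, and decompress() inverts it (sparse decode, Huffman decode, Lorenzo reconstruction). One numeric detail from compress(): the Huffman input is split into pardeg chunks of sublen elements each, a plain ceiling division. A sketch with a hypothetical helper name, mirroring the get_npart convention used elsewhere in this codebase:

#include <cstddef>

// Sketch: number of elements each of `pardeg` Huffman chunks covers,
// i.e. sublen = ceil(data_len / pardeg) = (data_len + pardeg - 1) / pardeg.
static inline std::size_t chunk_sublen(std::size_t data_len, std::size_t pardeg)
{
    return (data_len + pardeg - 1) / pardeg;
}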
- else { - COLLECT_TIME("book", (*fb_codec).get_time_book()); - COLLECT_TIME("huff-enc", (*fb_codec).get_time_lossless()); - } - - COLLECT_TIME("outlier", (*spcodec).get_time_elapsed()); -} - -TEMPLATE_TYPE -void IMPL::collect_decompress_timerecord() -{ - if (not timerecord.empty()) timerecord.clear(); - - COLLECT_TIME("outlier", (*spcodec).get_time_elapsed()); - - if (not use_fallback_codec) { // - COLLECT_TIME("huff-dec", (*codec).get_time_lossless()); - } - else { // - COLLECT_TIME("huff-dec", (*fb_codec).get_time_lossless()); - } - - COLLECT_TIME("predict", (*predictor).get_time_elapsed()); -} - -TEMPLATE_TYPE -void IMPL::encode_with_exception( - E* d_in, - size_t inlen, - cusz::FREQ* d_freq, - int booklen, - int sublen, - int pardeg, - bool codec_force_fallback, - BYTE*& d_out, - size_t& outlen, - cudaStream_t stream, - bool dbg_print) -{ - auto build_codebook_using = [&](auto encoder) { encoder->build_codebook(d_freq, booklen, stream); }; - auto encode_with = [&](auto encoder) { encoder->encode(d_in, inlen, d_out, outlen, stream); }; - - auto try_fallback_alloc = [&]() { - use_fallback_codec = true; - if (not fallback_codec_allocated) { - LOGGING(LOG_EXCEPTION, "online allocate fallback (8-byte) codec"); - fb_codec->init(inlen, booklen, pardeg, dbg_print); - fallback_codec_allocated = true; - } - }; - - /******************************************************************************/ - if (not codec_force_fallback) { - try { - build_codebook_using(codec); - encode_with(codec); - } - catch (const std::runtime_error& e) { - LOGGING(LOG_EXCEPTION, "switch to fallback codec"); - try_fallback_alloc(); - - build_codebook_using(fb_codec); - encode_with(fb_codec); - } - } - else { - LOGGING(LOG_INFO, "force switch to fallback codec"); - try_fallback_alloc(); - - build_codebook_using(fb_codec); - encode_with(fb_codec); - } -} - -TEMPLATE_TYPE -void IMPL::subfile_collect( - T* d_anchor, - size_t anchor_len, - BYTE* d_codec_out, - size_t codec_outlen, - BYTE* d_spfmt_out, - size_t spfmt_outlen, - cudaStream_t stream, - bool dbg_print) -{ - header.self_bytes = sizeof(Header); - uint32_t nbyte[Header::END]; - nbyte[Header::HEADER] = sizeof(Header); - nbyte[Header::ANCHOR] = sizeof(T) * anchor_len; - nbyte[Header::VLE] = sizeof(BYTE) * codec_outlen; - nbyte[Header::SPFMT] = sizeof(BYTE) * spfmt_outlen; - - header.entry[0] = 0; - // *.END + 1; need to know the ending position - for (auto i = 1; i < Header::END + 1; i++) { header.entry[i] = nbyte[i - 1]; } - for (auto i = 1; i < Header::END + 1; i++) { header.entry[i] += header.entry[i - 1]; } - - auto debug_header_entry = [&]() { - printf("\nsubfile collect in compressor:\n"); - printf(" ENTRIES\n"); - - PRINT_ENTRY(HEADER); - PRINT_ENTRY(ANCHOR); - PRINT_ENTRY(VLE); - PRINT_ENTRY(SPFMT); - PRINT_ENTRY(END); - printf("\n"); - }; - - if (dbg_print) debug_header_entry(); - - CHECK_CUDA(cudaMemcpyAsync(d_reserved_compressed, &header, sizeof(header), cudaMemcpyHostToDevice, stream)); - - DEVICE2DEVICE_COPY(d_anchor, ANCHOR) - DEVICE2DEVICE_COPY(d_codec_out, VLE) - DEVICE2DEVICE_COPY(d_spfmt_out, SPFMT) - - /* debug */ CHECK_CUDA(cudaStreamSynchronize(stream)); -} - -} // namespace cusz - -#undef FREEDEV -#undef FREEHOST -#undef DEFINE_DEV -#undef DEFINE_HOST -#undef DEVICE2DEVICE_COPY -#undef PRINT_ENTRY -#undef ACCESSOR -#undef COLLECT_TIME - -#undef TEMPLATE_TYPE -#undef IMPL - -#endif +/** + * @file compressor_impl.cuh + * @author Jiannan Tian + * @brief cuSZ compressor of the default path + * @version 0.3 + * @date 2021-10-05 + * (create) 
2020-02-12; (release) 2020-09-20; + * (rev.1) 2021-01-16; (rev.2) 2021-07-12; (rev.3) 2021-09-06; (rev.4) 2021-10-05 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef CUSZ_DEFAULT_PATH_CUH +#define CUSZ_DEFAULT_PATH_CUH + +#include +#include +#include +#include + +#include "component.hh" +#include "compressor.hh" +#include "header.h" +#include "kernel/cpplaunch_cuda.hh" +#include "stat/stat_g.hh" +#include "utils/cuda_err.cuh" + +#define DEFINE_DEV(VAR, TYPE) TYPE* d_##VAR{nullptr}; +#define DEFINE_HOST(VAR, TYPE) TYPE* h_##VAR{nullptr}; +#define FREEDEV(VAR) CHECK_CUDA(cudaFree(d_##VAR)); +#define FREEHOST(VAR) CHECK_CUDA(cudaFreeHost(h_##VAR)); + +#define PRINT_ENTRY(VAR) printf("%d %-*s: %'10u\n", (int)Header::VAR, 14, #VAR, header.entry[Header::VAR]); + +#define DEVICE2DEVICE_COPY(VAR, FIELD) \ + if (nbyte[Header::FIELD] != 0 and VAR != nullptr) { \ + auto dst = d_reserved_compressed + header.entry[Header::FIELD]; \ + auto src = reinterpret_cast(VAR); \ + CHECK_CUDA(cudaMemcpyAsync(dst, src, nbyte[Header::FIELD], cudaMemcpyDeviceToDevice, stream)); \ + } + +#define ACCESSOR(SYM, TYPE) reinterpret_cast(in_compressed + header->entry[Header::SYM]) + +namespace cusz { + +constexpr auto kHOST = cusz::LOC::HOST; +constexpr auto kDEVICE = cusz::LOC::DEVICE; +constexpr auto kHOST_DEVICE = cusz::LOC::HOST_DEVICE; + +#define TEMPLATE_TYPE template +#define IMPL Compressor::impl + +TEMPLATE_TYPE +uint32_t IMPL::get_len_data() { return data_len3.x * data_len3.y * data_len3.z; } + +TEMPLATE_TYPE +IMPL::impl() +{ + predictor = new Predictor; + + spcodec = new Spcodec; + codec = new Codec; + fb_codec = new FallbackCodec; +} + +TEMPLATE_TYPE +void IMPL::destroy() +{ + if (spcodec) delete spcodec; + if (codec) delete codec; + if (fb_codec) delete codec; + if (predictor) delete predictor; +} + +TEMPLATE_TYPE +IMPL::~impl() { destroy(); } + +//------------------------------------------------------------------------------ + +// TODO +TEMPLATE_TYPE +void IMPL::init(Context* config, bool dbg_print) { init_detail(config, dbg_print); } + +TEMPLATE_TYPE +void IMPL::init(Header* config, bool dbg_print) { init_detail(config, dbg_print); } + +template +void peek_devdata(T* d_arr, size_t num = 20) +{ + thrust::for_each(thrust::device, d_arr, d_arr + num, [=] __device__ __host__(const T i) { printf("%u\t", i); }); + printf("\n"); +} + +TEMPLATE_TYPE +void IMPL::compress( + Context* config, + T* uncompressed, + BYTE*& compressed, + size_t& compressed_len, + cudaStream_t stream, + bool dbg_print) +{ + auto const eb = config->eb; + auto const radius = config->radius; + auto const pardeg = config->vle_pardeg; + auto const codecs_in_use = config->codecs_in_use; + auto const nz_density_factor = config->nz_density_factor; + + if (dbg_print) { + std::cout << "eb\t" << eb << endl; + std::cout << "radius\t" << radius << endl; + std::cout << "pardeg\t" << pardeg << endl; + std::cout << "codecs_in_use\t" << codecs_in_use << endl; + std::cout << "nz_density_factor\t" << nz_density_factor << endl; + } + + data_len3 = dim3(config->x, config->y, config->z); + auto codec_force_fallback = config->codec_force_fallback(); + + header.codecs_in_use = codecs_in_use; + header.nz_density_factor = nz_density_factor; + + T* d_anchor{nullptr}; // predictor out1 + E* d_errctrl{nullptr}; // predictor out2 + T* d_outlier{nullptr}; // predictor out3 + BYTE* d_spfmt{nullptr}; + size_t spfmt_outlen{0}; + + BYTE* 
d_codec_out{nullptr}; + size_t codec_outlen{0}; + + size_t data_len, errctrl_len, sublen, spcodec_inlen; + auto booklen = radius * 2; + + auto derive_lengths_after_prediction = [&]() { + data_len = predictor->get_len_data(); + errctrl_len = data_len; + spcodec_inlen = data_len; + sublen = ConfigHelper::get_npart(data_len, pardeg); + + // std::cout << "datalen\t" << data_len << '\n'; + // std::cout << "errctrl_len\t" << errctrl_len << '\n'; + // std::cout << "spcodec_inlen\t" << spcodec_inlen << '\n'; + // std::cout << "sublen\t" << sublen << '\n'; + }; + + auto update_header = [&]() { + header.x = data_len3.x; + header.y = data_len3.y; + header.z = data_len3.z; + header.w = 1; // placeholder + header.radius = radius; + header.vle_pardeg = pardeg; + header.eb = eb; + header.byte_vle = use_fallback_codec ? 8 : 4; + }; + + /******************************************************************************/ + + // Prediction is the dependency of the rest procedures. + predictor->construct(LorenzoI, data_len3, uncompressed, &d_anchor, &d_errctrl, &d_outlier, eb, radius, stream); + // peek_devdata(d_errctrl); + + derive_lengths_after_prediction(); + /******************************************************************************/ + + asz::stat::histogram(d_errctrl, errctrl_len, d_freq, booklen, &time_hist, stream); + + /* debug */ CHECK_CUDA(cudaStreamSynchronize(stream)); + + // TODO remove duplicate get_frequency inside encode_with_exception() + encode_with_exception( + d_errctrl, errctrl_len, // input + d_freq, booklen, sublen, pardeg, codec_force_fallback, // config + d_codec_out, codec_outlen, // output + stream, dbg_print); + + (*spcodec).encode(d_outlier, spcodec_inlen, d_spfmt, spfmt_outlen, stream, dbg_print); + + /* debug */ CHECK_CUDA(cudaStreamSynchronize(stream)); + + /******************************************************************************/ + + update_header(); + subfile_collect( + d_anchor, (*predictor).get_len_anchor(), // + d_codec_out, codec_outlen, // + d_spfmt, spfmt_outlen, // + stream, dbg_print); + + // output + compressed_len = ConfigHelper::get_filesize(&header); + compressed = d_reserved_compressed; + + collect_compress_timerecord(); + + // considering that codec can be consecutively in use, and can compress data of different huff-byte + use_fallback_codec = false; +} + +TEMPLATE_TYPE +void IMPL::clear_buffer() +{ // + (*predictor).clear_buffer(); + (*codec).clear_buffer(); + (*spcodec).clear_buffer(); +} + +TEMPLATE_TYPE +void IMPL::decompress(Header* header, BYTE* in_compressed, T* out_decompressed, cudaStream_t stream, bool dbg_print) +{ + // TODO host having copy of header when compressing + if (not header) { + header = new Header; + CHECK_CUDA(cudaMemcpyAsync(header, in_compressed, sizeof(Header), cudaMemcpyDeviceToHost, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + } + + data_len3 = dim3(header->x, header->y, header->z); + + use_fallback_codec = header->byte_vle == 8; + double const eb = header->eb; + int const radius = header->radius; + auto const vle_pardeg = header->vle_pardeg; + + // The inputs of components are from `compressed`. 
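The ACCESSOR lookups used by decompress() rely on the layout built by subfile_collect: header.entry[] holds exclusive prefix sums of the per-segment byte counts, so each segment pointer is simply base + entry[SYM], and the last entry gives the total packed size. A host-side sketch of that bookkeeping with a hypothetical enum mirroring the header fields:

#include <cstdint>

// Sketch: segments are packed as [HEADER | ANCHOR | VLE | SPFMT];
// entry[i] is the byte offset of segment i, entry[END] the end position.
enum Seg { HEADER = 0, ANCHOR, VLE, SPFMT, END };

static void fill_entries(const uint32_t nbyte[END], uint32_t entry[END + 1])
{
    entry[0] = 0;
    for (int i = 1; i < END + 1; ++i) entry[i] = entry[i - 1] + nbyte[i - 1];
}

// A segment pointer is then reinterpret_cast<T*>(compressed_base + entry[SEG]).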
+ auto d_anchor = ACCESSOR(ANCHOR, T); + auto d_vle = ACCESSOR(VLE, BYTE); + auto d_sp = ACCESSOR(SPFMT, BYTE); + + // wire the workspace + auto d_errctrl = (*predictor).expose_quant(); // reuse space + + // wire and aliasing + auto d_outlier = out_decompressed; + auto d_outlier_xdata = out_decompressed; + + auto spcodec_do = [&]() { (*spcodec).decode(d_sp, d_outlier, stream); }; + auto decode_with_exception = [&]() { + if (not use_fallback_codec) { // + (*codec).decode(d_vle, d_errctrl); + } + else { + if (not fallback_codec_allocated) { + (*fb_codec).init((*predictor).get_len_quant(), radius * 2, vle_pardeg, /*dbg print*/ false); + fallback_codec_allocated = true; + } + (*fb_codec).decode(d_vle, d_errctrl); + } + }; + auto predictor_do = [&]() { + (*predictor).reconstruct(LorenzoI, data_len3, d_outlier_xdata, d_anchor, d_errctrl, eb, radius, stream); + }; + + // process + spcodec_do(), decode_with_exception(), predictor_do(); + + collect_decompress_timerecord(); + + // clear state for the next decompression after reporting + use_fallback_codec = false; +} + +// public getter +TEMPLATE_TYPE +void IMPL::export_header(Header& ext_header) { ext_header = header; } + +TEMPLATE_TYPE +void IMPL::export_header(Header* ext_header) { *ext_header = header; } + +TEMPLATE_TYPE +void IMPL::export_timerecord(TimeRecord* ext_timerecord) +{ + if (ext_timerecord) *ext_timerecord = timerecord; +} + +// helper +TEMPLATE_TYPE +void IMPL::init_codec(size_t codec_in_len, unsigned int codec_config, int max_booklen, int pardeg, bool dbg_print) +{ + if (codec_config == 0b00) throw std::runtime_error("Argument codec_config must have set bit(s)."); + if (codec_config bitand 0b01) { + if (dbg_print) LOGGING(LOG_INFO, "allocated 4-byte codec"); + (*codec).init(codec_in_len, max_booklen, pardeg, dbg_print); + } + if (codec_config bitand 0b10) { + if (dbg_print) LOGGING(LOG_INFO, "allocated 8-byte (fallback) codec"); + (*fb_codec).init(codec_in_len, max_booklen, pardeg, dbg_print); + fallback_codec_allocated = true; + } +}; + +TEMPLATE_TYPE +template +void IMPL::init_detail(CONFIG* config, bool dbg_print) +{ + const auto cfg_radius = config->radius; + const auto cfg_pardeg = config->vle_pardeg; + const auto density_factor = config->nz_density_factor; + const auto codec_config = config->codecs_in_use; + const auto cfg_max_booklen = cfg_radius * 2; + const auto x = config->x; + const auto y = config->y; + const auto z = config->z; + + size_t spcodec_in_len, codec_in_len; + + (*predictor).init(LorenzoI, x, y, z, dbg_print); + + spcodec_in_len = (*predictor).get_alloclen_data(); + codec_in_len = (*predictor).get_alloclen_quant(); + + (*spcodec).init(spcodec_in_len, density_factor, dbg_print); + + { + auto bytes = sizeof(cusz::FREQ) * cfg_max_booklen; + cudaMalloc(&d_freq, bytes); + cudaMemset(d_freq, 0x0, bytes); + + // cudaMalloc(&d_freq_another, bytes); + // cudaMemset(d_freq_another, 0x0, bytes); + } + + init_codec(codec_in_len, codec_config, cfg_max_booklen, cfg_pardeg, dbg_print); + + CHECK_CUDA(cudaMalloc(&d_reserved_compressed, (*predictor).get_alloclen_data() * sizeof(T) / 2)); +} + +TEMPLATE_TYPE +void IMPL::collect_compress_timerecord() +{ +#define COLLECT_TIME(NAME, TIME) timerecord.push_back({const_cast(NAME), TIME}); + + if (not timerecord.empty()) timerecord.clear(); + + COLLECT_TIME("predict", (*predictor).get_time_elapsed()); + COLLECT_TIME("histogram", time_hist); + + if (not use_fallback_codec) { + COLLECT_TIME("book", (*codec).get_time_book()); + COLLECT_TIME("huff-enc", (*codec).get_time_lossless()); + } 
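init_codec above treats codecs_in_use as a two-bit mask: the low bit allocates the 4-byte Huffman codec, the high bit the 8-byte fallback, and 0b00 is rejected. A small reference sketch of the mask values (hypothetical constant names, not part of the patch):

// Sketch: interpreting the codec_config bitmask consumed by init_codec.
constexpr unsigned CODEC_4BYTE    = 0b01;  // default Huffman codec
constexpr unsigned CODEC_8BYTE_FB = 0b10;  // fallback codec for long codewords
constexpr unsigned CODEC_BOTH     = CODEC_4BYTE | CODEC_8BYTE_FB;  // 0b11

// Passing CODEC_BOTH preallocates both codecs, so a runtime switch to the
// fallback path needs no additional allocation; 0b00 throws.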
+ else { + COLLECT_TIME("book", (*fb_codec).get_time_book()); + COLLECT_TIME("huff-enc", (*fb_codec).get_time_lossless()); + } + + COLLECT_TIME("outlier", (*spcodec).get_time_elapsed()); +} + +TEMPLATE_TYPE +void IMPL::collect_decompress_timerecord() +{ + if (not timerecord.empty()) timerecord.clear(); + + COLLECT_TIME("outlier", (*spcodec).get_time_elapsed()); + + if (not use_fallback_codec) { // + COLLECT_TIME("huff-dec", (*codec).get_time_lossless()); + } + else { // + COLLECT_TIME("huff-dec", (*fb_codec).get_time_lossless()); + } + + COLLECT_TIME("predict", (*predictor).get_time_elapsed()); +} + +TEMPLATE_TYPE +void IMPL::encode_with_exception( + E* d_in, + size_t inlen, + cusz::FREQ* d_freq, + int booklen, + int sublen, + int pardeg, + bool codec_force_fallback, + BYTE*& d_out, + size_t& outlen, + cudaStream_t stream, + bool dbg_print) +{ + auto build_codebook_using = [&](auto encoder) { encoder->build_codebook(d_freq, booklen, stream); }; + auto encode_with = [&](auto encoder) { encoder->encode(d_in, inlen, d_out, outlen, stream); }; + + auto try_fallback_alloc = [&]() { + use_fallback_codec = true; + if (not fallback_codec_allocated) { + LOGGING(LOG_EXCEPTION, "online allocate fallback (8-byte) codec"); + fb_codec->init(inlen, booklen, pardeg, dbg_print); + fallback_codec_allocated = true; + } + }; + + /******************************************************************************/ + if (not codec_force_fallback) { + try { + build_codebook_using(codec); + encode_with(codec); + } + catch (const std::runtime_error& e) { + LOGGING(LOG_EXCEPTION, "switch to fallback codec"); + try_fallback_alloc(); + + build_codebook_using(fb_codec); + encode_with(fb_codec); + } + } + else { + LOGGING(LOG_INFO, "force switch to fallback codec"); + try_fallback_alloc(); + + build_codebook_using(fb_codec); + encode_with(fb_codec); + } +} + +TEMPLATE_TYPE +void IMPL::subfile_collect( + T* d_anchor, + size_t anchor_len, + BYTE* d_codec_out, + size_t codec_outlen, + BYTE* d_spfmt_out, + size_t spfmt_outlen, + cudaStream_t stream, + bool dbg_print) +{ + header.self_bytes = sizeof(Header); + uint32_t nbyte[Header::END]; + nbyte[Header::HEADER] = sizeof(Header); + nbyte[Header::ANCHOR] = sizeof(T) * anchor_len; + nbyte[Header::VLE] = sizeof(BYTE) * codec_outlen; + nbyte[Header::SPFMT] = sizeof(BYTE) * spfmt_outlen; + + header.entry[0] = 0; + // *.END + 1; need to know the ending position + for (auto i = 1; i < Header::END + 1; i++) { header.entry[i] = nbyte[i - 1]; } + for (auto i = 1; i < Header::END + 1; i++) { header.entry[i] += header.entry[i - 1]; } + + auto debug_header_entry = [&]() { + printf("\nsubfile collect in compressor:\n"); + printf(" ENTRIES\n"); + + PRINT_ENTRY(HEADER); + PRINT_ENTRY(ANCHOR); + PRINT_ENTRY(VLE); + PRINT_ENTRY(SPFMT); + PRINT_ENTRY(END); + printf("\n"); + }; + + if (dbg_print) debug_header_entry(); + + CHECK_CUDA(cudaMemcpyAsync(d_reserved_compressed, &header, sizeof(header), cudaMemcpyHostToDevice, stream)); + + DEVICE2DEVICE_COPY(d_anchor, ANCHOR) + DEVICE2DEVICE_COPY(d_codec_out, VLE) + DEVICE2DEVICE_COPY(d_spfmt_out, SPFMT) + + /* debug */ CHECK_CUDA(cudaStreamSynchronize(stream)); +} + +} // namespace cusz + +#undef FREEDEV +#undef FREEHOST +#undef DEFINE_DEV +#undef DEFINE_HOST +#undef DEVICE2DEVICE_COPY +#undef PRINT_ENTRY +#undef ACCESSOR +#undef COLLECT_TIME + +#undef TEMPLATE_TYPE +#undef IMPL + +#endif diff --git a/qtensor/compression/cusz/src/detail/spmat.cu b/qtensor/compression/cusz/src/detail/spmat.cu index 141d2acb..b6a95bb2 100644 --- 
a/qtensor/compression/cusz/src/detail/spmat.cu +++ b/qtensor/compression/cusz/src/detail/spmat.cu @@ -1,14 +1,14 @@ -/** - * @file spmat.cu - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2021-09-28 - * - * (C) 2021 by Washington State University, Argonne National Laboratory - * - */ - -#include "detail/spmat.cuh" - -template struct cusz::SpcodecCSR::impl; +/** + * @file spmat.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2021-09-28 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#include "detail/spmat.cuh" + +template struct cusz::SpcodecCSR::impl; diff --git a/qtensor/compression/cusz/src/detail/spv_gpu.inl b/qtensor/compression/cusz/src/detail/spv_gpu.inl index 4775926e..4c724bd5 100644 --- a/qtensor/compression/cusz/src/detail/spv_gpu.inl +++ b/qtensor/compression/cusz/src/detail/spv_gpu.inl @@ -1,77 +1,77 @@ -/** - * @file spv_gpu.inl - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-08-22 - * (update) 2022-10-29 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef F88E11A6_6B61_4C6F_8B2E_30EEAAB4D204 -#define F88E11A6_6B61_4C6F_8B2E_30EEAAB4D204 - -#include -#include -#include -#include -#include -#include - -#include "utils/timer.h" - -namespace psz { -namespace detail { - -template -void spv_gather( - T* in, - size_t const in_len, - T* d_val, - uint32_t* d_idx, - int* nnz, - float* milliseconds, - cudaStream_t stream) -{ - using thrust::placeholders::_1; - - thrust::cuda::par.on(stream); - thrust::counting_iterator zero(0); - - CREATE_CUDAEVENT_PAIR; - START_CUDAEVENT_RECORDING(stream); - - // find out the indices - *nnz = thrust::copy_if(thrust::device, zero, zero + in_len, in, d_idx, _1 != 0) - d_idx; - - // fetch corresponding values - thrust::copy( - thrust::device, thrust::make_permutation_iterator(in, d_idx), - thrust::make_permutation_iterator(in + *nnz, d_idx + *nnz), d_val); - - STOP_CUDAEVENT_RECORDING(stream); - TIME_ELAPSED_CUDAEVENT(milliseconds); - DESTROY_CUDAEVENT_PAIR; -} - -template -void spv_scatter(T* d_val, uint32_t* d_idx, int const nnz, T* decoded, float* milliseconds, cudaStream_t stream) -{ - thrust::cuda::par.on(stream); - - CREATE_CUDAEVENT_PAIR; - START_CUDAEVENT_RECORDING(stream); - - thrust::scatter(thrust::device, d_val, d_val + nnz, d_idx, decoded); - - STOP_CUDAEVENT_RECORDING(stream); - TIME_ELAPSED_CUDAEVENT(milliseconds); - DESTROY_CUDAEVENT_PAIR; -} - -} // namespace detail -} // namespace psz - -#endif /* F88E11A6_6B61_4C6F_8B2E_30EEAAB4D204 */ +/** + * @file spv_gpu.inl + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-08-22 + * (update) 2022-10-29 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef F88E11A6_6B61_4C6F_8B2E_30EEAAB4D204 +#define F88E11A6_6B61_4C6F_8B2E_30EEAAB4D204 + +#include +#include +#include +#include +#include +#include + +#include "utils/timer.h" + +namespace psz { +namespace detail { + +template +void spv_gather( + T* in, + size_t const in_len, + T* d_val, + uint32_t* d_idx, + int* nnz, + float* milliseconds, + cudaStream_t stream) +{ + using thrust::placeholders::_1; + + thrust::cuda::par.on(stream); + thrust::counting_iterator zero(0); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + // find out the indices + *nnz = thrust::copy_if(thrust::device, zero, zero + in_len, in, d_idx, _1 != 0) - d_idx; + + // fetch corresponding values + thrust::copy( + thrust::device, thrust::make_permutation_iterator(in, d_idx), + 
thrust::make_permutation_iterator(in + *nnz, d_idx + *nnz), d_val); + + STOP_CUDAEVENT_RECORDING(stream); + TIME_ELAPSED_CUDAEVENT(milliseconds); + DESTROY_CUDAEVENT_PAIR; +} + +template +void spv_scatter(T* d_val, uint32_t* d_idx, int const nnz, T* decoded, float* milliseconds, cudaStream_t stream) +{ + thrust::cuda::par.on(stream); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + thrust::scatter(thrust::device, d_val, d_val + nnz, d_idx, decoded); + + STOP_CUDAEVENT_RECORDING(stream); + TIME_ELAPSED_CUDAEVENT(milliseconds); + DESTROY_CUDAEVENT_PAIR; +} + +} // namespace detail +} // namespace psz + +#endif /* F88E11A6_6B61_4C6F_8B2E_30EEAAB4D204 */ diff --git a/qtensor/compression/cusz/src/detail/spvec.cu b/qtensor/compression/cusz/src/detail/spvec.cu index e9b9ab6f..7ed562db 100644 --- a/qtensor/compression/cusz/src/detail/spvec.cu +++ b/qtensor/compression/cusz/src/detail/spvec.cu @@ -1,18 +1,18 @@ -/** - * @file spvec.cu - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-03-01 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#include "detail/spvec.cuh" - -template struct cusz::SpcodecVec::impl; -template struct cusz::SpcodecVec::impl; -template struct cusz::SpcodecVec::impl; -template struct cusz::SpcodecVec::impl; -// template struct cusz::SpcodecVec::impl; +/** + * @file spvec.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-03-01 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#include "detail/spvec.cuh" + +template struct cusz::SpcodecVec::impl; +template struct cusz::SpcodecVec::impl; +template struct cusz::SpcodecVec::impl; +template struct cusz::SpcodecVec::impl; +// template struct cusz::SpcodecVec::impl; diff --git a/qtensor/compression/cusz/src/experimental/Makefile b/qtensor/compression/cusz/src/experimental/Makefile index cecce6f5..22807665 100644 --- a/qtensor/compression/cusz/src/experimental/Makefile +++ b/qtensor/compression/cusz/src/experimental/Makefile @@ -1,7 +1,7 @@ -altlorenzo: - nvcc -lineinfo -std=c++17 \ - --extended-lambda \ - -DDPCPP_SHOWCASE \ - ../wrapper/extrap_lorenzo.cu \ - dpcpp_demo_lorenzo.cu \ - -o dpcpp_demo_lorenzo +altlorenzo: + nvcc -lineinfo -std=c++17 \ + --extended-lambda \ + -DDPCPP_SHOWCASE \ + ../wrapper/extrap_lorenzo.cu \ + dpcpp_demo_lorenzo.cu \ + -o dpcpp_demo_lorenzo diff --git a/qtensor/compression/cusz/src/experimental/dpcpp_demo_lorenzo.cu b/qtensor/compression/cusz/src/experimental/dpcpp_demo_lorenzo.cu index 375d648d..6d5123a0 100644 --- a/qtensor/compression/cusz/src/experimental/dpcpp_demo_lorenzo.cu +++ b/qtensor/compression/cusz/src/experimental/dpcpp_demo_lorenzo.cu @@ -1,120 +1,120 @@ -/** - * @file withwrapper_lorenzo.cu - * @author Jiannan Tian - * @brief A temporary test case using high-level component/API. - * @version 0.3 - * @date 2021-06-21 - * - * (C) 2021 by Washington State University, Argonne National Laboratory - * - */ - -#include -#include -#include -#include -#include -#include -#include "../utils/io.hh" -#include "../utils/verify.hh" - -#pragma message "--extended-lambda causes migration error (nvcc is incapable to be a wellrounded compiler)." 
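spv_gather/spv_scatter above implement a simple COO-style sparse codec for the outliers: gather records the index and value of every nonzero element, scatter writes them back into a dense (zero-initialized) buffer. A host-side sketch of the same round trip, for reference only:

#include <cstddef>
#include <cstdint>
#include <vector>

// Sketch: CPU equivalent of spv_gather / spv_scatter.
template <typename T>
void gather_host(const T* in, std::size_t len,
                 std::vector<T>& val, std::vector<uint32_t>& idx)
{
    for (std::size_t i = 0; i < len; ++i)
        if (in[i] != 0) { idx.push_back(static_cast<uint32_t>(i)); val.push_back(in[i]); }
}

template <typename T>
void scatter_host(const std::vector<T>& val, const std::vector<uint32_t>& idx, T* decoded)
{
    // `decoded` is assumed to be zero-initialized (the dense reconstruction).
    for (std::size_t k = 0; k < idx.size(); ++k) decoded[idx[k]] = val[k];
}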
-// #include "../utils/verify_gpu.cuh" -#include "../component/extrap_lorenzo.h" - -using std::cout; -using std::endl; - -using Data = float; -using Quant = uint16_t; -using FP = float; - -Data eb; -Data maxval, minval; - -// dim3 stride3; -size_t len1; -int radius = 512; - -namespace { - -#ifndef __CUDACC__ -struct __dim3_compat { - unsigned int x, y, z; - __dim3_compat(unsigned int _x, unsigned int _y, unsigned int _z){}; -}; - -using dim3 = __dim3_compat; -#endif - -auto get_npart = [](auto size, auto subsize) { - static_assert( - std::numeric_limits::is_integer and std::numeric_limits::is_integer, - "[get_npart] must be plain interger types."); - return (size + subsize - 1) / subsize; -}; -auto get_len_from_dim3 = [](dim3 size) { return size.x * size.y * size.z; }; -auto get_stride3 = [](dim3 size) -> dim3 { return dim3(1, size.x, size.x * size.y); }; - -} // namespace - -void test_lorenzo(std::string fname, int ndim, dim3 size3) -{ - cout << "filename: " << fname << '\n'; - - Data* h_data{nullptr}; - Data* d_data{nullptr}; - Data* h2_data{nullptr}; - Quant* d_quant{nullptr}; - - auto len1 = get_len_from_dim3(size3); - cout << "len1 from dim3: " << len1 << endl; - - cudaMallocHost(&h_data, len1 * sizeof(Data)); - io::read_binary_to_array(fname, h_data, len1); - cudaMallocHost(&h2_data, len1 * sizeof(Data)); - memcpy(h2_data, h_data, len1 * sizeof(Data)); - - cudaMalloc(&d_data, len1 * sizeof(Data)); - cudaMemcpy(d_data, h_data, len1 * sizeof(Data), cudaMemcpyHostToDevice); - cudaMalloc(&d_quant, len1 * sizeof(Quant)); - - auto maxval = *std::max_element(h_data, h_data + len1); - auto minval = *std::min_element(h_data, h_data + len1); - eb = 1e-3 * (maxval - minval); - - compress_lorenzo_construct(d_data, d_quant, size3, ndim, eb, radius); - decompress_lorenzo_reconstruct(d_data, d_quant, size3, ndim, eb, radius); - - cudaMemcpy(h_data, d_data, len1 * sizeof(Data), cudaMemcpyDeviceToHost); - - // TODO GPU verification does not print - // { - // Stat stat_gpu; - // verify_data_GPU(&stat_gpu, h_data, h2_data, len1); - // cusz::QualityViewer::print_metrics_cross(&stat_gpu, false, eb, 0, 1, false, true); - // } - { - Stat stat; - cusz::verify_data(&stat, h_data, h2_data, len1); - cusz::QualityViewer::print_metrics_cross(&stat, false, eb, 0, 1, false, false); - } - - // clear up - cudaFree(d_data); - cudaFree(d_quant); - cudaFreeHost(h_data); - cudaFreeHost(h2_data); -} - -int main() -{ - struct passwd* pw = getpwuid(getuid()); - const char* homedir = pw->pw_dir; - - test_lorenzo(std::string(homedir) + "/datafields/vx", 1, dim3(280953867, 1, 1)); - test_lorenzo(std::string(homedir) + "/datafields/CLDHGH", 2, dim3(3600, 1800, 1)); - test_lorenzo(std::string(homedir) + "/datafields/CLOUDf48", 3, dim3(500, 500, 100)); - - return 0; -} +/** + * @file withwrapper_lorenzo.cu + * @author Jiannan Tian + * @brief A temporary test case using high-level component/API. + * @version 0.3 + * @date 2021-06-21 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#include +#include +#include +#include +#include +#include +#include "../utils/io.hh" +#include "../utils/verify.hh" + +#pragma message "--extended-lambda causes migration error (nvcc is incapable to be a wellrounded compiler)." 
+// #include "../utils/verify_gpu.cuh" +#include "../component/extrap_lorenzo.h" + +using std::cout; +using std::endl; + +using Data = float; +using Quant = uint16_t; +using FP = float; + +Data eb; +Data maxval, minval; + +// dim3 stride3; +size_t len1; +int radius = 512; + +namespace { + +#ifndef __CUDACC__ +struct __dim3_compat { + unsigned int x, y, z; + __dim3_compat(unsigned int _x, unsigned int _y, unsigned int _z){}; +}; + +using dim3 = __dim3_compat; +#endif + +auto get_npart = [](auto size, auto subsize) { + static_assert( + std::numeric_limits::is_integer and std::numeric_limits::is_integer, + "[get_npart] must be plain interger types."); + return (size + subsize - 1) / subsize; +}; +auto get_len_from_dim3 = [](dim3 size) { return size.x * size.y * size.z; }; +auto get_stride3 = [](dim3 size) -> dim3 { return dim3(1, size.x, size.x * size.y); }; + +} // namespace + +void test_lorenzo(std::string fname, int ndim, dim3 size3) +{ + cout << "filename: " << fname << '\n'; + + Data* h_data{nullptr}; + Data* d_data{nullptr}; + Data* h2_data{nullptr}; + Quant* d_quant{nullptr}; + + auto len1 = get_len_from_dim3(size3); + cout << "len1 from dim3: " << len1 << endl; + + cudaMallocHost(&h_data, len1 * sizeof(Data)); + io::read_binary_to_array(fname, h_data, len1); + cudaMallocHost(&h2_data, len1 * sizeof(Data)); + memcpy(h2_data, h_data, len1 * sizeof(Data)); + + cudaMalloc(&d_data, len1 * sizeof(Data)); + cudaMemcpy(d_data, h_data, len1 * sizeof(Data), cudaMemcpyHostToDevice); + cudaMalloc(&d_quant, len1 * sizeof(Quant)); + + auto maxval = *std::max_element(h_data, h_data + len1); + auto minval = *std::min_element(h_data, h_data + len1); + eb = 1e-3 * (maxval - minval); + + compress_lorenzo_construct(d_data, d_quant, size3, ndim, eb, radius); + decompress_lorenzo_reconstruct(d_data, d_quant, size3, ndim, eb, radius); + + cudaMemcpy(h_data, d_data, len1 * sizeof(Data), cudaMemcpyDeviceToHost); + + // TODO GPU verification does not print + // { + // Stat stat_gpu; + // verify_data_GPU(&stat_gpu, h_data, h2_data, len1); + // cusz::QualityViewer::print_metrics_cross(&stat_gpu, false, eb, 0, 1, false, true); + // } + { + Stat stat; + cusz::verify_data(&stat, h_data, h2_data, len1); + cusz::QualityViewer::print_metrics_cross(&stat, false, eb, 0, 1, false, false); + } + + // clear up + cudaFree(d_data); + cudaFree(d_quant); + cudaFreeHost(h_data); + cudaFreeHost(h2_data); +} + +int main() +{ + struct passwd* pw = getpwuid(getuid()); + const char* homedir = pw->pw_dir; + + test_lorenzo(std::string(homedir) + "/datafields/vx", 1, dim3(280953867, 1, 1)); + test_lorenzo(std::string(homedir) + "/datafields/CLDHGH", 2, dim3(3600, 1800, 1)); + test_lorenzo(std::string(homedir) + "/datafields/CLOUDf48", 3, dim3(500, 500, 100)); + + return 0; +} diff --git a/qtensor/compression/cusz/src/hf/detail/hf_bookg.inl b/qtensor/compression/cusz/src/hf/detail/hf_bookg.inl index 3fb9ef82..27890728 100644 --- a/qtensor/compression/cusz/src/hf/detail/hf_bookg.inl +++ b/qtensor/compression/cusz/src/hf/detail/hf_bookg.inl @@ -1,742 +1,742 @@ -/** - * @file huffman_parbook.cu - * @author Cody Rivera (cjrivera1@crimson.ua.edu) - * @brief Parallel Huffman Construction to generates canonical forward codebook. - * Based on [Ostadzadeh et al. 2007] (https://dblp.org/rec/conf/pdpta/OstadzadehEZMB07.bib) - * "A Two-phase Practical Parallel Algorithm for Construction of Huffman Codes". 
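test_lorenzo above fixes the error bound relative to the data range (eb = 1e-3 * (max - min)) before running the compress/decompress round trip and verifying the result. A condensed sketch of that range-relative bound, using a hypothetical helper name (C++17, matching the -std=c++17 flag in the Makefile above):

#include <algorithm>
#include <cstddef>

// Sketch: derive an absolute error bound from a range-relative one,
// as test_lorenzo does with mode_rel = 1e-3.
template <typename T>
double range_relative_eb(const T* data, std::size_t len, double mode_rel)
{
    auto [mn, mx] = std::minmax_element(data, data + len);
    return mode_rel * (static_cast<double>(*mx) - static_cast<double>(*mn));
}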
- * @version 0.1 - * @date 2020-10-24 - * (created) 2020-05 (rev) 2021-06-21 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#ifndef C883A574_4491_40E8_A083_1B6E8FB56670 -#define C883A574_4491_40E8_A083_1B6E8FB56670 - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "common.hh" -#include "hf/hf_bookg.hh" -#include "par_merge.inl" -#include "utils.hh" -#include "utils/timer.h" - -using std::cout; -using std::endl; -namespace cg = cooperative_groups; - -// GenerateCL Locals -__device__ int iNodesFront = 0; -__device__ int iNodesRear = 0; -__device__ int lNodesCur = 0; - -__device__ int iNodesSize = 0; -__device__ int curLeavesNum; - -__device__ int minFreq; - -__device__ int tempLength; - -__device__ int mergeFront; -__device__ int mergeRear; - -__device__ int lNodesIndex; - -// GenerateCW Locals -__device__ int CCL; -__device__ int CDPI; -__device__ int newCDPI; - -// Profiling -__device__ long long int s[10]; -__device__ long long int st[10]; - -// Mathematically correct mod -#define MOD(a, b) ((((a) % (b)) + (b)) % (b)) - -namespace par_huffman { -namespace detail { - -// clang-format off -template __global__ void GPU_FillArraySequence(T*, unsigned int); -template __global__ void GPU_GetFirstNonzeroIndex(T*, unsigned int, unsigned int*); -template __global__ void GPU_ReverseArray(T*, unsigned int); -template __global__ void GPU_ReorderByIndex(H*, T*, unsigned int); -// clang-format on - -} // namespace detail -} // namespace par_huffman - -namespace par_huffman { - -// Codeword length -template -__global__ void GPU_GenerateCL(F*, F*, int, F*, int*, F*, int*, F*, int*, int*, F*, int*, int*, uint32_t*, int, int); - -// Forward Codebook -template -__global__ void GPU_GenerateCW(F* CL, H* CW, H* first, H* entry, int size); - -} // namespace par_huffman - -// Parallel huffman code generation -// clang-format off -template -__global__ void par_huffman::GPU_GenerateCL( - F* histogram, F* CL, int size, - /* Global Arrays */ - F* lNodesFreq, int* lNodesLeader, - F* iNodesFreq, int* iNodesLeader, - F* tempFreq, int* tempIsLeaf, int* tempIndex, - F* copyFreq, int* copyIsLeaf, int* copyIndex, - uint32_t* diagonal_path_intersections, int mblocks, int mthreads) -{ - // clang-format on - - extern __shared__ int32_t shmem[]; - // Shared variables - int32_t& x_top = shmem[0]; - int32_t& y_top = shmem[1]; - int32_t& x_bottom = shmem[2]; - int32_t& y_bottom = shmem[3]; - int32_t& found = shmem[4]; - int32_t* oneorzero = &shmem[5]; - - unsigned int thread = (blockIdx.x * blockDim.x) + threadIdx.x; - const unsigned int i = thread; // Adaptation for easier porting - auto current_grid = cg::this_grid(); - - /* Initialization */ - if (thread < size) { - lNodesLeader[i] = -1; - CL[i] = 0; - } - - if (thread == 0) { - iNodesFront = 0; - iNodesRear = 0; - lNodesCur = 0; - - iNodesSize = 0; - } - current_grid.sync(); - - /* While there is not exactly one internal node */ - while (lNodesCur < size || iNodesSize > 1) { - /* Combine two most frequent nodes on same level */ - if (thread == 0) { - F midFreq[4]; - int midIsLeaf[4]; - for (int i = 0; i < 4; ++i) midFreq[i] = UINT_MAX; - - if (lNodesCur < size) { - midFreq[0] = lNodesFreq[lNodesCur]; - midIsLeaf[0] = 1; - } - if (lNodesCur < size - 1) { - midFreq[1] = lNodesFreq[lNodesCur + 1]; - midIsLeaf[1] = 1; - } - if (iNodesSize >= 1) { - midFreq[2] = 
iNodesFreq[iNodesFront]; - midIsLeaf[2] = 0; - } - if (iNodesSize >= 2) { - midFreq[3] = iNodesFreq[MOD(iNodesFront + 1, size)]; - midIsLeaf[3] = 0; - } - - /* Select the minimum of minimums - 4elt sorting network */ - /* TODO There's likely a good 1-warp faster way to do this */ - { - F tempFreq; - int tempIsLeaf; - if (midFreq[1] > midFreq[3]) { - tempFreq = midFreq[1]; - midFreq[1] = midFreq[3]; - midFreq[3] = tempFreq; - tempIsLeaf = midIsLeaf[1]; - midIsLeaf[1] = midIsLeaf[3]; - midIsLeaf[3] = tempIsLeaf; - } - if (midFreq[0] > midFreq[2]) { - tempFreq = midFreq[0]; - midFreq[0] = midFreq[2]; - midFreq[2] = tempFreq; - tempIsLeaf = midIsLeaf[0]; - midIsLeaf[0] = midIsLeaf[2]; - midIsLeaf[2] = tempIsLeaf; - } - if (midFreq[0] > midFreq[1]) { - tempFreq = midFreq[0]; - midFreq[0] = midFreq[1]; - midFreq[1] = tempFreq; - tempIsLeaf = midIsLeaf[0]; - midIsLeaf[0] = midIsLeaf[1]; - midIsLeaf[1] = tempIsLeaf; - } - if (midFreq[2] > midFreq[3]) { - tempFreq = midFreq[2]; - midFreq[2] = midFreq[3]; - midFreq[3] = tempFreq; - tempIsLeaf = midIsLeaf[2]; - midIsLeaf[2] = midIsLeaf[3]; - midIsLeaf[3] = tempIsLeaf; - } - if (midFreq[1] > midFreq[2]) { - tempFreq = midFreq[1]; - midFreq[1] = midFreq[2]; - midFreq[2] = tempFreq; - tempIsLeaf = midIsLeaf[1]; - midIsLeaf[1] = midIsLeaf[2]; - midIsLeaf[2] = tempIsLeaf; - } - } - - minFreq = midFreq[0]; - if (midFreq[1] < UINT_MAX) { minFreq += midFreq[1]; } - iNodesFreq[iNodesRear] = minFreq; - iNodesLeader[iNodesRear] = -1; - - /* If is leaf */ - if (midIsLeaf[0]) { - lNodesLeader[lNodesCur] = iNodesRear; - ++CL[lNodesCur], ++lNodesCur; - } - else { - iNodesLeader[iNodesFront] = iNodesRear; - iNodesFront = MOD(iNodesFront + 1, size); - } - if (midIsLeaf[1]) { - lNodesLeader[lNodesCur] = iNodesRear; - ++CL[lNodesCur], ++lNodesCur; - } - else { - iNodesLeader[iNodesFront] = iNodesRear; - iNodesFront = MOD(iNodesFront + 1, size); /* ? 
*/ - } - - // iNodesRear = MOD(iNodesRear + 1, size); - - iNodesSize = MOD(iNodesRear - iNodesFront, size); - } - - // int curLeavesNum; - /* Select elements to copy -- parallelized */ - curLeavesNum = 0; - current_grid.sync(); - if (i >= lNodesCur && i < size) { - // Parallel component - int threadCurLeavesNum; - if (lNodesFreq[i] <= minFreq) { - threadCurLeavesNum = i - lNodesCur + 1; - // Atomic max -- Largest valid index - atomicMax(&curLeavesNum, threadCurLeavesNum); - } - - if (i - lNodesCur < curLeavesNum) { - copyFreq[i - lNodesCur] = lNodesFreq[i]; - copyIndex[i - lNodesCur] = i; - copyIsLeaf[i - lNodesCur] = 1; - } - } - - current_grid.sync(); - - /* Updates Iterators */ - if (thread == 0) { - mergeRear = iNodesRear; - mergeFront = iNodesFront; - - if ((curLeavesNum + iNodesSize) % 2 == 0) { iNodesFront = iNodesRear; } - /* Odd number of nodes to merge - leave out one*/ - else if ( - (iNodesSize != 0) // - and (curLeavesNum == 0 // - or (histogram[lNodesCur + curLeavesNum] <= iNodesFreq[MOD(iNodesRear - 1, size)])) // - ) { - mergeRear = MOD(mergeRear - 1, size); - iNodesFront = MOD(iNodesRear - 1, size); - } - else { - iNodesFront = iNodesRear; - --curLeavesNum; - } - - lNodesCur = lNodesCur + curLeavesNum; - iNodesRear = MOD(iNodesRear + 1, size); - } - current_grid.sync(); - - /* Parallelized Merging Phase */ - - /*if (thread == 0) { - merge(copyFreq, copyIndex, copyIsLeaf, 0, curLeavesNum, - iNodesFreq, mergeFront, mergeRear, size, - tempFreq, tempIndex, tempIsLeaf, tempLength); - }*/ - - parMerge( - copyFreq, copyIndex, copyIsLeaf, 0, curLeavesNum, // - iNodesFreq, mergeFront, mergeRear, size, // - tempFreq, tempIndex, tempIsLeaf, tempLength, // - diagonal_path_intersections, mblocks, mthreads, // - x_top, y_top, x_bottom, y_bottom, found, oneorzero); - current_grid.sync(); - - /* Melding phase -- New */ - if (thread < tempLength / 2) { - int ind = MOD(iNodesRear + i, size); - iNodesFreq[ind] = tempFreq[(2 * i)] + tempFreq[(2 * i) + 1]; - iNodesLeader[ind] = -1; - - if (tempIsLeaf[(2 * i)]) { - lNodesLeader[tempIndex[(2 * i)]] = ind; - ++CL[tempIndex[(2 * i)]]; - } - else { - iNodesLeader[tempIndex[(2 * i)]] = ind; - } - if (tempIsLeaf[(2 * i) + 1]) { - lNodesLeader[tempIndex[(2 * i) + 1]] = ind; - ++CL[tempIndex[(2 * i) + 1]]; - } - else { - iNodesLeader[tempIndex[(2 * i) + 1]] = ind; - } - } - current_grid.sync(); - - if (thread == 0) { iNodesRear = MOD(iNodesRear + (tempLength / 2), size); } - current_grid.sync(); - - /* Update leaders */ - if (thread < size) { - if (lNodesLeader[i] != -1) { - if (iNodesLeader[lNodesLeader[i]] != -1) { - lNodesLeader[i] = iNodesLeader[lNodesLeader[i]]; - ++CL[i]; - } - } - } - current_grid.sync(); - - if (thread == 0) { iNodesSize = MOD(iNodesRear - iNodesFront, size); } - current_grid.sync(); - } -} - -// Parallelized with atomic writes, but could replace with Jiannan's similar code -template -__global__ void par_huffman::GPU_GenerateCW(F* CL, H* CW, H* first, H* entry, int size) -{ - unsigned int thread = (blockIdx.x * blockDim.x) + threadIdx.x; - const unsigned int i = thread; // Porting convenience - auto current_grid = cg::this_grid(); - auto type_bw = sizeof(H) * 8; - - /* Reverse in place - Probably a more CUDA-appropriate way */ - if (thread < size / 2) { - F temp = CL[i]; - CL[i] = CL[size - i - 1]; - CL[size - i - 1] = temp; - } - current_grid.sync(); - - if (thread == 0) { - CCL = CL[0]; - CDPI = 0; - newCDPI = size - 1; - entry[CCL] = 0; - - // Edge case -- only one input symbol - CW[CDPI] = 0; - first[CCL] = CW[CDPI] ^ (((H)1 
<< (H)CL[CDPI]) - 1); - entry[CCL + 1] = 1; - } - current_grid.sync(); - - // Initialize first and entry arrays - if (thread < CCL) { - // Initialization of first to Max ensures that unused code - // lengths are skipped over in decoding. - first[i] = std::numeric_limits::max(); - entry[i] = 0; - } - // Initialize first element of entry - current_grid.sync(); - - while (CDPI < size - 1) { - // CDPI update - if (i < size - 1 && CL[i + 1] > CCL) { atomicMin(&newCDPI, i); } - current_grid.sync(); - - // Last element to update - const int updateEnd = (newCDPI >= size - 1) ? type_bw : CL[newCDPI + 1]; - // Fill base - const int curEntryVal = entry[CCL]; - // Number of elements of length CCL - const int numCCL = (newCDPI - CDPI + 1); - - // Get first codeword - if (i == 0) { - if (CDPI == 0) { CW[newCDPI] = 0; } - else { - CW[newCDPI] = CW[CDPI]; // Pre-stored - } - } - current_grid.sync(); - - if (i < size) { - // Parallel canonical codeword generation - if (i >= CDPI && i < newCDPI) { CW[i] = CW[newCDPI] + (newCDPI - i); } - } - - // Update entry and first arrays in O(1) time - if (thread > CCL && thread < updateEnd) { entry[i] = curEntryVal + numCCL; } - // Add number of entries to next CCL - if (thread == 0) { - if (updateEnd < type_bw) { entry[updateEnd] = curEntryVal + numCCL; } - } - current_grid.sync(); - - // Update first array in O(1) time - if (thread == CCL) { - // Flip least significant CL[CDPI] bits - first[CCL] = CW[CDPI] ^ (((H)1 << (H)CL[CDPI]) - 1); - } - if (thread > CCL && thread < updateEnd) { first[i] = std::numeric_limits::max(); } - current_grid.sync(); - - if (thread == 0) { - if (newCDPI < size - 1) { - int CLDiff = CL[newCDPI + 1] - CL[newCDPI]; - // Add and shift -- Next canonical code - CW[newCDPI + 1] = ((CW[CDPI] + 1) << CLDiff); - CCL = CL[newCDPI + 1]; - - ++newCDPI; - } - - // Update CDPI to newCDPI after codeword length increase - CDPI = newCDPI; - newCDPI = size - 1; - } - current_grid.sync(); - } - - if (thread < size) { - /* Make encoded codeword compatible with CUSZ */ - CW[i] = (CW[i] | (((H)CL[i] & (H)0xffu) << ((sizeof(H) * 8) - 8))) ^ (((H)1 << (H)CL[i]) - 1); - } - current_grid.sync(); - - /* Reverse partial codebook */ - if (thread < size / 2) { - H temp = CW[i]; - CW[i] = CW[size - i - 1]; - CW[size - i - 1] = temp; - } -} - -// TODO forceinilne? -// Helper implementations -template -__global__ void par_huffman::detail::GPU_FillArraySequence(T* array, unsigned int size) -{ - unsigned int thread = (blockIdx.x * blockDim.x) + threadIdx.x; - if (thread < size) { array[thread] = thread; } -} - -// Precondition -- Result is preset to be equal to size -template -__global__ void par_huffman::detail::GPU_GetFirstNonzeroIndex(T* array, unsigned int size, unsigned int* result) -{ - unsigned int thread = (blockIdx.x * blockDim.x) + threadIdx.x; - if (array[thread] != 0) { atomicMin(result, thread); } -} - -namespace par_huffman { -namespace detail { -__global__ void GPU_GetMaxCWLength(unsigned int* CL, unsigned int size, unsigned int* result) -{ - (void)size; - unsigned int thread = (blockIdx.x * blockDim.x) + threadIdx.x; - if (thread == 0) { *result = CL[0]; } -} - -} // namespace detail -} // namespace par_huffman - -/** - * @brief Reorders given a set of indices. 
Programmer must ensure that all index[i] - * are unique or else race conditions may occur - * - * @tparam T - * @tparam Q - * @param array e.g., codebook - * @param index e.g., input data - * @param size - * @return __global__ - */ -template -__global__ void par_huffman::detail::GPU_ReorderByIndex(H* array, T* index, unsigned int size) -{ - unsigned int thread = (blockIdx.x * blockDim.x) + threadIdx.x; - H temp; - T newIndex; - if (thread < size) { - temp = array[thread]; - newIndex = index[thread]; - array[(int)newIndex] = temp; - } -} - -// Reverses a given array. -template -__global__ void par_huffman::detail::GPU_ReverseArray(T* array, unsigned int size) -{ - unsigned int thread = (blockIdx.x * blockDim.x) + threadIdx.x; - if (thread < size / 2) { - T temp = array[thread]; - array[thread] = array[size - thread - 1]; - array[size - thread - 1] = temp; - } -} - -// Parallel codebook generation wrapper -template -void asz::hf_buildbook_g( - uint32_t* freq, - int const dict_size, - H* codebook, - uint8_t* reverse_codebook, - int const revbook_nbyte, - float* time_book, - cudaStream_t stream) -{ - // Metadata - auto type_bw = sizeof(H) * 8; - auto _d_first = reinterpret_cast(reverse_codebook); - auto _d_entry = reinterpret_cast(reverse_codebook + (sizeof(H) * type_bw)); - auto _d_qcode = reinterpret_cast(reverse_codebook + (sizeof(H) * 2 * type_bw)); - - CREATE_CUDAEVENT_PAIR; - START_CUDAEVENT_RECORDING(stream); - - // Sort Qcodes by frequency - int nblocks = (dict_size / 1024) + 1; - par_huffman::detail::GPU_FillArraySequence<<>>(_d_qcode, (unsigned int)dict_size); - cudaStreamSynchronize(stream); - - /** - * Originally from par_huffman_sortbyfreq.cu by Cody Rivera (cjrivera1@crimson.ua.edu) - * Sorts quantization codes by frequency, using a key-value sort. This functionality is placed in a separate - * compilation unit as thrust calls fail in par_huffman.cu. 
- * - * Resolved by - * 1) inlining function - * 2) using `thrust::device_pointer_cast(var)` instead of `thrust::device_pointer(var)` - */ - auto lambda_sort_by_freq = [] __host__(auto freq, auto len, auto qcode) { - thrust::sort_by_key( - thrust::device_pointer_cast(freq), thrust::device_pointer_cast(freq + len), - thrust::device_pointer_cast(qcode)); - }; - - lambda_sort_by_freq(freq, dict_size, _d_qcode); - cudaStreamSynchronize(stream); - - unsigned int* d_first_nonzero_index; - unsigned int first_nonzero_index = dict_size; - cudaMalloc(&d_first_nonzero_index, sizeof(unsigned int)); - cudaMemcpy(d_first_nonzero_index, &first_nonzero_index, sizeof(unsigned int), cudaMemcpyHostToDevice); - par_huffman::detail::GPU_GetFirstNonzeroIndex - <<>>(freq, dict_size, d_first_nonzero_index); - cudaStreamSynchronize(stream); - cudaMemcpy(&first_nonzero_index, d_first_nonzero_index, sizeof(unsigned int), cudaMemcpyDeviceToHost); - cudaFree(d_first_nonzero_index); - - int nz_dict_size = dict_size - first_nonzero_index; - unsigned int* _nz_d_freq = freq + first_nonzero_index; - H* _nz_d_codebook = codebook + first_nonzero_index; - int nz_nblocks = (nz_dict_size / 1024) + 1; - - // Memory Allocation -- Perhaps put in another wrapper - // clang-format off - unsigned int *CL = nullptr; - /*unsigned int* lNodesFreq*/ int *lNodesLeader = nullptr; - unsigned int *iNodesFreq = nullptr; int *iNodesLeader = nullptr; - unsigned int *tempFreq = nullptr; int *tempIsLeaf = nullptr; int *tempIndex = nullptr; - unsigned int *copyFreq = nullptr; int *copyIsLeaf = nullptr; int *copyIndex = nullptr; - cudaMalloc(&CL, nz_dict_size * sizeof(unsigned int) ); - cudaMalloc(&lNodesLeader, nz_dict_size * sizeof(int) ); - cudaMalloc(&iNodesFreq, nz_dict_size * sizeof(unsigned int) ); - cudaMalloc(&iNodesLeader, nz_dict_size * sizeof(int) ); - cudaMalloc(&tempFreq, nz_dict_size * sizeof(unsigned int) ); - cudaMalloc(&tempIsLeaf, nz_dict_size * sizeof(int) ); - cudaMalloc(&tempIndex, nz_dict_size * sizeof(int) ); - cudaMalloc(©Freq, nz_dict_size * sizeof(unsigned int) ); - cudaMalloc(©IsLeaf, nz_dict_size * sizeof(int) ); - cudaMalloc(©Index, nz_dict_size * sizeof(int) ); - cudaMemset(CL, 0, nz_dict_size * sizeof(int) ); - // clang-format on - - // Grid configuration for CL -- based on Cooperative Groups - int cg_mblocks; - int cg_blocks_sm; - int device_id; - int mthreads = 32; // 1 warp - cudaDeviceProp deviceProp; - cudaGetDevice(&device_id); - cudaGetDeviceProperties(&deviceProp, device_id); - cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &cg_blocks_sm, par_huffman::GPU_GenerateCL, mthreads, 5 * sizeof(int32_t) + 32 * sizeof(int32_t)); - cg_mblocks = deviceProp.multiProcessorCount * cg_blocks_sm; - - int ELTS_PER_SEQ_MERGE = 16; - int mblocks = std::min(cg_mblocks, (nz_dict_size / ELTS_PER_SEQ_MERGE) + 1); - - // Exit if not enough exposed parallelism -- TODO modify kernels so this is unneeded - int tthreads = mthreads * mblocks; - if (tthreads < nz_dict_size) { - cout << LOG_ERR << "Insufficient on-device parallelism to construct a " << nz_dict_size - << " non-zero item codebook" << endl; - cout << LOG_ERR << "Provided parallelism: " << mblocks << " blocks, " << mthreads << " threads, " << tthreads - << " total" << endl - << endl; - // cout << LOG_ERR << "Exiting cuSZ ..." 
<< endl; - throw std::system_error(); - // exit(1); - } - - uint32_t* diagonal_path_intersections; - cudaMalloc(&diagonal_path_intersections, (2 * (mblocks + 1)) * sizeof(uint32_t)); - - // Codebook already init'ed - cudaStreamSynchronize(stream); - - // Call first kernel - // Collect arguments - void* CL_Args[] = {(void*)&_nz_d_freq, (void*)&CL, - (void*)&nz_dict_size, (void*)&_nz_d_freq, - (void*)&lNodesLeader, (void*)&iNodesFreq, - (void*)&iNodesLeader, (void*)&tempFreq, - (void*)&tempIsLeaf, (void*)&tempIndex, - (void*)©Freq, (void*)©IsLeaf, - (void*)©Index, (void*)&diagonal_path_intersections, - (void*)&mblocks, (void*)&mthreads}; - // Cooperative Launch - cudaLaunchCooperativeKernel( - (void*)par_huffman::GPU_GenerateCL, mblocks, mthreads, CL_Args, - 5 * sizeof(int32_t) + 32 * sizeof(int32_t)); - cudaStreamSynchronize(stream); - - // Exits if the highest codeword length is greater than what - // the adaptive representation can handle - // TODO do proper cleanup - - unsigned int* d_max_CL; - unsigned int max_CL; - cudaMalloc(&d_max_CL, sizeof(unsigned int)); - par_huffman::detail::GPU_GetMaxCWLength<<<1, 1>>>(CL, nz_dict_size, d_max_CL); - cudaStreamSynchronize(stream); - cudaMemcpy(&max_CL, d_max_CL, sizeof(unsigned int), cudaMemcpyDeviceToHost); - cudaFree(d_max_CL); - - int max_CW_bits = (sizeof(H) * 8) - 8; - if (max_CL > max_CW_bits) { - cout << LOG_ERR << "Cannot store all Huffman codewords in " << max_CW_bits + 8 << "-bit representation" << endl; - cout << LOG_ERR << "Huffman codeword representation requires at least " << max_CL + 8 - << " bits (longest codeword: " << max_CL << " bits)" << endl; - // cout << LOG_ERR << "(Consider running with -H 8 for 8-byte representation)" << endl << endl; - // cout << LOG_ERR << "Exiting cuSZ ..." << endl; - // exit(1); - throw std::runtime_error("Falling back to 8-byte Codec."); - } - - // Configure CW for 1024 threads/block - int cg_cw_mblocks = (cg_mblocks * mthreads) / 1024; - int cw_mblocks = std::min(cg_cw_mblocks, nz_nblocks); - - // Exit if not enough exposed parallelism -- TODO modify kernels so this is unneeded - int cw_tthreads = cw_mblocks * 1024; - if (cw_tthreads < nz_dict_size) { - cout << LOG_ERR << "Insufficient on-device parallelism to construct a " << nz_dict_size - << " non-zero item codebook" << endl; - cout << LOG_ERR << "Provided parallelism: " << cw_mblocks << " blocks, " << 1024 << " threads, " << cw_tthreads - << " total" << endl - << endl; - // cout << LOG_ERR << "Exiting cuSZ ..." 
<< endl; - // exit(1); - throw std::system_error(); - } - - void* CW_Args[] = { - (void*)&CL, // - (void*)&_nz_d_codebook, // - (void*)&_d_first, // - (void*)&_d_entry, // - (void*)&nz_dict_size}; - - // Call second kernel - cudaLaunchCooperativeKernel( - (void*)par_huffman::GPU_GenerateCW, // - cw_mblocks, // - 1024, // - CW_Args); - cudaStreamSynchronize(stream); - -#ifdef D_DEBUG_PRINT - print_codebook<<<1, 32>>>(codebook, dict_size); // PASS - cudaStreamSynchronize(stream); -#endif - - // Reverse _d_qcode and codebook - par_huffman::detail::GPU_ReverseArray<<>>(codebook, (unsigned int)dict_size); - par_huffman::detail::GPU_ReverseArray<<>>(_d_qcode, (unsigned int)dict_size); - cudaStreamSynchronize(stream); - - par_huffman::detail::GPU_ReorderByIndex<<>>(codebook, _d_qcode, (unsigned int)dict_size); - cudaStreamSynchronize(stream); - - STOP_CUDAEVENT_RECORDING(stream); - TIME_ELAPSED_CUDAEVENT(time_book); - DESTROY_CUDAEVENT_PAIR; - - // Cleanup - cudaFree(CL); - cudaFree(lNodesLeader); - cudaFree(iNodesFreq); - cudaFree(iNodesLeader); - cudaFree(tempFreq); - cudaFree(tempIsLeaf); - cudaFree(tempIndex); - cudaFree(copyFreq); - cudaFree(copyIsLeaf); - cudaFree(copyIndex); - cudaFree(diagonal_path_intersections); - cudaStreamSynchronize(stream); - -#ifdef D_DEBUG_PRINT - print_codebook<<<1, 32>>>(codebook, dict_size); // PASS - cudaStreamSynchronize(stream); -#endif -} - -#endif /* C883A574_4491_40E8_A083_1B6E8FB56670 */ +/** + * @file huffman_parbook.cu + * @author Cody Rivera (cjrivera1@crimson.ua.edu) + * @brief Parallel Huffman Construction to generates canonical forward codebook. + * Based on [Ostadzadeh et al. 2007] (https://dblp.org/rec/conf/pdpta/OstadzadehEZMB07.bib) + * "A Two-phase Practical Parallel Algorithm for Construction of Huffman Codes". 
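+ *        (Editor's descriptive note, derived from the kernels declared below: the two phases map to
+ *        GPU_GenerateCL, which computes per-symbol codeword lengths, and GPU_GenerateCW, which then
+ *        emits the canonical forward codebook.)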
+ * @version 0.1 + * @date 2020-10-24 + * (created) 2020-05 (rev) 2021-06-21 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef C883A574_4491_40E8_A083_1B6E8FB56670 +#define C883A574_4491_40E8_A083_1B6E8FB56670 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.hh" +#include "hf/hf_bookg.hh" +#include "par_merge.inl" +#include "utils.hh" +#include "utils/timer.h" + +using std::cout; +using std::endl; +namespace cg = cooperative_groups; + +// GenerateCL Locals +__device__ int iNodesFront = 0; +__device__ int iNodesRear = 0; +__device__ int lNodesCur = 0; + +__device__ int iNodesSize = 0; +__device__ int curLeavesNum; + +__device__ int minFreq; + +__device__ int tempLength; + +__device__ int mergeFront; +__device__ int mergeRear; + +__device__ int lNodesIndex; + +// GenerateCW Locals +__device__ int CCL; +__device__ int CDPI; +__device__ int newCDPI; + +// Profiling +__device__ long long int s[10]; +__device__ long long int st[10]; + +// Mathematically correct mod +#define MOD(a, b) ((((a) % (b)) + (b)) % (b)) + +namespace par_huffman { +namespace detail { + +// clang-format off +template __global__ void GPU_FillArraySequence(T*, unsigned int); +template __global__ void GPU_GetFirstNonzeroIndex(T*, unsigned int, unsigned int*); +template __global__ void GPU_ReverseArray(T*, unsigned int); +template __global__ void GPU_ReorderByIndex(H*, T*, unsigned int); +// clang-format on + +} // namespace detail +} // namespace par_huffman + +namespace par_huffman { + +// Codeword length +template +__global__ void GPU_GenerateCL(F*, F*, int, F*, int*, F*, int*, F*, int*, int*, F*, int*, int*, uint32_t*, int, int); + +// Forward Codebook +template +__global__ void GPU_GenerateCW(F* CL, H* CW, H* first, H* entry, int size); + +} // namespace par_huffman + +// Parallel huffman code generation +// clang-format off +template +__global__ void par_huffman::GPU_GenerateCL( + F* histogram, F* CL, int size, + /* Global Arrays */ + F* lNodesFreq, int* lNodesLeader, + F* iNodesFreq, int* iNodesLeader, + F* tempFreq, int* tempIsLeaf, int* tempIndex, + F* copyFreq, int* copyIsLeaf, int* copyIndex, + uint32_t* diagonal_path_intersections, int mblocks, int mthreads) +{ + // clang-format on + + extern __shared__ int32_t shmem[]; + // Shared variables + int32_t& x_top = shmem[0]; + int32_t& y_top = shmem[1]; + int32_t& x_bottom = shmem[2]; + int32_t& y_bottom = shmem[3]; + int32_t& found = shmem[4]; + int32_t* oneorzero = &shmem[5]; + + unsigned int thread = (blockIdx.x * blockDim.x) + threadIdx.x; + const unsigned int i = thread; // Adaptation for easier porting + auto current_grid = cg::this_grid(); + + /* Initialization */ + if (thread < size) { + lNodesLeader[i] = -1; + CL[i] = 0; + } + + if (thread == 0) { + iNodesFront = 0; + iNodesRear = 0; + lNodesCur = 0; + + iNodesSize = 0; + } + current_grid.sync(); + + /* While there is not exactly one internal node */ + while (lNodesCur < size || iNodesSize > 1) { + /* Combine two most frequent nodes on same level */ + if (thread == 0) { + F midFreq[4]; + int midIsLeaf[4]; + for (int i = 0; i < 4; ++i) midFreq[i] = UINT_MAX; + + if (lNodesCur < size) { + midFreq[0] = lNodesFreq[lNodesCur]; + midIsLeaf[0] = 1; + } + if (lNodesCur < size - 1) { + midFreq[1] = lNodesFreq[lNodesCur + 1]; + midIsLeaf[1] = 1; + } + if (iNodesSize >= 1) { + midFreq[2] = 
iNodesFreq[iNodesFront]; + midIsLeaf[2] = 0; + } + if (iNodesSize >= 2) { + midFreq[3] = iNodesFreq[MOD(iNodesFront + 1, size)]; + midIsLeaf[3] = 0; + } + + /* Select the minimum of minimums - 4elt sorting network */ + /* TODO There's likely a good 1-warp faster way to do this */ + { + F tempFreq; + int tempIsLeaf; + if (midFreq[1] > midFreq[3]) { + tempFreq = midFreq[1]; + midFreq[1] = midFreq[3]; + midFreq[3] = tempFreq; + tempIsLeaf = midIsLeaf[1]; + midIsLeaf[1] = midIsLeaf[3]; + midIsLeaf[3] = tempIsLeaf; + } + if (midFreq[0] > midFreq[2]) { + tempFreq = midFreq[0]; + midFreq[0] = midFreq[2]; + midFreq[2] = tempFreq; + tempIsLeaf = midIsLeaf[0]; + midIsLeaf[0] = midIsLeaf[2]; + midIsLeaf[2] = tempIsLeaf; + } + if (midFreq[0] > midFreq[1]) { + tempFreq = midFreq[0]; + midFreq[0] = midFreq[1]; + midFreq[1] = tempFreq; + tempIsLeaf = midIsLeaf[0]; + midIsLeaf[0] = midIsLeaf[1]; + midIsLeaf[1] = tempIsLeaf; + } + if (midFreq[2] > midFreq[3]) { + tempFreq = midFreq[2]; + midFreq[2] = midFreq[3]; + midFreq[3] = tempFreq; + tempIsLeaf = midIsLeaf[2]; + midIsLeaf[2] = midIsLeaf[3]; + midIsLeaf[3] = tempIsLeaf; + } + if (midFreq[1] > midFreq[2]) { + tempFreq = midFreq[1]; + midFreq[1] = midFreq[2]; + midFreq[2] = tempFreq; + tempIsLeaf = midIsLeaf[1]; + midIsLeaf[1] = midIsLeaf[2]; + midIsLeaf[2] = tempIsLeaf; + } + } + + minFreq = midFreq[0]; + if (midFreq[1] < UINT_MAX) { minFreq += midFreq[1]; } + iNodesFreq[iNodesRear] = minFreq; + iNodesLeader[iNodesRear] = -1; + + /* If is leaf */ + if (midIsLeaf[0]) { + lNodesLeader[lNodesCur] = iNodesRear; + ++CL[lNodesCur], ++lNodesCur; + } + else { + iNodesLeader[iNodesFront] = iNodesRear; + iNodesFront = MOD(iNodesFront + 1, size); + } + if (midIsLeaf[1]) { + lNodesLeader[lNodesCur] = iNodesRear; + ++CL[lNodesCur], ++lNodesCur; + } + else { + iNodesLeader[iNodesFront] = iNodesRear; + iNodesFront = MOD(iNodesFront + 1, size); /* ? 
*/ + } + + // iNodesRear = MOD(iNodesRear + 1, size); + + iNodesSize = MOD(iNodesRear - iNodesFront, size); + } + + // int curLeavesNum; + /* Select elements to copy -- parallelized */ + curLeavesNum = 0; + current_grid.sync(); + if (i >= lNodesCur && i < size) { + // Parallel component + int threadCurLeavesNum; + if (lNodesFreq[i] <= minFreq) { + threadCurLeavesNum = i - lNodesCur + 1; + // Atomic max -- Largest valid index + atomicMax(&curLeavesNum, threadCurLeavesNum); + } + + if (i - lNodesCur < curLeavesNum) { + copyFreq[i - lNodesCur] = lNodesFreq[i]; + copyIndex[i - lNodesCur] = i; + copyIsLeaf[i - lNodesCur] = 1; + } + } + + current_grid.sync(); + + /* Updates Iterators */ + if (thread == 0) { + mergeRear = iNodesRear; + mergeFront = iNodesFront; + + if ((curLeavesNum + iNodesSize) % 2 == 0) { iNodesFront = iNodesRear; } + /* Odd number of nodes to merge - leave out one*/ + else if ( + (iNodesSize != 0) // + and (curLeavesNum == 0 // + or (histogram[lNodesCur + curLeavesNum] <= iNodesFreq[MOD(iNodesRear - 1, size)])) // + ) { + mergeRear = MOD(mergeRear - 1, size); + iNodesFront = MOD(iNodesRear - 1, size); + } + else { + iNodesFront = iNodesRear; + --curLeavesNum; + } + + lNodesCur = lNodesCur + curLeavesNum; + iNodesRear = MOD(iNodesRear + 1, size); + } + current_grid.sync(); + + /* Parallelized Merging Phase */ + + /*if (thread == 0) { + merge(copyFreq, copyIndex, copyIsLeaf, 0, curLeavesNum, + iNodesFreq, mergeFront, mergeRear, size, + tempFreq, tempIndex, tempIsLeaf, tempLength); + }*/ + + parMerge( + copyFreq, copyIndex, copyIsLeaf, 0, curLeavesNum, // + iNodesFreq, mergeFront, mergeRear, size, // + tempFreq, tempIndex, tempIsLeaf, tempLength, // + diagonal_path_intersections, mblocks, mthreads, // + x_top, y_top, x_bottom, y_bottom, found, oneorzero); + current_grid.sync(); + + /* Melding phase -- New */ + if (thread < tempLength / 2) { + int ind = MOD(iNodesRear + i, size); + iNodesFreq[ind] = tempFreq[(2 * i)] + tempFreq[(2 * i) + 1]; + iNodesLeader[ind] = -1; + + if (tempIsLeaf[(2 * i)]) { + lNodesLeader[tempIndex[(2 * i)]] = ind; + ++CL[tempIndex[(2 * i)]]; + } + else { + iNodesLeader[tempIndex[(2 * i)]] = ind; + } + if (tempIsLeaf[(2 * i) + 1]) { + lNodesLeader[tempIndex[(2 * i) + 1]] = ind; + ++CL[tempIndex[(2 * i) + 1]]; + } + else { + iNodesLeader[tempIndex[(2 * i) + 1]] = ind; + } + } + current_grid.sync(); + + if (thread == 0) { iNodesRear = MOD(iNodesRear + (tempLength / 2), size); } + current_grid.sync(); + + /* Update leaders */ + if (thread < size) { + if (lNodesLeader[i] != -1) { + if (iNodesLeader[lNodesLeader[i]] != -1) { + lNodesLeader[i] = iNodesLeader[lNodesLeader[i]]; + ++CL[i]; + } + } + } + current_grid.sync(); + + if (thread == 0) { iNodesSize = MOD(iNodesRear - iNodesFront, size); } + current_grid.sync(); + } +} + +// Parallelized with atomic writes, but could replace with Jiannan's similar code +template +__global__ void par_huffman::GPU_GenerateCW(F* CL, H* CW, H* first, H* entry, int size) +{ + unsigned int thread = (blockIdx.x * blockDim.x) + threadIdx.x; + const unsigned int i = thread; // Porting convenience + auto current_grid = cg::this_grid(); + auto type_bw = sizeof(H) * 8; + + /* Reverse in place - Probably a more CUDA-appropriate way */ + if (thread < size / 2) { + F temp = CL[i]; + CL[i] = CL[size - i - 1]; + CL[size - i - 1] = temp; + } + current_grid.sync(); + + if (thread == 0) { + CCL = CL[0]; + CDPI = 0; + newCDPI = size - 1; + entry[CCL] = 0; + + // Edge case -- only one input symbol + CW[CDPI] = 0; + first[CCL] = CW[CDPI] ^ (((H)1 
<< (H)CL[CDPI]) - 1); + entry[CCL + 1] = 1; + } + current_grid.sync(); + + // Initialize first and entry arrays + if (thread < CCL) { + // Initialization of first to Max ensures that unused code + // lengths are skipped over in decoding. + first[i] = std::numeric_limits::max(); + entry[i] = 0; + } + // Initialize first element of entry + current_grid.sync(); + + while (CDPI < size - 1) { + // CDPI update + if (i < size - 1 && CL[i + 1] > CCL) { atomicMin(&newCDPI, i); } + current_grid.sync(); + + // Last element to update + const int updateEnd = (newCDPI >= size - 1) ? type_bw : CL[newCDPI + 1]; + // Fill base + const int curEntryVal = entry[CCL]; + // Number of elements of length CCL + const int numCCL = (newCDPI - CDPI + 1); + + // Get first codeword + if (i == 0) { + if (CDPI == 0) { CW[newCDPI] = 0; } + else { + CW[newCDPI] = CW[CDPI]; // Pre-stored + } + } + current_grid.sync(); + + if (i < size) { + // Parallel canonical codeword generation + if (i >= CDPI && i < newCDPI) { CW[i] = CW[newCDPI] + (newCDPI - i); } + } + + // Update entry and first arrays in O(1) time + if (thread > CCL && thread < updateEnd) { entry[i] = curEntryVal + numCCL; } + // Add number of entries to next CCL + if (thread == 0) { + if (updateEnd < type_bw) { entry[updateEnd] = curEntryVal + numCCL; } + } + current_grid.sync(); + + // Update first array in O(1) time + if (thread == CCL) { + // Flip least significant CL[CDPI] bits + first[CCL] = CW[CDPI] ^ (((H)1 << (H)CL[CDPI]) - 1); + } + if (thread > CCL && thread < updateEnd) { first[i] = std::numeric_limits::max(); } + current_grid.sync(); + + if (thread == 0) { + if (newCDPI < size - 1) { + int CLDiff = CL[newCDPI + 1] - CL[newCDPI]; + // Add and shift -- Next canonical code + CW[newCDPI + 1] = ((CW[CDPI] + 1) << CLDiff); + CCL = CL[newCDPI + 1]; + + ++newCDPI; + } + + // Update CDPI to newCDPI after codeword length increase + CDPI = newCDPI; + newCDPI = size - 1; + } + current_grid.sync(); + } + + if (thread < size) { + /* Make encoded codeword compatible with CUSZ */ + CW[i] = (CW[i] | (((H)CL[i] & (H)0xffu) << ((sizeof(H) * 8) - 8))) ^ (((H)1 << (H)CL[i]) - 1); + } + current_grid.sync(); + + /* Reverse partial codebook */ + if (thread < size / 2) { + H temp = CW[i]; + CW[i] = CW[size - i - 1]; + CW[size - i - 1] = temp; + } +} + +// TODO forceinilne? +// Helper implementations +template +__global__ void par_huffman::detail::GPU_FillArraySequence(T* array, unsigned int size) +{ + unsigned int thread = (blockIdx.x * blockDim.x) + threadIdx.x; + if (thread < size) { array[thread] = thread; } +} + +// Precondition -- Result is preset to be equal to size +template +__global__ void par_huffman::detail::GPU_GetFirstNonzeroIndex(T* array, unsigned int size, unsigned int* result) +{ + unsigned int thread = (blockIdx.x * blockDim.x) + threadIdx.x; + if (array[thread] != 0) { atomicMin(result, thread); } +} + +namespace par_huffman { +namespace detail { +__global__ void GPU_GetMaxCWLength(unsigned int* CL, unsigned int size, unsigned int* result) +{ + (void)size; + unsigned int thread = (blockIdx.x * blockDim.x) + threadIdx.x; + if (thread == 0) { *result = CL[0]; } +} + +} // namespace detail +} // namespace par_huffman + +/** + * @brief Reorders given a set of indices. 
Programmer must ensure that all index[i] + * are unique or else race conditions may occur + * + * @tparam T + * @tparam Q + * @param array e.g., codebook + * @param index e.g., input data + * @param size + * @return __global__ + */ +template +__global__ void par_huffman::detail::GPU_ReorderByIndex(H* array, T* index, unsigned int size) +{ + unsigned int thread = (blockIdx.x * blockDim.x) + threadIdx.x; + H temp; + T newIndex; + if (thread < size) { + temp = array[thread]; + newIndex = index[thread]; + array[(int)newIndex] = temp; + } +} + +// Reverses a given array. +template +__global__ void par_huffman::detail::GPU_ReverseArray(T* array, unsigned int size) +{ + unsigned int thread = (blockIdx.x * blockDim.x) + threadIdx.x; + if (thread < size / 2) { + T temp = array[thread]; + array[thread] = array[size - thread - 1]; + array[size - thread - 1] = temp; + } +} + +// Parallel codebook generation wrapper +template +void asz::hf_buildbook_g( + uint32_t* freq, + int const dict_size, + H* codebook, + uint8_t* reverse_codebook, + int const revbook_nbyte, + float* time_book, + cudaStream_t stream) +{ + // Metadata + auto type_bw = sizeof(H) * 8; + auto _d_first = reinterpret_cast(reverse_codebook); + auto _d_entry = reinterpret_cast(reverse_codebook + (sizeof(H) * type_bw)); + auto _d_qcode = reinterpret_cast(reverse_codebook + (sizeof(H) * 2 * type_bw)); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + // Sort Qcodes by frequency + int nblocks = (dict_size / 1024) + 1; + par_huffman::detail::GPU_FillArraySequence<<>>(_d_qcode, (unsigned int)dict_size); + cudaStreamSynchronize(stream); + + /** + * Originally from par_huffman_sortbyfreq.cu by Cody Rivera (cjrivera1@crimson.ua.edu) + * Sorts quantization codes by frequency, using a key-value sort. This functionality is placed in a separate + * compilation unit as thrust calls fail in par_huffman.cu. 
+ * + * Resolved by + * 1) inlining function + * 2) using `thrust::device_pointer_cast(var)` instead of `thrust::device_pointer(var)` + */ + auto lambda_sort_by_freq = [] __host__(auto freq, auto len, auto qcode) { + thrust::sort_by_key( + thrust::device_pointer_cast(freq), thrust::device_pointer_cast(freq + len), + thrust::device_pointer_cast(qcode)); + }; + + lambda_sort_by_freq(freq, dict_size, _d_qcode); + cudaStreamSynchronize(stream); + + unsigned int* d_first_nonzero_index; + unsigned int first_nonzero_index = dict_size; + cudaMalloc(&d_first_nonzero_index, sizeof(unsigned int)); + cudaMemcpy(d_first_nonzero_index, &first_nonzero_index, sizeof(unsigned int), cudaMemcpyHostToDevice); + par_huffman::detail::GPU_GetFirstNonzeroIndex + <<>>(freq, dict_size, d_first_nonzero_index); + cudaStreamSynchronize(stream); + cudaMemcpy(&first_nonzero_index, d_first_nonzero_index, sizeof(unsigned int), cudaMemcpyDeviceToHost); + cudaFree(d_first_nonzero_index); + + int nz_dict_size = dict_size - first_nonzero_index; + unsigned int* _nz_d_freq = freq + first_nonzero_index; + H* _nz_d_codebook = codebook + first_nonzero_index; + int nz_nblocks = (nz_dict_size / 1024) + 1; + + // Memory Allocation -- Perhaps put in another wrapper + // clang-format off + unsigned int *CL = nullptr; + /*unsigned int* lNodesFreq*/ int *lNodesLeader = nullptr; + unsigned int *iNodesFreq = nullptr; int *iNodesLeader = nullptr; + unsigned int *tempFreq = nullptr; int *tempIsLeaf = nullptr; int *tempIndex = nullptr; + unsigned int *copyFreq = nullptr; int *copyIsLeaf = nullptr; int *copyIndex = nullptr; + cudaMalloc(&CL, nz_dict_size * sizeof(unsigned int) ); + cudaMalloc(&lNodesLeader, nz_dict_size * sizeof(int) ); + cudaMalloc(&iNodesFreq, nz_dict_size * sizeof(unsigned int) ); + cudaMalloc(&iNodesLeader, nz_dict_size * sizeof(int) ); + cudaMalloc(&tempFreq, nz_dict_size * sizeof(unsigned int) ); + cudaMalloc(&tempIsLeaf, nz_dict_size * sizeof(int) ); + cudaMalloc(&tempIndex, nz_dict_size * sizeof(int) ); + cudaMalloc(©Freq, nz_dict_size * sizeof(unsigned int) ); + cudaMalloc(©IsLeaf, nz_dict_size * sizeof(int) ); + cudaMalloc(©Index, nz_dict_size * sizeof(int) ); + cudaMemset(CL, 0, nz_dict_size * sizeof(int) ); + // clang-format on + + // Grid configuration for CL -- based on Cooperative Groups + int cg_mblocks; + int cg_blocks_sm; + int device_id; + int mthreads = 32; // 1 warp + cudaDeviceProp deviceProp; + cudaGetDevice(&device_id); + cudaGetDeviceProperties(&deviceProp, device_id); + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &cg_blocks_sm, par_huffman::GPU_GenerateCL, mthreads, 5 * sizeof(int32_t) + 32 * sizeof(int32_t)); + cg_mblocks = deviceProp.multiProcessorCount * cg_blocks_sm; + + int ELTS_PER_SEQ_MERGE = 16; + int mblocks = std::min(cg_mblocks, (nz_dict_size / ELTS_PER_SEQ_MERGE) + 1); + + // Exit if not enough exposed parallelism -- TODO modify kernels so this is unneeded + int tthreads = mthreads * mblocks; + if (tthreads < nz_dict_size) { + cout << LOG_ERR << "Insufficient on-device parallelism to construct a " << nz_dict_size + << " non-zero item codebook" << endl; + cout << LOG_ERR << "Provided parallelism: " << mblocks << " blocks, " << mthreads << " threads, " << tthreads + << " total" << endl + << endl; + // cout << LOG_ERR << "Exiting cuSZ ..." 
<< endl; + throw std::system_error(); + // exit(1); + } + + uint32_t* diagonal_path_intersections; + cudaMalloc(&diagonal_path_intersections, (2 * (mblocks + 1)) * sizeof(uint32_t)); + + // Codebook already init'ed + cudaStreamSynchronize(stream); + + // Call first kernel + // Collect arguments + void* CL_Args[] = {(void*)&_nz_d_freq, (void*)&CL, + (void*)&nz_dict_size, (void*)&_nz_d_freq, + (void*)&lNodesLeader, (void*)&iNodesFreq, + (void*)&iNodesLeader, (void*)&tempFreq, + (void*)&tempIsLeaf, (void*)&tempIndex, + (void*)©Freq, (void*)©IsLeaf, + (void*)©Index, (void*)&diagonal_path_intersections, + (void*)&mblocks, (void*)&mthreads}; + // Cooperative Launch + cudaLaunchCooperativeKernel( + (void*)par_huffman::GPU_GenerateCL, mblocks, mthreads, CL_Args, + 5 * sizeof(int32_t) + 32 * sizeof(int32_t)); + cudaStreamSynchronize(stream); + + // Exits if the highest codeword length is greater than what + // the adaptive representation can handle + // TODO do proper cleanup + + unsigned int* d_max_CL; + unsigned int max_CL; + cudaMalloc(&d_max_CL, sizeof(unsigned int)); + par_huffman::detail::GPU_GetMaxCWLength<<<1, 1>>>(CL, nz_dict_size, d_max_CL); + cudaStreamSynchronize(stream); + cudaMemcpy(&max_CL, d_max_CL, sizeof(unsigned int), cudaMemcpyDeviceToHost); + cudaFree(d_max_CL); + + int max_CW_bits = (sizeof(H) * 8) - 8; + if (max_CL > max_CW_bits) { + cout << LOG_ERR << "Cannot store all Huffman codewords in " << max_CW_bits + 8 << "-bit representation" << endl; + cout << LOG_ERR << "Huffman codeword representation requires at least " << max_CL + 8 + << " bits (longest codeword: " << max_CL << " bits)" << endl; + // cout << LOG_ERR << "(Consider running with -H 8 for 8-byte representation)" << endl << endl; + // cout << LOG_ERR << "Exiting cuSZ ..." << endl; + // exit(1); + throw std::runtime_error("Falling back to 8-byte Codec."); + } + + // Configure CW for 1024 threads/block + int cg_cw_mblocks = (cg_mblocks * mthreads) / 1024; + int cw_mblocks = std::min(cg_cw_mblocks, nz_nblocks); + + // Exit if not enough exposed parallelism -- TODO modify kernels so this is unneeded + int cw_tthreads = cw_mblocks * 1024; + if (cw_tthreads < nz_dict_size) { + cout << LOG_ERR << "Insufficient on-device parallelism to construct a " << nz_dict_size + << " non-zero item codebook" << endl; + cout << LOG_ERR << "Provided parallelism: " << cw_mblocks << " blocks, " << 1024 << " threads, " << cw_tthreads + << " total" << endl + << endl; + // cout << LOG_ERR << "Exiting cuSZ ..." 
<< endl; + // exit(1); + throw std::system_error(); + } + + void* CW_Args[] = { + (void*)&CL, // + (void*)&_nz_d_codebook, // + (void*)&_d_first, // + (void*)&_d_entry, // + (void*)&nz_dict_size}; + + // Call second kernel + cudaLaunchCooperativeKernel( + (void*)par_huffman::GPU_GenerateCW, // + cw_mblocks, // + 1024, // + CW_Args); + cudaStreamSynchronize(stream); + +#ifdef D_DEBUG_PRINT + print_codebook<<<1, 32>>>(codebook, dict_size); // PASS + cudaStreamSynchronize(stream); +#endif + + // Reverse _d_qcode and codebook + par_huffman::detail::GPU_ReverseArray<<>>(codebook, (unsigned int)dict_size); + par_huffman::detail::GPU_ReverseArray<<>>(_d_qcode, (unsigned int)dict_size); + cudaStreamSynchronize(stream); + + par_huffman::detail::GPU_ReorderByIndex<<>>(codebook, _d_qcode, (unsigned int)dict_size); + cudaStreamSynchronize(stream); + + STOP_CUDAEVENT_RECORDING(stream); + TIME_ELAPSED_CUDAEVENT(time_book); + DESTROY_CUDAEVENT_PAIR; + + // Cleanup + cudaFree(CL); + cudaFree(lNodesLeader); + cudaFree(iNodesFreq); + cudaFree(iNodesLeader); + cudaFree(tempFreq); + cudaFree(tempIsLeaf); + cudaFree(tempIndex); + cudaFree(copyFreq); + cudaFree(copyIsLeaf); + cudaFree(copyIndex); + cudaFree(diagonal_path_intersections); + cudaStreamSynchronize(stream); + +#ifdef D_DEBUG_PRINT + print_codebook<<<1, 32>>>(codebook, dict_size); // PASS + cudaStreamSynchronize(stream); +#endif +} + +#endif /* C883A574_4491_40E8_A083_1B6E8FB56670 */ diff --git a/qtensor/compression/cusz/src/hf/detail/hf_codecg.inl b/qtensor/compression/cusz/src/hf/detail/hf_codecg.inl index 04c8883b..2e8cf159 100644 --- a/qtensor/compression/cusz/src/hf/detail/hf_codecg.inl +++ b/qtensor/compression/cusz/src/hf/detail/hf_codecg.inl @@ -1,296 +1,296 @@ -/** - * @file codec_huffman.cuh - * @author Jiannan Tian - * @brief Huffman kernel definitions - * @version 0.2 - * @date 2020-02-13 - * (created) 2020-02-02, (rev1) 2021-02-13, (rev2) 2021-12-29 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#ifndef CUSZ_KERNEL_CODEC_HUFFMAN_CUH -#define CUSZ_KERNEL_CODEC_HUFFMAN_CUH - -#include -#include -#include -#include -#include -#include - -#include "common.hh" -#include "hf/hf_bookg.hh" -#include "hf/hf_codecg.hh" -#include "hf/hf_struct.h" -#include "utils/cuda_err.cuh" -#include "utils/timer.h" - -#define TIX threadIdx.x -#define BIX blockIdx.x -#define BDX blockDim.x - -#if __has_include() -// #pragma message __FILE__ ": (CUDA 11 onward), cub from system path" -#include -#else -// #pragma message __FILE__ ": (CUDA 10 or earlier), cub from git submodule" -#include "../../third_party/cub/cub/cub.cuh" -#endif - -using BYTE = uint8_t; - -extern __shared__ char __codec_huffman_uninitialized[]; - -struct __helper { - __device__ __forceinline__ static unsigned int local_tid_1() { return threadIdx.x; } - __device__ __forceinline__ static unsigned int global_tid_1() { return blockIdx.x * blockDim.x + threadIdx.x; } - __device__ __forceinline__ static unsigned int block_stride_1() { return blockDim.x; } - __device__ __forceinline__ static unsigned int grid_stride_1() { return blockDim.x * gridDim.x; } - template - __device__ __forceinline__ static unsigned int global_tid() - { - return blockIdx.x * blockDim.x * SEQ + threadIdx.x; - } - template - __device__ __forceinline__ static unsigned int grid_stride() - { - return blockDim.x * gridDim.x * SEQ; - } -}; - -template -__global__ void hf_decode_kernel( - COMPRESSED* compressed, - 
uint8_t* revbook, - MetadataT* par_nbit, - MetadataT* par_entry, - int const revbook_nbyte, - int const sublen, - int const pardeg, - UNCOMPRESSED* out_uncompressed); - -namespace asz { -namespace detail { - -template -__global__ void hf_encode_phase1_fill( - UNCOMPRESSED* in_uncompressed, - size_t const in_uncompressed_len, - ENCODED* in_book, - int const in_booklen, - ENCODED* out_encoded); - -template -__global__ void hf_encode_phase2_deflate( - COMPRESSED* inout_inplace, - size_t const len, - MetadataT* par_nbit, - MetadataT* par_ncell, - int const sublen, - int const pardeg); - -template -__global__ void -hf_encode_phase4_concatenate(Huff* gapped, Meta* par_entry, Meta* par_ncell, int const cfg_sublen, Huff* non_gapped); - -// TODO change size_t to unsigned int -template -__device__ void -hf_decode_single_thread_inflate(COMPRESSED* input, UNCOMPRESSED* out, int const total_bw, BYTE* revbook); - -} // namespace detail -} // namespace asz - -// TODO change size_t to unsigned int -template -__device__ void -asz::detail::hf_decode_single_thread_inflate(COMPRESSED* input, UNCOMPRESSED* out, int const total_bw, BYTE* revbook) -{ - static const auto DTYPE_WIDTH = sizeof(COMPRESSED) * 8; - - int next_bit; - auto idx_bit = 0; - auto idx_byte = 0; - auto idx_out = 0; - - COMPRESSED bufr = input[idx_byte]; - - auto first = reinterpret_cast(revbook); - auto entry = first + DTYPE_WIDTH; - auto keys = reinterpret_cast(revbook + sizeof(COMPRESSED) * (2 * DTYPE_WIDTH)); - COMPRESSED v = (bufr >> (DTYPE_WIDTH - 1)) & 0x1; // get the first bit - auto l = 1; - auto i = 0; - - while (i < total_bw) { - while (v < first[l]) { // append next i_cb bit - ++i; - idx_byte = i / DTYPE_WIDTH; // [1:exclusive] - idx_bit = i % DTYPE_WIDTH; - if (idx_bit == 0) { - // idx_byte += 1; // [1:exclusive] - bufr = input[idx_byte]; - } - - next_bit = ((bufr >> (DTYPE_WIDTH - 1 - idx_bit)) & 0x1); - v = (v << 1) | next_bit; - ++l; - } - out[idx_out++] = keys[entry[l] + v - first[l]]; - { - ++i; - idx_byte = i / DTYPE_WIDTH; // [2:exclusive] - idx_bit = i % DTYPE_WIDTH; - if (idx_bit == 0) { - // idx_byte += 1; // [2:exclusive] - bufr = input[idx_byte]; - } - - next_bit = ((bufr >> (DTYPE_WIDTH - 1 - idx_bit)) & 0x1); - v = 0x0 | next_bit; - } - l = 1; - } -} - -template -__global__ void asz::detail::hf_encode_phase1_fill( - UNCOMPRESSED* in_uncompressed, - size_t const in_uncompressed_len, - ENCODED* in_book, - int const in_booklen, - ENCODED* out_encoded) -{ - auto shmem_cb = reinterpret_cast(__codec_huffman_uninitialized); - - // load from global memory - for (auto idx = __helper::local_tid_1(); // - idx < in_booklen; // - idx += __helper::block_stride_1()) - shmem_cb[idx] = in_book[idx]; - - __syncthreads(); - - for (auto idx = __helper::global_tid_1(); // - idx < in_uncompressed_len; // - idx += __helper::grid_stride_1() // - ) - out_encoded[idx] = shmem_cb[(int)in_uncompressed[idx]]; -} - -template -__global__ void asz::detail::hf_encode_phase2_deflate( - COMPRESSED* inout_inplace, - size_t const len, - MetadataT* par_nbit, - MetadataT* par_ncell, - int const sublen, - int const pardeg) -{ - constexpr int CELL_BITWIDTH = sizeof(COMPRESSED) * 8; - - auto tid = BIX * BDX + TIX; - - if (tid * sublen < len) { - int residue_bits = CELL_BITWIDTH; - int total_bits = 0; - COMPRESSED* ptr = inout_inplace + tid * sublen; - COMPRESSED bufr; - uint8_t word_width; - - auto did = tid * sublen; - for (auto i = 0; i < sublen; i++, did++) { - if (did == len) break; - - COMPRESSED packed_word = inout_inplace[tid * sublen + i]; - auto 
word_ptr = reinterpret_cast*>(&packed_word); - word_width = word_ptr->bits; - word_ptr->bits = (uint8_t)0x0; - - if (residue_bits == CELL_BITWIDTH) { // a new unit of compact format - bufr = 0x0; - } - //////////////////////////////////////////////////////////////// - - if (word_width <= residue_bits) { - residue_bits -= word_width; - bufr |= packed_word << residue_bits; - - if (residue_bits == 0) { - residue_bits = CELL_BITWIDTH; - *(ptr++) = bufr; - } - } - else { - // example: we have 5-bit code 11111 but 3 bits available in (*ptr) - // 11111 for the residue 3 bits in (*ptr); 11111 for 2 bits of (*(++ptr)), starting with MSB - // ^^^ ^^ - auto l_bits = word_width - residue_bits; - auto r_bits = CELL_BITWIDTH - l_bits; - - bufr |= packed_word >> l_bits; - *(ptr++) = bufr; - bufr = packed_word << r_bits; - - residue_bits = r_bits; - } - total_bits += word_width; - } - *ptr = bufr; // manage the last unit - - par_nbit[tid] = total_bits; - par_ncell[tid] = (total_bits + CELL_BITWIDTH - 1) / CELL_BITWIDTH; - } -} - -template -__global__ void asz::detail::hf_encode_phase4_concatenate( - Huff* gapped, - Meta* par_entry, - Meta* par_ncell, - int const cfg_sublen, - Huff* non_gapped) -{ - auto n = par_ncell[blockIdx.x]; - auto src = gapped + cfg_sublen * blockIdx.x; - auto dst = non_gapped + par_entry[blockIdx.x]; - - for (auto i = threadIdx.x; i < n; i += blockDim.x) { // block-stride - dst[i] = src[i]; - } -} - -template -__global__ void hf_decode_kernel( - COMPRESSED* compressed, - uint8_t* revbook, - MetadataT* par_nbit, - MetadataT* par_entry, - int const revbook_nbyte, - int const sublen, - int const pardeg, - UNCOMPRESSED* out_uncompressed) -{ - extern __shared__ uint8_t shmem[]; - constexpr auto block_dim = HuffmanHelper::BLOCK_DIM_DEFLATE; - - auto R = (revbook_nbyte - 1 + block_dim) / block_dim; - - for (auto i = 0; i < R; i++) { - if (TIX + i * block_dim < revbook_nbyte) shmem[TIX + i * block_dim] = revbook[TIX + i * block_dim]; - } - __syncthreads(); - - auto gid = BIX * BDX + TIX; - - if (gid < pardeg) { - asz::detail::hf_decode_single_thread_inflate( - compressed + par_entry[gid], out_uncompressed + sublen * gid, par_nbit[gid], shmem); - __syncthreads(); - } -} - -#endif +/** + * @file codec_huffman.cuh + * @author Jiannan Tian + * @brief Huffman kernel definitions + * @version 0.2 + * @date 2020-02-13 + * (created) 2020-02-02, (rev1) 2021-02-13, (rev2) 2021-12-29 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef CUSZ_KERNEL_CODEC_HUFFMAN_CUH +#define CUSZ_KERNEL_CODEC_HUFFMAN_CUH + +#include +#include +#include +#include +#include +#include + +#include "common.hh" +#include "hf/hf_bookg.hh" +#include "hf/hf_codecg.hh" +#include "hf/hf_struct.h" +#include "utils/cuda_err.cuh" +#include "utils/timer.h" + +#define TIX threadIdx.x +#define BIX blockIdx.x +#define BDX blockDim.x + +#if __has_include() +// #pragma message __FILE__ ": (CUDA 11 onward), cub from system path" +#include +#else +// #pragma message __FILE__ ": (CUDA 10 or earlier), cub from git submodule" +#include "../../third_party/cub/cub/cub.cuh" +#endif + +using BYTE = uint8_t; + +extern __shared__ char __codec_huffman_uninitialized[]; + +struct __helper { + __device__ __forceinline__ static unsigned int local_tid_1() { return threadIdx.x; } + __device__ __forceinline__ static unsigned int global_tid_1() { return blockIdx.x * blockDim.x + threadIdx.x; } + __device__ __forceinline__ static unsigned 
int block_stride_1() { return blockDim.x; } + __device__ __forceinline__ static unsigned int grid_stride_1() { return blockDim.x * gridDim.x; } + template + __device__ __forceinline__ static unsigned int global_tid() + { + return blockIdx.x * blockDim.x * SEQ + threadIdx.x; + } + template + __device__ __forceinline__ static unsigned int grid_stride() + { + return blockDim.x * gridDim.x * SEQ; + } +}; + +template +__global__ void hf_decode_kernel( + COMPRESSED* compressed, + uint8_t* revbook, + MetadataT* par_nbit, + MetadataT* par_entry, + int const revbook_nbyte, + int const sublen, + int const pardeg, + UNCOMPRESSED* out_uncompressed); + +namespace asz { +namespace detail { + +template +__global__ void hf_encode_phase1_fill( + UNCOMPRESSED* in_uncompressed, + size_t const in_uncompressed_len, + ENCODED* in_book, + int const in_booklen, + ENCODED* out_encoded); + +template +__global__ void hf_encode_phase2_deflate( + COMPRESSED* inout_inplace, + size_t const len, + MetadataT* par_nbit, + MetadataT* par_ncell, + int const sublen, + int const pardeg); + +template +__global__ void +hf_encode_phase4_concatenate(Huff* gapped, Meta* par_entry, Meta* par_ncell, int const cfg_sublen, Huff* non_gapped); + +// TODO change size_t to unsigned int +template +__device__ void +hf_decode_single_thread_inflate(COMPRESSED* input, UNCOMPRESSED* out, int const total_bw, BYTE* revbook); + +} // namespace detail +} // namespace asz + +// TODO change size_t to unsigned int +template +__device__ void +asz::detail::hf_decode_single_thread_inflate(COMPRESSED* input, UNCOMPRESSED* out, int const total_bw, BYTE* revbook) +{ + static const auto DTYPE_WIDTH = sizeof(COMPRESSED) * 8; + + int next_bit; + auto idx_bit = 0; + auto idx_byte = 0; + auto idx_out = 0; + + COMPRESSED bufr = input[idx_byte]; + + auto first = reinterpret_cast(revbook); + auto entry = first + DTYPE_WIDTH; + auto keys = reinterpret_cast(revbook + sizeof(COMPRESSED) * (2 * DTYPE_WIDTH)); + COMPRESSED v = (bufr >> (DTYPE_WIDTH - 1)) & 0x1; // get the first bit + auto l = 1; + auto i = 0; + + while (i < total_bw) { + while (v < first[l]) { // append next i_cb bit + ++i; + idx_byte = i / DTYPE_WIDTH; // [1:exclusive] + idx_bit = i % DTYPE_WIDTH; + if (idx_bit == 0) { + // idx_byte += 1; // [1:exclusive] + bufr = input[idx_byte]; + } + + next_bit = ((bufr >> (DTYPE_WIDTH - 1 - idx_bit)) & 0x1); + v = (v << 1) | next_bit; + ++l; + } + out[idx_out++] = keys[entry[l] + v - first[l]]; + { + ++i; + idx_byte = i / DTYPE_WIDTH; // [2:exclusive] + idx_bit = i % DTYPE_WIDTH; + if (idx_bit == 0) { + // idx_byte += 1; // [2:exclusive] + bufr = input[idx_byte]; + } + + next_bit = ((bufr >> (DTYPE_WIDTH - 1 - idx_bit)) & 0x1); + v = 0x0 | next_bit; + } + l = 1; + } +} + +template +__global__ void asz::detail::hf_encode_phase1_fill( + UNCOMPRESSED* in_uncompressed, + size_t const in_uncompressed_len, + ENCODED* in_book, + int const in_booklen, + ENCODED* out_encoded) +{ + auto shmem_cb = reinterpret_cast(__codec_huffman_uninitialized); + + // load from global memory + for (auto idx = __helper::local_tid_1(); // + idx < in_booklen; // + idx += __helper::block_stride_1()) + shmem_cb[idx] = in_book[idx]; + + __syncthreads(); + + for (auto idx = __helper::global_tid_1(); // + idx < in_uncompressed_len; // + idx += __helper::grid_stride_1() // + ) + out_encoded[idx] = shmem_cb[(int)in_uncompressed[idx]]; +} + +template +__global__ void asz::detail::hf_encode_phase2_deflate( + COMPRESSED* inout_inplace, + size_t const len, + MetadataT* par_nbit, + MetadataT* par_ncell, + 
int const sublen, + int const pardeg) +{ + constexpr int CELL_BITWIDTH = sizeof(COMPRESSED) * 8; + + auto tid = BIX * BDX + TIX; + + if (tid * sublen < len) { + int residue_bits = CELL_BITWIDTH; + int total_bits = 0; + COMPRESSED* ptr = inout_inplace + tid * sublen; + COMPRESSED bufr; + uint8_t word_width; + + auto did = tid * sublen; + for (auto i = 0; i < sublen; i++, did++) { + if (did == len) break; + + COMPRESSED packed_word = inout_inplace[tid * sublen + i]; + auto word_ptr = reinterpret_cast*>(&packed_word); + word_width = word_ptr->bits; + word_ptr->bits = (uint8_t)0x0; + + if (residue_bits == CELL_BITWIDTH) { // a new unit of compact format + bufr = 0x0; + } + //////////////////////////////////////////////////////////////// + + if (word_width <= residue_bits) { + residue_bits -= word_width; + bufr |= packed_word << residue_bits; + + if (residue_bits == 0) { + residue_bits = CELL_BITWIDTH; + *(ptr++) = bufr; + } + } + else { + // example: we have 5-bit code 11111 but 3 bits available in (*ptr) + // 11111 for the residue 3 bits in (*ptr); 11111 for 2 bits of (*(++ptr)), starting with MSB + // ^^^ ^^ + auto l_bits = word_width - residue_bits; + auto r_bits = CELL_BITWIDTH - l_bits; + + bufr |= packed_word >> l_bits; + *(ptr++) = bufr; + bufr = packed_word << r_bits; + + residue_bits = r_bits; + } + total_bits += word_width; + } + *ptr = bufr; // manage the last unit + + par_nbit[tid] = total_bits; + par_ncell[tid] = (total_bits + CELL_BITWIDTH - 1) / CELL_BITWIDTH; + } +} + +template +__global__ void asz::detail::hf_encode_phase4_concatenate( + Huff* gapped, + Meta* par_entry, + Meta* par_ncell, + int const cfg_sublen, + Huff* non_gapped) +{ + auto n = par_ncell[blockIdx.x]; + auto src = gapped + cfg_sublen * blockIdx.x; + auto dst = non_gapped + par_entry[blockIdx.x]; + + for (auto i = threadIdx.x; i < n; i += blockDim.x) { // block-stride + dst[i] = src[i]; + } +} + +template +__global__ void hf_decode_kernel( + COMPRESSED* compressed, + uint8_t* revbook, + MetadataT* par_nbit, + MetadataT* par_entry, + int const revbook_nbyte, + int const sublen, + int const pardeg, + UNCOMPRESSED* out_uncompressed) +{ + extern __shared__ uint8_t shmem[]; + constexpr auto block_dim = HuffmanHelper::BLOCK_DIM_DEFLATE; + + auto R = (revbook_nbyte - 1 + block_dim) / block_dim; + + for (auto i = 0; i < R; i++) { + if (TIX + i * block_dim < revbook_nbyte) shmem[TIX + i * block_dim] = revbook[TIX + i * block_dim]; + } + __syncthreads(); + + auto gid = BIX * BDX + TIX; + + if (gid < pardeg) { + asz::detail::hf_decode_single_thread_inflate( + compressed + par_entry[gid], out_uncompressed + sublen * gid, par_nbit[gid], shmem); + __syncthreads(); + } +} + +#endif diff --git a/qtensor/compression/cusz/src/hf/detail/hf_pimpl.inl b/qtensor/compression/cusz/src/hf/detail/hf_pimpl.inl index 7a330ba6..4ed9b580 100644 --- a/qtensor/compression/cusz/src/hf/detail/hf_pimpl.inl +++ b/qtensor/compression/cusz/src/hf/detail/hf_pimpl.inl @@ -1,364 +1,364 @@ -/** - * @file huffman_coarse.cuh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2021-12-17 - * (created) 2020-04-24 (rev1) 2021-09-05 (rev2) 2021-12-29 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * @copyright (C) 2021 by Washington State University, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#ifndef CUSZ_COMPONENT_HUFFMAN_COARSE_CUH -#define CUSZ_COMPONENT_HUFFMAN_COARSE_CUH - -#include -// #include -// #include -// #include -// #include 
-#include -#include -// #include - -using std::cout; - -#include "common/definition.hh" -#include "common/type_traits.hh" -#include "utils.hh" - -#include "hf/hf.hh" -#include "hf/hf_bookg.hh" -#include "hf/hf_codecg.hh" - -/****************************************************************************** - macros for shorthand writing - ******************************************************************************/ - -#define EXPORT_NBYTE(FIELD) nbyte[Header::FIELD] = rte.nbyte[RTE::FIELD]; - -#define DEVICE2DEVICE_COPY(VAR, FIELD) \ - { \ - constexpr auto D2D = cudaMemcpyDeviceToDevice; \ - auto dst = d_compressed + header.entry[Header::FIELD]; \ - auto src = reinterpret_cast(d_##VAR); \ - CHECK_CUDA(cudaMemcpyAsync(dst, src, nbyte[Header::FIELD], D2D, stream)); \ - } - -#define ACCESSOR(SYM, TYPE) reinterpret_cast(in_compressed + header.entry[Header::SYM]) - -#define HC_ALLOCHOST(VAR, SYM) \ - cudaMallocHost(&h_##VAR, rte.nbyte[RTE::SYM]); \ - memset(h_##VAR, 0x0, rte.nbyte[RTE::SYM]); - -#define HC_ALLOCDEV(VAR, SYM) \ - cudaMalloc(&d_##VAR, rte.nbyte[RTE::SYM]); \ - cudaMemset(d_##VAR, 0x0, rte.nbyte[RTE::SYM]); - -#define HC_FREEHOST(VAR) \ - if (h_##VAR) { \ - cudaFreeHost(h_##VAR); \ - h_##VAR = nullptr; \ - } - -#define HC_FREEDEV(VAR) \ - if (d_##VAR) { \ - cudaFree(d_##VAR); \ - d_##VAR = nullptr; \ - } - -/****************************************************************************** - class definition - ******************************************************************************/ - -#define TEMPLATE_TYPE template -#define IMPL LosslessCodec::impl - -namespace cusz { - -TEMPLATE_TYPE -IMPL::~impl() -{ - HC_FREEDEV(tmp); - HC_FREEDEV(book); - HC_FREEDEV(revbook); - HC_FREEDEV(par_nbit); - HC_FREEDEV(par_ncell); - HC_FREEDEV(par_entry); - HC_FREEDEV(bitstream); - - HC_FREEHOST(book); - HC_FREEHOST(revbook); - HC_FREEHOST(par_nbit); - HC_FREEHOST(par_ncell); - HC_FREEHOST(par_entry); -} - -TEMPLATE_TYPE -IMPL::impl() = default; - -//------------------------------------------------------------------------------ - -TEMPLATE_TYPE -void IMPL::init(size_t const in_uncompressed_len, int const booklen, int const pardeg, bool dbg_print) -{ - auto max_compressed_bytes = [&]() { return in_uncompressed_len / 2 * sizeof(H); }; - - auto debug = [&]() { - setlocale(LC_NUMERIC, ""); - printf("\nHuffmanCoarse::init() debugging:\n"); - printf("CUdeviceptr nbyte: %d\n", (int)sizeof(CUdeviceptr)); - dbg_println("TMP", d_tmp, RTE::TMP); - dbg_println("BOOK", d_book, RTE::BOOK); - dbg_println("REVBOOK", d_revbook, RTE::REVBOOK); - dbg_println("PAR_NBIT", d_par_nbit, RTE::PAR_NBIT); - dbg_println("PAR_NCELL", d_par_ncell, RTE::PAR_NCELL); - dbg_println("BITSTREAM", d_bitstream, RTE::BITSTREAM); - printf("\n"); - }; - - memset(rte.nbyte, 0, sizeof(uint32_t) * RTE::END); - // memset(rte.entry, 0, sizeof(uint32_t) * (RTE::END + 1)); - - rte.nbyte[RTE::TMP] = sizeof(H) * in_uncompressed_len; - rte.nbyte[RTE::BOOK] = sizeof(H) * booklen; - rte.nbyte[RTE::REVBOOK] = get_revbook_nbyte(booklen); - rte.nbyte[RTE::PAR_NBIT] = sizeof(M) * pardeg; - rte.nbyte[RTE::PAR_NCELL] = sizeof(M) * pardeg; - rte.nbyte[RTE::PAR_ENTRY] = sizeof(M) * pardeg; - rte.nbyte[RTE::BITSTREAM] = max_compressed_bytes(); - - HC_ALLOCDEV(tmp, TMP); - - { - auto total_bytes = rte.nbyte[RTE::BOOK] + rte.nbyte[RTE::REVBOOK]; - cudaMalloc(&d_book, total_bytes); - cudaMemset(d_book, 0x0, total_bytes); - - d_revbook = reinterpret_cast(d_book + booklen); - } - - { - cudaMalloc(&d_par_metadata, rte.nbyte[RTE::PAR_NBIT] * 3); - 
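// Descriptive note: d_par_metadata is a single allocation packing three pardeg-length arrays
// back to back -- par_nbit, par_ncell, par_entry -- as the assignments below show.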
cudaMemset(d_par_metadata, 0x0, rte.nbyte[RTE::PAR_NBIT] * 3); - - d_par_nbit = d_par_metadata; - d_par_ncell = d_par_metadata + pardeg; - d_par_entry = d_par_metadata + pardeg * 2; - } - - HC_ALLOCDEV(bitstream, BITSTREAM); - - // standalone definition for output - d_compressed = reinterpret_cast(d_tmp); - - HC_ALLOCHOST(book, BOOK); - HC_ALLOCHOST(revbook, REVBOOK); - - { - cudaMallocHost(&h_par_metadata, rte.nbyte[RTE::PAR_NBIT] * 3); - // cudaMemset(h_par_nbit, 0x0, rte.nbyte[RTE::PAR_NBIT] * 3); - - h_par_nbit = h_par_metadata; - h_par_ncell = h_par_metadata + pardeg; - h_par_entry = h_par_metadata + pardeg * 2; - } - - int numSMs; - cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, 0); - - int sublen = (in_uncompressed_len - 1) / pardeg + 1; - - book_desc = new hf_book{nullptr, d_book, booklen}; - chunk_desc_d = new hf_chunk{d_par_nbit, d_par_ncell, d_par_entry}; - chunk_desc_h = new hf_chunk{h_par_nbit, h_par_ncell, h_par_entry}; - bitstream_desc = new hf_bitstream{d_tmp, d_bitstream, chunk_desc_d, chunk_desc_h, sublen, pardeg, numSMs}; - - if (dbg_print) debug(); -} - -TEMPLATE_TYPE -void IMPL::build_codebook(cusz::FREQ* freq, int const booklen, cudaStream_t stream) -{ - book_desc->freq = freq; - asz::hf_buildbook_g(freq, booklen, d_book, d_revbook, get_revbook_nbyte(booklen), &time_book, stream); -} - -TEMPLATE_TYPE -void IMPL::encode( - T* in_uncompressed, - size_t const in_uncompressed_len, - BYTE*& out_compressed, - size_t& out_compressed_len, - cudaStream_t stream) -{ - time_lossless = 0; - - struct Header header; - - asz::hf_encode_coarse_rev1( - in_uncompressed, in_uncompressed_len, // - book_desc, bitstream_desc, // - out_compressed, out_compressed_len, time_lossless, stream); - - header.total_nbit = - std::accumulate((M*)chunk_desc_h->bits, (M*)chunk_desc_h->bits + bitstream_desc->pardeg, (size_t)0); - header.total_ncell = - std::accumulate((M*)chunk_desc_h->cells, (M*)chunk_desc_h->cells + bitstream_desc->pardeg, (size_t)0); - // update with the precise BITSTREAM nbyte - rte.nbyte[RTE::BITSTREAM] = sizeof(H) * header.total_ncell; - - // d_revbook and revbook_nbyte is hidden; need to improve here - subfile_collect( - header, in_uncompressed_len, book_desc->booklen, bitstream_desc->sublen, bitstream_desc->pardeg, stream); - - out_compressed = d_compressed; - out_compressed_len = header.subfile_size(); -} - -TEMPLATE_TYPE -void IMPL::decode(BYTE* in_compressed, T* out_decompressed, cudaStream_t stream, bool header_on_device) -{ - Header header; - if (header_on_device) - CHECK_CUDA(cudaMemcpyAsync(&header, in_compressed, sizeof(header), cudaMemcpyDeviceToHost, stream)); - - auto d_revbook = ACCESSOR(REVBOOK, BYTE); - auto d_par_nbit = ACCESSOR(PAR_NBIT, M); - auto d_par_entry = ACCESSOR(PAR_ENTRY, M); - auto d_bitstream = ACCESSOR(BITSTREAM, H); - - auto const revbook_nbyte = get_revbook_nbyte(header.booklen); - - // launch_coarse_grained_Huffman_decoding( - asz::hf_decode_coarse( - d_bitstream, d_revbook, revbook_nbyte, d_par_nbit, d_par_entry, header.sublen, header.pardeg, out_decompressed, - time_lossless, stream); -} - -TEMPLATE_TYPE -void IMPL::clear_buffer() -{ - cudaMemset(d_tmp, 0x0, rte.nbyte[RTE::TMP]); - cudaMemset(d_book, 0x0, rte.nbyte[RTE::BOOK]); - cudaMemset(d_revbook, 0x0, rte.nbyte[RTE::REVBOOK]); - cudaMemset(d_par_nbit, 0x0, rte.nbyte[RTE::PAR_NBIT]); - cudaMemset(d_par_ncell, 0x0, rte.nbyte[RTE::PAR_NCELL]); - cudaMemset(d_par_entry, 0x0, rte.nbyte[RTE::PAR_ENTRY]); - cudaMemset(d_bitstream, 0x0, rte.nbyte[RTE::BITSTREAM]); -} - -// private 
helper -TEMPLATE_TYPE -void IMPL::subfile_collect( - Header& header, - size_t const in_uncompressed_len, - int const booklen, - int const sublen, - int const pardeg, - cudaStream_t stream) -{ - auto BARRIER = [&]() { - if (stream) - CHECK_CUDA(cudaStreamSynchronize(stream)); - else - CHECK_CUDA(cudaDeviceSynchronize()); - }; - - header.self_bytes = sizeof(Header); - header.booklen = booklen; - header.sublen = sublen; - header.pardeg = pardeg; - header.uncompressed_len = in_uncompressed_len; - - MetadataT nbyte[Header::END]; - nbyte[Header::HEADER] = sizeof(Header); - - EXPORT_NBYTE(REVBOOK) - EXPORT_NBYTE(PAR_NBIT) - EXPORT_NBYTE(PAR_ENTRY) - EXPORT_NBYTE(BITSTREAM) - - header.entry[0] = 0; - // *.END + 1: need to know the ending position - for (auto i = 1; i < Header::END + 1; i++) { header.entry[i] = nbyte[i - 1]; } - for (auto i = 1; i < Header::END + 1; i++) { header.entry[i] += header.entry[i - 1]; } - - // auto debug_header_entry = [&]() { - // for (auto i = 0; i < Header::END + 1; i++) printf("%d, header entry: %d\n", i, header.entry[i]); - // }; - // debug_header_entry(); - - CHECK_CUDA(cudaMemcpyAsync(d_compressed, &header, sizeof(header), cudaMemcpyHostToDevice, stream)); - - /* debug */ BARRIER(); - - DEVICE2DEVICE_COPY(revbook, REVBOOK) - DEVICE2DEVICE_COPY(par_nbit, PAR_NBIT) - DEVICE2DEVICE_COPY(par_entry, PAR_ENTRY) - DEVICE2DEVICE_COPY(bitstream, BITSTREAM) -} - -// getter -TEMPLATE_TYPE -float IMPL::get_time_elapsed() const { return milliseconds; } - -TEMPLATE_TYPE -float IMPL::get_time_book() const { return time_book; } -TEMPLATE_TYPE -float IMPL::get_time_lossless() const { return time_lossless; } - -TEMPLATE_TYPE -H* IMPL::expose_book() const { return d_book; } - -TEMPLATE_TYPE -BYTE* IMPL::expose_revbook() const { return d_revbook; } - -// TODO this kind of space will be overlapping with quant-codes -TEMPLATE_TYPE -size_t IMPL::get_workspace_nbyte(size_t len) const { return sizeof(H) * len; } - -TEMPLATE_TYPE -size_t IMPL::get_max_output_nbyte(size_t len) const { return sizeof(H) * len / 2; } - -TEMPLATE_TYPE -size_t IMPL::get_revbook_nbyte(int dict_size) { return sizeof(BOOK) * (2 * CELL_BITWIDTH) + sizeof(SYM) * dict_size; } - -TEMPLATE_TYPE -constexpr bool IMPL::can_overlap_input_and_firstphase_encode() { return sizeof(T) == sizeof(H); } - -// auxiliary -TEMPLATE_TYPE -void IMPL::dbg_println(const std::string SYM_name, void* VAR, int SYM) -{ - CUdeviceptr pbase0{0}; - size_t psize0{0}; - - cuMemGetAddressRange(&pbase0, &psize0, (CUdeviceptr)VAR); - printf( - "%s:\n" - "\t(supposed) pointer : %p\n" - "\t(supposed) bytes : %'9lu\n" - "\t(queried) pbase0 : %p\n" - "\t(queried) psize0 : %'9lu\n", - SYM_name.c_str(), (void*)VAR, (size_t)rte.nbyte[SYM], (void*)&pbase0, psize0); - pbase0 = 0, psize0 = 0; -} - -} // namespace cusz - -#undef HC_ALLOCDEV -#undef HC_ALLOCHOST -#undef HC_FREEDEV -#undef HC_FREEHOST -#undef EXPORT_NBYTE -#undef ACCESSOR -#undef DEVICE2DEVICE_COPY - -#undef TEMPLATE_TYPE -#undef IMPL - -#endif +/** + * @file huffman_coarse.cuh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2021-12-17 + * (created) 2020-04-24 (rev1) 2021-09-05 (rev2) 2021-12-29 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * @copyright (C) 2021 by Washington State University, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef CUSZ_COMPONENT_HUFFMAN_COARSE_CUH +#define CUSZ_COMPONENT_HUFFMAN_COARSE_CUH + +#include +// #include +// #include +// #include +// 
#include +#include +#include +// #include + +using std::cout; + +#include "common/definition.hh" +#include "common/type_traits.hh" +#include "utils.hh" + +#include "hf/hf.hh" +#include "hf/hf_bookg.hh" +#include "hf/hf_codecg.hh" + +/****************************************************************************** + macros for shorthand writing + ******************************************************************************/ + +#define EXPORT_NBYTE(FIELD) nbyte[Header::FIELD] = rte.nbyte[RTE::FIELD]; + +#define DEVICE2DEVICE_COPY(VAR, FIELD) \ + { \ + constexpr auto D2D = cudaMemcpyDeviceToDevice; \ + auto dst = d_compressed + header.entry[Header::FIELD]; \ + auto src = reinterpret_cast(d_##VAR); \ + CHECK_CUDA(cudaMemcpyAsync(dst, src, nbyte[Header::FIELD], D2D, stream)); \ + } + +#define ACCESSOR(SYM, TYPE) reinterpret_cast(in_compressed + header.entry[Header::SYM]) + +#define HC_ALLOCHOST(VAR, SYM) \ + cudaMallocHost(&h_##VAR, rte.nbyte[RTE::SYM]); \ + memset(h_##VAR, 0x0, rte.nbyte[RTE::SYM]); + +#define HC_ALLOCDEV(VAR, SYM) \ + cudaMalloc(&d_##VAR, rte.nbyte[RTE::SYM]); \ + cudaMemset(d_##VAR, 0x0, rte.nbyte[RTE::SYM]); + +#define HC_FREEHOST(VAR) \ + if (h_##VAR) { \ + cudaFreeHost(h_##VAR); \ + h_##VAR = nullptr; \ + } + +#define HC_FREEDEV(VAR) \ + if (d_##VAR) { \ + cudaFree(d_##VAR); \ + d_##VAR = nullptr; \ + } + +/****************************************************************************** + class definition + ******************************************************************************/ + +#define TEMPLATE_TYPE template +#define IMPL LosslessCodec::impl + +namespace cusz { + +TEMPLATE_TYPE +IMPL::~impl() +{ + HC_FREEDEV(tmp); + HC_FREEDEV(book); + HC_FREEDEV(revbook); + HC_FREEDEV(par_nbit); + HC_FREEDEV(par_ncell); + HC_FREEDEV(par_entry); + HC_FREEDEV(bitstream); + + HC_FREEHOST(book); + HC_FREEHOST(revbook); + HC_FREEHOST(par_nbit); + HC_FREEHOST(par_ncell); + HC_FREEHOST(par_entry); +} + +TEMPLATE_TYPE +IMPL::impl() = default; + +//------------------------------------------------------------------------------ + +TEMPLATE_TYPE +void IMPL::init(size_t const in_uncompressed_len, int const booklen, int const pardeg, bool dbg_print) +{ + auto max_compressed_bytes = [&]() { return in_uncompressed_len / 2 * sizeof(H); }; + + auto debug = [&]() { + setlocale(LC_NUMERIC, ""); + printf("\nHuffmanCoarse::init() debugging:\n"); + printf("CUdeviceptr nbyte: %d\n", (int)sizeof(CUdeviceptr)); + dbg_println("TMP", d_tmp, RTE::TMP); + dbg_println("BOOK", d_book, RTE::BOOK); + dbg_println("REVBOOK", d_revbook, RTE::REVBOOK); + dbg_println("PAR_NBIT", d_par_nbit, RTE::PAR_NBIT); + dbg_println("PAR_NCELL", d_par_ncell, RTE::PAR_NCELL); + dbg_println("BITSTREAM", d_bitstream, RTE::BITSTREAM); + printf("\n"); + }; + + memset(rte.nbyte, 0, sizeof(uint32_t) * RTE::END); + // memset(rte.entry, 0, sizeof(uint32_t) * (RTE::END + 1)); + + rte.nbyte[RTE::TMP] = sizeof(H) * in_uncompressed_len; + rte.nbyte[RTE::BOOK] = sizeof(H) * booklen; + rte.nbyte[RTE::REVBOOK] = get_revbook_nbyte(booklen); + rte.nbyte[RTE::PAR_NBIT] = sizeof(M) * pardeg; + rte.nbyte[RTE::PAR_NCELL] = sizeof(M) * pardeg; + rte.nbyte[RTE::PAR_ENTRY] = sizeof(M) * pardeg; + rte.nbyte[RTE::BITSTREAM] = max_compressed_bytes(); + + HC_ALLOCDEV(tmp, TMP); + + { + auto total_bytes = rte.nbyte[RTE::BOOK] + rte.nbyte[RTE::REVBOOK]; + cudaMalloc(&d_book, total_bytes); + cudaMemset(d_book, 0x0, total_bytes); + + d_revbook = reinterpret_cast(d_book + booklen); + } + + { + cudaMalloc(&d_par_metadata, rte.nbyte[RTE::PAR_NBIT] * 3); + 
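+        // The three per-chunk metadata arrays (nbit, ncell, entry) share one pooled
+        // device allocation of 3 * pardeg entries of type M; the memset below zeroes
+        // the whole pool, and d_par_nbit / d_par_ncell / d_par_entry are then carved
+        // out of it at offsets 0, pardeg, and 2 * pardeg.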
cudaMemset(d_par_metadata, 0x0, rte.nbyte[RTE::PAR_NBIT] * 3); + + d_par_nbit = d_par_metadata; + d_par_ncell = d_par_metadata + pardeg; + d_par_entry = d_par_metadata + pardeg * 2; + } + + HC_ALLOCDEV(bitstream, BITSTREAM); + + // standalone definition for output + d_compressed = reinterpret_cast(d_tmp); + + HC_ALLOCHOST(book, BOOK); + HC_ALLOCHOST(revbook, REVBOOK); + + { + cudaMallocHost(&h_par_metadata, rte.nbyte[RTE::PAR_NBIT] * 3); + // cudaMemset(h_par_nbit, 0x0, rte.nbyte[RTE::PAR_NBIT] * 3); + + h_par_nbit = h_par_metadata; + h_par_ncell = h_par_metadata + pardeg; + h_par_entry = h_par_metadata + pardeg * 2; + } + + int numSMs; + cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, 0); + + int sublen = (in_uncompressed_len - 1) / pardeg + 1; + + book_desc = new hf_book{nullptr, d_book, booklen}; + chunk_desc_d = new hf_chunk{d_par_nbit, d_par_ncell, d_par_entry}; + chunk_desc_h = new hf_chunk{h_par_nbit, h_par_ncell, h_par_entry}; + bitstream_desc = new hf_bitstream{d_tmp, d_bitstream, chunk_desc_d, chunk_desc_h, sublen, pardeg, numSMs}; + + if (dbg_print) debug(); +} + +TEMPLATE_TYPE +void IMPL::build_codebook(cusz::FREQ* freq, int const booklen, cudaStream_t stream) +{ + book_desc->freq = freq; + asz::hf_buildbook_g(freq, booklen, d_book, d_revbook, get_revbook_nbyte(booklen), &time_book, stream); +} + +TEMPLATE_TYPE +void IMPL::encode( + T* in_uncompressed, + size_t const in_uncompressed_len, + BYTE*& out_compressed, + size_t& out_compressed_len, + cudaStream_t stream) +{ + time_lossless = 0; + + struct Header header; + + asz::hf_encode_coarse_rev1( + in_uncompressed, in_uncompressed_len, // + book_desc, bitstream_desc, // + out_compressed, out_compressed_len, time_lossless, stream); + + header.total_nbit = + std::accumulate((M*)chunk_desc_h->bits, (M*)chunk_desc_h->bits + bitstream_desc->pardeg, (size_t)0); + header.total_ncell = + std::accumulate((M*)chunk_desc_h->cells, (M*)chunk_desc_h->cells + bitstream_desc->pardeg, (size_t)0); + // update with the precise BITSTREAM nbyte + rte.nbyte[RTE::BITSTREAM] = sizeof(H) * header.total_ncell; + + // d_revbook and revbook_nbyte is hidden; need to improve here + subfile_collect( + header, in_uncompressed_len, book_desc->booklen, bitstream_desc->sublen, bitstream_desc->pardeg, stream); + + out_compressed = d_compressed; + out_compressed_len = header.subfile_size(); +} + +TEMPLATE_TYPE +void IMPL::decode(BYTE* in_compressed, T* out_decompressed, cudaStream_t stream, bool header_on_device) +{ + Header header; + if (header_on_device) + CHECK_CUDA(cudaMemcpyAsync(&header, in_compressed, sizeof(header), cudaMemcpyDeviceToHost, stream)); + + auto d_revbook = ACCESSOR(REVBOOK, BYTE); + auto d_par_nbit = ACCESSOR(PAR_NBIT, M); + auto d_par_entry = ACCESSOR(PAR_ENTRY, M); + auto d_bitstream = ACCESSOR(BITSTREAM, H); + + auto const revbook_nbyte = get_revbook_nbyte(header.booklen); + + // launch_coarse_grained_Huffman_decoding( + asz::hf_decode_coarse( + d_bitstream, d_revbook, revbook_nbyte, d_par_nbit, d_par_entry, header.sublen, header.pardeg, out_decompressed, + time_lossless, stream); +} + +TEMPLATE_TYPE +void IMPL::clear_buffer() +{ + cudaMemset(d_tmp, 0x0, rte.nbyte[RTE::TMP]); + cudaMemset(d_book, 0x0, rte.nbyte[RTE::BOOK]); + cudaMemset(d_revbook, 0x0, rte.nbyte[RTE::REVBOOK]); + cudaMemset(d_par_nbit, 0x0, rte.nbyte[RTE::PAR_NBIT]); + cudaMemset(d_par_ncell, 0x0, rte.nbyte[RTE::PAR_NCELL]); + cudaMemset(d_par_entry, 0x0, rte.nbyte[RTE::PAR_ENTRY]); + cudaMemset(d_bitstream, 0x0, rte.nbyte[RTE::BITSTREAM]); +} + +// private 
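+// Layout produced by subfile_collect(): the compressed subfile in d_compressed is
+// [HEADER | REVBOOK | PAR_NBIT | PAR_ENTRY | BITSTREAM]; header.entry[] stores the
+// byte offset of each segment (an exclusive prefix sum over the nbyte[] table), and
+// decode() recovers the per-segment pointers from those offsets via the ACCESSOR macro.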
helper +TEMPLATE_TYPE +void IMPL::subfile_collect( + Header& header, + size_t const in_uncompressed_len, + int const booklen, + int const sublen, + int const pardeg, + cudaStream_t stream) +{ + auto BARRIER = [&]() { + if (stream) + CHECK_CUDA(cudaStreamSynchronize(stream)); + else + CHECK_CUDA(cudaDeviceSynchronize()); + }; + + header.self_bytes = sizeof(Header); + header.booklen = booklen; + header.sublen = sublen; + header.pardeg = pardeg; + header.uncompressed_len = in_uncompressed_len; + + MetadataT nbyte[Header::END]; + nbyte[Header::HEADER] = sizeof(Header); + + EXPORT_NBYTE(REVBOOK) + EXPORT_NBYTE(PAR_NBIT) + EXPORT_NBYTE(PAR_ENTRY) + EXPORT_NBYTE(BITSTREAM) + + header.entry[0] = 0; + // *.END + 1: need to know the ending position + for (auto i = 1; i < Header::END + 1; i++) { header.entry[i] = nbyte[i - 1]; } + for (auto i = 1; i < Header::END + 1; i++) { header.entry[i] += header.entry[i - 1]; } + + // auto debug_header_entry = [&]() { + // for (auto i = 0; i < Header::END + 1; i++) printf("%d, header entry: %d\n", i, header.entry[i]); + // }; + // debug_header_entry(); + + CHECK_CUDA(cudaMemcpyAsync(d_compressed, &header, sizeof(header), cudaMemcpyHostToDevice, stream)); + + /* debug */ BARRIER(); + + DEVICE2DEVICE_COPY(revbook, REVBOOK) + DEVICE2DEVICE_COPY(par_nbit, PAR_NBIT) + DEVICE2DEVICE_COPY(par_entry, PAR_ENTRY) + DEVICE2DEVICE_COPY(bitstream, BITSTREAM) +} + +// getter +TEMPLATE_TYPE +float IMPL::get_time_elapsed() const { return milliseconds; } + +TEMPLATE_TYPE +float IMPL::get_time_book() const { return time_book; } +TEMPLATE_TYPE +float IMPL::get_time_lossless() const { return time_lossless; } + +TEMPLATE_TYPE +H* IMPL::expose_book() const { return d_book; } + +TEMPLATE_TYPE +BYTE* IMPL::expose_revbook() const { return d_revbook; } + +// TODO this kind of space will be overlapping with quant-codes +TEMPLATE_TYPE +size_t IMPL::get_workspace_nbyte(size_t len) const { return sizeof(H) * len; } + +TEMPLATE_TYPE +size_t IMPL::get_max_output_nbyte(size_t len) const { return sizeof(H) * len / 2; } + +TEMPLATE_TYPE +size_t IMPL::get_revbook_nbyte(int dict_size) { return sizeof(BOOK) * (2 * CELL_BITWIDTH) + sizeof(SYM) * dict_size; } + +TEMPLATE_TYPE +constexpr bool IMPL::can_overlap_input_and_firstphase_encode() { return sizeof(T) == sizeof(H); } + +// auxiliary +TEMPLATE_TYPE +void IMPL::dbg_println(const std::string SYM_name, void* VAR, int SYM) +{ + CUdeviceptr pbase0{0}; + size_t psize0{0}; + + cuMemGetAddressRange(&pbase0, &psize0, (CUdeviceptr)VAR); + printf( + "%s:\n" + "\t(supposed) pointer : %p\n" + "\t(supposed) bytes : %'9lu\n" + "\t(queried) pbase0 : %p\n" + "\t(queried) psize0 : %'9lu\n", + SYM_name.c_str(), (void*)VAR, (size_t)rte.nbyte[SYM], (void*)&pbase0, psize0); + pbase0 = 0, psize0 = 0; +} + +} // namespace cusz + +#undef HC_ALLOCDEV +#undef HC_ALLOCHOST +#undef HC_FREEDEV +#undef HC_FREEHOST +#undef EXPORT_NBYTE +#undef ACCESSOR +#undef DEVICE2DEVICE_COPY + +#undef TEMPLATE_TYPE +#undef IMPL + +#endif diff --git a/qtensor/compression/cusz/src/hf/detail/par_merge.inl b/qtensor/compression/cusz/src/hf/detail/par_merge.inl index 6e934a08..70068967 100644 --- a/qtensor/compression/cusz/src/hf/detail/par_merge.inl +++ b/qtensor/compression/cusz/src/hf/detail/par_merge.inl @@ -1,445 +1,445 @@ -/* - * Authors: - * Oded Green (ogreen@gatech.edu), Rob McColl (robert.c.mccoll@gmail.com) - * High Performance Computing Lab, Georgia Tech - * - * Future Publication: - * GPU MergePath: A GPU Merging Algorithm - * ACM International Conference on Supercomputing 2012 - * 
June 25-29 2012, San Servolo, Venice, Italy - * - * (C) 2012 Georgia Institute of Technology - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * - Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - Neither the name of the Georgia Institute of Technology nor the names of - * its contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. - */ - -/** - * @file par_merge.h - * @author Oded Green (ogreen@gatech.edu), Rob McColl (robert.c.mccoll@gmail.com)) - * @brief Modified and adapted by Cody Rivera - * @version 0.3 - * @date 2020-10-24 - * (created) 2020-06 (rev) 2021-06-21 - * - */ - -#ifndef CUSZ_KERNEL_PAR_MERGE_CUH -#define CUSZ_KERNEL_PAR_MERGE_CUH - -#include -#include -#include -#include -#include -#include - -#include -namespace cg = cooperative_groups; - -#define MAX(X, Y) (((X) > (Y)) ? (X) : (Y)) -#define MIN(X, Y) (((X) < (Y)) ? 
(X) : (Y)) -// Mathematically correct modulo -#define MOD(a, b) ((((a) % (b)) + (b)) % (b)) - -/* MERGETYPE - * Performs merges of two sorted pseudorandom arrays of length - * Times the runs and reports on the average time - * Checks the output of each merge for correctness - */ -#define PADDING 1024 - -/******************************************************************************** - * signature - ********************************************************************************/ - -// Partition array -template -__device__ void cudaWorkloadDiagonals( - F* copyFreq, - int* copyIndex, - int* copyIsLeaf, - int cStart, - int cEnd, - F* iNodesFreq, - int iStart, - int iEnd, - int iNodesCap, - uint32_t* diagonal_path_intersections, - /* Shared Memory */ - int32_t& x_top, - int32_t& y_top, - int32_t& x_bottom, - int32_t& y_bottom, - int32_t& found, - int32_t* oneorzero); - -// Merge partitions -template -__device__ void cudaMergeSinglePath( - F* copyFreq, - int* copyIndex, - int* copyIsLeaf, - int cStart, - int cEnd, - F* iNodesFreq, - int iStart, - int iEnd, - int iNodesCap, - uint32_t* diagonal_path_intersections, - F* tempFreq, - int* tempIndex, - int* tempIsLeaf, - int tempLength); - -template -__device__ void parMerge( - F* copyFreq, - int* copyIndex, - int* copyIsLeaf, - int cStart, - int cEnd, - F* iNodesFreq, - int iStart, - int iEnd, - int iNodesCap, - F* tempFreq, - int* tempIndex, - int* tempIsLeaf, - int& tempLength, - uint32_t* diagonal_path_intersections, - int blocks, - int threads, - /* Shared Memory */ - int32_t& x_top, - int32_t& y_top, - int32_t& x_bottom, - int32_t& y_bottom, - int32_t& found, - int32_t* oneorzero); - -template -__device__ void merge( - F* copyFreq, - int* copyIndex, - int* copyIsLeaf, - int cStart, - int cEnd, - F* iNodesFreq, - int iStart, - int iEnd, - int iNodesCap, - F* tempFreq, - int* tempIndex, - int* tempIsLeaf, - int& tempLength); - -/******************************************************************************** - * definition - ********************************************************************************/ - -// clang-format off -template -__device__ void parMerge( - F* copyFreq, int* copyIndex, int* copyIsLeaf, int cStart, int cEnd, - F* iNodesFreq, int iStart, int iEnd, int iNodesCap, - F* tempFreq, int* tempIndex, int* tempIsLeaf, int& tempLength, - uint32_t* diagonal_path_intersections, int blocks, int threads, - /* Shared Memory */ - int32_t& x_top, int32_t& y_top, int32_t& x_bottom, int32_t& y_bottom, - int32_t& found, int32_t* oneorzero) - { - // clang-format on - auto current_grid = cg::this_grid(); - current_grid.sync(); - tempLength = (cEnd - cStart) + MOD(iEnd - iStart, iNodesCap); - - if (tempLength == 0) return; - - // Perform the global diagonal intersection serach to divide work among SMs - cudaWorkloadDiagonals( - copyFreq, copyIndex, copyIsLeaf, cStart, cEnd, // - iNodesFreq, iStart, iEnd, iNodesCap, // - diagonal_path_intersections, // - x_top, y_top, x_bottom, y_bottom, found, oneorzero); - current_grid.sync(); - - // Merge between global diagonals independently on each block - cudaMergeSinglePath( - copyFreq, copyIndex, copyIsLeaf, cStart, cEnd, // - iNodesFreq, iStart, iEnd, iNodesCap, // - diagonal_path_intersections, // - tempFreq, tempIndex, tempIsLeaf, tempLength); - current_grid.sync(); -} - -/* CUDAWORKLOADDIAGONALS - * Performs a 32-wide binary search on one glboal diagonal per block to find the intersection with the path. 
- * This divides the workload into independent merges for the next step - */ -// clang-format off -template -__device__ void cudaWorkloadDiagonals( - F* copyFreq, int* copyIndex, int* copyIsLeaf, - int cStart, int cEnd, - F* iNodesFreq, - int iStart, int iEnd, int iNodesCap, - uint32_t* diagonal_path_intersections, - /* Shared Memory */ - int32_t& x_top, int32_t& y_top, int32_t& x_bottom, int32_t& y_bottom, - int32_t& found, int32_t* oneorzero) -{ - // clang-format on - uint32_t A_length = cEnd - cStart; - uint32_t B_length = MOD(iEnd - iStart, iNodesCap); - // Calculate combined index around the MergePath "matrix" - int32_t combinedIndex = ((uint64_t)blockIdx.x * ((uint64_t)A_length + (uint64_t)B_length)) / (uint64_t)gridDim.x; - /* - __shared__ int32_t x_top, y_top, x_bottom, y_bottom, found; - __shared__ int32_t oneorzero[32]; - */ - int threadOffset = threadIdx.x - 16; - - if (threadIdx.x < 32) { - // Figure out the coordinates of our diagonal - if (A_length >= B_length) { - x_top = MIN(combinedIndex, A_length); - y_top = combinedIndex > A_length ? combinedIndex - (A_length) : 0; - x_bottom = y_top; - y_bottom = x_top; - } - else { - y_bottom = MIN(combinedIndex, B_length); - x_bottom = combinedIndex > B_length ? combinedIndex - (B_length) : 0; - y_top = x_bottom; - x_top = y_bottom; - } - } - - // if (threadIdx.x == 0) { - // printf("Diagonal block %d: (%d, %d) to (%d, %d)\n", blockIdx.x, x_top, y_top, x_bottom, y_bottom); - //} - - found = 0; - - // Search the diagonal - while (!found) { - // Update our coordinates within the 32-wide section of the diagonal - int32_t current_x = x_top - ((x_top - x_bottom) >> 1) - threadOffset; - int32_t current_y = y_top + ((y_bottom - y_top) >> 1) + threadOffset; - int32_t getfrom_x = current_x + cStart - 1; - // Below statement is a more efficient, divmodless version of the following - // int32_t getfrom_y = MOD(iStart + current_y, iNodesCap); - int32_t getfrom_y = iStart + current_y; - - if (threadIdx.x < 32) { - if (getfrom_y >= iNodesCap) getfrom_y -= iNodesCap; - - // Are we a '1' or '0' with respect to A[x] <= B[x] - if (current_x > (int32_t)A_length or current_y < 0) { oneorzero[threadIdx.x] = 0; } - else if (current_y >= (int32_t)B_length || current_x < 1) { - oneorzero[threadIdx.x] = 1; - } - else { - oneorzero[threadIdx.x] = (copyFreq[getfrom_x] <= iNodesFreq[getfrom_y]) ? 
1 : 0; - } - } - - __syncthreads(); - - // If we find the meeting of the '1's and '0's, we found the - // intersection of the path and diagonal - if (threadIdx.x > 0 and // - threadIdx.x < 32 and // - (oneorzero[threadIdx.x] != oneorzero[threadIdx.x - 1]) // - ) { - found = 1; - - diagonal_path_intersections[blockIdx.x] = current_x; - diagonal_path_intersections[blockIdx.x + gridDim.x + 1] = current_y; - } - - __syncthreads(); - - // Adjust the search window on the diagonal - if (threadIdx.x == 16) { - if (oneorzero[31] != 0) { - x_bottom = current_x; - y_bottom = current_y; - } - else { - x_top = current_x; - y_top = current_y; - } - } - __syncthreads(); - } - - // Set the boundary diagonals (through 0,0 and A_length,B_length) - if (threadIdx.x == 0 && blockIdx.x == 0) { - diagonal_path_intersections[0] = 0; - diagonal_path_intersections[gridDim.x + 1] = 0; - diagonal_path_intersections[gridDim.x] = A_length; - diagonal_path_intersections[gridDim.x + gridDim.x + 1] = B_length; - } -} - -// Serial merge -// clang-format off -template -__device__ void merge( - F* copyFreq, int* copyIndex, int* copyIsLeaf, int cStart, int cEnd, - F* iNodesFreq, int iStart, int iEnd, int iNodesCap, - F* tempFreq, int* tempIndex, int* tempIsLeaf, int& tempLength) -{ - // clang-format on - int len = 0; - int iterCopy = cStart, iterINodes = iStart; - - while (iterCopy < cEnd && MOD(iEnd - iterINodes, iNodesCap) > 0) { - if (copyFreq[iterCopy] <= iNodesFreq[iterINodes]) { - tempFreq[len] = copyFreq[iterCopy]; - tempIndex[len] = copyIndex[iterCopy]; - tempIsLeaf[len] = copyIsLeaf[iterCopy]; - ++iterCopy; - } - else { - tempFreq[len] = iNodesFreq[iterINodes]; - tempIndex[len] = iterINodes; - tempIsLeaf[len] = 0; - iterINodes = MOD(iterINodes + 1, iNodesCap); - } - ++len; - } - - while (iterCopy < cEnd) { - tempFreq[len] = copyFreq[iterCopy]; - tempIndex[len] = copyIndex[iterCopy]; - tempIsLeaf[len] = copyIsLeaf[iterCopy]; - ++iterCopy; - ++len; - } - while (MOD(iEnd - iterINodes, iNodesCap) > 0) { - tempFreq[len] = iNodesFreq[iterINodes]; - tempIndex[len] = iterINodes; - tempIsLeaf[len] = 0; - iterINodes = MOD(iterINodes + 1, iNodesCap); - ++len; - } - - tempLength = len; -} - -/* CUDAMERGESINGLEPATH - * Performs merge windows within a thread block from that block's global diagonal - * intersection to the next - */ -#define K 512 -#define PAD_SIZE 0 - -// clang-format off -template -__device__ void cudaMergeSinglePath( - F* copyFreq, int* copyIndex, int* copyIsLeaf, - int cStart, int cEnd, - F* iNodesFreq, - int iStart, int iEnd, int iNodesCap, - uint32_t* diagonal_path_intersections, - F* tempFreq, int* tempIndex, int* tempIsLeaf, - int tempLength) -{ - // clang-format on - // Temporary Code -- Serial Merge Per Block - if (threadIdx.x == 0) { - // Boundaries - int x_block_top = diagonal_path_intersections[blockIdx.x]; - int y_block_top = diagonal_path_intersections[blockIdx.x + gridDim.x + 1]; - int x_block_stop = diagonal_path_intersections[blockIdx.x + 1]; - int y_block_stop = diagonal_path_intersections[blockIdx.x + gridDim.x + 2]; - - // Actual indexes - int x_start = x_block_top + cStart; - int x_end = x_block_stop + cStart; - int y_start = MOD(iStart + y_block_top, iNodesCap); - int y_end = MOD(iStart + y_block_stop, iNodesCap); - - int offset = x_block_top + y_block_top; - - int dummy; // Unused result - // TODO optimize serial merging of each partition - merge( - copyFreq, copyIndex, copyIsLeaf, x_start, x_end, // - iNodesFreq, y_start, y_end, iNodesCap, // - tempFreq + offset, tempIndex + offset, 
tempIsLeaf + offset, dummy); - if (0) { - printf( - "block: %d x: %d %d, y: %d %d, contrib: %d\n", blockIdx.x, x_block_top, x_block_stop, y_block_top, - y_block_stop, dummy); - } - } -} - -// `unsigned int` instantiations -template __device__ void parMerge( - unsigned int* copyFreq, - int* copyIndex, - int* copyIsLeaf, - int cStart, - int cEnd, - unsigned int* iNodesFreq, - int iStart, - int iEnd, - int iNodesCap, - unsigned int* tempFreq, - int* tempIndex, - int* tempIsLeaf, - int& tempLength, - uint32_t* diagonal_path_intersections, - int blocks, - int threads, - /* Shared Memory */ - int32_t& x_top, - int32_t& y_top, - int32_t& x_bottom, - int32_t& y_bottom, - int32_t& found, - int32_t* oneorzero); - -template __device__ void merge( - unsigned int* copyFreq, - int* copyIndex, - int* copyIsLeaf, - int cStart, - int cEnd, - unsigned int* iNodesFreq, - int iStart, - int iEnd, - int iNodesCap, - unsigned int* tempFreq, - int* tempIndex, - int* tempIsLeaf, - int& tempLength); - +/* + * Authors: + * Oded Green (ogreen@gatech.edu), Rob McColl (robert.c.mccoll@gmail.com) + * High Performance Computing Lab, Georgia Tech + * + * Future Publication: + * GPU MergePath: A GPU Merging Algorithm + * ACM International Conference on Supercomputing 2012 + * June 25-29 2012, San Servolo, Venice, Italy + * + * (C) 2012 Georgia Institute of Technology + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * - Neither the name of the Georgia Institute of Technology nor the names of + * its contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * @file par_merge.h + * @author Oded Green (ogreen@gatech.edu), Rob McColl (robert.c.mccoll@gmail.com)) + * @brief Modified and adapted by Cody Rivera + * @version 0.3 + * @date 2020-10-24 + * (created) 2020-06 (rev) 2021-06-21 + * + */ + +#ifndef CUSZ_KERNEL_PAR_MERGE_CUH +#define CUSZ_KERNEL_PAR_MERGE_CUH + +#include +#include +#include +#include +#include +#include + +#include +namespace cg = cooperative_groups; + +#define MAX(X, Y) (((X) > (Y)) ? (X) : (Y)) +#define MIN(X, Y) (((X) < (Y)) ? 
(X) : (Y)) +// Mathematically correct modulo +#define MOD(a, b) ((((a) % (b)) + (b)) % (b)) + +/* MERGETYPE + * Performs merges of two sorted pseudorandom arrays of length + * Times the runs and reports on the average time + * Checks the output of each merge for correctness + */ +#define PADDING 1024 + +/******************************************************************************** + * signature + ********************************************************************************/ + +// Partition array +template +__device__ void cudaWorkloadDiagonals( + F* copyFreq, + int* copyIndex, + int* copyIsLeaf, + int cStart, + int cEnd, + F* iNodesFreq, + int iStart, + int iEnd, + int iNodesCap, + uint32_t* diagonal_path_intersections, + /* Shared Memory */ + int32_t& x_top, + int32_t& y_top, + int32_t& x_bottom, + int32_t& y_bottom, + int32_t& found, + int32_t* oneorzero); + +// Merge partitions +template +__device__ void cudaMergeSinglePath( + F* copyFreq, + int* copyIndex, + int* copyIsLeaf, + int cStart, + int cEnd, + F* iNodesFreq, + int iStart, + int iEnd, + int iNodesCap, + uint32_t* diagonal_path_intersections, + F* tempFreq, + int* tempIndex, + int* tempIsLeaf, + int tempLength); + +template +__device__ void parMerge( + F* copyFreq, + int* copyIndex, + int* copyIsLeaf, + int cStart, + int cEnd, + F* iNodesFreq, + int iStart, + int iEnd, + int iNodesCap, + F* tempFreq, + int* tempIndex, + int* tempIsLeaf, + int& tempLength, + uint32_t* diagonal_path_intersections, + int blocks, + int threads, + /* Shared Memory */ + int32_t& x_top, + int32_t& y_top, + int32_t& x_bottom, + int32_t& y_bottom, + int32_t& found, + int32_t* oneorzero); + +template +__device__ void merge( + F* copyFreq, + int* copyIndex, + int* copyIsLeaf, + int cStart, + int cEnd, + F* iNodesFreq, + int iStart, + int iEnd, + int iNodesCap, + F* tempFreq, + int* tempIndex, + int* tempIsLeaf, + int& tempLength); + +/******************************************************************************** + * definition + ********************************************************************************/ + +// clang-format off +template +__device__ void parMerge( + F* copyFreq, int* copyIndex, int* copyIsLeaf, int cStart, int cEnd, + F* iNodesFreq, int iStart, int iEnd, int iNodesCap, + F* tempFreq, int* tempIndex, int* tempIsLeaf, int& tempLength, + uint32_t* diagonal_path_intersections, int blocks, int threads, + /* Shared Memory */ + int32_t& x_top, int32_t& y_top, int32_t& x_bottom, int32_t& y_bottom, + int32_t& found, int32_t* oneorzero) + { + // clang-format on + auto current_grid = cg::this_grid(); + current_grid.sync(); + tempLength = (cEnd - cStart) + MOD(iEnd - iStart, iNodesCap); + + if (tempLength == 0) return; + + // Perform the global diagonal intersection serach to divide work among SMs + cudaWorkloadDiagonals( + copyFreq, copyIndex, copyIsLeaf, cStart, cEnd, // + iNodesFreq, iStart, iEnd, iNodesCap, // + diagonal_path_intersections, // + x_top, y_top, x_bottom, y_bottom, found, oneorzero); + current_grid.sync(); + + // Merge between global diagonals independently on each block + cudaMergeSinglePath( + copyFreq, copyIndex, copyIsLeaf, cStart, cEnd, // + iNodesFreq, iStart, iEnd, iNodesCap, // + diagonal_path_intersections, // + tempFreq, tempIndex, tempIsLeaf, tempLength); + current_grid.sync(); +} + +/* CUDAWORKLOADDIAGONALS + * Performs a 32-wide binary search on one glboal diagonal per block to find the intersection with the path. 
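+ * Along a diagonal, x + y stays constant; the 32 threads evaluate the predicate
+ * copyFreq[x-1] <= iNodesFreq[y] over a window of that diagonal, and the transition
+ * in `oneorzero` between adjacent threads marks where the merge path crosses it.
+ * That crossing is recorded in diagonal_path_intersections as this block's split point.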
+ * This divides the workload into independent merges for the next step + */ +// clang-format off +template +__device__ void cudaWorkloadDiagonals( + F* copyFreq, int* copyIndex, int* copyIsLeaf, + int cStart, int cEnd, + F* iNodesFreq, + int iStart, int iEnd, int iNodesCap, + uint32_t* diagonal_path_intersections, + /* Shared Memory */ + int32_t& x_top, int32_t& y_top, int32_t& x_bottom, int32_t& y_bottom, + int32_t& found, int32_t* oneorzero) +{ + // clang-format on + uint32_t A_length = cEnd - cStart; + uint32_t B_length = MOD(iEnd - iStart, iNodesCap); + // Calculate combined index around the MergePath "matrix" + int32_t combinedIndex = ((uint64_t)blockIdx.x * ((uint64_t)A_length + (uint64_t)B_length)) / (uint64_t)gridDim.x; + /* + __shared__ int32_t x_top, y_top, x_bottom, y_bottom, found; + __shared__ int32_t oneorzero[32]; + */ + int threadOffset = threadIdx.x - 16; + + if (threadIdx.x < 32) { + // Figure out the coordinates of our diagonal + if (A_length >= B_length) { + x_top = MIN(combinedIndex, A_length); + y_top = combinedIndex > A_length ? combinedIndex - (A_length) : 0; + x_bottom = y_top; + y_bottom = x_top; + } + else { + y_bottom = MIN(combinedIndex, B_length); + x_bottom = combinedIndex > B_length ? combinedIndex - (B_length) : 0; + y_top = x_bottom; + x_top = y_bottom; + } + } + + // if (threadIdx.x == 0) { + // printf("Diagonal block %d: (%d, %d) to (%d, %d)\n", blockIdx.x, x_top, y_top, x_bottom, y_bottom); + //} + + found = 0; + + // Search the diagonal + while (!found) { + // Update our coordinates within the 32-wide section of the diagonal + int32_t current_x = x_top - ((x_top - x_bottom) >> 1) - threadOffset; + int32_t current_y = y_top + ((y_bottom - y_top) >> 1) + threadOffset; + int32_t getfrom_x = current_x + cStart - 1; + // Below statement is a more efficient, divmodless version of the following + // int32_t getfrom_y = MOD(iStart + current_y, iNodesCap); + int32_t getfrom_y = iStart + current_y; + + if (threadIdx.x < 32) { + if (getfrom_y >= iNodesCap) getfrom_y -= iNodesCap; + + // Are we a '1' or '0' with respect to A[x] <= B[x] + if (current_x > (int32_t)A_length or current_y < 0) { oneorzero[threadIdx.x] = 0; } + else if (current_y >= (int32_t)B_length || current_x < 1) { + oneorzero[threadIdx.x] = 1; + } + else { + oneorzero[threadIdx.x] = (copyFreq[getfrom_x] <= iNodesFreq[getfrom_y]) ? 
1 : 0; + } + } + + __syncthreads(); + + // If we find the meeting of the '1's and '0's, we found the + // intersection of the path and diagonal + if (threadIdx.x > 0 and // + threadIdx.x < 32 and // + (oneorzero[threadIdx.x] != oneorzero[threadIdx.x - 1]) // + ) { + found = 1; + + diagonal_path_intersections[blockIdx.x] = current_x; + diagonal_path_intersections[blockIdx.x + gridDim.x + 1] = current_y; + } + + __syncthreads(); + + // Adjust the search window on the diagonal + if (threadIdx.x == 16) { + if (oneorzero[31] != 0) { + x_bottom = current_x; + y_bottom = current_y; + } + else { + x_top = current_x; + y_top = current_y; + } + } + __syncthreads(); + } + + // Set the boundary diagonals (through 0,0 and A_length,B_length) + if (threadIdx.x == 0 && blockIdx.x == 0) { + diagonal_path_intersections[0] = 0; + diagonal_path_intersections[gridDim.x + 1] = 0; + diagonal_path_intersections[gridDim.x] = A_length; + diagonal_path_intersections[gridDim.x + gridDim.x + 1] = B_length; + } +} + +// Serial merge +// clang-format off +template +__device__ void merge( + F* copyFreq, int* copyIndex, int* copyIsLeaf, int cStart, int cEnd, + F* iNodesFreq, int iStart, int iEnd, int iNodesCap, + F* tempFreq, int* tempIndex, int* tempIsLeaf, int& tempLength) +{ + // clang-format on + int len = 0; + int iterCopy = cStart, iterINodes = iStart; + + while (iterCopy < cEnd && MOD(iEnd - iterINodes, iNodesCap) > 0) { + if (copyFreq[iterCopy] <= iNodesFreq[iterINodes]) { + tempFreq[len] = copyFreq[iterCopy]; + tempIndex[len] = copyIndex[iterCopy]; + tempIsLeaf[len] = copyIsLeaf[iterCopy]; + ++iterCopy; + } + else { + tempFreq[len] = iNodesFreq[iterINodes]; + tempIndex[len] = iterINodes; + tempIsLeaf[len] = 0; + iterINodes = MOD(iterINodes + 1, iNodesCap); + } + ++len; + } + + while (iterCopy < cEnd) { + tempFreq[len] = copyFreq[iterCopy]; + tempIndex[len] = copyIndex[iterCopy]; + tempIsLeaf[len] = copyIsLeaf[iterCopy]; + ++iterCopy; + ++len; + } + while (MOD(iEnd - iterINodes, iNodesCap) > 0) { + tempFreq[len] = iNodesFreq[iterINodes]; + tempIndex[len] = iterINodes; + tempIsLeaf[len] = 0; + iterINodes = MOD(iterINodes + 1, iNodesCap); + ++len; + } + + tempLength = len; +} + +/* CUDAMERGESINGLEPATH + * Performs merge windows within a thread block from that block's global diagonal + * intersection to the next + */ +#define K 512 +#define PAD_SIZE 0 + +// clang-format off +template +__device__ void cudaMergeSinglePath( + F* copyFreq, int* copyIndex, int* copyIsLeaf, + int cStart, int cEnd, + F* iNodesFreq, + int iStart, int iEnd, int iNodesCap, + uint32_t* diagonal_path_intersections, + F* tempFreq, int* tempIndex, int* tempIsLeaf, + int tempLength) +{ + // clang-format on + // Temporary Code -- Serial Merge Per Block + if (threadIdx.x == 0) { + // Boundaries + int x_block_top = diagonal_path_intersections[blockIdx.x]; + int y_block_top = diagonal_path_intersections[blockIdx.x + gridDim.x + 1]; + int x_block_stop = diagonal_path_intersections[blockIdx.x + 1]; + int y_block_stop = diagonal_path_intersections[blockIdx.x + gridDim.x + 2]; + + // Actual indexes + int x_start = x_block_top + cStart; + int x_end = x_block_stop + cStart; + int y_start = MOD(iStart + y_block_top, iNodesCap); + int y_end = MOD(iStart + y_block_stop, iNodesCap); + + int offset = x_block_top + y_block_top; + + int dummy; // Unused result + // TODO optimize serial merging of each partition + merge( + copyFreq, copyIndex, copyIsLeaf, x_start, x_end, // + iNodesFreq, y_start, y_end, iNodesCap, // + tempFreq + offset, tempIndex + offset, 
tempIsLeaf + offset, dummy); + if (0) { + printf( + "block: %d x: %d %d, y: %d %d, contrib: %d\n", blockIdx.x, x_block_top, x_block_stop, y_block_top, + y_block_stop, dummy); + } + } +} + +// `unsigned int` instantiations +template __device__ void parMerge( + unsigned int* copyFreq, + int* copyIndex, + int* copyIsLeaf, + int cStart, + int cEnd, + unsigned int* iNodesFreq, + int iStart, + int iEnd, + int iNodesCap, + unsigned int* tempFreq, + int* tempIndex, + int* tempIsLeaf, + int& tempLength, + uint32_t* diagonal_path_intersections, + int blocks, + int threads, + /* Shared Memory */ + int32_t& x_top, + int32_t& y_top, + int32_t& x_bottom, + int32_t& y_bottom, + int32_t& found, + int32_t* oneorzero); + +template __device__ void merge( + unsigned int* copyFreq, + int* copyIndex, + int* copyIsLeaf, + int cStart, + int cEnd, + unsigned int* iNodesFreq, + int iStart, + int iEnd, + int iNodesCap, + unsigned int* tempFreq, + int* tempIndex, + int* tempIsLeaf, + int& tempLength); + #endif \ No newline at end of file diff --git a/qtensor/compression/cusz/src/hf/hf.cc b/qtensor/compression/cusz/src/hf/hf.cc index 19387263..54b95b25 100644 --- a/qtensor/compression/cusz/src/hf/hf.cc +++ b/qtensor/compression/cusz/src/hf/hf.cc @@ -1,109 +1,109 @@ -/** - * @file codec.cc - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-04-23 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#include "common/type_traits.hh" - -#include "hf/hf.hh" -#include "hf/hf_bookg.hh" -#include "hf/hf_codecg.hh" - -namespace cusz { - -#define TEMPLATE_TYPE template -#define HUFFMAN_COARSE LosslessCodec - -TEMPLATE_TYPE -HUFFMAN_COARSE::~LosslessCodec() { pimpl.reset(); } - -TEMPLATE_TYPE -HUFFMAN_COARSE::LosslessCodec() : pimpl{std::make_unique()} {} - -TEMPLATE_TYPE -HUFFMAN_COARSE::LosslessCodec(const HUFFMAN_COARSE& old) : pimpl{std::make_unique(*old.pimpl)} -{ - // TODO allocation/deep copy -} - -TEMPLATE_TYPE -HUFFMAN_COARSE& HUFFMAN_COARSE::operator=(const HUFFMAN_COARSE& old) -{ - *pimpl = *old.pimpl; - // TODO allocation/deep copy - return *this; -} - -TEMPLATE_TYPE -HUFFMAN_COARSE::LosslessCodec(HUFFMAN_COARSE&&) = default; - -TEMPLATE_TYPE -HUFFMAN_COARSE& HUFFMAN_COARSE::operator=(HUFFMAN_COARSE&&) = default; - -//------------------------------------------------------------------------------ - -TEMPLATE_TYPE -void HUFFMAN_COARSE::init(size_t const in_uncompressed_len, int const booklen, int const pardeg, bool dbg_print) -{ - pimpl->init(in_uncompressed_len, booklen, pardeg, dbg_print); -} - -TEMPLATE_TYPE -void HUFFMAN_COARSE::build_codebook(uint32_t* freq, int const booklen, cudaStream_t stream) -{ - pimpl->build_codebook(freq, booklen, stream); -} - -TEMPLATE_TYPE -void HUFFMAN_COARSE::encode( - T* in_uncompressed, - size_t const in_uncompressed_len, - BYTE*& out_compressed, - size_t& out_compressed_len, - cudaStream_t stream) -{ - pimpl->encode(in_uncompressed, in_uncompressed_len, out_compressed, out_compressed_len, stream); -} - -TEMPLATE_TYPE -void HUFFMAN_COARSE::decode(BYTE* in_compressed, T* out_decompressed, cudaStream_t stream, bool header_on_device) -{ - pimpl->decode(in_compressed, out_decompressed, stream, header_on_device); -} - -TEMPLATE_TYPE -void HUFFMAN_COARSE::clear_buffer() { pimpl->clear_buffer(); } - -TEMPLATE_TYPE -float HUFFMAN_COARSE::get_time_elapsed() const { return pimpl->get_time_elapsed(); } - -TEMPLATE_TYPE -float HUFFMAN_COARSE::get_time_book() const { return pimpl->get_time_book(); } -TEMPLATE_TYPE -float 
HUFFMAN_COARSE::get_time_lossless() const { return pimpl->get_time_lossless(); } - -#undef TEMPLATE_TYPE -#undef HUFFMAN_COARSE - -} // namespace cusz - -#define HUFFCOARSE_CC(E, ETF, H, M) \ - template class cusz::LosslessCodec::type, HuffTrait::type, MetadataTrait::type>; - -HUFFCOARSE_CC(1, false, 4, 4) // uint -HUFFCOARSE_CC(1, false, 8, 4) // -HUFFCOARSE_CC(2, false, 4, 4) // -HUFFCOARSE_CC(2, false, 8, 4) // -HUFFCOARSE_CC(4, false, 4, 4) // -HUFFCOARSE_CC(4, false, 8, 4) // - -HUFFCOARSE_CC(4, true, 4, 4) // float -HUFFCOARSE_CC(4, true, 8, 4) // - -#undef HUFFCOARSE_CC +/** + * @file codec.cc + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-23 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#include "common/type_traits.hh" + +#include "hf/hf.hh" +#include "hf/hf_bookg.hh" +#include "hf/hf_codecg.hh" + +namespace cusz { + +#define TEMPLATE_TYPE template +#define HUFFMAN_COARSE LosslessCodec + +TEMPLATE_TYPE +HUFFMAN_COARSE::~LosslessCodec() { pimpl.reset(); } + +TEMPLATE_TYPE +HUFFMAN_COARSE::LosslessCodec() : pimpl{std::make_unique()} {} + +TEMPLATE_TYPE +HUFFMAN_COARSE::LosslessCodec(const HUFFMAN_COARSE& old) : pimpl{std::make_unique(*old.pimpl)} +{ + // TODO allocation/deep copy +} + +TEMPLATE_TYPE +HUFFMAN_COARSE& HUFFMAN_COARSE::operator=(const HUFFMAN_COARSE& old) +{ + *pimpl = *old.pimpl; + // TODO allocation/deep copy + return *this; +} + +TEMPLATE_TYPE +HUFFMAN_COARSE::LosslessCodec(HUFFMAN_COARSE&&) = default; + +TEMPLATE_TYPE +HUFFMAN_COARSE& HUFFMAN_COARSE::operator=(HUFFMAN_COARSE&&) = default; + +//------------------------------------------------------------------------------ + +TEMPLATE_TYPE +void HUFFMAN_COARSE::init(size_t const in_uncompressed_len, int const booklen, int const pardeg, bool dbg_print) +{ + pimpl->init(in_uncompressed_len, booklen, pardeg, dbg_print); +} + +TEMPLATE_TYPE +void HUFFMAN_COARSE::build_codebook(uint32_t* freq, int const booklen, cudaStream_t stream) +{ + pimpl->build_codebook(freq, booklen, stream); +} + +TEMPLATE_TYPE +void HUFFMAN_COARSE::encode( + T* in_uncompressed, + size_t const in_uncompressed_len, + BYTE*& out_compressed, + size_t& out_compressed_len, + cudaStream_t stream) +{ + pimpl->encode(in_uncompressed, in_uncompressed_len, out_compressed, out_compressed_len, stream); +} + +TEMPLATE_TYPE +void HUFFMAN_COARSE::decode(BYTE* in_compressed, T* out_decompressed, cudaStream_t stream, bool header_on_device) +{ + pimpl->decode(in_compressed, out_decompressed, stream, header_on_device); +} + +TEMPLATE_TYPE +void HUFFMAN_COARSE::clear_buffer() { pimpl->clear_buffer(); } + +TEMPLATE_TYPE +float HUFFMAN_COARSE::get_time_elapsed() const { return pimpl->get_time_elapsed(); } + +TEMPLATE_TYPE +float HUFFMAN_COARSE::get_time_book() const { return pimpl->get_time_book(); } +TEMPLATE_TYPE +float HUFFMAN_COARSE::get_time_lossless() const { return pimpl->get_time_lossless(); } + +#undef TEMPLATE_TYPE +#undef HUFFMAN_COARSE + +} // namespace cusz + +#define HUFFCOARSE_CC(E, ETF, H, M) \ + template class cusz::LosslessCodec::type, HuffTrait::type, MetadataTrait::type>; + +HUFFCOARSE_CC(1, false, 4, 4) // uint +HUFFCOARSE_CC(1, false, 8, 4) // +HUFFCOARSE_CC(2, false, 4, 4) // +HUFFCOARSE_CC(2, false, 8, 4) // +HUFFCOARSE_CC(4, false, 4, 4) // +HUFFCOARSE_CC(4, false, 8, 4) // + +HUFFCOARSE_CC(4, true, 4, 4) // float +HUFFCOARSE_CC(4, true, 8, 4) // + +#undef HUFFCOARSE_CC diff --git a/qtensor/compression/cusz/src/hf/hf_bookg.cu b/qtensor/compression/cusz/src/hf/hf_bookg.cu index 
fc6d3ac9..9bcb37ba 100644 --- a/qtensor/compression/cusz/src/hf/hf_bookg.cu +++ b/qtensor/compression/cusz/src/hf/hf_bookg.cu @@ -1,33 +1,33 @@ -/** - * @file hf_bookg.cu - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-11-03 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "detail/hf_bookg.inl" -#include "hf/hf_bookg.hh" - -#define PAR_BOOK(T, H) \ - template void asz::hf_buildbook_g(uint32_t*, int const, H*, uint8_t*, int const, float*, cudaStream_t); - -PAR_BOOK(uint8_t, uint32_t); -PAR_BOOK(uint16_t, uint32_t); -PAR_BOOK(uint32_t, uint32_t); -PAR_BOOK(float, uint32_t); - -PAR_BOOK(uint8_t, uint64_t); -PAR_BOOK(uint16_t, uint64_t); -PAR_BOOK(uint32_t, uint64_t); -PAR_BOOK(float, uint64_t); - -PAR_BOOK(uint8_t, unsigned long long); -PAR_BOOK(uint16_t, unsigned long long); -PAR_BOOK(uint32_t, unsigned long long); -PAR_BOOK(float, unsigned long long); - -#undef PAR_BOOK +/** + * @file hf_bookg.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "detail/hf_bookg.inl" +#include "hf/hf_bookg.hh" + +#define PAR_BOOK(T, H) \ + template void asz::hf_buildbook_g(uint32_t*, int const, H*, uint8_t*, int const, float*, cudaStream_t); + +PAR_BOOK(uint8_t, uint32_t); +PAR_BOOK(uint16_t, uint32_t); +PAR_BOOK(uint32_t, uint32_t); +PAR_BOOK(float, uint32_t); + +PAR_BOOK(uint8_t, uint64_t); +PAR_BOOK(uint16_t, uint64_t); +PAR_BOOK(uint32_t, uint64_t); +PAR_BOOK(float, uint64_t); + +PAR_BOOK(uint8_t, unsigned long long); +PAR_BOOK(uint16_t, unsigned long long); +PAR_BOOK(uint32_t, unsigned long long); +PAR_BOOK(float, unsigned long long); + +#undef PAR_BOOK diff --git a/qtensor/compression/cusz/src/hf/hf_codecg.cu b/qtensor/compression/cusz/src/hf/hf_codecg.cu index 9b7d9f0b..54da37f0 100644 --- a/qtensor/compression/cusz/src/hf/hf_codecg.cu +++ b/qtensor/compression/cusz/src/hf/hf_codecg.cu @@ -1,269 +1,269 @@ -/** - * @file hf_codecg.cu - * @author Jiannan Tian - * @brief kernel wrappers; launching Huffman kernels - * @version 0.3 - * @date 2022-11-02 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include -#include -#include "detail/hf_codecg.inl" -#include "hf/hf_bookg.hh" -#include "hf/hf_codecg.hh" - -template -void asz::hf_encode_coarse( - T* uncompressed, - H* d_internal_coded, - size_t const len, - uint32_t* d_freq, - H* d_book, - int const booklen, - H* d_bitstream, - M* d_par_metadata, - M* h_par_metadata, - int const sublen, - int const pardeg, - int numSMs, - uint8_t*& out_compressed, - size_t& out_compressed_len, - float& time_lossless, - cudaStream_t stream) -{ - auto d_par_nbit = d_par_metadata; - auto d_par_ncell = d_par_metadata + pardeg; - auto d_par_entry = d_par_metadata + pardeg * 2; - - auto h_par_nbit = h_par_metadata; - auto h_par_ncell = h_par_metadata + pardeg; - auto h_par_entry = h_par_metadata + pardeg * 2; - - CREATE_CUDAEVENT_PAIR; - - /* phase 1 */ - { - auto block_dim = HuffmanHelper::BLOCK_DIM_ENCODE; - auto grid_dim = ConfigHelper::get_npart(len, block_dim); - - START_CUDAEVENT_RECORDING(stream); - - asz::detail::hf_encode_phase1_fill // - <<<8 * numSMs, 256, sizeof(H) * booklen, stream>>> // - (uncompressed, len, d_book, booklen, d_internal_coded); - - STOP_CUDAEVENT_RECORDING(stream); - CHECK_CUDA(cudaStreamSynchronize(stream)); - - float stage_time; - TIME_ELAPSED_CUDAEVENT(&stage_time); - time_lossless += stage_time; - } - - /* phase 2 */ - { - auto block_dim = 
HuffmanHelper::BLOCK_DIM_DEFLATE; - auto grid_dim = ConfigHelper::get_npart(pardeg, block_dim); - - START_CUDAEVENT_RECORDING(stream); - - asz::detail::hf_encode_phase2_deflate // - <<>> // - (d_internal_coded, len, d_par_nbit, d_par_ncell, sublen, pardeg); - - STOP_CUDAEVENT_RECORDING(stream); - CHECK_CUDA(cudaStreamSynchronize(stream)); - - float stage_time; - TIME_ELAPSED_CUDAEVENT(&stage_time); - time_lossless += stage_time; - } - - /* phase 3 */ - { - CHECK_CUDA(cudaMemcpyAsync(h_par_nbit, d_par_nbit, pardeg * sizeof(M), cudaMemcpyDeviceToHost, stream)); - CHECK_CUDA(cudaMemcpyAsync(h_par_ncell, d_par_ncell, pardeg * sizeof(M), cudaMemcpyDeviceToHost, stream)); - CHECK_CUDA(cudaStreamSynchronize(stream)); - - memcpy(h_par_entry + 1, h_par_ncell, (pardeg - 1) * sizeof(M)); - for (auto i = 1; i < pardeg; i++) h_par_entry[i] += h_par_entry[i - 1]; // inclusive scan - - CHECK_CUDA(cudaMemcpyAsync(d_par_entry, h_par_entry, pardeg * sizeof(M), cudaMemcpyHostToDevice, stream)); - CHECK_CUDA(cudaStreamSynchronize(stream)); - } - - /* phase 4 */ - { - START_CUDAEVENT_RECORDING(stream); - - asz::detail::hf_encode_phase4_concatenate<<>> // - (d_internal_coded, d_par_entry, d_par_ncell, sublen, d_bitstream); - - STOP_CUDAEVENT_RECORDING(stream); - CHECK_CUDA(cudaStreamSynchronize(stream)); - - float stage_time; - TIME_ELAPSED_CUDAEVENT(&stage_time); - time_lossless += stage_time; - } - - DESTROY_CUDAEVENT_PAIR; -} - -template -void asz::hf_encode_coarse_rev1( - T* uncompressed, - size_t const len, - hf_book* book_desc, - hf_bitstream* bitstream_desc, - uint8_t*& out_compressed, // 22-10-12 buggy - size_t& out_compressed_len, // 22-10-12 buggy - float& time_lossless, - cudaStream_t stream) -{ - CREATE_CUDAEVENT_PAIR; - - H* d_buffer = (H*)bitstream_desc->buffer; - H* d_bitstream = (H*)bitstream_desc->bitstream; - H* d_book = (H*)book_desc->book; - int const booklen = book_desc->booklen; - int const sublen = bitstream_desc->sublen; - int const pardeg = bitstream_desc->pardeg; - int const numSMs = bitstream_desc->numSMs; - // uint32_t* d_freq = book_desc->freq; - - auto d_par_nbit = (M*)bitstream_desc->d_metadata->bits; - auto d_par_ncell = (M*)bitstream_desc->d_metadata->cells; - auto d_par_entry = (M*)bitstream_desc->d_metadata->entries; - - auto h_par_nbit = (M*)bitstream_desc->h_metadata->bits; - auto h_par_ncell = (M*)bitstream_desc->h_metadata->cells; - auto h_par_entry = (M*)bitstream_desc->h_metadata->entries; - - /* phase 1 */ - { - auto block_dim = HuffmanHelper::BLOCK_DIM_ENCODE; - auto grid_dim = ConfigHelper::get_npart(len, block_dim); - - START_CUDAEVENT_RECORDING(stream); - - asz::detail::hf_encode_phase1_fill // - <<<8 * numSMs, 256, sizeof(H) * booklen, stream>>> // - (uncompressed, len, d_book, booklen, d_buffer); - - STOP_CUDAEVENT_RECORDING(stream); - CHECK_CUDA(cudaStreamSynchronize(stream)); - - float stage_time; - TIME_ELAPSED_CUDAEVENT(&stage_time); - time_lossless += stage_time; - } - - /* phase 2 */ - { - auto block_dim = HuffmanHelper::BLOCK_DIM_DEFLATE; - auto grid_dim = ConfigHelper::get_npart(pardeg, block_dim); - - START_CUDAEVENT_RECORDING(stream); - - asz::detail::hf_encode_phase2_deflate // - <<>> // - (d_buffer, len, d_par_nbit, d_par_ncell, sublen, pardeg); - - STOP_CUDAEVENT_RECORDING(stream); - CHECK_CUDA(cudaStreamSynchronize(stream)); - - float stage_time; - TIME_ELAPSED_CUDAEVENT(&stage_time); - time_lossless += stage_time; - } - - /* phase 3 */ - { - CHECK_CUDA(cudaMemcpyAsync(h_par_nbit, d_par_nbit, pardeg * sizeof(M), cudaMemcpyDeviceToHost, stream)); - 
CHECK_CUDA(cudaMemcpyAsync(h_par_ncell, d_par_ncell, pardeg * sizeof(M), cudaMemcpyDeviceToHost, stream)); - CHECK_CUDA(cudaStreamSynchronize(stream)); - - memcpy(h_par_entry + 1, h_par_ncell, (pardeg - 1) * sizeof(M)); - for (auto i = 1; i < pardeg; i++) h_par_entry[i] += h_par_entry[i - 1]; // inclusive scan - - CHECK_CUDA(cudaMemcpyAsync(d_par_entry, h_par_entry, pardeg * sizeof(M), cudaMemcpyHostToDevice, stream)); - CHECK_CUDA(cudaStreamSynchronize(stream)); - } - - /* phase 4 */ - { - START_CUDAEVENT_RECORDING(stream); - - asz::detail::hf_encode_phase4_concatenate<<>> // - (d_buffer, d_par_entry, d_par_ncell, sublen, d_bitstream); - - STOP_CUDAEVENT_RECORDING(stream); - - CHECK_CUDA(cudaStreamSynchronize(stream)); - - float stage_time; - TIME_ELAPSED_CUDAEVENT(&stage_time); - time_lossless += stage_time; - } -} - -template -void asz::hf_decode_coarse( - H* d_bitstream, - uint8_t* d_revbook, - int const revbook_nbyte, - M* d_par_nbit, - M* d_par_entry, - int const sublen, - int const pardeg, - T* out_decompressed, - float& time_lossless, - cudaStream_t stream) -{ - auto const block_dim = HuffmanHelper::BLOCK_DIM_DEFLATE; // = deflating - auto const grid_dim = ConfigHelper::get_npart(pardeg, block_dim); - - CREATE_CUDAEVENT_PAIR; - START_CUDAEVENT_RECORDING(stream) - - hf_decode_kernel // - <<>> // - (d_bitstream, d_revbook, d_par_nbit, d_par_entry, revbook_nbyte, sublen, pardeg, out_decompressed); - - STOP_CUDAEVENT_RECORDING(stream) - cudaStreamSynchronize(stream); - - TIME_ELAPSED_CUDAEVENT(&time_lossless); - DESTROY_CUDAEVENT_PAIR; -} - -#define HF_CODEC_INIT(T, H, M) \ - template void asz::hf_encode_coarse( \ - T*, H*, size_t const, uint32_t*, H*, int const, H*, M*, M*, int const, int const, int, uint8_t*&, size_t&, \ - float&, cudaStream_t); \ - \ - template void asz::hf_encode_coarse_rev1( \ - T*, size_t const, hf_book*, hf_bitstream*, uint8_t*&, size_t&, float&, cudaStream_t); \ - \ - template void asz::hf_decode_coarse( \ - H*, uint8_t*, int const, M*, M*, int const, int const, T*, float&, cudaStream_t); - -HF_CODEC_INIT(uint8_t, uint32_t, uint32_t); -HF_CODEC_INIT(uint16_t, uint32_t, uint32_t); -HF_CODEC_INIT(uint32_t, uint32_t, uint32_t); -HF_CODEC_INIT(float, uint32_t, uint32_t); -HF_CODEC_INIT(uint8_t, uint64_t, uint32_t); -HF_CODEC_INIT(uint16_t, uint64_t, uint32_t); -HF_CODEC_INIT(uint32_t, uint64_t, uint32_t); -HF_CODEC_INIT(float, uint64_t, uint32_t); -HF_CODEC_INIT(uint8_t, unsigned long long, uint32_t); -HF_CODEC_INIT(uint16_t, unsigned long long, uint32_t); -HF_CODEC_INIT(uint32_t, unsigned long long, uint32_t); -HF_CODEC_INIT(float, unsigned long long, uint32_t); - -#undef HFBOOK_INIT -#undef HF_CODEC_INIT +/** + * @file hf_codecg.cu + * @author Jiannan Tian + * @brief kernel wrappers; launching Huffman kernels + * @version 0.3 + * @date 2022-11-02 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include +#include +#include "detail/hf_codecg.inl" +#include "hf/hf_bookg.hh" +#include "hf/hf_codecg.hh" + +template +void asz::hf_encode_coarse( + T* uncompressed, + H* d_internal_coded, + size_t const len, + uint32_t* d_freq, + H* d_book, + int const booklen, + H* d_bitstream, + M* d_par_metadata, + M* h_par_metadata, + int const sublen, + int const pardeg, + int numSMs, + uint8_t*& out_compressed, + size_t& out_compressed_len, + float& time_lossless, + cudaStream_t stream) +{ + auto d_par_nbit = d_par_metadata; + auto d_par_ncell = d_par_metadata + pardeg; + auto d_par_entry = d_par_metadata + pardeg * 2; + + auto h_par_nbit = 
h_par_metadata; + auto h_par_ncell = h_par_metadata + pardeg; + auto h_par_entry = h_par_metadata + pardeg * 2; + + CREATE_CUDAEVENT_PAIR; + + /* phase 1 */ + { + auto block_dim = HuffmanHelper::BLOCK_DIM_ENCODE; + auto grid_dim = ConfigHelper::get_npart(len, block_dim); + + START_CUDAEVENT_RECORDING(stream); + + asz::detail::hf_encode_phase1_fill // + <<<8 * numSMs, 256, sizeof(H) * booklen, stream>>> // + (uncompressed, len, d_book, booklen, d_internal_coded); + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + float stage_time; + TIME_ELAPSED_CUDAEVENT(&stage_time); + time_lossless += stage_time; + } + + /* phase 2 */ + { + auto block_dim = HuffmanHelper::BLOCK_DIM_DEFLATE; + auto grid_dim = ConfigHelper::get_npart(pardeg, block_dim); + + START_CUDAEVENT_RECORDING(stream); + + asz::detail::hf_encode_phase2_deflate // + <<>> // + (d_internal_coded, len, d_par_nbit, d_par_ncell, sublen, pardeg); + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + float stage_time; + TIME_ELAPSED_CUDAEVENT(&stage_time); + time_lossless += stage_time; + } + + /* phase 3 */ + { + CHECK_CUDA(cudaMemcpyAsync(h_par_nbit, d_par_nbit, pardeg * sizeof(M), cudaMemcpyDeviceToHost, stream)); + CHECK_CUDA(cudaMemcpyAsync(h_par_ncell, d_par_ncell, pardeg * sizeof(M), cudaMemcpyDeviceToHost, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + memcpy(h_par_entry + 1, h_par_ncell, (pardeg - 1) * sizeof(M)); + for (auto i = 1; i < pardeg; i++) h_par_entry[i] += h_par_entry[i - 1]; // inclusive scan + + CHECK_CUDA(cudaMemcpyAsync(d_par_entry, h_par_entry, pardeg * sizeof(M), cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + } + + /* phase 4 */ + { + START_CUDAEVENT_RECORDING(stream); + + asz::detail::hf_encode_phase4_concatenate<<>> // + (d_internal_coded, d_par_entry, d_par_ncell, sublen, d_bitstream); + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + float stage_time; + TIME_ELAPSED_CUDAEVENT(&stage_time); + time_lossless += stage_time; + } + + DESTROY_CUDAEVENT_PAIR; +} + +template +void asz::hf_encode_coarse_rev1( + T* uncompressed, + size_t const len, + hf_book* book_desc, + hf_bitstream* bitstream_desc, + uint8_t*& out_compressed, // 22-10-12 buggy + size_t& out_compressed_len, // 22-10-12 buggy + float& time_lossless, + cudaStream_t stream) +{ + CREATE_CUDAEVENT_PAIR; + + H* d_buffer = (H*)bitstream_desc->buffer; + H* d_bitstream = (H*)bitstream_desc->bitstream; + H* d_book = (H*)book_desc->book; + int const booklen = book_desc->booklen; + int const sublen = bitstream_desc->sublen; + int const pardeg = bitstream_desc->pardeg; + int const numSMs = bitstream_desc->numSMs; + // uint32_t* d_freq = book_desc->freq; + + auto d_par_nbit = (M*)bitstream_desc->d_metadata->bits; + auto d_par_ncell = (M*)bitstream_desc->d_metadata->cells; + auto d_par_entry = (M*)bitstream_desc->d_metadata->entries; + + auto h_par_nbit = (M*)bitstream_desc->h_metadata->bits; + auto h_par_ncell = (M*)bitstream_desc->h_metadata->cells; + auto h_par_entry = (M*)bitstream_desc->h_metadata->entries; + + /* phase 1 */ + { + auto block_dim = HuffmanHelper::BLOCK_DIM_ENCODE; + auto grid_dim = ConfigHelper::get_npart(len, block_dim); + + START_CUDAEVENT_RECORDING(stream); + + asz::detail::hf_encode_phase1_fill // + <<<8 * numSMs, 256, sizeof(H) * booklen, stream>>> // + (uncompressed, len, d_book, booklen, d_buffer); + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + float 
stage_time; + TIME_ELAPSED_CUDAEVENT(&stage_time); + time_lossless += stage_time; + } + + /* phase 2 */ + { + auto block_dim = HuffmanHelper::BLOCK_DIM_DEFLATE; + auto grid_dim = ConfigHelper::get_npart(pardeg, block_dim); + + START_CUDAEVENT_RECORDING(stream); + + asz::detail::hf_encode_phase2_deflate // + <<>> // + (d_buffer, len, d_par_nbit, d_par_ncell, sublen, pardeg); + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + float stage_time; + TIME_ELAPSED_CUDAEVENT(&stage_time); + time_lossless += stage_time; + } + + /* phase 3 */ + { + CHECK_CUDA(cudaMemcpyAsync(h_par_nbit, d_par_nbit, pardeg * sizeof(M), cudaMemcpyDeviceToHost, stream)); + CHECK_CUDA(cudaMemcpyAsync(h_par_ncell, d_par_ncell, pardeg * sizeof(M), cudaMemcpyDeviceToHost, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + memcpy(h_par_entry + 1, h_par_ncell, (pardeg - 1) * sizeof(M)); + for (auto i = 1; i < pardeg; i++) h_par_entry[i] += h_par_entry[i - 1]; // inclusive scan + + CHECK_CUDA(cudaMemcpyAsync(d_par_entry, h_par_entry, pardeg * sizeof(M), cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + } + + /* phase 4 */ + { + START_CUDAEVENT_RECORDING(stream); + + asz::detail::hf_encode_phase4_concatenate<<>> // + (d_buffer, d_par_entry, d_par_ncell, sublen, d_bitstream); + + STOP_CUDAEVENT_RECORDING(stream); + + CHECK_CUDA(cudaStreamSynchronize(stream)); + + float stage_time; + TIME_ELAPSED_CUDAEVENT(&stage_time); + time_lossless += stage_time; + } +} + +template +void asz::hf_decode_coarse( + H* d_bitstream, + uint8_t* d_revbook, + int const revbook_nbyte, + M* d_par_nbit, + M* d_par_entry, + int const sublen, + int const pardeg, + T* out_decompressed, + float& time_lossless, + cudaStream_t stream) +{ + auto const block_dim = HuffmanHelper::BLOCK_DIM_DEFLATE; // = deflating + auto const grid_dim = ConfigHelper::get_npart(pardeg, block_dim); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream) + + hf_decode_kernel // + <<>> // + (d_bitstream, d_revbook, d_par_nbit, d_par_entry, revbook_nbyte, sublen, pardeg, out_decompressed); + + STOP_CUDAEVENT_RECORDING(stream) + cudaStreamSynchronize(stream); + + TIME_ELAPSED_CUDAEVENT(&time_lossless); + DESTROY_CUDAEVENT_PAIR; +} + +#define HF_CODEC_INIT(T, H, M) \ + template void asz::hf_encode_coarse( \ + T*, H*, size_t const, uint32_t*, H*, int const, H*, M*, M*, int const, int const, int, uint8_t*&, size_t&, \ + float&, cudaStream_t); \ + \ + template void asz::hf_encode_coarse_rev1( \ + T*, size_t const, hf_book*, hf_bitstream*, uint8_t*&, size_t&, float&, cudaStream_t); \ + \ + template void asz::hf_decode_coarse( \ + H*, uint8_t*, int const, M*, M*, int const, int const, T*, float&, cudaStream_t); + +HF_CODEC_INIT(uint8_t, uint32_t, uint32_t); +HF_CODEC_INIT(uint16_t, uint32_t, uint32_t); +HF_CODEC_INIT(uint32_t, uint32_t, uint32_t); +HF_CODEC_INIT(float, uint32_t, uint32_t); +HF_CODEC_INIT(uint8_t, uint64_t, uint32_t); +HF_CODEC_INIT(uint16_t, uint64_t, uint32_t); +HF_CODEC_INIT(uint32_t, uint64_t, uint32_t); +HF_CODEC_INIT(float, uint64_t, uint32_t); +HF_CODEC_INIT(uint8_t, unsigned long long, uint32_t); +HF_CODEC_INIT(uint16_t, unsigned long long, uint32_t); +HF_CODEC_INIT(uint32_t, unsigned long long, uint32_t); +HF_CODEC_INIT(float, unsigned long long, uint32_t); + +#undef HFBOOK_INIT +#undef HF_CODEC_INIT diff --git a/qtensor/compression/cusz/src/hf/hf_pimpl.cu b/qtensor/compression/cusz/src/hf/hf_pimpl.cu index 595ccea4..08a35282 100644 --- a/qtensor/compression/cusz/src/hf/hf_pimpl.cu +++ 
b/qtensor/compression/cusz/src/hf/hf_pimpl.cu @@ -1,31 +1,31 @@ -/** - * @file huffman_coarse.cu - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2021-12-17 - * (created) 2020-04-24 (rev1) 2021-09-05 (rev2) 2021-12-29 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * @copyright (C) 2021 by Washington State University, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#include "detail/hf_pimpl.inl" -#include "hf/hf.hh" - -#define HUFFCOARSE(E, ETF, H, M) \ - template class cusz::LosslessCodec::type, HuffTrait::type, MetadataTrait::type>::impl; - -HUFFCOARSE(1, false, 4, 4) // uint -HUFFCOARSE(1, false, 8, 4) // -HUFFCOARSE(2, false, 4, 4) // -HUFFCOARSE(2, false, 8, 4) // -HUFFCOARSE(4, false, 4, 4) // -HUFFCOARSE(4, false, 8, 4) // - -HUFFCOARSE(4, true, 4, 4) // float -HUFFCOARSE(4, true, 8, 4) // - -#undef HUFFCOARSE +/** + * @file huffman_coarse.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2021-12-17 + * (created) 2020-04-24 (rev1) 2021-09-05 (rev2) 2021-12-29 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * @copyright (C) 2021 by Washington State University, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include "detail/hf_pimpl.inl" +#include "hf/hf.hh" + +#define HUFFCOARSE(E, ETF, H, M) \ + template class cusz::LosslessCodec::type, HuffTrait::type, MetadataTrait::type>::impl; + +HUFFCOARSE(1, false, 4, 4) // uint +HUFFCOARSE(1, false, 8, 4) // +HUFFCOARSE(2, false, 4, 4) // +HUFFCOARSE(2, false, 8, 4) // +HUFFCOARSE(4, false, 4, 4) // +HUFFCOARSE(4, false, 8, 4) // + +HUFFCOARSE(4, true, 4, 4) // float +HUFFCOARSE(4, true, 8, 4) // + +#undef HUFFCOARSE diff --git a/qtensor/compression/cusz/src/kernel/claunch_cuda.cu b/qtensor/compression/cusz/src/kernel/claunch_cuda.cu index 5433d7d8..146a8cd1 100644 --- a/qtensor/compression/cusz/src/kernel/claunch_cuda.cu +++ b/qtensor/compression/cusz/src/kernel/claunch_cuda.cu @@ -1,76 +1,76 @@ -/** - * @file kernel_cuda.cc - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-07-24 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#include "detail/hist.inl" -#include "detail/spline3.inl" -// #include "hf/hf_codecg.hh" -// #include "hf/hf_struct.h" -#include "kernel/claunch_cuda.h" -#include "kernel/cpplaunch_cuda.hh" -#include "utils/cuda_err.cuh" - -#define C_SPLINE3(Tliteral, Eliteral, FPliteral, T, E, FP) \ - cusz_error_status claunch_construct_Spline3_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ - bool NO_R_SEPARATE, T* data, dim3 const len3, T* anchor, dim3 const an_len3, E* errctrl, dim3 const ec_len3, \ - double const eb, int const radius, float* time_elapsed, cudaStream_t stream) \ - { \ - if (NO_R_SEPARATE) \ - launch_construct_Spline3( \ - data, len3, anchor, an_len3, errctrl, ec_len3, eb, radius, *time_elapsed, stream); \ - else \ - launch_construct_Spline3( \ - data, len3, anchor, an_len3, errctrl, ec_len3, eb, radius, *time_elapsed, stream); \ - return CUSZ_SUCCESS; \ - } \ - cusz_error_status claunch_reconstruct_Spline3_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ - T* xdata, dim3 const len3, T* anchor, dim3 const an_len3, E* errctrl, dim3 const ec_len3, double const eb, \ - int const radius, float* time_elapsed, cudaStream_t stream) \ - { \ - launch_reconstruct_Spline3( \ - xdata, len3, anchor, an_len3, errctrl, ec_len3, eb, radius, 
*time_elapsed, stream); \ - return CUSZ_SUCCESS; \ - } - -C_SPLINE3(fp32, ui8, fp32, float, uint8_t, float); -C_SPLINE3(fp32, ui16, fp32, float, uint16_t, float); -C_SPLINE3(fp32, ui32, fp32, float, uint32_t, float); -C_SPLINE3(fp32, fp32, fp32, float, float, float); - -#undef C_SPLINE3 - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#define CPP_SPLINE3(Tliteral, Eliteral, FPliteral, T, E, FP) \ - template <> \ - cusz_error_status cusz::cpplaunch_construct_Spline3( \ - bool NO_R_SEPARATE, T* data, dim3 const len3, T* anchor, dim3 const an_len3, E* eq, dim3 const ec_len3, \ - double const eb, int const radius, float* time_elapsed, cudaStream_t stream) \ - { \ - return claunch_construct_Spline3_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ - NO_R_SEPARATE, data, len3, anchor, an_len3, eq, ec_len3, eb, radius, time_elapsed, stream); \ - } \ - \ - template <> \ - cusz_error_status cusz::cpplaunch_reconstruct_Spline3( \ - T * xdata, dim3 const len3, T* anchor, dim3 const an_len3, E* eq, dim3 const ec_len3, double const eb, \ - int const radius, float* time_elapsed, cudaStream_t stream) \ - { \ - return claunch_reconstruct_Spline3_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ - xdata, len3, anchor, an_len3, eq, ec_len3, eb, radius, time_elapsed, stream); \ - } - -CPP_SPLINE3(fp32, ui8, fp32, float, uint8_t, float); -CPP_SPLINE3(fp32, ui16, fp32, float, uint16_t, float); -CPP_SPLINE3(fp32, ui32, fp32, float, uint32_t, float); -CPP_SPLINE3(fp32, fp32, fp32, float, float, float); - -#undef CPP_SPLINE3 +/** + * @file kernel_cuda.cc + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-07-24 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#include "detail/hist.inl" +#include "detail/spline3.inl" +// #include "hf/hf_codecg.hh" +// #include "hf/hf_struct.h" +#include "kernel/claunch_cuda.h" +#include "kernel/cpplaunch_cuda.hh" +#include "utils/cuda_err.cuh" + +#define C_SPLINE3(Tliteral, Eliteral, FPliteral, T, E, FP) \ + cusz_error_status claunch_construct_Spline3_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + bool NO_R_SEPARATE, T* data, dim3 const len3, T* anchor, dim3 const an_len3, E* errctrl, dim3 const ec_len3, \ + double const eb, int const radius, float* time_elapsed, cudaStream_t stream) \ + { \ + if (NO_R_SEPARATE) \ + launch_construct_Spline3( \ + data, len3, anchor, an_len3, errctrl, ec_len3, eb, radius, *time_elapsed, stream); \ + else \ + launch_construct_Spline3( \ + data, len3, anchor, an_len3, errctrl, ec_len3, eb, radius, *time_elapsed, stream); \ + return CUSZ_SUCCESS; \ + } \ + cusz_error_status claunch_reconstruct_Spline3_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + T* xdata, dim3 const len3, T* anchor, dim3 const an_len3, E* errctrl, dim3 const ec_len3, double const eb, \ + int const radius, float* time_elapsed, cudaStream_t stream) \ + { \ + launch_reconstruct_Spline3( \ + xdata, len3, anchor, an_len3, errctrl, ec_len3, eb, radius, *time_elapsed, stream); \ + return CUSZ_SUCCESS; \ + } + +C_SPLINE3(fp32, ui8, fp32, float, uint8_t, float); +C_SPLINE3(fp32, ui16, fp32, float, uint16_t, float); +C_SPLINE3(fp32, ui32, fp32, float, uint32_t, float); +C_SPLINE3(fp32, fp32, fp32, float, float, float); + +#undef C_SPLINE3 + 
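For readability, this is approximately what one C_SPLINE3 expansion in claunch_cuda.cu produces for the (fp32, ui16, fp32) line, i.e. T = float, E = uint16_t, FP = float. Only the generated names and signatures are shown; the real functions forward to the launch_construct/launch_reconstruct_Spline3 templates as in the macro body above:

    // Approximate expansion of C_SPLINE3(fp32, ui16, fp32, float, uint16_t, float); declarations only.
    cusz_error_status claunch_construct_Spline3_Tfp32_Eui16_FPfp32(
        bool         NO_R_SEPARATE,
        float*       data,
        dim3 const   len3,
        float*       anchor,
        dim3 const   an_len3,
        uint16_t*    errctrl,
        dim3 const   ec_len3,
        double const eb,
        int const    radius,
        float*       time_elapsed,
        cudaStream_t stream);

    cusz_error_status claunch_reconstruct_Spline3_Tfp32_Eui16_FPfp32(
        float*       xdata,
        dim3 const   len3,
        float*       anchor,
        dim3 const   an_len3,
        uint16_t*    errctrl,
        dim3 const   ec_len3,
        double const eb,
        int const    radius,
        float*       time_elapsed,
        cudaStream_t stream);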
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#define CPP_SPLINE3(Tliteral, Eliteral, FPliteral, T, E, FP) \ + template <> \ + cusz_error_status cusz::cpplaunch_construct_Spline3( \ + bool NO_R_SEPARATE, T* data, dim3 const len3, T* anchor, dim3 const an_len3, E* eq, dim3 const ec_len3, \ + double const eb, int const radius, float* time_elapsed, cudaStream_t stream) \ + { \ + return claunch_construct_Spline3_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + NO_R_SEPARATE, data, len3, anchor, an_len3, eq, ec_len3, eb, radius, time_elapsed, stream); \ + } \ + \ + template <> \ + cusz_error_status cusz::cpplaunch_reconstruct_Spline3( \ + T * xdata, dim3 const len3, T* anchor, dim3 const an_len3, E* eq, dim3 const ec_len3, double const eb, \ + int const radius, float* time_elapsed, cudaStream_t stream) \ + { \ + return claunch_reconstruct_Spline3_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + xdata, len3, anchor, an_len3, eq, ec_len3, eb, radius, time_elapsed, stream); \ + } + +CPP_SPLINE3(fp32, ui8, fp32, float, uint8_t, float); +CPP_SPLINE3(fp32, ui16, fp32, float, uint16_t, float); +CPP_SPLINE3(fp32, ui32, fp32, float, uint32_t, float); +CPP_SPLINE3(fp32, fp32, fp32, float, float, float); + +#undef CPP_SPLINE3 diff --git a/qtensor/compression/cusz/src/kernel/detail/hist.inl b/qtensor/compression/cusz/src/kernel/detail/hist.inl index a3781eb6..1950970d 100644 --- a/qtensor/compression/cusz/src/kernel/detail/hist.inl +++ b/qtensor/compression/cusz/src/kernel/detail/hist.inl @@ -1,100 +1,100 @@ -/** - * @file hist.inl - * @author Cody Rivera (cjrivera1@crimson.ua.edu), Megan Hickman Fulp (mlhickm@g.clemson.edu) - * @brief Fast histogramming from [Gómez-Luna et al. 2013] - * @version 0.1 - * @date 2020-09-20 - * Created on 2020-02-16 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#ifndef CUSZ_KERNEL_HIST_CUH -#define CUSZ_KERNEL_HIST_CUH - -#include -#include -#include - -#include "common.hh" -#include "utils/timer.h" - -#define MIN(a, b) ((a) < (b)) ? (a) : (b) -const static unsigned int WARP_SIZE = 32; - -#define tix threadIdx.x -#define tiy threadIdx.y -#define tiz threadIdx.z -#define bix blockIdx.x -#define biy blockIdx.y -#define biz blockIdx.z -#define bdx blockDim.x -#define bdy blockDim.y -#define bdz blockDim.z - -namespace kernel { - -template -__global__ void NaiveHistogram(Input in_data[], int out_freq[], int N, int symbols_per_thread); - -/* Copied from J. 
Gomez-Luna et al */ -template -__global__ void p2013Histogram(T*, FREQ*, size_t, int, int); - -} // namespace kernel - -template -__global__ void kernel::NaiveHistogram(T in_data[], int out_freq[], int N, int symbols_per_thread) -{ - unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; - unsigned int j; - if (i * symbols_per_thread < N) { // if there is a symbol to count, - for (j = i * symbols_per_thread; j < (i + 1) * symbols_per_thread; j++) { - if (j < N) { - unsigned int item = in_data[j]; // Symbol to count - atomicAdd(&out_freq[item], 1); // update bin count by 1 - } - } - } -} - -template -__global__ void kernel::p2013Histogram(T* in_data, FREQ* out_freq, size_t N, int nbin, int R) -{ - // static_assert( - // std::numeric_limits::is_integer and (not std::numeric_limits::is_signed), - // "T must be `unsigned integer` type of {1,2,4} bytes"); - - extern __shared__ int Hs[/*(nbin + 1) * R*/]; - - const unsigned int warp_id = (int)(tix / WARP_SIZE); - const unsigned int lane = tix % WARP_SIZE; - const unsigned int warps_block = bdx / WARP_SIZE; - const unsigned int off_rep = (nbin + 1) * (tix % R); - const unsigned int begin = (N / warps_block) * warp_id + WARP_SIZE * blockIdx.x + lane; - unsigned int end = (N / warps_block) * (warp_id + 1); - const unsigned int step = WARP_SIZE * gridDim.x; - - // final warp handles data outside of the warps_block partitions - if (warp_id >= warps_block - 1) end = N; - - for (unsigned int pos = tix; pos < (nbin + 1) * R; pos += bdx) Hs[pos] = 0; - __syncthreads(); - - for (unsigned int i = begin; i < end; i += step) { - int d = in_data[i]; - d = d <= 0 and d >= nbin ? nbin / 2 : d; - atomicAdd(&Hs[off_rep + d], 1); - } - __syncthreads(); - - for (unsigned int pos = tix; pos < nbin; pos += bdx) { - int sum = 0; - for (int base = 0; base < (nbin + 1) * R; base += nbin + 1) { sum += Hs[base + pos]; } - atomicAdd(out_freq + pos, sum); - } -} - -#endif +/** + * @file hist.inl + * @author Cody Rivera (cjrivera1@crimson.ua.edu), Megan Hickman Fulp (mlhickm@g.clemson.edu) + * @brief Fast histogramming from [Gómez-Luna et al. 2013] + * @version 0.1 + * @date 2020-09-20 + * Created on 2020-02-16 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef CUSZ_KERNEL_HIST_CUH +#define CUSZ_KERNEL_HIST_CUH + +#include +#include +#include + +#include "common.hh" +#include "utils/timer.h" + +#define MIN(a, b) ((a) < (b)) ? (a) : (b) +const static unsigned int WARP_SIZE = 32; + +#define tix threadIdx.x +#define tiy threadIdx.y +#define tiz threadIdx.z +#define bix blockIdx.x +#define biy blockIdx.y +#define biz blockIdx.z +#define bdx blockDim.x +#define bdy blockDim.y +#define bdz blockDim.z + +namespace kernel { + +template +__global__ void NaiveHistogram(Input in_data[], int out_freq[], int N, int symbols_per_thread); + +/* Copied from J. 
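A note on launching p2013Histogram above: the kernel privatizes the histogram into R replicated copies of (nbin + 1) bins in dynamic shared memory (each thread updates copy tix % R), so the launch must pass (nbin + 1) * R * sizeof(int) as the shared-memory size. A hedged sizing sketch follows; the way R is chosen here (fit as many replicas as the per-block shared-memory budget allows) is an illustrative assumption, not the heuristic cuSZ actually uses:

    // Hypothetical launch-configuration helper for p2013Histogram.
    #include <cstdio>
    #include <cuda_runtime.h>

    int main()
    {
        int nbin      = 1024;   // number of quant-code bins (hypothetical)
        int block_dim = 256;    // threads per block (hypothetical)

        cudaDeviceProp prop{};
        cudaGetDeviceProperties(&prop, 0);

        // Largest replication factor R such that (nbin + 1) * R ints fit in shared memory.
        size_t per_copy = (nbin + 1) * sizeof(int);
        int    R        = (int)(prop.sharedMemPerBlock / per_copy);
        if (R < 1) R = 1;
        if (R > block_dim) R = block_dim;   // more copies than threads is pointless

        size_t shmem_bytes = per_copy * R;
        std::printf("R = %d, dynamic shared memory = %zu bytes\n", R, shmem_bytes);

        // The kernel would then be launched roughly as:
        //   kernel::p2013Histogram<T, FREQ><<<grid_dim, block_dim, shmem_bytes, stream>>>
        //       (d_in, d_freq, len, nbin, R);
        return 0;
    }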
Gomez-Luna et al */ +template +__global__ void p2013Histogram(T*, FREQ*, size_t, int, int); + +} // namespace kernel + +template +__global__ void kernel::NaiveHistogram(T in_data[], int out_freq[], int N, int symbols_per_thread) +{ + unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; + unsigned int j; + if (i * symbols_per_thread < N) { // if there is a symbol to count, + for (j = i * symbols_per_thread; j < (i + 1) * symbols_per_thread; j++) { + if (j < N) { + unsigned int item = in_data[j]; // Symbol to count + atomicAdd(&out_freq[item], 1); // update bin count by 1 + } + } + } +} + +template +__global__ void kernel::p2013Histogram(T* in_data, FREQ* out_freq, size_t N, int nbin, int R) +{ + // static_assert( + // std::numeric_limits::is_integer and (not std::numeric_limits::is_signed), + // "T must be `unsigned integer` type of {1,2,4} bytes"); + + extern __shared__ int Hs[/*(nbin + 1) * R*/]; + + const unsigned int warp_id = (int)(tix / WARP_SIZE); + const unsigned int lane = tix % WARP_SIZE; + const unsigned int warps_block = bdx / WARP_SIZE; + const unsigned int off_rep = (nbin + 1) * (tix % R); + const unsigned int begin = (N / warps_block) * warp_id + WARP_SIZE * blockIdx.x + lane; + unsigned int end = (N / warps_block) * (warp_id + 1); + const unsigned int step = WARP_SIZE * gridDim.x; + + // final warp handles data outside of the warps_block partitions + if (warp_id >= warps_block - 1) end = N; + + for (unsigned int pos = tix; pos < (nbin + 1) * R; pos += bdx) Hs[pos] = 0; + __syncthreads(); + + for (unsigned int i = begin; i < end; i += step) { + int d = in_data[i]; + d = d <= 0 and d >= nbin ? nbin / 2 : d; + atomicAdd(&Hs[off_rep + d], 1); + } + __syncthreads(); + + for (unsigned int pos = tix; pos < nbin; pos += bdx) { + int sum = 0; + for (int base = 0; base < (nbin + 1) * R; base += nbin + 1) { sum += Hs[base + pos]; } + atomicAdd(out_freq + pos, sum); + } +} + +#endif diff --git a/qtensor/compression/cusz/src/kernel/detail/lorenzo.inl b/qtensor/compression/cusz/src/kernel/detail/lorenzo.inl index 0e1f9acd..28fd3bdc 100644 --- a/qtensor/compression/cusz/src/kernel/detail/lorenzo.inl +++ b/qtensor/compression/cusz/src/kernel/detail/lorenzo.inl @@ -1,816 +1,816 @@ -/** - * @file lorenzo.inl - * @author Jiannan Tian - * @brief Dual-ErrCtrl Lorenzo method. 
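Both histogram kernels in hist.inl count quant-code symbols into out_freq, so a serial host reference is handy for testing. A minimal sketch is below; the clamp of out-of-range symbols to nbin / 2 follows the apparent intent of p2013Histogram (its guard `d <= 0 and d >= nbin` reads as always false, so a reference clamp uses `or` instead):

    // Host reference histogram used to cross-check the GPU kernels above.
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    template <typename T, typename FREQ = unsigned int>
    std::vector<FREQ> reference_histogram(const T* in, size_t n, int nbin)
    {
        std::vector<FREQ> freq(nbin, 0);
        for (size_t i = 0; i < n; i++) {
            int d = static_cast<int>(in[i]);
            if (d < 0 or d >= nbin) d = nbin / 2;   // clamp stray symbols to the center bin
            freq[d] += 1;
        }
        return freq;
    }

    // Example: uint16_t quant codes with 1024 bins.
    // auto freq = reference_histogram<uint16_t>(h_quant.data(), h_quant.size(), 1024);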
- * @version 0.2 - * @date 2021-01-16 - * (create) 2019-09-23; (release) 2020-09-20; (rev1) 2021-01-16; (rev2) 2021-02-20; (rev3) 2021-04-11 - * (rev4) 2021-04-30 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#ifndef CUSZ_KERNEL_LORENZO_CUH -#define CUSZ_KERNEL_LORENZO_CUH - -#include -// #include "utils/cuda_err.cuh" -// #include "utils/timer.h" - -#if __has_include() -// #pragma message __FILE__ ": (CUDA 11 onward), cub from system path" -#include -#else -// #pragma message __FILE__ ": (CUDA 10 or earlier), cub from git submodule" -#include "../../third_party/cub/cub/cub.cuh" -#endif - -#if __cplusplus >= 201703L -#define CONSTEXPR constexpr -#else -#define CONSTEXPR -#endif - -#define TIX threadIdx.x -#define TIY threadIdx.y -#define TIZ threadIdx.z -#define BIX blockIdx.x -#define BIY blockIdx.y -#define BIZ blockIdx.z -#define BDX blockDim.x -#define BDY blockDim.y -#define BDZ blockDim.z - -using DIM = unsigned int; -using STRIDE = unsigned int; - -namespace cusz { - -/** - * @brief compress-time 1D Lorenzo pred-quant kernel - * - * @tparam Data type of input data - * @tparam ErrCtrl type of error-control code - * @tparam FP type for internal floating-point processing - * @tparam BLOCK block size - * @tparam SEQ degree of sequentiality - * @param data input - * @param errctrl output 1 - * @param outlier output 2 - * @param len3 data length in 3D - * @param stride3 data stride in 3D - * @param radius quant-code radius - * @param ebx2_r precalculated reciprocal of eb*2 - */ -template -__global__ void -c_lorenzo_1d1l(Data* data, ErrCtrl* errctrl, Data* outlier, dim3 len3, dim3 stride3, int radius, FP ebx2_r); - -/** - * @brief compress-time 2D Lorenzo pred-quant kernel - * - * @tparam Data type of input data - * @tparam ErrCtrl type of error-control code - * @tparam FP type for internal floating-point processing - * @tparam BLOCK block size - * @tparam SEQ degree of sequentiality - * @param data input - * @param errctrl output 1 - * @param outlier output 2 - * @param len3 data length in 3D - * @param stride3 data stride in 3D - * @param radius quant-code radius - * @param ebx2_r precalculated reciprocal of eb*2 - */ -template -__global__ void c_lorenzo_2d1l_16x16data_mapto16x2( - Data* data, - ErrCtrl* errctrl, - Data* outlier, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2_r); - -/** - * @brief compress-time 3D Lorenzo pred-quant kernel - * - * @tparam Data type of input data - * @tparam ErrCtrl type of error-control code - * @tparam FP type for internal floating-point processing - * @tparam BLOCK block size - * @tparam SEQ degree of sequentiality - * @param data input - * @param errctrl output 1 - * @param outlier output 2 - * @param len3 data length in 3D - * @param stride3 data stride in 3D - * @param radius quant-code radius - * @param ebx2_r precalculated reciprocal of eb*2 - */ -template -__global__ void c_lorenzo_3d1l_32x8x8data_mapto32x1x8( - Data* data, - ErrCtrl* errctrl, - Data* outlier, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2_r); - -/** - * @brief decompress-time 1D Lorenzo pred-quant kernel - * - * @tparam Data type of input data - * @tparam ErrCtrl type of error-control code - * @tparam FP type for internal floating-point processing - * @tparam BLOCK block size - * @tparam SEQ degree of sequentiality - * @param outlier input 1 - * @param quant input 2 - * @param xdata output - * @param len3 data length in 3D - * @param stride3 data 
stride in 3D - * @param radius quant-code radius - * @param ebx2 precalculated eb*2 - */ -template < - typename Data, - typename ErrCtrl, - typename FP = float, - int BLOCK = 256, - int SEQ = 8> -__global__ void x_lorenzo_1d1l( - Data* outlier, // - ErrCtrl* quant, - Data* xdata, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2); - -/** - * @brief decompress-time 2D Lorenzo pred-quant kernel - * - * @tparam Data type of input data - * @tparam ErrCtrl type of error-control code - * @tparam FP type for internal floating-point processing - * @tparam BLOCK block size - * @tparam SEQ degree of sequentiality - * @param outlier input 1 - * @param quant input 2 - * @param xdata output - * @param len3 data length in 3D - * @param stride3 data stride in 3D - * @param radius quant-code radius - * @param ebx2 precalculated eb*2 - */ -template -__global__ void x_lorenzo_2d1l_16x16data_mapto16x2( - Data* outlier, - ErrCtrl* quant, - Data* xdata, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2); - -/** - * @brief decompress-time 3D Lorenzo pred-quant kernel - * - * @tparam Data type of input data - * @tparam ErrCtrl type of error-control code - * @tparam FP type for internal floating-point processing - * @tparam BLOCK block size - * @tparam SEQ degree of sequentiality - * @param outlier input 1 - * @param quant input 2 - * @param xdata output - * @param len3 data length in 3D - * @param stride3 data stride in 3D - * @param radius quant-code radius - * @param ebx2 precalculated eb*2 - */ -template -__global__ void x_lorenzo_3d1l_32x8x8data_mapto32x1x8( - Data* outlier, - ErrCtrl* quant, - Data* xdata, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2); - -/** - * @brief decompress-time 3D Lorenzo pred-quant kernel (variant) - * - * @tparam Data type of input data - * @tparam ErrCtrl type of error-control code - * @tparam FP type for internal floating-point processing - * @tparam BLOCK block size - * @tparam SEQ degree of sequentiality - * @param outlier input 1 - * @param quant input 2 - * @param xdata output - * @param len3 data length in 3D - * @param stride3 data stride in 3D - * @param radius quant-code radius - * @param ebx2 precalculated eb*2 - */ -template -__global__ void x_lorenzo_3d1lvar_32x8x8data_mapto32x1x8( - Data* outlier, - ErrCtrl* quant, - Data* xdata, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2); - -} // namespace cusz - -namespace { - -/** - * @brief (Original SZ/cuSZ design) 1D: separate delta by radius in to quant-code and outlier - */ -template -__forceinline__ __device__ void pred1d_radius_separate( - Data thread_scope[SEQ], - volatile Data* shmem_data, - volatile ErrCtrl* shmem_quant, - int radius, - Data from_last_stripe = 0) -{ - if CONSTEXPR (FIRST_POINT) { // i == 0 - Data delta = thread_scope[0] - from_last_stripe; - bool quantizable = fabs(delta) < radius; - Data candidate = delta + radius; - shmem_data[0 + TIX * SEQ] = (1 - quantizable) * candidate; // output; reuse data for outlier - shmem_quant[0 + TIX * SEQ] = quantizable * static_cast(candidate); - } - else { -#pragma unroll - for (auto i = 1; i < SEQ; i++) { - Data delta = thread_scope[i] - thread_scope[i - 1]; - bool quantizable = fabs(delta) < radius; - Data candidate = delta + radius; - shmem_data[i + TIX * SEQ] = (1 - quantizable) * candidate; // output; reuse data for outlier - shmem_quant[i + TIX * SEQ] = quantizable * static_cast(candidate); - } - __syncthreads(); - } -} - -template -__forceinline__ __device__ void load1d( - Data* data, - unsigned int dimx, - unsigned int id_base, - volatile 
Data* shmem_data, - Data thread_scope[SEQ], - Data& from_last_stripe, - FP ebx2_r) -{ -#pragma unroll - for (auto i = 0; i < SEQ; i++) { - auto id = id_base + TIX + i * NTHREAD; - if (id < dimx) { shmem_data[TIX + i * NTHREAD] = round(data[id] * ebx2_r); } - } - __syncthreads(); - - for (auto i = 0; i < SEQ; i++) thread_scope[i] = shmem_data[TIX * SEQ + i]; - - if (TIX > 0) from_last_stripe = shmem_data[TIX * SEQ - 1]; - __syncthreads(); -} - -template -__forceinline__ __device__ void write1d( - volatile Data* shmem_data, - Data* data, - unsigned int dimx, - unsigned int id_base, - volatile ErrCtrl* shmem_quant = nullptr, - ErrCtrl* quant = nullptr) -{ -#pragma unroll - for (auto i = 0; i < SEQ; i++) { - auto id = id_base + TIX + i * NTHREAD; - if (id < dimx) { - if CONSTEXPR (NO_R_SEPARATE) { // TODO no-radius-separate uses shmem_data - quant[id] = shmem_data[TIX + i * NTHREAD]; - } - else { - data[id] = shmem_data[TIX + i * NTHREAD]; - quant[id] = shmem_quant[TIX + i * NTHREAD]; - } - } - } -} - -template -__forceinline__ __device__ void load2d_prequant( - Data* data, - Data center[YSEQ + 1], - unsigned int dimx, - unsigned int dimy, - unsigned int stridey, - unsigned int gix, - unsigned int giy_base, - FP ebx2_r) -{ - auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; - -#pragma unroll - for (auto i = 0; i < YSEQ; i++) { - if (gix < dimx and giy_base + i < dimy) center[i + 1] = round(data[get_gid(i)] * ebx2_r); - } - auto tmp = __shfl_up_sync(0xffffffff, center[YSEQ], 16); // same-warp, next-16 - if (TIY == 1) center[0] = tmp; -} - -template -__forceinline__ __device__ void pred2d(Data center[YSEQ + 1]) -{ - /* prediction - original form: Data delta = center[i] - center[i - 1] + west[i] - west[i - 1]; - short form: Data delta = center[i] - west[i]; - */ -#pragma unroll - for (auto i = YSEQ; i > 0; i--) { - center[i] -= center[i - 1]; - auto west = __shfl_up_sync(0xffffffff, center[i], 1, 16); - if (TIX > 0) center[i] -= west; - } - __syncthreads(); -} - -template -__forceinline__ __device__ void postquant_write2d( - Data center[YSEQ + 1], - ErrCtrl* quant, - Data* outlier, - unsigned int dimx, - unsigned int dimy, - unsigned int stridey, - int radius, - unsigned int gix, - unsigned int giy_base) -{ - auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; - -#pragma unroll - for (auto i = 1; i < YSEQ + 1; i++) { - auto gid = get_gid(i - 1); - - if (gix < dimx and giy_base + i - 1 < dimy) { - bool quantizable = fabs(center[i]) < radius; - Data candidate = center[i] + radius; - outlier[gid] = (1 - quantizable) * candidate; // output; reuse data for outlier - quant[gid] = quantizable * static_cast(candidate); - } - } -} - -} // namespace - -template < - typename Data, - typename ErrCtrl, - typename FP, - int BLOCK, - int SEQ> -__global__ void cusz::c_lorenzo_1d1l( // - Data* data, - ErrCtrl* quant, - Data* outlier, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2_r) -{ - constexpr auto NTHREAD = BLOCK / SEQ; - - __shared__ struct { - union { - uint8_t uninitialized[BLOCK * sizeof(Data) + BLOCK * sizeof(ErrCtrl)]; - Data data[BLOCK]; - } space; - } shmem; - - auto id_base = BIX * BLOCK; - - Data thread_scope[SEQ]; - Data from_last_stripe{0}; - - /******************************************************************************** - * load from DRAM using striped layout, perform prequant - ********************************************************************************/ - load1d(data, len3.x, id_base, shmem.space.data, thread_scope, from_last_stripe, 
ebx2_r); - - // the original SZ/cuSZ design - auto shmem_quant = reinterpret_cast(shmem.space.uninitialized + sizeof(Data) * BLOCK); - pred1d_radius_separate( - thread_scope, shmem.space.data, shmem_quant, radius, from_last_stripe); - pred1d_radius_separate(thread_scope, shmem.space.data, shmem_quant, radius); - write1d(shmem.space.data, outlier, len3.x, id_base, shmem_quant, quant); -} - -template -__global__ void cusz::c_lorenzo_2d1l_16x16data_mapto16x2( - Data* data, - ErrCtrl* quant, - Data* outlier, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2_r) -{ - constexpr auto BLOCK = 16; - constexpr auto YSEQ = 8; - - Data center[YSEQ + 1] = {0}; // nw n - // w center - - auto gix = BIX * BDX + TIX; // BDX == 16 - auto giy_base = BIY * BLOCK + TIY * YSEQ; // BDY * YSEQ = BLOCK == 16 - - load2d_prequant(data, center, len3.x, len3.y, stride3.y, gix, giy_base, ebx2_r); - pred2d(center); - postquant_write2d(center, quant, outlier, len3.x, len3.y, stride3.y, radius, gix, giy_base); -} - -template -__global__ void cusz::c_lorenzo_3d1l_32x8x8data_mapto32x1x8( - Data* data, - ErrCtrl* quant, - Data* outlier, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2_r) -{ - constexpr auto BLOCK = 8; - __shared__ Data shmem[8][8][32]; - - auto z = TIZ; - - auto gix = BIX * (BLOCK * 4) + TIX; - auto giy_base = BIY * BLOCK; - auto giz = BIZ * BLOCK + z; - auto base_id = gix + giy_base * stride3.y + giz * stride3.z; - - /******************************************************************************** - * load from DRAM, perform prequant - ********************************************************************************/ - if (gix < len3.x and giz < len3.z) { - for (auto y = 0; y < BLOCK; y++) { - if (giy_base + y < len3.y) { - shmem[z][y][TIX] = round(data[base_id + y * stride3.y] * ebx2_r); // prequant (fp presence) - } - } - } - __syncthreads(); // necessary to ensure correctness - - auto x = TIX % 8; - - for (auto y = 0; y < BLOCK; y++) { - Data delta; - - /******************************************************************************** - * prediction - ********************************************************************************/ - delta = shmem[z][y][TIX] - ((z > 0 and y > 0 and x > 0 ? shmem[z - 1][y - 1][TIX - 1] : 0) // dist=3 - - (y > 0 and x > 0 ? shmem[z][y - 1][TIX - 1] : 0) // dist=2 - - (z > 0 and x > 0 ? shmem[z - 1][y][TIX - 1] : 0) // - - (z > 0 and y > 0 ? shmem[z - 1][y - 1][TIX] : 0) // - + (x > 0 ? shmem[z][y][TIX - 1] : 0) // dist=1 - + (y > 0 ? shmem[z][y - 1][TIX] : 0) // - + (z > 0 ? 
shmem[z - 1][y][TIX] : 0)); // - - auto id = base_id + (y * stride3.y); - - bool quantizable = fabs(delta) < radius; - Data candidate = delta + radius; - if (gix < len3.x and (giy_base + y) < len3.y and giz < len3.z) { - outlier[id] = (1 - quantizable) * candidate; // output; reuse data for outlier - quant[id] = quantizable * static_cast(candidate); - } - } - /* EOF */ -} - -template -__global__ void cusz::x_lorenzo_1d1l( // - Data* outlier, - ErrCtrl* quant, - Data* xdata, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2) -{ - constexpr auto block_dim = BLOCK / SEQ; // dividable - - // coalesce-load (warp-striped) and transpose in shmem (similar for store) - typedef cub::BlockLoad BlockLoadT_outlier; - typedef cub::BlockLoad BlockLoadT_quant; - typedef cub::BlockStore BlockStoreT_xdata; - typedef cub::BlockScan - BlockScanT_xdata; // TODO autoselect algorithm - - __shared__ union TempStorage { // overlap shared memory space - typename BlockLoadT_outlier::TempStorage load_outlier; - typename BlockLoadT_quant::TempStorage load_quant; - typename BlockStoreT_xdata::TempStorage store_xdata; - typename BlockScanT_xdata::TempStorage scan_xdata; - } temp_storage; - - // thread-scope tiled data - union ThreadData { - Data xdata[SEQ]; - Data outlier[SEQ]; - } thread_scope; - ErrCtrl thread_scope_quant[SEQ]; - - /******************************************************************************** - * load to thread-private array (fuse at the same time) - * (BIX * BDX * SEQ) denotes the start of the data chunk that belongs to this thread block - ********************************************************************************/ - BlockLoadT_quant(temp_storage.load_quant).Load(quant + (BIX * BDX) * SEQ, thread_scope_quant); - __syncthreads(); // barrier for shmem reuse - BlockLoadT_outlier(temp_storage.load_outlier).Load(outlier + (BIX * BDX) * SEQ, thread_scope.outlier); - __syncthreads(); // barrier for shmem reuse - -#pragma unroll - for (auto i = 0; i < SEQ; i++) { - auto id = (BIX * BDX + TIX) * SEQ + i; - thread_scope.xdata[i] = - id < len3.x ? thread_scope.outlier[i] + static_cast(thread_scope_quant[i]) - radius : 0; - } - __syncthreads(); - - /******************************************************************************** - * perform partial-sum using cub::InclusiveSum - ********************************************************************************/ - BlockScanT_xdata(temp_storage.scan_xdata).InclusiveSum(thread_scope.xdata, thread_scope.xdata); - __syncthreads(); // barrier for shmem reuse - - /******************************************************************************** - * scale by ebx2 and write to DRAM - ********************************************************************************/ -#pragma unroll - for (auto i = 0; i < SEQ; i++) thread_scope.xdata[i] *= ebx2; - __syncthreads(); // barrier for shmem reuse - - BlockStoreT_xdata(temp_storage.store_xdata).Store(xdata + (BIX * BDX) * SEQ, thread_scope.xdata); -} - -template -__global__ void cusz::x_lorenzo_2d1l_16x16data_mapto16x2( - Data* outlier, - ErrCtrl* quant, - Data* xdata, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2) -{ - constexpr auto BLOCK = 16; - constexpr auto YSEQ = BLOCK / 2; // sequentiality in y direction - static_assert(BLOCK == 16, "In one case, we need BLOCK for 2D == 16"); - - __shared__ Data intermediate[BLOCK]; // TODO use warp shuffle to eliminate this - Data thread_scope[YSEQ]; - /* - . ------> gix (x) - | t00 t01 t02 t03 ... 
t0f - | ts00_0 ts00_0 ts00_0 ts00_0 - giy ts00_1 ts00_1 ts00_1 ts00_1 - (y) | | | | - ts00_7 ts00_7 ts00_7 ts00_7 - - | t10 t11 t12 t13 ... t1f - | ts00_0 ts00_0 ts00_0 ts00_0 - giy ts00_1 ts00_1 ts00_1 ts00_1 - (y) | | | | - ts00_7 ts00_7 ts00_7 ts00_7 - */ - - auto gix = BIX * BLOCK + TIX; - auto giy_base = BIY * BLOCK + TIY * YSEQ; // BDY * YSEQ = BLOCK == 16 - auto get_gid = [&](auto i) { return (giy_base + i) * stride3.y + gix; }; - - /******************************************************************************** - * load to thread-private array (fuse at the same time) - ********************************************************************************/ -#pragma unroll - for (auto i = 0; i < YSEQ; i++) { - auto gid = get_gid(i); - // even if we hit the else branch, all threads in a warp hit the y-boundary simultaneously - if (gix < len3.x and giy_base + i < len3.y) - thread_scope[i] = outlier[gid] + static_cast(quant[gid]) - radius; // fuse - else - thread_scope[i] = 0; // TODO set as init state? - } - - /******************************************************************************** - * partial-sum along y-axis, sequantially - ********************************************************************************/ - for (auto i = 1; i < YSEQ; i++) thread_scope[i] += thread_scope[i - 1]; - // two-pass: store for cross-threadscope update - if (TIY == 0) intermediate[TIX] = thread_scope[YSEQ - 1]; - __syncthreads(); - // two-pass: load and update - if (TIY == 1) { - auto tmp = intermediate[TIX]; -#pragma unroll - for (auto& i : thread_scope) i += tmp; - } - - /******************************************************************************** - * in-warp partial-sum along x-axis - ********************************************************************************/ -#pragma unroll - for (auto& i : thread_scope) { - for (auto d = 1; d < BLOCK; d *= 2) { - Data n = __shfl_up_sync(0xffffffff, i, d, 16); - if (TIX >= d) i += n; - } - i *= ebx2; - } - - /******************************************************************************** - * write to DRAM - ********************************************************************************/ -#pragma unroll - for (auto i = 0; i < YSEQ; i++) { - auto gid = get_gid(i); - if (gix < len3.x and giy_base + i < len3.y) xdata[gid] = thread_scope[i]; - } -} - -template -__global__ void cusz::x_lorenzo_3d1l_32x8x8data_mapto32x1x8( - Data* outlier, - ErrCtrl* quant, - Data* xdata, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2) -{ - constexpr auto BLOCK = 8; - constexpr auto YSEQ = BLOCK; - static_assert(BLOCK == 8, "In one case, we need BLOCK for 3D == 8"); - - __shared__ Data intermediate[BLOCK][4][8]; - Data thread_scope[YSEQ]; - - auto seg_id = TIX / 8; - auto seg_tix = TIX % 8; - - auto gix = BIX * (4 * BLOCK) + TIX, giy_base = BIY * BLOCK, giz = BIZ * BLOCK + TIZ; - auto get_gid = [&](auto y) { return giz * stride3.z + (giy_base + y) * stride3.y + gix; }; - - /******************************************************************************** - * load to thread-private array (fuse at the same time) - ********************************************************************************/ -#pragma unroll - for (auto y = 0; y < YSEQ; y++) { - auto gid = get_gid(y); - if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) - thread_scope[y] = outlier[gid] + static_cast(quant[gid]) - static_cast(radius); // fuse - else - thread_scope[y] = 0; - } - - /******************************************************************************** - * partial-sum along y-axis, 
sequantially - ********************************************************************************/ - for (auto y = 1; y < YSEQ; y++) thread_scope[y] += thread_scope[y - 1]; - - /******************************************************************************** - * ND partial-sums along x- and z-axis - * in-warp shuffle used: in order to perform, it's transposed after X-partial sum - ********************************************************************************/ - auto dist = 1; - Data addend; - -#pragma unroll - for (auto i = 0; i < BLOCK; i++) { - Data val = thread_scope[i]; - - for (dist = 1; dist < BLOCK; dist *= 2) { - addend = __shfl_up_sync(0xffffffff, val, dist, 8); - if (seg_tix >= dist) val += addend; - } - - // x-z transpose - intermediate[TIZ][seg_id][seg_tix] = val; - __syncthreads(); - val = intermediate[seg_tix][seg_id][TIZ]; - __syncthreads(); - - for (dist = 1; dist < BLOCK; dist *= 2) { - addend = __shfl_up_sync(0xffffffff, val, dist, 8); - if (seg_tix >= dist) val += addend; - } - - intermediate[TIZ][seg_id][seg_tix] = val; - __syncthreads(); - val = intermediate[seg_tix][seg_id][TIZ]; - __syncthreads(); - - thread_scope[i] = val; - } - - /******************************************************************************** - * write to DRAM - ********************************************************************************/ -#pragma unroll - for (auto y = 0; y < YSEQ; y++) { - if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) { xdata[get_gid(y)] = thread_scope[y] * ebx2; } - } - /* EOF */ -} - -/******************************************************************************** - * experimental prototype toward further optmization - ********************************************************************************/ -template -__global__ void cusz::x_lorenzo_3d1lvar_32x8x8data_mapto32x1x8( - Data* outlier, - ErrCtrl* quant, - Data* xdata, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2) -{ - constexpr auto BLOCK = 8; - constexpr auto YSEQ = BLOCK; - static_assert(BLOCK == 8, "In one case, we need BLOCK for 3D == 8"); - - __shared__ Data intermediate[BLOCK][4][8]; - Data thread_scope = 0; - - auto seg_id = TIX / 8; - auto seg_tix = TIX % 8; - - auto gix = BIX * (4 * BLOCK) + TIX, giy_base = BIY * BLOCK, giz = BIZ * BLOCK + TIZ; - auto get_gid = [&](auto y) { return giz * stride3.z + (giy_base + y) * stride3.y + gix; }; - - auto y = 0; - - // even if we hit the else branch, all threads in a warp hit the y-boundary simultaneously -#pragma unroll - for (y = 0; y < YSEQ; y++) { - auto gid = get_gid(y); - if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) - thread_scope += outlier[gid] + static_cast(quant[gid]) - static_cast(radius); // fuse - - Data val = thread_scope; - - // shuffle, ND partial-sums - for (auto dist = 1; dist < BLOCK; dist *= 2) { - Data addend = __shfl_up_sync(0xffffffff, val, dist, 8); - if (seg_tix >= dist) val += addend; - } - - // x-z transpose - intermediate[TIZ][seg_id][seg_tix] = val; - __syncthreads(); - val = intermediate[seg_tix][seg_id][TIZ]; - __syncthreads(); - - for (auto dist = 1; dist < BLOCK; dist *= 2) { - Data addend = __shfl_up_sync(0xffffffff, val, dist, 8); - if (seg_tix >= dist) val += addend; - } - - intermediate[TIZ][seg_id][seg_tix] = val; - __syncthreads(); - val = intermediate[seg_tix][seg_id][TIZ]; - __syncthreads(); - - // thread_scope += val; - - if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) { xdata[get_gid(y)] = val * ebx2; } - } -} - -#undef TIX -#undef TIY -#undef TIZ -#undef BIX -#undef BIY 
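To summarize the compression-side kernels in lorenzo.inl: values are pre-quantized with round(x * ebx2_r), where ebx2_r is the precalculated reciprocal of eb * 2, predicted from their Lorenzo neighbourhood (the previous element in the 1D case), and the delta is either stored as a quant code (delta + radius, when |delta| < radius) or spilled to the outlier array. A serial host sketch of the 1D path, written for clarity rather than matching the blocked CUDA layout:

    // Serial reference of the 1D Lorenzo compress path (prequant -> predict -> radius-separate).
    #include <cmath>
    #include <cstdint>
    #include <vector>

    void lorenzo_1d_compress_ref(
        const std::vector<float>& data,
        double                    eb,       // absolute error bound
        int                       radius,   // quant-code radius
        std::vector<uint16_t>&    quant,
        std::vector<float>&       outlier)
    {
        const double ebx2_r = 1.0 / (eb * 2);
        quant.assign(data.size(), 0);
        outlier.assign(data.size(), 0);

        float prev = 0;                                    // first element is predicted from 0
        for (size_t i = 0; i < data.size(); i++) {
            float cur   = std::round(data[i] * ebx2_r);    // prequant
            float delta = cur - prev;                      // 1D Lorenzo prediction
            prev        = cur;

            bool  quantizable = std::fabs(delta) < radius;
            float candidate   = delta + radius;
            if (quantizable)
                quant[i] = static_cast<uint16_t>(candidate);
            else
                outlier[i] = candidate;                    // outlier slot reuses the data layout
        }
    }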
-#undef BIZ -#undef BDX -#undef BDY -#undef BDZ - -#endif +/** + * @file lorenzo.inl + * @author Jiannan Tian + * @brief Dual-ErrCtrl Lorenzo method. + * @version 0.2 + * @date 2021-01-16 + * (create) 2019-09-23; (release) 2020-09-20; (rev1) 2021-01-16; (rev2) 2021-02-20; (rev3) 2021-04-11 + * (rev4) 2021-04-30 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef CUSZ_KERNEL_LORENZO_CUH +#define CUSZ_KERNEL_LORENZO_CUH + +#include +// #include "utils/cuda_err.cuh" +// #include "utils/timer.h" + +#if __has_include() +// #pragma message __FILE__ ": (CUDA 11 onward), cub from system path" +#include +#else +// #pragma message __FILE__ ": (CUDA 10 or earlier), cub from git submodule" +#include "../../third_party/cub/cub/cub.cuh" +#endif + +#if __cplusplus >= 201703L +#define CONSTEXPR constexpr +#else +#define CONSTEXPR +#endif + +#define TIX threadIdx.x +#define TIY threadIdx.y +#define TIZ threadIdx.z +#define BIX blockIdx.x +#define BIY blockIdx.y +#define BIZ blockIdx.z +#define BDX blockDim.x +#define BDY blockDim.y +#define BDZ blockDim.z + +using DIM = unsigned int; +using STRIDE = unsigned int; + +namespace cusz { + +/** + * @brief compress-time 1D Lorenzo pred-quant kernel + * + * @tparam Data type of input data + * @tparam ErrCtrl type of error-control code + * @tparam FP type for internal floating-point processing + * @tparam BLOCK block size + * @tparam SEQ degree of sequentiality + * @param data input + * @param errctrl output 1 + * @param outlier output 2 + * @param len3 data length in 3D + * @param stride3 data stride in 3D + * @param radius quant-code radius + * @param ebx2_r precalculated reciprocal of eb*2 + */ +template +__global__ void +c_lorenzo_1d1l(Data* data, ErrCtrl* errctrl, Data* outlier, dim3 len3, dim3 stride3, int radius, FP ebx2_r); + +/** + * @brief compress-time 2D Lorenzo pred-quant kernel + * + * @tparam Data type of input data + * @tparam ErrCtrl type of error-control code + * @tparam FP type for internal floating-point processing + * @tparam BLOCK block size + * @tparam SEQ degree of sequentiality + * @param data input + * @param errctrl output 1 + * @param outlier output 2 + * @param len3 data length in 3D + * @param stride3 data stride in 3D + * @param radius quant-code radius + * @param ebx2_r precalculated reciprocal of eb*2 + */ +template +__global__ void c_lorenzo_2d1l_16x16data_mapto16x2( + Data* data, + ErrCtrl* errctrl, + Data* outlier, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r); + +/** + * @brief compress-time 3D Lorenzo pred-quant kernel + * + * @tparam Data type of input data + * @tparam ErrCtrl type of error-control code + * @tparam FP type for internal floating-point processing + * @tparam BLOCK block size + * @tparam SEQ degree of sequentiality + * @param data input + * @param errctrl output 1 + * @param outlier output 2 + * @param len3 data length in 3D + * @param stride3 data stride in 3D + * @param radius quant-code radius + * @param ebx2_r precalculated reciprocal of eb*2 + */ +template +__global__ void c_lorenzo_3d1l_32x8x8data_mapto32x1x8( + Data* data, + ErrCtrl* errctrl, + Data* outlier, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r); + +/** + * @brief decompress-time 1D Lorenzo pred-quant kernel + * + * @tparam Data type of input data + * @tparam ErrCtrl type of error-control code + * @tparam FP type for internal floating-point processing + * @tparam BLOCK block size + * @tparam SEQ degree of 
sequentiality + * @param outlier input 1 + * @param quant input 2 + * @param xdata output + * @param len3 data length in 3D + * @param stride3 data stride in 3D + * @param radius quant-code radius + * @param ebx2 precalculated eb*2 + */ +template < + typename Data, + typename ErrCtrl, + typename FP = float, + int BLOCK = 256, + int SEQ = 8> +__global__ void x_lorenzo_1d1l( + Data* outlier, // + ErrCtrl* quant, + Data* xdata, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2); + +/** + * @brief decompress-time 2D Lorenzo pred-quant kernel + * + * @tparam Data type of input data + * @tparam ErrCtrl type of error-control code + * @tparam FP type for internal floating-point processing + * @tparam BLOCK block size + * @tparam SEQ degree of sequentiality + * @param outlier input 1 + * @param quant input 2 + * @param xdata output + * @param len3 data length in 3D + * @param stride3 data stride in 3D + * @param radius quant-code radius + * @param ebx2 precalculated eb*2 + */ +template +__global__ void x_lorenzo_2d1l_16x16data_mapto16x2( + Data* outlier, + ErrCtrl* quant, + Data* xdata, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2); + +/** + * @brief decompress-time 3D Lorenzo pred-quant kernel + * + * @tparam Data type of input data + * @tparam ErrCtrl type of error-control code + * @tparam FP type for internal floating-point processing + * @tparam BLOCK block size + * @tparam SEQ degree of sequentiality + * @param outlier input 1 + * @param quant input 2 + * @param xdata output + * @param len3 data length in 3D + * @param stride3 data stride in 3D + * @param radius quant-code radius + * @param ebx2 precalculated eb*2 + */ +template +__global__ void x_lorenzo_3d1l_32x8x8data_mapto32x1x8( + Data* outlier, + ErrCtrl* quant, + Data* xdata, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2); + +/** + * @brief decompress-time 3D Lorenzo pred-quant kernel (variant) + * + * @tparam Data type of input data + * @tparam ErrCtrl type of error-control code + * @tparam FP type for internal floating-point processing + * @tparam BLOCK block size + * @tparam SEQ degree of sequentiality + * @param outlier input 1 + * @param quant input 2 + * @param xdata output + * @param len3 data length in 3D + * @param stride3 data stride in 3D + * @param radius quant-code radius + * @param ebx2 precalculated eb*2 + */ +template +__global__ void x_lorenzo_3d1lvar_32x8x8data_mapto32x1x8( + Data* outlier, + ErrCtrl* quant, + Data* xdata, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2); + +} // namespace cusz + +namespace { + +/** + * @brief (Original SZ/cuSZ design) 1D: separate delta by radius in to quant-code and outlier + */ +template +__forceinline__ __device__ void pred1d_radius_separate( + Data thread_scope[SEQ], + volatile Data* shmem_data, + volatile ErrCtrl* shmem_quant, + int radius, + Data from_last_stripe = 0) +{ + if CONSTEXPR (FIRST_POINT) { // i == 0 + Data delta = thread_scope[0] - from_last_stripe; + bool quantizable = fabs(delta) < radius; + Data candidate = delta + radius; + shmem_data[0 + TIX * SEQ] = (1 - quantizable) * candidate; // output; reuse data for outlier + shmem_quant[0 + TIX * SEQ] = quantizable * static_cast(candidate); + } + else { +#pragma unroll + for (auto i = 1; i < SEQ; i++) { + Data delta = thread_scope[i] - thread_scope[i - 1]; + bool quantizable = fabs(delta) < radius; + Data candidate = delta + radius; + shmem_data[i + TIX * SEQ] = (1 - quantizable) * candidate; // output; reuse data for outlier + shmem_quant[i + TIX * SEQ] = quantizable * static_cast(candidate); + } 
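The pair of stores in pred1d_radius_separate (and the analogous writes in the 2D/3D kernels) is a branch-free select: exactly one of the quant-code and outlier slots receives candidate, the other receives zero. An equivalence sketch, assuming quantizable and candidate as defined in that function:

    // (1 - quantizable) * candidate  ->  outlier slot
    // quantizable * static_cast<ErrCtrl>(candidate)  ->  quant slot
    //
    // behaves the same as the branching form:
    //
    // if (quantizable) { quant_slot = candidate; outlier_slot = 0; }
    // else             { outlier_slot = candidate; quant_slot = 0; }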
+ __syncthreads(); + } +} + +template +__forceinline__ __device__ void load1d( + Data* data, + unsigned int dimx, + unsigned int id_base, + volatile Data* shmem_data, + Data thread_scope[SEQ], + Data& from_last_stripe, + FP ebx2_r) +{ +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto id = id_base + TIX + i * NTHREAD; + if (id < dimx) { shmem_data[TIX + i * NTHREAD] = round(data[id] * ebx2_r); } + } + __syncthreads(); + + for (auto i = 0; i < SEQ; i++) thread_scope[i] = shmem_data[TIX * SEQ + i]; + + if (TIX > 0) from_last_stripe = shmem_data[TIX * SEQ - 1]; + __syncthreads(); +} + +template +__forceinline__ __device__ void write1d( + volatile Data* shmem_data, + Data* data, + unsigned int dimx, + unsigned int id_base, + volatile ErrCtrl* shmem_quant = nullptr, + ErrCtrl* quant = nullptr) +{ +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto id = id_base + TIX + i * NTHREAD; + if (id < dimx) { + if CONSTEXPR (NO_R_SEPARATE) { // TODO no-radius-separate uses shmem_data + quant[id] = shmem_data[TIX + i * NTHREAD]; + } + else { + data[id] = shmem_data[TIX + i * NTHREAD]; + quant[id] = shmem_quant[TIX + i * NTHREAD]; + } + } + } +} + +template +__forceinline__ __device__ void load2d_prequant( + Data* data, + Data center[YSEQ + 1], + unsigned int dimx, + unsigned int dimy, + unsigned int stridey, + unsigned int gix, + unsigned int giy_base, + FP ebx2_r) +{ + auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; + +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + if (gix < dimx and giy_base + i < dimy) center[i + 1] = round(data[get_gid(i)] * ebx2_r); + } + auto tmp = __shfl_up_sync(0xffffffff, center[YSEQ], 16); // same-warp, next-16 + if (TIY == 1) center[0] = tmp; +} + +template +__forceinline__ __device__ void pred2d(Data center[YSEQ + 1]) +{ + /* prediction + original form: Data delta = center[i] - center[i - 1] + west[i] - west[i - 1]; + short form: Data delta = center[i] - west[i]; + */ +#pragma unroll + for (auto i = YSEQ; i > 0; i--) { + center[i] -= center[i - 1]; + auto west = __shfl_up_sync(0xffffffff, center[i], 1, 16); + if (TIX > 0) center[i] -= west; + } + __syncthreads(); +} + +template +__forceinline__ __device__ void postquant_write2d( + Data center[YSEQ + 1], + ErrCtrl* quant, + Data* outlier, + unsigned int dimx, + unsigned int dimy, + unsigned int stridey, + int radius, + unsigned int gix, + unsigned int giy_base) +{ + auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; + +#pragma unroll + for (auto i = 1; i < YSEQ + 1; i++) { + auto gid = get_gid(i - 1); + + if (gix < dimx and giy_base + i - 1 < dimy) { + bool quantizable = fabs(center[i]) < radius; + Data candidate = center[i] + radius; + outlier[gid] = (1 - quantizable) * candidate; // output; reuse data for outlier + quant[gid] = quantizable * static_cast(candidate); + } + } +} + +} // namespace + +template < + typename Data, + typename ErrCtrl, + typename FP, + int BLOCK, + int SEQ> +__global__ void cusz::c_lorenzo_1d1l( // + Data* data, + ErrCtrl* quant, + Data* outlier, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r) +{ + constexpr auto NTHREAD = BLOCK / SEQ; + + __shared__ struct { + union { + uint8_t uninitialized[BLOCK * sizeof(Data) + BLOCK * sizeof(ErrCtrl)]; + Data data[BLOCK]; + } space; + } shmem; + + auto id_base = BIX * BLOCK; + + Data thread_scope[SEQ]; + Data from_last_stripe{0}; + + /******************************************************************************** + * load from DRAM using striped layout, perform prequant + 
********************************************************************************/ + load1d(data, len3.x, id_base, shmem.space.data, thread_scope, from_last_stripe, ebx2_r); + + // the original SZ/cuSZ design + auto shmem_quant = reinterpret_cast(shmem.space.uninitialized + sizeof(Data) * BLOCK); + pred1d_radius_separate( + thread_scope, shmem.space.data, shmem_quant, radius, from_last_stripe); + pred1d_radius_separate(thread_scope, shmem.space.data, shmem_quant, radius); + write1d(shmem.space.data, outlier, len3.x, id_base, shmem_quant, quant); +} + +template +__global__ void cusz::c_lorenzo_2d1l_16x16data_mapto16x2( + Data* data, + ErrCtrl* quant, + Data* outlier, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r) +{ + constexpr auto BLOCK = 16; + constexpr auto YSEQ = 8; + + Data center[YSEQ + 1] = {0}; // nw n + // w center + + auto gix = BIX * BDX + TIX; // BDX == 16 + auto giy_base = BIY * BLOCK + TIY * YSEQ; // BDY * YSEQ = BLOCK == 16 + + load2d_prequant(data, center, len3.x, len3.y, stride3.y, gix, giy_base, ebx2_r); + pred2d(center); + postquant_write2d(center, quant, outlier, len3.x, len3.y, stride3.y, radius, gix, giy_base); +} + +template +__global__ void cusz::c_lorenzo_3d1l_32x8x8data_mapto32x1x8( + Data* data, + ErrCtrl* quant, + Data* outlier, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r) +{ + constexpr auto BLOCK = 8; + __shared__ Data shmem[8][8][32]; + + auto z = TIZ; + + auto gix = BIX * (BLOCK * 4) + TIX; + auto giy_base = BIY * BLOCK; + auto giz = BIZ * BLOCK + z; + auto base_id = gix + giy_base * stride3.y + giz * stride3.z; + + /******************************************************************************** + * load from DRAM, perform prequant + ********************************************************************************/ + if (gix < len3.x and giz < len3.z) { + for (auto y = 0; y < BLOCK; y++) { + if (giy_base + y < len3.y) { + shmem[z][y][TIX] = round(data[base_id + y * stride3.y] * ebx2_r); // prequant (fp presence) + } + } + } + __syncthreads(); // necessary to ensure correctness + + auto x = TIX % 8; + + for (auto y = 0; y < BLOCK; y++) { + Data delta; + + /******************************************************************************** + * prediction + ********************************************************************************/ + delta = shmem[z][y][TIX] - ((z > 0 and y > 0 and x > 0 ? shmem[z - 1][y - 1][TIX - 1] : 0) // dist=3 + - (y > 0 and x > 0 ? shmem[z][y - 1][TIX - 1] : 0) // dist=2 + - (z > 0 and x > 0 ? shmem[z - 1][y][TIX - 1] : 0) // + - (z > 0 and y > 0 ? shmem[z - 1][y - 1][TIX] : 0) // + + (x > 0 ? shmem[z][y][TIX - 1] : 0) // dist=1 + + (y > 0 ? shmem[z][y - 1][TIX] : 0) // + + (z > 0 ? 
shmem[z - 1][y][TIX] : 0)); // + + auto id = base_id + (y * stride3.y); + + bool quantizable = fabs(delta) < radius; + Data candidate = delta + radius; + if (gix < len3.x and (giy_base + y) < len3.y and giz < len3.z) { + outlier[id] = (1 - quantizable) * candidate; // output; reuse data for outlier + quant[id] = quantizable * static_cast(candidate); + } + } + /* EOF */ +} + +template +__global__ void cusz::x_lorenzo_1d1l( // + Data* outlier, + ErrCtrl* quant, + Data* xdata, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2) +{ + constexpr auto block_dim = BLOCK / SEQ; // dividable + + // coalesce-load (warp-striped) and transpose in shmem (similar for store) + typedef cub::BlockLoad BlockLoadT_outlier; + typedef cub::BlockLoad BlockLoadT_quant; + typedef cub::BlockStore BlockStoreT_xdata; + typedef cub::BlockScan + BlockScanT_xdata; // TODO autoselect algorithm + + __shared__ union TempStorage { // overlap shared memory space + typename BlockLoadT_outlier::TempStorage load_outlier; + typename BlockLoadT_quant::TempStorage load_quant; + typename BlockStoreT_xdata::TempStorage store_xdata; + typename BlockScanT_xdata::TempStorage scan_xdata; + } temp_storage; + + // thread-scope tiled data + union ThreadData { + Data xdata[SEQ]; + Data outlier[SEQ]; + } thread_scope; + ErrCtrl thread_scope_quant[SEQ]; + + /******************************************************************************** + * load to thread-private array (fuse at the same time) + * (BIX * BDX * SEQ) denotes the start of the data chunk that belongs to this thread block + ********************************************************************************/ + BlockLoadT_quant(temp_storage.load_quant).Load(quant + (BIX * BDX) * SEQ, thread_scope_quant); + __syncthreads(); // barrier for shmem reuse + BlockLoadT_outlier(temp_storage.load_outlier).Load(outlier + (BIX * BDX) * SEQ, thread_scope.outlier); + __syncthreads(); // barrier for shmem reuse + +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto id = (BIX * BDX + TIX) * SEQ + i; + thread_scope.xdata[i] = + id < len3.x ? thread_scope.outlier[i] + static_cast(thread_scope_quant[i]) - radius : 0; + } + __syncthreads(); + + /******************************************************************************** + * perform partial-sum using cub::InclusiveSum + ********************************************************************************/ + BlockScanT_xdata(temp_storage.scan_xdata).InclusiveSum(thread_scope.xdata, thread_scope.xdata); + __syncthreads(); // barrier for shmem reuse + + /******************************************************************************** + * scale by ebx2 and write to DRAM + ********************************************************************************/ +#pragma unroll + for (auto i = 0; i < SEQ; i++) thread_scope.xdata[i] *= ebx2; + __syncthreads(); // barrier for shmem reuse + + BlockStoreT_xdata(temp_storage.store_xdata).Store(xdata + (BIX * BDX) * SEQ, thread_scope.xdata); +} + +template +__global__ void cusz::x_lorenzo_2d1l_16x16data_mapto16x2( + Data* outlier, + ErrCtrl* quant, + Data* xdata, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2) +{ + constexpr auto BLOCK = 16; + constexpr auto YSEQ = BLOCK / 2; // sequentiality in y direction + static_assert(BLOCK == 16, "In one case, we need BLOCK for 2D == 16"); + + __shared__ Data intermediate[BLOCK]; // TODO use warp shuffle to eliminate this + Data thread_scope[YSEQ]; + /* + . ------> gix (x) + | t00 t01 t02 t03 ... 
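Value-wise, decompression in x_lorenzo_1d1l above is an inclusive prefix sum of the fused deltas (outlier + quant - radius) followed by scaling with ebx2 = eb * 2; the CUB block load/scan/store machinery only parallelizes that. A serial reference that inverts the 1D compress sketch given earlier:

    // Serial reference of 1D Lorenzo decompression: prefix-sum the fused deltas, then scale by ebx2.
    #include <cstdint>
    #include <vector>

    std::vector<float> lorenzo_1d_decompress_ref(
        const std::vector<uint16_t>& quant,
        const std::vector<float>&    outlier,
        double                       eb,
        int                          radius)
    {
        const double       ebx2 = eb * 2;
        std::vector<float> xdata(quant.size(), 0);

        float acc = 0;
        for (size_t i = 0; i < quant.size(); i++) {
            acc += outlier[i] + static_cast<float>(quant[i]) - radius;   // fuse + inclusive scan
            xdata[i] = static_cast<float>(acc * ebx2);                   // back to original scale
        }
        return xdata;
    }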
t0f + | ts00_0 ts00_0 ts00_0 ts00_0 + giy ts00_1 ts00_1 ts00_1 ts00_1 + (y) | | | | + ts00_7 ts00_7 ts00_7 ts00_7 + + | t10 t11 t12 t13 ... t1f + | ts00_0 ts00_0 ts00_0 ts00_0 + giy ts00_1 ts00_1 ts00_1 ts00_1 + (y) | | | | + ts00_7 ts00_7 ts00_7 ts00_7 + */ + + auto gix = BIX * BLOCK + TIX; + auto giy_base = BIY * BLOCK + TIY * YSEQ; // BDY * YSEQ = BLOCK == 16 + auto get_gid = [&](auto i) { return (giy_base + i) * stride3.y + gix; }; + + /******************************************************************************** + * load to thread-private array (fuse at the same time) + ********************************************************************************/ +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + auto gid = get_gid(i); + // even if we hit the else branch, all threads in a warp hit the y-boundary simultaneously + if (gix < len3.x and giy_base + i < len3.y) + thread_scope[i] = outlier[gid] + static_cast(quant[gid]) - radius; // fuse + else + thread_scope[i] = 0; // TODO set as init state? + } + + /******************************************************************************** + * partial-sum along y-axis, sequentially + ********************************************************************************/ + for (auto i = 1; i < YSEQ; i++) thread_scope[i] += thread_scope[i - 1]; + // two-pass: store for cross-threadscope update + if (TIY == 0) intermediate[TIX] = thread_scope[YSEQ - 1]; + __syncthreads(); + // two-pass: load and update + if (TIY == 1) { + auto tmp = intermediate[TIX]; +#pragma unroll + for (auto& i : thread_scope) i += tmp; + } + + /******************************************************************************** + * in-warp partial-sum along x-axis + ********************************************************************************/ +#pragma unroll + for (auto& i : thread_scope) { + for (auto d = 1; d < BLOCK; d *= 2) { + Data n = __shfl_up_sync(0xffffffff, i, d, 16); + if (TIX >= d) i += n; + } + i *= ebx2; + } + + /******************************************************************************** + * write to DRAM + ********************************************************************************/ +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + auto gid = get_gid(i); + if (gix < len3.x and giy_base + i < len3.y) xdata[gid] = thread_scope[i]; + } +} + +template +__global__ void cusz::x_lorenzo_3d1l_32x8x8data_mapto32x1x8( + Data* outlier, + ErrCtrl* quant, + Data* xdata, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2) +{ + constexpr auto BLOCK = 8; + constexpr auto YSEQ = BLOCK; + static_assert(BLOCK == 8, "In one case, we need BLOCK for 3D == 8"); + + __shared__ Data intermediate[BLOCK][4][8]; + Data thread_scope[YSEQ]; + + auto seg_id = TIX / 8; + auto seg_tix = TIX % 8; + + auto gix = BIX * (4 * BLOCK) + TIX, giy_base = BIY * BLOCK, giz = BIZ * BLOCK + TIZ; + auto get_gid = [&](auto y) { return giz * stride3.z + (giy_base + y) * stride3.y + gix; }; + + /******************************************************************************** + * load to thread-private array (fuse at the same time) + ********************************************************************************/ +#pragma unroll + for (auto y = 0; y < YSEQ; y++) { + auto gid = get_gid(y); + if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) + thread_scope[y] = outlier[gid] + static_cast(quant[gid]) - static_cast(radius); // fuse + else + thread_scope[y] = 0; + } + + /******************************************************************************** + * partial-sum along y-axis, 
sequentially + ********************************************************************************/ + for (auto y = 1; y < YSEQ; y++) thread_scope[y] += thread_scope[y - 1]; + + /******************************************************************************** + * ND partial-sums along x- and z-axis + * in-warp shuffle used: in order to perform, it's transposed after X-partial sum + ********************************************************************************/ + auto dist = 1; + Data addend; + +#pragma unroll + for (auto i = 0; i < BLOCK; i++) { + Data val = thread_scope[i]; + + for (dist = 1; dist < BLOCK; dist *= 2) { + addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + // x-z transpose + intermediate[TIZ][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][TIZ]; + __syncthreads(); + + for (dist = 1; dist < BLOCK; dist *= 2) { + addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + intermediate[TIZ][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][TIZ]; + __syncthreads(); + + thread_scope[i] = val; + } + + /******************************************************************************** + * write to DRAM + ********************************************************************************/ +#pragma unroll + for (auto y = 0; y < YSEQ; y++) { + if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) { xdata[get_gid(y)] = thread_scope[y] * ebx2; } + } + /* EOF */ +} + +/******************************************************************************** + * experimental prototype toward further optimization + ********************************************************************************/ +template +__global__ void cusz::x_lorenzo_3d1lvar_32x8x8data_mapto32x1x8( + Data* outlier, + ErrCtrl* quant, + Data* xdata, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2) +{ + constexpr auto BLOCK = 8; + constexpr auto YSEQ = BLOCK; + static_assert(BLOCK == 8, "In one case, we need BLOCK for 3D == 8"); + + __shared__ Data intermediate[BLOCK][4][8]; + Data thread_scope = 0; + + auto seg_id = TIX / 8; + auto seg_tix = TIX % 8; + + auto gix = BIX * (4 * BLOCK) + TIX, giy_base = BIY * BLOCK, giz = BIZ * BLOCK + TIZ; + auto get_gid = [&](auto y) { return giz * stride3.z + (giy_base + y) * stride3.y + gix; }; + + auto y = 0; + + // even if we hit the else branch, all threads in a warp hit the y-boundary simultaneously +#pragma unroll + for (y = 0; y < YSEQ; y++) { + auto gid = get_gid(y); + if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) + thread_scope += outlier[gid] + static_cast(quant[gid]) - static_cast(radius); // fuse + + Data val = thread_scope; + + // shuffle, ND partial-sums + for (auto dist = 1; dist < BLOCK; dist *= 2) { + Data addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + // x-z transpose + intermediate[TIZ][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][TIZ]; + __syncthreads(); + + for (auto dist = 1; dist < BLOCK; dist *= 2) { + Data addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + intermediate[TIZ][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][TIZ]; + __syncthreads(); + + // thread_scope += val; + + if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) { xdata[get_gid(y)] = val * ebx2; } + } +} + +#undef TIX +#undef TIY +#undef TIZ +#undef BIX +#undef BIY 
+#undef BIZ +#undef BDX +#undef BDY +#undef BDZ + +#endif diff --git a/qtensor/compression/cusz/src/kernel/detail/lorenzo23.inl b/qtensor/compression/cusz/src/kernel/detail/lorenzo23.inl index 764f44ec..83a52b4b 100644 --- a/qtensor/compression/cusz/src/kernel/detail/lorenzo23.inl +++ b/qtensor/compression/cusz/src/kernel/detail/lorenzo23.inl @@ -1,1237 +1,1237 @@ -/** - * @file lorenzo23.inl - * @author Jiannan Tian - * @brief - * @version 0.4 - * @date 2022-12-22 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "subroutine.inl" - -namespace subr = psz::cuda::__device; - -namespace psz { -namespace cuda { -namespace __kernel { - -//////////////////////////////////////////////////////////////////////////////// -// 1D - -namespace v0 { - -template -__global__ void c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, T* outlier); - -template -__global__ void x_lorenzo_1d1l(EQ* quant, T* outlier, dim3 len3, dim3 stride3, int radius, FP ebx2, T* xdata); - -namespace compaction { - -template > -__global__ void c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, Compaction outlier); - -} - -namespace delta_only { - -template -__global__ void c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* delta); - -template -__global__ void x_lorenzo_1d1l(EQ* delta, dim3 len3, dim3 stride3, FP ebx2, T* xdata); - -} // namespace delta_only - -} // namespace v0 - -namespace v1_pn { - -namespace compaction { - -template > -__global__ void c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, Compaction outlier); - -} // namespace compaction - -template -__global__ void x_lorenzo_1d1l(EQ* quant, T* outlier, dim3 len3, dim3 stride3, FP ebx2, T* xdata); - -namespace delta_only { - -template -__global__ void c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* delta); - -template -__global__ void x_lorenzo_1d1l(EQ* delta, dim3 len3, dim3 stride3, FP ebx2, T* xdata); - -} // namespace delta_only - -} // namespace v1_pn - -//////////////////////////////////////////////////////////////////////////////// -// 2D - -namespace v0 { - -template -__global__ void c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, T* outlier); - -template -__global__ void x_lorenzo_2d1l(EQ* quant, T* outlier, dim3 len3, dim3 stride3, int radius, FP ebx2, T* xdata); - -namespace delta_only { - -template -__global__ void c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* delta); - -template -__global__ void x_lorenzo_2d1l(EQ* delta, dim3 len3, dim3 stride3, FP ebx2, T* xdata); - -} // namespace delta_only - -namespace compaction { - -template > -__global__ void c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, Compaction outlier); - -} // namespace compaction - -} // namespace v0 - -namespace v1_pn { - -namespace compaction { - -template > -__global__ void c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, Compaction outlier); - -} // namespace compaction - -template -__global__ void x_lorenzo_2d1l(EQ* quant, T* outlier, dim3 len3, dim3 stride3, FP ebx2, T* xdata); - -namespace delta_only { - -template -__global__ void c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* delta); - -template -__global__ void x_lorenzo_2d1l(EQ* delta, dim3 len3, dim3 stride3, FP ebx2, T* xdata); - -} // namespace delta_only - -} // namespace v1_pn - 
-//////////////////////////////////////////////////////////////////////////////// -// 3D - -namespace v0 { - -// TODO -> `legacy` -namespace legacy { -template -__global__ void c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, T* outlier); - -} - -template -__global__ void c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, T* outlier); - -template -__global__ void x_lorenzo_3d1l(EQ* quant, T* outlier, dim3 len3, dim3 stride3, int radius, FP ebx2, T* xdata); - -namespace delta_only { - -template -__global__ void c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* quant); - -template -__global__ void x_lorenzo_3d1l(EQ* quant, dim3 len3, dim3 stride3, FP ebx2, T* xdata); - -} // namespace delta_only - -namespace compaction { - -template > -__global__ void c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, Compaction outlier); - -} - -} // namespace v0 - -namespace v1_pn { - -namespace compaction { - -template > -__global__ void c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, Compaction outlier); - -} - -template -__global__ void x_lorenzo_3d1l(EQ* quant, T* outlier, dim3 len3, dim3 stride3, FP ebx2, T* xdata); - -namespace delta_only { - -template -__global__ void c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* quant); - -template -__global__ void x_lorenzo_3d1l(EQ* quant, dim3 len3, dim3 stride3, FP ebx2, T* xdata); - -} // namespace delta_only - -} // namespace v1_pn - -} // namespace __kernel -} // namespace cuda -} // namespace psz - -//////////////////////////////////////////////////////////////////////////////// -// 1D definition - -template -__global__ void -psz::cuda::__kernel::v0::c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, T* outlier) -{ - namespace subr_v0 = psz::cuda::__device::v0; - - constexpr auto NTHREAD = BLOCK / SEQ; - - __shared__ struct { - union { - T data[BLOCK]; - T outlier[BLOCK]; - }; - EQ quant[BLOCK]; - } s; - - T prev{0}; - T thp_data[SEQ]; - - auto id_base = blockIdx.x * BLOCK; - - subr_v0::load_prequant_1d(data, len3.x, id_base, s.data, thp_data, prev, ebx2_r); - subr_v0::predict_quantize_1d(thp_data, s.quant, s.outlier, radius, prev); - subr_v0::predict_quantize_1d(thp_data, s.quant, s.outlier, radius); - subr_v0::write_1d(s.quant, s.outlier, len3.x, id_base, quant, outlier); -} - -template -__global__ void -psz::cuda::__kernel::v0::delta_only::c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* quant) -{ - namespace subr_v0 = psz::cuda::__device::v0; - - constexpr auto NTHREAD = BLOCK / SEQ; - - __shared__ struct { - union { - T data[BLOCK]; - T outlier[BLOCK]; - }; - EQ quant[BLOCK]; - } s; - - T prev{0}; - T thp_data[SEQ]; - - auto id_base = blockIdx.x * BLOCK; - - subr_v0::load_prequant_1d(data, len3.x, id_base, s.data, thp_data, prev, ebx2_r); - subr_v0::predict_quantize__no_outlier_1d(thp_data, s.quant, prev); - subr_v0::predict_quantize__no_outlier_1d(thp_data, s.quant); - subr_v0::write_1d(s.quant, nullptr, len3.x, id_base, quant, nullptr); -} - -template -__global__ void psz::cuda::__kernel::v0::compaction::c_lorenzo_1d1l( - T* data, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2_r, - EQ* quant, - Compaction outlier_desc) -{ - namespace subr_v0 = psz::cuda::__device::v0; - namespace subr_v0c = psz::cuda::__device::v0::compaction; - - constexpr auto NTHREAD = BLOCK / SEQ; - - __shared__ struct { - union { - T data[BLOCK]; - T outlier[BLOCK]; 
- }; - EQ quant[BLOCK]; - } s; - - T prev{0}; - T thp_data[SEQ]; - - auto id_base = blockIdx.x * BLOCK; - - subr_v0::load_prequant_1d(data, len3.x, id_base, s.data, thp_data, prev, ebx2_r); - subr_v0c::predict_quantize_1d(thp_data, s.quant, len3.x, radius, id_base, outlier_desc, prev); - subr_v0c::predict_quantize_1d(thp_data, s.quant, len3.x, radius, id_base, outlier_desc); - subr_v0::write_1d(s.quant, nullptr, len3.x, id_base, quant, nullptr); -} - -template -__global__ void psz::cuda::__kernel::v1_pn::compaction::c_lorenzo_1d1l( // - T* data, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2_r, - EQ* quant, - Compaction outlier) -{ - namespace subr_v0 = psz::cuda::__device::v0; - namespace subr_v1c = psz::cuda::__device::v1_pn::compaction; - - constexpr auto NTHREAD = BLOCK / SEQ; - - __shared__ struct { - union { - T data[BLOCK]; - T outlier[BLOCK]; - }; - EQ quant[BLOCK]; - } s; - - T prev{0}; - T thp_data[SEQ]; - - auto id_base = blockIdx.x * BLOCK; - - subr_v0::load_prequant_1d(data, len3.x, id_base, s.data, thp_data, prev, ebx2_r); - subr_v1c::predict_quantize_1d(thp_data, s.quant, s.outlier, radius, prev); - subr_v1c::predict_quantize_1d(thp_data, s.quant, s.outlier, radius); - subr_v0::write_1d(s.quant, s.outlier, len3.x, id_base, quant, outlier); -} - -template -__global__ void psz::cuda::__kernel::v0::x_lorenzo_1d1l( // - EQ* quant, - T* outlier, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2, - T* xdata) -{ - namespace subr_v0 = psz::cuda::__device::v0; - namespace wave32 = psz::cuda::__device::wave32; - - constexpr auto NTHREAD = BLOCK / SEQ; // equiv. to blockDim.x - - __shared__ struct { - union { - T outlier[BLOCK]; - T xdata[BLOCK]; - }; - // even if it's wave64, "/32" works - T exchange_in[NTHREAD / 32]; - T exchange_out[NTHREAD / 32]; - } s; - - T thp_data[SEQ]; - - auto id_base = blockIdx.x * BLOCK; - - subr_v0::load_fuse_1d(quant, outlier, len3.x, id_base, radius, s.xdata, thp_data); - subr_v0::block_scan_1d(thp_data, ebx2, s.exchange_in, s.exchange_out, s.xdata); - subr_v0::write_1d(s.xdata, nullptr, len3.x, id_base, xdata, nullptr); -} - -template -__global__ void psz::cuda::__kernel::v0::delta_only::x_lorenzo_1d1l( // - EQ* quant, - dim3 len3, - dim3 stride3, - FP ebx2, - T* xdata) -{ - namespace subr_v0 = psz::cuda::__device::v0; - - constexpr auto NTHREAD = BLOCK / SEQ; // equiv. 
to blockDim.x - - __shared__ struct { - T xdata[BLOCK]; - // even if it's wave64, "/32" works - T exchange_in[NTHREAD / 32]; - T exchange_out[NTHREAD / 32]; - } s; - - T thp_data[SEQ]; - - auto id_base = blockIdx.x * BLOCK; - - subr_v0::delta_only::load_1d(quant, len3.x, id_base, s.xdata, thp_data); - subr_v0::block_scan_1d(thp_data, ebx2, s.exchange_in, s.exchange_out, s.xdata); - subr_v0::write_1d(s.xdata, nullptr, len3.x, id_base, xdata, nullptr); -} - -//////////////////////////////////////////////////////////////////////////////// -// 2D definition - -template -__global__ void -psz::cuda::__kernel::v0::c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, T* outlier) -{ - namespace subr_v0 = psz::cuda::__device::v0; - - constexpr auto BLOCK = 16; - constexpr auto YSEQ = 8; - - T center[YSEQ + 1] = {0}; // NW N first element <- 0 - // W center - - auto gix = blockIdx.x * BLOCK + threadIdx.x; // BDX == BLOCK == 16 - auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 - - subr_v0::load_prequant_2d(data, len3.x, gix, len3.y, giy_base, stride3.y, ebx2_r, center); - subr_v0::predict_2d(center); - subr_v0::quantize_write_2d(center, len3.x, gix, len3.y, giy_base, stride3.y, radius, quant, outlier); -} - -template -__global__ void -psz::cuda::__kernel::v0::delta_only::c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* quant) -{ - namespace subr_v0 = psz::cuda::__device::v0; - - constexpr auto BLOCK = 16; - constexpr auto YSEQ = 8; - - T center[YSEQ + 1] = {0}; // NW N first element <- 0 - // W center - - auto gix = blockIdx.x * BLOCK + threadIdx.x; // BDX == BLOCK == 16 - auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 - - subr_v0::load_prequant_2d(data, len3.x, gix, len3.y, giy_base, stride3.y, ebx2_r, center); - subr_v0::predict_2d(center); - subr_v0::delta_only::quantize_write_2d(center, len3.x, gix, len3.y, giy_base, stride3.y, quant); -} - -template -__global__ void -psz::cuda::__kernel::v1_pn::delta_only::c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* quant) -{ - namespace subr_v0 = psz::cuda::__device::v0; - namespace subr_v1d = psz::cuda::__device::v1_pn::delta_only; - - constexpr auto BLOCK = 16; - constexpr auto YSEQ = 8; - - T center[YSEQ + 1] = {0}; // NW N first element <- 0 - // W center - - auto gix = blockIdx.x * BLOCK + threadIdx.x; // BDX == BLOCK == 16 - auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 - - subr_v0::load_prequant_2d(data, len3.x, gix, len3.y, giy_base, stride3.y, ebx2_r, center); - subr_v0::predict_2d(center); - subr_v1d::quantize_write_2d(center, len3.x, gix, len3.y, giy_base, stride3.y, quant); -} - -template -__global__ void psz::cuda::__kernel::v0::compaction::c_lorenzo_2d1l( - T* data, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2_r, - EQ* quant, - Compaction outlier) -{ - namespace subr_v0 = psz::cuda::__device::v0; - - constexpr auto BLOCK = 16; - constexpr auto YSEQ = 8; - - T center[YSEQ + 1] = {0}; // NW N first element <- 0 - // W center - - auto gix = blockIdx.x * BLOCK + threadIdx.x; // BDX == BLOCK == 16 - auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 - - subr_v0::load_prequant_2d(data, len3.x, gix, len3.y, giy_base, stride3.y, ebx2_r, center); - subr_v0::predict_2d(center); - subr_v0::compaction::quantize_write_2d( - center, len3.x, gix, len3.y, giy_base, stride3.y, radius, quant, outlier); -} - -// 16x16 data block maps to 16x2 (one 
warp) thread block -template -__global__ void psz::cuda::__kernel::v0::x_lorenzo_2d1l( // - EQ* quant, - T* outlier, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2, - T* xdata) -{ - namespace subr_v0 = psz::cuda::__device::v0; - - constexpr auto BLOCK = 16; - constexpr auto YSEQ = BLOCK / 2; // sequentiality in y direction - static_assert(BLOCK == 16, "In one case, we need BLOCK for 2D == 16"); - - __shared__ T intermediate[BLOCK]; // TODO use warp shuffle to eliminate this - T thread_private[YSEQ]; - - auto gix = blockIdx.x * BLOCK + threadIdx.x; - auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 - - auto get_gid = [&](auto i) { return (giy_base + i) * stride3.y + gix; }; - - subr_v0::load_fuse_2d( - quant, outlier, len3.x, gix, len3.y, giy_base, stride3.y, radius, thread_private); - subr_v0::block_scan_2d(thread_private, intermediate, ebx2); - subr_v0::decomp_write_2d(thread_private, len3.x, gix, len3.y, giy_base, stride3.y, xdata); -} - -// 16x16 data block maps to 16x2 (one warp) thread block -template -__global__ void psz::cuda::__kernel::v1_pn::x_lorenzo_2d1l( // - EQ* quant, - T* outlier, - dim3 len3, - dim3 stride3, - FP ebx2, - T* xdata) -{ - namespace subr_v0 = psz::cuda::__device::v0; - namespace subr_v1_pn = psz::cuda::__device::v1_pn; - - constexpr auto BLOCK = 16; - constexpr auto YSEQ = BLOCK / 2; // sequentiality in y direction - static_assert(BLOCK == 16, "In one case, we need BLOCK for 2D == 16"); - - __shared__ T intermediate[BLOCK]; // TODO use warp shuffle to eliminate this - T thread_private[YSEQ]; - - auto gix = blockIdx.x * BLOCK + threadIdx.x; - auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 - - auto get_gid = [&](auto i) { return (giy_base + i) * stride3.y + gix; }; - - subr_v1_pn::load_fuse_2d(quant, outlier, len3.x, gix, len3.y, giy_base, stride3.y, thread_private); - subr_v0::block_scan_2d(thread_private, intermediate, ebx2); - subr_v0::decomp_write_2d(thread_private, len3.x, gix, len3.y, giy_base, stride3.y, xdata); -} - -// 16x16 data block maps to 16x2 (one warp) thread block -template -__global__ void psz::cuda::__kernel::v0::delta_only::x_lorenzo_2d1l( // - EQ* quant, - dim3 len3, - dim3 stride3, - FP ebx2, - T* xdata) -{ - namespace subr_v0 = psz::cuda::__device::v0; - - constexpr auto BLOCK = 16; - constexpr auto YSEQ = BLOCK / 2; // sequentiality in y direction - static_assert(BLOCK == 16, "In one case, we need BLOCK for 2D == 16"); - - __shared__ T intermediate[BLOCK]; // TODO use warp shuffle to eliminate this - T thread_private[YSEQ]; - - auto gix = blockIdx.x * BLOCK + threadIdx.x; - auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 - - auto get_gid = [&](auto i) { return (giy_base + i) * stride3.y + gix; }; - - subr_v0::delta_only::load_2d(quant, len3.x, gix, len3.y, giy_base, stride3.y, thread_private); - subr_v0::block_scan_2d(thread_private, intermediate, ebx2); - subr_v0::decomp_write_2d(thread_private, len3.x, gix, len3.y, giy_base, stride3.y, xdata); -} - -// 16x16 data block maps to 16x2 (one warp) thread block -template -__global__ void psz::cuda::__kernel::v1_pn::delta_only::x_lorenzo_2d1l( // - EQ* quant, - dim3 len3, - dim3 stride3, - FP ebx2, - T* xdata) -{ - namespace subr_v0 = psz::cuda::__device::v0; - namespace subr_v1_pn = psz::cuda::__device::v1_pn; - - constexpr auto BLOCK = 16; - constexpr auto YSEQ = BLOCK / 2; // sequentiality in y direction - static_assert(BLOCK == 16, "In one case, we need BLOCK for 2D == 16"); 
- - __shared__ T intermediate[BLOCK]; // TODO use warp shuffle to eliminate this - T thread_private[YSEQ]; - - auto gix = blockIdx.x * BLOCK + threadIdx.x; - auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 - - auto get_gid = [&](auto i) { return (giy_base + i) * stride3.y + gix; }; - - subr_v1_pn::delta_only::load_2d(quant, len3.x, gix, len3.y, giy_base, stride3.y, thread_private); - subr_v0::block_scan_2d(thread_private, intermediate, ebx2); - subr_v0::decomp_write_2d(thread_private, len3.x, gix, len3.y, giy_base, stride3.y, xdata); -} - -template -__global__ void psz::cuda::__kernel::v0::legacy::c_lorenzo_3d1l( - T* data, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2_r, - EQ* quant, - T* outlier) -{ - constexpr auto BLOCK = 8; - __shared__ T s[8][8][32]; - - auto z = threadIdx.z; - - auto gix = blockIdx.x * (BLOCK * 4) + threadIdx.x; - auto giy_base = blockIdx.y * BLOCK; - auto giz = blockIdx.z * BLOCK + z; - auto base_id = gix + giy_base * stride3.y + giz * stride3.z; - - auto giy = [&](auto y) { return giy_base + y; }; - auto gid = [&](auto y) { return base_id + y * stride3.y; }; - - auto load_prequant_3d = [&]() { - if (gix < len3.x and giz < len3.z) { - for (auto y = 0; y < BLOCK; y++) - if (giy(y) < len3.y) s[z][y][threadIdx.x] = round(data[gid(y)] * ebx2_r); // prequant (fp presence) - } - __syncthreads(); - }; - - auto quantize_write = [&](T delta, auto x, auto y, auto z, auto gid) { - bool quantizable = fabs(delta) < radius; - T candidate = delta + radius; - if (x < len3.x and y < len3.y and z < len3.z) { - quant[gid] = quantizable * static_cast(candidate); - outlier[gid] = (not quantizable) * candidate; - } - }; - - auto x = threadIdx.x % 8; - - auto predict_3d = [&](auto y) { - T delta = s[z][y][threadIdx.x] - // - ((z > 0 and y > 0 and x > 0 ? s[z - 1][y - 1][threadIdx.x - 1] : 0) // dist=3 - - (y > 0 and x > 0 ? s[z][y - 1][threadIdx.x - 1] : 0) // dist=2 - - (z > 0 and x > 0 ? s[z - 1][y][threadIdx.x - 1] : 0) // - - (z > 0 and y > 0 ? s[z - 1][y - 1][threadIdx.x] : 0) // - + (x > 0 ? s[z][y][threadIdx.x - 1] : 0) // dist=1 - + (y > 0 ? s[z][y - 1][threadIdx.x] : 0) // - + (z > 0 ? 
s[z - 1][y][threadIdx.x] : 0)); // - return delta; - }; - - //////////////////////////////////////////////////////////////////////////// - - load_prequant_3d(); - for (auto y = 0; y < BLOCK; y++) { - auto delta = predict_3d(y); - quantize_write(delta, gix, giy(y), giz, gid(y)); - } -} - -template -__global__ void -psz::cuda::__kernel::v0::c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, T* outlier) -{ - constexpr auto BLOCK = 8; - __shared__ T s[9][33]; - T delta[BLOCK + 1] = {0}; // first el = 0 - - const auto gix = blockIdx.x * (BLOCK * 4) + threadIdx.x; - const auto giy = blockIdx.y * BLOCK + threadIdx.y; - const auto giz_base = blockIdx.z * BLOCK; - const auto base_id = gix + giy * stride3.y + giz_base * stride3.z; - - auto giz = [&](auto z) { return giz_base + z; }; - auto gid = [&](auto z) { return base_id + z * stride3.z; }; - - auto load_prequant_3d = [&]() { - if (gix < len3.x and giy < len3.y) { - for (auto z = 0; z < BLOCK; z++) - if (giz(z) < len3.z) delta[z + 1] = round(data[gid(z)] * ebx2_r); // prequant (fp presence) - } - __syncthreads(); - }; - - auto quantize_write = [&](T delta, auto x, auto y, auto z, auto gid) { - bool quantizable = fabs(delta) < radius; - T candidate = delta + radius; - if (x < len3.x and y < len3.y and z < len3.z) { - quant[gid] = quantizable * static_cast(candidate); - outlier[gid] = (not quantizable) * candidate; - } - }; - - //////////////////////////////////////////////////////////////////////////// - - /* z-direction, sequential in private buffer - delta = + (s[z][y][x] - s[z-1][y][x]) - - (s[z][y][x-1] - s[z-1][y][x-1]) - + (s[z][y-1][x-1] - s[z-1][y-1][x-1]) - - (s[z][y-1][x] - s[z-1][y-1][x]) - - x-direction, shuffle - delta = + (s[z][y][x] - s[z][y][x-1]) - - (s[z][y-1][x] - s[z][y-1][x-1]) - - y-direction, shmem - delta = s[z][y][x] - s[z][y-1][x] - */ - - load_prequant_3d(); - - for (auto z = BLOCK; z > 0; z--) { - // z-direction - delta[z] -= delta[z - 1]; - - // x-direction - auto prev_x = __shfl_up_sync(0xffffffff, delta[z], 1, 8); - if (threadIdx.x % BLOCK > 0) delta[z] -= prev_x; - - // y-direction, exchange via shmem - // ghost padding along y - s[threadIdx.y + 1][threadIdx.x] = delta[z]; - __syncthreads(); - - delta[z] -= (threadIdx.y > 0) * s[threadIdx.y][threadIdx.x]; - - // now delta[z] is delta - quantize_write(delta[z], gix, giy, giz(z - 1), gid(z - 1)); - } -} - -template -__global__ void psz::cuda::__kernel::v0::delta_only::c_lorenzo_3d1l( // - T* data, - dim3 len3, - dim3 stride3, - FP ebx2_r, - EQ* quant) -{ - constexpr auto BLOCK = 8; - __shared__ T s[9][33]; - T delta[BLOCK + 1] = {0}; // first el = 0 - - const auto gix = blockIdx.x * (BLOCK * 4) + threadIdx.x; - const auto giy = blockIdx.y * BLOCK + threadIdx.y; - const auto giz_base = blockIdx.z * BLOCK; - const auto base_id = gix + giy * stride3.y + giz_base * stride3.z; - - auto giz = [&](auto z) { return giz_base + z; }; - auto gid = [&](auto z) { return base_id + z * stride3.z; }; - - auto load_prequant_3d = [&]() { - if (gix < len3.x and giy < len3.y) { - for (auto z = 0; z < BLOCK; z++) - if (giz(z) < len3.z) delta[z + 1] = round(data[gid(z)] * ebx2_r); // prequant (fp presence) - } - __syncthreads(); - }; - - auto quantize_write = [&](T delta, auto x, auto y, auto z, auto gid) { - if (x < len3.x and y < len3.y and z < len3.z) quant[gid] = static_cast(delta); - }; - - //////////////////////////////////////////////////////////////////////////// - - load_prequant_3d(); - - for (auto z = BLOCK; z > 0; z--) { - // z-direction - 
delta[z] -= delta[z - 1]; - - // x-direction - auto prev_x = __shfl_up_sync(0xffffffff, delta[z], 1, 8); - if (threadIdx.x % BLOCK > 0) delta[z] -= prev_x; - - // y-direction, exchange via shmem - // ghost padding along y - s[threadIdx.y + 1][threadIdx.x] = delta[z]; - __syncthreads(); - - delta[z] -= (threadIdx.y > 0) * s[threadIdx.y][threadIdx.x]; - - // now delta[z] is delta - quantize_write(delta[z], gix, giy, giz(z - 1), gid(z - 1)); - } -} - -template -__global__ void psz::cuda::__kernel::v1_pn::delta_only::c_lorenzo_3d1l( // - T* data, - dim3 len3, - dim3 stride3, - FP ebx2_r, - EQ* quant) -{ - constexpr auto BYTEWIDTH = sizeof(EQ); - - using UI = EQ; - using I = typename psz::typing::Int::T; - - constexpr auto BLOCK = 8; - __shared__ T s[9][33]; - T delta[BLOCK + 1] = {0}; // first el = 0 - - const auto gix = blockIdx.x * (BLOCK * 4) + threadIdx.x; - const auto giy = blockIdx.y * BLOCK + threadIdx.y; - const auto giz_base = blockIdx.z * BLOCK; - const auto base_id = gix + giy * stride3.y + giz_base * stride3.z; - - auto giz = [&](auto z) { return giz_base + z; }; - auto gid = [&](auto z) { return base_id + z * stride3.z; }; - - auto load_prequant_3d = [&]() { - if (gix < len3.x and giy < len3.y) { - for (auto z = 0; z < BLOCK; z++) - if (giz(z) < len3.z) delta[z + 1] = round(data[gid(z)] * ebx2_r); // prequant (fp presence) - } - __syncthreads(); - }; - - auto quantize_write = [&](T delta, auto x, auto y, auto z, auto gid) { - if (x < len3.x and y < len3.y and z < len3.z) quant[gid] = PN::encode(static_cast(delta)); - }; - - //////////////////////////////////////////////////////////////////////////// - - load_prequant_3d(); - - for (auto z = BLOCK; z > 0; z--) { - // z-direction - delta[z] -= delta[z - 1]; - - // x-direction - auto prev_x = __shfl_up_sync(0xffffffff, delta[z], 1, 8); - if (threadIdx.x % BLOCK > 0) delta[z] -= prev_x; - - // y-direction, exchange via shmem - // ghost padding along y - s[threadIdx.y + 1][threadIdx.x] = delta[z]; - __syncthreads(); - - delta[z] -= (threadIdx.y > 0) * s[threadIdx.y][threadIdx.x]; - - // now delta[z] is delta - quantize_write(delta[z], gix, giy, giz(z - 1), gid(z - 1)); - } -} - -template -__global__ void psz::cuda::__kernel::v0::compaction::c_lorenzo_3d1l( - T* data, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2_r, - EQ* quant, - Compaction outlier) -{ - constexpr auto BLOCK = 8; - __shared__ T s[9][33]; - T delta[BLOCK + 1] = {0}; // first el = 0 - - const auto gix = blockIdx.x * (BLOCK * 4) + threadIdx.x; - const auto giy = blockIdx.y * BLOCK + threadIdx.y; - const auto giz_base = blockIdx.z * BLOCK; - const auto base_id = gix + giy * stride3.y + giz_base * stride3.z; - - auto giz = [&](auto z) { return giz_base + z; }; - auto gid = [&](auto z) { return base_id + z * stride3.z; }; - - auto load_prequant_3d = [&]() { - if (gix < len3.x and giy < len3.y) { - for (auto z = 0; z < BLOCK; z++) - if (giz(z) < len3.z) delta[z + 1] = round(data[gid(z)] * ebx2_r); // prequant (fp presence) - } - __syncthreads(); - }; - - auto quantize_compact_write = [&](T delta, auto x, auto y, auto z, auto gid) { - bool quantizable = fabs(delta) < radius; - T candidate = delta + radius; - if (x < len3.x and y < len3.y and z < len3.z) { - quant[gid] = quantizable * static_cast(candidate); - if (not quantizable) { - auto cur_idx = atomicAdd(outlier.count, 1); - outlier.idx[cur_idx] = gid; - outlier.val[cur_idx] = candidate; - } - } - }; - - //////////////////////////////////////////////////////////////////////////// - - load_prequant_3d(); - - for 
(auto z = BLOCK; z > 0; z--) { - // z-direction - delta[z] -= delta[z - 1]; - - // x-direction - auto prev_x = __shfl_up_sync(0xffffffff, delta[z], 1, 8); - if (threadIdx.x % BLOCK > 0) delta[z] -= prev_x; - - // y-direction, exchange via shmem - // ghost padding along y - s[threadIdx.y + 1][threadIdx.x] = delta[z]; - __syncthreads(); - - delta[z] -= (threadIdx.y > 0) * s[threadIdx.y][threadIdx.x]; - - // now delta[z] is delta - quantize_compact_write(delta[z], gix, giy, giz(z - 1), gid(z - 1)); - } -} - -template -__global__ void psz::cuda::__kernel::v1_pn::compaction::c_lorenzo_3d1l( - T* data, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2_r, - EQ* quant, - Compaction outlier) -{ - constexpr auto BYTEWIDTH = sizeof(EQ); - - using UI = EQ; - using I = typename psz::typing::Int::T; - - constexpr auto BLOCK = 8; - __shared__ T s[9][33]; - T delta[BLOCK + 1] = {0}; // first el = 0 - - const auto gix = blockIdx.x * (BLOCK * 4) + threadIdx.x; - const auto giy = blockIdx.y * BLOCK + threadIdx.y; - const auto giz_base = blockIdx.z * BLOCK; - const auto base_id = gix + giy * stride3.y + giz_base * stride3.z; - - auto giz = [&](auto z) { return giz_base + z; }; - auto gid = [&](auto z) { return base_id + z * stride3.z; }; - - // TODO move to subroutine.inl - auto load_prequant_3d = [&]() { - if (gix < len3.x and giy < len3.y) { - for (auto z = 0; z < BLOCK; z++) - if (giz(z) < len3.z) delta[z + 1] = round(data[gid(z)] * ebx2_r); // prequant (fp presence) - } - __syncthreads(); - }; - - auto quantize_compact_write = [&](T delta, auto x, auto y, auto z, auto gid) { - bool quantizable = fabs(delta) < radius; - UI UI_delta = PN::encode(static_cast(delta)); - - T candidate = delta + radius; - if (x < len3.x and y < len3.y and z < len3.z) { - quant[gid] = quantizable * UI_delta; - if (not quantizable) { - auto cur_idx = atomicAdd(outlier.count, 1); - outlier.idx[cur_idx] = gid; - outlier.val[cur_idx] = UI_delta; - } - } - }; - - //////////////////////////////////////////////////////////////////////////// - - load_prequant_3d(); - - for (auto z = BLOCK; z > 0; z--) { - // z-direction - delta[z] -= delta[z - 1]; - - // x-direction - auto prev_x = __shfl_up_sync(0xffffffff, delta[z], 1, 8); - if (threadIdx.x % BLOCK > 0) delta[z] -= prev_x; - - // y-direction, exchange via shmem - // ghost padding along y - s[threadIdx.y + 1][threadIdx.x] = delta[z]; - __syncthreads(); - - delta[z] -= (threadIdx.y > 0) * s[threadIdx.y][threadIdx.x]; - - // now delta[z] is delta - quantize_compact_write(delta[z], gix, giy, giz(z - 1), gid(z - 1)); - } -} - -// 32x8x8 data block maps to 32x1x8 thread block -template -__global__ void psz::cuda::__kernel::v0::x_lorenzo_3d1l( // - EQ* quant, - T* outlier, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2, - T* xdata) -{ - constexpr auto BLOCK = 8; - constexpr auto YSEQ = BLOCK; - static_assert(BLOCK == 8, "In one case, we need BLOCK for 3D == 8"); - - __shared__ T intermediate[BLOCK][4][8]; - T thread_private[YSEQ]; - - auto seg_id = threadIdx.x / 8; - auto seg_tix = threadIdx.x % 8; - - auto gix = blockIdx.x * (4 * BLOCK) + threadIdx.x; - auto giy_base = blockIdx.y * BLOCK; - auto giy = [&](auto y) { return giy_base + y; }; - auto giz = blockIdx.z * BLOCK + threadIdx.z; - auto gid = [&](auto y) { return giz * stride3.z + (giy_base + y) * stride3.y + gix; }; - - auto load_fuse_3d = [&]() { - // load to thread-private array (fuse at the same time) -#pragma unroll - for (auto y = 0; y < YSEQ; y++) { - if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) - 
thread_private[y] = outlier[gid(y)] + static_cast(quant[gid(y)]) - radius; // fuse - else - thread_private[y] = 0; - } - }; - - auto block_scan_3d = [&]() { - // partial-sum along y-axis, sequentially - for (auto y = 1; y < YSEQ; y++) thread_private[y] += thread_private[y - 1]; - -#pragma unroll - for (auto i = 0; i < BLOCK; i++) { - // ND partial-sums along x- and z-axis - // in-warp shuffle used: in order to perform, it's transposed after X-partial sum - T val = thread_private[i]; - - for (auto dist = 1; dist < BLOCK; dist *= 2) { - auto addend = __shfl_up_sync(0xffffffff, val, dist, 8); - if (seg_tix >= dist) val += addend; - } - - // x-z transpose - intermediate[threadIdx.z][seg_id][seg_tix] = val; - __syncthreads(); - val = intermediate[seg_tix][seg_id][threadIdx.z]; - __syncthreads(); - - for (auto dist = 1; dist < BLOCK; dist *= 2) { - auto addend = __shfl_up_sync(0xffffffff, val, dist, 8); - if (seg_tix >= dist) val += addend; - } - - intermediate[threadIdx.z][seg_id][seg_tix] = val; - __syncthreads(); - val = intermediate[seg_tix][seg_id][threadIdx.z]; - __syncthreads(); - - thread_private[i] = val; - } - }; - - auto decomp_write_3d = [&]() { -#pragma unroll - for (auto y = 0; y < YSEQ; y++) - if (gix < len3.x and giy(y) < len3.y and giz < len3.z) xdata[gid(y)] = thread_private[y] * ebx2; - }; - - //////////////////////////////////////////////////////////////////////////// - load_fuse_3d(); - block_scan_3d(); - decomp_write_3d(); -} - -// 32x8x8 data block maps to 32x1x8 thread block -template -__global__ void psz::cuda::__kernel::v1_pn::x_lorenzo_3d1l( // - EQ* quant, - T* outlier, - dim3 len3, - dim3 stride3, - FP ebx2, - T* xdata) -{ - constexpr auto BYTEWIDTH = sizeof(EQ); - - using UI = EQ; - using I = typename psz::typing::Int::T; - - constexpr auto BLOCK = 8; - constexpr auto YSEQ = BLOCK; - static_assert(BLOCK == 8, "In one case, we need BLOCK for 3D == 8"); - - __shared__ T intermediate[BLOCK][4][8]; - T thread_private[YSEQ]; - - auto seg_id = threadIdx.x / 8; - auto seg_tix = threadIdx.x % 8; - - auto gix = blockIdx.x * (4 * BLOCK) + threadIdx.x; - auto giy_base = blockIdx.y * BLOCK; - auto giy = [&](auto y) { return giy_base + y; }; - auto giz = blockIdx.z * BLOCK + threadIdx.z; - auto gid = [&](auto y) { return giz * stride3.z + (giy_base + y) * stride3.y + gix; }; - - auto load_fuse_3d = [&]() { - // load to thread-private array (fuse at the same time) -#pragma unroll - for (auto y = 0; y < YSEQ; y++) { - if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) - thread_private[y] = outlier[gid(y)] + PN::decode(quant[gid(y)]); // fuse - else - thread_private[y] = 0; - } - }; - - auto block_scan_3d = [&]() { - // partial-sum along y-axis, sequentially - for (auto y = 1; y < YSEQ; y++) thread_private[y] += thread_private[y - 1]; - -#pragma unroll - for (auto i = 0; i < BLOCK; i++) { - // ND partial-sums along x- and z-axis - // in-warp shuffle used: in order to perform, it's transposed after X-partial sum - T val = thread_private[i]; - - for (auto dist = 1; dist < BLOCK; dist *= 2) { - auto addend = __shfl_up_sync(0xffffffff, val, dist, 8); - if (seg_tix >= dist) val += addend; - } - - // x-z transpose - intermediate[threadIdx.z][seg_id][seg_tix] = val; - __syncthreads(); - val = intermediate[seg_tix][seg_id][threadIdx.z]; - __syncthreads(); - - for (auto dist = 1; dist < BLOCK; dist *= 2) { - auto addend = __shfl_up_sync(0xffffffff, val, dist, 8); - if (seg_tix >= dist) val += addend; - } - - intermediate[threadIdx.z][seg_id][seg_tix] = val; - __syncthreads(); - 
val = intermediate[seg_tix][seg_id][threadIdx.z]; - __syncthreads(); - - thread_private[i] = val; - } - }; - - auto decomp_write_3d = [&]() { -#pragma unroll - for (auto y = 0; y < YSEQ; y++) - if (gix < len3.x and giy(y) < len3.y and giz < len3.z) xdata[gid(y)] = thread_private[y] * ebx2; - }; - - //////////////////////////////////////////////////////////////////////////// - load_fuse_3d(); - block_scan_3d(); - decomp_write_3d(); -} - -// 32x8x8 data block maps to 32x1x8 thread block -template -__global__ void psz::cuda::__kernel::v0::delta_only::x_lorenzo_3d1l( // - EQ* quant, - dim3 len3, - dim3 stride3, - FP ebx2, - T* xdata) -{ - constexpr auto BLOCK = 8; - constexpr auto YSEQ = BLOCK; - static_assert(BLOCK == 8, "In one case, we need BLOCK for 3D == 8"); - - __shared__ T intermediate[BLOCK][4][8]; - T thread_private[YSEQ]; - - auto seg_id = threadIdx.x / 8; - auto seg_tix = threadIdx.x % 8; - - auto gix = blockIdx.x * (4 * BLOCK) + threadIdx.x; - auto giy_base = blockIdx.y * BLOCK; - auto giy = [&](auto y) { return giy_base + y; }; - auto giz = blockIdx.z * BLOCK + threadIdx.z; - auto gid = [&](auto y) { return giz * stride3.z + (giy_base + y) * stride3.y + gix; }; - - auto load_3d = [&]() { - // load to thread-private array (fuse at the same time) -#pragma unroll - for (auto y = 0; y < YSEQ; y++) { - if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) - thread_private[y] = static_cast(quant[gid(y)]); // fuse - else - thread_private[y] = 0; - } - }; - - auto block_scan_3d = [&]() { - // partial-sum along y-axis, sequentially - for (auto y = 1; y < YSEQ; y++) thread_private[y] += thread_private[y - 1]; - -#pragma unroll - for (auto i = 0; i < BLOCK; i++) { - // ND partial-sums along x- and z-axis - // in-warp shuffle used: in order to perform, it's transposed after X-partial sum - T val = thread_private[i]; - - for (auto dist = 1; dist < BLOCK; dist *= 2) { - auto addend = __shfl_up_sync(0xffffffff, val, dist, 8); - if (seg_tix >= dist) val += addend; - } - - // x-z transpose - intermediate[threadIdx.z][seg_id][seg_tix] = val; - __syncthreads(); - val = intermediate[seg_tix][seg_id][threadIdx.z]; - __syncthreads(); - - for (auto dist = 1; dist < BLOCK; dist *= 2) { - auto addend = __shfl_up_sync(0xffffffff, val, dist, 8); - if (seg_tix >= dist) val += addend; - } - - intermediate[threadIdx.z][seg_id][seg_tix] = val; - __syncthreads(); - val = intermediate[seg_tix][seg_id][threadIdx.z]; - __syncthreads(); - - thread_private[i] = val; - } - }; - - auto decomp_write_3d = [&]() { -#pragma unroll - for (auto y = 0; y < YSEQ; y++) - if (gix < len3.x and giy(y) < len3.y and giz < len3.z) xdata[gid(y)] = thread_private[y] * ebx2; - }; - - //////////////////////////////////////////////////////////////////////////// - load_3d(); - block_scan_3d(); - decomp_write_3d(); -} +/** + * @file lorenzo23.inl + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2022-12-22 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "subroutine.inl" + +namespace subr = psz::cuda::__device; + +namespace psz { +namespace cuda { +namespace __kernel { + +//////////////////////////////////////////////////////////////////////////////// +// 1D + +namespace v0 { + +template +__global__ void c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, T* outlier); + +template +__global__ void x_lorenzo_1d1l(EQ* quant, T* outlier, dim3 len3, dim3 stride3, int radius, FP ebx2, T* xdata); + +namespace compaction { + +template > +__global__ void 
c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, Compaction outlier); + +} + +namespace delta_only { + +template +__global__ void c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* delta); + +template +__global__ void x_lorenzo_1d1l(EQ* delta, dim3 len3, dim3 stride3, FP ebx2, T* xdata); + +} // namespace delta_only + +} // namespace v0 + +namespace v1_pn { + +namespace compaction { + +template > +__global__ void c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, Compaction outlier); + +} // namespace compaction + +template +__global__ void x_lorenzo_1d1l(EQ* quant, T* outlier, dim3 len3, dim3 stride3, FP ebx2, T* xdata); + +namespace delta_only { + +template +__global__ void c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* delta); + +template +__global__ void x_lorenzo_1d1l(EQ* delta, dim3 len3, dim3 stride3, FP ebx2, T* xdata); + +} // namespace delta_only + +} // namespace v1_pn + +//////////////////////////////////////////////////////////////////////////////// +// 2D + +namespace v0 { + +template +__global__ void c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, T* outlier); + +template +__global__ void x_lorenzo_2d1l(EQ* quant, T* outlier, dim3 len3, dim3 stride3, int radius, FP ebx2, T* xdata); + +namespace delta_only { + +template +__global__ void c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* delta); + +template +__global__ void x_lorenzo_2d1l(EQ* delta, dim3 len3, dim3 stride3, FP ebx2, T* xdata); + +} // namespace delta_only + +namespace compaction { + +template > +__global__ void c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, Compaction outlier); + +} // namespace compaction + +} // namespace v0 + +namespace v1_pn { + +namespace compaction { + +template > +__global__ void c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, Compaction outlier); + +} // namespace compaction + +template +__global__ void x_lorenzo_2d1l(EQ* quant, T* outlier, dim3 len3, dim3 stride3, FP ebx2, T* xdata); + +namespace delta_only { + +template +__global__ void c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* delta); + +template +__global__ void x_lorenzo_2d1l(EQ* delta, dim3 len3, dim3 stride3, FP ebx2, T* xdata); + +} // namespace delta_only + +} // namespace v1_pn + +//////////////////////////////////////////////////////////////////////////////// +// 3D + +namespace v0 { + +// TODO -> `legacy` +namespace legacy { +template +__global__ void c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, T* outlier); + +} + +template +__global__ void c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, T* outlier); + +template +__global__ void x_lorenzo_3d1l(EQ* quant, T* outlier, dim3 len3, dim3 stride3, int radius, FP ebx2, T* xdata); + +namespace delta_only { + +template +__global__ void c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* quant); + +template +__global__ void x_lorenzo_3d1l(EQ* quant, dim3 len3, dim3 stride3, FP ebx2, T* xdata); + +} // namespace delta_only + +namespace compaction { + +template > +__global__ void c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, Compaction outlier); + +} + +} // namespace v0 + +namespace v1_pn { + +namespace compaction { + +template > +__global__ void c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, 
Compaction outlier); + +} + +template +__global__ void x_lorenzo_3d1l(EQ* quant, T* outlier, dim3 len3, dim3 stride3, FP ebx2, T* xdata); + +namespace delta_only { + +template +__global__ void c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* quant); + +template +__global__ void x_lorenzo_3d1l(EQ* quant, dim3 len3, dim3 stride3, FP ebx2, T* xdata); + +} // namespace delta_only + +} // namespace v1_pn + +} // namespace __kernel +} // namespace cuda +} // namespace psz + +//////////////////////////////////////////////////////////////////////////////// +// 1D definition + +template +__global__ void +psz::cuda::__kernel::v0::c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, T* outlier) +{ + namespace subr_v0 = psz::cuda::__device::v0; + + constexpr auto NTHREAD = BLOCK / SEQ; + + __shared__ struct { + union { + T data[BLOCK]; + T outlier[BLOCK]; + }; + EQ quant[BLOCK]; + } s; + + T prev{0}; + T thp_data[SEQ]; + + auto id_base = blockIdx.x * BLOCK; + + subr_v0::load_prequant_1d(data, len3.x, id_base, s.data, thp_data, prev, ebx2_r); + subr_v0::predict_quantize_1d(thp_data, s.quant, s.outlier, radius, prev); + subr_v0::predict_quantize_1d(thp_data, s.quant, s.outlier, radius); + subr_v0::write_1d(s.quant, s.outlier, len3.x, id_base, quant, outlier); +} + +template +__global__ void +psz::cuda::__kernel::v0::delta_only::c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* quant) +{ + namespace subr_v0 = psz::cuda::__device::v0; + + constexpr auto NTHREAD = BLOCK / SEQ; + + __shared__ struct { + union { + T data[BLOCK]; + T outlier[BLOCK]; + }; + EQ quant[BLOCK]; + } s; + + T prev{0}; + T thp_data[SEQ]; + + auto id_base = blockIdx.x * BLOCK; + + subr_v0::load_prequant_1d(data, len3.x, id_base, s.data, thp_data, prev, ebx2_r); + subr_v0::predict_quantize__no_outlier_1d(thp_data, s.quant, prev); + subr_v0::predict_quantize__no_outlier_1d(thp_data, s.quant); + subr_v0::write_1d(s.quant, nullptr, len3.x, id_base, quant, nullptr); +} + +template +__global__ void psz::cuda::__kernel::v0::compaction::c_lorenzo_1d1l( + T* data, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r, + EQ* quant, + Compaction outlier_desc) +{ + namespace subr_v0 = psz::cuda::__device::v0; + namespace subr_v0c = psz::cuda::__device::v0::compaction; + + constexpr auto NTHREAD = BLOCK / SEQ; + + __shared__ struct { + union { + T data[BLOCK]; + T outlier[BLOCK]; + }; + EQ quant[BLOCK]; + } s; + + T prev{0}; + T thp_data[SEQ]; + + auto id_base = blockIdx.x * BLOCK; + + subr_v0::load_prequant_1d(data, len3.x, id_base, s.data, thp_data, prev, ebx2_r); + subr_v0c::predict_quantize_1d(thp_data, s.quant, len3.x, radius, id_base, outlier_desc, prev); + subr_v0c::predict_quantize_1d(thp_data, s.quant, len3.x, radius, id_base, outlier_desc); + subr_v0::write_1d(s.quant, nullptr, len3.x, id_base, quant, nullptr); +} + +template +__global__ void psz::cuda::__kernel::v1_pn::compaction::c_lorenzo_1d1l( // + T* data, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r, + EQ* quant, + Compaction outlier) +{ + namespace subr_v0 = psz::cuda::__device::v0; + namespace subr_v1c = psz::cuda::__device::v1_pn::compaction; + + constexpr auto NTHREAD = BLOCK / SEQ; + + __shared__ struct { + union { + T data[BLOCK]; + T outlier[BLOCK]; + }; + EQ quant[BLOCK]; + } s; + + T prev{0}; + T thp_data[SEQ]; + + auto id_base = blockIdx.x * BLOCK; + + subr_v0::load_prequant_1d(data, len3.x, id_base, s.data, thp_data, prev, ebx2_r); + subr_v1c::predict_quantize_1d(thp_data, s.quant, s.outlier, radius, 
prev); + subr_v1c::predict_quantize_1d(thp_data, s.quant, s.outlier, radius); + subr_v0::write_1d(s.quant, s.outlier, len3.x, id_base, quant, outlier); +} + +template +__global__ void psz::cuda::__kernel::v0::x_lorenzo_1d1l( // + EQ* quant, + T* outlier, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2, + T* xdata) +{ + namespace subr_v0 = psz::cuda::__device::v0; + namespace wave32 = psz::cuda::__device::wave32; + + constexpr auto NTHREAD = BLOCK / SEQ; // equiv. to blockDim.x + + __shared__ struct { + union { + T outlier[BLOCK]; + T xdata[BLOCK]; + }; + // even if it's wave64, "/32" works + T exchange_in[NTHREAD / 32]; + T exchange_out[NTHREAD / 32]; + } s; + + T thp_data[SEQ]; + + auto id_base = blockIdx.x * BLOCK; + + subr_v0::load_fuse_1d(quant, outlier, len3.x, id_base, radius, s.xdata, thp_data); + subr_v0::block_scan_1d(thp_data, ebx2, s.exchange_in, s.exchange_out, s.xdata); + subr_v0::write_1d(s.xdata, nullptr, len3.x, id_base, xdata, nullptr); +} + +template +__global__ void psz::cuda::__kernel::v0::delta_only::x_lorenzo_1d1l( // + EQ* quant, + dim3 len3, + dim3 stride3, + FP ebx2, + T* xdata) +{ + namespace subr_v0 = psz::cuda::__device::v0; + + constexpr auto NTHREAD = BLOCK / SEQ; // equiv. to blockDim.x + + __shared__ struct { + T xdata[BLOCK]; + // even if it's wave64, "/32" works + T exchange_in[NTHREAD / 32]; + T exchange_out[NTHREAD / 32]; + } s; + + T thp_data[SEQ]; + + auto id_base = blockIdx.x * BLOCK; + + subr_v0::delta_only::load_1d(quant, len3.x, id_base, s.xdata, thp_data); + subr_v0::block_scan_1d(thp_data, ebx2, s.exchange_in, s.exchange_out, s.xdata); + subr_v0::write_1d(s.xdata, nullptr, len3.x, id_base, xdata, nullptr); +} + +//////////////////////////////////////////////////////////////////////////////// +// 2D definition + +template +__global__ void +psz::cuda::__kernel::v0::c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, T* outlier) +{ + namespace subr_v0 = psz::cuda::__device::v0; + + constexpr auto BLOCK = 16; + constexpr auto YSEQ = 8; + + T center[YSEQ + 1] = {0}; // NW N first element <- 0 + // W center + + auto gix = blockIdx.x * BLOCK + threadIdx.x; // BDX == BLOCK == 16 + auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 + + subr_v0::load_prequant_2d(data, len3.x, gix, len3.y, giy_base, stride3.y, ebx2_r, center); + subr_v0::predict_2d(center); + subr_v0::quantize_write_2d(center, len3.x, gix, len3.y, giy_base, stride3.y, radius, quant, outlier); +} + +template +__global__ void +psz::cuda::__kernel::v0::delta_only::c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* quant) +{ + namespace subr_v0 = psz::cuda::__device::v0; + + constexpr auto BLOCK = 16; + constexpr auto YSEQ = 8; + + T center[YSEQ + 1] = {0}; // NW N first element <- 0 + // W center + + auto gix = blockIdx.x * BLOCK + threadIdx.x; // BDX == BLOCK == 16 + auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 + + subr_v0::load_prequant_2d(data, len3.x, gix, len3.y, giy_base, stride3.y, ebx2_r, center); + subr_v0::predict_2d(center); + subr_v0::delta_only::quantize_write_2d(center, len3.x, gix, len3.y, giy_base, stride3.y, quant); +} + +template +__global__ void +psz::cuda::__kernel::v1_pn::delta_only::c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* quant) +{ + namespace subr_v0 = psz::cuda::__device::v0; + namespace subr_v1d = psz::cuda::__device::v1_pn::delta_only; + + constexpr auto BLOCK = 16; + constexpr auto YSEQ = 8; + + T center[YSEQ + 1] = {0}; 
// NW N first element <- 0 + // W center + + auto gix = blockIdx.x * BLOCK + threadIdx.x; // BDX == BLOCK == 16 + auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 + + subr_v0::load_prequant_2d(data, len3.x, gix, len3.y, giy_base, stride3.y, ebx2_r, center); + subr_v0::predict_2d(center); + subr_v1d::quantize_write_2d(center, len3.x, gix, len3.y, giy_base, stride3.y, quant); +} + +template +__global__ void psz::cuda::__kernel::v0::compaction::c_lorenzo_2d1l( + T* data, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r, + EQ* quant, + Compaction outlier) +{ + namespace subr_v0 = psz::cuda::__device::v0; + + constexpr auto BLOCK = 16; + constexpr auto YSEQ = 8; + + T center[YSEQ + 1] = {0}; // NW N first element <- 0 + // W center + + auto gix = blockIdx.x * BLOCK + threadIdx.x; // BDX == BLOCK == 16 + auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 + + subr_v0::load_prequant_2d(data, len3.x, gix, len3.y, giy_base, stride3.y, ebx2_r, center); + subr_v0::predict_2d(center); + subr_v0::compaction::quantize_write_2d( + center, len3.x, gix, len3.y, giy_base, stride3.y, radius, quant, outlier); +} + +// 16x16 data block maps to 16x2 (one warp) thread block +template +__global__ void psz::cuda::__kernel::v0::x_lorenzo_2d1l( // + EQ* quant, + T* outlier, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2, + T* xdata) +{ + namespace subr_v0 = psz::cuda::__device::v0; + + constexpr auto BLOCK = 16; + constexpr auto YSEQ = BLOCK / 2; // sequentiality in y direction + static_assert(BLOCK == 16, "In one case, we need BLOCK for 2D == 16"); + + __shared__ T intermediate[BLOCK]; // TODO use warp shuffle to eliminate this + T thread_private[YSEQ]; + + auto gix = blockIdx.x * BLOCK + threadIdx.x; + auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 + + auto get_gid = [&](auto i) { return (giy_base + i) * stride3.y + gix; }; + + subr_v0::load_fuse_2d( + quant, outlier, len3.x, gix, len3.y, giy_base, stride3.y, radius, thread_private); + subr_v0::block_scan_2d(thread_private, intermediate, ebx2); + subr_v0::decomp_write_2d(thread_private, len3.x, gix, len3.y, giy_base, stride3.y, xdata); +} + +// 16x16 data block maps to 16x2 (one warp) thread block +template +__global__ void psz::cuda::__kernel::v1_pn::x_lorenzo_2d1l( // + EQ* quant, + T* outlier, + dim3 len3, + dim3 stride3, + FP ebx2, + T* xdata) +{ + namespace subr_v0 = psz::cuda::__device::v0; + namespace subr_v1_pn = psz::cuda::__device::v1_pn; + + constexpr auto BLOCK = 16; + constexpr auto YSEQ = BLOCK / 2; // sequentiality in y direction + static_assert(BLOCK == 16, "In one case, we need BLOCK for 2D == 16"); + + __shared__ T intermediate[BLOCK]; // TODO use warp shuffle to eliminate this + T thread_private[YSEQ]; + + auto gix = blockIdx.x * BLOCK + threadIdx.x; + auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 + + auto get_gid = [&](auto i) { return (giy_base + i) * stride3.y + gix; }; + + subr_v1_pn::load_fuse_2d(quant, outlier, len3.x, gix, len3.y, giy_base, stride3.y, thread_private); + subr_v0::block_scan_2d(thread_private, intermediate, ebx2); + subr_v0::decomp_write_2d(thread_private, len3.x, gix, len3.y, giy_base, stride3.y, xdata); +} + +// 16x16 data block maps to 16x2 (one warp) thread block +template +__global__ void psz::cuda::__kernel::v0::delta_only::x_lorenzo_2d1l( // + EQ* quant, + dim3 len3, + dim3 stride3, + FP ebx2, + T* xdata) +{ + namespace subr_v0 = psz::cuda::__device::v0; + + 
constexpr auto BLOCK = 16; + constexpr auto YSEQ = BLOCK / 2; // sequentiality in y direction + static_assert(BLOCK == 16, "In one case, we need BLOCK for 2D == 16"); + + __shared__ T intermediate[BLOCK]; // TODO use warp shuffle to eliminate this + T thread_private[YSEQ]; + + auto gix = blockIdx.x * BLOCK + threadIdx.x; + auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 + + auto get_gid = [&](auto i) { return (giy_base + i) * stride3.y + gix; }; + + subr_v0::delta_only::load_2d(quant, len3.x, gix, len3.y, giy_base, stride3.y, thread_private); + subr_v0::block_scan_2d(thread_private, intermediate, ebx2); + subr_v0::decomp_write_2d(thread_private, len3.x, gix, len3.y, giy_base, stride3.y, xdata); +} + +// 16x16 data block maps to 16x2 (one warp) thread block +template +__global__ void psz::cuda::__kernel::v1_pn::delta_only::x_lorenzo_2d1l( // + EQ* quant, + dim3 len3, + dim3 stride3, + FP ebx2, + T* xdata) +{ + namespace subr_v0 = psz::cuda::__device::v0; + namespace subr_v1_pn = psz::cuda::__device::v1_pn; + + constexpr auto BLOCK = 16; + constexpr auto YSEQ = BLOCK / 2; // sequentiality in y direction + static_assert(BLOCK == 16, "In one case, we need BLOCK for 2D == 16"); + + __shared__ T intermediate[BLOCK]; // TODO use warp shuffle to eliminate this + T thread_private[YSEQ]; + + auto gix = blockIdx.x * BLOCK + threadIdx.x; + auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 + + auto get_gid = [&](auto i) { return (giy_base + i) * stride3.y + gix; }; + + subr_v1_pn::delta_only::load_2d(quant, len3.x, gix, len3.y, giy_base, stride3.y, thread_private); + subr_v0::block_scan_2d(thread_private, intermediate, ebx2); + subr_v0::decomp_write_2d(thread_private, len3.x, gix, len3.y, giy_base, stride3.y, xdata); +} + +template +__global__ void psz::cuda::__kernel::v0::legacy::c_lorenzo_3d1l( + T* data, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r, + EQ* quant, + T* outlier) +{ + constexpr auto BLOCK = 8; + __shared__ T s[8][8][32]; + + auto z = threadIdx.z; + + auto gix = blockIdx.x * (BLOCK * 4) + threadIdx.x; + auto giy_base = blockIdx.y * BLOCK; + auto giz = blockIdx.z * BLOCK + z; + auto base_id = gix + giy_base * stride3.y + giz * stride3.z; + + auto giy = [&](auto y) { return giy_base + y; }; + auto gid = [&](auto y) { return base_id + y * stride3.y; }; + + auto load_prequant_3d = [&]() { + if (gix < len3.x and giz < len3.z) { + for (auto y = 0; y < BLOCK; y++) + if (giy(y) < len3.y) s[z][y][threadIdx.x] = round(data[gid(y)] * ebx2_r); // prequant (fp presence) + } + __syncthreads(); + }; + + auto quantize_write = [&](T delta, auto x, auto y, auto z, auto gid) { + bool quantizable = fabs(delta) < radius; + T candidate = delta + radius; + if (x < len3.x and y < len3.y and z < len3.z) { + quant[gid] = quantizable * static_cast(candidate); + outlier[gid] = (not quantizable) * candidate; + } + }; + + auto x = threadIdx.x % 8; + + auto predict_3d = [&](auto y) { + T delta = s[z][y][threadIdx.x] - // + ((z > 0 and y > 0 and x > 0 ? s[z - 1][y - 1][threadIdx.x - 1] : 0) // dist=3 + - (y > 0 and x > 0 ? s[z][y - 1][threadIdx.x - 1] : 0) // dist=2 + - (z > 0 and x > 0 ? s[z - 1][y][threadIdx.x - 1] : 0) // + - (z > 0 and y > 0 ? s[z - 1][y - 1][threadIdx.x] : 0) // + + (x > 0 ? s[z][y][threadIdx.x - 1] : 0) // dist=1 + + (y > 0 ? s[z][y - 1][threadIdx.x] : 0) // + + (z > 0 ? 
s[z - 1][y][threadIdx.x] : 0)); // + return delta; + }; + + //////////////////////////////////////////////////////////////////////////// + + load_prequant_3d(); + for (auto y = 0; y < BLOCK; y++) { + auto delta = predict_3d(y); + quantize_write(delta, gix, giy(y), giz, gid(y)); + } +} + +template +__global__ void +psz::cuda::__kernel::v0::c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, T* outlier) +{ + constexpr auto BLOCK = 8; + __shared__ T s[9][33]; + T delta[BLOCK + 1] = {0}; // first el = 0 + + const auto gix = blockIdx.x * (BLOCK * 4) + threadIdx.x; + const auto giy = blockIdx.y * BLOCK + threadIdx.y; + const auto giz_base = blockIdx.z * BLOCK; + const auto base_id = gix + giy * stride3.y + giz_base * stride3.z; + + auto giz = [&](auto z) { return giz_base + z; }; + auto gid = [&](auto z) { return base_id + z * stride3.z; }; + + auto load_prequant_3d = [&]() { + if (gix < len3.x and giy < len3.y) { + for (auto z = 0; z < BLOCK; z++) + if (giz(z) < len3.z) delta[z + 1] = round(data[gid(z)] * ebx2_r); // prequant (fp presence) + } + __syncthreads(); + }; + + auto quantize_write = [&](T delta, auto x, auto y, auto z, auto gid) { + bool quantizable = fabs(delta) < radius; + T candidate = delta + radius; + if (x < len3.x and y < len3.y and z < len3.z) { + quant[gid] = quantizable * static_cast(candidate); + outlier[gid] = (not quantizable) * candidate; + } + }; + + //////////////////////////////////////////////////////////////////////////// + + /* z-direction, sequential in private buffer + delta = + (s[z][y][x] - s[z-1][y][x]) + - (s[z][y][x-1] - s[z-1][y][x-1]) + + (s[z][y-1][x-1] - s[z-1][y-1][x-1]) + - (s[z][y-1][x] - s[z-1][y-1][x]) + + x-direction, shuffle + delta = + (s[z][y][x] - s[z][y][x-1]) + - (s[z][y-1][x] - s[z][y-1][x-1]) + + y-direction, shmem + delta = s[z][y][x] - s[z][y-1][x] + */ + + load_prequant_3d(); + + for (auto z = BLOCK; z > 0; z--) { + // z-direction + delta[z] -= delta[z - 1]; + + // x-direction + auto prev_x = __shfl_up_sync(0xffffffff, delta[z], 1, 8); + if (threadIdx.x % BLOCK > 0) delta[z] -= prev_x; + + // y-direction, exchange via shmem + // ghost padding along y + s[threadIdx.y + 1][threadIdx.x] = delta[z]; + __syncthreads(); + + delta[z] -= (threadIdx.y > 0) * s[threadIdx.y][threadIdx.x]; + + // now delta[z] is delta + quantize_write(delta[z], gix, giy, giz(z - 1), gid(z - 1)); + } +} + +template +__global__ void psz::cuda::__kernel::v0::delta_only::c_lorenzo_3d1l( // + T* data, + dim3 len3, + dim3 stride3, + FP ebx2_r, + EQ* quant) +{ + constexpr auto BLOCK = 8; + __shared__ T s[9][33]; + T delta[BLOCK + 1] = {0}; // first el = 0 + + const auto gix = blockIdx.x * (BLOCK * 4) + threadIdx.x; + const auto giy = blockIdx.y * BLOCK + threadIdx.y; + const auto giz_base = blockIdx.z * BLOCK; + const auto base_id = gix + giy * stride3.y + giz_base * stride3.z; + + auto giz = [&](auto z) { return giz_base + z; }; + auto gid = [&](auto z) { return base_id + z * stride3.z; }; + + auto load_prequant_3d = [&]() { + if (gix < len3.x and giy < len3.y) { + for (auto z = 0; z < BLOCK; z++) + if (giz(z) < len3.z) delta[z + 1] = round(data[gid(z)] * ebx2_r); // prequant (fp presence) + } + __syncthreads(); + }; + + auto quantize_write = [&](T delta, auto x, auto y, auto z, auto gid) { + if (x < len3.x and y < len3.y and z < len3.z) quant[gid] = static_cast(delta); + }; + + //////////////////////////////////////////////////////////////////////////// + + load_prequant_3d(); + + for (auto z = BLOCK; z > 0; z--) { + // z-direction + 
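+ // (same three-pass factorization as the radius-clipped c_lorenzo_3d1l above:
+ // sequential difference in registers along z, warp shuffle along x,
+ // shared-memory exchange along y; only the write step differs, storing the
+ // raw signed delta with no outlier separation)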
delta[z] -= delta[z - 1]; + + // x-direction + auto prev_x = __shfl_up_sync(0xffffffff, delta[z], 1, 8); + if (threadIdx.x % BLOCK > 0) delta[z] -= prev_x; + + // y-direction, exchange via shmem + // ghost padding along y + s[threadIdx.y + 1][threadIdx.x] = delta[z]; + __syncthreads(); + + delta[z] -= (threadIdx.y > 0) * s[threadIdx.y][threadIdx.x]; + + // now delta[z] is delta + quantize_write(delta[z], gix, giy, giz(z - 1), gid(z - 1)); + } +} + +template +__global__ void psz::cuda::__kernel::v1_pn::delta_only::c_lorenzo_3d1l( // + T* data, + dim3 len3, + dim3 stride3, + FP ebx2_r, + EQ* quant) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename psz::typing::Int::T; + + constexpr auto BLOCK = 8; + __shared__ T s[9][33]; + T delta[BLOCK + 1] = {0}; // first el = 0 + + const auto gix = blockIdx.x * (BLOCK * 4) + threadIdx.x; + const auto giy = blockIdx.y * BLOCK + threadIdx.y; + const auto giz_base = blockIdx.z * BLOCK; + const auto base_id = gix + giy * stride3.y + giz_base * stride3.z; + + auto giz = [&](auto z) { return giz_base + z; }; + auto gid = [&](auto z) { return base_id + z * stride3.z; }; + + auto load_prequant_3d = [&]() { + if (gix < len3.x and giy < len3.y) { + for (auto z = 0; z < BLOCK; z++) + if (giz(z) < len3.z) delta[z + 1] = round(data[gid(z)] * ebx2_r); // prequant (fp presence) + } + __syncthreads(); + }; + + auto quantize_write = [&](T delta, auto x, auto y, auto z, auto gid) { + if (x < len3.x and y < len3.y and z < len3.z) quant[gid] = PN::encode(static_cast(delta)); + }; + + //////////////////////////////////////////////////////////////////////////// + + load_prequant_3d(); + + for (auto z = BLOCK; z > 0; z--) { + // z-direction + delta[z] -= delta[z - 1]; + + // x-direction + auto prev_x = __shfl_up_sync(0xffffffff, delta[z], 1, 8); + if (threadIdx.x % BLOCK > 0) delta[z] -= prev_x; + + // y-direction, exchange via shmem + // ghost padding along y + s[threadIdx.y + 1][threadIdx.x] = delta[z]; + __syncthreads(); + + delta[z] -= (threadIdx.y > 0) * s[threadIdx.y][threadIdx.x]; + + // now delta[z] is delta + quantize_write(delta[z], gix, giy, giz(z - 1), gid(z - 1)); + } +} + +template +__global__ void psz::cuda::__kernel::v0::compaction::c_lorenzo_3d1l( + T* data, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r, + EQ* quant, + Compaction outlier) +{ + constexpr auto BLOCK = 8; + __shared__ T s[9][33]; + T delta[BLOCK + 1] = {0}; // first el = 0 + + const auto gix = blockIdx.x * (BLOCK * 4) + threadIdx.x; + const auto giy = blockIdx.y * BLOCK + threadIdx.y; + const auto giz_base = blockIdx.z * BLOCK; + const auto base_id = gix + giy * stride3.y + giz_base * stride3.z; + + auto giz = [&](auto z) { return giz_base + z; }; + auto gid = [&](auto z) { return base_id + z * stride3.z; }; + + auto load_prequant_3d = [&]() { + if (gix < len3.x and giy < len3.y) { + for (auto z = 0; z < BLOCK; z++) + if (giz(z) < len3.z) delta[z + 1] = round(data[gid(z)] * ebx2_r); // prequant (fp presence) + } + __syncthreads(); + }; + + auto quantize_compact_write = [&](T delta, auto x, auto y, auto z, auto gid) { + bool quantizable = fabs(delta) < radius; + T candidate = delta + radius; + if (x < len3.x and y < len3.y and z < len3.z) { + quant[gid] = quantizable * static_cast(candidate); + if (not quantizable) { + auto cur_idx = atomicAdd(outlier.count, 1); + outlier.idx[cur_idx] = gid; + outlier.val[cur_idx] = candidate; + } + } + }; + + //////////////////////////////////////////////////////////////////////////// + + load_prequant_3d(); + + for 
(auto z = BLOCK; z > 0; z--) { + // z-direction + delta[z] -= delta[z - 1]; + + // x-direction + auto prev_x = __shfl_up_sync(0xffffffff, delta[z], 1, 8); + if (threadIdx.x % BLOCK > 0) delta[z] -= prev_x; + + // y-direction, exchange via shmem + // ghost padding along y + s[threadIdx.y + 1][threadIdx.x] = delta[z]; + __syncthreads(); + + delta[z] -= (threadIdx.y > 0) * s[threadIdx.y][threadIdx.x]; + + // now delta[z] is delta + quantize_compact_write(delta[z], gix, giy, giz(z - 1), gid(z - 1)); + } +} + +template +__global__ void psz::cuda::__kernel::v1_pn::compaction::c_lorenzo_3d1l( + T* data, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r, + EQ* quant, + Compaction outlier) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename psz::typing::Int::T; + + constexpr auto BLOCK = 8; + __shared__ T s[9][33]; + T delta[BLOCK + 1] = {0}; // first el = 0 + + const auto gix = blockIdx.x * (BLOCK * 4) + threadIdx.x; + const auto giy = blockIdx.y * BLOCK + threadIdx.y; + const auto giz_base = blockIdx.z * BLOCK; + const auto base_id = gix + giy * stride3.y + giz_base * stride3.z; + + auto giz = [&](auto z) { return giz_base + z; }; + auto gid = [&](auto z) { return base_id + z * stride3.z; }; + + // TODO move to subroutine.inl + auto load_prequant_3d = [&]() { + if (gix < len3.x and giy < len3.y) { + for (auto z = 0; z < BLOCK; z++) + if (giz(z) < len3.z) delta[z + 1] = round(data[gid(z)] * ebx2_r); // prequant (fp presence) + } + __syncthreads(); + }; + + auto quantize_compact_write = [&](T delta, auto x, auto y, auto z, auto gid) { + bool quantizable = fabs(delta) < radius; + UI UI_delta = PN::encode(static_cast(delta)); + + T candidate = delta + radius; + if (x < len3.x and y < len3.y and z < len3.z) { + quant[gid] = quantizable * UI_delta; + if (not quantizable) { + auto cur_idx = atomicAdd(outlier.count, 1); + outlier.idx[cur_idx] = gid; + outlier.val[cur_idx] = UI_delta; + } + } + }; + + //////////////////////////////////////////////////////////////////////////// + + load_prequant_3d(); + + for (auto z = BLOCK; z > 0; z--) { + // z-direction + delta[z] -= delta[z - 1]; + + // x-direction + auto prev_x = __shfl_up_sync(0xffffffff, delta[z], 1, 8); + if (threadIdx.x % BLOCK > 0) delta[z] -= prev_x; + + // y-direction, exchange via shmem + // ghost padding along y + s[threadIdx.y + 1][threadIdx.x] = delta[z]; + __syncthreads(); + + delta[z] -= (threadIdx.y > 0) * s[threadIdx.y][threadIdx.x]; + + // now delta[z] is delta + quantize_compact_write(delta[z], gix, giy, giz(z - 1), gid(z - 1)); + } +} + +// 32x8x8 data block maps to 32x1x8 thread block +template +__global__ void psz::cuda::__kernel::v0::x_lorenzo_3d1l( // + EQ* quant, + T* outlier, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2, + T* xdata) +{ + constexpr auto BLOCK = 8; + constexpr auto YSEQ = BLOCK; + static_assert(BLOCK == 8, "In one case, we need BLOCK for 3D == 8"); + + __shared__ T intermediate[BLOCK][4][8]; + T thread_private[YSEQ]; + + auto seg_id = threadIdx.x / 8; + auto seg_tix = threadIdx.x % 8; + + auto gix = blockIdx.x * (4 * BLOCK) + threadIdx.x; + auto giy_base = blockIdx.y * BLOCK; + auto giy = [&](auto y) { return giy_base + y; }; + auto giz = blockIdx.z * BLOCK + threadIdx.z; + auto gid = [&](auto y) { return giz * stride3.z + (giy_base + y) * stride3.y + gix; }; + + auto load_fuse_3d = [&]() { + // load to thread-private array (fuse at the same time) +#pragma unroll + for (auto y = 0; y < YSEQ; y++) { + if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) + 
thread_private[y] = outlier[gid(y)] + static_cast(quant[gid(y)]) - radius; // fuse + else + thread_private[y] = 0; + } + }; + + auto block_scan_3d = [&]() { + // partial-sum along y-axis, sequentially + for (auto y = 1; y < YSEQ; y++) thread_private[y] += thread_private[y - 1]; + +#pragma unroll + for (auto i = 0; i < BLOCK; i++) { + // ND partial-sums along x- and z-axis + // in-warp shuffle used: in order to perform, it's transposed after X-partial sum + T val = thread_private[i]; + + for (auto dist = 1; dist < BLOCK; dist *= 2) { + auto addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + // x-z transpose + intermediate[threadIdx.z][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][threadIdx.z]; + __syncthreads(); + + for (auto dist = 1; dist < BLOCK; dist *= 2) { + auto addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + intermediate[threadIdx.z][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][threadIdx.z]; + __syncthreads(); + + thread_private[i] = val; + } + }; + + auto decomp_write_3d = [&]() { +#pragma unroll + for (auto y = 0; y < YSEQ; y++) + if (gix < len3.x and giy(y) < len3.y and giz < len3.z) xdata[gid(y)] = thread_private[y] * ebx2; + }; + + //////////////////////////////////////////////////////////////////////////// + load_fuse_3d(); + block_scan_3d(); + decomp_write_3d(); +} + +// 32x8x8 data block maps to 32x1x8 thread block +template +__global__ void psz::cuda::__kernel::v1_pn::x_lorenzo_3d1l( // + EQ* quant, + T* outlier, + dim3 len3, + dim3 stride3, + FP ebx2, + T* xdata) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename psz::typing::Int::T; + + constexpr auto BLOCK = 8; + constexpr auto YSEQ = BLOCK; + static_assert(BLOCK == 8, "In one case, we need BLOCK for 3D == 8"); + + __shared__ T intermediate[BLOCK][4][8]; + T thread_private[YSEQ]; + + auto seg_id = threadIdx.x / 8; + auto seg_tix = threadIdx.x % 8; + + auto gix = blockIdx.x * (4 * BLOCK) + threadIdx.x; + auto giy_base = blockIdx.y * BLOCK; + auto giy = [&](auto y) { return giy_base + y; }; + auto giz = blockIdx.z * BLOCK + threadIdx.z; + auto gid = [&](auto y) { return giz * stride3.z + (giy_base + y) * stride3.y + gix; }; + + auto load_fuse_3d = [&]() { + // load to thread-private array (fuse at the same time) +#pragma unroll + for (auto y = 0; y < YSEQ; y++) { + if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) + thread_private[y] = outlier[gid(y)] + PN::decode(quant[gid(y)]); // fuse + else + thread_private[y] = 0; + } + }; + + auto block_scan_3d = [&]() { + // partial-sum along y-axis, sequentially + for (auto y = 1; y < YSEQ; y++) thread_private[y] += thread_private[y - 1]; + +#pragma unroll + for (auto i = 0; i < BLOCK; i++) { + // ND partial-sums along x- and z-axis + // in-warp shuffle used: in order to perform, it's transposed after X-partial sum + T val = thread_private[i]; + + for (auto dist = 1; dist < BLOCK; dist *= 2) { + auto addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + // x-z transpose + intermediate[threadIdx.z][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][threadIdx.z]; + __syncthreads(); + + for (auto dist = 1; dist < BLOCK; dist *= 2) { + auto addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + intermediate[threadIdx.z][seg_id][seg_tix] = val; + __syncthreads(); + 
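+ // the second pass through shared memory undoes the earlier x-z swap, so the
+ // value returns to its original orientation before being stored back to
+ // thread_private[i]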
val = intermediate[seg_tix][seg_id][threadIdx.z]; + __syncthreads(); + + thread_private[i] = val; + } + }; + + auto decomp_write_3d = [&]() { +#pragma unroll + for (auto y = 0; y < YSEQ; y++) + if (gix < len3.x and giy(y) < len3.y and giz < len3.z) xdata[gid(y)] = thread_private[y] * ebx2; + }; + + //////////////////////////////////////////////////////////////////////////// + load_fuse_3d(); + block_scan_3d(); + decomp_write_3d(); +} + +// 32x8x8 data block maps to 32x1x8 thread block +template +__global__ void psz::cuda::__kernel::v0::delta_only::x_lorenzo_3d1l( // + EQ* quant, + dim3 len3, + dim3 stride3, + FP ebx2, + T* xdata) +{ + constexpr auto BLOCK = 8; + constexpr auto YSEQ = BLOCK; + static_assert(BLOCK == 8, "In one case, we need BLOCK for 3D == 8"); + + __shared__ T intermediate[BLOCK][4][8]; + T thread_private[YSEQ]; + + auto seg_id = threadIdx.x / 8; + auto seg_tix = threadIdx.x % 8; + + auto gix = blockIdx.x * (4 * BLOCK) + threadIdx.x; + auto giy_base = blockIdx.y * BLOCK; + auto giy = [&](auto y) { return giy_base + y; }; + auto giz = blockIdx.z * BLOCK + threadIdx.z; + auto gid = [&](auto y) { return giz * stride3.z + (giy_base + y) * stride3.y + gix; }; + + auto load_3d = [&]() { + // load to thread-private array (fuse at the same time) +#pragma unroll + for (auto y = 0; y < YSEQ; y++) { + if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) + thread_private[y] = static_cast(quant[gid(y)]); // fuse + else + thread_private[y] = 0; + } + }; + + auto block_scan_3d = [&]() { + // partial-sum along y-axis, sequentially + for (auto y = 1; y < YSEQ; y++) thread_private[y] += thread_private[y - 1]; + +#pragma unroll + for (auto i = 0; i < BLOCK; i++) { + // ND partial-sums along x- and z-axis + // in-warp shuffle used: in order to perform, it's transposed after X-partial sum + T val = thread_private[i]; + + for (auto dist = 1; dist < BLOCK; dist *= 2) { + auto addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + // x-z transpose + intermediate[threadIdx.z][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][threadIdx.z]; + __syncthreads(); + + for (auto dist = 1; dist < BLOCK; dist *= 2) { + auto addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + intermediate[threadIdx.z][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][threadIdx.z]; + __syncthreads(); + + thread_private[i] = val; + } + }; + + auto decomp_write_3d = [&]() { +#pragma unroll + for (auto y = 0; y < YSEQ; y++) + if (gix < len3.x and giy(y) < len3.y and giz < len3.z) xdata[gid(y)] = thread_private[y] * ebx2; + }; + + //////////////////////////////////////////////////////////////////////////// + load_3d(); + block_scan_3d(); + decomp_write_3d(); +} diff --git a/qtensor/compression/cusz/src/kernel/detail/lorenzo_proto.inl b/qtensor/compression/cusz/src/kernel/detail/lorenzo_proto.inl index 2ed25984..5a317a60 100644 --- a/qtensor/compression/cusz/src/kernel/detail/lorenzo_proto.inl +++ b/qtensor/compression/cusz/src/kernel/detail/lorenzo_proto.inl @@ -1,214 +1,214 @@ -/** - * @file lorenzo_proto.inl - * @author Jiannan Tian - * @brief (prototype) Dual-EQ Lorenzo method. 
- * @version 0.2 - * @date 2021-01-16 - * (create) 2019-09-23; (release) 2020-09-20; (rev1) 2021-01-16; (rev2) 2021-02-20; (rev3) 2021-04-11 - * (rev4) 2021-04-30 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#ifndef CUSZ_KERNEL_LORENZO_PROTOTYPE_CUH -#define CUSZ_KERNEL_LORENZO_PROTOTYPE_CUH - -#include -#include - -#include "utils/cuda_err.cuh" -#include "utils/timer.h" - -namespace psz { - -namespace cuda { -namespace __kernel { - -namespace prototype { // easy algorithmic description - -template -__global__ void c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* eq, T* outlier) -{ - __shared__ T buf[BLK]; - - auto id = blockIdx.x * blockDim.x + threadIdx.x; - if (id < len3.x) { - buf[threadIdx.x] = round(data[id] * ebx2_r); // prequant (fp presence) - } - __syncthreads(); // necessary to ensure correctness - - T delta = buf[threadIdx.x] - (threadIdx.x == 0 ? 0 : buf[threadIdx.x - 1]); - - bool quantizable = fabs(delta) < radius; - T candidate = delta + radius; - if (id < len3.x) { // postquant - data[id] = (1 - quantizable) * candidate; // output; reuse data for outlier - eq[id] = quantizable * static_cast(candidate); - } -} - -template -__global__ void c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* eq, T* outlier) -{ - __shared__ T buf[BLK][BLK + 1]; - - auto y = threadIdx.y, x = threadIdx.x; - auto giy = blockIdx.y * blockDim.y + y, gix = blockIdx.x * blockDim.x + x; - - auto id = gix + giy * stride3.y; // low to high dim, inner to outer - if (gix < len3.x and giy < len3.y) { - buf[y][x] = round(data[id] * ebx2_r); // prequant (fp presence) - } - __syncthreads(); // necessary to ensure correctness - - T delta = buf[y][x] - ((x > 0 ? buf[y][x - 1] : 0) + // dist=1 - (y > 0 ? buf[y - 1][x] : 0) - // dist=1 - (x > 0 and y > 0 ? buf[y - 1][x - 1] : 0)); // dist=2 - - bool quantizable = fabs(delta) < radius; - T candidate = delta + radius; - if (gix < len3.x and giy < len3.y) { - data[id] = (1 - quantizable) * candidate; // output; reuse data for outlier - eq[id] = quantizable * static_cast(candidate); - } -} - -template -__global__ void c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* eq, T* outlier) -{ - __shared__ T buf[BLK][BLK][BLK + 1]; - - auto z = threadIdx.z, y = threadIdx.y, x = threadIdx.x; - auto giz = blockIdx.z * blockDim.z + z, giy = blockIdx.y * blockDim.y + y, gix = blockIdx.x * blockDim.x + x; - - auto id = gix + giy * stride3.y + giz * stride3.z; // low to high in dim, inner to outer - if (gix < len3.x and giy < len3.y and giz < len3.z) { - buf[z][y][x] = round(data[id] * ebx2_r); // prequant (fp presence) - } - __syncthreads(); // necessary to ensure correctness - - T delta = buf[z][y][x] - ((z > 0 and y > 0 and x > 0 ? buf[z - 1][y - 1][x - 1] : 0) // dist=3 - - (y > 0 and x > 0 ? buf[z][y - 1][x - 1] : 0) // dist=2 - - (z > 0 and x > 0 ? buf[z - 1][y][x - 1] : 0) // - - (z > 0 and y > 0 ? buf[z - 1][y - 1][x] : 0) // - + (x > 0 ? buf[z][y][x - 1] : 0) // dist=1 - + (y > 0 ? buf[z][y - 1][x] : 0) // - + (z > 0 ? 
buf[z - 1][y][x] : 0)); // - - bool quantizable = fabs(delta) < radius; - T candidate = delta + radius; - if (gix < len3.x and giy < len3.y and giz < len3.z) { - data[id] = (1 - quantizable) * candidate; // output; reuse data for outlier - eq[id] = quantizable * static_cast(candidate); - } -} - -template -__global__ void x_lorenzo_1d1l(EQ* eq, T* scattered_outlier, dim3 len3, dim3 stride3, int radius, FP ebx2, T* xdata) -{ - __shared__ T buf[BLK]; - - auto id = blockIdx.x * blockDim.x + threadIdx.x; - - if (id < len3.x) - buf[threadIdx.x] = scattered_outlier[id] + static_cast(eq[id]) - radius; // fuse - else - buf[threadIdx.x] = 0; - __syncthreads(); - - for (auto d = 1; d < BLK; d *= 2) { - T n = 0; - if (threadIdx.x >= d) n = buf[threadIdx.x - d]; // like __shfl_up_sync(0x1f, var, d); warp_sync - __syncthreads(); - if (threadIdx.x >= d) buf[threadIdx.x] += n; - __syncthreads(); - } - - if (id < len3.x) { xdata[id] = buf[threadIdx.x] * ebx2; } -} - -template -__global__ void x_lorenzo_2d1l(EQ* eq, T* scattered_outlier, dim3 len3, dim3 stride3, int radius, FP ebx2, T* xdata) -{ - __shared__ T buf[BLK][BLK + 1]; - - auto giy = blockIdx.y * blockDim.y + threadIdx.y, gix = blockIdx.x * blockDim.x + threadIdx.x; - size_t id = gix + giy * stride3.y; - - if (gix < len3.x and giy < len3.y) - buf[threadIdx.y][threadIdx.x] = scattered_outlier[id] + static_cast(eq[id]) - radius; // fuse - else - buf[threadIdx.y][threadIdx.x] = 0; - __syncthreads(); - - for (auto d = 1; d < BLK; d *= 2) { - T n = 0; - if (threadIdx.x >= d) n = buf[threadIdx.y][threadIdx.x - d]; - __syncthreads(); - if (threadIdx.x >= d) buf[threadIdx.y][threadIdx.x] += n; - __syncthreads(); - } - - for (auto d = 1; d < BLK; d *= 2) { - T n = 0; - if (threadIdx.y >= d) n = buf[threadIdx.y - d][threadIdx.x]; - __syncthreads(); - if (threadIdx.y >= d) buf[threadIdx.y][threadIdx.x] += n; - __syncthreads(); - } - - if (gix < len3.x and giy < len3.y) { xdata[id] = buf[threadIdx.y][threadIdx.x] * ebx2; } -} - -template -__global__ void x_lorenzo_3d1l(EQ* eq, T* scattered_outlier, dim3 len3, dim3 stride3, int radius, FP ebx2, T* xdata) -{ - __shared__ T buf[BLK][BLK][BLK + 1]; - - auto giz = blockIdx.z * BLK + threadIdx.z, giy = blockIdx.y * BLK + threadIdx.y, - gix = blockIdx.x * BLK + threadIdx.x; - size_t id = gix + giy * stride3.y + giz * stride3.z; // low to high in dim, inner to outer - - if (gix < len3.x and giy < len3.y and giz < len3.z) - buf[threadIdx.z][threadIdx.y][threadIdx.x] = scattered_outlier[id] + static_cast(eq[id]) - radius; // id - else - buf[threadIdx.z][threadIdx.y][threadIdx.x] = 0; - __syncthreads(); - - for (auto dist = 1; dist < BLK; dist *= 2) { - T addend = 0; - if (threadIdx.x >= dist) addend = buf[threadIdx.z][threadIdx.y][threadIdx.x - dist]; - __syncthreads(); - if (threadIdx.x >= dist) buf[threadIdx.z][threadIdx.y][threadIdx.x] += addend; - __syncthreads(); - } - - for (auto dist = 1; dist < BLK; dist *= 2) { - T addend = 0; - if (threadIdx.y >= dist) addend = buf[threadIdx.z][threadIdx.y - dist][threadIdx.x]; - __syncthreads(); - if (threadIdx.y >= dist) buf[threadIdx.z][threadIdx.y][threadIdx.x] += addend; - __syncthreads(); - } - - for (auto dist = 1; dist < BLK; dist *= 2) { - T addend = 0; - if (threadIdx.z >= dist) addend = buf[threadIdx.z - dist][threadIdx.y][threadIdx.x]; - __syncthreads(); - if (threadIdx.z >= dist) buf[threadIdx.z][threadIdx.y][threadIdx.x] += addend; - __syncthreads(); - } - - if (gix < len3.x and giy < len3.y and giz < len3.z) { - xdata[id] = 
buf[threadIdx.z][threadIdx.y][threadIdx.x] * ebx2; - } -} - -} // namespace prototype -} // namespace __kernel -} // namespace cuda -} // namespace psz - -#endif +/** + * @file lorenzo_proto.inl + * @author Jiannan Tian + * @brief (prototype) Dual-EQ Lorenzo method. + * @version 0.2 + * @date 2021-01-16 + * (create) 2019-09-23; (release) 2020-09-20; (rev1) 2021-01-16; (rev2) 2021-02-20; (rev3) 2021-04-11 + * (rev4) 2021-04-30 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef CUSZ_KERNEL_LORENZO_PROTOTYPE_CUH +#define CUSZ_KERNEL_LORENZO_PROTOTYPE_CUH + +#include +#include + +#include "utils/cuda_err.cuh" +#include "utils/timer.h" + +namespace psz { + +namespace cuda { +namespace __kernel { + +namespace prototype { // easy algorithmic description + +template +__global__ void c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* eq, T* outlier) +{ + __shared__ T buf[BLK]; + + auto id = blockIdx.x * blockDim.x + threadIdx.x; + if (id < len3.x) { + buf[threadIdx.x] = round(data[id] * ebx2_r); // prequant (fp presence) + } + __syncthreads(); // necessary to ensure correctness + + T delta = buf[threadIdx.x] - (threadIdx.x == 0 ? 0 : buf[threadIdx.x - 1]); + + bool quantizable = fabs(delta) < radius; + T candidate = delta + radius; + if (id < len3.x) { // postquant + data[id] = (1 - quantizable) * candidate; // output; reuse data for outlier + eq[id] = quantizable * static_cast(candidate); + } +} + +template +__global__ void c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* eq, T* outlier) +{ + __shared__ T buf[BLK][BLK + 1]; + + auto y = threadIdx.y, x = threadIdx.x; + auto giy = blockIdx.y * blockDim.y + y, gix = blockIdx.x * blockDim.x + x; + + auto id = gix + giy * stride3.y; // low to high dim, inner to outer + if (gix < len3.x and giy < len3.y) { + buf[y][x] = round(data[id] * ebx2_r); // prequant (fp presence) + } + __syncthreads(); // necessary to ensure correctness + + T delta = buf[y][x] - ((x > 0 ? buf[y][x - 1] : 0) + // dist=1 + (y > 0 ? buf[y - 1][x] : 0) - // dist=1 + (x > 0 and y > 0 ? buf[y - 1][x - 1] : 0)); // dist=2 + + bool quantizable = fabs(delta) < radius; + T candidate = delta + radius; + if (gix < len3.x and giy < len3.y) { + data[id] = (1 - quantizable) * candidate; // output; reuse data for outlier + eq[id] = quantizable * static_cast(candidate); + } +} + +template +__global__ void c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* eq, T* outlier) +{ + __shared__ T buf[BLK][BLK][BLK + 1]; + + auto z = threadIdx.z, y = threadIdx.y, x = threadIdx.x; + auto giz = blockIdx.z * blockDim.z + z, giy = blockIdx.y * blockDim.y + y, gix = blockIdx.x * blockDim.x + x; + + auto id = gix + giy * stride3.y + giz * stride3.z; // low to high in dim, inner to outer + if (gix < len3.x and giy < len3.y and giz < len3.z) { + buf[z][y][x] = round(data[id] * ebx2_r); // prequant (fp presence) + } + __syncthreads(); // necessary to ensure correctness + + T delta = buf[z][y][x] - ((z > 0 and y > 0 and x > 0 ? buf[z - 1][y - 1][x - 1] : 0) // dist=3 + - (y > 0 and x > 0 ? buf[z][y - 1][x - 1] : 0) // dist=2 + - (z > 0 and x > 0 ? buf[z - 1][y][x - 1] : 0) // + - (z > 0 and y > 0 ? buf[z - 1][y - 1][x] : 0) // + + (x > 0 ? buf[z][y][x - 1] : 0) // dist=1 + + (y > 0 ? buf[z][y - 1][x] : 0) // + + (z > 0 ? 
buf[z - 1][y][x] : 0)); // + + bool quantizable = fabs(delta) < radius; + T candidate = delta + radius; + if (gix < len3.x and giy < len3.y and giz < len3.z) { + data[id] = (1 - quantizable) * candidate; // output; reuse data for outlier + eq[id] = quantizable * static_cast(candidate); + } +} + +template +__global__ void x_lorenzo_1d1l(EQ* eq, T* scattered_outlier, dim3 len3, dim3 stride3, int radius, FP ebx2, T* xdata) +{ + __shared__ T buf[BLK]; + + auto id = blockIdx.x * blockDim.x + threadIdx.x; + + if (id < len3.x) + buf[threadIdx.x] = scattered_outlier[id] + static_cast(eq[id]) - radius; // fuse + else + buf[threadIdx.x] = 0; + __syncthreads(); + + for (auto d = 1; d < BLK; d *= 2) { + T n = 0; + if (threadIdx.x >= d) n = buf[threadIdx.x - d]; // like __shfl_up_sync(0x1f, var, d); warp_sync + __syncthreads(); + if (threadIdx.x >= d) buf[threadIdx.x] += n; + __syncthreads(); + } + + if (id < len3.x) { xdata[id] = buf[threadIdx.x] * ebx2; } +} + +template +__global__ void x_lorenzo_2d1l(EQ* eq, T* scattered_outlier, dim3 len3, dim3 stride3, int radius, FP ebx2, T* xdata) +{ + __shared__ T buf[BLK][BLK + 1]; + + auto giy = blockIdx.y * blockDim.y + threadIdx.y, gix = blockIdx.x * blockDim.x + threadIdx.x; + size_t id = gix + giy * stride3.y; + + if (gix < len3.x and giy < len3.y) + buf[threadIdx.y][threadIdx.x] = scattered_outlier[id] + static_cast(eq[id]) - radius; // fuse + else + buf[threadIdx.y][threadIdx.x] = 0; + __syncthreads(); + + for (auto d = 1; d < BLK; d *= 2) { + T n = 0; + if (threadIdx.x >= d) n = buf[threadIdx.y][threadIdx.x - d]; + __syncthreads(); + if (threadIdx.x >= d) buf[threadIdx.y][threadIdx.x] += n; + __syncthreads(); + } + + for (auto d = 1; d < BLK; d *= 2) { + T n = 0; + if (threadIdx.y >= d) n = buf[threadIdx.y - d][threadIdx.x]; + __syncthreads(); + if (threadIdx.y >= d) buf[threadIdx.y][threadIdx.x] += n; + __syncthreads(); + } + + if (gix < len3.x and giy < len3.y) { xdata[id] = buf[threadIdx.y][threadIdx.x] * ebx2; } +} + +template +__global__ void x_lorenzo_3d1l(EQ* eq, T* scattered_outlier, dim3 len3, dim3 stride3, int radius, FP ebx2, T* xdata) +{ + __shared__ T buf[BLK][BLK][BLK + 1]; + + auto giz = blockIdx.z * BLK + threadIdx.z, giy = blockIdx.y * BLK + threadIdx.y, + gix = blockIdx.x * BLK + threadIdx.x; + size_t id = gix + giy * stride3.y + giz * stride3.z; // low to high in dim, inner to outer + + if (gix < len3.x and giy < len3.y and giz < len3.z) + buf[threadIdx.z][threadIdx.y][threadIdx.x] = scattered_outlier[id] + static_cast(eq[id]) - radius; // id + else + buf[threadIdx.z][threadIdx.y][threadIdx.x] = 0; + __syncthreads(); + + for (auto dist = 1; dist < BLK; dist *= 2) { + T addend = 0; + if (threadIdx.x >= dist) addend = buf[threadIdx.z][threadIdx.y][threadIdx.x - dist]; + __syncthreads(); + if (threadIdx.x >= dist) buf[threadIdx.z][threadIdx.y][threadIdx.x] += addend; + __syncthreads(); + } + + for (auto dist = 1; dist < BLK; dist *= 2) { + T addend = 0; + if (threadIdx.y >= dist) addend = buf[threadIdx.z][threadIdx.y - dist][threadIdx.x]; + __syncthreads(); + if (threadIdx.y >= dist) buf[threadIdx.z][threadIdx.y][threadIdx.x] += addend; + __syncthreads(); + } + + for (auto dist = 1; dist < BLK; dist *= 2) { + T addend = 0; + if (threadIdx.z >= dist) addend = buf[threadIdx.z - dist][threadIdx.y][threadIdx.x]; + __syncthreads(); + if (threadIdx.z >= dist) buf[threadIdx.z][threadIdx.y][threadIdx.x] += addend; + __syncthreads(); + } + + if (gix < len3.x and giy < len3.y and giz < len3.z) { + xdata[id] = 
buf[threadIdx.z][threadIdx.y][threadIdx.x] * ebx2; + } +} + +} // namespace prototype +} // namespace __kernel +} // namespace cuda +} // namespace psz + +#endif diff --git a/qtensor/compression/cusz/src/kernel/detail/lorenzo_serial.inl b/qtensor/compression/cusz/src/kernel/detail/lorenzo_serial.inl index b00ec690..e82013d5 100644 --- a/qtensor/compression/cusz/src/kernel/detail/lorenzo_serial.inl +++ b/qtensor/compression/cusz/src/kernel/detail/lorenzo_serial.inl @@ -1,326 +1,326 @@ -/** - * @file lorenzo_serial.inl - * @author Jiannan Tian - * @brief - * @version 0.4 - * @date 2023-03-13 - * - * (C) 2023 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef E0B87BA8_BEDC_4CBE_B5EE_C0C5875E07D6 -#define E0B87BA8_BEDC_4CBE_B5EE_C0C5875E07D6 - -#include -#include "cusz/it.hh" -#include "cusz/nd.h" - -using std::cout; -using std::endl; - -#define SETUP_1D_BASIC \ - psz_dim3 grid_dim, block_idx, thread_idx; \ - auto gx = [&]() -> uint32_t { return block_idx.x * BLK + thread_idx.x; }; \ - auto gidx = [&]() -> uint32_t { return block_idx.x * BLK + thread_idx.x; }; \ - auto check_boundary = [&]() { return gx() < len3.x; }; \ - grid_dim.x = (len3.x - 1) / BLK + 1; -#define SETUP_1D_DATABUF \ - constexpr auto PADDING = 1; \ - auto _buf1 = new psz_buf(); \ - auto& buf1 = *_buf1; \ - auto databuf_it = [&](auto x) -> T& { return buf1(thread_idx.x + x + PADDING); }; -#define SETUP_1D_EQBUF \ - auto _buf2 = new psz_buf(); \ - auto& buf2 = *_buf2; \ - auto eqbuf_it = [&](auto dx) -> EQ& { return buf2(thread_idx.x + dx); }; -#define PFOR_GRID_1D() for (block_idx.x = 0; block_idx.x < grid_dim.x; block_idx.x++) -#define PFOR_BLOCK_1D() for (thread_idx.x = 0; thread_idx.x < BLK; thread_idx.x++) - -#define SETUP_2D_BASIC \ - psz_dim3 grid_dim, block_idx, thread_idx; \ - auto gx = [&]() -> uint32_t { return block_idx.x * BLK + thread_idx.x; }; \ - auto gy = [&]() -> uint32_t { return block_idx.y * BLK + thread_idx.y; }; \ - auto gidx = [&]() -> uint32_t { return gy() * stride3.y + gx(); }; \ - auto check_boundary = [&]() { return gx() < len3.x and gy() < len3.y; }; \ - grid_dim.x = (len3.x - 1) / BLK + 1; \ - grid_dim.y = (len3.y - 1) / BLK + 1; -#define SETUP_2D_DATABUF \ - constexpr auto PADDING = 1; \ - auto _buf1 = new psz_buf(); \ - auto& buf1 = *_buf1; \ - auto databuf_it = [&](auto dx, auto dy) -> T& { \ - return buf1(thread_idx.x + dx + PADDING, thread_idx.y + dy + PADDING); \ - }; -#define SETUP_2D_EQBUF \ - auto _buf2 = new psz_buf(); \ - auto& buf2 = *_buf2; \ - auto eqbuf_it = [&](auto dx, auto dy) -> EQ& { return buf2(thread_idx.x + dx, thread_idx.y + dy); }; -#define PFOR_GRID_2D() \ - for (block_idx.y = 0; block_idx.y < grid_dim.y; block_idx.y++) \ - for (block_idx.x = 0; block_idx.x < grid_dim.x; block_idx.x++) -#define PFOR_BLOCK_2D() \ - for (thread_idx.y = 0; thread_idx.y < BLK; thread_idx.y++) \ - for (thread_idx.x = 0; thread_idx.x < BLK; thread_idx.x++) - -#define SETUP_3D_BASIC \ - psz_dim3 grid_dim, block_idx, thread_idx; \ - auto gx = [&]() -> uint32_t { return block_idx.x * BLK + thread_idx.x; }; \ - auto gy = [&]() -> uint32_t { return block_idx.y * BLK + thread_idx.y; }; \ - auto gz = [&]() -> uint32_t { return block_idx.z * BLK + thread_idx.z; }; \ - auto gidx = [&]() -> uint32_t { return gz() * stride3.z + gy() * stride3.y + gx(); }; \ - auto check_boundary = [&]() { return gx() < len3.x and gy() < len3.y and gz() < len3.z; }; \ - grid_dim.x = (len3.x - 1) / BLK + 1; \ - grid_dim.y = (len3.y - 1) / BLK + 1; \ - grid_dim.z = (len3.z - 1) / BLK + 1; -#define 
SETUP_3D_DATABUF \ - constexpr auto PADDING = 1; \ - auto _buf1 = new psz_buf(); \ - auto& buf1 = *_buf1; \ - auto databuf_it = [&](auto dx, auto dy, auto dz) -> T& { \ - return buf1(thread_idx.x + dx + PADDING, thread_idx.y + dy + PADDING, thread_idx.z + dz + PADDING); \ - }; -#define SETUP_3D_EQBUF \ - auto _buf2 = new psz_buf(); \ - auto& buf2 = *_buf2; \ - auto eqbuf_it = [&](auto dx, auto dy, auto dz) -> EQ& { \ - return buf2(thread_idx.x + dx, thread_idx.y + dy, thread_idx.z + dz); \ - }; -#define PFOR_GRID_3D() \ - for (block_idx.z = 0; block_idx.z < grid_dim.z; block_idx.z++) \ - for (block_idx.y = 0; block_idx.y < grid_dim.y; block_idx.y++) \ - for (block_idx.x = 0; block_idx.x < grid_dim.x; block_idx.x++) -#define PFOR_BLOCK_3D() \ - for (thread_idx.z = 0; thread_idx.z < BLK; thread_idx.z++) \ - for (thread_idx.y = 0; thread_idx.y < BLK; thread_idx.y++) \ - for (thread_idx.x = 0; thread_idx.x < BLK; thread_idx.x++) - -namespace psz { -namespace serial { -namespace __kernel { - -template < - typename T, - typename EQ = int32_t, - typename FP = T, - int BLK = 256, - typename OUTLIER = struct psz_outlier_serial> -void c_lorenzo_1d1l(T* data, psz_dim3 len3, psz_dim3 stride3, int radius, FP ebx2_r, EQ* eq, OUTLIER* outlier) { - SETUP_1D_BASIC; - SETUP_1D_DATABUF; - SETUP_1D_EQBUF; - - // per-thread ("real" kernel) - auto threadview_load = [&]() { - if (check_boundary()) databuf_it(0) = data[gidx()] * ebx2_r; - }; - auto threadview_process = [&]() { - auto delta = databuf_it(0) - databuf_it(-1); - if (delta > radius) { - outlier->record(delta, gidx()); - eqbuf_it(0) = 0; - } - else { - eqbuf_it(0) = delta; - } - }; - auto threadview_store = [&]() { - if (check_boundary()) eq[gidx()] = eqbuf_it(0); - }; - - //////////////////////////////////////// - PFOR_GRID_1D() { PFOR_BLOCK_1D() threadview_load(); } - PFOR_GRID_1D() { PFOR_BLOCK_1D() threadview_process(); } - PFOR_GRID_1D() { PFOR_BLOCK_1D() threadview_store(); } - - delete _buf1; - delete _buf2; - -} - -template -void x_lorenzo_1d1l(EQ* eq, T* scattered_outlier, psz_dim3 len3, psz_dim3 stride3, int radius, FP ebx2, T* xdata) -{ - SETUP_1D_BASIC; - SETUP_1D_DATABUF; - - // per-thread ("real" kernel) - auto threadview_load = [&]() { - if (check_boundary()) databuf_it(0) = eq[gidx()] + scattered_outlier[gidx()]; - }; - auto threadview_partial_sum = [&]() { - if (thread_idx.x > 0) databuf_it(0) += databuf_it(-1); - }; - auto threadview_store = [&]() { - if (check_boundary()) xdata[gidx()] = databuf_it(0) * ebx2; - }; - - //////////////////////////////////////// - PFOR_GRID_1D() { PFOR_BLOCK_1D() threadview_load(); } - PFOR_GRID_1D() { PFOR_BLOCK_1D() threadview_partial_sum(); } - PFOR_GRID_1D() { PFOR_BLOCK_1D() threadview_store(); } - - delete _buf1; -} - -template < - typename T, - typename EQ = int32_t, - typename FP = T, - int BLK = 16, - typename OUTLIER = struct psz_outlier_serial> -void c_lorenzo_2d1l(T* data, psz_dim3 len3, psz_dim3 stride3, int radius, FP ebx2_r, EQ* eq, OUTLIER* outlier) { - SETUP_2D_BASIC; - SETUP_2D_DATABUF; - SETUP_2D_EQBUF; - - // per-thread ("real" kernel) - auto threadview_load = [&]() { - if (check_boundary()) databuf_it(0, 0) = data[gidx()] * ebx2_r; - }; - auto threadview_process = [&]() { - auto delta = databuf_it(0, 0) - (databuf_it(-1, 0) + databuf_it(0, -1) - databuf_it(-1, -1)); - if (delta > radius) { - outlier->record(delta, gidx()); - eqbuf_it(0, 0) = 0; - } - else { - eqbuf_it(0, 0) = delta; - } - }; - auto threadview_store = [&]() { - if (check_boundary()) eq[gidx()] = eqbuf_it(0, 0); - }; - 
- //////////////////////////////////////// - PFOR_GRID_2D() { PFOR_BLOCK_2D() threadview_load(); } - PFOR_GRID_2D() { PFOR_BLOCK_2D() threadview_process(); } - PFOR_GRID_2D() { PFOR_BLOCK_2D() threadview_store(); } - - delete _buf1; - delete _buf2; -} - -template -void x_lorenzo_2d1l(EQ* eq, T* scattered_outlier, psz_dim3 len3, psz_dim3 stride3, int radius, FP ebx2, T* xdata) -{ - SETUP_2D_BASIC; - SETUP_2D_DATABUF; - - // per-thread ("real" kernel) - auto threadview_load = [&]() { - if (check_boundary()) databuf_it(0, 0) = eq[gidx()] + scattered_outlier[gidx()]; - }; - auto threadview_partial_sum_x = [&]() { - if (thread_idx.x > 0) databuf_it(0, 0) += databuf_it(-1, 0); - }; - auto threadview_partial_sum_y = [&]() { - if (thread_idx.y > 0) databuf_it(0, 0) += databuf_it(0, -1); - }; - auto threadview_store = [&]() { - if (check_boundary()) xdata[gidx()] = databuf_it(0, 0) * ebx2; - }; - - //////////////////////////////////////// - PFOR_GRID_2D() { PFOR_BLOCK_2D() threadview_load(); } - PFOR_GRID_2D() - { - PFOR_BLOCK_2D() threadview_partial_sum_x(); - PFOR_BLOCK_2D() threadview_partial_sum_y(); - } - PFOR_GRID_2D() { PFOR_BLOCK_2D() threadview_store(); } - - delete _buf1; -} - -template < - typename T, - typename EQ = int32_t, - typename FP = T, - int BLK = 8, - typename OUTLIER = struct psz_outlier_serial> -void c_lorenzo_3d1l(T* data, psz_dim3 len3, psz_dim3 stride3, int radius, FP ebx2_r, EQ* eq, OUTLIER* outlier) { - SETUP_3D_BASIC; - SETUP_3D_DATABUF; - SETUP_3D_EQBUF; - - // per-thread ("real" kernel) - auto threadview_load = [&]() { - if (check_boundary()) databuf_it(0, 0, 0) = data[gidx()] * ebx2_r; - }; - auto threadview_process = [&]() { - auto delta = databuf_it(0, 0, 0) - - (databuf_it(-1, -1, -1) - databuf_it(0, -1, -1) - databuf_it(-1, 0, -1) - databuf_it(-1, -1, 0) + - databuf_it(0, 0, -1) + databuf_it(0, -1, 0) + databuf_it(-1, 0, 0)); - if (delta > radius) { - outlier->record(delta, gidx()); - eqbuf_it(0, 0, 0) = 0; - } - else { - eqbuf_it(0, 0, 0) = delta; - } - }; - auto threadview_store = [&]() { - if (check_boundary()) eq[gidx()] = eqbuf_it(0, 0, 0); - }; - - //////////////////////////////////////// - PFOR_GRID_3D() { PFOR_BLOCK_3D() threadview_load(); } - PFOR_GRID_3D() { PFOR_BLOCK_3D() threadview_process(); } - PFOR_GRID_3D() { PFOR_BLOCK_3D() threadview_store(); } - - delete _buf1; - delete _buf2; -} - -template -void x_lorenzo_3d1l(EQ* eq, T* scattered_outlier, psz_dim3 len3, psz_dim3 stride3, int radius, FP ebx2, T* xdata) -{ - SETUP_3D_BASIC; - SETUP_3D_DATABUF; - - // per-thread ("real" kernel) - auto threadview_load = [&]() { - if (check_boundary()) databuf_it(0, 0, 0) = eq[gidx()] + scattered_outlier[gidx()]; - }; - auto threadview_partial_sum_x = [&]() { - if (thread_idx.x > 0) databuf_it(0, 0, 0) += databuf_it(-1, 0, 0); - }; - auto threadview_partial_sum_y = [&]() { - if (thread_idx.y > 0) databuf_it(0, 0, 0) += databuf_it(0, -1, 0); - }; - auto threadview_partial_sum_z = [&]() { - if (thread_idx.z > 0) databuf_it(0, 0, 0) += databuf_it(0, 0, -1); - }; - auto threadview_store = [&]() { - if (check_boundary()) xdata[gidx()] = databuf_it(0, 0, 0) * ebx2; - }; - - //////////////////////////////////////// - PFOR_GRID_3D() { PFOR_BLOCK_3D() threadview_load(); } - PFOR_GRID_3D() - { - PFOR_BLOCK_3D() threadview_partial_sum_x(); - PFOR_BLOCK_3D() threadview_partial_sum_y(); - PFOR_BLOCK_3D() threadview_partial_sum_z(); - } - PFOR_GRID_3D() { PFOR_BLOCK_3D() threadview_store(); } - - delete _buf1; -} - -} // namespace __kernel -} // namespace serial -} // 
namespace psz - -#undef SETUP_1D -#undef PFOR_GRID_1D -#undef PFOR_BLOCK_1D -#undef SETUP_2D_BASIC -#undef PFOR_GRID_2D -#undef PFOR_BLOCK_2D -#undef SETUP_3D -#undef PFOR_GRID_3D -#undef PFOR_BLOCK_3D - -#endif /* E0B87BA8_BEDC_4CBE_B5EE_C0C5875E07D6 */ +/** + * @file lorenzo_serial.inl + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-03-13 + * + * (C) 2023 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef E0B87BA8_BEDC_4CBE_B5EE_C0C5875E07D6 +#define E0B87BA8_BEDC_4CBE_B5EE_C0C5875E07D6 + +#include +#include "cusz/it.hh" +#include "cusz/nd.h" + +using std::cout; +using std::endl; + +#define SETUP_1D_BASIC \ + psz_dim3 grid_dim, block_idx, thread_idx; \ + auto gx = [&]() -> uint32_t { return block_idx.x * BLK + thread_idx.x; }; \ + auto gidx = [&]() -> uint32_t { return block_idx.x * BLK + thread_idx.x; }; \ + auto check_boundary = [&]() { return gx() < len3.x; }; \ + grid_dim.x = (len3.x - 1) / BLK + 1; +#define SETUP_1D_DATABUF \ + constexpr auto PADDING = 1; \ + auto _buf1 = new psz_buf(); \ + auto& buf1 = *_buf1; \ + auto databuf_it = [&](auto x) -> T& { return buf1(thread_idx.x + x + PADDING); }; +#define SETUP_1D_EQBUF \ + auto _buf2 = new psz_buf(); \ + auto& buf2 = *_buf2; \ + auto eqbuf_it = [&](auto dx) -> EQ& { return buf2(thread_idx.x + dx); }; +#define PFOR_GRID_1D() for (block_idx.x = 0; block_idx.x < grid_dim.x; block_idx.x++) +#define PFOR_BLOCK_1D() for (thread_idx.x = 0; thread_idx.x < BLK; thread_idx.x++) + +#define SETUP_2D_BASIC \ + psz_dim3 grid_dim, block_idx, thread_idx; \ + auto gx = [&]() -> uint32_t { return block_idx.x * BLK + thread_idx.x; }; \ + auto gy = [&]() -> uint32_t { return block_idx.y * BLK + thread_idx.y; }; \ + auto gidx = [&]() -> uint32_t { return gy() * stride3.y + gx(); }; \ + auto check_boundary = [&]() { return gx() < len3.x and gy() < len3.y; }; \ + grid_dim.x = (len3.x - 1) / BLK + 1; \ + grid_dim.y = (len3.y - 1) / BLK + 1; +#define SETUP_2D_DATABUF \ + constexpr auto PADDING = 1; \ + auto _buf1 = new psz_buf(); \ + auto& buf1 = *_buf1; \ + auto databuf_it = [&](auto dx, auto dy) -> T& { \ + return buf1(thread_idx.x + dx + PADDING, thread_idx.y + dy + PADDING); \ + }; +#define SETUP_2D_EQBUF \ + auto _buf2 = new psz_buf(); \ + auto& buf2 = *_buf2; \ + auto eqbuf_it = [&](auto dx, auto dy) -> EQ& { return buf2(thread_idx.x + dx, thread_idx.y + dy); }; +#define PFOR_GRID_2D() \ + for (block_idx.y = 0; block_idx.y < grid_dim.y; block_idx.y++) \ + for (block_idx.x = 0; block_idx.x < grid_dim.x; block_idx.x++) +#define PFOR_BLOCK_2D() \ + for (thread_idx.y = 0; thread_idx.y < BLK; thread_idx.y++) \ + for (thread_idx.x = 0; thread_idx.x < BLK; thread_idx.x++) + +#define SETUP_3D_BASIC \ + psz_dim3 grid_dim, block_idx, thread_idx; \ + auto gx = [&]() -> uint32_t { return block_idx.x * BLK + thread_idx.x; }; \ + auto gy = [&]() -> uint32_t { return block_idx.y * BLK + thread_idx.y; }; \ + auto gz = [&]() -> uint32_t { return block_idx.z * BLK + thread_idx.z; }; \ + auto gidx = [&]() -> uint32_t { return gz() * stride3.z + gy() * stride3.y + gx(); }; \ + auto check_boundary = [&]() { return gx() < len3.x and gy() < len3.y and gz() < len3.z; }; \ + grid_dim.x = (len3.x - 1) / BLK + 1; \ + grid_dim.y = (len3.y - 1) / BLK + 1; \ + grid_dim.z = (len3.z - 1) / BLK + 1; +#define SETUP_3D_DATABUF \ + constexpr auto PADDING = 1; \ + auto _buf1 = new psz_buf(); \ + auto& buf1 = *_buf1; \ + auto databuf_it = [&](auto dx, auto dy, auto dz) -> T& { \ + return buf1(thread_idx.x + dx + PADDING, thread_idx.y + dy + PADDING, 
thread_idx.z + dz + PADDING); \ + }; +#define SETUP_3D_EQBUF \ + auto _buf2 = new psz_buf(); \ + auto& buf2 = *_buf2; \ + auto eqbuf_it = [&](auto dx, auto dy, auto dz) -> EQ& { \ + return buf2(thread_idx.x + dx, thread_idx.y + dy, thread_idx.z + dz); \ + }; +#define PFOR_GRID_3D() \ + for (block_idx.z = 0; block_idx.z < grid_dim.z; block_idx.z++) \ + for (block_idx.y = 0; block_idx.y < grid_dim.y; block_idx.y++) \ + for (block_idx.x = 0; block_idx.x < grid_dim.x; block_idx.x++) +#define PFOR_BLOCK_3D() \ + for (thread_idx.z = 0; thread_idx.z < BLK; thread_idx.z++) \ + for (thread_idx.y = 0; thread_idx.y < BLK; thread_idx.y++) \ + for (thread_idx.x = 0; thread_idx.x < BLK; thread_idx.x++) + +namespace psz { +namespace serial { +namespace __kernel { + +template < + typename T, + typename EQ = int32_t, + typename FP = T, + int BLK = 256, + typename OUTLIER = struct psz_outlier_serial> +void c_lorenzo_1d1l(T* data, psz_dim3 len3, psz_dim3 stride3, int radius, FP ebx2_r, EQ* eq, OUTLIER* outlier) { + SETUP_1D_BASIC; + SETUP_1D_DATABUF; + SETUP_1D_EQBUF; + + // per-thread ("real" kernel) + auto threadview_load = [&]() { + if (check_boundary()) databuf_it(0) = data[gidx()] * ebx2_r; + }; + auto threadview_process = [&]() { + auto delta = databuf_it(0) - databuf_it(-1); + if (delta > radius) { + outlier->record(delta, gidx()); + eqbuf_it(0) = 0; + } + else { + eqbuf_it(0) = delta; + } + }; + auto threadview_store = [&]() { + if (check_boundary()) eq[gidx()] = eqbuf_it(0); + }; + + //////////////////////////////////////// + PFOR_GRID_1D() { PFOR_BLOCK_1D() threadview_load(); } + PFOR_GRID_1D() { PFOR_BLOCK_1D() threadview_process(); } + PFOR_GRID_1D() { PFOR_BLOCK_1D() threadview_store(); } + + delete _buf1; + delete _buf2; + +} + +template +void x_lorenzo_1d1l(EQ* eq, T* scattered_outlier, psz_dim3 len3, psz_dim3 stride3, int radius, FP ebx2, T* xdata) +{ + SETUP_1D_BASIC; + SETUP_1D_DATABUF; + + // per-thread ("real" kernel) + auto threadview_load = [&]() { + if (check_boundary()) databuf_it(0) = eq[gidx()] + scattered_outlier[gidx()]; + }; + auto threadview_partial_sum = [&]() { + if (thread_idx.x > 0) databuf_it(0) += databuf_it(-1); + }; + auto threadview_store = [&]() { + if (check_boundary()) xdata[gidx()] = databuf_it(0) * ebx2; + }; + + //////////////////////////////////////// + PFOR_GRID_1D() { PFOR_BLOCK_1D() threadview_load(); } + PFOR_GRID_1D() { PFOR_BLOCK_1D() threadview_partial_sum(); } + PFOR_GRID_1D() { PFOR_BLOCK_1D() threadview_store(); } + + delete _buf1; +} + +template < + typename T, + typename EQ = int32_t, + typename FP = T, + int BLK = 16, + typename OUTLIER = struct psz_outlier_serial> +void c_lorenzo_2d1l(T* data, psz_dim3 len3, psz_dim3 stride3, int radius, FP ebx2_r, EQ* eq, OUTLIER* outlier) { + SETUP_2D_BASIC; + SETUP_2D_DATABUF; + SETUP_2D_EQBUF; + + // per-thread ("real" kernel) + auto threadview_load = [&]() { + if (check_boundary()) databuf_it(0, 0) = data[gidx()] * ebx2_r; + }; + auto threadview_process = [&]() { + auto delta = databuf_it(0, 0) - (databuf_it(-1, 0) + databuf_it(0, -1) - databuf_it(-1, -1)); + if (delta > radius) { + outlier->record(delta, gidx()); + eqbuf_it(0, 0) = 0; + } + else { + eqbuf_it(0, 0) = delta; + } + }; + auto threadview_store = [&]() { + if (check_boundary()) eq[gidx()] = eqbuf_it(0, 0); + }; + + //////////////////////////////////////// + PFOR_GRID_2D() { PFOR_BLOCK_2D() threadview_load(); } + PFOR_GRID_2D() { PFOR_BLOCK_2D() threadview_process(); } + PFOR_GRID_2D() { PFOR_BLOCK_2D() threadview_store(); } + + delete _buf1; + 
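+ // _buf1/_buf2 were heap-allocated by SETUP_2D_DATABUF / SETUP_2D_EQBUF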
delete _buf2; +} + +template +void x_lorenzo_2d1l(EQ* eq, T* scattered_outlier, psz_dim3 len3, psz_dim3 stride3, int radius, FP ebx2, T* xdata) +{ + SETUP_2D_BASIC; + SETUP_2D_DATABUF; + + // per-thread ("real" kernel) + auto threadview_load = [&]() { + if (check_boundary()) databuf_it(0, 0) = eq[gidx()] + scattered_outlier[gidx()]; + }; + auto threadview_partial_sum_x = [&]() { + if (thread_idx.x > 0) databuf_it(0, 0) += databuf_it(-1, 0); + }; + auto threadview_partial_sum_y = [&]() { + if (thread_idx.y > 0) databuf_it(0, 0) += databuf_it(0, -1); + }; + auto threadview_store = [&]() { + if (check_boundary()) xdata[gidx()] = databuf_it(0, 0) * ebx2; + }; + + //////////////////////////////////////// + PFOR_GRID_2D() { PFOR_BLOCK_2D() threadview_load(); } + PFOR_GRID_2D() + { + PFOR_BLOCK_2D() threadview_partial_sum_x(); + PFOR_BLOCK_2D() threadview_partial_sum_y(); + } + PFOR_GRID_2D() { PFOR_BLOCK_2D() threadview_store(); } + + delete _buf1; +} + +template < + typename T, + typename EQ = int32_t, + typename FP = T, + int BLK = 8, + typename OUTLIER = struct psz_outlier_serial> +void c_lorenzo_3d1l(T* data, psz_dim3 len3, psz_dim3 stride3, int radius, FP ebx2_r, EQ* eq, OUTLIER* outlier) { + SETUP_3D_BASIC; + SETUP_3D_DATABUF; + SETUP_3D_EQBUF; + + // per-thread ("real" kernel) + auto threadview_load = [&]() { + if (check_boundary()) databuf_it(0, 0, 0) = data[gidx()] * ebx2_r; + }; + auto threadview_process = [&]() { + auto delta = databuf_it(0, 0, 0) - + (databuf_it(-1, -1, -1) - databuf_it(0, -1, -1) - databuf_it(-1, 0, -1) - databuf_it(-1, -1, 0) + + databuf_it(0, 0, -1) + databuf_it(0, -1, 0) + databuf_it(-1, 0, 0)); + if (delta > radius) { + outlier->record(delta, gidx()); + eqbuf_it(0, 0, 0) = 0; + } + else { + eqbuf_it(0, 0, 0) = delta; + } + }; + auto threadview_store = [&]() { + if (check_boundary()) eq[gidx()] = eqbuf_it(0, 0, 0); + }; + + //////////////////////////////////////// + PFOR_GRID_3D() { PFOR_BLOCK_3D() threadview_load(); } + PFOR_GRID_3D() { PFOR_BLOCK_3D() threadview_process(); } + PFOR_GRID_3D() { PFOR_BLOCK_3D() threadview_store(); } + + delete _buf1; + delete _buf2; +} + +template +void x_lorenzo_3d1l(EQ* eq, T* scattered_outlier, psz_dim3 len3, psz_dim3 stride3, int radius, FP ebx2, T* xdata) +{ + SETUP_3D_BASIC; + SETUP_3D_DATABUF; + + // per-thread ("real" kernel) + auto threadview_load = [&]() { + if (check_boundary()) databuf_it(0, 0, 0) = eq[gidx()] + scattered_outlier[gidx()]; + }; + auto threadview_partial_sum_x = [&]() { + if (thread_idx.x > 0) databuf_it(0, 0, 0) += databuf_it(-1, 0, 0); + }; + auto threadview_partial_sum_y = [&]() { + if (thread_idx.y > 0) databuf_it(0, 0, 0) += databuf_it(0, -1, 0); + }; + auto threadview_partial_sum_z = [&]() { + if (thread_idx.z > 0) databuf_it(0, 0, 0) += databuf_it(0, 0, -1); + }; + auto threadview_store = [&]() { + if (check_boundary()) xdata[gidx()] = databuf_it(0, 0, 0) * ebx2; + }; + + //////////////////////////////////////// + PFOR_GRID_3D() { PFOR_BLOCK_3D() threadview_load(); } + PFOR_GRID_3D() + { + PFOR_BLOCK_3D() threadview_partial_sum_x(); + PFOR_BLOCK_3D() threadview_partial_sum_y(); + PFOR_BLOCK_3D() threadview_partial_sum_z(); + } + PFOR_GRID_3D() { PFOR_BLOCK_3D() threadview_store(); } + + delete _buf1; +} + +} // namespace __kernel +} // namespace serial +} // namespace psz + +#undef SETUP_1D +#undef PFOR_GRID_1D +#undef PFOR_BLOCK_1D +#undef SETUP_2D_BASIC +#undef PFOR_GRID_2D +#undef PFOR_BLOCK_2D +#undef SETUP_3D +#undef PFOR_GRID_3D +#undef PFOR_BLOCK_3D + +#endif /* 
E0B87BA8_BEDC_4CBE_B5EE_C0C5875E07D6 */ diff --git a/qtensor/compression/cusz/src/kernel/detail/lorenzo_var.inl b/qtensor/compression/cusz/src/kernel/detail/lorenzo_var.inl index b5563275..2f58d1ad 100644 --- a/qtensor/compression/cusz/src/kernel/detail/lorenzo_var.inl +++ b/qtensor/compression/cusz/src/kernel/detail/lorenzo_var.inl @@ -1,530 +1,530 @@ -/** - * @file lorenzo_var.inl - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-09-29 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef E2BEA52A_4D2E_4966_9135_6CE8B8E05762 -#define E2BEA52A_4D2E_4966_9135_6CE8B8E05762 - -#include - -#if __has_include() -// #pragma message __FILE__ ": (CUDA 11 onward), cub from system path" -#include -#else -// #pragma message __FILE__ ": (CUDA 10 or earlier), cub from git submodule" -#include "../../third_party/cub/cub/cub.cuh" -#endif - -#if __cplusplus >= 201703L -#define CONSTEXPR constexpr -#else -#define CONSTEXPR -#endif - -#define TIX threadIdx.x -#define TIY threadIdx.y -#define TIZ threadIdx.z -#define BIX blockIdx.x -#define BIY blockIdx.y -#define BIZ blockIdx.z -#define BDX blockDim.x -#define BDY blockDim.y -#define BDZ blockDim.z - -#include "utils/cuda_err.cuh" -#include "utils/timer.h" - -namespace cusz { -namespace experimental { - -template -__forceinline__ __device__ void -pred1d(Data thread_scope[SEQ], volatile bool* shmem_signum, volatile ErrCtrl* shmem_delta, Data from_last_stripe = 0) -{ - if CONSTEXPR (FIRST_POINT) { // i == 0 - Data delta = thread_scope[0] - from_last_stripe; - shmem_signum[0 + TIX * SEQ] = delta < 0; // signnum - shmem_delta[0 + TIX * SEQ] = static_cast(fabs(delta)); - } - else { -#pragma unroll - for (auto i = 1; i < SEQ; i++) { - Data delta = thread_scope[i] - thread_scope[i - 1]; - shmem_signum[i + TIX * SEQ] = delta < 0; // signum - shmem_delta[i + TIX * SEQ] = static_cast(fabs(delta)); - } - __syncthreads(); - } -} - -template -__forceinline__ __device__ void load1d( - Data* data, - unsigned int dimx, - unsigned int id_base, - volatile Data* shmem_data, - Data thread_scope[SEQ], - Data& from_last_stripe, - FP ebx2_r) -{ -#pragma unroll - for (auto i = 0; i < SEQ; i++) { - auto id = id_base + TIX + i * NTHREAD; - if (id < dimx) { shmem_data[TIX + i * NTHREAD] = round(data[id] * ebx2_r); } - } - __syncthreads(); - - for (auto i = 0; i < SEQ; i++) thread_scope[i] = shmem_data[TIX * SEQ + i]; - - if (TIX > 0) from_last_stripe = shmem_data[TIX * SEQ - 1]; - __syncthreads(); -} - -template -__forceinline__ __device__ void write1d( - volatile bool* shmem_signum, - bool* signum, - unsigned int dimx, - unsigned int id_base, - volatile ErrCtrl* shmem_delta = nullptr, - ErrCtrl* delta = nullptr) -{ -#pragma unroll - for (auto i = 0; i < SEQ; i++) { - auto id = id_base + TIX + i * NTHREAD; - if (id < dimx) { - signum[id] = shmem_signum[TIX + i * NTHREAD]; - delta[id] = shmem_delta[TIX + i * NTHREAD]; - } - } -} - -template -__forceinline__ __device__ void load2d_prequant( - Data* data, - Data center[YSEQ + 1], - unsigned int dimx, - unsigned int dimy, - unsigned int stridey, - unsigned int gix, - unsigned int giy_base, - FP ebx2_r) -{ - auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; - -#pragma unroll - for (auto i = 0; i < YSEQ; i++) { - if (gix < dimx and giy_base + i < dimy) center[i + 1] = round(data[get_gid(i)] * ebx2_r); - } - auto tmp = __shfl_up_sync(0xffffffff, center[YSEQ], 16); // same-warp, next-16 - if (TIY == 1) center[0] = tmp; -} - -template -__forceinline__ __device__ void 
pred2d(Data center[YSEQ + 1]) -{ - /* prediction - original form: Data delta = center[i] - center[i - 1] + west[i] - west[i - 1]; - short form: Data delta = center[i] - west[i]; - */ -#pragma unroll - for (auto i = YSEQ; i > 0; i--) { - center[i] -= center[i - 1]; - auto west = __shfl_up_sync(0xffffffff, center[i], 1, 16); - if (TIX > 0) center[i] -= west; - } - __syncthreads(); -} - -template -__forceinline__ __device__ void postquant_write2d( - Data center[YSEQ + 1], - ErrCtrl* delta, - bool* signum, - unsigned int dimx, - unsigned int dimy, - unsigned int stridey, - unsigned int gix, - unsigned int giy_base) -{ - /******************************************************************************** - * Depending on whether postquant is delayed in compression, deside separating - * data-type signum and uint-type quantcode when writing to DRAM (or not). - ********************************************************************************/ - auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; - -#pragma unroll - for (auto i = 1; i < YSEQ + 1; i++) { - auto gid = get_gid(i - 1); - - if (gix < dimx and giy_base + i - 1 < dimy) { - signum[gid] = center[i] < 0; // output; reuse data for signum - delta[gid] = static_cast(fabs(center[i])); - } - } -} - -template < - typename Data, - typename ErrCtrl, - typename FP, - int BLOCK, - int SEQ> -__global__ void c_lorenzo_1d1l( // - Data* data, - ErrCtrl* delta, - bool* signum, - dim3 len3, - dim3 stride3, - FP ebx2_r) -{ - constexpr auto NTHREAD = BLOCK / SEQ; - - __shared__ struct { - Data data[BLOCK]; - ErrCtrl delta[BLOCK]; - bool signum[BLOCK]; - } shmem; - - auto id_base = BIX * BLOCK; - - Data thread_scope[SEQ]; - Data from_last_stripe{0}; - - /******************************************************************************** - * load from DRAM using striped layout, perform prequant - ********************************************************************************/ - load1d(data, len3.x, id_base, shmem.data, thread_scope, from_last_stripe, ebx2_r); - - /******************************************************************************** - * delta and signum - ********************************************************************************/ - pred1d(thread_scope, shmem.signum, shmem.delta, from_last_stripe); - pred1d(thread_scope, shmem.signum, shmem.delta); - write1d(shmem.signum, signum, len3.x, id_base, shmem.delta, delta); -} - -template -__global__ void c_lorenzo_2d1l_16x16data_mapto16x2( - Data* data, // input - ErrCtrl* delta, // output - bool* signum, // output - dim3 len3, - dim3 stride3, - FP ebx2_r) -{ - constexpr auto BLOCK = 16; - constexpr auto YSEQ = 8; - - Data center[YSEQ + 1] = {0}; // nw n - // w center - - auto gix = BIX * BDX + TIX; // BDX == 16 - auto giy_base = BIY * BLOCK + TIY * YSEQ; // BDY * YSEQ = BLOCK == 16 - // clang-format off - load2d_prequant(data, center, len3.x, len3.y, stride3.y, gix, giy_base, ebx2_r); - pred2d(center); - postquant_write2d(center, delta, signum, len3.x, len3.y, stride3.y, gix, giy_base); - // clang-format on -} - -template -__global__ void c_lorenzo_3d1l_32x8x8data_mapto32x1x8( - Data* data, // input - ErrCtrl* delta, // output - bool* signum, // output - dim3 len3, - dim3 stride3, - FP ebx2_r) -{ - constexpr auto BLOCK = 8; - __shared__ Data shmem[8][8][32]; - - auto z = TIZ; - - auto gix = BIX * (BLOCK * 4) + TIX; - auto giy_base = BIY * BLOCK; - auto giz = BIZ * BLOCK + z; - auto base_id = gix + giy_base * stride3.y + giz * stride3.z; - - 
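/*
 * Editorial sketch (not part of the original kernel): the block below first
 * prequantizes each value, shmem[z][y][x] = round(data * ebx2_r), where
 * ebx2_r is presumably 1/(2*eb), and then applies a 3D Lorenzo prediction.
 * With a hypothetical accessor q(x, y, z) that returns the prequantized
 * value, or 0 when any index is out of range, the prediction is:
 *
 *   auto lorenzo3d_pred = [&](auto q, int x, int y, int z) {
 *       return q(x - 1, y, z) + q(x, y - 1, z) + q(x, y, z - 1)              // dist=1
 *            - q(x - 1, y - 1, z) - q(x - 1, y, z - 1) - q(x, y - 1, z - 1)  // dist=2
 *            + q(x - 1, y - 1, z - 1);                                       // dist=3
 *   };
 *   // delta  = q(x, y, z) - lorenzo3d_pred(q, x, y, z);
 *   // signum = delta < 0;  magnitude = fabs(delta);    // sign/magnitude split
 *
 * This mirrors the inclusion-exclusion stencil written out explicitly below;
 * the names q and lorenzo3d_pred are illustrative, not the kernel's own.
 */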
/******************************************************************************** - * load from DRAM, perform prequant - ********************************************************************************/ - if (gix < len3.x and giz < len3.z) { - for (auto y = 0; y < BLOCK; y++) { - if (giy_base + y < len3.y) { - shmem[z][y][TIX] = round(data[base_id + y * stride3.y] * ebx2_r); // prequant (fp presence) - } - } - } - __syncthreads(); // necessary to ensure correctness - - auto x = TIX % 8; - - for (auto y = 0; y < BLOCK; y++) { - Data delta_val; - - // prediction - delta_val = shmem[z][y][TIX] - ((z > 0 and y > 0 and x > 0 ? shmem[z - 1][y - 1][TIX - 1] : 0) // dist=3 - - (y > 0 and x > 0 ? shmem[z][y - 1][TIX - 1] : 0) // dist=2 - - (z > 0 and x > 0 ? shmem[z - 1][y][TIX - 1] : 0) // - - (z > 0 and y > 0 ? shmem[z - 1][y - 1][TIX] : 0) // - + (x > 0 ? shmem[z][y][TIX - 1] : 0) // dist=1 - + (y > 0 ? shmem[z][y - 1][TIX] : 0) // - + (z > 0 ? shmem[z - 1][y][TIX] : 0)); // - - auto id = base_id + (y * stride3.y); - - // delta and signum - if (gix < len3.x and (giy_base + y) < len3.y and giz < len3.z) { - signum[id] = delta_val < 0; - delta[id] = static_cast(fabs(delta_val)); - } - } - /* EOF */ -} - -template -__global__ void x_lorenzo_1d1l( // - bool* signum, - ErrCtrl* delta, - Data* xdata, - dim3 len3, - dim3 stride3, - FP ebx2) -{ - constexpr auto block_dim = BLOCK / SEQ; // dividable - - // coalesce-load (warp-striped) and transpose in shmem (similar for store) - typedef cub::BlockLoad BlockLoadT_signum; - typedef cub::BlockLoad BlockLoadT_delta; - typedef cub::BlockStore BlockStoreT_xdata; - typedef cub::BlockScan - BlockScanT_xdata; // TODO autoselect algorithm - - __shared__ union TempStorage { // overlap shared memory space - typename BlockLoadT_signum::TempStorage load_signum; - typename BlockLoadT_delta::TempStorage load_delta; - typename BlockStoreT_xdata::TempStorage store_xdata; - typename BlockScanT_xdata::TempStorage scan_xdata; - } temp_storage; - - // thread-scope tiled data - struct ThreadData { - Data xdata[SEQ]; - bool signum[SEQ]; - } thread_scope; - ErrCtrl thread_scope_delta[SEQ]; - - /******************************************************************************** - * load to thread-private array (fuse at the same time) - * (BIX * BDX * SEQ) denotes the start of the data chunk that belongs to this thread block - ********************************************************************************/ - BlockLoadT_delta(temp_storage.load_delta).Load(delta + (BIX * BDX) * SEQ, thread_scope_delta); - __syncthreads(); // barrier for shmem reuse - BlockLoadT_signum(temp_storage.load_signum).Load(signum + (BIX * BDX) * SEQ, thread_scope.signum); - __syncthreads(); // barrier for shmem reuse - -#pragma unroll - for (auto i = 0; i < SEQ; i++) { - auto id = (BIX * BDX + TIX) * SEQ + i; - thread_scope.xdata[i] = id < len3.x // - ? (thread_scope.signum[i] ? 
-1 : 1) * static_cast(thread_scope_delta[i]) - : 0; - } - __syncthreads(); - - /******************************************************************************** - * perform partial-sum using cub::InclusiveSum - ********************************************************************************/ - BlockScanT_xdata(temp_storage.scan_xdata).InclusiveSum(thread_scope.xdata, thread_scope.xdata); - __syncthreads(); // barrier for shmem reuse - - /******************************************************************************** - * scale by ebx2 and write to DRAM - ********************************************************************************/ -#pragma unroll - for (auto i = 0; i < SEQ; i++) thread_scope.xdata[i] *= ebx2; - __syncthreads(); // barrier for shmem reuse - - BlockStoreT_xdata(temp_storage.store_xdata).Store(xdata + (BIX * BDX) * SEQ, thread_scope.xdata); -} - -template -__global__ void -x_lorenzo_2d1l_16x16data_mapto16x2(bool* signum, ErrCtrl* delta, Data* xdata, dim3 len3, dim3 stride3, FP ebx2) -{ - constexpr auto BLOCK = 16; - constexpr auto YSEQ = BLOCK / 2; // sequentiality in y direction - static_assert(BLOCK == 16, "In one case, we need BLOCK for 2D == 16"); - - __shared__ Data intermediate[BLOCK]; // TODO use warp shuffle to eliminate this - Data thread_scope[YSEQ]; - /* - . ------> gix (x) - | t00 t01 t02 t03 ... t0f - | ts00_0 ts00_0 ts00_0 ts00_0 - giy ts00_1 ts00_1 ts00_1 ts00_1 - (y) | | | | - ts00_7 ts00_7 ts00_7 ts00_7 - - | t10 t11 t12 t13 ... t1f - | ts00_0 ts00_0 ts00_0 ts00_0 - giy ts00_1 ts00_1 ts00_1 ts00_1 - (y) | | | | - ts00_7 ts00_7 ts00_7 ts00_7 - */ - - auto gix = BIX * BLOCK + TIX; - auto giy_base = BIY * BLOCK + TIY * YSEQ; // BDY * YSEQ = BLOCK == 16 - auto get_gid = [&](auto i) { return (giy_base + i) * stride3.y + gix; }; - - /******************************************************************************** - * load to thread-private array (fuse at the same time) - ********************************************************************************/ -#pragma unroll - for (auto i = 0; i < YSEQ; i++) { - auto gid = get_gid(i); - // even if we hit the else branch, all threads in a warp hit the y-boundary simultaneously - if (gix < len3.x and giy_base + i < len3.y) - thread_scope[i] = (signum[gid] ? -1 : 1) * static_cast(delta[gid]); // fuse - else - thread_scope[i] = 0; // TODO set as init state? 
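/*
 * Editorial sketch (not part of the original kernel): decompression is an
 * inclusive prefix sum over the reconstructed deltas, first along y
 * (sequentially within each thread, then patched across the two TIY slices
 * through `intermediate`), then along x with a warp-level Hillis-Steele scan
 * over the 16-lane segment.  The x pass below boils down to:
 *
 *   for (int d = 1; d < 16; d *= 2) {
 *       Data n = __shfl_up_sync(0xffffffff, v, d, 16);  // value from d lanes below
 *       if (TIX >= d) v += n;                           // inclusive scan step
 *   }
 *   v *= ebx2;  // ebx2 = 2 * eb restores the original scale
 *
 * where v stands for one thread_scope element; the name v is illustrative.
 */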
- } - - /******************************************************************************** - * partial-sum along y-axis, sequantially - ********************************************************************************/ - for (auto i = 1; i < YSEQ; i++) thread_scope[i] += thread_scope[i - 1]; - // two-pass: store for cross-threadscope update - if (TIY == 0) intermediate[TIX] = thread_scope[YSEQ - 1]; - __syncthreads(); - // two-pass: load and update - if (TIY == 1) { - auto tmp = intermediate[TIX]; -#pragma unroll - for (auto& i : thread_scope) i += tmp; - } - - /******************************************************************************** - * in-warp partial-sum along x-axis - ********************************************************************************/ -#pragma unroll - for (auto& i : thread_scope) { - for (auto d = 1; d < BLOCK; d *= 2) { - Data n = __shfl_up_sync(0xffffffff, i, d, 16); - if (TIX >= d) i += n; - } - i *= ebx2; - } - - /******************************************************************************** - * write to DRAM - ********************************************************************************/ -#pragma unroll - for (auto i = 0; i < YSEQ; i++) { - auto gid = get_gid(i); - if (gix < len3.x and giy_base + i < len3.y) xdata[gid] = thread_scope[i]; - } -} - -template -__global__ void -x_lorenzo_3d1l_32x8x8data_mapto32x1x8(bool* signum, ErrCtrl* delta, Data* xdata, dim3 len3, dim3 stride3, FP ebx2) -{ - constexpr auto BLOCK = 8; - constexpr auto YSEQ = BLOCK; - static_assert(BLOCK == 8, "In one case, we need BLOCK for 3D == 8"); - - __shared__ Data intermediate[BLOCK][4][8]; - Data thread_scope[YSEQ]; - - auto seg_id = TIX / 8; - auto seg_tix = TIX % 8; - - auto gix = BIX * (4 * BLOCK) + TIX, giy_base = BIY * BLOCK, giz = BIZ * BLOCK + TIZ; - auto get_gid = [&](auto y) { return giz * stride3.z + (giy_base + y) * stride3.y + gix; }; - - /******************************************************************************** - * load to thread-private array (fuse at the same time) - ********************************************************************************/ -#pragma unroll - for (auto y = 0; y < YSEQ; y++) { - auto gid = get_gid(y); - if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) - thread_scope[y] = (signum[gid] ? 
-1 : 1) * static_cast(delta[gid]); - else - thread_scope[y] = 0; - } - - /******************************************************************************** - * partial-sum along y-axis, sequantially - ********************************************************************************/ - for (auto y = 1; y < YSEQ; y++) thread_scope[y] += thread_scope[y - 1]; - - /******************************************************************************** - * ND partial-sums along x- and z-axis - * in-warp shuffle used: in order to perform, it's transposed after X-partial sum - ********************************************************************************/ - auto dist = 1; - Data addend; - -#pragma unroll - for (auto i = 0; i < BLOCK; i++) { - Data val = thread_scope[i]; - - for (dist = 1; dist < BLOCK; dist *= 2) { - addend = __shfl_up_sync(0xffffffff, val, dist, 8); - if (seg_tix >= dist) val += addend; - } - - // x-z transpose - intermediate[TIZ][seg_id][seg_tix] = val; - __syncthreads(); - val = intermediate[seg_tix][seg_id][TIZ]; - __syncthreads(); - - for (dist = 1; dist < BLOCK; dist *= 2) { - addend = __shfl_up_sync(0xffffffff, val, dist, 8); - if (seg_tix >= dist) val += addend; - } - - intermediate[TIZ][seg_id][seg_tix] = val; - __syncthreads(); - val = intermediate[seg_tix][seg_id][TIZ]; - __syncthreads(); - - thread_scope[i] = val; - } - - /******************************************************************************** - * write to DRAM - ********************************************************************************/ -#pragma unroll - for (auto y = 0; y < YSEQ; y++) { - if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) { xdata[get_gid(y)] = thread_scope[y] * ebx2; } - } - /* EOF */ -} - -} // namespace experimental -} // namespace cusz - -#undef TIX -#undef TIY -#undef TIZ -#undef BIX -#undef BIY -#undef BIZ -#undef BDX -#undef BDY -#undef BDZ - -#endif /* E2BEA52A_4D2E_4966_9135_6CE8B8E05762 */ +/** + * @file lorenzo_var.inl + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-09-29 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef E2BEA52A_4D2E_4966_9135_6CE8B8E05762 +#define E2BEA52A_4D2E_4966_9135_6CE8B8E05762 + +#include + +#if __has_include() +// #pragma message __FILE__ ": (CUDA 11 onward), cub from system path" +#include +#else +// #pragma message __FILE__ ": (CUDA 10 or earlier), cub from git submodule" +#include "../../third_party/cub/cub/cub.cuh" +#endif + +#if __cplusplus >= 201703L +#define CONSTEXPR constexpr +#else +#define CONSTEXPR +#endif + +#define TIX threadIdx.x +#define TIY threadIdx.y +#define TIZ threadIdx.z +#define BIX blockIdx.x +#define BIY blockIdx.y +#define BIZ blockIdx.z +#define BDX blockDim.x +#define BDY blockDim.y +#define BDZ blockDim.z + +#include "utils/cuda_err.cuh" +#include "utils/timer.h" + +namespace cusz { +namespace experimental { + +template +__forceinline__ __device__ void +pred1d(Data thread_scope[SEQ], volatile bool* shmem_signum, volatile ErrCtrl* shmem_delta, Data from_last_stripe = 0) +{ + if CONSTEXPR (FIRST_POINT) { // i == 0 + Data delta = thread_scope[0] - from_last_stripe; + shmem_signum[0 + TIX * SEQ] = delta < 0; // signnum + shmem_delta[0 + TIX * SEQ] = static_cast(fabs(delta)); + } + else { +#pragma unroll + for (auto i = 1; i < SEQ; i++) { + Data delta = thread_scope[i] - thread_scope[i - 1]; + shmem_signum[i + TIX * SEQ] = delta < 0; // signum + shmem_delta[i + TIX * SEQ] = static_cast(fabs(delta)); + } + __syncthreads(); + } +} + +template +__forceinline__ 
__device__ void load1d( + Data* data, + unsigned int dimx, + unsigned int id_base, + volatile Data* shmem_data, + Data thread_scope[SEQ], + Data& from_last_stripe, + FP ebx2_r) +{ +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto id = id_base + TIX + i * NTHREAD; + if (id < dimx) { shmem_data[TIX + i * NTHREAD] = round(data[id] * ebx2_r); } + } + __syncthreads(); + + for (auto i = 0; i < SEQ; i++) thread_scope[i] = shmem_data[TIX * SEQ + i]; + + if (TIX > 0) from_last_stripe = shmem_data[TIX * SEQ - 1]; + __syncthreads(); +} + +template +__forceinline__ __device__ void write1d( + volatile bool* shmem_signum, + bool* signum, + unsigned int dimx, + unsigned int id_base, + volatile ErrCtrl* shmem_delta = nullptr, + ErrCtrl* delta = nullptr) +{ +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto id = id_base + TIX + i * NTHREAD; + if (id < dimx) { + signum[id] = shmem_signum[TIX + i * NTHREAD]; + delta[id] = shmem_delta[TIX + i * NTHREAD]; + } + } +} + +template +__forceinline__ __device__ void load2d_prequant( + Data* data, + Data center[YSEQ + 1], + unsigned int dimx, + unsigned int dimy, + unsigned int stridey, + unsigned int gix, + unsigned int giy_base, + FP ebx2_r) +{ + auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; + +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + if (gix < dimx and giy_base + i < dimy) center[i + 1] = round(data[get_gid(i)] * ebx2_r); + } + auto tmp = __shfl_up_sync(0xffffffff, center[YSEQ], 16); // same-warp, next-16 + if (TIY == 1) center[0] = tmp; +} + +template +__forceinline__ __device__ void pred2d(Data center[YSEQ + 1]) +{ + /* prediction + original form: Data delta = center[i] - center[i - 1] + west[i] - west[i - 1]; + short form: Data delta = center[i] - west[i]; + */ +#pragma unroll + for (auto i = YSEQ; i > 0; i--) { + center[i] -= center[i - 1]; + auto west = __shfl_up_sync(0xffffffff, center[i], 1, 16); + if (TIX > 0) center[i] -= west; + } + __syncthreads(); +} + +template +__forceinline__ __device__ void postquant_write2d( + Data center[YSEQ + 1], + ErrCtrl* delta, + bool* signum, + unsigned int dimx, + unsigned int dimy, + unsigned int stridey, + unsigned int gix, + unsigned int giy_base) +{ + /******************************************************************************** + * Depending on whether postquant is delayed in compression, deside separating + * data-type signum and uint-type quantcode when writing to DRAM (or not). 
+ ********************************************************************************/ + auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; + +#pragma unroll + for (auto i = 1; i < YSEQ + 1; i++) { + auto gid = get_gid(i - 1); + + if (gix < dimx and giy_base + i - 1 < dimy) { + signum[gid] = center[i] < 0; // output; reuse data for signum + delta[gid] = static_cast(fabs(center[i])); + } + } +} + +template < + typename Data, + typename ErrCtrl, + typename FP, + int BLOCK, + int SEQ> +__global__ void c_lorenzo_1d1l( // + Data* data, + ErrCtrl* delta, + bool* signum, + dim3 len3, + dim3 stride3, + FP ebx2_r) +{ + constexpr auto NTHREAD = BLOCK / SEQ; + + __shared__ struct { + Data data[BLOCK]; + ErrCtrl delta[BLOCK]; + bool signum[BLOCK]; + } shmem; + + auto id_base = BIX * BLOCK; + + Data thread_scope[SEQ]; + Data from_last_stripe{0}; + + /******************************************************************************** + * load from DRAM using striped layout, perform prequant + ********************************************************************************/ + load1d(data, len3.x, id_base, shmem.data, thread_scope, from_last_stripe, ebx2_r); + + /******************************************************************************** + * delta and signum + ********************************************************************************/ + pred1d(thread_scope, shmem.signum, shmem.delta, from_last_stripe); + pred1d(thread_scope, shmem.signum, shmem.delta); + write1d(shmem.signum, signum, len3.x, id_base, shmem.delta, delta); +} + +template +__global__ void c_lorenzo_2d1l_16x16data_mapto16x2( + Data* data, // input + ErrCtrl* delta, // output + bool* signum, // output + dim3 len3, + dim3 stride3, + FP ebx2_r) +{ + constexpr auto BLOCK = 16; + constexpr auto YSEQ = 8; + + Data center[YSEQ + 1] = {0}; // nw n + // w center + + auto gix = BIX * BDX + TIX; // BDX == 16 + auto giy_base = BIY * BLOCK + TIY * YSEQ; // BDY * YSEQ = BLOCK == 16 + // clang-format off + load2d_prequant(data, center, len3.x, len3.y, stride3.y, gix, giy_base, ebx2_r); + pred2d(center); + postquant_write2d(center, delta, signum, len3.x, len3.y, stride3.y, gix, giy_base); + // clang-format on +} + +template +__global__ void c_lorenzo_3d1l_32x8x8data_mapto32x1x8( + Data* data, // input + ErrCtrl* delta, // output + bool* signum, // output + dim3 len3, + dim3 stride3, + FP ebx2_r) +{ + constexpr auto BLOCK = 8; + __shared__ Data shmem[8][8][32]; + + auto z = TIZ; + + auto gix = BIX * (BLOCK * 4) + TIX; + auto giy_base = BIY * BLOCK; + auto giz = BIZ * BLOCK + z; + auto base_id = gix + giy_base * stride3.y + giz * stride3.z; + + /******************************************************************************** + * load from DRAM, perform prequant + ********************************************************************************/ + if (gix < len3.x and giz < len3.z) { + for (auto y = 0; y < BLOCK; y++) { + if (giy_base + y < len3.y) { + shmem[z][y][TIX] = round(data[base_id + y * stride3.y] * ebx2_r); // prequant (fp presence) + } + } + } + __syncthreads(); // necessary to ensure correctness + + auto x = TIX % 8; + + for (auto y = 0; y < BLOCK; y++) { + Data delta_val; + + // prediction + delta_val = shmem[z][y][TIX] - ((z > 0 and y > 0 and x > 0 ? shmem[z - 1][y - 1][TIX - 1] : 0) // dist=3 + - (y > 0 and x > 0 ? shmem[z][y - 1][TIX - 1] : 0) // dist=2 + - (z > 0 and x > 0 ? shmem[z - 1][y][TIX - 1] : 0) // + - (z > 0 and y > 0 ? shmem[z - 1][y - 1][TIX] : 0) // + + (x > 0 ? 
shmem[z][y][TIX - 1] : 0) // dist=1 + + (y > 0 ? shmem[z][y - 1][TIX] : 0) // + + (z > 0 ? shmem[z - 1][y][TIX] : 0)); // + + auto id = base_id + (y * stride3.y); + + // delta and signum + if (gix < len3.x and (giy_base + y) < len3.y and giz < len3.z) { + signum[id] = delta_val < 0; + delta[id] = static_cast(fabs(delta_val)); + } + } + /* EOF */ +} + +template +__global__ void x_lorenzo_1d1l( // + bool* signum, + ErrCtrl* delta, + Data* xdata, + dim3 len3, + dim3 stride3, + FP ebx2) +{ + constexpr auto block_dim = BLOCK / SEQ; // dividable + + // coalesce-load (warp-striped) and transpose in shmem (similar for store) + typedef cub::BlockLoad BlockLoadT_signum; + typedef cub::BlockLoad BlockLoadT_delta; + typedef cub::BlockStore BlockStoreT_xdata; + typedef cub::BlockScan + BlockScanT_xdata; // TODO autoselect algorithm + + __shared__ union TempStorage { // overlap shared memory space + typename BlockLoadT_signum::TempStorage load_signum; + typename BlockLoadT_delta::TempStorage load_delta; + typename BlockStoreT_xdata::TempStorage store_xdata; + typename BlockScanT_xdata::TempStorage scan_xdata; + } temp_storage; + + // thread-scope tiled data + struct ThreadData { + Data xdata[SEQ]; + bool signum[SEQ]; + } thread_scope; + ErrCtrl thread_scope_delta[SEQ]; + + /******************************************************************************** + * load to thread-private array (fuse at the same time) + * (BIX * BDX * SEQ) denotes the start of the data chunk that belongs to this thread block + ********************************************************************************/ + BlockLoadT_delta(temp_storage.load_delta).Load(delta + (BIX * BDX) * SEQ, thread_scope_delta); + __syncthreads(); // barrier for shmem reuse + BlockLoadT_signum(temp_storage.load_signum).Load(signum + (BIX * BDX) * SEQ, thread_scope.signum); + __syncthreads(); // barrier for shmem reuse + +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto id = (BIX * BDX + TIX) * SEQ + i; + thread_scope.xdata[i] = id < len3.x // + ? (thread_scope.signum[i] ? -1 : 1) * static_cast(thread_scope_delta[i]) + : 0; + } + __syncthreads(); + + /******************************************************************************** + * perform partial-sum using cub::InclusiveSum + ********************************************************************************/ + BlockScanT_xdata(temp_storage.scan_xdata).InclusiveSum(thread_scope.xdata, thread_scope.xdata); + __syncthreads(); // barrier for shmem reuse + + /******************************************************************************** + * scale by ebx2 and write to DRAM + ********************************************************************************/ +#pragma unroll + for (auto i = 0; i < SEQ; i++) thread_scope.xdata[i] *= ebx2; + __syncthreads(); // barrier for shmem reuse + + BlockStoreT_xdata(temp_storage.store_xdata).Store(xdata + (BIX * BDX) * SEQ, thread_scope.xdata); +} + +template +__global__ void +x_lorenzo_2d1l_16x16data_mapto16x2(bool* signum, ErrCtrl* delta, Data* xdata, dim3 len3, dim3 stride3, FP ebx2) +{ + constexpr auto BLOCK = 16; + constexpr auto YSEQ = BLOCK / 2; // sequentiality in y direction + static_assert(BLOCK == 16, "In one case, we need BLOCK for 2D == 16"); + + __shared__ Data intermediate[BLOCK]; // TODO use warp shuffle to eliminate this + Data thread_scope[YSEQ]; + /* + . ------> gix (x) + | t00 t01 t02 t03 ... t0f + | ts00_0 ts00_0 ts00_0 ts00_0 + giy ts00_1 ts00_1 ts00_1 ts00_1 + (y) | | | | + ts00_7 ts00_7 ts00_7 ts00_7 + + | t10 t11 t12 t13 ... 
t1f + | ts00_0 ts00_0 ts00_0 ts00_0 + giy ts00_1 ts00_1 ts00_1 ts00_1 + (y) | | | | + ts00_7 ts00_7 ts00_7 ts00_7 + */ + + auto gix = BIX * BLOCK + TIX; + auto giy_base = BIY * BLOCK + TIY * YSEQ; // BDY * YSEQ = BLOCK == 16 + auto get_gid = [&](auto i) { return (giy_base + i) * stride3.y + gix; }; + + /******************************************************************************** + * load to thread-private array (fuse at the same time) + ********************************************************************************/ +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + auto gid = get_gid(i); + // even if we hit the else branch, all threads in a warp hit the y-boundary simultaneously + if (gix < len3.x and giy_base + i < len3.y) + thread_scope[i] = (signum[gid] ? -1 : 1) * static_cast(delta[gid]); // fuse + else + thread_scope[i] = 0; // TODO set as init state? + } + + /******************************************************************************** + * partial-sum along y-axis, sequantially + ********************************************************************************/ + for (auto i = 1; i < YSEQ; i++) thread_scope[i] += thread_scope[i - 1]; + // two-pass: store for cross-threadscope update + if (TIY == 0) intermediate[TIX] = thread_scope[YSEQ - 1]; + __syncthreads(); + // two-pass: load and update + if (TIY == 1) { + auto tmp = intermediate[TIX]; +#pragma unroll + for (auto& i : thread_scope) i += tmp; + } + + /******************************************************************************** + * in-warp partial-sum along x-axis + ********************************************************************************/ +#pragma unroll + for (auto& i : thread_scope) { + for (auto d = 1; d < BLOCK; d *= 2) { + Data n = __shfl_up_sync(0xffffffff, i, d, 16); + if (TIX >= d) i += n; + } + i *= ebx2; + } + + /******************************************************************************** + * write to DRAM + ********************************************************************************/ +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + auto gid = get_gid(i); + if (gix < len3.x and giy_base + i < len3.y) xdata[gid] = thread_scope[i]; + } +} + +template +__global__ void +x_lorenzo_3d1l_32x8x8data_mapto32x1x8(bool* signum, ErrCtrl* delta, Data* xdata, dim3 len3, dim3 stride3, FP ebx2) +{ + constexpr auto BLOCK = 8; + constexpr auto YSEQ = BLOCK; + static_assert(BLOCK == 8, "In one case, we need BLOCK for 3D == 8"); + + __shared__ Data intermediate[BLOCK][4][8]; + Data thread_scope[YSEQ]; + + auto seg_id = TIX / 8; + auto seg_tix = TIX % 8; + + auto gix = BIX * (4 * BLOCK) + TIX, giy_base = BIY * BLOCK, giz = BIZ * BLOCK + TIZ; + auto get_gid = [&](auto y) { return giz * stride3.z + (giy_base + y) * stride3.y + gix; }; + + /******************************************************************************** + * load to thread-private array (fuse at the same time) + ********************************************************************************/ +#pragma unroll + for (auto y = 0; y < YSEQ; y++) { + auto gid = get_gid(y); + if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) + thread_scope[y] = (signum[gid] ? 
-1 : 1) * static_cast(delta[gid]); + else + thread_scope[y] = 0; + } + + /******************************************************************************** + * partial-sum along y-axis, sequantially + ********************************************************************************/ + for (auto y = 1; y < YSEQ; y++) thread_scope[y] += thread_scope[y - 1]; + + /******************************************************************************** + * ND partial-sums along x- and z-axis + * in-warp shuffle used: in order to perform, it's transposed after X-partial sum + ********************************************************************************/ + auto dist = 1; + Data addend; + +#pragma unroll + for (auto i = 0; i < BLOCK; i++) { + Data val = thread_scope[i]; + + for (dist = 1; dist < BLOCK; dist *= 2) { + addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + // x-z transpose + intermediate[TIZ][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][TIZ]; + __syncthreads(); + + for (dist = 1; dist < BLOCK; dist *= 2) { + addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + intermediate[TIZ][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][TIZ]; + __syncthreads(); + + thread_scope[i] = val; + } + + /******************************************************************************** + * write to DRAM + ********************************************************************************/ +#pragma unroll + for (auto y = 0; y < YSEQ; y++) { + if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) { xdata[get_gid(y)] = thread_scope[y] * ebx2; } + } + /* EOF */ +} + +} // namespace experimental +} // namespace cusz + +#undef TIX +#undef TIY +#undef TIZ +#undef BIX +#undef BIY +#undef BIZ +#undef BDX +#undef BDY +#undef BDZ + +#endif /* E2BEA52A_4D2E_4966_9135_6CE8B8E05762 */ diff --git a/qtensor/compression/cusz/src/kernel/detail/spline3.inl b/qtensor/compression/cusz/src/kernel/detail/spline3.inl index 2c4f1213..5e3526bd 100644 --- a/qtensor/compression/cusz/src/kernel/detail/spline3.inl +++ b/qtensor/compression/cusz/src/kernel/detail/spline3.inl @@ -1,746 +1,746 @@ -/** - * @file spline3.inl - * @author Jiannan Tian - * @brief - * @version 0.2 - * @date 2021-05-15 - * - * (C) 2021 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef CUSZ_KERNEL_SPLINE3_CUH -#define CUSZ_KERNEL_SPLINE3_CUH - -#include -#include -#include -#include "utils/cuda_err.cuh" - -#define SPLINE3_COMPR true -#define SPLINE3_DECOMPR false - -#if __cplusplus >= 201703L -#define CONSTEXPR constexpr -#else -#define CONSTEXPR -#endif - -#define TIX threadIdx.x -#define TIY threadIdx.y -#define TIZ threadIdx.z -#define BIX blockIdx.x -#define BIY blockIdx.y -#define BIZ blockIdx.z -#define BDX blockDim.x -#define BDY blockDim.y -#define BDZ blockDim. 
- -using DIM = unsigned int; -using STRIDE = unsigned int; -using DIM3 = dim3; -using STRIDE3 = dim3; - -constexpr int BLOCK8 = 8; -constexpr int BLOCK32 = 32; - -#define SHM_ERROR shm_errctrl - -namespace cusz { - -/******************************************************************************** - * host API - ********************************************************************************/ - -template < - typename TITER, - typename EITER, - typename FP = float, - int LINEAR_BLOCK_SIZE = 256, - bool PROBE_PRED_ERROR = false> -__global__ void c_spline3d_infprecis_32x8x8data( - TITER data, - DIM3 data_size, - STRIDE3 data_leap, - EITER errctrl, - DIM3 errctrl_size, - STRIDE3 errctrl_leap, - TITER anchor, - STRIDE3 anchor_leap, - FP eb_r, - FP ebx2, - int radius, - TITER pred_error = nullptr, - TITER compress_error = nullptr); - -template < - typename EITER, - typename TITER, - typename FP = float, - int LINEAR_BLOCK_SIZE = 256> -__global__ void x_spline3d_infprecis_32x8x8data( - EITER errctrl, // input 1 - DIM3 errctrl_size, // - STRIDE3 errctrl_leap, // - TITER anchor, // input 2 - DIM3 anchor_size, // - STRIDE3 anchor_leap, // - TITER data, // output - DIM3 data_size, // - STRIDE3 data_leap, // - FP eb_r, - FP ebx2, - int radius); - -namespace device_api { -/******************************************************************************** - * device API - ********************************************************************************/ -template < - typename T1, - typename T2, - typename FP, - int LINEAR_BLOCK_SIZE, - bool WORKFLOW = SPLINE3_COMPR, - bool PROBE_PRED_ERROR = false> -__device__ void spline3d_layout2_interpolate( - volatile T1 shm_data[9][9][33], - volatile T2 shm_errctrl[9][9][33], - FP eb_r, - FP ebx2, - int radius); -} // namespace device_api - -} // namespace cusz - -/******************************************************************************** - * helper function - ********************************************************************************/ - -namespace { - -template -__forceinline__ __device__ bool xyz33x9x9_predicate(unsigned int x, unsigned int y, unsigned int z) -{ - if CONSTEXPR (INCLUSIVE) { // - return x <= 32 and y <= 8 and z <= 8; - } - else { - return x < 32 and y < 8 and z < 8; - } -} - -// control block_id3 in function call -template -__device__ void -spline3d_print_block_from_GPU(T volatile a[9][9][33], int radius = 512, bool compress = true, bool print_errctrl = true) -{ - for (auto z = 0; z < ZEND; z++) { - printf("\nprint from GPU, z=%d\n", z); - printf(" "); - for (auto i = 0; i < 33; i++) printf("%3d", i); - printf("\n"); - - for (auto y = 0; y < YEND; y++) { - printf("y=%d ", y); - for (auto x = 0; x < XEND; x++) { // - if CONSTEXPR (PRINT_FP) { printf("%.2e\t", (float)a[z][y][x]); } - else { - T c = print_errctrl ? 
a[z][y][x] - radius : a[z][y][x]; - if (compress) { - if (c == 0) { printf("%3c", '.'); } - else { - if (abs(c) >= 10) { printf("%3c", '*'); } - else { - if (print_errctrl) { printf("%3d", c); } - else { - printf("%4.2f", c); - } - } - } - } - else { - if (print_errctrl) { printf("%3d", c); } - else { - printf("%4.2f", c); - } - } - } - } - printf("\n"); - } - } - printf("\nGPU print end\n\n"); -} - -template -__device__ void -c_reset_scratch_33x9x9data(volatile T1 shm_data[9][9][33], volatile T2 shm_errctrl[9][9][33], int radius) -{ - // alternatively, reinterprete cast volatile T?[][][] to 1D - for (auto _tix = TIX; _tix < 33 * 9 * 9; _tix += LINEAR_BLOCK_SIZE) { - auto x = (_tix % 33); - auto y = (_tix / 33) % 9; - auto z = (_tix / 33) / 9; - - shm_data[z][y][x] = 0; - /***************************************************************************** - okay to use - ******************************************************************************/ - if (x % 8 == 0 and y % 8 == 0 and z % 8 == 0) shm_errctrl[z][y][x] = radius; - /***************************************************************************** - alternatively - ******************************************************************************/ - // shm_errctrl[z][y][x] = radius; - } - __syncthreads(); -} - -template -__device__ void c_gather_anchor(T1* data, DIM3 data_size, STRIDE3 data_leap, T1* anchor, STRIDE3 anchor_leap) -{ - auto x = (TIX % 32) + BIX * 32; - auto y = (TIX / 32) % 8 + BIY * 8; - auto z = (TIX / 32) / 8 + BIZ * 8; - - bool pred1 = x % 8 == 0 and y % 8 == 0 and z % 8 == 0; - bool pred2 = x < data_size.x and y < data_size.y and z < data_size.z; - - if (pred1 and pred2) { - auto data_id = x + y * data_leap.y + z * data_leap.z; - auto anchor_id = (x / 8) + (y / 8) * anchor_leap.y + (z / 8) * anchor_leap.z; - anchor[anchor_id] = data[data_id]; - } - __syncthreads(); -} - -/* - * use shmem, erroneous -template -__device__ void c_gather_anchor(volatile T1 shm_data[9][9][33], T1* anchor, STRIDE3 anchor_leap) -{ - constexpr auto NUM_ITERS = 33 * 9 * 9 / LINEAR_BLOCK_SIZE + 1; // 11 iterations - for (auto i = 0; i < NUM_ITERS; i++) { - auto _tix = i * LINEAR_BLOCK_SIZE + TIX; - - if (_tix < 33 * 9 * 9) { - auto x = (_tix % 33); - auto y = (_tix / 33) % 9; - auto z = (_tix / 33) / 9; - - if (x % 8 == 0 and y % 8 == 0 and z % 8 == 0) { - auto aid = ((x / 8) + BIX * 4) + // - ((y / 8) + BIY) * anchor_leap.y + // - ((z / 8) + BIZ) * anchor_leap.z; // - anchor[aid] = shm_data[z][y][x]; - } - } - } - __syncthreads(); -} -*/ - -template -__device__ void x_reset_scratch_33x9x9data( - volatile T1 shm_xdata[9][9][33], - volatile T2 shm_errctrl[9][9][33], - T1* anchor, // - DIM3 anchor_size, // - STRIDE3 anchor_leap) -{ - for (auto _tix = TIX; _tix < 33 * 9 * 9; _tix += LINEAR_BLOCK_SIZE) { - auto x = (_tix % 33); - auto y = (_tix / 33) % 9; - auto z = (_tix / 33) / 9; - - shm_errctrl[z][y][x] = 0; // TODO explicitly handle zero-padding - /***************************************************************************** - okay to use - ******************************************************************************/ - if (x % 8 == 0 and y % 8 == 0 and z % 8 == 0) { - shm_xdata[z][y][x] = 0; - - auto ax = ((x / 8) + BIX * 4); - auto ay = ((y / 8) + BIY); - auto az = ((z / 8) + BIZ); - - if (ax < anchor_size.x and ay < anchor_size.y and az < anchor_size.z) - shm_xdata[z][y][x] = anchor[ax + ay * anchor_leap.y + az * anchor_leap.z]; - } - /***************************************************************************** - alternatively - 
******************************************************************************/ - // shm_errctrl[z][y][x] = radius; - } - - __syncthreads(); -} - -template -__device__ void -global2shmem_33x9x9data(Input* data, DIM3 data_size, STRIDE3 data_leap, volatile Input shm_data[9][9][33]) -{ - constexpr auto TOTAL = 33 * 9 * 9; - - for (auto _tix = TIX; _tix < TOTAL; _tix += LINEAR_BLOCK_SIZE) { - auto x = (_tix % 33); - auto y = (_tix / 33) % 9; - auto z = (_tix / 33) / 9; - auto gx = (x + BIX * BLOCK32); - auto gy = (y + BIY * BLOCK8); - auto gz = (z + BIZ * BLOCK8); - auto gid = gx + gy * data_leap.y + gz * data_leap.z; - - if (gx < data_size.x and gy < data_size.y and gz < data_size.z) shm_data[z][y][x] = data[gid]; - } - __syncthreads(); -} - -template -__device__ void -shmem2global_32x8x8data(volatile Output shm_data[9][9][33], Output* data, DIM3 data_size, STRIDE3 data_leap) -{ - constexpr auto TOTAL = 32 * 8 * 8; - - for (auto _tix = TIX; _tix < TOTAL; _tix += LINEAR_BLOCK_SIZE) { - auto x = (_tix % 32); - auto y = (_tix / 32) % 8; - auto z = (_tix / 32) / 8; - auto gx = (x + BIX * BLOCK32); - auto gy = (y + BIY * BLOCK8); - auto gz = (z + BIZ * BLOCK8); - auto gid = gx + gy * data_leap.y + gz * data_leap.z; - - if (gx < data_size.x and gy < data_size.y and gz < data_size.z) data[gid] = shm_data[z][y][x]; - } - __syncthreads(); -} - -template < - typename T1, - typename T2, - typename FP, - typename LAMBDAX, - typename LAMBDAY, - typename LAMBDAZ, - bool BLUE, - bool YELLOW, - bool HOLLOW, - int LINEAR_BLOCK_SIZE, - int BLOCK_DIMX, - int BLOCK_DIMY, - bool COARSEN, - int BLOCK_DIMZ, - bool BORDER_INCLUSIVE, - bool WORKFLOW> -__forceinline__ __device__ void interpolate_stage( - volatile T1 shm_data[9][9][33], - volatile T2 shm_errctrl[9][9][33], - LAMBDAX xmap, - LAMBDAY ymap, - LAMBDAZ zmap, - int unit, - FP eb_r, - FP ebx2, - int radius) -{ - static_assert(BLOCK_DIMX * BLOCK_DIMY * (COARSEN ? 1 : BLOCK_DIMZ) <= LINEAR_BLOCK_SIZE, "block oversized"); - static_assert((BLUE or YELLOW or HOLLOW) == true, "must be one hot"); - static_assert((BLUE and YELLOW) == false, "must be only one hot (1)"); - static_assert((BLUE and YELLOW) == false, "must be only one hot (2)"); - static_assert((YELLOW and HOLLOW) == false, "must be only one hot (3)"); - - auto run = [&](auto x, auto y, auto z) { - if (xyz33x9x9_predicate(x, y, z)) { - T1 pred = 0; - - if CONSTEXPR (BLUE) { // - pred = (shm_data[z - unit][y][x] + shm_data[z + unit][y][x]) / 2; - } - if CONSTEXPR (YELLOW) { // - pred = (shm_data[z][y][x - unit] + shm_data[z][y][x + unit]) / 2; - } - if CONSTEXPR (HOLLOW) { // - pred = (shm_data[z][y - unit][x] + shm_data[z][y + unit][x]) / 2; - } - - if CONSTEXPR (WORKFLOW == SPLINE3_COMPR) { - auto err = shm_data[z][y][x] - pred; - decltype(err) code; - // TODO unsafe, did not deal with the out-of-cap case - { - code = fabs(err) * eb_r + 1; - code = err < 0 ? 
-code : code; - code = int(code / 2) + radius; - } - shm_errctrl[z][y][x] = code; // TODO double check if unsigned type works - shm_data[z][y][x] = pred + (code - radius) * ebx2; - } - else { // TODO == DECOMPRESSS and static_assert - auto code = shm_errctrl[z][y][x]; - shm_data[z][y][x] = pred + (code - radius) * ebx2; - } - } - }; - // -------------------------------------------------------------------------------- // - - if CONSTEXPR (COARSEN) { - constexpr auto TOTAL = BLOCK_DIMX * BLOCK_DIMY * BLOCK_DIMZ; - for (auto _tix = TIX; _tix < TOTAL; _tix += LINEAR_BLOCK_SIZE) { - auto itix = (_tix % BLOCK_DIMX); - auto itiy = (_tix / BLOCK_DIMX) % BLOCK_DIMY; - auto itiz = (_tix / BLOCK_DIMX) / BLOCK_DIMY; - auto x = xmap(itix, unit); - auto y = ymap(itiy, unit); - auto z = zmap(itiz, unit); - run(x, y, z); - } - } - else { - auto itix = (TIX % BLOCK_DIMX); - auto itiy = (TIX / BLOCK_DIMX) % BLOCK_DIMY; - auto itiz = (TIX / BLOCK_DIMX) / BLOCK_DIMY; - auto x = xmap(itix, unit); - auto y = ymap(itiy, unit); - auto z = zmap(itiz, unit); - run(x, y, z); - } - __syncthreads(); -} - -} // namespace - -/********************************************************************************/ - -template -__device__ void cusz::device_api::spline3d_layout2_interpolate( - volatile T1 shm_data[9][9][33], - volatile T2 shm_errctrl[9][9][33], - FP eb_r, - FP ebx2, - int radius) -{ - auto xblue = [] __device__(int _tix, int unit) -> int { return unit * (_tix * 2); }; - auto yblue = [] __device__(int _tiy, int unit) -> int { return unit * (_tiy * 2); }; - auto zblue = [] __device__(int _tiz, int unit) -> int { return unit * (_tiz * 2 + 1); }; - - auto xyellow = [] __device__(int _tix, int unit) -> int { return unit * (_tix * 2 + 1); }; - auto yyellow = [] __device__(int _tiy, int unit) -> int { return unit * (_tiy * 2); }; - auto zyellow = [] __device__(int _tiz, int unit) -> int { return unit * (_tiz); }; - - auto xhollow = [] __device__(int _tix, int unit) -> int { return unit * (_tix); }; - auto yhollow = [] __device__(int _tiy, int unit) -> int { return unit * (_tiy * 2 + 1); }; - auto zhollow = [] __device__(int _tiz, int unit) -> int { return unit * (_tiz); }; - - constexpr auto COARSEN = true; - constexpr auto NO_COARSEN = false; - constexpr auto BORDER_INCLUSIVE = true; - constexpr auto BORDER_EXCLUSIVE = false; - - int unit = 4; - - // iteration 1 - interpolate_stage< - T1, T2, FP, decltype(xblue), decltype(yblue), decltype(zblue), // - true, false, false, LINEAR_BLOCK_SIZE, 5, 2, NO_COARSEN, 1, BORDER_INCLUSIVE, WORKFLOW>( - shm_data, shm_errctrl, xblue, yblue, zblue, unit, eb_r, ebx2, radius); - interpolate_stage< - T1, T2, FP, decltype(xyellow), decltype(yyellow), decltype(zyellow), // - false, true, false, LINEAR_BLOCK_SIZE, 4, 2, NO_COARSEN, 3, BORDER_INCLUSIVE, WORKFLOW>( - shm_data, shm_errctrl, xyellow, yyellow, zyellow, unit, eb_r, ebx2, radius); - interpolate_stage< - T1, T2, FP, decltype(xhollow), decltype(yhollow), decltype(zhollow), // - false, false, true, LINEAR_BLOCK_SIZE, 9, 1, NO_COARSEN, 3, BORDER_INCLUSIVE, WORKFLOW>( - shm_data, shm_errctrl, xhollow, yhollow, zhollow, unit, eb_r, ebx2, radius); - - unit = 2; - - // iteration 2, TODO switch y-z order - interpolate_stage< - T1, T2, FP, decltype(xblue), decltype(yblue), decltype(zblue), // - true, false, false, LINEAR_BLOCK_SIZE, 9, 3, NO_COARSEN, 2, BORDER_INCLUSIVE, WORKFLOW>( - shm_data, shm_errctrl, xblue, yblue, zblue, unit, eb_r, ebx2, radius); - interpolate_stage< - T1, T2, FP, decltype(xyellow), decltype(yyellow), 
decltype(zyellow), // - false, true, false, LINEAR_BLOCK_SIZE, 8, 3, NO_COARSEN, 5, BORDER_INCLUSIVE, WORKFLOW>( - shm_data, shm_errctrl, xyellow, yyellow, zyellow, unit, eb_r, ebx2, radius); - interpolate_stage< - T1, T2, FP, decltype(xhollow), decltype(yhollow), decltype(zhollow), // - false, false, true, LINEAR_BLOCK_SIZE, 17, 2, NO_COARSEN, 5, BORDER_INCLUSIVE, WORKFLOW>( - shm_data, shm_errctrl, xhollow, yhollow, zhollow, unit, eb_r, ebx2, radius); - - unit = 1; - - // iteration 3 - interpolate_stage< - T1, T2, FP, decltype(xblue), decltype(yblue), decltype(zblue), // - true, false, false, LINEAR_BLOCK_SIZE, 17, 5, COARSEN, 4, BORDER_INCLUSIVE, WORKFLOW>( - shm_data, shm_errctrl, xblue, yblue, zblue, unit, eb_r, ebx2, radius); - interpolate_stage< - T1, T2, FP, decltype(xyellow), decltype(yyellow), decltype(zyellow), // - false, true, false, LINEAR_BLOCK_SIZE, 16, 5, COARSEN, 9, BORDER_INCLUSIVE, WORKFLOW>( - shm_data, shm_errctrl, xyellow, yyellow, zyellow, unit, eb_r, ebx2, radius); - /****************************************************************************** - test only: last step inclusive - ******************************************************************************/ - // interpolate_stage< - // T1, T2, FP, decltype(xhollow), decltype(yhollow), decltype(zhollow), // - // false, false, true, LINEAR_BLOCK_SIZE, 33, 4, COARSEN, 9, BORDER_INCLUSIVE, WORKFLOW>( - // shm_data, shm_errctrl, xhollow, yhollow, zhollow, unit, eb_r, ebx2, radius); - /****************************************************************************** - production - ******************************************************************************/ - interpolate_stage< - T1, T2, FP, decltype(xhollow), decltype(yhollow), decltype(zhollow), // - false, false, true, LINEAR_BLOCK_SIZE, 32, 4, COARSEN, 8, BORDER_EXCLUSIVE, WORKFLOW>( - shm_data, shm_errctrl, xhollow, yhollow, zhollow, unit, eb_r, ebx2, radius); - - /****************************************************************************** - test only: print a block - ******************************************************************************/ - // if (TIX == 0 and BIX == 0 and BIY == 0 and BIZ == 0) { spline3d_print_block_from_GPU(shm_errctrl); } - // if (TIX == 0 and BIX == 0 and BIY == 0 and BIZ == 0) { spline3d_print_block_from_GPU(shm_data); } -} - -/******************************************************************************** - * host API/kernel - ********************************************************************************/ - -template -__global__ void cusz::c_spline3d_infprecis_32x8x8data( - TITER data, - DIM3 data_size, - STRIDE3 data_leap, - EITER errctrl, - DIM3 errctrl_size, - STRIDE3 errctrl_leap, - TITER anchor, - STRIDE3 anchor_leap, - FP eb_r, - FP ebx2, - int radius, - TITER pred_error, - TITER compress_error) -{ - // compile time variables - using T = typename std::remove_pointer::type; - using E = typename std::remove_pointer::type; - - if CONSTEXPR (PROBE_PRED_ERROR) { - // TODO - } - else { - __shared__ struct { - T data[9][9][33]; - E errctrl[9][9][33]; - } shmem; - - c_reset_scratch_33x9x9data(shmem.data, shmem.errctrl, radius); - global2shmem_33x9x9data(data, data_size, data_leap, shmem.data); - - // version 1, use shmem, erroneous - // c_gather_anchor(shmem.data, anchor, anchor_leap); - // version 2, use global mem, correct - c_gather_anchor(data, data_size, data_leap, anchor, anchor_leap); - - cusz::device_api::spline3d_layout2_interpolate( - shmem.data, shmem.errctrl, eb_r, ebx2, radius); - 
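/*
 * Editorial sketch (not part of the original kernel): after interpolation,
 * shmem.errctrl holds quantization codes centered at `radius`.  For each
 * interpolated point, interpolate_stage() above computes, roughly,
 *
 *   auto err  = value - pred;                        // prediction residual
 *   int  code = int((fabs(err) * eb_r + 1) / 2);     // eb_r = 1 / eb
 *   code      = (err < 0 ? -code : code) + radius;   // fold the sign into the code
 *   value     = pred + (code - radius) * ebx2;       // ebx2 = 2 * eb, in-loop reconstruction
 *
 * The decompression kernel further below replays only the last line, using
 * the stored codes plus the anchors gathered at every 8th grid point.
 * Variable names here are illustrative, not the kernel's own.
 */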
shmem2global_32x8x8data(shmem.errctrl, errctrl, errctrl_size, errctrl_leap); - } -} - -template < - typename EITER, - typename TITER, - typename FP, - int LINEAR_BLOCK_SIZE> -__global__ void cusz::x_spline3d_infprecis_32x8x8data( - EITER errctrl, // input 1 - DIM3 errctrl_size, // - STRIDE3 errctrl_leap, // - TITER anchor, // input 2 - DIM3 anchor_size, // - STRIDE3 anchor_leap, // - TITER data, // output - DIM3 data_size, // - STRIDE3 data_leap, // - FP eb_r, - FP ebx2, - int radius) -{ - // compile time variables - using E = typename std::remove_pointer::type; - using T = typename std::remove_pointer::type; - - __shared__ struct { - E errctrl[9][9][33]; - T data[9][9][33]; - } shmem; - - x_reset_scratch_33x9x9data(shmem.data, shmem.errctrl, anchor, anchor_size, anchor_leap); - global2shmem_33x9x9data(errctrl, errctrl_size, errctrl_leap, shmem.errctrl); - cusz::device_api::spline3d_layout2_interpolate( - shmem.data, shmem.errctrl, eb_r, ebx2, radius); - shmem2global_32x8x8data(shmem.data, data, data_size, data_leap); -} - -#undef TIX -#undef TIY -#undef TIZ -#undef BIX -#undef BIY -#undef BIZ -#undef BDX -#undef BDY -#undef BDZ - -template -void launch_construct_Spline3( - T* data, - dim3 const len3, - T* anchor, - dim3 const an_len3, - E* errctrl, - dim3 const ec_len3, - double const eb, - int const radius, - float& time_elapsed, - cudaStream_t stream) -{ - auto divide3 = [](dim3 len, dim3 sublen) { - return dim3( - (len.x - 1) / sublen.x + 1, // - (len.y - 1) / sublen.y + 1, // - (len.z - 1) / sublen.z + 1); - }; - - auto ndim = [&]() { - if (len3.z == 1 and len3.y == 1) - return 1; - else if (len3.z == 1 and len3.y != 1) - return 2; - else - return 3; - }; - - constexpr auto SUBLEN_3D = dim3(32, 8, 8); - constexpr auto SEQ_3D = dim3(1, 8, 1); - constexpr auto BLOCK_3D = dim3(256, 1, 1); - auto GRID_3D = divide3(len3, SUBLEN_3D); - - { - constexpr auto SUBLEN_TOTAL = SUBLEN_3D.x * SUBLEN_3D.y * SUBLEN_3D.z; - constexpr auto SEQ_TOTAL = SEQ_3D.x * SEQ_3D.y * SEQ_3D.z; - constexpr auto BLOCK_TOTAL = BLOCK_3D.x * BLOCK_3D.y * BLOCK_3D.z; - - // static_assert(SUBLEN_TOTAL / SEQ_TOTAL == BLOCK_TOTAL, "parallelism does not match!"); - if (SUBLEN_TOTAL / SEQ_TOTAL != BLOCK_TOTAL) throw std::runtime_error("parallelism does not match!"); - } - - //////////////////////////////////////// - - auto ebx2 = eb * 2; - auto eb_r = 1 / eb; - auto leap3 = dim3(1, len3.x, len3.x * len3.y); - auto ec_leap3 = dim3(1, ec_len3.x, ec_len3.x * ec_len3.y); - auto an_leap3 = dim3(1, an_len3.x, an_len3.x * an_len3.y); - - CREATE_CUDAEVENT_PAIR; - START_CUDAEVENT_RECORDING(stream); - - auto d = ndim(); - - if (d == 1) { // - throw std::runtime_error("Spline1 not implemented"); - } - else if (d == 2) { - throw std::runtime_error("Spline2 not implemented"); - } - else if (d == 3) { - cusz::c_spline3d_infprecis_32x8x8data // - <<>> // - (data, len3, leap3, // - errctrl, ec_len3, ec_leap3, // - anchor, an_leap3, // - eb_r, ebx2, radius); - } - - STOP_CUDAEVENT_RECORDING(stream); - CHECK_CUDA(cudaStreamSynchronize(stream)); - TIME_ELAPSED_CUDAEVENT(&time_elapsed); - - DESTROY_CUDAEVENT_PAIR; -} - -template -void launch_reconstruct_Spline3( - T* xdata, - dim3 const len3, - T* anchor, - dim3 const an_len3, - E* errctrl, - dim3 const ec_len3, - double const eb, - int const radius, - float& time_elapsed, - cudaStream_t stream) -{ - auto divide3 = [](dim3 len, dim3 sublen) { - return dim3( - (len.x - 1) / sublen.x + 1, // - (len.y - 1) / sublen.y + 1, // - (len.z - 1) / sublen.z + 1); - }; - - /* - auto ndim = [&]() { - if 
(len3.z == 1 and len3.y == 1) - return 1; - else if (len3.z == 1 and len3.y != 1) - return 2; - else - return 3; - }; - */ - - constexpr auto SUBLEN_3D = dim3(32, 8, 8); - constexpr auto SEQ_3D = dim3(1, 8, 1); - constexpr auto BLOCK_3D = dim3(256, 1, 1); - auto GRID_3D = divide3(len3, SUBLEN_3D); - - { - constexpr auto SUBLEN_TOTAL = SUBLEN_3D.x * SUBLEN_3D.y * SUBLEN_3D.z; - constexpr auto SEQ_TOTAL = SEQ_3D.x * SEQ_3D.y * SEQ_3D.z; - constexpr auto BLOCK_TOTAL = BLOCK_3D.x * BLOCK_3D.y * BLOCK_3D.z; - - // static_assert(SUBLEN_TOTAL / SEQ_TOTAL == BLOCK_TOTAL, "parallelism does not match!"); - if (SUBLEN_TOTAL / SEQ_TOTAL != BLOCK_TOTAL) throw std::runtime_error("parallelism does not match!"); - } - - //////////////////////////////////////// - - auto ebx2 = eb * 2; - auto eb_r = 1 / eb; - auto leap3 = dim3(1, len3.x, len3.x * len3.y); - auto ec_leap3 = dim3(1, ec_len3.x, ec_len3.x * ec_len3.y); - auto an_leap3 = dim3(1, an_len3.x, an_len3.x * an_len3.y); - - CREATE_CUDAEVENT_PAIR; - START_CUDAEVENT_RECORDING(stream); - - cusz::x_spline3d_infprecis_32x8x8data // - <<>> // - (errctrl, ec_len3, ec_leap3, // - anchor, an_len3, an_leap3, // - xdata, len3, leap3, // - eb_r, ebx2, radius); - - STOP_CUDAEVENT_RECORDING(stream); - - CHECK_CUDA(cudaStreamSynchronize(stream)); - - TIME_ELAPSED_CUDAEVENT(&time_elapsed); - DESTROY_CUDAEVENT_PAIR; -} - -#endif +/** + * @file spline3.inl + * @author Jiannan Tian + * @brief + * @version 0.2 + * @date 2021-05-15 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_KERNEL_SPLINE3_CUH +#define CUSZ_KERNEL_SPLINE3_CUH + +#include +#include +#include +#include "utils/cuda_err.cuh" + +#define SPLINE3_COMPR true +#define SPLINE3_DECOMPR false + +#if __cplusplus >= 201703L +#define CONSTEXPR constexpr +#else +#define CONSTEXPR +#endif + +#define TIX threadIdx.x +#define TIY threadIdx.y +#define TIZ threadIdx.z +#define BIX blockIdx.x +#define BIY blockIdx.y +#define BIZ blockIdx.z +#define BDX blockDim.x +#define BDY blockDim.y +#define BDZ blockDim. 
+ +using DIM = unsigned int; +using STRIDE = unsigned int; +using DIM3 = dim3; +using STRIDE3 = dim3; + +constexpr int BLOCK8 = 8; +constexpr int BLOCK32 = 32; + +#define SHM_ERROR shm_errctrl + +namespace cusz { + +/******************************************************************************** + * host API + ********************************************************************************/ + +template < + typename TITER, + typename EITER, + typename FP = float, + int LINEAR_BLOCK_SIZE = 256, + bool PROBE_PRED_ERROR = false> +__global__ void c_spline3d_infprecis_32x8x8data( + TITER data, + DIM3 data_size, + STRIDE3 data_leap, + EITER errctrl, + DIM3 errctrl_size, + STRIDE3 errctrl_leap, + TITER anchor, + STRIDE3 anchor_leap, + FP eb_r, + FP ebx2, + int radius, + TITER pred_error = nullptr, + TITER compress_error = nullptr); + +template < + typename EITER, + typename TITER, + typename FP = float, + int LINEAR_BLOCK_SIZE = 256> +__global__ void x_spline3d_infprecis_32x8x8data( + EITER errctrl, // input 1 + DIM3 errctrl_size, // + STRIDE3 errctrl_leap, // + TITER anchor, // input 2 + DIM3 anchor_size, // + STRIDE3 anchor_leap, // + TITER data, // output + DIM3 data_size, // + STRIDE3 data_leap, // + FP eb_r, + FP ebx2, + int radius); + +namespace device_api { +/******************************************************************************** + * device API + ********************************************************************************/ +template < + typename T1, + typename T2, + typename FP, + int LINEAR_BLOCK_SIZE, + bool WORKFLOW = SPLINE3_COMPR, + bool PROBE_PRED_ERROR = false> +__device__ void spline3d_layout2_interpolate( + volatile T1 shm_data[9][9][33], + volatile T2 shm_errctrl[9][9][33], + FP eb_r, + FP ebx2, + int radius); +} // namespace device_api + +} // namespace cusz + +/******************************************************************************** + * helper function + ********************************************************************************/ + +namespace { + +template +__forceinline__ __device__ bool xyz33x9x9_predicate(unsigned int x, unsigned int y, unsigned int z) +{ + if CONSTEXPR (INCLUSIVE) { // + return x <= 32 and y <= 8 and z <= 8; + } + else { + return x < 32 and y < 8 and z < 8; + } +} + +// control block_id3 in function call +template +__device__ void +spline3d_print_block_from_GPU(T volatile a[9][9][33], int radius = 512, bool compress = true, bool print_errctrl = true) +{ + for (auto z = 0; z < ZEND; z++) { + printf("\nprint from GPU, z=%d\n", z); + printf(" "); + for (auto i = 0; i < 33; i++) printf("%3d", i); + printf("\n"); + + for (auto y = 0; y < YEND; y++) { + printf("y=%d ", y); + for (auto x = 0; x < XEND; x++) { // + if CONSTEXPR (PRINT_FP) { printf("%.2e\t", (float)a[z][y][x]); } + else { + T c = print_errctrl ? 
a[z][y][x] - radius : a[z][y][x]; + if (compress) { + if (c == 0) { printf("%3c", '.'); } + else { + if (abs(c) >= 10) { printf("%3c", '*'); } + else { + if (print_errctrl) { printf("%3d", c); } + else { + printf("%4.2f", c); + } + } + } + } + else { + if (print_errctrl) { printf("%3d", c); } + else { + printf("%4.2f", c); + } + } + } + } + printf("\n"); + } + } + printf("\nGPU print end\n\n"); +} + +template +__device__ void +c_reset_scratch_33x9x9data(volatile T1 shm_data[9][9][33], volatile T2 shm_errctrl[9][9][33], int radius) +{ + // alternatively, reinterprete cast volatile T?[][][] to 1D + for (auto _tix = TIX; _tix < 33 * 9 * 9; _tix += LINEAR_BLOCK_SIZE) { + auto x = (_tix % 33); + auto y = (_tix / 33) % 9; + auto z = (_tix / 33) / 9; + + shm_data[z][y][x] = 0; + /***************************************************************************** + okay to use + ******************************************************************************/ + if (x % 8 == 0 and y % 8 == 0 and z % 8 == 0) shm_errctrl[z][y][x] = radius; + /***************************************************************************** + alternatively + ******************************************************************************/ + // shm_errctrl[z][y][x] = radius; + } + __syncthreads(); +} + +template +__device__ void c_gather_anchor(T1* data, DIM3 data_size, STRIDE3 data_leap, T1* anchor, STRIDE3 anchor_leap) +{ + auto x = (TIX % 32) + BIX * 32; + auto y = (TIX / 32) % 8 + BIY * 8; + auto z = (TIX / 32) / 8 + BIZ * 8; + + bool pred1 = x % 8 == 0 and y % 8 == 0 and z % 8 == 0; + bool pred2 = x < data_size.x and y < data_size.y and z < data_size.z; + + if (pred1 and pred2) { + auto data_id = x + y * data_leap.y + z * data_leap.z; + auto anchor_id = (x / 8) + (y / 8) * anchor_leap.y + (z / 8) * anchor_leap.z; + anchor[anchor_id] = data[data_id]; + } + __syncthreads(); +} + +/* + * use shmem, erroneous +template +__device__ void c_gather_anchor(volatile T1 shm_data[9][9][33], T1* anchor, STRIDE3 anchor_leap) +{ + constexpr auto NUM_ITERS = 33 * 9 * 9 / LINEAR_BLOCK_SIZE + 1; // 11 iterations + for (auto i = 0; i < NUM_ITERS; i++) { + auto _tix = i * LINEAR_BLOCK_SIZE + TIX; + + if (_tix < 33 * 9 * 9) { + auto x = (_tix % 33); + auto y = (_tix / 33) % 9; + auto z = (_tix / 33) / 9; + + if (x % 8 == 0 and y % 8 == 0 and z % 8 == 0) { + auto aid = ((x / 8) + BIX * 4) + // + ((y / 8) + BIY) * anchor_leap.y + // + ((z / 8) + BIZ) * anchor_leap.z; // + anchor[aid] = shm_data[z][y][x]; + } + } + } + __syncthreads(); +} +*/ + +template +__device__ void x_reset_scratch_33x9x9data( + volatile T1 shm_xdata[9][9][33], + volatile T2 shm_errctrl[9][9][33], + T1* anchor, // + DIM3 anchor_size, // + STRIDE3 anchor_leap) +{ + for (auto _tix = TIX; _tix < 33 * 9 * 9; _tix += LINEAR_BLOCK_SIZE) { + auto x = (_tix % 33); + auto y = (_tix / 33) % 9; + auto z = (_tix / 33) / 9; + + shm_errctrl[z][y][x] = 0; // TODO explicitly handle zero-padding + /***************************************************************************** + okay to use + ******************************************************************************/ + if (x % 8 == 0 and y % 8 == 0 and z % 8 == 0) { + shm_xdata[z][y][x] = 0; + + auto ax = ((x / 8) + BIX * 4); + auto ay = ((y / 8) + BIY); + auto az = ((z / 8) + BIZ); + + if (ax < anchor_size.x and ay < anchor_size.y and az < anchor_size.z) + shm_xdata[z][y][x] = anchor[ax + ay * anchor_leap.y + az * anchor_leap.z]; + } + /***************************************************************************** + alternatively + 
******************************************************************************/ + // shm_errctrl[z][y][x] = radius; + } + + __syncthreads(); +} + +template +__device__ void +global2shmem_33x9x9data(Input* data, DIM3 data_size, STRIDE3 data_leap, volatile Input shm_data[9][9][33]) +{ + constexpr auto TOTAL = 33 * 9 * 9; + + for (auto _tix = TIX; _tix < TOTAL; _tix += LINEAR_BLOCK_SIZE) { + auto x = (_tix % 33); + auto y = (_tix / 33) % 9; + auto z = (_tix / 33) / 9; + auto gx = (x + BIX * BLOCK32); + auto gy = (y + BIY * BLOCK8); + auto gz = (z + BIZ * BLOCK8); + auto gid = gx + gy * data_leap.y + gz * data_leap.z; + + if (gx < data_size.x and gy < data_size.y and gz < data_size.z) shm_data[z][y][x] = data[gid]; + } + __syncthreads(); +} + +template +__device__ void +shmem2global_32x8x8data(volatile Output shm_data[9][9][33], Output* data, DIM3 data_size, STRIDE3 data_leap) +{ + constexpr auto TOTAL = 32 * 8 * 8; + + for (auto _tix = TIX; _tix < TOTAL; _tix += LINEAR_BLOCK_SIZE) { + auto x = (_tix % 32); + auto y = (_tix / 32) % 8; + auto z = (_tix / 32) / 8; + auto gx = (x + BIX * BLOCK32); + auto gy = (y + BIY * BLOCK8); + auto gz = (z + BIZ * BLOCK8); + auto gid = gx + gy * data_leap.y + gz * data_leap.z; + + if (gx < data_size.x and gy < data_size.y and gz < data_size.z) data[gid] = shm_data[z][y][x]; + } + __syncthreads(); +} + +template < + typename T1, + typename T2, + typename FP, + typename LAMBDAX, + typename LAMBDAY, + typename LAMBDAZ, + bool BLUE, + bool YELLOW, + bool HOLLOW, + int LINEAR_BLOCK_SIZE, + int BLOCK_DIMX, + int BLOCK_DIMY, + bool COARSEN, + int BLOCK_DIMZ, + bool BORDER_INCLUSIVE, + bool WORKFLOW> +__forceinline__ __device__ void interpolate_stage( + volatile T1 shm_data[9][9][33], + volatile T2 shm_errctrl[9][9][33], + LAMBDAX xmap, + LAMBDAY ymap, + LAMBDAZ zmap, + int unit, + FP eb_r, + FP ebx2, + int radius) +{ + static_assert(BLOCK_DIMX * BLOCK_DIMY * (COARSEN ? 1 : BLOCK_DIMZ) <= LINEAR_BLOCK_SIZE, "block oversized"); + static_assert((BLUE or YELLOW or HOLLOW) == true, "must be one hot"); + static_assert((BLUE and YELLOW) == false, "must be only one hot (1)"); + static_assert((BLUE and YELLOW) == false, "must be only one hot (2)"); + static_assert((YELLOW and HOLLOW) == false, "must be only one hot (3)"); + + auto run = [&](auto x, auto y, auto z) { + if (xyz33x9x9_predicate(x, y, z)) { + T1 pred = 0; + + if CONSTEXPR (BLUE) { // + pred = (shm_data[z - unit][y][x] + shm_data[z + unit][y][x]) / 2; + } + if CONSTEXPR (YELLOW) { // + pred = (shm_data[z][y][x - unit] + shm_data[z][y][x + unit]) / 2; + } + if CONSTEXPR (HOLLOW) { // + pred = (shm_data[z][y - unit][x] + shm_data[z][y + unit][x]) / 2; + } + + if CONSTEXPR (WORKFLOW == SPLINE3_COMPR) { + auto err = shm_data[z][y][x] - pred; + decltype(err) code; + // TODO unsafe, did not deal with the out-of-cap case + { + code = fabs(err) * eb_r + 1; + code = err < 0 ? 
-code : code; + code = int(code / 2) + radius; + } + shm_errctrl[z][y][x] = code; // TODO double check if unsigned type works + shm_data[z][y][x] = pred + (code - radius) * ebx2; + } + else { // TODO == DECOMPRESSS and static_assert + auto code = shm_errctrl[z][y][x]; + shm_data[z][y][x] = pred + (code - radius) * ebx2; + } + } + }; + // -------------------------------------------------------------------------------- // + + if CONSTEXPR (COARSEN) { + constexpr auto TOTAL = BLOCK_DIMX * BLOCK_DIMY * BLOCK_DIMZ; + for (auto _tix = TIX; _tix < TOTAL; _tix += LINEAR_BLOCK_SIZE) { + auto itix = (_tix % BLOCK_DIMX); + auto itiy = (_tix / BLOCK_DIMX) % BLOCK_DIMY; + auto itiz = (_tix / BLOCK_DIMX) / BLOCK_DIMY; + auto x = xmap(itix, unit); + auto y = ymap(itiy, unit); + auto z = zmap(itiz, unit); + run(x, y, z); + } + } + else { + auto itix = (TIX % BLOCK_DIMX); + auto itiy = (TIX / BLOCK_DIMX) % BLOCK_DIMY; + auto itiz = (TIX / BLOCK_DIMX) / BLOCK_DIMY; + auto x = xmap(itix, unit); + auto y = ymap(itiy, unit); + auto z = zmap(itiz, unit); + run(x, y, z); + } + __syncthreads(); +} + +} // namespace + +/********************************************************************************/ + +template +__device__ void cusz::device_api::spline3d_layout2_interpolate( + volatile T1 shm_data[9][9][33], + volatile T2 shm_errctrl[9][9][33], + FP eb_r, + FP ebx2, + int radius) +{ + auto xblue = [] __device__(int _tix, int unit) -> int { return unit * (_tix * 2); }; + auto yblue = [] __device__(int _tiy, int unit) -> int { return unit * (_tiy * 2); }; + auto zblue = [] __device__(int _tiz, int unit) -> int { return unit * (_tiz * 2 + 1); }; + + auto xyellow = [] __device__(int _tix, int unit) -> int { return unit * (_tix * 2 + 1); }; + auto yyellow = [] __device__(int _tiy, int unit) -> int { return unit * (_tiy * 2); }; + auto zyellow = [] __device__(int _tiz, int unit) -> int { return unit * (_tiz); }; + + auto xhollow = [] __device__(int _tix, int unit) -> int { return unit * (_tix); }; + auto yhollow = [] __device__(int _tiy, int unit) -> int { return unit * (_tiy * 2 + 1); }; + auto zhollow = [] __device__(int _tiz, int unit) -> int { return unit * (_tiz); }; + + constexpr auto COARSEN = true; + constexpr auto NO_COARSEN = false; + constexpr auto BORDER_INCLUSIVE = true; + constexpr auto BORDER_EXCLUSIVE = false; + + int unit = 4; + + // iteration 1 + interpolate_stage< + T1, T2, FP, decltype(xblue), decltype(yblue), decltype(zblue), // + true, false, false, LINEAR_BLOCK_SIZE, 5, 2, NO_COARSEN, 1, BORDER_INCLUSIVE, WORKFLOW>( + shm_data, shm_errctrl, xblue, yblue, zblue, unit, eb_r, ebx2, radius); + interpolate_stage< + T1, T2, FP, decltype(xyellow), decltype(yyellow), decltype(zyellow), // + false, true, false, LINEAR_BLOCK_SIZE, 4, 2, NO_COARSEN, 3, BORDER_INCLUSIVE, WORKFLOW>( + shm_data, shm_errctrl, xyellow, yyellow, zyellow, unit, eb_r, ebx2, radius); + interpolate_stage< + T1, T2, FP, decltype(xhollow), decltype(yhollow), decltype(zhollow), // + false, false, true, LINEAR_BLOCK_SIZE, 9, 1, NO_COARSEN, 3, BORDER_INCLUSIVE, WORKFLOW>( + shm_data, shm_errctrl, xhollow, yhollow, zhollow, unit, eb_r, ebx2, radius); + + unit = 2; + + // iteration 2, TODO switch y-z order + interpolate_stage< + T1, T2, FP, decltype(xblue), decltype(yblue), decltype(zblue), // + true, false, false, LINEAR_BLOCK_SIZE, 9, 3, NO_COARSEN, 2, BORDER_INCLUSIVE, WORKFLOW>( + shm_data, shm_errctrl, xblue, yblue, zblue, unit, eb_r, ebx2, radius); + interpolate_stage< + T1, T2, FP, decltype(xyellow), decltype(yyellow), 
decltype(zyellow), // + false, true, false, LINEAR_BLOCK_SIZE, 8, 3, NO_COARSEN, 5, BORDER_INCLUSIVE, WORKFLOW>( + shm_data, shm_errctrl, xyellow, yyellow, zyellow, unit, eb_r, ebx2, radius); + interpolate_stage< + T1, T2, FP, decltype(xhollow), decltype(yhollow), decltype(zhollow), // + false, false, true, LINEAR_BLOCK_SIZE, 17, 2, NO_COARSEN, 5, BORDER_INCLUSIVE, WORKFLOW>( + shm_data, shm_errctrl, xhollow, yhollow, zhollow, unit, eb_r, ebx2, radius); + + unit = 1; + + // iteration 3 + interpolate_stage< + T1, T2, FP, decltype(xblue), decltype(yblue), decltype(zblue), // + true, false, false, LINEAR_BLOCK_SIZE, 17, 5, COARSEN, 4, BORDER_INCLUSIVE, WORKFLOW>( + shm_data, shm_errctrl, xblue, yblue, zblue, unit, eb_r, ebx2, radius); + interpolate_stage< + T1, T2, FP, decltype(xyellow), decltype(yyellow), decltype(zyellow), // + false, true, false, LINEAR_BLOCK_SIZE, 16, 5, COARSEN, 9, BORDER_INCLUSIVE, WORKFLOW>( + shm_data, shm_errctrl, xyellow, yyellow, zyellow, unit, eb_r, ebx2, radius); + /****************************************************************************** + test only: last step inclusive + ******************************************************************************/ + // interpolate_stage< + // T1, T2, FP, decltype(xhollow), decltype(yhollow), decltype(zhollow), // + // false, false, true, LINEAR_BLOCK_SIZE, 33, 4, COARSEN, 9, BORDER_INCLUSIVE, WORKFLOW>( + // shm_data, shm_errctrl, xhollow, yhollow, zhollow, unit, eb_r, ebx2, radius); + /****************************************************************************** + production + ******************************************************************************/ + interpolate_stage< + T1, T2, FP, decltype(xhollow), decltype(yhollow), decltype(zhollow), // + false, false, true, LINEAR_BLOCK_SIZE, 32, 4, COARSEN, 8, BORDER_EXCLUSIVE, WORKFLOW>( + shm_data, shm_errctrl, xhollow, yhollow, zhollow, unit, eb_r, ebx2, radius); + + /****************************************************************************** + test only: print a block + ******************************************************************************/ + // if (TIX == 0 and BIX == 0 and BIY == 0 and BIZ == 0) { spline3d_print_block_from_GPU(shm_errctrl); } + // if (TIX == 0 and BIX == 0 and BIY == 0 and BIZ == 0) { spline3d_print_block_from_GPU(shm_data); } +} + +/******************************************************************************** + * host API/kernel + ********************************************************************************/ + +template +__global__ void cusz::c_spline3d_infprecis_32x8x8data( + TITER data, + DIM3 data_size, + STRIDE3 data_leap, + EITER errctrl, + DIM3 errctrl_size, + STRIDE3 errctrl_leap, + TITER anchor, + STRIDE3 anchor_leap, + FP eb_r, + FP ebx2, + int radius, + TITER pred_error, + TITER compress_error) +{ + // compile time variables + using T = typename std::remove_pointer::type; + using E = typename std::remove_pointer::type; + + if CONSTEXPR (PROBE_PRED_ERROR) { + // TODO + } + else { + __shared__ struct { + T data[9][9][33]; + E errctrl[9][9][33]; + } shmem; + + c_reset_scratch_33x9x9data(shmem.data, shmem.errctrl, radius); + global2shmem_33x9x9data(data, data_size, data_leap, shmem.data); + + // version 1, use shmem, erroneous + // c_gather_anchor(shmem.data, anchor, anchor_leap); + // version 2, use global mem, correct + c_gather_anchor(data, data_size, data_leap, anchor, anchor_leap); + + cusz::device_api::spline3d_layout2_interpolate( + shmem.data, shmem.errctrl, eb_r, ebx2, radius); + 
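// note: at this point the error-control codes already sit in shm_errctrl; each code is
+    // int((|err| * eb_r + 1) / 2) with the sign folded in, shifted by radius (see interpolate_stage above).
+    // e.g. with eb = 1e-3 (so eb_r = 1000, ebx2 = 2e-3), radius = 512 and err = +0.0031:
+    // code = int(4.1 / 2) + 512 = 514, and decoding pred + (514 - 512) * ebx2 is off by 0.0009 <= eb.
+    // the write below flushes only the 32x8x8 core of the 33x9x9 shared tile; the extra halo plane is not written back.
+    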
shmem2global_32x8x8data(shmem.errctrl, errctrl, errctrl_size, errctrl_leap); + } +} + +template < + typename EITER, + typename TITER, + typename FP, + int LINEAR_BLOCK_SIZE> +__global__ void cusz::x_spline3d_infprecis_32x8x8data( + EITER errctrl, // input 1 + DIM3 errctrl_size, // + STRIDE3 errctrl_leap, // + TITER anchor, // input 2 + DIM3 anchor_size, // + STRIDE3 anchor_leap, // + TITER data, // output + DIM3 data_size, // + STRIDE3 data_leap, // + FP eb_r, + FP ebx2, + int radius) +{ + // compile time variables + using E = typename std::remove_pointer::type; + using T = typename std::remove_pointer::type; + + __shared__ struct { + E errctrl[9][9][33]; + T data[9][9][33]; + } shmem; + + x_reset_scratch_33x9x9data(shmem.data, shmem.errctrl, anchor, anchor_size, anchor_leap); + global2shmem_33x9x9data(errctrl, errctrl_size, errctrl_leap, shmem.errctrl); + cusz::device_api::spline3d_layout2_interpolate( + shmem.data, shmem.errctrl, eb_r, ebx2, radius); + shmem2global_32x8x8data(shmem.data, data, data_size, data_leap); +} + +#undef TIX +#undef TIY +#undef TIZ +#undef BIX +#undef BIY +#undef BIZ +#undef BDX +#undef BDY +#undef BDZ + +template +void launch_construct_Spline3( + T* data, + dim3 const len3, + T* anchor, + dim3 const an_len3, + E* errctrl, + dim3 const ec_len3, + double const eb, + int const radius, + float& time_elapsed, + cudaStream_t stream) +{ + auto divide3 = [](dim3 len, dim3 sublen) { + return dim3( + (len.x - 1) / sublen.x + 1, // + (len.y - 1) / sublen.y + 1, // + (len.z - 1) / sublen.z + 1); + }; + + auto ndim = [&]() { + if (len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + + constexpr auto SUBLEN_3D = dim3(32, 8, 8); + constexpr auto SEQ_3D = dim3(1, 8, 1); + constexpr auto BLOCK_3D = dim3(256, 1, 1); + auto GRID_3D = divide3(len3, SUBLEN_3D); + + { + constexpr auto SUBLEN_TOTAL = SUBLEN_3D.x * SUBLEN_3D.y * SUBLEN_3D.z; + constexpr auto SEQ_TOTAL = SEQ_3D.x * SEQ_3D.y * SEQ_3D.z; + constexpr auto BLOCK_TOTAL = BLOCK_3D.x * BLOCK_3D.y * BLOCK_3D.z; + + // static_assert(SUBLEN_TOTAL / SEQ_TOTAL == BLOCK_TOTAL, "parallelism does not match!"); + if (SUBLEN_TOTAL / SEQ_TOTAL != BLOCK_TOTAL) throw std::runtime_error("parallelism does not match!"); + } + + //////////////////////////////////////// + + auto ebx2 = eb * 2; + auto eb_r = 1 / eb; + auto leap3 = dim3(1, len3.x, len3.x * len3.y); + auto ec_leap3 = dim3(1, ec_len3.x, ec_len3.x * ec_len3.y); + auto an_leap3 = dim3(1, an_len3.x, an_len3.x * an_len3.y); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + auto d = ndim(); + + if (d == 1) { // + throw std::runtime_error("Spline1 not implemented"); + } + else if (d == 2) { + throw std::runtime_error("Spline2 not implemented"); + } + else if (d == 3) { + cusz::c_spline3d_infprecis_32x8x8data // + <<>> // + (data, len3, leap3, // + errctrl, ec_len3, ec_leap3, // + anchor, an_leap3, // + eb_r, ebx2, radius); + } + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + TIME_ELAPSED_CUDAEVENT(&time_elapsed); + + DESTROY_CUDAEVENT_PAIR; +} + +template +void launch_reconstruct_Spline3( + T* xdata, + dim3 const len3, + T* anchor, + dim3 const an_len3, + E* errctrl, + dim3 const ec_len3, + double const eb, + int const radius, + float& time_elapsed, + cudaStream_t stream) +{ + auto divide3 = [](dim3 len, dim3 sublen) { + return dim3( + (len.x - 1) / sublen.x + 1, // + (len.y - 1) / sublen.y + 1, // + (len.z - 1) / sublen.z + 1); + }; + + /* + auto ndim = [&]() { + if 
(len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + */ + + constexpr auto SUBLEN_3D = dim3(32, 8, 8); + constexpr auto SEQ_3D = dim3(1, 8, 1); + constexpr auto BLOCK_3D = dim3(256, 1, 1); + auto GRID_3D = divide3(len3, SUBLEN_3D); + + { + constexpr auto SUBLEN_TOTAL = SUBLEN_3D.x * SUBLEN_3D.y * SUBLEN_3D.z; + constexpr auto SEQ_TOTAL = SEQ_3D.x * SEQ_3D.y * SEQ_3D.z; + constexpr auto BLOCK_TOTAL = BLOCK_3D.x * BLOCK_3D.y * BLOCK_3D.z; + + // static_assert(SUBLEN_TOTAL / SEQ_TOTAL == BLOCK_TOTAL, "parallelism does not match!"); + if (SUBLEN_TOTAL / SEQ_TOTAL != BLOCK_TOTAL) throw std::runtime_error("parallelism does not match!"); + } + + //////////////////////////////////////// + + auto ebx2 = eb * 2; + auto eb_r = 1 / eb; + auto leap3 = dim3(1, len3.x, len3.x * len3.y); + auto ec_leap3 = dim3(1, ec_len3.x, ec_len3.x * ec_len3.y); + auto an_leap3 = dim3(1, an_len3.x, an_len3.x * an_len3.y); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + cusz::x_spline3d_infprecis_32x8x8data // + <<>> // + (errctrl, ec_len3, ec_leap3, // + anchor, an_len3, an_leap3, // + xdata, len3, leap3, // + eb_r, ebx2, radius); + + STOP_CUDAEVENT_RECORDING(stream); + + CHECK_CUDA(cudaStreamSynchronize(stream)); + + TIME_ELAPSED_CUDAEVENT(&time_elapsed); + DESTROY_CUDAEVENT_PAIR; +} + +#endif diff --git a/qtensor/compression/cusz/src/kernel/detail/subroutine.inl b/qtensor/compression/cusz/src/kernel/detail/subroutine.inl index 2aa5bb5c..15d10ade 100644 --- a/qtensor/compression/cusz/src/kernel/detail/subroutine.inl +++ b/qtensor/compression/cusz/src/kernel/detail/subroutine.inl @@ -1,1074 +1,1074 @@ -/** - * @file subroutine.inl - * @author Jiannan Tian - * @brief subroutines of kernels - * @version 0.4 - * @date 2022-12-22 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include -#include -#include "cusz/pn.hh" -#include "pipeline/compaction_g.inl" -#include "subsub.inl" - -namespace psz { -namespace cuda { -namespace __device { - -//////// 1D - -namespace v0 { - -// compression load -template -__forceinline__ __device__ void load_prequant_1d( - T* data, - uint32_t dimx, - uint32_t id_base, - volatile T* shmem, - T private_buffer[SEQ], - T& prev, - FP ebx2_r); - -// decompression load -template -__forceinline__ __device__ void load_fuse_1d( - EQ* quant, - T* outlier, - uint32_t dimx, - uint32_t id_base, - int radius, - volatile T* shmem, - T private_buffer[SEQ]); - -namespace delta_only { - -template -__forceinline__ __device__ void -load_1d(EQ* quant, uint32_t dimx, uint32_t id_base, volatile T* shmem, T private_buffer[SEQ]); - -} - -// compression and decompression store -template -__forceinline__ __device__ void write_1d( // - volatile T1* shmem_a1, - volatile T2* shmem_a2, - uint32_t dimx, - uint32_t id_base, - T1* a1, - T2* a2); - -// compression pred-quant, method 1 -template -__forceinline__ __device__ void predict_quantize__no_outlier_1d( // - T private_buffer[SEQ], - volatile EQ* shmem_quant, - T prev = 0); - -// compression pred-quant, method 2 -template -__forceinline__ __device__ void predict_quantize_1d( // - T private_buffer[SEQ], - volatile EQ* shmem_quant, - volatile T* shmem_outlier, - int radius, - T prev = 0); - -namespace compaction { - -template < - typename T, - typename EQ, - int SEQ, - bool FIRST_POINT, - typename Compaction = CompactionDRAM> -__forceinline__ __device__ void predict_quantize_1d( // - T thp_buffer[SEQ], - volatile EQ* s_quant, - uint32_t dimx, - int radius, - 
uint32_t g_id_base, - Compaction g_outlier, - T prev = 0); - -} - -// decompression pred-quant -template -__forceinline__ __device__ void block_scan_1d( - T private_buffer[SEQ], - T ebx2, - volatile T* exchange_in, - volatile T* exchange_out, - volatile T* shmem_buffer); - -} // namespace v0 - -namespace v1_pn { - -template -__forceinline__ __device__ void -load_fuse_1d(EQ* quant, T* outlier, uint32_t dimx, uint32_t id_base, volatile T* shmem, T private_buffer[SEQ]); - -template -__forceinline__ __device__ void -predict_quantize__no_outlier_1d(T private_buffer[SEQ], volatile EQ* shmem_quant, T prev); - -template -__forceinline__ __device__ void -predict_quantize_1d(T private_buffer[SEQ], volatile EQ* shmem_quant, volatile T* shmem_outlier, int radius, T prev); - -namespace compaction { - -template -__forceinline__ __device__ void predict_quantize_1d( - T thp_buffer[SEQ], - volatile EQ* s_quant, - uint32_t dimx, - int radius, - uint32_t g_idx_base, - Compaction outlier, - T prev); - -} - -namespace delta_only { - -template -__forceinline__ __device__ void -load_1d(EQ* quant, uint32_t dimx, uint32_t id_base, volatile T* shmem, T private_buffer[SEQ]); - -} - -} // namespace v1_pn - -//////// 2D - -namespace v0 { - -template -__forceinline__ __device__ void load_prequant_2d( - T* data, - uint32_t dimx, - uint32_t gix, - uint32_t dimy, - uint32_t giy_base, - uint32_t stridey, - FP ebx2_r, - T center[YSEQ + 1]); - -template -__forceinline__ __device__ void predict_2d(T center[YSEQ + 1]); - -template -__forceinline__ __device__ void quantize_write_2d( - T delta[YSEQ + 1], - uint32_t dimx, - uint32_t gix, - uint32_t dimy, - uint32_t giy_base, - uint32_t stridey, - int radius, - EQ* quant, - T* outlier); - -namespace delta_only { - -template -__forceinline__ __device__ void quantize_write_2d( - T delta[YSEQ + 1], - uint32_t dimx, - uint32_t gix, - uint32_t dimy, - uint32_t giy_base, - uint32_t stridey, - EQ* quant); - -} - -namespace compaction { - -template -__forceinline__ __device__ void quantize_write_2d( - T delta[YSEQ + 1], - uint32_t dimx, - uint32_t gix, - uint32_t dimy, - uint32_t giy_base, - uint32_t stridey, - int radius, - EQ* quant, - Compaction outlier); - -}; - -// decompression load -template -__forceinline__ __device__ void load_fuse_2d( - EQ* quant, - T* outlier, - uint32_t dimx, - uint32_t gix, - uint32_t dimy, - uint32_t giy_base, - uint32_t stridey, - int radius, - T private_buffer[YSEQ]); - -namespace delta_only { -// decompression load -template -__forceinline__ __device__ void load_2d( - EQ* quant, - uint32_t dimx, - uint32_t gix, - uint32_t dimy, - uint32_t giy_base, - uint32_t stridey, - T private_buffer[YSEQ]); - -} // namespace delta_only - -template -__forceinline__ __device__ void block_scan_2d( // - T thread_private[YSEQ], - volatile T* intermediate, - FP ebx2); - -template -__forceinline__ __device__ void decomp_write_2d( - T thread_private[YSEQ], - uint32_t dimx, - uint32_t gix, - uint32_t dimy, - uint32_t giy_base, - uint32_t stridey, - T* xdata); - -} // namespace v0 - -namespace v1_pn { - -namespace compaction { -template -__forceinline__ __device__ void quantize_write_2d( - // clang-format off - T delta[YSEQ + 1], - uint32_t dimx, uint32_t gix, - uint32_t dimy, uint32_t giy_base, uint32_t stridey, - int radius, - EQ* quant, - Compaction outlier - // clang-format on -); - -} - -template -__forceinline__ __device__ void load_fuse_2d( - // clang-format off - EQ* quant, - T* outlier, - uint32_t dimx, uint32_t gix, - uint32_t dimy, uint32_t giy_base, uint32_t 
stridey, - int radius, - T thread_private[YSEQ] - // clang-format on -); - -namespace delta_only { - -template -__forceinline__ __device__ void load_2d( - // clang-format off - EQ* quant, - uint32_t dimx, uint32_t gix, - uint32_t dimy, uint32_t giy_base, uint32_t stridey, - T thread_private[YSEQ] - // clang-format on -); - -template -__forceinline__ __device__ void quantize_write_2d( - T delta[YSEQ + 1], - uint32_t dimx, - uint32_t gix, - uint32_t dimy, - uint32_t giy_base, - uint32_t stridey, - EQ* quant); - -} // namespace delta_only - -} // namespace v1_pn - -//////// 3D - -namespace v0 { - -// TODO move subroutines for 3D here - -} - -} // namespace __device -} // namespace cuda -} // namespace psz - -//////////////////////////////////////////////////////////////////////////////// - -//////// 1D - -template -__forceinline__ __device__ void psz::cuda::__device::v0::load_prequant_1d( - T* data, - uint32_t dimx, - uint32_t id_base, - volatile T* shmem, - T private_buffer[SEQ], - T& prev, // TODO use pointer? - FP ebx2_r) -{ -#pragma unroll - for (auto i = 0; i < SEQ; i++) { - auto id = id_base + threadIdx.x + i * NTHREAD; - if (id < dimx) shmem[threadIdx.x + i * NTHREAD] = round(data[id] * ebx2_r); - } - __syncthreads(); - -#pragma unroll - for (auto i = 0; i < SEQ; i++) private_buffer[i] = shmem[threadIdx.x * SEQ + i]; - if (threadIdx.x > 0) prev = shmem[threadIdx.x * SEQ - 1]; - __syncthreads(); -} - -template -__forceinline__ __device__ void psz::cuda::__device::v0::load_fuse_1d( - EQ* quant, - T* outlier, - uint32_t dimx, - uint32_t id_base, - int radius, - volatile T* shmem, - T private_buffer[SEQ]) -{ -#pragma unroll - for (auto i = 0; i < SEQ; i++) { - auto local_id = threadIdx.x + i * NTHREAD; - auto id = id_base + local_id; - if (id < dimx) shmem[local_id] = outlier[id] + static_cast(quant[id]) - radius; - } - __syncthreads(); - -#pragma unroll - for (auto i = 0; i < SEQ; i++) private_buffer[i] = shmem[threadIdx.x * SEQ + i]; - __syncthreads(); -} - -template -__forceinline__ __device__ void psz::cuda::__device::v1_pn::load_fuse_1d( - EQ* quant, - T* outlier, - uint32_t dimx, - uint32_t id_base, - volatile T* shmem, - T private_buffer[SEQ]) -{ - constexpr auto BYTEWIDTH = sizeof(EQ); - - using UI = EQ; - using I = typename psz::typing::Int::T; - -#pragma unroll - for (auto i = 0; i < SEQ; i++) { - auto local_id = threadIdx.x + i * NTHREAD; - auto id = id_base + local_id; - if (id < dimx) shmem[local_id] = outlier[id] + PN::decode(quant[id]); - } - __syncthreads(); - -#pragma unroll - for (auto i = 0; i < SEQ; i++) private_buffer[i] = shmem[threadIdx.x * SEQ + i]; - __syncthreads(); -} - -template -__forceinline__ __device__ void psz::cuda::__device::v0::delta_only::load_1d( - EQ* quant, - uint32_t dimx, - uint32_t id_base, - volatile T* shmem, - T private_buffer[SEQ]) -{ -#pragma unroll - for (auto i = 0; i < SEQ; i++) { - auto local_id = threadIdx.x + i * NTHREAD; - auto id = id_base + local_id; - if (id < dimx) shmem[local_id] = static_cast(quant[id]); - } - __syncthreads(); - -#pragma unroll - for (auto i = 0; i < SEQ; i++) private_buffer[i] = shmem[threadIdx.x * SEQ + i]; - __syncthreads(); -} - -template -__forceinline__ __device__ void psz::cuda::__device::v1_pn::delta_only::load_1d( - EQ* quant, - uint32_t dimx, - uint32_t id_base, - volatile T* shmem, - T private_buffer[SEQ]) -{ - constexpr auto BYTEWIDTH = sizeof(EQ); - - using UI = EQ; - using I = typename psz::typing::Int::T; - -#pragma unroll - for (auto i = 0; i < SEQ; i++) { - auto local_id = threadIdx.x + i * 
NTHREAD; - auto id = id_base + local_id; - if (id < dimx) shmem[local_id] = PN::decode(quant[id]); - } - __syncthreads(); - -#pragma unroll - for (auto i = 0; i < SEQ; i++) private_buffer[i] = shmem[threadIdx.x * SEQ + i]; - __syncthreads(); -} - -template // TODO remove NO_OUTLIER, use nullable -__forceinline__ __device__ void psz::cuda::__device::v0::write_1d( - volatile T1* shmem_a1, - volatile T2* shmem_a2, - uint32_t dimx, - uint32_t id_base, - T1* a1, - T2* a2) -{ -#pragma unroll - for (auto i = 0; i < SEQ; i++) { - auto id = id_base + threadIdx.x + i * NTHREAD; - if (id < dimx) { - if (NO_OUTLIER) { // - a1[id] = shmem_a1[threadIdx.x + i * NTHREAD]; - } - else { - a1[id] = shmem_a1[threadIdx.x + i * NTHREAD]; - a2[id] = shmem_a2[threadIdx.x + i * NTHREAD]; - } - } - } -} - -template -__forceinline__ __device__ void psz::cuda::__device::v0::predict_quantize__no_outlier_1d( // - T private_buffer[SEQ], - volatile EQ* shmem_quant, - T prev) -{ - auto quantize_1d = [&](T& cur, T& prev, uint32_t idx) { - shmem_quant[idx + threadIdx.x * SEQ] = static_cast(cur - prev); - }; - - if (FIRST_POINT) { // i == 0 - quantize_1d(private_buffer[0], prev, 0); - } - else { -#pragma unroll - for (auto i = 1; i < SEQ; i++) quantize_1d(private_buffer[i], private_buffer[i - 1], i); - __syncthreads(); - } -} - -template -__forceinline__ __device__ void psz::cuda::__device::v0::predict_quantize_1d( - T private_buffer[SEQ], - volatile EQ* shmem_quant, - volatile T* shmem_outlier, - int radius, - T prev) -{ - auto quantize_1d = [&](T& cur, T& prev, uint32_t idx) { - T delta = cur - prev; - bool quantizable = fabs(delta) < radius; - T candidate = delta + radius; - - // otherwise, need to reset shared memory (to 0) - shmem_quant[idx + threadIdx.x * SEQ] = quantizable * static_cast(candidate); - shmem_outlier[idx + threadIdx.x * SEQ] = (not quantizable) * candidate; - }; - - if (FIRST_POINT) { // i == 0 - quantize_1d(private_buffer[0], prev, 0); - } - else { -#pragma unroll - for (auto i = 1; i < SEQ; i++) quantize_1d(private_buffer[i], private_buffer[i - 1], i); - __syncthreads(); - } -} - -template -__forceinline__ __device__ void psz::cuda::__device::v0::compaction::predict_quantize_1d( - T thp_buffer[SEQ], - volatile EQ* s_quant, - uint32_t dimx, // put x-related - int radius, - uint32_t g_idx_base, // TODO this file `id_base` to `g_idx_base` - Compaction outlier, - T prev) -{ - auto quantize_1d = [&](T& cur, T& prev, uint32_t inloop_idx) { - T delta = cur - prev; - bool quantizable = fabs(delta) < radius; - T candidate = delta + radius; - - auto inblock_idx = inloop_idx + threadIdx.x * SEQ; // TODO this file use `inblock_idx` - - // though quantizable, need to set non-quantizable position as 0 - s_quant[inblock_idx] = quantizable * static_cast(candidate); - - // very small chance running into this block - if (not quantizable) { - auto g_idx = inblock_idx + g_idx_base; - if (g_idx < dimx) { - auto cur_idx = atomicAdd(outlier.count, 1); - outlier.val[cur_idx] = candidate; - outlier.idx[cur_idx] = g_idx; - } - } - }; - - if (FIRST_POINT) { // i == 0 - quantize_1d(thp_buffer[0], prev, 0); - } - else { -#pragma unroll - for (auto i = 1; i < SEQ; i++) quantize_1d(thp_buffer[i], thp_buffer[i - 1], i); - __syncthreads(); // TODO move __syncthreads() outside this subroutine? 
- } -} - -template -__forceinline__ __device__ void psz::cuda::__device::v1_pn::compaction::predict_quantize_1d( - T thp_buffer[SEQ], - volatile EQ* s_quant, - uint32_t dimx, // put x-related - int radius, - uint32_t g_idx_base, // TODO this file `id_base` to `g_idx_base` - Compaction outlier, - T prev) -{ - constexpr auto BYTEWIDTH = sizeof(EQ); - - using UI = EQ; - using I = typename psz::typing::Int::T; - - auto quantize_1d = [&](T& cur, T& prev, uint32_t inloop_idx) { - T delta = cur - prev; - bool quantizable = fabs(delta) < radius; - UI UI_delta = PN::encode(static_cast(delta)); - - auto inblock_idx = inloop_idx + threadIdx.x * SEQ; // TODO this file use `inblock_idx` - - // though quantizable, need to set non-quantizable position as 0 - s_quant[inblock_idx] = quantizable * UI_delta; - - // very small chance running into this block - if (not quantizable) { - auto g_idx = inblock_idx + g_idx_base; - if (g_idx < dimx) { - auto cur_idx = atomicAdd(outlier.count, 1); - outlier.val[cur_idx] = delta; - outlier.idx[cur_idx] = g_idx; - } - } - }; - - if (FIRST_POINT) { // i == 0 - quantize_1d(thp_buffer[0], prev, 0); - } - else { -#pragma unroll - for (auto i = 1; i < SEQ; i++) quantize_1d(thp_buffer[i], thp_buffer[i - 1], i); - __syncthreads(); // TODO move __syncthreads() outside this subroutine? - } -} - -// decompression pred-quant -template -__forceinline__ __device__ void psz::cuda::__device::v0::block_scan_1d( - T private_buffer[SEQ], - T ebx2, - volatile T* exchange_in, - volatile T* exchange_out, - volatile T* shmem_buffer) -{ - namespace wave32 = psz::cuda::__device::wave32; - wave32::intrawarp_inclusivescan_1d(private_buffer); - wave32::intrablock_exclusivescan_1d(private_buffer, exchange_in, exchange_out); - - // put back to shmem -#pragma unroll - for (auto i = 0; i < SEQ; i++) shmem_buffer[threadIdx.x * SEQ + i] = private_buffer[i] * ebx2; - __syncthreads(); -} - -// v1_pn: quantization code uses PN::encode -template -__forceinline__ __device__ void psz::cuda::__device::v1_pn::predict_quantize__no_outlier_1d( // - T private_buffer[SEQ], - volatile EQ* shmem_quant, - T prev) -{ - constexpr auto BYTEWIDTH = sizeof(EQ); - - using UI = EQ; - using I = typename psz::typing::Int::T; - - auto quantize_1d = [&](T& cur, T& prev, uint32_t idx) { - UI UI_delta = PN::encode(static_cast(cur - prev)); - shmem_quant[idx + threadIdx.x * SEQ] = UI_delta; - }; - - if (FIRST_POINT) { // i == 0 - quantize_1d(private_buffer[0], prev, 0); - } - else { -#pragma unroll - for (auto i = 1; i < SEQ; i++) quantize_1d(private_buffer[i], private_buffer[i - 1], i); - __syncthreads(); - } -} - -// template -// __forceinline__ __device__ void psz::cuda::__device::v1_pn::predict_quantize_1d( -// T private_buffer[SEQ], -// volatile EQ* shmem_quant, -// volatile T* shmem_outlier, -// int radius, -// T prev) -// { -// constexpr auto BYTEWIDTH = sizeof(EQ); -// using UI = EQ; -// using I = typename psz::typing::Int::T; - -// auto quantize_1d = [&](T& cur, T& prev, uint32_t idx) { -// T delta = cur - prev; -// bool quantizable = fabs(delta) < radius; -// UI UI_delta = PN::encode(static_cast(delta)); - -// // otherwise, need to reset shared memory (to 0) -// shmem_quant[idx + threadIdx.x * SEQ] = quantizable * UI_delta; -// shmem_outlier[idx + threadIdx.x * SEQ] = (not quantizable) * delta; -// }; - -// if (FIRST_POINT) { // i == 0 -// quantize_1d(private_buffer[0], prev, 0); -// } -// else { -// #pragma unroll -// for (auto i = 1; i < SEQ; i++) quantize_1d(private_buffer[i], private_buffer[i - 1], i); -// 
__syncthreads(); -// } -// } - -//////////////////////////////////////////////////////////////////////////////// - -//////// 2D - -template -__forceinline__ __device__ void psz::cuda::__device::v0::load_prequant_2d( - // clang-format off - T* data, - uint32_t dimx, uint32_t gix, - uint32_t dimy, uint32_t giy_base, uint32_t stridey, - FP ebx2_r, - T center[YSEQ + 1] - // clang-format on -) -{ - auto g_id = [&](auto iy) { return (giy_base + iy) * stridey + gix; }; - - // use a warp as two half-warps - // block_dim = (16, 2, 1) makes a full warp internally - -#pragma unroll - for (auto iy = 0; iy < YSEQ; iy++) { - if (gix < dimx and giy_base + iy < dimy) center[iy + 1] = round(data[g_id(iy)] * ebx2_r); - } - auto tmp = __shfl_up_sync(0xffffffff, center[YSEQ], 16, 32); // same-warp, next-16 - if (threadIdx.y == 1) center[0] = tmp; -} - -template -__forceinline__ __device__ void psz::cuda::__device::v0::predict_2d(T center[YSEQ + 1]) -{ - /* - Lorenzo 2D (1-layer) illustration - NW N NE - notation W C E "->" to predict - -------- SW S SE - - normal data layout | considering register file - col(k-1) col(k) | thread(k-1) thread(k) - | - r(i-1) -west[i-1] +center[i-1] | -center(k-1)[i-1] +center(k)[i-1] - r(i ) +west[i] ->center[i] | +center(k-1)[i] ->center(k)[i] - - calculation - ----------- - delta = center[i] - (center[i-1] + west[i] - west[i-1]) - = (center[i] - center[i-1]) - (west[i] - west[i-1]) - - With center[i] -= center[i-1] and west[i] -= west[i-1], - delta = center[i] - west[i] - - For thread(k), - delta(k) = center(k)[i] - center(k-1)[i] - = center(k)[i] - SHFL_UP(center(k)[i], 1, HALF_WARP) - */ - -#pragma unroll - for (auto i = YSEQ; i > 0; i--) { - // with center[i-1] intact in this iteration - center[i] -= center[i - 1]; - // within a halfwarp (32/2) - auto west = __shfl_up_sync(0xffffffff, center[i], 1, 16); - if (threadIdx.x > 0) center[i] -= west; // delta - } - __syncthreads(); -} - -template -__forceinline__ __device__ void psz::cuda::__device::v0::quantize_write_2d( - // clang-format off - T delta[YSEQ + 1], - uint32_t dimx, uint32_t gix, - uint32_t dimy, uint32_t giy_base, uint32_t stridey, - int radius, - EQ* quant, - T* outlier - // clang-format on -) -{ - auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; - -#pragma unroll - for (auto i = 1; i < YSEQ + 1; i++) { - auto gid = get_gid(i - 1); - - if (gix < dimx and giy_base + (i - 1) < dimy) { - bool quantizable = fabs(delta[i]) < radius; - T candidate = delta[i] + radius; - - // outlier array is not in sparse form in this version - quant[gid] = quantizable * static_cast(candidate); - outlier[gid] = (not quantizable) * candidate; - } - } -} - -template -__forceinline__ __device__ void psz::cuda::__device::v0::delta_only::quantize_write_2d( - // clang-format off - T delta[YSEQ + 1], - uint32_t dimx, uint32_t gix, - uint32_t dimy, uint32_t giy_base, uint32_t stridey, - EQ* quant - // clang-format on -) -{ - auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; - -#pragma unroll - for (auto i = 1; i < YSEQ + 1; i++) { - auto gid = get_gid(i - 1); - if (gix < dimx and giy_base + (i - 1) < dimy) quant[gid] = static_cast(delta[i]); - } -} - -template -__forceinline__ __device__ void psz::cuda::__device::v1_pn::delta_only::quantize_write_2d( - // clang-format off - T delta[YSEQ + 1], - uint32_t dimx, uint32_t gix, - uint32_t dimy, uint32_t giy_base, uint32_t stridey, - EQ* quant - // clang-format on -) -{ - constexpr auto BYTEWIDTH = sizeof(EQ); - - using UI = EQ; - using I = typename 
psz::typing::Int::T; - - auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; - -#pragma unroll - for (auto i = 1; i < YSEQ + 1; i++) { - auto gid = get_gid(i - 1); - if (gix < dimx and giy_base + (i - 1) < dimy) quant[gid] = PN::encode(static_cast(delta[i])); - } -} - -template -__forceinline__ __device__ void psz::cuda::__device::v0::compaction::quantize_write_2d( - // clang-format off - T delta[YSEQ + 1], - uint32_t dimx, uint32_t gix, - uint32_t dimy, uint32_t giy_base, uint32_t stridey, - int radius, - EQ* quant, - Compaction outlier - // clang-format on -) -{ - auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; - -#pragma unroll - for (auto i = 1; i < YSEQ + 1; i++) { - auto gid = get_gid(i - 1); - - if (gix < dimx and giy_base + (i - 1) < dimy) { - bool quantizable = fabs(delta[i]) < radius; - T candidate = delta[i] + radius; - - // The non-quantizable is recorded as "0" (radius). - quant[gid] = quantizable * static_cast(candidate); - - if (not quantizable) { - auto cur_idx = atomicAdd(outlier.count, 1); - outlier.idx[cur_idx] = gid; - outlier.val[cur_idx] = candidate; - } - } - } -} - -template -__forceinline__ __device__ void psz::cuda::__device::v1_pn::compaction::quantize_write_2d( - // clang-format off - T delta[YSEQ + 1], - uint32_t dimx, uint32_t gix, - uint32_t dimy, uint32_t giy_base, uint32_t stridey, - int radius, - EQ* quant, - Compaction outlier - // clang-format on -) -{ - constexpr auto BYTEWIDTH = sizeof(EQ); - - using UI = EQ; - using I = typename psz::typing::Int::T; - - auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; - -#pragma unroll - for (auto i = 1; i < YSEQ + 1; i++) { - auto gid = get_gid(i - 1); - - if (gix < dimx and giy_base + (i - 1) < dimy) { - bool quantizable = fabs(delta[i]) < radius; - UI UI_delta = PN::encode(static_cast(delta[i])); - - // The non-quantizable is recorded as "0" (radius). - quant[gid] = quantizable * UI_delta; - - if (not quantizable) { - auto cur_idx = atomicAdd(outlier.count, 1); - outlier.idx[cur_idx] = gid; - outlier.val[cur_idx] = delta[i]; - } - } - } -} - -// load to thread-private array (fuse at the same time) -template -__forceinline__ __device__ void psz::cuda::__device::v0::load_fuse_2d( - // clang-format off - EQ* quant, - T* outlier, - uint32_t dimx, uint32_t gix, - uint32_t dimy, uint32_t giy_base, uint32_t stridey, - int radius, - T thread_private[YSEQ] - // clang-format on -) -{ - auto get_gid = [&](auto iy) { return (giy_base + iy) * stridey + gix; }; - -#pragma unroll - for (auto i = 0; i < YSEQ; i++) { - auto gid = get_gid(i); - // even if we hit the else branch, all threads in a warp hit the y-boundary simultaneously - if (gix < dimx and (giy_base + i) < dimy) - thread_private[i] = outlier[gid] + static_cast(quant[gid]) - radius; // fuse - else - thread_private[i] = 0; // TODO set as init state? 
- } -} - -// load to thread-private array (fuse at the same time) -template -__forceinline__ __device__ void psz::cuda::__device::v1_pn::load_fuse_2d( - // clang-format off - EQ* quant, - T* outlier, - uint32_t dimx, uint32_t gix, - uint32_t dimy, uint32_t giy_base, uint32_t stridey, - int radius, - T thread_private[YSEQ] - // clang-format on -) -{ - constexpr auto BYTEWIDTH = sizeof(EQ); - - using UI = EQ; - using I = typename psz::typing::Int::T; - - auto get_gid = [&](auto iy) { return (giy_base + iy) * stridey + gix; }; - -#pragma unroll - for (auto i = 0; i < YSEQ; i++) { - auto gid = get_gid(i); - // even if we hit the else branch, all threads in a warp hit the y-boundary simultaneously - if (gix < dimx and (giy_base + i) < dimy) - thread_private[i] = outlier[gid] + PN::decode(quant[gid]); // fuse - else - thread_private[i] = 0; // TODO set as init state? - } -} - -// load to thread-private array (fuse at the same time) -template -__forceinline__ __device__ void psz::cuda::__device::v0::delta_only::load_2d( - // clang-format off - EQ* quant, - uint32_t dimx, uint32_t gix, - uint32_t dimy, uint32_t giy_base, uint32_t stridey, - T thread_private[YSEQ] - // clang-format on -) -{ - auto get_gid = [&](auto iy) { return (giy_base + iy) * stridey + gix; }; - -#pragma unroll - for (auto i = 0; i < YSEQ; i++) { - auto gid = get_gid(i); - // even if we hit the else branch, all threads in a warp hit the y-boundary simultaneously - if (gix < dimx and (giy_base + i) < dimy) - thread_private[i] = static_cast(quant[gid]); - else - thread_private[i] = 0; // TODO set as init state? - } -} - -// load to thread-private array (fuse at the same time) -template -__forceinline__ __device__ void psz::cuda::__device::v1_pn::delta_only::load_2d( - // clang-format off - EQ* quant, - uint32_t dimx, uint32_t gix, - uint32_t dimy, uint32_t giy_base, uint32_t stridey, - T thread_private[YSEQ] - // clang-format on -) -{ - constexpr auto BYTEWIDTH = sizeof(EQ); - - using UI = EQ; - using I = typename psz::typing::Int::T; - - auto get_gid = [&](auto iy) { return (giy_base + iy) * stridey + gix; }; - -#pragma unroll - for (auto i = 0; i < YSEQ; i++) { - auto gid = get_gid(i); - // even if we hit the else branch, all threads in a warp hit the y-boundary simultaneously - if (gix < dimx and (giy_base + i) < dimy) - thread_private[i] = PN::decode(quant[gid]); - else - thread_private[i] = 0; // TODO set as init state? - } -} - -// partial-sum along y-axis, sequantially -// then, in-warp partial-sum along x-axis -template -__forceinline__ __device__ void -psz::cuda::__device::v0::block_scan_2d(T thread_private[YSEQ], volatile T* intermediate, FP ebx2) -{ - // ------> gix (x) - // - // | t(0,0) t(0,1) t(0,2) t(0,3) ... t(0,f) - // | - // | thp(0,0)[0] thp(0,0)[0] thp(0,0)[0] thp(0,0)[0] - // giy thp(0,0)[1] thp(0,0)[1] thp(0,0)[1] thp(0,0)[1] - // (y) | | | | - // thp(0,0)[7] thp(0,0)[7] thp(0,0)[7] thp(0,0)[7] - // - // | t(1,0) t(1,1) t(1,2) t(1,3) ... 
t(1,f) - // | - // | thp(1,0)[0] thp(1,0)[0] thp(1,0)[0] thp(1,0)[0] - // giy thp(1,0)[1] thp(1,0)[1] thp(1,0)[1] thp(1,0)[1] - // (y) | | | | - // thp(1,0)[7] thp(1,0)[7] thp(1,0)[7] thp(1,0)[7] - - constexpr auto BLOCK = 16; - - for (auto i = 1; i < YSEQ; i++) thread_private[i] += thread_private[i - 1]; - // two-pass: store for cross-thread-private update - // TODO shuffle up by 16 in the same warp - if (threadIdx.y == 0) intermediate[threadIdx.x] = thread_private[YSEQ - 1]; - __syncthreads(); - // broadcast the partial-sum result from a previous segment - if (threadIdx.y == 1) { - auto tmp = intermediate[threadIdx.x]; -#pragma unroll - for (auto i = 0; i < YSEQ; i++) thread_private[i] += tmp; // regression as pointer - } - // implicit sync as there is half-warp divergence - -#pragma unroll - for (auto i = 0; i < YSEQ; i++) { - for (auto d = 1; d < BLOCK; d *= 2) { - T n = __shfl_up_sync(0xffffffff, thread_private[i], d, 16); // half-warp shuffle - if (threadIdx.x >= d) thread_private[i] += n; - } - thread_private[i] *= ebx2; // scale accordingly - } -} - -// write to DRAM -template -__forceinline__ __device__ void psz::cuda::__device::v0::decomp_write_2d( - // clang-format off - T thread_private[YSEQ], - uint32_t dimx, uint32_t gix, - uint32_t dimy, uint32_t giy_base, uint32_t stridey, - T* xdata - // clang-format on -) -{ - auto get_gid = [&](auto iy) { return (giy_base + iy) * stridey + gix; }; - -#pragma unroll - for (auto i = 0; i < YSEQ; i++) { - auto gid = get_gid(i); - if (gix < dimx and (giy_base + i) < dimy) xdata[gid] = thread_private[i]; - } -} - -//////////////////////////////////////////////////////////////////////////////// - -//////// 3D +/** + * @file subroutine.inl + * @author Jiannan Tian + * @brief subroutines of kernels + * @version 0.4 + * @date 2022-12-22 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include +#include +#include "cusz/pn.hh" +#include "pipeline/compaction_g.inl" +#include "subsub.inl" + +namespace psz { +namespace cuda { +namespace __device { + +//////// 1D + +namespace v0 { + +// compression load +template +__forceinline__ __device__ void load_prequant_1d( + T* data, + uint32_t dimx, + uint32_t id_base, + volatile T* shmem, + T private_buffer[SEQ], + T& prev, + FP ebx2_r); + +// decompression load +template +__forceinline__ __device__ void load_fuse_1d( + EQ* quant, + T* outlier, + uint32_t dimx, + uint32_t id_base, + int radius, + volatile T* shmem, + T private_buffer[SEQ]); + +namespace delta_only { + +template +__forceinline__ __device__ void +load_1d(EQ* quant, uint32_t dimx, uint32_t id_base, volatile T* shmem, T private_buffer[SEQ]); + +} + +// compression and decompression store +template +__forceinline__ __device__ void write_1d( // + volatile T1* shmem_a1, + volatile T2* shmem_a2, + uint32_t dimx, + uint32_t id_base, + T1* a1, + T2* a2); + +// compression pred-quant, method 1 +template +__forceinline__ __device__ void predict_quantize__no_outlier_1d( // + T private_buffer[SEQ], + volatile EQ* shmem_quant, + T prev = 0); + +// compression pred-quant, method 2 +template +__forceinline__ __device__ void predict_quantize_1d( // + T private_buffer[SEQ], + volatile EQ* shmem_quant, + volatile T* shmem_outlier, + int radius, + T prev = 0); + +namespace compaction { + +template < + typename T, + typename EQ, + int SEQ, + bool FIRST_POINT, + typename Compaction = CompactionDRAM> +__forceinline__ __device__ void predict_quantize_1d( // + T thp_buffer[SEQ], + volatile EQ* s_quant, + uint32_t dimx, + int radius, + 
uint32_t g_id_base, + Compaction g_outlier, + T prev = 0); + +} + +// decompression pred-quant +template +__forceinline__ __device__ void block_scan_1d( + T private_buffer[SEQ], + T ebx2, + volatile T* exchange_in, + volatile T* exchange_out, + volatile T* shmem_buffer); + +} // namespace v0 + +namespace v1_pn { + +template +__forceinline__ __device__ void +load_fuse_1d(EQ* quant, T* outlier, uint32_t dimx, uint32_t id_base, volatile T* shmem, T private_buffer[SEQ]); + +template +__forceinline__ __device__ void +predict_quantize__no_outlier_1d(T private_buffer[SEQ], volatile EQ* shmem_quant, T prev); + +template +__forceinline__ __device__ void +predict_quantize_1d(T private_buffer[SEQ], volatile EQ* shmem_quant, volatile T* shmem_outlier, int radius, T prev); + +namespace compaction { + +template +__forceinline__ __device__ void predict_quantize_1d( + T thp_buffer[SEQ], + volatile EQ* s_quant, + uint32_t dimx, + int radius, + uint32_t g_idx_base, + Compaction outlier, + T prev); + +} + +namespace delta_only { + +template +__forceinline__ __device__ void +load_1d(EQ* quant, uint32_t dimx, uint32_t id_base, volatile T* shmem, T private_buffer[SEQ]); + +} + +} // namespace v1_pn + +//////// 2D + +namespace v0 { + +template +__forceinline__ __device__ void load_prequant_2d( + T* data, + uint32_t dimx, + uint32_t gix, + uint32_t dimy, + uint32_t giy_base, + uint32_t stridey, + FP ebx2_r, + T center[YSEQ + 1]); + +template +__forceinline__ __device__ void predict_2d(T center[YSEQ + 1]); + +template +__forceinline__ __device__ void quantize_write_2d( + T delta[YSEQ + 1], + uint32_t dimx, + uint32_t gix, + uint32_t dimy, + uint32_t giy_base, + uint32_t stridey, + int radius, + EQ* quant, + T* outlier); + +namespace delta_only { + +template +__forceinline__ __device__ void quantize_write_2d( + T delta[YSEQ + 1], + uint32_t dimx, + uint32_t gix, + uint32_t dimy, + uint32_t giy_base, + uint32_t stridey, + EQ* quant); + +} + +namespace compaction { + +template +__forceinline__ __device__ void quantize_write_2d( + T delta[YSEQ + 1], + uint32_t dimx, + uint32_t gix, + uint32_t dimy, + uint32_t giy_base, + uint32_t stridey, + int radius, + EQ* quant, + Compaction outlier); + +}; + +// decompression load +template +__forceinline__ __device__ void load_fuse_2d( + EQ* quant, + T* outlier, + uint32_t dimx, + uint32_t gix, + uint32_t dimy, + uint32_t giy_base, + uint32_t stridey, + int radius, + T private_buffer[YSEQ]); + +namespace delta_only { +// decompression load +template +__forceinline__ __device__ void load_2d( + EQ* quant, + uint32_t dimx, + uint32_t gix, + uint32_t dimy, + uint32_t giy_base, + uint32_t stridey, + T private_buffer[YSEQ]); + +} // namespace delta_only + +template +__forceinline__ __device__ void block_scan_2d( // + T thread_private[YSEQ], + volatile T* intermediate, + FP ebx2); + +template +__forceinline__ __device__ void decomp_write_2d( + T thread_private[YSEQ], + uint32_t dimx, + uint32_t gix, + uint32_t dimy, + uint32_t giy_base, + uint32_t stridey, + T* xdata); + +} // namespace v0 + +namespace v1_pn { + +namespace compaction { +template +__forceinline__ __device__ void quantize_write_2d( + // clang-format off + T delta[YSEQ + 1], + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + int radius, + EQ* quant, + Compaction outlier + // clang-format on +); + +} + +template +__forceinline__ __device__ void load_fuse_2d( + // clang-format off + EQ* quant, + T* outlier, + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t 
stridey, + int radius, + T thread_private[YSEQ] + // clang-format on +); + +namespace delta_only { + +template +__forceinline__ __device__ void load_2d( + // clang-format off + EQ* quant, + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + T thread_private[YSEQ] + // clang-format on +); + +template +__forceinline__ __device__ void quantize_write_2d( + T delta[YSEQ + 1], + uint32_t dimx, + uint32_t gix, + uint32_t dimy, + uint32_t giy_base, + uint32_t stridey, + EQ* quant); + +} // namespace delta_only + +} // namespace v1_pn + +//////// 3D + +namespace v0 { + +// TODO move subroutines for 3D here + +} + +} // namespace __device +} // namespace cuda +} // namespace psz + +//////////////////////////////////////////////////////////////////////////////// + +//////// 1D + +template +__forceinline__ __device__ void psz::cuda::__device::v0::load_prequant_1d( + T* data, + uint32_t dimx, + uint32_t id_base, + volatile T* shmem, + T private_buffer[SEQ], + T& prev, // TODO use pointer? + FP ebx2_r) +{ +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto id = id_base + threadIdx.x + i * NTHREAD; + if (id < dimx) shmem[threadIdx.x + i * NTHREAD] = round(data[id] * ebx2_r); + } + __syncthreads(); + +#pragma unroll + for (auto i = 0; i < SEQ; i++) private_buffer[i] = shmem[threadIdx.x * SEQ + i]; + if (threadIdx.x > 0) prev = shmem[threadIdx.x * SEQ - 1]; + __syncthreads(); +} + +template +__forceinline__ __device__ void psz::cuda::__device::v0::load_fuse_1d( + EQ* quant, + T* outlier, + uint32_t dimx, + uint32_t id_base, + int radius, + volatile T* shmem, + T private_buffer[SEQ]) +{ +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto local_id = threadIdx.x + i * NTHREAD; + auto id = id_base + local_id; + if (id < dimx) shmem[local_id] = outlier[id] + static_cast(quant[id]) - radius; + } + __syncthreads(); + +#pragma unroll + for (auto i = 0; i < SEQ; i++) private_buffer[i] = shmem[threadIdx.x * SEQ + i]; + __syncthreads(); +} + +template +__forceinline__ __device__ void psz::cuda::__device::v1_pn::load_fuse_1d( + EQ* quant, + T* outlier, + uint32_t dimx, + uint32_t id_base, + volatile T* shmem, + T private_buffer[SEQ]) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename psz::typing::Int::T; + +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto local_id = threadIdx.x + i * NTHREAD; + auto id = id_base + local_id; + if (id < dimx) shmem[local_id] = outlier[id] + PN::decode(quant[id]); + } + __syncthreads(); + +#pragma unroll + for (auto i = 0; i < SEQ; i++) private_buffer[i] = shmem[threadIdx.x * SEQ + i]; + __syncthreads(); +} + +template +__forceinline__ __device__ void psz::cuda::__device::v0::delta_only::load_1d( + EQ* quant, + uint32_t dimx, + uint32_t id_base, + volatile T* shmem, + T private_buffer[SEQ]) +{ +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto local_id = threadIdx.x + i * NTHREAD; + auto id = id_base + local_id; + if (id < dimx) shmem[local_id] = static_cast(quant[id]); + } + __syncthreads(); + +#pragma unroll + for (auto i = 0; i < SEQ; i++) private_buffer[i] = shmem[threadIdx.x * SEQ + i]; + __syncthreads(); +} + +template +__forceinline__ __device__ void psz::cuda::__device::v1_pn::delta_only::load_1d( + EQ* quant, + uint32_t dimx, + uint32_t id_base, + volatile T* shmem, + T private_buffer[SEQ]) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename psz::typing::Int::T; + +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto local_id = threadIdx.x + i * 
NTHREAD; + auto id = id_base + local_id; + if (id < dimx) shmem[local_id] = PN::decode(quant[id]); + } + __syncthreads(); + +#pragma unroll + for (auto i = 0; i < SEQ; i++) private_buffer[i] = shmem[threadIdx.x * SEQ + i]; + __syncthreads(); +} + +template // TODO remove NO_OUTLIER, use nullable +__forceinline__ __device__ void psz::cuda::__device::v0::write_1d( + volatile T1* shmem_a1, + volatile T2* shmem_a2, + uint32_t dimx, + uint32_t id_base, + T1* a1, + T2* a2) +{ +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto id = id_base + threadIdx.x + i * NTHREAD; + if (id < dimx) { + if (NO_OUTLIER) { // + a1[id] = shmem_a1[threadIdx.x + i * NTHREAD]; + } + else { + a1[id] = shmem_a1[threadIdx.x + i * NTHREAD]; + a2[id] = shmem_a2[threadIdx.x + i * NTHREAD]; + } + } + } +} + +template +__forceinline__ __device__ void psz::cuda::__device::v0::predict_quantize__no_outlier_1d( // + T private_buffer[SEQ], + volatile EQ* shmem_quant, + T prev) +{ + auto quantize_1d = [&](T& cur, T& prev, uint32_t idx) { + shmem_quant[idx + threadIdx.x * SEQ] = static_cast(cur - prev); + }; + + if (FIRST_POINT) { // i == 0 + quantize_1d(private_buffer[0], prev, 0); + } + else { +#pragma unroll + for (auto i = 1; i < SEQ; i++) quantize_1d(private_buffer[i], private_buffer[i - 1], i); + __syncthreads(); + } +} + +template +__forceinline__ __device__ void psz::cuda::__device::v0::predict_quantize_1d( + T private_buffer[SEQ], + volatile EQ* shmem_quant, + volatile T* shmem_outlier, + int radius, + T prev) +{ + auto quantize_1d = [&](T& cur, T& prev, uint32_t idx) { + T delta = cur - prev; + bool quantizable = fabs(delta) < radius; + T candidate = delta + radius; + + // otherwise, need to reset shared memory (to 0) + shmem_quant[idx + threadIdx.x * SEQ] = quantizable * static_cast(candidate); + shmem_outlier[idx + threadIdx.x * SEQ] = (not quantizable) * candidate; + }; + + if (FIRST_POINT) { // i == 0 + quantize_1d(private_buffer[0], prev, 0); + } + else { +#pragma unroll + for (auto i = 1; i < SEQ; i++) quantize_1d(private_buffer[i], private_buffer[i - 1], i); + __syncthreads(); + } +} + +template +__forceinline__ __device__ void psz::cuda::__device::v0::compaction::predict_quantize_1d( + T thp_buffer[SEQ], + volatile EQ* s_quant, + uint32_t dimx, // put x-related + int radius, + uint32_t g_idx_base, // TODO this file `id_base` to `g_idx_base` + Compaction outlier, + T prev) +{ + auto quantize_1d = [&](T& cur, T& prev, uint32_t inloop_idx) { + T delta = cur - prev; + bool quantizable = fabs(delta) < radius; + T candidate = delta + radius; + + auto inblock_idx = inloop_idx + threadIdx.x * SEQ; // TODO this file use `inblock_idx` + + // though quantizable, need to set non-quantizable position as 0 + s_quant[inblock_idx] = quantizable * static_cast(candidate); + + // very small chance running into this block + if (not quantizable) { + auto g_idx = inblock_idx + g_idx_base; + if (g_idx < dimx) { + auto cur_idx = atomicAdd(outlier.count, 1); + outlier.val[cur_idx] = candidate; + outlier.idx[cur_idx] = g_idx; + } + } + }; + + if (FIRST_POINT) { // i == 0 + quantize_1d(thp_buffer[0], prev, 0); + } + else { +#pragma unroll + for (auto i = 1; i < SEQ; i++) quantize_1d(thp_buffer[i], thp_buffer[i - 1], i); + __syncthreads(); // TODO move __syncthreads() outside this subroutine? 
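+        // note: the barrier above makes every thread's codes in s_quant visible block-wide before the
+        // later global write; non-quantizable values were already appended to the compaction list via atomicAdd.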
+ } +} + +template +__forceinline__ __device__ void psz::cuda::__device::v1_pn::compaction::predict_quantize_1d( + T thp_buffer[SEQ], + volatile EQ* s_quant, + uint32_t dimx, // put x-related + int radius, + uint32_t g_idx_base, // TODO this file `id_base` to `g_idx_base` + Compaction outlier, + T prev) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename psz::typing::Int::T; + + auto quantize_1d = [&](T& cur, T& prev, uint32_t inloop_idx) { + T delta = cur - prev; + bool quantizable = fabs(delta) < radius; + UI UI_delta = PN::encode(static_cast(delta)); + + auto inblock_idx = inloop_idx + threadIdx.x * SEQ; // TODO this file use `inblock_idx` + + // though quantizable, need to set non-quantizable position as 0 + s_quant[inblock_idx] = quantizable * UI_delta; + + // very small chance running into this block + if (not quantizable) { + auto g_idx = inblock_idx + g_idx_base; + if (g_idx < dimx) { + auto cur_idx = atomicAdd(outlier.count, 1); + outlier.val[cur_idx] = delta; + outlier.idx[cur_idx] = g_idx; + } + } + }; + + if (FIRST_POINT) { // i == 0 + quantize_1d(thp_buffer[0], prev, 0); + } + else { +#pragma unroll + for (auto i = 1; i < SEQ; i++) quantize_1d(thp_buffer[i], thp_buffer[i - 1], i); + __syncthreads(); // TODO move __syncthreads() outside this subroutine? + } +} + +// decompression pred-quant +template +__forceinline__ __device__ void psz::cuda::__device::v0::block_scan_1d( + T private_buffer[SEQ], + T ebx2, + volatile T* exchange_in, + volatile T* exchange_out, + volatile T* shmem_buffer) +{ + namespace wave32 = psz::cuda::__device::wave32; + wave32::intrawarp_inclusivescan_1d(private_buffer); + wave32::intrablock_exclusivescan_1d(private_buffer, exchange_in, exchange_out); + + // put back to shmem +#pragma unroll + for (auto i = 0; i < SEQ; i++) shmem_buffer[threadIdx.x * SEQ + i] = private_buffer[i] * ebx2; + __syncthreads(); +} + +// v1_pn: quantization code uses PN::encode +template +__forceinline__ __device__ void psz::cuda::__device::v1_pn::predict_quantize__no_outlier_1d( // + T private_buffer[SEQ], + volatile EQ* shmem_quant, + T prev) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename psz::typing::Int::T; + + auto quantize_1d = [&](T& cur, T& prev, uint32_t idx) { + UI UI_delta = PN::encode(static_cast(cur - prev)); + shmem_quant[idx + threadIdx.x * SEQ] = UI_delta; + }; + + if (FIRST_POINT) { // i == 0 + quantize_1d(private_buffer[0], prev, 0); + } + else { +#pragma unroll + for (auto i = 1; i < SEQ; i++) quantize_1d(private_buffer[i], private_buffer[i - 1], i); + __syncthreads(); + } +} + +// template +// __forceinline__ __device__ void psz::cuda::__device::v1_pn::predict_quantize_1d( +// T private_buffer[SEQ], +// volatile EQ* shmem_quant, +// volatile T* shmem_outlier, +// int radius, +// T prev) +// { +// constexpr auto BYTEWIDTH = sizeof(EQ); +// using UI = EQ; +// using I = typename psz::typing::Int::T; + +// auto quantize_1d = [&](T& cur, T& prev, uint32_t idx) { +// T delta = cur - prev; +// bool quantizable = fabs(delta) < radius; +// UI UI_delta = PN::encode(static_cast(delta)); + +// // otherwise, need to reset shared memory (to 0) +// shmem_quant[idx + threadIdx.x * SEQ] = quantizable * UI_delta; +// shmem_outlier[idx + threadIdx.x * SEQ] = (not quantizable) * delta; +// }; + +// if (FIRST_POINT) { // i == 0 +// quantize_1d(private_buffer[0], prev, 0); +// } +// else { +// #pragma unroll +// for (auto i = 1; i < SEQ; i++) quantize_1d(private_buffer[i], private_buffer[i - 1], i); +// 
__syncthreads(); +// } +// } + +//////////////////////////////////////////////////////////////////////////////// + +//////// 2D + +template +__forceinline__ __device__ void psz::cuda::__device::v0::load_prequant_2d( + // clang-format off + T* data, + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + FP ebx2_r, + T center[YSEQ + 1] + // clang-format on +) +{ + auto g_id = [&](auto iy) { return (giy_base + iy) * stridey + gix; }; + + // use a warp as two half-warps + // block_dim = (16, 2, 1) makes a full warp internally + +#pragma unroll + for (auto iy = 0; iy < YSEQ; iy++) { + if (gix < dimx and giy_base + iy < dimy) center[iy + 1] = round(data[g_id(iy)] * ebx2_r); + } + auto tmp = __shfl_up_sync(0xffffffff, center[YSEQ], 16, 32); // same-warp, next-16 + if (threadIdx.y == 1) center[0] = tmp; +} + +template +__forceinline__ __device__ void psz::cuda::__device::v0::predict_2d(T center[YSEQ + 1]) +{ + /* + Lorenzo 2D (1-layer) illustration + NW N NE + notation W C E "->" to predict + -------- SW S SE + + normal data layout | considering register file + col(k-1) col(k) | thread(k-1) thread(k) + | + r(i-1) -west[i-1] +center[i-1] | -center(k-1)[i-1] +center(k)[i-1] + r(i ) +west[i] ->center[i] | +center(k-1)[i] ->center(k)[i] + + calculation + ----------- + delta = center[i] - (center[i-1] + west[i] - west[i-1]) + = (center[i] - center[i-1]) - (west[i] - west[i-1]) + + With center[i] -= center[i-1] and west[i] -= west[i-1], + delta = center[i] - west[i] + + For thread(k), + delta(k) = center(k)[i] - center(k-1)[i] + = center(k)[i] - SHFL_UP(center(k)[i], 1, HALF_WARP) + */ + +#pragma unroll + for (auto i = YSEQ; i > 0; i--) { + // with center[i-1] intact in this iteration + center[i] -= center[i - 1]; + // within a halfwarp (32/2) + auto west = __shfl_up_sync(0xffffffff, center[i], 1, 16); + if (threadIdx.x > 0) center[i] -= west; // delta + } + __syncthreads(); +} + +template +__forceinline__ __device__ void psz::cuda::__device::v0::quantize_write_2d( + // clang-format off + T delta[YSEQ + 1], + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + int radius, + EQ* quant, + T* outlier + // clang-format on +) +{ + auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; + +#pragma unroll + for (auto i = 1; i < YSEQ + 1; i++) { + auto gid = get_gid(i - 1); + + if (gix < dimx and giy_base + (i - 1) < dimy) { + bool quantizable = fabs(delta[i]) < radius; + T candidate = delta[i] + radius; + + // outlier array is not in sparse form in this version + quant[gid] = quantizable * static_cast(candidate); + outlier[gid] = (not quantizable) * candidate; + } + } +} + +template +__forceinline__ __device__ void psz::cuda::__device::v0::delta_only::quantize_write_2d( + // clang-format off + T delta[YSEQ + 1], + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + EQ* quant + // clang-format on +) +{ + auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; + +#pragma unroll + for (auto i = 1; i < YSEQ + 1; i++) { + auto gid = get_gid(i - 1); + if (gix < dimx and giy_base + (i - 1) < dimy) quant[gid] = static_cast(delta[i]); + } +} + +template +__forceinline__ __device__ void psz::cuda::__device::v1_pn::delta_only::quantize_write_2d( + // clang-format off + T delta[YSEQ + 1], + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + EQ* quant + // clang-format on +) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename 
psz::typing::Int::T; + + auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; + +#pragma unroll + for (auto i = 1; i < YSEQ + 1; i++) { + auto gid = get_gid(i - 1); + if (gix < dimx and giy_base + (i - 1) < dimy) quant[gid] = PN::encode(static_cast(delta[i])); + } +} + +template +__forceinline__ __device__ void psz::cuda::__device::v0::compaction::quantize_write_2d( + // clang-format off + T delta[YSEQ + 1], + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + int radius, + EQ* quant, + Compaction outlier + // clang-format on +) +{ + auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; + +#pragma unroll + for (auto i = 1; i < YSEQ + 1; i++) { + auto gid = get_gid(i - 1); + + if (gix < dimx and giy_base + (i - 1) < dimy) { + bool quantizable = fabs(delta[i]) < radius; + T candidate = delta[i] + radius; + + // The non-quantizable is recorded as "0" (radius). + quant[gid] = quantizable * static_cast(candidate); + + if (not quantizable) { + auto cur_idx = atomicAdd(outlier.count, 1); + outlier.idx[cur_idx] = gid; + outlier.val[cur_idx] = candidate; + } + } + } +} + +template +__forceinline__ __device__ void psz::cuda::__device::v1_pn::compaction::quantize_write_2d( + // clang-format off + T delta[YSEQ + 1], + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + int radius, + EQ* quant, + Compaction outlier + // clang-format on +) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename psz::typing::Int::T; + + auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; + +#pragma unroll + for (auto i = 1; i < YSEQ + 1; i++) { + auto gid = get_gid(i - 1); + + if (gix < dimx and giy_base + (i - 1) < dimy) { + bool quantizable = fabs(delta[i]) < radius; + UI UI_delta = PN::encode(static_cast(delta[i])); + + // The non-quantizable is recorded as "0" (radius). + quant[gid] = quantizable * UI_delta; + + if (not quantizable) { + auto cur_idx = atomicAdd(outlier.count, 1); + outlier.idx[cur_idx] = gid; + outlier.val[cur_idx] = delta[i]; + } + } + } +} + +// load to thread-private array (fuse at the same time) +template +__forceinline__ __device__ void psz::cuda::__device::v0::load_fuse_2d( + // clang-format off + EQ* quant, + T* outlier, + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + int radius, + T thread_private[YSEQ] + // clang-format on +) +{ + auto get_gid = [&](auto iy) { return (giy_base + iy) * stridey + gix; }; + +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + auto gid = get_gid(i); + // even if we hit the else branch, all threads in a warp hit the y-boundary simultaneously + if (gix < dimx and (giy_base + i) < dimy) + thread_private[i] = outlier[gid] + static_cast(quant[gid]) - radius; // fuse + else + thread_private[i] = 0; // TODO set as init state? 
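+            // Padding positions (outside dimx/dimy) are zero-filled so that the
+            // subsequent y-direction partial sum in block_scan_2d is unaffected by
+            // out-of-range rows; the TODO above considers an explicit init state instead.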
+ } +} + +// load to thread-private array (fuse at the same time) +template +__forceinline__ __device__ void psz::cuda::__device::v1_pn::load_fuse_2d( + // clang-format off + EQ* quant, + T* outlier, + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + int radius, + T thread_private[YSEQ] + // clang-format on +) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename psz::typing::Int::T; + + auto get_gid = [&](auto iy) { return (giy_base + iy) * stridey + gix; }; + +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + auto gid = get_gid(i); + // even if we hit the else branch, all threads in a warp hit the y-boundary simultaneously + if (gix < dimx and (giy_base + i) < dimy) + thread_private[i] = outlier[gid] + PN::decode(quant[gid]); // fuse + else + thread_private[i] = 0; // TODO set as init state? + } +} + +// load to thread-private array (fuse at the same time) +template +__forceinline__ __device__ void psz::cuda::__device::v0::delta_only::load_2d( + // clang-format off + EQ* quant, + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + T thread_private[YSEQ] + // clang-format on +) +{ + auto get_gid = [&](auto iy) { return (giy_base + iy) * stridey + gix; }; + +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + auto gid = get_gid(i); + // even if we hit the else branch, all threads in a warp hit the y-boundary simultaneously + if (gix < dimx and (giy_base + i) < dimy) + thread_private[i] = static_cast(quant[gid]); + else + thread_private[i] = 0; // TODO set as init state? + } +} + +// load to thread-private array (fuse at the same time) +template +__forceinline__ __device__ void psz::cuda::__device::v1_pn::delta_only::load_2d( + // clang-format off + EQ* quant, + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + T thread_private[YSEQ] + // clang-format on +) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename psz::typing::Int::T; + + auto get_gid = [&](auto iy) { return (giy_base + iy) * stridey + gix; }; + +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + auto gid = get_gid(i); + // even if we hit the else branch, all threads in a warp hit the y-boundary simultaneously + if (gix < dimx and (giy_base + i) < dimy) + thread_private[i] = PN::decode(quant[gid]); + else + thread_private[i] = 0; // TODO set as init state? + } +} + +// partial-sum along y-axis, sequantially +// then, in-warp partial-sum along x-axis +template +__forceinline__ __device__ void +psz::cuda::__device::v0::block_scan_2d(T thread_private[YSEQ], volatile T* intermediate, FP ebx2) +{ + // ------> gix (x) + // + // | t(0,0) t(0,1) t(0,2) t(0,3) ... t(0,f) + // | + // | thp(0,0)[0] thp(0,0)[0] thp(0,0)[0] thp(0,0)[0] + // giy thp(0,0)[1] thp(0,0)[1] thp(0,0)[1] thp(0,0)[1] + // (y) | | | | + // thp(0,0)[7] thp(0,0)[7] thp(0,0)[7] thp(0,0)[7] + // + // | t(1,0) t(1,1) t(1,2) t(1,3) ... 
t(1,f) + // | + // | thp(1,0)[0] thp(1,0)[0] thp(1,0)[0] thp(1,0)[0] + // giy thp(1,0)[1] thp(1,0)[1] thp(1,0)[1] thp(1,0)[1] + // (y) | | | | + // thp(1,0)[7] thp(1,0)[7] thp(1,0)[7] thp(1,0)[7] + + constexpr auto BLOCK = 16; + + for (auto i = 1; i < YSEQ; i++) thread_private[i] += thread_private[i - 1]; + // two-pass: store for cross-thread-private update + // TODO shuffle up by 16 in the same warp + if (threadIdx.y == 0) intermediate[threadIdx.x] = thread_private[YSEQ - 1]; + __syncthreads(); + // broadcast the partial-sum result from a previous segment + if (threadIdx.y == 1) { + auto tmp = intermediate[threadIdx.x]; +#pragma unroll + for (auto i = 0; i < YSEQ; i++) thread_private[i] += tmp; // regression as pointer + } + // implicit sync as there is half-warp divergence + +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + for (auto d = 1; d < BLOCK; d *= 2) { + T n = __shfl_up_sync(0xffffffff, thread_private[i], d, 16); // half-warp shuffle + if (threadIdx.x >= d) thread_private[i] += n; + } + thread_private[i] *= ebx2; // scale accordingly + } +} + +// write to DRAM +template +__forceinline__ __device__ void psz::cuda::__device::v0::decomp_write_2d( + // clang-format off + T thread_private[YSEQ], + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + T* xdata + // clang-format on +) +{ + auto get_gid = [&](auto iy) { return (giy_base + iy) * stridey + gix; }; + +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + auto gid = get_gid(i); + if (gix < dimx and (giy_base + i) < dimy) xdata[gid] = thread_private[i]; + } +} + +//////////////////////////////////////////////////////////////////////////////// + +//////// 3D diff --git a/qtensor/compression/cusz/src/kernel/detail/subsub.inl b/qtensor/compression/cusz/src/kernel/detail/subsub.inl index 4d34fdc6..e8da624f 100644 --- a/qtensor/compression/cusz/src/kernel/detail/subsub.inl +++ b/qtensor/compression/cusz/src/kernel/detail/subsub.inl @@ -1,92 +1,92 @@ -/** - * @file subsub.inl - * @author Jiannan Tian - * @brief - * @version 0.4 - * @date 2022-12-26 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -namespace psz { -namespace cuda { -namespace __device { - -namespace wave32 { -template -__forceinline__ __device__ void intrawarp_inclusivescan_1d( // - T private_buffer[SEQ]); - -template -__forceinline__ __device__ void intrablock_exclusivescan_1d( // - T private_buffer[SEQ], - volatile T* exchange_in, - volatile T* exchange_out); -} // namespace wave32 - -} // namespace __device -} // namespace cuda -} // namespace psz - -template -__forceinline__ __device__ void psz::cuda::__device::wave32::intrawarp_inclusivescan_1d(T private_buffer[SEQ]) -{ - for (auto i = 1; i < SEQ; i++) private_buffer[i] += private_buffer[i - 1]; - T addend = private_buffer[SEQ - 1]; - - // in-warp shuffle - for (auto d = 1; d < 32; d *= 2) { - T n = __shfl_up_sync(0xffffffff, addend, d, 32); - if (threadIdx.x % 32 >= d) addend += n; - } - // exclusive scan - T prev_addend = __shfl_up_sync(0xffffffff, addend, 1, 32); - - // propagate - if (threadIdx.x % 32 > 0) - for (auto i = 0; i < SEQ; i++) private_buffer[i] += prev_addend; -} - -template -__forceinline__ __device__ void psz::cuda::__device::wave32::intrablock_exclusivescan_1d( - T private_buffer[SEQ], - volatile T* exchange_in, - volatile T* exchange_out) -{ - constexpr auto NWARP = NTHREAD / 32; - static_assert(NWARP <= 32, "too big"); - - auto warp_id = threadIdx.x / 32; - auto lane_id = threadIdx.x % 32; - - if (lane_id == 31) 
exchange_in[warp_id] = private_buffer[SEQ - 1]; - __syncthreads(); - - if (NWARP <= 8) { - if (threadIdx.x == 0) { - exchange_out[0] = 0; - for (auto i = 1; i < NWARP; i++) exchange_out[i] = exchange_out[i - 1] + exchange_in[i - 1]; - } - } - else if (NWARP <= 32) { - if (threadIdx.x <= 32) { - auto addend = exchange_in[threadIdx.x]; - - for (auto d = 1; d < 32; d *= 2) { - T n = __shfl_up_sync(0xffffffff, addend, d, 32); - if (threadIdx.x >= d) addend += n; - } - // exclusive scan - T prev_addend = __shfl_up_sync(0xffffffff, addend, 1, 32); - exchange_out[warp_id] = (warp_id > 0) * prev_addend; - } - } - // else-case handled by static_assert - __syncthreads(); - - // propagate - auto addend = exchange_out[warp_id]; - for (auto i = 0; i < SEQ; i++) private_buffer[i] += addend; - __syncthreads(); -}; +/** + * @file subsub.inl + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2022-12-26 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +namespace psz { +namespace cuda { +namespace __device { + +namespace wave32 { +template +__forceinline__ __device__ void intrawarp_inclusivescan_1d( // + T private_buffer[SEQ]); + +template +__forceinline__ __device__ void intrablock_exclusivescan_1d( // + T private_buffer[SEQ], + volatile T* exchange_in, + volatile T* exchange_out); +} // namespace wave32 + +} // namespace __device +} // namespace cuda +} // namespace psz + +template +__forceinline__ __device__ void psz::cuda::__device::wave32::intrawarp_inclusivescan_1d(T private_buffer[SEQ]) +{ + for (auto i = 1; i < SEQ; i++) private_buffer[i] += private_buffer[i - 1]; + T addend = private_buffer[SEQ - 1]; + + // in-warp shuffle + for (auto d = 1; d < 32; d *= 2) { + T n = __shfl_up_sync(0xffffffff, addend, d, 32); + if (threadIdx.x % 32 >= d) addend += n; + } + // exclusive scan + T prev_addend = __shfl_up_sync(0xffffffff, addend, 1, 32); + + // propagate + if (threadIdx.x % 32 > 0) + for (auto i = 0; i < SEQ; i++) private_buffer[i] += prev_addend; +} + +template +__forceinline__ __device__ void psz::cuda::__device::wave32::intrablock_exclusivescan_1d( + T private_buffer[SEQ], + volatile T* exchange_in, + volatile T* exchange_out) +{ + constexpr auto NWARP = NTHREAD / 32; + static_assert(NWARP <= 32, "too big"); + + auto warp_id = threadIdx.x / 32; + auto lane_id = threadIdx.x % 32; + + if (lane_id == 31) exchange_in[warp_id] = private_buffer[SEQ - 1]; + __syncthreads(); + + if (NWARP <= 8) { + if (threadIdx.x == 0) { + exchange_out[0] = 0; + for (auto i = 1; i < NWARP; i++) exchange_out[i] = exchange_out[i - 1] + exchange_in[i - 1]; + } + } + else if (NWARP <= 32) { + if (threadIdx.x <= 32) { + auto addend = exchange_in[threadIdx.x]; + + for (auto d = 1; d < 32; d *= 2) { + T n = __shfl_up_sync(0xffffffff, addend, d, 32); + if (threadIdx.x >= d) addend += n; + } + // exclusive scan + T prev_addend = __shfl_up_sync(0xffffffff, addend, 1, 32); + exchange_out[warp_id] = (warp_id > 0) * prev_addend; + } + } + // else-case handled by static_assert + __syncthreads(); + + // propagate + auto addend = exchange_out[warp_id]; + for (auto i = 0; i < SEQ; i++) private_buffer[i] += addend; + __syncthreads(); +}; diff --git a/qtensor/compression/cusz/src/kernel/lorenzo.cu b/qtensor/compression/cusz/src/kernel/lorenzo.cu index fe5e6a25..ff46e548 100644 --- a/qtensor/compression/cusz/src/kernel/lorenzo.cu +++ b/qtensor/compression/cusz/src/kernel/lorenzo.cu @@ -1,209 +1,209 @@ -/** - * @file lorenzo.cu - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-11-01 - 
* - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "cusz/type.h" -#include "utils/cuda_err.cuh" -#include "utils/timer.h" - -#include "kernel/lorenzo_all.hh" - -// #include "detail/lorenzo.inl" -#include "detail/lorenzo23.inl" - -template -cusz_error_status compress_predict_lorenzo_i( - T* const data, - dim3 const len3, - double const eb, - int const radius, - EQ* const eq, - T* const outlier, - uint32_t* outlier_idx, - uint32_t* num_outliers, - float* time_elapsed, - cudaStream_t stream) -{ - auto divide3 = [](dim3 len, dim3 sublen) { - return dim3( - (len.x - 1) / sublen.x + 1, // - (len.y - 1) / sublen.y + 1, // - (len.z - 1) / sublen.z + 1); - }; - - auto ndim = [&]() { - if (len3.z == 1 and len3.y == 1) - return 1; - else if (len3.z == 1 and len3.y != 1) - return 2; - else - return 3; - }; - - constexpr auto SUBLEN_1D = 256; - constexpr auto SEQ_1D = 4; // x-sequentiality == 4 - constexpr auto BLOCK_1D = dim3(256 / 4, 1, 1); - auto GRID_1D = divide3(len3, SUBLEN_1D); - - constexpr auto SUBLEN_2D = dim3(16, 16, 1); - // constexpr auto SEQ_2D = dim3(1, 8, 1); // y-sequentiality == 8 - constexpr auto BLOCK_2D = dim3(16, 2, 1); - auto GRID_2D = divide3(len3, SUBLEN_2D); - - constexpr auto SUBLEN_3D = dim3(32, 8, 8); - // constexpr auto SEQ_3D = dim3(1, 8, 1); // y-sequentiality == 8 - // constexpr auto BLOCK_3D = dim3(32, 1, 8); // for v0 - constexpr auto BLOCK_3D = dim3(32, 8, 1); // for v0::r1_shfl - auto GRID_3D = divide3(len3, SUBLEN_3D); - - auto d = ndim(); - - // error bound - auto ebx2 = eb * 2; - auto ebx2_r = 1 / ebx2; - auto leap3 = dim3(1, len3.x, len3.x * len3.y); - - CREATE_CUDAEVENT_PAIR; - START_CUDAEVENT_RECORDING(stream); - - if (d == 1) { - //::cusz::c_lorenzo_1d1l - //<<>>(data, eq, outlier, len3, leap3, radius, ebx2_r); - - psz::cuda::__kernel::v0::c_lorenzo_1d1l - <<>>(data, len3, leap3, radius, ebx2_r, eq, outlier); - } - else if (d == 2) { - //::cusz::c_lorenzo_2d1l_16x16data_mapto16x2 - //<<>>(data, eq, outlier, len3, leap3, radius, ebx2_r); - psz::cuda::__kernel::v0::c_lorenzo_2d1l - <<>>(data, len3, leap3, radius, ebx2_r, eq, outlier); - } - else if (d == 3) { - //::cusz::c_lorenzo_3d1l_32x8x8data_mapto32x1x8 - //<<>>(data, eq, outlier, len3, leap3, radius, ebx2_r); - psz::cuda::__kernel::v0::c_lorenzo_3d1l - <<>>(data, len3, leap3, radius, ebx2_r, eq, outlier); - } - - STOP_CUDAEVENT_RECORDING(stream); - CHECK_CUDA(cudaStreamSynchronize(stream)); - TIME_ELAPSED_CUDAEVENT(time_elapsed); - DESTROY_CUDAEVENT_PAIR; - - return CUSZ_SUCCESS; -} - -template -cusz_error_status decompress_predict_lorenzo_i( - EQ* eq, - dim3 const len3, - T* outlier, - uint32_t* outlier_idx, - uint32_t const num_outliers, - double const eb, - int const radius, - T* xdata, - float* time_elapsed, - cudaStream_t stream) -{ - auto divide3 = [](dim3 len, dim3 sublen) { - return dim3( - (len.x - 1) / sublen.x + 1, // - (len.y - 1) / sublen.y + 1, // - (len.z - 1) / sublen.z + 1); - }; - - auto ndim = [&]() { - if (len3.z == 1 and len3.y == 1) - return 1; - else if (len3.z == 1 and len3.y != 1) - return 2; - else - return 3; - }; - - constexpr auto SUBLEN_1D = 256; - constexpr auto SEQ_1D = 8; // x-sequentiality == 8 - constexpr auto BLOCK_1D = dim3(256 / 8, 1, 1); - auto GRID_1D = divide3(len3, SUBLEN_1D); - - constexpr auto SUBLEN_2D = dim3(16, 16, 1); - // constexpr auto SEQ_2D = dim3(1, 8, 1); // y-sequentiality == 8 - constexpr auto BLOCK_2D = dim3(16, 2, 1); - auto GRID_2D = divide3(len3, SUBLEN_2D); - - constexpr auto SUBLEN_3D = dim3(32, 8, 8); - // 
constexpr auto SEQ_3D = dim3(1, 8, 1); // y-sequentiality == 8 - constexpr auto BLOCK_3D = dim3(32, 1, 8); - auto GRID_3D = divide3(len3, SUBLEN_3D); - - // error bound - auto ebx2 = eb * 2; - auto ebx2_r = 1 / ebx2; - auto leap3 = dim3(1, len3.x, len3.x * len3.y); - - auto d = ndim(); - - CREATE_CUDAEVENT_PAIR; - START_CUDAEVENT_RECORDING(stream); - - if (d == 1) { - //::cusz::x_lorenzo_1d1l - //<<>>(outlier, eq, xdata, len3, leap3, radius, ebx2); - psz::cuda::__kernel::v0::x_lorenzo_1d1l - <<>>(eq, outlier, len3, leap3, radius, ebx2, xdata); - } - else if (d == 2) { - //::cusz::x_lorenzo_2d1l_16x16data_mapto16x2 - //<<>>(outlier, eq, xdata, len3, leap3, radius, ebx2); - psz::cuda::__kernel::v0::x_lorenzo_2d1l - <<>>(eq, outlier, len3, leap3, radius, ebx2, xdata); - } - else if (d == 3) { - //::cusz::x_lorenzo_3d1l_32x8x8data_mapto32x1x8 - //<<>>(outlier, eq, xdata, len3, leap3, radius, ebx2); - psz::cuda::__kernel::v0::x_lorenzo_3d1l - <<>>(eq, outlier, len3, leap3, radius, ebx2, xdata); - } - - STOP_CUDAEVENT_RECORDING(stream); - CHECK_CUDA(cudaStreamSynchronize(stream)); - TIME_ELAPSED_CUDAEVENT(time_elapsed); - DESTROY_CUDAEVENT_PAIR; - - return CUSZ_SUCCESS; -} - -#define CPP_TEMPLATE_INIT_AND_C_WRAPPER(T, EQ) \ - template cusz_error_status compress_predict_lorenzo_i( \ - T* const data, dim3 const len3, double const eb, int const radius, EQ* const eq, T* const outlier, \ - uint32_t* outlier_idx, uint32_t* num_outliers, float* time_elapsed, cudaStream_t stream); \ - \ - template cusz_error_status decompress_predict_lorenzo_i( \ - EQ * eq, dim3 const len3, T* outlier, uint32_t* outlier_idx, uint32_t const num_outliers, double const eb, \ - int const radius, T* xdata, float* time_elapsed, cudaStream_t stream); - -// before 2023 -CPP_TEMPLATE_INIT_AND_C_WRAPPER(float, uint8_t); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(float, uint16_t); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(float, uint32_t); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(float, float); - -CPP_TEMPLATE_INIT_AND_C_WRAPPER(double, uint8_t); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(double, uint16_t); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(double, uint32_t); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(double, float); - -// 2023 -CPP_TEMPLATE_INIT_AND_C_WRAPPER(float, int32_t); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(double, int32_t); - -#undef CPP_TEMPLATE_INIT_AND_C_WRAPPER +/** + * @file lorenzo.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-01 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "cusz/type.h" +#include "utils/cuda_err.cuh" +#include "utils/timer.h" + +#include "kernel/lorenzo_all.hh" + +// #include "detail/lorenzo.inl" +#include "detail/lorenzo23.inl" + +template +cusz_error_status compress_predict_lorenzo_i( + T* const data, + dim3 const len3, + double const eb, + int const radius, + EQ* const eq, + T* const outlier, + uint32_t* outlier_idx, + uint32_t* num_outliers, + float* time_elapsed, + cudaStream_t stream) +{ + auto divide3 = [](dim3 len, dim3 sublen) { + return dim3( + (len.x - 1) / sublen.x + 1, // + (len.y - 1) / sublen.y + 1, // + (len.z - 1) / sublen.z + 1); + }; + + auto ndim = [&]() { + if (len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + + constexpr auto SUBLEN_1D = 256; + constexpr auto SEQ_1D = 4; // x-sequentiality == 4 + constexpr auto BLOCK_1D = dim3(256 / 4, 1, 1); + auto GRID_1D = divide3(len3, SUBLEN_1D); + + constexpr auto SUBLEN_2D = dim3(16, 16, 1); + // constexpr auto SEQ_2D = dim3(1, 8, 1); // 
y-sequentiality == 8 + constexpr auto BLOCK_2D = dim3(16, 2, 1); + auto GRID_2D = divide3(len3, SUBLEN_2D); + + constexpr auto SUBLEN_3D = dim3(32, 8, 8); + // constexpr auto SEQ_3D = dim3(1, 8, 1); // y-sequentiality == 8 + // constexpr auto BLOCK_3D = dim3(32, 1, 8); // for v0 + constexpr auto BLOCK_3D = dim3(32, 8, 1); // for v0::r1_shfl + auto GRID_3D = divide3(len3, SUBLEN_3D); + + auto d = ndim(); + + // error bound + auto ebx2 = eb * 2; + auto ebx2_r = 1 / ebx2; + auto leap3 = dim3(1, len3.x, len3.x * len3.y); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + if (d == 1) { + //::cusz::c_lorenzo_1d1l + //<<>>(data, eq, outlier, len3, leap3, radius, ebx2_r); + + psz::cuda::__kernel::v0::c_lorenzo_1d1l + <<>>(data, len3, leap3, radius, ebx2_r, eq, outlier); + } + else if (d == 2) { + //::cusz::c_lorenzo_2d1l_16x16data_mapto16x2 + //<<>>(data, eq, outlier, len3, leap3, radius, ebx2_r); + psz::cuda::__kernel::v0::c_lorenzo_2d1l + <<>>(data, len3, leap3, radius, ebx2_r, eq, outlier); + } + else if (d == 3) { + //::cusz::c_lorenzo_3d1l_32x8x8data_mapto32x1x8 + //<<>>(data, eq, outlier, len3, leap3, radius, ebx2_r); + psz::cuda::__kernel::v0::c_lorenzo_3d1l + <<>>(data, len3, leap3, radius, ebx2_r, eq, outlier); + } + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + TIME_ELAPSED_CUDAEVENT(time_elapsed); + DESTROY_CUDAEVENT_PAIR; + + return CUSZ_SUCCESS; +} + +template +cusz_error_status decompress_predict_lorenzo_i( + EQ* eq, + dim3 const len3, + T* outlier, + uint32_t* outlier_idx, + uint32_t const num_outliers, + double const eb, + int const radius, + T* xdata, + float* time_elapsed, + cudaStream_t stream) +{ + auto divide3 = [](dim3 len, dim3 sublen) { + return dim3( + (len.x - 1) / sublen.x + 1, // + (len.y - 1) / sublen.y + 1, // + (len.z - 1) / sublen.z + 1); + }; + + auto ndim = [&]() { + if (len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + + constexpr auto SUBLEN_1D = 256; + constexpr auto SEQ_1D = 8; // x-sequentiality == 8 + constexpr auto BLOCK_1D = dim3(256 / 8, 1, 1); + auto GRID_1D = divide3(len3, SUBLEN_1D); + + constexpr auto SUBLEN_2D = dim3(16, 16, 1); + // constexpr auto SEQ_2D = dim3(1, 8, 1); // y-sequentiality == 8 + constexpr auto BLOCK_2D = dim3(16, 2, 1); + auto GRID_2D = divide3(len3, SUBLEN_2D); + + constexpr auto SUBLEN_3D = dim3(32, 8, 8); + // constexpr auto SEQ_3D = dim3(1, 8, 1); // y-sequentiality == 8 + constexpr auto BLOCK_3D = dim3(32, 1, 8); + auto GRID_3D = divide3(len3, SUBLEN_3D); + + // error bound + auto ebx2 = eb * 2; + auto ebx2_r = 1 / ebx2; + auto leap3 = dim3(1, len3.x, len3.x * len3.y); + + auto d = ndim(); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + if (d == 1) { + //::cusz::x_lorenzo_1d1l + //<<>>(outlier, eq, xdata, len3, leap3, radius, ebx2); + psz::cuda::__kernel::v0::x_lorenzo_1d1l + <<>>(eq, outlier, len3, leap3, radius, ebx2, xdata); + } + else if (d == 2) { + //::cusz::x_lorenzo_2d1l_16x16data_mapto16x2 + //<<>>(outlier, eq, xdata, len3, leap3, radius, ebx2); + psz::cuda::__kernel::v0::x_lorenzo_2d1l + <<>>(eq, outlier, len3, leap3, radius, ebx2, xdata); + } + else if (d == 3) { + //::cusz::x_lorenzo_3d1l_32x8x8data_mapto32x1x8 + //<<>>(outlier, eq, xdata, len3, leap3, radius, ebx2); + psz::cuda::__kernel::v0::x_lorenzo_3d1l + <<>>(eq, outlier, len3, leap3, radius, ebx2, xdata); + } + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + 
TIME_ELAPSED_CUDAEVENT(time_elapsed); + DESTROY_CUDAEVENT_PAIR; + + return CUSZ_SUCCESS; +} + +#define CPP_TEMPLATE_INIT_AND_C_WRAPPER(T, EQ) \ + template cusz_error_status compress_predict_lorenzo_i( \ + T* const data, dim3 const len3, double const eb, int const radius, EQ* const eq, T* const outlier, \ + uint32_t* outlier_idx, uint32_t* num_outliers, float* time_elapsed, cudaStream_t stream); \ + \ + template cusz_error_status decompress_predict_lorenzo_i( \ + EQ * eq, dim3 const len3, T* outlier, uint32_t* outlier_idx, uint32_t const num_outliers, double const eb, \ + int const radius, T* xdata, float* time_elapsed, cudaStream_t stream); + +// before 2023 +CPP_TEMPLATE_INIT_AND_C_WRAPPER(float, uint8_t); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(float, uint16_t); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(float, uint32_t); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(float, float); + +CPP_TEMPLATE_INIT_AND_C_WRAPPER(double, uint8_t); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(double, uint16_t); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(double, uint32_t); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(double, float); + +// 2023 +CPP_TEMPLATE_INIT_AND_C_WRAPPER(float, int32_t); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(double, int32_t); + +#undef CPP_TEMPLATE_INIT_AND_C_WRAPPER diff --git a/qtensor/compression/cusz/src/kernel/lorenzo_proto.cu b/qtensor/compression/cusz/src/kernel/lorenzo_proto.cu index 3dcbadb3..061aebb4 100644 --- a/qtensor/compression/cusz/src/kernel/lorenzo_proto.cu +++ b/qtensor/compression/cusz/src/kernel/lorenzo_proto.cu @@ -1,176 +1,176 @@ -/** - * @file claunch_cuda_proto.cu - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-09-22 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "cusz/type.h" -#include "utils/cuda_err.cuh" -#include "utils/timer.h" - -#include "kernel/lorenzo_all.h" -#include "kernel/lorenzo_all.hh" - -#include "detail/lorenzo_proto.inl" - -template -cusz_error_status compress_predict_lorenzo_iproto( - T* const data, - dim3 const len3, - double const eb, - int const radius, - EQ* const eq, - T* outlier, - uint32_t* outlier_idx, - uint32_t* num_outliers, - float* time_elapsed, - cudaStream_t stream) -{ - auto divide3 = [](dim3 len, dim3 sublen) { - return dim3((len.x - 1) / sublen.x + 1, (len.y - 1) / sublen.y + 1, (len.z - 1) / sublen.z + 1); - }; - - auto ndim = [&]() { - if (len3.z == 1 and len3.y == 1) - return 1; - else if (len3.z == 1 and len3.y != 1) - return 2; - else - return 3; - }; - - constexpr auto SUBLEN_1D = 256; - constexpr auto BLOCK_1D = dim3(256, 1, 1); - auto GRID_1D = divide3(len3, SUBLEN_1D); - - constexpr auto SUBLEN_2D = dim3(16, 16, 1); - constexpr auto BLOCK_2D = dim3(16, 16, 1); - auto GRID_2D = divide3(len3, SUBLEN_2D); - - constexpr auto SUBLEN_3D = dim3(8, 8, 8); - constexpr auto BLOCK_3D = dim3(8, 8, 8); - auto GRID_3D = divide3(len3, SUBLEN_3D); - - // error bound - auto ebx2 = eb * 2; - auto ebx2_r = 1 / ebx2; - auto leap3 = dim3(1, len3.x, len3.x * len3.y); - - CREATE_CUDAEVENT_PAIR; - START_CUDAEVENT_RECORDING(stream); - - using namespace psz::cuda::__kernel::prototype; - - if (ndim() == 1) { - c_lorenzo_1d1l<<>>(data, len3, leap3, radius, ebx2_r, eq, outlier); - } - else if (ndim() == 2) { - c_lorenzo_2d1l<<>>(data, len3, leap3, radius, ebx2_r, eq, outlier); - } - else if (ndim() == 3) { - c_lorenzo_3d1l<<>>(data, len3, leap3, radius, ebx2_r, eq, outlier); - } - else { - throw std::runtime_error("Lorenzo only works for 123-D."); - } - - STOP_CUDAEVENT_RECORDING(stream); - CHECK_CUDA(cudaStreamSynchronize(stream)); - - 
TIME_ELAPSED_CUDAEVENT(time_elapsed); - DESTROY_CUDAEVENT_PAIR; - - return CUSZ_SUCCESS; -} - -template -cusz_error_status decompress_predict_lorenzo_iproto( - EQ* eq, - dim3 const len3, - T* outlier, - uint32_t* outlier_idx, - uint32_t const num_outliers, - double const eb, - int const radius, - T* xdata, - float* time_elapsed, - cudaStream_t stream) -{ - auto divide3 = [](dim3 len, dim3 sublen) { - return dim3((len.x - 1) / sublen.x + 1, (len.y - 1) / sublen.y + 1, (len.z - 1) / sublen.z + 1); - }; - - auto ndim = [&]() { - if (len3.z == 1 and len3.y == 1) - return 1; - else if (len3.z == 1 and len3.y != 1) - return 2; - else - return 3; - }; - - constexpr auto SUBLEN_1D = 256; - constexpr auto BLOCK_1D = dim3(256, 1, 1); - auto GRID_1D = divide3(len3, SUBLEN_1D); - - constexpr auto SUBLEN_2D = dim3(16, 16, 1); - constexpr auto BLOCK_2D = dim3(16, 16, 1); - auto GRID_2D = divide3(len3, SUBLEN_2D); - - constexpr auto SUBLEN_3D = dim3(8, 8, 8); - constexpr auto BLOCK_3D = dim3(8, 8, 8); - auto GRID_3D = divide3(len3, SUBLEN_3D); - - // error bound - auto ebx2 = eb * 2; - auto ebx2_r = 1 / ebx2; - auto leap3 = dim3(1, len3.x, len3.x * len3.y); - - CREATE_CUDAEVENT_PAIR; - START_CUDAEVENT_RECORDING(stream); - - using namespace psz::cuda::__kernel::prototype; - - if (ndim() == 1) { - x_lorenzo_1d1l<<>>(eq, outlier, len3, leap3, radius, ebx2, xdata); - } - else if (ndim() == 2) { - x_lorenzo_2d1l<<>>(eq, outlier, len3, leap3, radius, ebx2, xdata); - } - else if (ndim() == 3) { - x_lorenzo_3d1l<<>>(eq, outlier, len3, leap3, radius, ebx2, xdata); - } - - STOP_CUDAEVENT_RECORDING(stream); - CHECK_CUDA(cudaStreamSynchronize(stream)); - - TIME_ELAPSED_CUDAEVENT(time_elapsed); - DESTROY_CUDAEVENT_PAIR; - - return CUSZ_SUCCESS; -} - -#define CPP_TEMPLATE_INIT_AND_C_WRAPPER(Tliteral, Eliteral, FPliteral, T, EQ, FP) \ - template cusz_error_status compress_predict_lorenzo_iproto( \ - T* const, dim3 const, double const, int const, EQ* const, T* const, uint32_t*, uint32_t*, float*, \ - cudaStream_t); \ - \ - template cusz_error_status decompress_predict_lorenzo_iproto( \ - EQ*, dim3 const, T*, uint32_t*, uint32_t const, double const, int const, T*, float*, cudaStream_t); - -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui8, fp32, float, uint8_t, float); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui16, fp32, float, uint16_t, float); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui32, fp32, float, uint32_t, float); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, fp32, fp32, float, float, float); - -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui8, fp64, double, uint8_t, double); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui16, fp64, double, uint16_t, double); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui32, fp64, double, uint32_t, double); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, fp32, fp64, double, float, double); - -#undef CPP_TEMPLATE_INIT_AND_C_WRAPPER +/** + * @file claunch_cuda_proto.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-09-22 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "cusz/type.h" +#include "utils/cuda_err.cuh" +#include "utils/timer.h" + +#include "kernel/lorenzo_all.h" +#include "kernel/lorenzo_all.hh" + +#include "detail/lorenzo_proto.inl" + +template +cusz_error_status compress_predict_lorenzo_iproto( + T* const data, + dim3 const len3, + double const eb, + int const radius, + EQ* const eq, + T* outlier, + uint32_t* outlier_idx, + uint32_t* num_outliers, + float* time_elapsed, + cudaStream_t stream) +{ + auto divide3 = [](dim3 len, dim3 sublen) { + return 
dim3((len.x - 1) / sublen.x + 1, (len.y - 1) / sublen.y + 1, (len.z - 1) / sublen.z + 1); + }; + + auto ndim = [&]() { + if (len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + + constexpr auto SUBLEN_1D = 256; + constexpr auto BLOCK_1D = dim3(256, 1, 1); + auto GRID_1D = divide3(len3, SUBLEN_1D); + + constexpr auto SUBLEN_2D = dim3(16, 16, 1); + constexpr auto BLOCK_2D = dim3(16, 16, 1); + auto GRID_2D = divide3(len3, SUBLEN_2D); + + constexpr auto SUBLEN_3D = dim3(8, 8, 8); + constexpr auto BLOCK_3D = dim3(8, 8, 8); + auto GRID_3D = divide3(len3, SUBLEN_3D); + + // error bound + auto ebx2 = eb * 2; + auto ebx2_r = 1 / ebx2; + auto leap3 = dim3(1, len3.x, len3.x * len3.y); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + using namespace psz::cuda::__kernel::prototype; + + if (ndim() == 1) { + c_lorenzo_1d1l<<>>(data, len3, leap3, radius, ebx2_r, eq, outlier); + } + else if (ndim() == 2) { + c_lorenzo_2d1l<<>>(data, len3, leap3, radius, ebx2_r, eq, outlier); + } + else if (ndim() == 3) { + c_lorenzo_3d1l<<>>(data, len3, leap3, radius, ebx2_r, eq, outlier); + } + else { + throw std::runtime_error("Lorenzo only works for 123-D."); + } + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + TIME_ELAPSED_CUDAEVENT(time_elapsed); + DESTROY_CUDAEVENT_PAIR; + + return CUSZ_SUCCESS; +} + +template +cusz_error_status decompress_predict_lorenzo_iproto( + EQ* eq, + dim3 const len3, + T* outlier, + uint32_t* outlier_idx, + uint32_t const num_outliers, + double const eb, + int const radius, + T* xdata, + float* time_elapsed, + cudaStream_t stream) +{ + auto divide3 = [](dim3 len, dim3 sublen) { + return dim3((len.x - 1) / sublen.x + 1, (len.y - 1) / sublen.y + 1, (len.z - 1) / sublen.z + 1); + }; + + auto ndim = [&]() { + if (len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + + constexpr auto SUBLEN_1D = 256; + constexpr auto BLOCK_1D = dim3(256, 1, 1); + auto GRID_1D = divide3(len3, SUBLEN_1D); + + constexpr auto SUBLEN_2D = dim3(16, 16, 1); + constexpr auto BLOCK_2D = dim3(16, 16, 1); + auto GRID_2D = divide3(len3, SUBLEN_2D); + + constexpr auto SUBLEN_3D = dim3(8, 8, 8); + constexpr auto BLOCK_3D = dim3(8, 8, 8); + auto GRID_3D = divide3(len3, SUBLEN_3D); + + // error bound + auto ebx2 = eb * 2; + auto ebx2_r = 1 / ebx2; + auto leap3 = dim3(1, len3.x, len3.x * len3.y); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + using namespace psz::cuda::__kernel::prototype; + + if (ndim() == 1) { + x_lorenzo_1d1l<<>>(eq, outlier, len3, leap3, radius, ebx2, xdata); + } + else if (ndim() == 2) { + x_lorenzo_2d1l<<>>(eq, outlier, len3, leap3, radius, ebx2, xdata); + } + else if (ndim() == 3) { + x_lorenzo_3d1l<<>>(eq, outlier, len3, leap3, radius, ebx2, xdata); + } + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + TIME_ELAPSED_CUDAEVENT(time_elapsed); + DESTROY_CUDAEVENT_PAIR; + + return CUSZ_SUCCESS; +} + +#define CPP_TEMPLATE_INIT_AND_C_WRAPPER(Tliteral, Eliteral, FPliteral, T, EQ, FP) \ + template cusz_error_status compress_predict_lorenzo_iproto( \ + T* const, dim3 const, double const, int const, EQ* const, T* const, uint32_t*, uint32_t*, float*, \ + cudaStream_t); \ + \ + template cusz_error_status decompress_predict_lorenzo_iproto( \ + EQ*, dim3 const, T*, uint32_t*, uint32_t const, double const, int const, T*, float*, cudaStream_t); + 
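+// Explicit instantiations for the prototype kernels. Only the type arguments (T, EQ)
+// are referenced by this file's macro body; the fp32/ui8/... literal arguments are not
+// used here and appear only to keep the argument list uniform across the macro family.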
+CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui8, fp32, float, uint8_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui16, fp32, float, uint16_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui32, fp32, float, uint32_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, fp32, fp32, float, float, float); + +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui8, fp64, double, uint8_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui16, fp64, double, uint16_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui32, fp64, double, uint32_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, fp32, fp64, double, float, double); + +#undef CPP_TEMPLATE_INIT_AND_C_WRAPPER diff --git a/qtensor/compression/cusz/src/kernel/lorenzo_serial.cc b/qtensor/compression/cusz/src/kernel/lorenzo_serial.cc index b274bc23..0ef5b9f5 100644 --- a/qtensor/compression/cusz/src/kernel/lorenzo_serial.cc +++ b/qtensor/compression/cusz/src/kernel/lorenzo_serial.cc @@ -1,118 +1,118 @@ -/** - * @file lorenzo.cu - * @author Jiannan Tian - * @brief - * @version 0.4 - * @date 2023-03-16 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "detail/lorenzo_serial.inl" -#include "cusz/type.h" - -template > -cusz_error_status serial_compress_predict_lorenzo_i( - T* const data, - psz_dim3 const len3, - double const eb, - int const radius, - EQ* const eq, - OUTLIER* outlier, - float* time_elapsed) -{ - auto divide3 = [](psz_dim3 len, psz_dim3 sublen) { - return psz_dim3{(len.x - 1) / sublen.x + 1, (len.y - 1) / sublen.y + 1, (len.z - 1) / sublen.z + 1}; - }; - - auto ndim = [&]() { - if (len3.z == 1 and len3.y == 1) - return 1; - else if (len3.z == 1 and len3.y != 1) - return 2; - else - return 3; - }; - - auto d = ndim(); - - // error bound - auto ebx2 = eb * 2; - auto ebx2_r = 1 / ebx2; - auto leap3 = psz_dim3{1, len3.x, len3.x * len3.y}; - - if (d == 1) { - psz::serial::__kernel::c_lorenzo_1d1l(data, len3, leap3, radius, ebx2_r, eq, outlier); - } - else if (d == 2) { - psz::serial::__kernel::c_lorenzo_2d1l(data, len3, leap3, radius, ebx2_r, eq, outlier); - } - else if (d == 3) { - psz::serial::__kernel::c_lorenzo_3d1l(data, len3, leap3, radius, ebx2_r, eq, outlier); - } - - return CUSZ_SUCCESS; -} - -template -cusz_error_status serial_decompress_predict_lorenzo_i( - EQ* eq, - psz_dim3 const len3, - T* outlier, - double const eb, - int const radius, - T* xdata, - float* time_elapsed) -{ - auto divide3 = [](psz_dim3 len, psz_dim3 sublen) { - return psz_dim3{(len.x - 1) / sublen.x + 1, (len.y - 1) / sublen.y + 1, (len.z - 1) / sublen.z + 1}; - }; - - auto ndim = [&]() { - if (len3.z == 1 and len3.y == 1) - return 1; - else if (len3.z == 1 and len3.y != 1) - return 2; - else - return 3; - }; - - // error bound - auto ebx2 = eb * 2; - auto ebx2_r = 1 / ebx2; - auto leap3 = psz_dim3{1, len3.x, len3.x * len3.y}; - - auto d = ndim(); - - if (d == 1) { - psz::serial::__kernel::x_lorenzo_1d1l(eq, outlier, len3, leap3, radius, ebx2, xdata); - } - else if (d == 2) { - psz::serial::__kernel::x_lorenzo_2d1l(eq, outlier, len3, leap3, radius, ebx2, xdata); - } - else if (d == 3) { - psz::serial::__kernel::x_lorenzo_3d1l(eq, outlier, len3, leap3, radius, ebx2, xdata); - } - - return CUSZ_SUCCESS; -} - -#define CPP_TEMPLATE_INIT_AND_C_WRAPPER(Tliteral, Eliteral, FPliteral, T, EQ, FP) \ - template cusz_error_status serial_compress_predict_lorenzo_i( \ - T* const, psz_dim3 const, double const, int const, EQ* const, psz_outlier_serial*, float*); \ - \ - template cusz_error_status serial_decompress_predict_lorenzo_i( \ - EQ*, psz_dim3 
const, T*, double const, int const, T*, float*); - -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui8, fp32, float, uint8_t, float); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui16, fp32, float, uint16_t, float); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui32, fp32, float, uint32_t, float); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, fp32, fp32, float, float, float); - -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui8, fp64, double, uint8_t, double); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui16, fp64, double, uint16_t, double); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui32, fp64, double, uint32_t, double); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, fp32, fp64, double, float, double); - -#undef CPP_TEMPLATE_INIT_AND_C_WRAPPER +/** + * @file lorenzo.cu + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-03-16 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "detail/lorenzo_serial.inl" +#include "cusz/type.h" + +template > +cusz_error_status serial_compress_predict_lorenzo_i( + T* const data, + psz_dim3 const len3, + double const eb, + int const radius, + EQ* const eq, + OUTLIER* outlier, + float* time_elapsed) +{ + auto divide3 = [](psz_dim3 len, psz_dim3 sublen) { + return psz_dim3{(len.x - 1) / sublen.x + 1, (len.y - 1) / sublen.y + 1, (len.z - 1) / sublen.z + 1}; + }; + + auto ndim = [&]() { + if (len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + + auto d = ndim(); + + // error bound + auto ebx2 = eb * 2; + auto ebx2_r = 1 / ebx2; + auto leap3 = psz_dim3{1, len3.x, len3.x * len3.y}; + + if (d == 1) { + psz::serial::__kernel::c_lorenzo_1d1l(data, len3, leap3, radius, ebx2_r, eq, outlier); + } + else if (d == 2) { + psz::serial::__kernel::c_lorenzo_2d1l(data, len3, leap3, radius, ebx2_r, eq, outlier); + } + else if (d == 3) { + psz::serial::__kernel::c_lorenzo_3d1l(data, len3, leap3, radius, ebx2_r, eq, outlier); + } + + return CUSZ_SUCCESS; +} + +template +cusz_error_status serial_decompress_predict_lorenzo_i( + EQ* eq, + psz_dim3 const len3, + T* outlier, + double const eb, + int const radius, + T* xdata, + float* time_elapsed) +{ + auto divide3 = [](psz_dim3 len, psz_dim3 sublen) { + return psz_dim3{(len.x - 1) / sublen.x + 1, (len.y - 1) / sublen.y + 1, (len.z - 1) / sublen.z + 1}; + }; + + auto ndim = [&]() { + if (len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + + // error bound + auto ebx2 = eb * 2; + auto ebx2_r = 1 / ebx2; + auto leap3 = psz_dim3{1, len3.x, len3.x * len3.y}; + + auto d = ndim(); + + if (d == 1) { + psz::serial::__kernel::x_lorenzo_1d1l(eq, outlier, len3, leap3, radius, ebx2, xdata); + } + else if (d == 2) { + psz::serial::__kernel::x_lorenzo_2d1l(eq, outlier, len3, leap3, radius, ebx2, xdata); + } + else if (d == 3) { + psz::serial::__kernel::x_lorenzo_3d1l(eq, outlier, len3, leap3, radius, ebx2, xdata); + } + + return CUSZ_SUCCESS; +} + +#define CPP_TEMPLATE_INIT_AND_C_WRAPPER(Tliteral, Eliteral, FPliteral, T, EQ, FP) \ + template cusz_error_status serial_compress_predict_lorenzo_i( \ + T* const, psz_dim3 const, double const, int const, EQ* const, psz_outlier_serial*, float*); \ + \ + template cusz_error_status serial_decompress_predict_lorenzo_i( \ + EQ*, psz_dim3 const, T*, double const, int const, T*, float*); + +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui8, fp32, float, uint8_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui16, fp32, float, uint16_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui32, 
fp32, float, uint32_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, fp32, fp32, float, float, float); + +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui8, fp64, double, uint8_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui16, fp64, double, uint16_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui32, fp64, double, uint32_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, fp32, fp64, double, float, double); + +#undef CPP_TEMPLATE_INIT_AND_C_WRAPPER diff --git a/qtensor/compression/cusz/src/kernel/lorenzo_var.cu b/qtensor/compression/cusz/src/kernel/lorenzo_var.cu index 8fc3ff39..12773d35 100644 --- a/qtensor/compression/cusz/src/kernel/lorenzo_var.cu +++ b/qtensor/compression/cusz/src/kernel/lorenzo_var.cu @@ -1,206 +1,206 @@ -/** - * @file lorenzo_var.cu - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-10-27 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "cusz/type.h" -#include "utils/cuda_err.cuh" -#include "utils/timer.h" - -#include "kernel/lorenzo_all.h" -#include "kernel/lorenzo_all.hh" - -#include "detail/lorenzo_var.inl" - -template -cusz_error_status asz::experimental::compress_predict_lorenzo_ivar( - T* data, - dim3 const len3, - double const eb, - DeltaT* delta, - bool* signum, - float* time_elapsed, - cudaStream_t stream) -{ - auto pardeg3 = [](dim3 len, dim3 sublen) { - return dim3( - (len.x - 1) / sublen.x + 1, // - (len.y - 1) / sublen.y + 1, // - (len.z - 1) / sublen.z + 1); - }; - - auto ndim = [&]() { - if (len3.z == 1 and len3.y == 1) - return 1; - else if (len3.z == 1 and len3.y != 1) - return 2; - else - return 3; - }; - - constexpr auto SUBLEN_1D = 256; - constexpr auto SEQ_1D = 4; // x-sequentiality == 4 - constexpr auto BLOCK_1D = dim3(256 / 4, 1, 1); - auto GRID_1D = pardeg3(len3, SUBLEN_1D); - - constexpr auto SUBLEN_2D = dim3(16, 16, 1); - // constexpr auto SEQ_2D = dim3(1, 8, 1); // y-sequentiality == 8 - constexpr auto BLOCK_2D = dim3(16, 2, 1); - auto GRID_2D = pardeg3(len3, SUBLEN_2D); - - constexpr auto SUBLEN_3D = dim3(32, 8, 8); - // constexpr auto SEQ_3D = dim3(1, 8, 1); // y-sequentiality == 8 - constexpr auto BLOCK_3D = dim3(32, 1, 8); - auto GRID_3D = pardeg3(len3, SUBLEN_3D); - - // error bound - auto ebx2 = eb * 2; - auto ebx2_r = 1 / ebx2; - auto leap3 = dim3(1, len3.x, len3.x * len3.y); - - CREATE_CUDAEVENT_PAIR; - START_CUDAEVENT_RECORDING(stream); - - if (ndim() == 1) { - cusz::experimental::c_lorenzo_1d1l // - <<>> // - (data, delta, signum, len3, leap3, ebx2_r); - } - else if (ndim() == 2) { - cusz::experimental::c_lorenzo_2d1l_16x16data_mapto16x2 // - <<>> // - (data, delta, signum, len3, leap3, ebx2_r); - } - else if (ndim() == 3) { - cusz::experimental::c_lorenzo_3d1l_32x8x8data_mapto32x1x8 // - <<>> // - (data, delta, signum, len3, leap3, ebx2_r); - } - else { - throw std::runtime_error("Lorenzo only works for 123-D."); - } - - STOP_CUDAEVENT_RECORDING(stream); - CHECK_CUDA(cudaStreamSynchronize(stream)); - - TIME_ELAPSED_CUDAEVENT(time_elapsed); - DESTROY_CUDAEVENT_PAIR; - - return CUSZ_SUCCESS; -} - -template -cusz_error_status asz::experimental::decompress_predict_lorenzo_ivar( - DeltaT* delta, - bool* signum, - dim3 const len3, - double const eb, - T* xdata, - float* time_elapsed, - cudaStream_t stream) -{ - auto pardeg3 = [](dim3 len, dim3 sublen) { - return dim3( - (len.x - 1) / sublen.x + 1, // - (len.y - 1) / sublen.y + 1, // - (len.z - 1) / sublen.z + 1); - }; - - auto ndim = [&]() { - if (len3.z == 1 and len3.y == 1) - return 1; - else if (len3.z == 1 and len3.y != 1) 
- return 2; - else - return 3; - }; - - constexpr auto SUBLEN_1D = 256; - // constexpr auto SEQ_1D = 8; // x-sequentiality == 8 - constexpr auto BLOCK_1D = dim3(256 / 8, 1, 1); - auto GRID_1D = pardeg3(len3, SUBLEN_1D); - - constexpr auto SUBLEN_2D = dim3(16, 16, 1); - // constexpr auto SEQ_2D = dim3(1, 8, 1); // y-sequentiality == 8 - constexpr auto BLOCK_2D = dim3(16, 2, 1); - auto GRID_2D = pardeg3(len3, SUBLEN_2D); - - constexpr auto SUBLEN_3D = dim3(32, 8, 8); - // constexpr auto SEQ_3D = dim3(1, 8, 1); // y-sequentiality == 8 - constexpr auto BLOCK_3D = dim3(32, 1, 8); - auto GRID_3D = pardeg3(len3, SUBLEN_3D); - - // error bound - auto ebx2 = eb * 2; - auto ebx2_r = 1 / ebx2; - auto leap3 = dim3(1, len3.x, len3.x * len3.y); - - CREATE_CUDAEVENT_PAIR; - START_CUDAEVENT_RECORDING(stream); - - if (ndim() == 1) { - cusz::experimental::x_lorenzo_1d1l // - <<>> // - (signum, delta, xdata, len3, leap3, ebx2); - } - else if (ndim() == 2) { - cusz::experimental::x_lorenzo_2d1l_16x16data_mapto16x2 // - <<>> // - (signum, delta, xdata, len3, leap3, ebx2); - } - else { - cusz::experimental::x_lorenzo_3d1l_32x8x8data_mapto32x1x8 // - <<>> // - (signum, delta, xdata, len3, leap3, ebx2); - } - - STOP_CUDAEVENT_RECORDING(stream); - CHECK_CUDA(cudaStreamSynchronize(stream)); - - TIME_ELAPSED_CUDAEVENT(time_elapsed); - DESTROY_CUDAEVENT_PAIR; - - return CUSZ_SUCCESS; -} - -#define CPP_TEMPLATE_INIT_AND_C_WRAPPER(Tliteral, Eliteral, FPliteral, T, E, FP) \ - template cusz_error_status asz::experimental::compress_predict_lorenzo_ivar( \ - T*, dim3 const, double const, E*, bool*, float*, cudaStream_t); \ - \ - template cusz_error_status asz::experimental::decompress_predict_lorenzo_ivar( \ - E*, bool*, dim3 const, double const, T*, float*, cudaStream_t); \ - \ - cusz_error_status compress_predict_lorenzo_ivar_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ - T* const data, dim3 const len3, double const eb, E* delta, bool* signum, float* time_elapsed, \ - cudaStream_t stream) \ - { \ - asz::experimental::compress_predict_lorenzo_ivar( \ - data, len3, eb, delta, signum, time_elapsed, stream); \ - return CUSZ_SUCCESS; \ - } \ - \ - cusz_error_status decompress_predict_lorenzo_ivar_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ - E* delta, bool* signum, dim3 const len3, double const eb, T* xdata, float* time_elapsed, cudaStream_t stream) \ - { \ - asz::experimental::decompress_predict_lorenzo_ivar( \ - delta, signum, len3, eb, xdata, time_elapsed, stream); \ - return CUSZ_SUCCESS; \ - } - -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui8, fp32, float, uint8_t, float); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui16, fp32, float, uint16_t, float); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui32, fp32, float, uint32_t, float); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, fp32, fp32, float, float, float); - -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui8, fp64, double, uint8_t, double); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui16, fp64, double, uint16_t, double); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui32, fp64, double, uint32_t, double); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, fp32, fp64, double, float, double); - -#undef CPP_TEMPLATE_INIT_AND_C_WRAPPER +/** + * @file lorenzo_var.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-27 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "cusz/type.h" +#include "utils/cuda_err.cuh" +#include "utils/timer.h" + +#include "kernel/lorenzo_all.h" +#include "kernel/lorenzo_all.hh" + +#include "detail/lorenzo_var.inl" + +template 
+cusz_error_status asz::experimental::compress_predict_lorenzo_ivar( + T* data, + dim3 const len3, + double const eb, + DeltaT* delta, + bool* signum, + float* time_elapsed, + cudaStream_t stream) +{ + auto pardeg3 = [](dim3 len, dim3 sublen) { + return dim3( + (len.x - 1) / sublen.x + 1, // + (len.y - 1) / sublen.y + 1, // + (len.z - 1) / sublen.z + 1); + }; + + auto ndim = [&]() { + if (len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + + constexpr auto SUBLEN_1D = 256; + constexpr auto SEQ_1D = 4; // x-sequentiality == 4 + constexpr auto BLOCK_1D = dim3(256 / 4, 1, 1); + auto GRID_1D = pardeg3(len3, SUBLEN_1D); + + constexpr auto SUBLEN_2D = dim3(16, 16, 1); + // constexpr auto SEQ_2D = dim3(1, 8, 1); // y-sequentiality == 8 + constexpr auto BLOCK_2D = dim3(16, 2, 1); + auto GRID_2D = pardeg3(len3, SUBLEN_2D); + + constexpr auto SUBLEN_3D = dim3(32, 8, 8); + // constexpr auto SEQ_3D = dim3(1, 8, 1); // y-sequentiality == 8 + constexpr auto BLOCK_3D = dim3(32, 1, 8); + auto GRID_3D = pardeg3(len3, SUBLEN_3D); + + // error bound + auto ebx2 = eb * 2; + auto ebx2_r = 1 / ebx2; + auto leap3 = dim3(1, len3.x, len3.x * len3.y); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + if (ndim() == 1) { + cusz::experimental::c_lorenzo_1d1l // + <<>> // + (data, delta, signum, len3, leap3, ebx2_r); + } + else if (ndim() == 2) { + cusz::experimental::c_lorenzo_2d1l_16x16data_mapto16x2 // + <<>> // + (data, delta, signum, len3, leap3, ebx2_r); + } + else if (ndim() == 3) { + cusz::experimental::c_lorenzo_3d1l_32x8x8data_mapto32x1x8 // + <<>> // + (data, delta, signum, len3, leap3, ebx2_r); + } + else { + throw std::runtime_error("Lorenzo only works for 123-D."); + } + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + TIME_ELAPSED_CUDAEVENT(time_elapsed); + DESTROY_CUDAEVENT_PAIR; + + return CUSZ_SUCCESS; +} + +template +cusz_error_status asz::experimental::decompress_predict_lorenzo_ivar( + DeltaT* delta, + bool* signum, + dim3 const len3, + double const eb, + T* xdata, + float* time_elapsed, + cudaStream_t stream) +{ + auto pardeg3 = [](dim3 len, dim3 sublen) { + return dim3( + (len.x - 1) / sublen.x + 1, // + (len.y - 1) / sublen.y + 1, // + (len.z - 1) / sublen.z + 1); + }; + + auto ndim = [&]() { + if (len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + + constexpr auto SUBLEN_1D = 256; + // constexpr auto SEQ_1D = 8; // x-sequentiality == 8 + constexpr auto BLOCK_1D = dim3(256 / 8, 1, 1); + auto GRID_1D = pardeg3(len3, SUBLEN_1D); + + constexpr auto SUBLEN_2D = dim3(16, 16, 1); + // constexpr auto SEQ_2D = dim3(1, 8, 1); // y-sequentiality == 8 + constexpr auto BLOCK_2D = dim3(16, 2, 1); + auto GRID_2D = pardeg3(len3, SUBLEN_2D); + + constexpr auto SUBLEN_3D = dim3(32, 8, 8); + // constexpr auto SEQ_3D = dim3(1, 8, 1); // y-sequentiality == 8 + constexpr auto BLOCK_3D = dim3(32, 1, 8); + auto GRID_3D = pardeg3(len3, SUBLEN_3D); + + // error bound + auto ebx2 = eb * 2; + auto ebx2_r = 1 / ebx2; + auto leap3 = dim3(1, len3.x, len3.x * len3.y); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + if (ndim() == 1) { + cusz::experimental::x_lorenzo_1d1l // + <<>> // + (signum, delta, xdata, len3, leap3, ebx2); + } + else if (ndim() == 2) { + cusz::experimental::x_lorenzo_2d1l_16x16data_mapto16x2 // + <<>> // + (signum, delta, xdata, len3, leap3, ebx2); + } + else { + 
cusz::experimental::x_lorenzo_3d1l_32x8x8data_mapto32x1x8 // + <<>> // + (signum, delta, xdata, len3, leap3, ebx2); + } + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + TIME_ELAPSED_CUDAEVENT(time_elapsed); + DESTROY_CUDAEVENT_PAIR; + + return CUSZ_SUCCESS; +} + +#define CPP_TEMPLATE_INIT_AND_C_WRAPPER(Tliteral, Eliteral, FPliteral, T, E, FP) \ + template cusz_error_status asz::experimental::compress_predict_lorenzo_ivar( \ + T*, dim3 const, double const, E*, bool*, float*, cudaStream_t); \ + \ + template cusz_error_status asz::experimental::decompress_predict_lorenzo_ivar( \ + E*, bool*, dim3 const, double const, T*, float*, cudaStream_t); \ + \ + cusz_error_status compress_predict_lorenzo_ivar_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + T* const data, dim3 const len3, double const eb, E* delta, bool* signum, float* time_elapsed, \ + cudaStream_t stream) \ + { \ + asz::experimental::compress_predict_lorenzo_ivar( \ + data, len3, eb, delta, signum, time_elapsed, stream); \ + return CUSZ_SUCCESS; \ + } \ + \ + cusz_error_status decompress_predict_lorenzo_ivar_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + E* delta, bool* signum, dim3 const len3, double const eb, T* xdata, float* time_elapsed, cudaStream_t stream) \ + { \ + asz::experimental::decompress_predict_lorenzo_ivar( \ + delta, signum, len3, eb, xdata, time_elapsed, stream); \ + return CUSZ_SUCCESS; \ + } + +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui8, fp32, float, uint8_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui16, fp32, float, uint16_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui32, fp32, float, uint32_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, fp32, fp32, float, float, float); + +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui8, fp64, double, uint8_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui16, fp64, double, uint16_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui32, fp64, double, uint32_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, fp32, fp64, double, float, double); + +#undef CPP_TEMPLATE_INIT_AND_C_WRAPPER diff --git a/qtensor/compression/cusz/src/kernel/preprocess.cuh b/qtensor/compression/cusz/src/kernel/preprocess.cuh index f7c321f7..f082c193 100644 --- a/qtensor/compression/cusz/src/kernel/preprocess.cuh +++ b/qtensor/compression/cusz/src/kernel/preprocess.cuh @@ -1,65 +1,65 @@ -/** - * @file preprocess.cuh - * @author Jiannan Tian - * @brief Filters for preprocessing of cuSZ. - * @version 0.3 - * @date 2020-09-20 - * (created) 2020-05-03 (rev) 2021-06-21 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#ifndef CUSZ_KERNEL_PREPROCESS_CUH -#define CUSZ_KERNEL_PREPROCESS_CUH - -#include - -#include "common.hh" - -using std::cout; -using std::endl; - -namespace cusz { - -#include - -template -__global__ void log_transform() -{ - static_assert(std::is_floating_point::value, "[log_transform] must be floating-point type."); -} - -template -__global__ void binning2d(Data* input, Data* output, size_t d0, size_t d1, size_t new_d0, size_t new_d1) -{ - auto y = threadIdx.y; - auto x = threadIdx.x; - auto yid = blockIdx.y * blockDim.y + y; - auto xid = blockIdx.x * blockDim.x + x; - - __shared__ Data s[tBLK][tBLK]; - - if (yid >= new_d1 or xid >= new_d0) return; - - int xblk = (xid + 1) * DOWNSCALE_FACTOR >= d0 ? d0 - xid * DOWNSCALE_FACTOR : DOWNSCALE_FACTOR; - int yblk = (yid + 1) * DOWNSCALE_FACTOR >= d1 ? 
d1 - yid * DOWNSCALE_FACTOR : DOWNSCALE_FACTOR; - s[y][x] = 0; - - for (int j = 0; j < yblk; j++) - for (int i = 0; i < xblk; i++) - s[y][x] += input[(yid * DOWNSCALE_FACTOR + j) * d0 + (xid * DOWNSCALE_FACTOR + i)]; - - output[yid * new_d0 + xid] = s[y][x] / static_cast(yblk * xblk); -} -} // namespace cusz - -template __global__ void cusz::binning2d(float*, float*, size_t, size_t, size_t, size_t); -template __global__ void cusz::binning2d(double*, double*, size_t, size_t, size_t, size_t); -// template __global__ void cusz::binning2d(I1*, I1*, size_t, size_t, size_t, size_t); -// template __global__ void cusz::binning2d(I2*, I2*, size_t, size_t, size_t, size_t); -// template __global__ void cusz::binning2d(I4*, I4*, size_t, size_t, size_t, size_t); -// template __global__ void cusz::binning2d(I8*, I8*, size_t, size_t, size_t, size_t); - -#endif +/** + * @file preprocess.cuh + * @author Jiannan Tian + * @brief Filters for preprocessing of cuSZ. + * @version 0.3 + * @date 2020-09-20 + * (created) 2020-05-03 (rev) 2021-06-21 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef CUSZ_KERNEL_PREPROCESS_CUH +#define CUSZ_KERNEL_PREPROCESS_CUH + +#include + +#include "common.hh" + +using std::cout; +using std::endl; + +namespace cusz { + +#include + +template +__global__ void log_transform() +{ + static_assert(std::is_floating_point::value, "[log_transform] must be floating-point type."); +} + +template +__global__ void binning2d(Data* input, Data* output, size_t d0, size_t d1, size_t new_d0, size_t new_d1) +{ + auto y = threadIdx.y; + auto x = threadIdx.x; + auto yid = blockIdx.y * blockDim.y + y; + auto xid = blockIdx.x * blockDim.x + x; + + __shared__ Data s[tBLK][tBLK]; + + if (yid >= new_d1 or xid >= new_d0) return; + + int xblk = (xid + 1) * DOWNSCALE_FACTOR >= d0 ? d0 - xid * DOWNSCALE_FACTOR : DOWNSCALE_FACTOR; + int yblk = (yid + 1) * DOWNSCALE_FACTOR >= d1 ? 
d1 - yid * DOWNSCALE_FACTOR : DOWNSCALE_FACTOR; + s[y][x] = 0; + + for (int j = 0; j < yblk; j++) + for (int i = 0; i < xblk; i++) + s[y][x] += input[(yid * DOWNSCALE_FACTOR + j) * d0 + (xid * DOWNSCALE_FACTOR + i)]; + + output[yid * new_d0 + xid] = s[y][x] / static_cast(yblk * xblk); +} +} // namespace cusz + +template __global__ void cusz::binning2d(float*, float*, size_t, size_t, size_t, size_t); +template __global__ void cusz::binning2d(double*, double*, size_t, size_t, size_t, size_t); +// template __global__ void cusz::binning2d(I1*, I1*, size_t, size_t, size_t, size_t); +// template __global__ void cusz::binning2d(I2*, I2*, size_t, size_t, size_t, size_t); +// template __global__ void cusz::binning2d(I4*, I4*, size_t, size_t, size_t, size_t); +// template __global__ void cusz::binning2d(I8*, I8*, size_t, size_t, size_t, size_t); + +#endif diff --git a/qtensor/compression/cusz/src/kernel/rle.cuh b/qtensor/compression/cusz/src/kernel/rle.cuh index 6f01cff4..f8fe36ed 100644 --- a/qtensor/compression/cusz/src/kernel/rle.cuh +++ b/qtensor/compression/cusz/src/kernel/rle.cuh @@ -1,74 +1,74 @@ -// modified from thrust example -// attach the license below when push to master branch -// https://github.com/NVIDIA/thrust/blob/main/LICENSE - -/** - * @file rle.cuh - * @author Jiannan Tian - * @brief - * @version 0.2 - * @date 2021-04-01 - * - * (C) 2021 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef KERNEL_RLE_CUH -#define KERNEL_RLE_CUH - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -using const_gen = thrust::constant_iterator; -using counter = thrust::counting_iterator; - -namespace kernel { - -template -void RunLengthEncoding(T* d_fullfmt_data, const size_t N, T* d_compact_data, int* d_lengths, size_t& num_runs) -{ - thrust::device_ptr input = thrust::device_pointer_cast(d_fullfmt_data); - thrust::device_ptr output = thrust::device_pointer_cast(d_compact_data); - thrust::device_ptr lengths = thrust::device_pointer_cast(d_lengths); - // compute the output size (run lengths) - num_runs = thrust::reduce_by_key( - input, input + N, // input::key (symbol) - const_gen(1), // input::value (count) - output, // output::key (symbol) - lengths) // output::value (count) - .first - - output; -} - -template -void RunLengthDecoding(T* d_fullfmt_data, const size_t N, T* d_compact_data, int* d_lengths, const size_t num_runs) -{ - thrust::device_ptr output = thrust::device_pointer_cast(d_fullfmt_data); - thrust::device_ptr input = thrust::device_pointer_cast(d_compact_data); - thrust::device_ptr lengths = thrust::device_pointer_cast(d_lengths); - - // scan the lengths - thrust::inclusive_scan(lengths, lengths + num_runs, lengths); - - // compute input index for each output element - thrust::device_vector indices(N); - thrust::lower_bound( - lengths, lengths + N, // - counter(1), counter(N + 1), // - indices.begin()); - - thrust::encode(indices.begin(), indices.end(), input, output); -} - -} // namespace kernel - -#endif +// modified from thrust example +// attach the license below when push to master branch +// https://github.com/NVIDIA/thrust/blob/main/LICENSE + +/** + * @file rle.cuh + * @author Jiannan Tian + * @brief + * @version 0.2 + * @date 2021-04-01 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef KERNEL_RLE_CUH +#define KERNEL_RLE_CUH + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +using const_gen = 
thrust::constant_iterator; +using counter = thrust::counting_iterator; + +namespace kernel { + +template +void RunLengthEncoding(T* d_fullfmt_data, const size_t N, T* d_compact_data, int* d_lengths, size_t& num_runs) +{ + thrust::device_ptr input = thrust::device_pointer_cast(d_fullfmt_data); + thrust::device_ptr output = thrust::device_pointer_cast(d_compact_data); + thrust::device_ptr lengths = thrust::device_pointer_cast(d_lengths); + // compute the output size (run lengths) + num_runs = thrust::reduce_by_key( + input, input + N, // input::key (symbol) + const_gen(1), // input::value (count) + output, // output::key (symbol) + lengths) // output::value (count) + .first - + output; +} + +template +void RunLengthDecoding(T* d_fullfmt_data, const size_t N, T* d_compact_data, int* d_lengths, const size_t num_runs) +{ + thrust::device_ptr output = thrust::device_pointer_cast(d_fullfmt_data); + thrust::device_ptr input = thrust::device_pointer_cast(d_compact_data); + thrust::device_ptr lengths = thrust::device_pointer_cast(d_lengths); + + // scan the lengths + thrust::inclusive_scan(lengths, lengths + num_runs, lengths); + + // compute input index for each output element + thrust::device_vector indices(N); + thrust::lower_bound( + lengths, lengths + N, // + counter(1), counter(N + 1), // + indices.begin()); + + thrust::encode(indices.begin(), indices.end(), input, output); +} + +} // namespace kernel + +#endif diff --git a/qtensor/compression/cusz/src/kernel/spv_gpu.cu b/qtensor/compression/cusz/src/kernel/spv_gpu.cu index 96b665a7..29bcee1c 100644 --- a/qtensor/compression/cusz/src/kernel/spv_gpu.cu +++ b/qtensor/compression/cusz/src/kernel/spv_gpu.cu @@ -1,60 +1,60 @@ -/** - * @file spv_gpu.cu - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-10-29 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "../detail/spv_gpu.inl" -#include "kernel/spv_gpu.h" -#include "kernel/spv_gpu.hh" - -#define SPV(Tliteral, Mliteral, T, M) \ - void spv_gather_T##Tliteral##_M##Mliteral( \ - T* in, size_t const in_len, T* d_val, uint32_t* d_idx, int* nnz, float* milliseconds, cudaStream_t stream) \ - { \ - psz::detail::spv_gather(in, in_len, d_val, d_idx, nnz, milliseconds, stream); \ - } \ - \ - void spv_scatter_T##Tliteral##_M##Mliteral( \ - T* d_val, uint32_t* d_idx, int const nnz, T* decoded, float* milliseconds, cudaStream_t stream) \ - { \ - psz::detail::spv_scatter(d_val, d_idx, nnz, decoded, milliseconds, stream); \ - } - -SPV(ui8, ui32, uint8_t, uint32_t) -SPV(ui16, ui32, uint16_t, uint32_t) -SPV(ui32, ui32, uint32_t, uint32_t) -SPV(ui64, ui32, uint64_t, uint32_t) -SPV(fp32, ui32, float, uint32_t) -SPV(fp64, ui32, double, uint32_t) - -#undef SPV - -#define SPV(Tliteral, Mliteral, T, M) \ - template <> \ - void psz::spv_gather( \ - T * in, size_t const in_len, T* d_val, uint32_t* d_idx, int* nnz, float* milliseconds, cudaStream_t stream) \ - { \ - spv_gather_T##Tliteral##_M##Mliteral(in, in_len, d_val, d_idx, nnz, milliseconds, stream); \ - } \ - \ - template <> \ - void psz::spv_scatter( \ - T * d_val, uint32_t * d_idx, int const nnz, T* decoded, float* milliseconds, cudaStream_t stream) \ - { \ - spv_scatter_T##Tliteral##_M##Mliteral(d_val, d_idx, nnz, decoded, milliseconds, stream); \ - } - -SPV(ui8, ui32, uint8_t, uint32_t) -SPV(ui16, ui32, uint16_t, uint32_t) -SPV(ui32, ui32, uint32_t, uint32_t) -SPV(ui64, ui32, uint64_t, uint32_t) -SPV(fp32, ui32, float, uint32_t) -SPV(fp64, ui32, double, uint32_t) - -#undef SPV +/** + * @file spv_gpu.cu + * 
@author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-29 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/spv_gpu.inl" +#include "kernel/spv_gpu.h" +#include "kernel/spv_gpu.hh" + +#define SPV(Tliteral, Mliteral, T, M) \ + void spv_gather_T##Tliteral##_M##Mliteral( \ + T* in, size_t const in_len, T* d_val, uint32_t* d_idx, int* nnz, float* milliseconds, cudaStream_t stream) \ + { \ + psz::detail::spv_gather(in, in_len, d_val, d_idx, nnz, milliseconds, stream); \ + } \ + \ + void spv_scatter_T##Tliteral##_M##Mliteral( \ + T* d_val, uint32_t* d_idx, int const nnz, T* decoded, float* milliseconds, cudaStream_t stream) \ + { \ + psz::detail::spv_scatter(d_val, d_idx, nnz, decoded, milliseconds, stream); \ + } + +SPV(ui8, ui32, uint8_t, uint32_t) +SPV(ui16, ui32, uint16_t, uint32_t) +SPV(ui32, ui32, uint32_t, uint32_t) +SPV(ui64, ui32, uint64_t, uint32_t) +SPV(fp32, ui32, float, uint32_t) +SPV(fp64, ui32, double, uint32_t) + +#undef SPV + +#define SPV(Tliteral, Mliteral, T, M) \ + template <> \ + void psz::spv_gather( \ + T * in, size_t const in_len, T* d_val, uint32_t* d_idx, int* nnz, float* milliseconds, cudaStream_t stream) \ + { \ + spv_gather_T##Tliteral##_M##Mliteral(in, in_len, d_val, d_idx, nnz, milliseconds, stream); \ + } \ + \ + template <> \ + void psz::spv_scatter( \ + T * d_val, uint32_t * d_idx, int const nnz, T* decoded, float* milliseconds, cudaStream_t stream) \ + { \ + spv_scatter_T##Tliteral##_M##Mliteral(d_val, d_idx, nnz, decoded, milliseconds, stream); \ + } + +SPV(ui8, ui32, uint8_t, uint32_t) +SPV(ui16, ui32, uint16_t, uint32_t) +SPV(ui32, ui32, uint32_t, uint32_t) +SPV(ui64, ui32, uint64_t, uint32_t) +SPV(fp32, ui32, float, uint32_t) +SPV(fp64, ui32, double, uint32_t) + +#undef SPV diff --git a/qtensor/compression/cusz/src/kernel/v2_lorenzo.cu b/qtensor/compression/cusz/src/kernel/v2_lorenzo.cu index fb2c22ed..b7263613 100644 --- a/qtensor/compression/cusz/src/kernel/v2_lorenzo.cu +++ b/qtensor/compression/cusz/src/kernel/v2_lorenzo.cu @@ -1,118 +1,118 @@ -/** - * @file v2_lorenzo.cu - * @author Jiannan Tian - * @brief - * @version 0.4 - * @date 2023-01-23 - * - * (C) 2023 by Indiana University, Argonne National Laboratory - * - */ - -#include "cusz/type.h" -#include "utils/cuda_err.cuh" -#include "utils/timer.h" - -#include "kernel/lorenzo_all.hh" -#include "kernel/v2_lorenzo.hh" - -template -cusz_error_status v2_compress_predict_lorenzo_i( - T* const data, - dim3 const len3, - double const eb, - int const radius, - E* const errctrl, - dim3 const placeholder_2, - T* const anchor, - dim3 const placeholder_1, - CompactionDRAM outlier, - float* time_elapsed, - cudaStream_t stream) -{ - auto divide3 = [](dim3 len, dim3 sublen) { - return dim3( - (len.x - 1) / sublen.x + 1, // - (len.y - 1) / sublen.y + 1, // - (len.z - 1) / sublen.z + 1); - }; - - auto ndim = [&]() { - if (len3.z == 1 and len3.y == 1) - return 1; - else if (len3.z == 1 and len3.y != 1) - return 2; - else - return 3; - }; - - constexpr auto SUBLEN_1D = 256; - constexpr auto SEQ_1D = 4; // x-sequentiality == 4 - constexpr auto BLOCK_1D = dim3(256 / 4, 1, 1); - auto GRID_1D = divide3(len3, SUBLEN_1D); - - constexpr auto SUBLEN_2D = dim3(16, 16, 1); - constexpr auto BLOCK_2D = dim3(16, 2, 1); - auto GRID_2D = divide3(len3, SUBLEN_2D); - - constexpr auto SUBLEN_3D = dim3(32, 8, 8); - constexpr auto BLOCK_3D = dim3(32, 8, 1); // for v0::r1_shfl - auto GRID_3D = divide3(len3, SUBLEN_3D); - - auto d = ndim(); - - // error bound - auto ebx2 = eb * 2; 
- auto ebx2_r = 1 / ebx2; - auto leap3 = dim3(1, len3.x, len3.x * len3.y); - - CREATE_CUDAEVENT_PAIR; - START_CUDAEVENT_RECORDING(stream); - - if (d == 1) { - psz::cuda::__kernel::v0::compaction::c_lorenzo_1d1l - <<>>(data, len3, leap3, radius, ebx2_r, errctrl, outlier); - } - else if (d == 2) { - psz::cuda::__kernel::v0::compaction::c_lorenzo_2d1l - <<>>(data, len3, leap3, radius, ebx2_r, errctrl, outlier); - } - else if (d == 3) { - psz::cuda::__kernel::v0::compaction::c_lorenzo_3d1l - <<>>(data, len3, leap3, radius, ebx2_r, errctrl, outlier); - } - - STOP_CUDAEVENT_RECORDING(stream); - CHECK_CUDA(cudaStreamSynchronize(stream)); - TIME_ELAPSED_CUDAEVENT(time_elapsed); - DESTROY_CUDAEVENT_PAIR; - - return CUSZ_SUCCESS; -} - -#define CPP_TEMPLATE_INIT_AND_C_WRAPPER(Tliteral, Eliteral, FPliteral, T, E, FP) \ - template cusz_error_status v2_compress_predict_lorenzo_i( \ - T* const, dim3 const, double const, int const, E* const, dim3 const, T* const, dim3 const, \ - struct CompactionDRAM, float*, cudaStream_t); \ - \ - // cusz_error_status v2_compress_predict_lorenzo_i_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ - // T* const data, dim3 const len3, T* const anchor, dim3 const placeholder_1, E* const errctrl, \ - // dim3 const placeholder_2, T* outlier, double const eb, int const radius, float* time_elapsed, \ - // cudaStream_t stream) \ - // { \ - // return v2_compress_predict_lorenzo_i( \ - // data, len3, eb, radius, errctrl, placeholder_2, anchor, placeholder_1, outlier, nullptr, nullptr, \ - // time_elapsed, stream); \ - // } - -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui8, fp32, float, uint8_t, float); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui16, fp32, float, uint16_t, float); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui32, fp32, float, uint32_t, float); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, fp32, fp32, float, float, float); - -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui8, fp64, double, uint8_t, double); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui16, fp64, double, uint16_t, double); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui32, fp64, double, uint32_t, double); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, fp32, fp64, double, float, double); - -#undef CPP_TEMPLATE_INIT_AND_C_WRAPPER +/** + * @file v2_lorenzo.cu + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-01-23 + * + * (C) 2023 by Indiana University, Argonne National Laboratory + * + */ + +#include "cusz/type.h" +#include "utils/cuda_err.cuh" +#include "utils/timer.h" + +#include "kernel/lorenzo_all.hh" +#include "kernel/v2_lorenzo.hh" + +template +cusz_error_status v2_compress_predict_lorenzo_i( + T* const data, + dim3 const len3, + double const eb, + int const radius, + E* const errctrl, + dim3 const placeholder_2, + T* const anchor, + dim3 const placeholder_1, + CompactionDRAM outlier, + float* time_elapsed, + cudaStream_t stream) +{ + auto divide3 = [](dim3 len, dim3 sublen) { + return dim3( + (len.x - 1) / sublen.x + 1, // + (len.y - 1) / sublen.y + 1, // + (len.z - 1) / sublen.z + 1); + }; + + auto ndim = [&]() { + if (len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + + constexpr auto SUBLEN_1D = 256; + constexpr auto SEQ_1D = 4; // x-sequentiality == 4 + constexpr auto BLOCK_1D = dim3(256 / 4, 1, 1); + auto GRID_1D = divide3(len3, SUBLEN_1D); + + constexpr auto SUBLEN_2D = dim3(16, 16, 1); + constexpr auto BLOCK_2D = dim3(16, 2, 1); + auto GRID_2D = divide3(len3, SUBLEN_2D); + + constexpr auto SUBLEN_3D = dim3(32, 8, 8); + constexpr auto BLOCK_3D = dim3(32, 
8, 1); // for v0::r1_shfl + auto GRID_3D = divide3(len3, SUBLEN_3D); + + auto d = ndim(); + + // error bound + auto ebx2 = eb * 2; + auto ebx2_r = 1 / ebx2; + auto leap3 = dim3(1, len3.x, len3.x * len3.y); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + if (d == 1) { + psz::cuda::__kernel::v0::compaction::c_lorenzo_1d1l + <<>>(data, len3, leap3, radius, ebx2_r, errctrl, outlier); + } + else if (d == 2) { + psz::cuda::__kernel::v0::compaction::c_lorenzo_2d1l + <<>>(data, len3, leap3, radius, ebx2_r, errctrl, outlier); + } + else if (d == 3) { + psz::cuda::__kernel::v0::compaction::c_lorenzo_3d1l + <<>>(data, len3, leap3, radius, ebx2_r, errctrl, outlier); + } + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + TIME_ELAPSED_CUDAEVENT(time_elapsed); + DESTROY_CUDAEVENT_PAIR; + + return CUSZ_SUCCESS; +} + +#define CPP_TEMPLATE_INIT_AND_C_WRAPPER(Tliteral, Eliteral, FPliteral, T, E, FP) \ + template cusz_error_status v2_compress_predict_lorenzo_i( \ + T* const, dim3 const, double const, int const, E* const, dim3 const, T* const, dim3 const, \ + struct CompactionDRAM, float*, cudaStream_t); \ + \ + // cusz_error_status v2_compress_predict_lorenzo_i_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + // T* const data, dim3 const len3, T* const anchor, dim3 const placeholder_1, E* const errctrl, \ + // dim3 const placeholder_2, T* outlier, double const eb, int const radius, float* time_elapsed, \ + // cudaStream_t stream) \ + // { \ + // return v2_compress_predict_lorenzo_i( \ + // data, len3, eb, radius, errctrl, placeholder_2, anchor, placeholder_1, outlier, nullptr, nullptr, \ + // time_elapsed, stream); \ + // } + +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui8, fp32, float, uint8_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui16, fp32, float, uint16_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui32, fp32, float, uint32_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, fp32, fp32, float, float, float); + +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui8, fp64, double, uint8_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui16, fp64, double, uint16_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui32, fp64, double, uint32_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, fp32, fp64, double, float, double); + +#undef CPP_TEMPLATE_INIT_AND_C_WRAPPER diff --git a/qtensor/compression/cusz/src/pipeline/v2_compressor.cc b/qtensor/compression/cusz/src/pipeline/v2_compressor.cc index 73ee3c83..a9449447 100644 --- a/qtensor/compression/cusz/src/pipeline/v2_compressor.cc +++ b/qtensor/compression/cusz/src/pipeline/v2_compressor.cc @@ -1,112 +1,112 @@ -/** - * @file v2_compressor.cc - * @author Jiannan Tian - * @brief - * @version 0.4 - * @date 2023-01-29 - * - * (C) 2023 by Indiana University, Argonne National Laboratory - * - */ - -#include "pipeline/v2_compressor.hh" -#include "common/configs.hh" -#include "framework.hh" - -namespace psz { - -template -v2_Compressor::~v2_Compressor() -{ - pimpl.reset(); -} - -template -v2_Compressor::v2_Compressor() : pimpl{std::make_unique()} -{ -} - -template -v2_Compressor::v2_Compressor(const v2_Compressor& old) : pimpl{std::make_unique(*old.pimpl)} -{ -} - -template -v2_Compressor& v2_Compressor::operator=(const v2_Compressor& old) -{ - *pimpl = *old.pimpl; - return *this; -} - -template -v2_Compressor::v2_Compressor(v2_Compressor&&) = default; - -template -v2_Compressor& v2_Compressor::operator=(v2_Compressor&&) = default; - -//------------------------------------------------------------------------------ - 
-template -void v2_Compressor::init(Context* config) -{ - pimpl->init(config); -} - -template -void v2_Compressor::init(v2_header* config) -{ - pimpl->init(config); -} - -template -void v2_Compressor::compress( - Context* config, - v2_Compressor::T* uncompressed, - BYTE*& compressed, - size_t& compressed_len, - cudaStream_t stream, - bool dbg_print) -{ - pimpl->compress(config, uncompressed, compressed, compressed_len, stream, dbg_print); -} - -template -void v2_Compressor::decompress( - v2_header* config, - BYTE* compressed, - v2_Compressor::T* decompressed, - cudaStream_t stream, - bool dbg_print) -{ - pimpl->decompress(config, compressed, decompressed, stream, dbg_print); -} - -// template -// void v2_Compressor::clear_buffer() -// { -// pimpl->clear_buffer(); -// } - -// getter - -template -void v2_Compressor::export_header(v2_header& header) -{ - pimpl->export_header(header); -} - -template -void v2_Compressor::export_header(v2_header* header) -{ - pimpl->export_header(header); -} - -// template -// void v2_Compressor::export_timerecord(TimeRecord* ext_timerecord) -// { -// pimpl->export_timerecord(ext_timerecord); -// } - -} // namespace psz - +/** + * @file v2_compressor.cc + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-01-29 + * + * (C) 2023 by Indiana University, Argonne National Laboratory + * + */ + +#include "pipeline/v2_compressor.hh" +#include "common/configs.hh" +#include "framework.hh" + +namespace psz { + +template +v2_Compressor::~v2_Compressor() +{ + pimpl.reset(); +} + +template +v2_Compressor::v2_Compressor() : pimpl{std::make_unique()} +{ +} + +template +v2_Compressor::v2_Compressor(const v2_Compressor& old) : pimpl{std::make_unique(*old.pimpl)} +{ +} + +template +v2_Compressor& v2_Compressor::operator=(const v2_Compressor& old) +{ + *pimpl = *old.pimpl; + return *this; +} + +template +v2_Compressor::v2_Compressor(v2_Compressor&&) = default; + +template +v2_Compressor& v2_Compressor::operator=(v2_Compressor&&) = default; + +//------------------------------------------------------------------------------ + +template +void v2_Compressor::init(Context* config) +{ + pimpl->init(config); +} + +template +void v2_Compressor::init(v2_header* config) +{ + pimpl->init(config); +} + +template +void v2_Compressor::compress( + Context* config, + v2_Compressor::T* uncompressed, + BYTE*& compressed, + size_t& compressed_len, + cudaStream_t stream, + bool dbg_print) +{ + pimpl->compress(config, uncompressed, compressed, compressed_len, stream, dbg_print); +} + +template +void v2_Compressor::decompress( + v2_header* config, + BYTE* compressed, + v2_Compressor::T* decompressed, + cudaStream_t stream, + bool dbg_print) +{ + pimpl->decompress(config, compressed, decompressed, stream, dbg_print); +} + +// template +// void v2_Compressor::clear_buffer() +// { +// pimpl->clear_buffer(); +// } + +// getter + +template +void v2_Compressor::export_header(v2_header& header) +{ + pimpl->export_header(header); +} + +template +void v2_Compressor::export_header(v2_header* header) +{ + pimpl->export_header(header); +} + +// template +// void v2_Compressor::export_timerecord(TimeRecord* ext_timerecord) +// { +// pimpl->export_timerecord(ext_timerecord); +// } + +} // namespace psz + template class psz::v2_Compressor>; \ No newline at end of file diff --git a/qtensor/compression/cusz/src/pipeline/v2_compressor_impl.cu b/qtensor/compression/cusz/src/pipeline/v2_compressor_impl.cu index 32eeb39d..0fcc6ebc 100644 --- a/qtensor/compression/cusz/src/pipeline/v2_compressor_impl.cu +++ 
b/qtensor/compression/cusz/src/pipeline/v2_compressor_impl.cu @@ -1,15 +1,15 @@ -/** - * @file v2_compressor_impl.cu - * @author Jiannan Tian - * @brief - * @version 0.4 - * @date 2023-01-23 - * - * (C) 2023 by Indiana University, Argonne National Laboratory - * - */ - -#include "framework.hh" -#include "v2_compressor_impl.inl" - +/** + * @file v2_compressor_impl.cu + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-01-23 + * + * (C) 2023 by Indiana University, Argonne National Laboratory + * + */ + +#include "framework.hh" +#include "v2_compressor_impl.inl" + template class psz::v2_Compressor>::impl; \ No newline at end of file diff --git a/qtensor/compression/cusz/src/pipeline/v2_compressor_impl.inl b/qtensor/compression/cusz/src/pipeline/v2_compressor_impl.inl index 2a2788f4..0dd96f91 100644 --- a/qtensor/compression/cusz/src/pipeline/v2_compressor_impl.inl +++ b/qtensor/compression/cusz/src/pipeline/v2_compressor_impl.inl @@ -1,239 +1,239 @@ -/** - * @file v2_compressor_impl.inl - * @author Jiannan Tian - * @brief - * @version 0.4 - * @date 2023-01-23 - * - * (C) 2023 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef F4D645B7_B2E3_41AB_BCFD_DCF919C4C56D -#define F4D645B7_B2E3_41AB_BCFD_DCF919C4C56D - -#include - -#include "component.hh" -#include "header.h" -#include "pipeline/v2_compressor.hh" -// #include "kernel/cpplaunch_cuda.hh" -#include "kernel/v2_lorenzo.hh" -#include "stat/stat_g.hh" -#include "utils/cuda_err.cuh" - -#include "../detail/spv_gpu.inl" -#include "../kernel/detail/lorenzo23.inl" - -#define TEMPLATE_TYPE template -#define IMPL v2_Compressor::impl - -#define ARCHIVE(VAR, FIELD) \ - if (segments[v2_header::FIELD] != 0 and VAR != nullptr) { \ - auto dst = var_archive() + header.entry[v2_header::FIELD]; \ - auto src = reinterpret_cast(VAR); \ - CHECK_CUDA(cudaMemcpyAsync(dst, src, segments[v2_header::FIELD], cudaMemcpyDeviceToDevice, stream)); \ - } - -#define ACCESS_VAR(SYM, TYPE) reinterpret_cast(in_compressed + header->entry[v2_header::SYM]) - -namespace psz { - -TEMPLATE_TYPE -IMPL::impl() -{ - codec = new Codec; - // TODO re-enable fallback codec - // fb_codec = new FallbackCodec; -} - -TEMPLATE_TYPE -void IMPL::destroy() -{ - if (codec) delete codec; - // if (fb_codec) delete codec; - - // also deallocate buffer -} - -TEMPLATE_TYPE -void IMPL::init(Context* config) { __init(config); } - -TEMPLATE_TYPE -void IMPL::init(v2_header* config) { __init(config); } - -TEMPLATE_TYPE -template -void IMPL::__init(ContextOrHeader* c) -{ - static_assert( - std::is_same::value or // - std::is_same::value, - "[v2_Compressor::impl::init] not a valid comrpessor config type."); - - auto len = c->x * c->y * c->z; - // TODO allocate anchor - - // allocate eq - cudaMalloc(&d_errctrl, len * sizeof(EQ)); // to overlap with one of vle/hf buffers - - // allocate outlier - outlier.allocate(len / sp_factor, true); - - // allocate vle/hf - codec->init(len, c->radius * 2, c->vle_pardeg); - // TODO disable fallback codec for now -} - -TEMPLATE_TYPE -void IMPL::compress( - Context* c, - T* uncompressed, - BYTE*& compressed, - size_t& compressed_len, - cudaStream_t stream, - bool dbg_print) -{ - auto const eb = c->eb; - auto const radius = c->radius; - auto const pardeg = c->vle_pardeg; - - if (dbg_print) { - printf("[dbg] eb: %lf\n", eb); - printf("[dbg] radius: %d\n", radius); - printf("[dbg] pardeg: %d\n", pardeg); - // printf("[dbg] codecs_in_use: %d\n", codecs_in_use); - printf("[dbg] sp_factor: %d\n", sp_factor); - } - - data_len3 = dim3(c->x, c->y, 
c->z); - data_len = c->x * c->y * c->z; - - header.sp.factor = sp_factor; - - BYTE* d_codec_out{nullptr}; - size_t codec_outlen{0}; - - // size_t sublen; - auto booklen = radius * 2; - - /******************************************************************************/ - - // TODO version clarification - // with compaction - v2_compress_predict_lorenzo_i( - uncompressed, data_len3, eb, radius, d_errctrl, dim3(1, 1, 1), d_anchor, dim3(1, 1, 1), outlier, - &comp_time.construct, stream); - - outlier.make_count_host_accessible(stream); - - asz::stat::histogram(d_errctrl, data_len, d_freq, booklen, &comp_time.hist, stream); - - CHECK_CUDA(cudaStreamSynchronize(stream)); - - // TODO overlapping memory - codec->encode(d_errctrl, data_len, d_codec_out, codec_outlen, stream); - - CHECK_CUDA(cudaStreamSynchronize(stream)); - - // update header - { - header.x = c->x, header.y = c->y, header.z = c->z, header.w = 1; - header.sp.count = outlier.access_count_on_host(); - // TODO the new - { - // header.config.radius = radius, header.config.eb = eb; - // header.hf.pardeg = pardeg; - } - - // the compat - { - header.radius = radius, header.eb = eb; - header.vle_pardeg = pardeg; - } - - // header.byte_vle = 4; // regardless of fallback codec - }; - - size_t segments[v2_header::END] = {0}; - - // gather archive - { - // calculate offsets - segments[v2_header::HEADER] = sizeof(v2_header); - segments[v2_header::ANCHOR] = 0; // placeholder - segments[v2_header::SP_IDX] = outlier.access_count_on_host() * sizeof(IDX); - segments[v2_header::SP_VAL] = outlier.access_count_on_host() * sizeof(T); - segments[v2_header::HF] = codec_outlen; - - header.entry[0] = 0; - for (auto i = 1; i < v2_header::END + 1; i++) { header.entry[i] = segments[i - 1]; } - for (auto i = 1; i < v2_header::END + 1; i++) { header.entry[i] += header.entry[i - 1]; } - - CHECK_CUDA(cudaStreamSynchronize(stream)); - - // memcpy - ARCHIVE(d_anchor, ANCHOR); - ARCHIVE(outlier.idx, SP_IDX); - ARCHIVE(outlier.val, SP_VAL); - ARCHIVE(d_codec_out, HF); - - CHECK_CUDA(cudaStreamSynchronize(stream)); - } - - // output - compressed_len = header.entry[v2_header::END]; - compressed = var_archive(); - - // collect_compress_timerecord(); -} - -TEMPLATE_TYPE -void IMPL::decompress(v2_header* header, BYTE* in_compressed, T* out_decompressed, cudaStream_t stream, bool dbg_print) -{ - // TODO host having copy of header when compressing - if (not header) { - header = new v2_header; - CHECK_CUDA(cudaMemcpyAsync(header, in_compressed, sizeof(v2_header), cudaMemcpyDeviceToHost, stream)); - CHECK_CUDA(cudaStreamSynchronize(stream)); - } - - data_len3 = dim3(header->x, header->y, header->z); - - // use_fallback_codec = header->byte_vle == 8; - // auto const vle_pardeg = header->hf.pardeg; - - // The inputs of components are from `compressed`. 
- // auto d_anchor = ACCESS_VAR(ANCHOR, T); - auto d_vle = ACCESS_VAR(HF, BYTE); - auto d_spidx = ACCESS_VAR(SP_IDX, IDX); - auto d_spval = ACCESS_VAR(SP_VAL, T); - - // wire and aliasing - auto d_outlier = out_decompressed; - auto d_xdata = out_decompressed; - - psz::detail::spv_scatter(d_spval, d_spidx, header->sp.count, d_outlier, &decomp_time.scatter, stream); - - codec->decode(d_vle, d_errctrl); - - decompress_predict_lorenzo_i( - d_errctrl, data_len3, // - d_outlier, // - nullptr, 0, // TODO remove - header->eb, header->radius, - d_xdata, // output - &decomp_time.reconstruct, stream); - - // collect_decompress_timerecord(); - - // clear state for the next decompression after reporting - // use_fallback_codec = false; -} - -} // namespace psz - -#undef TEMPLATE_TYPE -#undef IMPL - -#endif /* F4D645B7_B2E3_41AB_BCFD_DCF919C4C56D */ +/** + * @file v2_compressor_impl.inl + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-01-23 + * + * (C) 2023 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef F4D645B7_B2E3_41AB_BCFD_DCF919C4C56D +#define F4D645B7_B2E3_41AB_BCFD_DCF919C4C56D + +#include + +#include "component.hh" +#include "header.h" +#include "pipeline/v2_compressor.hh" +// #include "kernel/cpplaunch_cuda.hh" +#include "kernel/v2_lorenzo.hh" +#include "stat/stat_g.hh" +#include "utils/cuda_err.cuh" + +#include "../detail/spv_gpu.inl" +#include "../kernel/detail/lorenzo23.inl" + +#define TEMPLATE_TYPE template +#define IMPL v2_Compressor::impl + +#define ARCHIVE(VAR, FIELD) \ + if (segments[v2_header::FIELD] != 0 and VAR != nullptr) { \ + auto dst = var_archive() + header.entry[v2_header::FIELD]; \ + auto src = reinterpret_cast(VAR); \ + CHECK_CUDA(cudaMemcpyAsync(dst, src, segments[v2_header::FIELD], cudaMemcpyDeviceToDevice, stream)); \ + } + +#define ACCESS_VAR(SYM, TYPE) reinterpret_cast(in_compressed + header->entry[v2_header::SYM]) + +namespace psz { + +TEMPLATE_TYPE +IMPL::impl() +{ + codec = new Codec; + // TODO re-enable fallback codec + // fb_codec = new FallbackCodec; +} + +TEMPLATE_TYPE +void IMPL::destroy() +{ + if (codec) delete codec; + // if (fb_codec) delete codec; + + // also deallocate buffer +} + +TEMPLATE_TYPE +void IMPL::init(Context* config) { __init(config); } + +TEMPLATE_TYPE +void IMPL::init(v2_header* config) { __init(config); } + +TEMPLATE_TYPE +template +void IMPL::__init(ContextOrHeader* c) +{ + static_assert( + std::is_same::value or // + std::is_same::value, + "[v2_Compressor::impl::init] not a valid comrpessor config type."); + + auto len = c->x * c->y * c->z; + // TODO allocate anchor + + // allocate eq + cudaMalloc(&d_errctrl, len * sizeof(EQ)); // to overlap with one of vle/hf buffers + + // allocate outlier + outlier.allocate(len / sp_factor, true); + + // allocate vle/hf + codec->init(len, c->radius * 2, c->vle_pardeg); + // TODO disable fallback codec for now +} + +TEMPLATE_TYPE +void IMPL::compress( + Context* c, + T* uncompressed, + BYTE*& compressed, + size_t& compressed_len, + cudaStream_t stream, + bool dbg_print) +{ + auto const eb = c->eb; + auto const radius = c->radius; + auto const pardeg = c->vle_pardeg; + + if (dbg_print) { + printf("[dbg] eb: %lf\n", eb); + printf("[dbg] radius: %d\n", radius); + printf("[dbg] pardeg: %d\n", pardeg); + // printf("[dbg] codecs_in_use: %d\n", codecs_in_use); + printf("[dbg] sp_factor: %d\n", sp_factor); + } + + data_len3 = dim3(c->x, c->y, c->z); + data_len = c->x * c->y * c->z; + + header.sp.factor = sp_factor; + + BYTE* d_codec_out{nullptr}; + size_t codec_outlen{0}; 
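+    // editorial annotation (not part of the upstream cuSZ sources): the calls below form the
+    // v2 compression pipeline: (1) Lorenzo prediction/quantization with outlier compaction,
+    // (2) a histogram of the error-control codes over booklen = 2*radius buckets,
+    // (3) VLE/Huffman encoding of those codes, and (4) header bookkeeping plus gathering of
+    // the anchor/outlier/encoded segments into a single archive via the ARCHIVE macro.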
+ + // size_t sublen; + auto booklen = radius * 2; + + /******************************************************************************/ + + // TODO version clarification + // with compaction + v2_compress_predict_lorenzo_i( + uncompressed, data_len3, eb, radius, d_errctrl, dim3(1, 1, 1), d_anchor, dim3(1, 1, 1), outlier, + &comp_time.construct, stream); + + outlier.make_count_host_accessible(stream); + + asz::stat::histogram(d_errctrl, data_len, d_freq, booklen, &comp_time.hist, stream); + + CHECK_CUDA(cudaStreamSynchronize(stream)); + + // TODO overlapping memory + codec->encode(d_errctrl, data_len, d_codec_out, codec_outlen, stream); + + CHECK_CUDA(cudaStreamSynchronize(stream)); + + // update header + { + header.x = c->x, header.y = c->y, header.z = c->z, header.w = 1; + header.sp.count = outlier.access_count_on_host(); + // TODO the new + { + // header.config.radius = radius, header.config.eb = eb; + // header.hf.pardeg = pardeg; + } + + // the compat + { + header.radius = radius, header.eb = eb; + header.vle_pardeg = pardeg; + } + + // header.byte_vle = 4; // regardless of fallback codec + }; + + size_t segments[v2_header::END] = {0}; + + // gather archive + { + // calculate offsets + segments[v2_header::HEADER] = sizeof(v2_header); + segments[v2_header::ANCHOR] = 0; // placeholder + segments[v2_header::SP_IDX] = outlier.access_count_on_host() * sizeof(IDX); + segments[v2_header::SP_VAL] = outlier.access_count_on_host() * sizeof(T); + segments[v2_header::HF] = codec_outlen; + + header.entry[0] = 0; + for (auto i = 1; i < v2_header::END + 1; i++) { header.entry[i] = segments[i - 1]; } + for (auto i = 1; i < v2_header::END + 1; i++) { header.entry[i] += header.entry[i - 1]; } + + CHECK_CUDA(cudaStreamSynchronize(stream)); + + // memcpy + ARCHIVE(d_anchor, ANCHOR); + ARCHIVE(outlier.idx, SP_IDX); + ARCHIVE(outlier.val, SP_VAL); + ARCHIVE(d_codec_out, HF); + + CHECK_CUDA(cudaStreamSynchronize(stream)); + } + + // output + compressed_len = header.entry[v2_header::END]; + compressed = var_archive(); + + // collect_compress_timerecord(); +} + +TEMPLATE_TYPE +void IMPL::decompress(v2_header* header, BYTE* in_compressed, T* out_decompressed, cudaStream_t stream, bool dbg_print) +{ + // TODO host having copy of header when compressing + if (not header) { + header = new v2_header; + CHECK_CUDA(cudaMemcpyAsync(header, in_compressed, sizeof(v2_header), cudaMemcpyDeviceToHost, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + } + + data_len3 = dim3(header->x, header->y, header->z); + + // use_fallback_codec = header->byte_vle == 8; + // auto const vle_pardeg = header->hf.pardeg; + + // The inputs of components are from `compressed`. 
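+    // editorial annotation (not part of the upstream cuSZ sources): decompression mirrors the
+    // pipeline above in reverse: scatter the sparse outlier values back into the output buffer,
+    // decode the VLE/Huffman stream into error-control codes, then run the inverse Lorenzo
+    // predictor to reconstruct the field in place (d_outlier and d_xdata alias out_decompressed).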
+ // auto d_anchor = ACCESS_VAR(ANCHOR, T); + auto d_vle = ACCESS_VAR(HF, BYTE); + auto d_spidx = ACCESS_VAR(SP_IDX, IDX); + auto d_spval = ACCESS_VAR(SP_VAL, T); + + // wire and aliasing + auto d_outlier = out_decompressed; + auto d_xdata = out_decompressed; + + psz::detail::spv_scatter(d_spval, d_spidx, header->sp.count, d_outlier, &decomp_time.scatter, stream); + + codec->decode(d_vle, d_errctrl); + + decompress_predict_lorenzo_i( + d_errctrl, data_len3, // + d_outlier, // + nullptr, 0, // TODO remove + header->eb, header->radius, + d_xdata, // output + &decomp_time.reconstruct, stream); + + // collect_decompress_timerecord(); + + // clear state for the next decompression after reporting + // use_fallback_codec = false; +} + +} // namespace psz + +#undef TEMPLATE_TYPE +#undef IMPL + +#endif /* F4D645B7_B2E3_41AB_BCFD_DCF919C4C56D */ diff --git a/qtensor/compression/cusz/src/stat/cmpg1_1.cu b/qtensor/compression/cusz/src/stat/cmpg1_1.cu index ccf91661..a32a02eb 100644 --- a/qtensor/compression/cusz/src/stat/cmpg1_1.cu +++ b/qtensor/compression/cusz/src/stat/cmpg1_1.cu @@ -1,30 +1,30 @@ -/** - * @file cmpg1.cu - * @author Jiannan Tian - * @brief (split to speed up buid process; part 1) - * @version 0.3 - * @date 2022-10-09 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "../detail/compare_gpu.inl" -#include "stat/compare.h" -#include "stat/compare_gpu.hh" - -#define THRUSTGPU_DESCRIPTION(Tliteral, T) \ - void thrustgpu_get_extrema_rawptr_T##Tliteral(T* d_ptr, size_t len, T res[4]) \ - { \ - psz::detail::thrustgpu_get_extrema_rawptr(d_ptr, len, res); \ - } \ - \ - template <> \ - void psz::thrustgpu_get_extrema_rawptr(T* d_ptr, size_t len, T res[4]) \ - { \ - thrustgpu_get_extrema_rawptr_T##Tliteral(d_ptr, len, res); \ - } - -THRUSTGPU_DESCRIPTION(ui8, uint8_t) - -#undef THRUSTGPU_DESCRIPTION +/** + * @file cmpg1.cu + * @author Jiannan Tian + * @brief (split to speed up buid process; part 1) + * @version 0.3 + * @date 2022-10-09 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_DESCRIPTION(Tliteral, T) \ + void thrustgpu_get_extrema_rawptr_T##Tliteral(T* d_ptr, size_t len, T res[4]) \ + { \ + psz::detail::thrustgpu_get_extrema_rawptr(d_ptr, len, res); \ + } \ + \ + template <> \ + void psz::thrustgpu_get_extrema_rawptr(T* d_ptr, size_t len, T res[4]) \ + { \ + thrustgpu_get_extrema_rawptr_T##Tliteral(d_ptr, len, res); \ + } + +THRUSTGPU_DESCRIPTION(ui8, uint8_t) + +#undef THRUSTGPU_DESCRIPTION diff --git a/qtensor/compression/cusz/src/stat/cmpg1_2.cu b/qtensor/compression/cusz/src/stat/cmpg1_2.cu index 8b44a9e6..b85c6477 100644 --- a/qtensor/compression/cusz/src/stat/cmpg1_2.cu +++ b/qtensor/compression/cusz/src/stat/cmpg1_2.cu @@ -1,30 +1,30 @@ -/** - * @file cmpg1_2.cu - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-11-03 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "../detail/compare_gpu.inl" -#include "stat/compare.h" -#include "stat/compare_gpu.hh" - -#define THRUSTGPU_DESCRIPTION(Tliteral, T) \ - void thrustgpu_get_extrema_rawptr_T##Tliteral(T* d_ptr, size_t len, T res[4]) \ - { \ - psz::detail::thrustgpu_get_extrema_rawptr(d_ptr, len, res); \ - } \ - \ - template <> \ - void psz::thrustgpu_get_extrema_rawptr(T* d_ptr, size_t len, T res[4]) \ - { \ - thrustgpu_get_extrema_rawptr_T##Tliteral(d_ptr, len, res); \ - } - 
-THRUSTGPU_DESCRIPTION(ui16, uint16_t) - +/** + * @file cmpg1_2.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_DESCRIPTION(Tliteral, T) \ + void thrustgpu_get_extrema_rawptr_T##Tliteral(T* d_ptr, size_t len, T res[4]) \ + { \ + psz::detail::thrustgpu_get_extrema_rawptr(d_ptr, len, res); \ + } \ + \ + template <> \ + void psz::thrustgpu_get_extrema_rawptr(T* d_ptr, size_t len, T res[4]) \ + { \ + thrustgpu_get_extrema_rawptr_T##Tliteral(d_ptr, len, res); \ + } + +THRUSTGPU_DESCRIPTION(ui16, uint16_t) + #undef THRUSTGPU_DESCRIPTION \ No newline at end of file diff --git a/qtensor/compression/cusz/src/stat/cmpg1_3.cu b/qtensor/compression/cusz/src/stat/cmpg1_3.cu index 169741bc..a68f760c 100644 --- a/qtensor/compression/cusz/src/stat/cmpg1_3.cu +++ b/qtensor/compression/cusz/src/stat/cmpg1_3.cu @@ -1,30 +1,30 @@ -/** - * @file cmpg1_3.cu - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-11-03 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "../detail/compare_gpu.inl" -#include "stat/compare.h" -#include "stat/compare_gpu.hh" - -#define THRUSTGPU_DESCRIPTION(Tliteral, T) \ - void thrustgpu_get_extrema_rawptr_T##Tliteral(T* d_ptr, size_t len, T res[4]) \ - { \ - psz::detail::thrustgpu_get_extrema_rawptr(d_ptr, len, res); \ - } \ - \ - template <> \ - void psz::thrustgpu_get_extrema_rawptr(T* d_ptr, size_t len, T res[4]) \ - { \ - thrustgpu_get_extrema_rawptr_T##Tliteral(d_ptr, len, res); \ - } - -THRUSTGPU_DESCRIPTION(ui32, uint32_t) - +/** + * @file cmpg1_3.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_DESCRIPTION(Tliteral, T) \ + void thrustgpu_get_extrema_rawptr_T##Tliteral(T* d_ptr, size_t len, T res[4]) \ + { \ + psz::detail::thrustgpu_get_extrema_rawptr(d_ptr, len, res); \ + } \ + \ + template <> \ + void psz::thrustgpu_get_extrema_rawptr(T* d_ptr, size_t len, T res[4]) \ + { \ + thrustgpu_get_extrema_rawptr_T##Tliteral(d_ptr, len, res); \ + } + +THRUSTGPU_DESCRIPTION(ui32, uint32_t) + #undef THRUSTGPU_DESCRIPTION \ No newline at end of file diff --git a/qtensor/compression/cusz/src/stat/cmpg1_4.cu b/qtensor/compression/cusz/src/stat/cmpg1_4.cu index 4ec93b20..47dcc774 100644 --- a/qtensor/compression/cusz/src/stat/cmpg1_4.cu +++ b/qtensor/compression/cusz/src/stat/cmpg1_4.cu @@ -1,30 +1,30 @@ -/** - * @file cmpg1_4.cu - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-11-03 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "../detail/compare_gpu.inl" -#include "stat/compare.h" -#include "stat/compare_gpu.hh" - -#define THRUSTGPU_DESCRIPTION(Tliteral, T) \ - void thrustgpu_get_extrema_rawptr_T##Tliteral(T* d_ptr, size_t len, T res[4]) \ - { \ - psz::detail::thrustgpu_get_extrema_rawptr(d_ptr, len, res); \ - } \ - \ - template <> \ - void psz::thrustgpu_get_extrema_rawptr(T* d_ptr, size_t len, T res[4]) \ - { \ - thrustgpu_get_extrema_rawptr_T##Tliteral(d_ptr, len, res); \ - } - -THRUSTGPU_DESCRIPTION(fp32, float) - +/** + * @file cmpg1_4.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 
by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_DESCRIPTION(Tliteral, T) \ + void thrustgpu_get_extrema_rawptr_T##Tliteral(T* d_ptr, size_t len, T res[4]) \ + { \ + psz::detail::thrustgpu_get_extrema_rawptr(d_ptr, len, res); \ + } \ + \ + template <> \ + void psz::thrustgpu_get_extrema_rawptr(T* d_ptr, size_t len, T res[4]) \ + { \ + thrustgpu_get_extrema_rawptr_T##Tliteral(d_ptr, len, res); \ + } + +THRUSTGPU_DESCRIPTION(fp32, float) + #undef THRUSTGPU_DESCRIPTION \ No newline at end of file diff --git a/qtensor/compression/cusz/src/stat/cmpg1_5.cu b/qtensor/compression/cusz/src/stat/cmpg1_5.cu index 3b08e576..5828860d 100644 --- a/qtensor/compression/cusz/src/stat/cmpg1_5.cu +++ b/qtensor/compression/cusz/src/stat/cmpg1_5.cu @@ -1,30 +1,30 @@ -/** - * @file cmpg1_5.cu - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-11-03 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "../detail/compare_gpu.inl" -#include "stat/compare.h" -#include "stat/compare_gpu.hh" - -#define THRUSTGPU_DESCRIPTION(Tliteral, T) \ - void thrustgpu_get_extrema_rawptr_T##Tliteral(T* d_ptr, size_t len, T res[4]) \ - { \ - psz::detail::thrustgpu_get_extrema_rawptr(d_ptr, len, res); \ - } \ - \ - template <> \ - void psz::thrustgpu_get_extrema_rawptr(T* d_ptr, size_t len, T res[4]) \ - { \ - thrustgpu_get_extrema_rawptr_T##Tliteral(d_ptr, len, res); \ - } - -THRUSTGPU_DESCRIPTION(fp64, double) - +/** + * @file cmpg1_5.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_DESCRIPTION(Tliteral, T) \ + void thrustgpu_get_extrema_rawptr_T##Tliteral(T* d_ptr, size_t len, T res[4]) \ + { \ + psz::detail::thrustgpu_get_extrema_rawptr(d_ptr, len, res); \ + } \ + \ + template <> \ + void psz::thrustgpu_get_extrema_rawptr(T* d_ptr, size_t len, T res[4]) \ + { \ + thrustgpu_get_extrema_rawptr_T##Tliteral(d_ptr, len, res); \ + } + +THRUSTGPU_DESCRIPTION(fp64, double) + #undef THRUSTGPU_DESCRIPTION \ No newline at end of file diff --git a/qtensor/compression/cusz/src/stat/cmpg2.cu b/qtensor/compression/cusz/src/stat/cmpg2.cu index 0ece52b5..a8bdcd29 100644 --- a/qtensor/compression/cusz/src/stat/cmpg2.cu +++ b/qtensor/compression/cusz/src/stat/cmpg2.cu @@ -1,34 +1,34 @@ -/** - * @file cmp2g.cu - * @author Jiannan Tian - * @brief (split to speed up buid process; part 2) - * @version 0.3 - * @date 2022-11-03 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "../detail/compare_gpu.inl" -#include "stat/compare.h" -#include "stat/compare_gpu.hh" - -#define THRUSTGPU_COMPARE_LOSSLESS(Tliteral, T) \ - bool thrustgpu_identical_T##Tliteral(T* d1, T* d2, size_t const len) \ - { \ - return psz::detail::thrustgpu_identical(d1, d2, len); \ - } \ - \ - template <> \ - bool psz::thrustgpu_identical(T * d1, T * d2, size_t const len) \ - { \ - return thrustgpu_identical_T##Tliteral(d1, d2, len); \ - } - -THRUSTGPU_COMPARE_LOSSLESS(fp32, float) -THRUSTGPU_COMPARE_LOSSLESS(fp64, double) -THRUSTGPU_COMPARE_LOSSLESS(ui8, uint8_t) -THRUSTGPU_COMPARE_LOSSLESS(ui16, uint16_t) -THRUSTGPU_COMPARE_LOSSLESS(ui32, uint32_t) - -#undef THRUSTGPU_COMPARE_LOSSLESS +/** + * @file cmp2g.cu + * @author Jiannan Tian + * @brief 
(split to speed up buid process; part 2) + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_COMPARE_LOSSLESS(Tliteral, T) \ + bool thrustgpu_identical_T##Tliteral(T* d1, T* d2, size_t const len) \ + { \ + return psz::detail::thrustgpu_identical(d1, d2, len); \ + } \ + \ + template <> \ + bool psz::thrustgpu_identical(T * d1, T * d2, size_t const len) \ + { \ + return thrustgpu_identical_T##Tliteral(d1, d2, len); \ + } + +THRUSTGPU_COMPARE_LOSSLESS(fp32, float) +THRUSTGPU_COMPARE_LOSSLESS(fp64, double) +THRUSTGPU_COMPARE_LOSSLESS(ui8, uint8_t) +THRUSTGPU_COMPARE_LOSSLESS(ui16, uint16_t) +THRUSTGPU_COMPARE_LOSSLESS(ui32, uint32_t) + +#undef THRUSTGPU_COMPARE_LOSSLESS diff --git a/qtensor/compression/cusz/src/stat/cmpg3.cu b/qtensor/compression/cusz/src/stat/cmpg3.cu index 05c7af97..61f71f13 100644 --- a/qtensor/compression/cusz/src/stat/cmpg3.cu +++ b/qtensor/compression/cusz/src/stat/cmpg3.cu @@ -1,32 +1,32 @@ -/** - * @file cmp3g.cu - * @author Jiannan Tian - * @brief (split to speed up buid process; part 3) - * @version 0.3 - * @date 2022-11-03 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "../detail/compare_gpu.inl" -#include "stat/compare.h" -#include "stat/compare_gpu.hh" - -#define THRUSTGPU_COMPARE_LOSSY(Tliteral, T) \ - bool thrustgpu_error_bounded_T##Tliteral( \ - T* a, T* b, size_t const len, double const eb, size_t* first_faulty_idx = nullptr) \ - { \ - return psz::detail::thrustgpu_error_bounded(a, b, len, eb, first_faulty_idx); \ - } \ - \ - template <> \ - bool psz::thrustgpu_error_bounded(T * a, T * b, size_t const len, double const eb, size_t* first_faulty_idx) \ - { \ - return thrustgpu_error_bounded_T##Tliteral(a, b, len, eb, first_faulty_idx); \ - } - -THRUSTGPU_COMPARE_LOSSY(fp32, float); -THRUSTGPU_COMPARE_LOSSY(fp64, double); - -#undef THRUSTGPU_COMPARE_LOSSY +/** + * @file cmp3g.cu + * @author Jiannan Tian + * @brief (split to speed up buid process; part 3) + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_COMPARE_LOSSY(Tliteral, T) \ + bool thrustgpu_error_bounded_T##Tliteral( \ + T* a, T* b, size_t const len, double const eb, size_t* first_faulty_idx = nullptr) \ + { \ + return psz::detail::thrustgpu_error_bounded(a, b, len, eb, first_faulty_idx); \ + } \ + \ + template <> \ + bool psz::thrustgpu_error_bounded(T * a, T * b, size_t const len, double const eb, size_t* first_faulty_idx) \ + { \ + return thrustgpu_error_bounded_T##Tliteral(a, b, len, eb, first_faulty_idx); \ + } + +THRUSTGPU_COMPARE_LOSSY(fp32, float); +THRUSTGPU_COMPARE_LOSSY(fp64, double); + +#undef THRUSTGPU_COMPARE_LOSSY diff --git a/qtensor/compression/cusz/src/stat/cmpg4_1.cu b/qtensor/compression/cusz/src/stat/cmpg4_1.cu index b3e5edaf..34d74884 100644 --- a/qtensor/compression/cusz/src/stat/cmpg4_1.cu +++ b/qtensor/compression/cusz/src/stat/cmpg4_1.cu @@ -1,24 +1,24 @@ -/** - * @file cmpg4_1.cu - * @author Jiannan Tian - * @brief (split to speed up buid process; part 4) - * @version 0.3 - * @date 2022-11-03 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "../detail/compare_gpu.inl" -#include "stat/compare.h" -#include "stat/compare_gpu.hh" - -#define 
THRUSTGPU_ASSESS(Tliteral, T) \ - void thrustgpu_assess_quality_T##Tliteral(cusz_stats* s, T* xdata, T* odata, size_t const len) \ - { \ - psz::detail::thrustgpu_assess_quality(s, xdata, odata, len); \ - } - -THRUSTGPU_ASSESS(fp32, float); - -#undef THRUSTGPU_ASSESS +/** + * @file cmpg4_1.cu + * @author Jiannan Tian + * @brief (split to speed up buid process; part 4) + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_ASSESS(Tliteral, T) \ + void thrustgpu_assess_quality_T##Tliteral(cusz_stats* s, T* xdata, T* odata, size_t const len) \ + { \ + psz::detail::thrustgpu_assess_quality(s, xdata, odata, len); \ + } + +THRUSTGPU_ASSESS(fp32, float); + +#undef THRUSTGPU_ASSESS diff --git a/qtensor/compression/cusz/src/stat/cmpg4_2.cu b/qtensor/compression/cusz/src/stat/cmpg4_2.cu index 7a62b06d..73dcde1f 100644 --- a/qtensor/compression/cusz/src/stat/cmpg4_2.cu +++ b/qtensor/compression/cusz/src/stat/cmpg4_2.cu @@ -1,25 +1,25 @@ -/** - * @file cmpg4_2.cu - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-11-03 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "../detail/compare_gpu.inl" -#include "stat/compare.h" -#include "stat/compare_gpu.hh" - -#define THRUSTGPU_ASSESS(Tliteral, T) \ - template <> \ - void psz::thrustgpu_assess_quality(cusz_stats * s, T * xdata, T * odata, size_t const len) \ - { \ - thrustgpu_assess_quality_T##Tliteral(s, xdata, odata, len); \ - } - -THRUSTGPU_ASSESS(fp32, float); - -#undef THRUSTGPU_ASSESS +/** + * @file cmpg4_2.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_ASSESS(Tliteral, T) \ + template <> \ + void psz::thrustgpu_assess_quality(cusz_stats * s, T * xdata, T * odata, size_t const len) \ + { \ + thrustgpu_assess_quality_T##Tliteral(s, xdata, odata, len); \ + } + +THRUSTGPU_ASSESS(fp32, float); + +#undef THRUSTGPU_ASSESS diff --git a/qtensor/compression/cusz/src/stat/cmpg4_3.cu b/qtensor/compression/cusz/src/stat/cmpg4_3.cu index b9361bfb..bbca7c6c 100644 --- a/qtensor/compression/cusz/src/stat/cmpg4_3.cu +++ b/qtensor/compression/cusz/src/stat/cmpg4_3.cu @@ -1,24 +1,24 @@ -/** - * @file cmpg4_3.cu - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-11-03 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "../detail/compare_gpu.inl" -#include "stat/compare.h" -#include "stat/compare_gpu.hh" - -#define THRUSTGPU_ASSESS(Tliteral, T) \ - void thrustgpu_assess_quality_T##Tliteral(cusz_stats* s, T* xdata, T* odata, size_t const len) \ - { \ - psz::detail::thrustgpu_assess_quality(s, xdata, odata, len); \ - } - -THRUSTGPU_ASSESS(fp64, double); - +/** + * @file cmpg4_3.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_ASSESS(Tliteral, T) \ + void thrustgpu_assess_quality_T##Tliteral(cusz_stats* s, T* xdata, T* odata, size_t const len) \ + { \ + psz::detail::thrustgpu_assess_quality(s, xdata, odata, len); \ + } + +THRUSTGPU_ASSESS(fp64, 
double); + #undef THRUSTGPU_ASSESS \ No newline at end of file diff --git a/qtensor/compression/cusz/src/stat/cmpg4_4.cu b/qtensor/compression/cusz/src/stat/cmpg4_4.cu index 4df3919f..d60b8b97 100644 --- a/qtensor/compression/cusz/src/stat/cmpg4_4.cu +++ b/qtensor/compression/cusz/src/stat/cmpg4_4.cu @@ -1,25 +1,25 @@ -/** - * @file cmpg4_4.cu - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-11-03 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "../detail/compare_gpu.inl" -#include "stat/compare.h" -#include "stat/compare_gpu.hh" - -#define THRUSTGPU_ASSESS(Tliteral, T) \ - template <> \ - void psz::thrustgpu_assess_quality(cusz_stats * s, T * xdata, T * odata, size_t const len) \ - { \ - thrustgpu_assess_quality_T##Tliteral(s, xdata, odata, len); \ - } - -THRUSTGPU_ASSESS(fp64, double); - +/** + * @file cmpg4_4.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_ASSESS(Tliteral, T) \ + template <> \ + void psz::thrustgpu_assess_quality(cusz_stats * s, T * xdata, T * odata, size_t const len) \ + { \ + thrustgpu_assess_quality_T##Tliteral(s, xdata, odata, len); \ + } + +THRUSTGPU_ASSESS(fp64, double); + #undef THRUSTGPU_ASSESS \ No newline at end of file diff --git a/qtensor/compression/cusz/src/stat/compare_cpu.cc b/qtensor/compression/cusz/src/stat/compare_cpu.cc index c9432bb4..8a22dbe3 100644 --- a/qtensor/compression/cusz/src/stat/compare_cpu.cc +++ b/qtensor/compression/cusz/src/stat/compare_cpu.cc @@ -1,43 +1,43 @@ -/** - * @file _compare.cc - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-10-09 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "../detail/compare_cpu.inl" -#include "stat/compare.h" - -#define CPPSTD_COMPARE_LOSSLESS(Tliteral, T) \ - bool cppstd_identical_T##Tliteral(T* d1, T* d2, size_t const len) \ - { \ - return psz::detail::cppstd_identical(d1, d2, len); \ - } - -#define CPPSTD_COMPARE_LOSSY(Tliteral, T) \ - bool cppstd_error_bounded_T##Tliteral( \ - T* a, T* b, size_t const len, double const eb, size_t* first_faulty_idx = nullptr) \ - { \ - return psz::detail::cppstd_error_bounded(a, b, len, eb, first_faulty_idx); \ - } \ - \ - void cppstd_assess_quality_T##Tliteral(cusz_stats* s, T* xdata, T* odata, size_t const len) \ - { \ - psz::detail::cppstd_assess_quality(s, xdata, odata, len); \ - } - -CPPSTD_COMPARE_LOSSLESS(fp32, float) -CPPSTD_COMPARE_LOSSLESS(fp64, double) -CPPSTD_COMPARE_LOSSLESS(ui8, uint8_t) -CPPSTD_COMPARE_LOSSLESS(ui16, uint16_t) -CPPSTD_COMPARE_LOSSLESS(ui32, uint32_t) - -CPPSTD_COMPARE_LOSSY(fp32, float) -CPPSTD_COMPARE_LOSSY(fp64, double) - -#undef CPPSTD_COMPARE_LOSSLESS -#undef CPPSTD_COMPARE_LOSSY +/** + * @file _compare.cc + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-09 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_cpu.inl" +#include "stat/compare.h" + +#define CPPSTD_COMPARE_LOSSLESS(Tliteral, T) \ + bool cppstd_identical_T##Tliteral(T* d1, T* d2, size_t const len) \ + { \ + return psz::detail::cppstd_identical(d1, d2, len); \ + } + +#define CPPSTD_COMPARE_LOSSY(Tliteral, T) \ + bool cppstd_error_bounded_T##Tliteral( \ + T* a, T* b, size_t const len, double const eb, size_t* first_faulty_idx = nullptr) \ + { \ + return 
psz::detail::cppstd_error_bounded(a, b, len, eb, first_faulty_idx); \ + } \ + \ + void cppstd_assess_quality_T##Tliteral(cusz_stats* s, T* xdata, T* odata, size_t const len) \ + { \ + psz::detail::cppstd_assess_quality(s, xdata, odata, len); \ + } + +CPPSTD_COMPARE_LOSSLESS(fp32, float) +CPPSTD_COMPARE_LOSSLESS(fp64, double) +CPPSTD_COMPARE_LOSSLESS(ui8, uint8_t) +CPPSTD_COMPARE_LOSSLESS(ui16, uint16_t) +CPPSTD_COMPARE_LOSSLESS(ui32, uint32_t) + +CPPSTD_COMPARE_LOSSY(fp32, float) +CPPSTD_COMPARE_LOSSY(fp64, double) + +#undef CPPSTD_COMPARE_LOSSLESS +#undef CPPSTD_COMPARE_LOSSY diff --git a/qtensor/compression/cusz/src/stat/stat_g.cu b/qtensor/compression/cusz/src/stat/stat_g.cu index 2fcc81c6..c3c18c12 100644 --- a/qtensor/compression/cusz/src/stat/stat_g.cu +++ b/qtensor/compression/cusz/src/stat/stat_g.cu @@ -1,96 +1,96 @@ -/** - * @file stat_g.cu - * @author Cody Rivera, Jiannan Tian - * @brief Fast histogramming from [Gómez-Luna et al. 2013], wrapper - * @version 0.3 - * @date 2022-11-02 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "../kernel/detail/hist.inl" - -#include "cusz/type.h" -#include "stat/stat.h" -#include "stat/stat_g.hh" - -template -cusz_error_status asz::stat::histogram( - T* in_data, - size_t const in_len, - uint32_t* out_freq, - int const num_buckets, - float* milliseconds, - cudaStream_t stream) -{ - int device_id, max_bytes, num_SMs; - int items_per_thread, r_per_block, grid_dim, block_dim, shmem_use; - - cudaGetDevice(&device_id); - cudaDeviceGetAttribute(&num_SMs, cudaDevAttrMultiProcessorCount, device_id); - - auto query_maxbytes = [&]() { - int max_bytes_opt_in; - cudaDeviceGetAttribute(&max_bytes, cudaDevAttrMaxSharedMemoryPerBlock, device_id); - - // account for opt-in extra shared memory on certain architectures - cudaDeviceGetAttribute(&max_bytes_opt_in, cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id); - max_bytes = std::max(max_bytes, max_bytes_opt_in); - - // config kernel attribute - cudaFuncSetAttribute( - kernel::p2013Histogram, cudaFuncAttributeMaxDynamicSharedMemorySize, max_bytes); - }; - - auto optimize_launch = [&]() { - items_per_thread = 1; - r_per_block = (max_bytes / sizeof(int)) / (num_buckets + 1); - grid_dim = num_SMs; - // fits to size - block_dim = ((((in_len / (grid_dim * items_per_thread)) + 1) / 64) + 1) * 64; - while (block_dim > 1024) { - if (r_per_block <= 1) { block_dim = 1024; } - else { - r_per_block /= 2; - grid_dim *= 2; - block_dim = ((((in_len / (grid_dim * items_per_thread)) + 1) / 64) + 1) * 64; - } - } - shmem_use = ((num_buckets + 1) * r_per_block) * sizeof(int); - }; - - query_maxbytes(); - optimize_launch(); - - CREATE_CUDAEVENT_PAIR; - START_CUDAEVENT_RECORDING(stream); - - kernel::p2013Histogram<<>> // - (in_data, out_freq, in_len, num_buckets, r_per_block); - - STOP_CUDAEVENT_RECORDING(stream); - - cudaStreamSynchronize(stream); - TIME_ELAPSED_CUDAEVENT(milliseconds); - DESTROY_CUDAEVENT_PAIR; - - return CUSZ_SUCCESS; -} - -#define INIT_HIST_AND_C(Tname, T) \ - template cusz_error_status asz::stat::histogram(T*, size_t const, uint32_t*, int const, float*, cudaStream_t); \ - \ - cusz_error_status histogram_T##Tname( \ - T* in_data, size_t const in_len, uint32_t* out_freq, int const num_buckets, float* milliseconds, \ - cudaStream_t stream) \ - { \ - return asz::stat::histogram(in_data, in_len, out_freq, num_buckets, milliseconds, stream); \ - } - -INIT_HIST_AND_C(ui8, uint8_t) -INIT_HIST_AND_C(ui16, uint16_t) -INIT_HIST_AND_C(ui32, uint32_t) -INIT_HIST_AND_C(ui64, 
uint64_t) - +/** + * @file stat_g.cu + * @author Cody Rivera, Jiannan Tian + * @brief Fast histogramming from [Gómez-Luna et al. 2013], wrapper + * @version 0.3 + * @date 2022-11-02 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../kernel/detail/hist.inl" + +#include "cusz/type.h" +#include "stat/stat.h" +#include "stat/stat_g.hh" + +template +cusz_error_status asz::stat::histogram( + T* in_data, + size_t const in_len, + uint32_t* out_freq, + int const num_buckets, + float* milliseconds, + cudaStream_t stream) +{ + int device_id, max_bytes, num_SMs; + int items_per_thread, r_per_block, grid_dim, block_dim, shmem_use; + + cudaGetDevice(&device_id); + cudaDeviceGetAttribute(&num_SMs, cudaDevAttrMultiProcessorCount, device_id); + + auto query_maxbytes = [&]() { + int max_bytes_opt_in; + cudaDeviceGetAttribute(&max_bytes, cudaDevAttrMaxSharedMemoryPerBlock, device_id); + + // account for opt-in extra shared memory on certain architectures + cudaDeviceGetAttribute(&max_bytes_opt_in, cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id); + max_bytes = std::max(max_bytes, max_bytes_opt_in); + + // config kernel attribute + cudaFuncSetAttribute( + kernel::p2013Histogram, cudaFuncAttributeMaxDynamicSharedMemorySize, max_bytes); + }; + + auto optimize_launch = [&]() { + items_per_thread = 1; + r_per_block = (max_bytes / sizeof(int)) / (num_buckets + 1); + grid_dim = num_SMs; + // fits to size + block_dim = ((((in_len / (grid_dim * items_per_thread)) + 1) / 64) + 1) * 64; + while (block_dim > 1024) { + if (r_per_block <= 1) { block_dim = 1024; } + else { + r_per_block /= 2; + grid_dim *= 2; + block_dim = ((((in_len / (grid_dim * items_per_thread)) + 1) / 64) + 1) * 64; + } + } + shmem_use = ((num_buckets + 1) * r_per_block) * sizeof(int); + }; + + query_maxbytes(); + optimize_launch(); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + kernel::p2013Histogram<<>> // + (in_data, out_freq, in_len, num_buckets, r_per_block); + + STOP_CUDAEVENT_RECORDING(stream); + + cudaStreamSynchronize(stream); + TIME_ELAPSED_CUDAEVENT(milliseconds); + DESTROY_CUDAEVENT_PAIR; + + return CUSZ_SUCCESS; +} + +#define INIT_HIST_AND_C(Tname, T) \ + template cusz_error_status asz::stat::histogram(T*, size_t const, uint32_t*, int const, float*, cudaStream_t); \ + \ + cusz_error_status histogram_T##Tname( \ + T* in_data, size_t const in_len, uint32_t* out_freq, int const num_buckets, float* milliseconds, \ + cudaStream_t stream) \ + { \ + return asz::stat::histogram(in_data, in_len, out_freq, num_buckets, milliseconds, stream); \ + } + +INIT_HIST_AND_C(ui8, uint8_t) +INIT_HIST_AND_C(ui16, uint16_t) +INIT_HIST_AND_C(ui32, uint32_t) +INIT_HIST_AND_C(ui64, uint64_t) + #undef INIT_HIST_AND_C \ No newline at end of file diff --git a/qtensor/compression/cusz/src/utils/dbg_print.cuh b/qtensor/compression/cusz/src/utils/dbg_print.cuh index 19334e2e..2c2b5580 100644 --- a/qtensor/compression/cusz/src/utils/dbg_print.cuh +++ b/qtensor/compression/cusz/src/utils/dbg_print.cuh @@ -1,132 +1,132 @@ -#ifndef UTILS_DBG_PRINT_CUH -#define UTILS_DBG_PRINT_CUH - -/** - * @file dbg_print.cuh - * @author Jiannan Tian - * @brief - * @version 0.2 - * @date 2020-09-20 - * Created on 2020-03-17 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -template -__global__ void print_deflated(Q* coded, size_t gid) -{ - if (blockIdx.x * blockDim.x + threadIdx.x != gid) return; - 
printf("print after deflating\n"); - // for_each(coded, coded + PART_SIZE, [](Q& i) { print_by_type(i, '_', '\n'); }); - for (size_t i = 0; i < PART_SIZE; i++) { print_by_type(*(coded + i), '_', '\n'); } - printf("\n"); -} - -template -__global__ void print_histogram(T* freq, size_t size, size_t radius = 20) -{ - const int DICT_SIZE = size; /* Dynamic sizing */ - if (blockIdx.x * blockDim.x + threadIdx.x == 0) { - for (size_t i = DICT_SIZE / 2 - radius; i < DICT_SIZE / 2 + radius; i++) { - if (i % 10 == 0) printf("\n"); - printf("%4lu: %-12lu", i, static_cast(freq[i])); - } - printf("\n"); - } -} - -template -__device__ __host__ void print_by_type(T num, char sep = '_', char ending = '\n') -{ - for (size_t j = 0; j < sizeof(T) * CHAR_BIT; j++) { - printf("%u", (num >> ((sizeof(T) * CHAR_BIT - 1) - j)) & 0x01u); - if (j != 0 and j != sizeof(T) * CHAR_BIT - 1 and j % 8 == 7) printf("%c", sep); - } - printf("%c", ending); -} - -// MSB to LSB -template -__device__ __host__ void print_code_only(T num, size_t bitwidth, char sep = '_', char ending = '\n') -{ - for (size_t j = 0; j < bitwidth; j++) { - printf("%u", (num >> ((bitwidth - 1) - j)) & 0x01u); - if (j != 0 and j != bitwidth - 1 and j % 8 == 7) printf("%c", sep); - } - printf("%c", ending); -} - -template -__device__ __host__ void snippet_print_bitset_full(T num) -{ - print_by_type(num, '_', '\t'); - size_t bitwidth = *((uint8_t*)&num + sizeof(T) - 1); - // size_t code_bitwidth = ((static_cast(0xffu) << (sizeof(T) * 8 - 8)) & num) >> (sizeof(T) * 8 - 8); - printf("len: %3lu\tcode: ", bitwidth); - print_code_only(num, bitwidth, '\0', '\n'); -} - -template -__global__ void print_codebook(T* codebook, size_t len) -{ - if (blockIdx.x * blockDim.x + threadIdx.x != 0) return; - printf("--------------------------------------------------------------------------------\n"); - printf("printing codebook\n"); - printf("--------------------------------------------------------------------------------\n"); - __shared__ T buffer; - for (size_t i = 0; i < len; i++) { - buffer = codebook[i]; - if (buffer == ~((T)0x0)) continue; - printf("%5lu\t", i); - snippet_print_bitset_full(buffer); - } - printf("--------------------------------------------------------------------------------\n"); - printf("done printing codebook\n"); - printf("--------------------------------------------------------------------------------\n"); -} - -template -__global__ void get_entropy(T* freq) -{ -} - -// TODO real GPU version -template -__global__ void get_theoretical_dense_Huffman_coded_length(T* codebook, Q* freq, size_t codebook_len) -{ -} - -// template -//__global__ void print_Huffman_coded_before_deflating(T* coded, size_t len=200) { -// if (blockIdx.x * blockDim.x + threadIdx.x != 0) return; -// printf("print Huffman coded before it is deflated\n"); -// for (size_t i = 0; i < 200; i++) { -// if (coded[i] == ~((T)0x0)) continue; -// printf("%5lu\t", i); -// snippet_print_bitset_full(coded[i]); -// } -// printf("\n"); -//} - -template -__global__ void print_Huffman_coded_before_deflating(T* coded, size_t len) -{ - if (blockIdx.x != 0) return; - size_t gid = blockDim.x * blockIdx.x + threadIdx.x; - if (coded[gid] == ~((T)0x0)) return; - printf("%5lu\t", gid); - snippet_print_bitset_full(coded[gid]); - - // if (coded[i] == ~((T)0x0)) continue; - // printf("print Huffman coded before it is deflated\n"); - // for (size_t i = 0; i < 200; i++) { - // if (coded[i] == ~((T)0x0)) continue; - // printf("%5lu\t", i); - // snippet_print_bitset_full(coded[i]); - // } - // printf("\n"); -} 
- +#ifndef UTILS_DBG_PRINT_CUH +#define UTILS_DBG_PRINT_CUH + +/** + * @file dbg_print.cuh + * @author Jiannan Tian + * @brief + * @version 0.2 + * @date 2020-09-20 + * Created on 2020-03-17 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +template +__global__ void print_deflated(Q* coded, size_t gid) +{ + if (blockIdx.x * blockDim.x + threadIdx.x != gid) return; + printf("print after deflating\n"); + // for_each(coded, coded + PART_SIZE, [](Q& i) { print_by_type(i, '_', '\n'); }); + for (size_t i = 0; i < PART_SIZE; i++) { print_by_type(*(coded + i), '_', '\n'); } + printf("\n"); +} + +template +__global__ void print_histogram(T* freq, size_t size, size_t radius = 20) +{ + const int DICT_SIZE = size; /* Dynamic sizing */ + if (blockIdx.x * blockDim.x + threadIdx.x == 0) { + for (size_t i = DICT_SIZE / 2 - radius; i < DICT_SIZE / 2 + radius; i++) { + if (i % 10 == 0) printf("\n"); + printf("%4lu: %-12lu", i, static_cast(freq[i])); + } + printf("\n"); + } +} + +template +__device__ __host__ void print_by_type(T num, char sep = '_', char ending = '\n') +{ + for (size_t j = 0; j < sizeof(T) * CHAR_BIT; j++) { + printf("%u", (num >> ((sizeof(T) * CHAR_BIT - 1) - j)) & 0x01u); + if (j != 0 and j != sizeof(T) * CHAR_BIT - 1 and j % 8 == 7) printf("%c", sep); + } + printf("%c", ending); +} + +// MSB to LSB +template +__device__ __host__ void print_code_only(T num, size_t bitwidth, char sep = '_', char ending = '\n') +{ + for (size_t j = 0; j < bitwidth; j++) { + printf("%u", (num >> ((bitwidth - 1) - j)) & 0x01u); + if (j != 0 and j != bitwidth - 1 and j % 8 == 7) printf("%c", sep); + } + printf("%c", ending); +} + +template +__device__ __host__ void snippet_print_bitset_full(T num) +{ + print_by_type(num, '_', '\t'); + size_t bitwidth = *((uint8_t*)&num + sizeof(T) - 1); + // size_t code_bitwidth = ((static_cast(0xffu) << (sizeof(T) * 8 - 8)) & num) >> (sizeof(T) * 8 - 8); + printf("len: %3lu\tcode: ", bitwidth); + print_code_only(num, bitwidth, '\0', '\n'); +} + +template +__global__ void print_codebook(T* codebook, size_t len) +{ + if (blockIdx.x * blockDim.x + threadIdx.x != 0) return; + printf("--------------------------------------------------------------------------------\n"); + printf("printing codebook\n"); + printf("--------------------------------------------------------------------------------\n"); + __shared__ T buffer; + for (size_t i = 0; i < len; i++) { + buffer = codebook[i]; + if (buffer == ~((T)0x0)) continue; + printf("%5lu\t", i); + snippet_print_bitset_full(buffer); + } + printf("--------------------------------------------------------------------------------\n"); + printf("done printing codebook\n"); + printf("--------------------------------------------------------------------------------\n"); +} + +template +__global__ void get_entropy(T* freq) +{ +} + +// TODO real GPU version +template +__global__ void get_theoretical_dense_Huffman_coded_length(T* codebook, Q* freq, size_t codebook_len) +{ +} + +// template +//__global__ void print_Huffman_coded_before_deflating(T* coded, size_t len=200) { +// if (blockIdx.x * blockDim.x + threadIdx.x != 0) return; +// printf("print Huffman coded before it is deflated\n"); +// for (size_t i = 0; i < 200; i++) { +// if (coded[i] == ~((T)0x0)) continue; +// printf("%5lu\t", i); +// snippet_print_bitset_full(coded[i]); +// } +// printf("\n"); +//} + +template +__global__ void print_Huffman_coded_before_deflating(T* coded, 
size_t len) +{ + if (blockIdx.x != 0) return; + size_t gid = blockDim.x * blockIdx.x + threadIdx.x; + if (coded[gid] == ~((T)0x0)) return; + printf("%5lu\t", gid); + snippet_print_bitset_full(coded[gid]); + + // if (coded[i] == ~((T)0x0)) continue; + // printf("print Huffman coded before it is deflated\n"); + // for (size_t i = 0; i < 200; i++) { + // if (coded[i] == ~((T)0x0)) continue; + // printf("%5lu\t", i); + // snippet_print_bitset_full(coded[i]); + // } + // printf("\n"); +} + #endif \ No newline at end of file diff --git a/qtensor/compression/cusz/src/utils/print_gpu.cu b/qtensor/compression/cusz/src/utils/print_gpu.cu index 9fd20040..2d2b195f 100644 --- a/qtensor/compression/cusz/src/utils/print_gpu.cu +++ b/qtensor/compression/cusz/src/utils/print_gpu.cu @@ -1,121 +1,121 @@ -/** - * @file print_gpu.cu - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-09-23 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -// #include "../detail/print_gpu.inl" -#include -#include -#include -#include "utils/print_gpu.h" -#include "utils/print_gpu.hh" - -#define PRINT_INT_LESS_THAN_64(Tliteral, T) \ - void peek_device_data_T##Tliteral(T* d_arr, size_t num, size_t offset) \ - { \ - thrust::for_each( \ - thrust::device, d_arr, d_arr + num, [=] __device__(const T i) { printf("%d\t", (int32_t)i); }); \ - printf("\n"); \ - } - -PRINT_INT_LESS_THAN_64(i8, int8_t) -PRINT_INT_LESS_THAN_64(i16, int16_t) -PRINT_INT_LESS_THAN_64(i32, int32_t) - -void peek_device_data_Ti64(int64_t* d_arr, size_t num, size_t offset) -{ - thrust::for_each(thrust::device, d_arr, d_arr + num, [=] __device__(const int64_t i) { printf("%ld\t", i); }); - printf("\n"); -} - -#define PRINT_UINT_LESS_THAN_64(Tliteral, T) \ - void peek_device_data_T##Tliteral(T* d_arr, size_t num, size_t offset) \ - { \ - thrust::for_each( \ - thrust::device, d_arr, d_arr + num, [=] __device__(const T i) { printf("%u\t", (uint32_t)i); }); \ - printf("\n"); \ - } - -PRINT_UINT_LESS_THAN_64(ui8, uint8_t) -PRINT_UINT_LESS_THAN_64(ui16, uint16_t) -PRINT_UINT_LESS_THAN_64(ui32, uint32_t) - -void peek_device_data_Tui64(uint64_t* d_arr, size_t num, size_t offset) -{ - thrust::for_each(thrust::device, d_arr, d_arr + num, [=] __device__(const uint64_t i) { printf("%lu\t", i); }); - printf("\n"); -} - -void peek_device_data_Tfp32(float* d_arr, size_t num, size_t offset) -{ - thrust::for_each(thrust::device, d_arr, d_arr + num, [=] __device__(const float i) { printf("%.7f\t", i); }); - printf("\n"); -} - -void peek_device_data_Tfp64(double* d_arr, size_t num, size_t offset) -{ - thrust::for_each(thrust::device, d_arr, d_arr + num, [=] __device__(const double i) { printf("%.7lf\t", i); }); - printf("\n"); -} - -template -void psz::peek_device_data(T* d_arr, size_t num, size_t offset) -{ - if (std::is_same::value) { // - peek_device_data_Ti8((int8_t*)d_arr, num, offset); - } - else if (std::is_same::value) { - peek_device_data_Ti16((int16_t*)d_arr, num, offset); - } - else if (std::is_same::value) { - peek_device_data_Ti32((int32_t*)d_arr, num, offset); - } - else if (std::is_same::value) { - peek_device_data_Ti64((int64_t*)d_arr, num, offset); - } - else if (std::is_same::value) { - peek_device_data_Tui8((uint8_t*)d_arr, num, offset); - } - else if (std::is_same::value) { - peek_device_data_Tui16((uint16_t*)d_arr, num, offset); - } - else if (std::is_same::value) { - peek_device_data_Tui32((uint32_t*)d_arr, num, offset); - } - else if (std::is_same::value) { - peek_device_data_Tui64((uint64_t*)d_arr, num, offset); - } 
- else if (std::is_same::value) { - peek_device_data_Tfp32((float*)d_arr, num, offset); - } - else if (std::is_same::value) { - peek_device_data_Tfp64((double*)d_arr, num, offset); - } - else { - std::runtime_error("peek_device_data cannot accept this type."); - } -} - -#define CPP_PEEK(Tliteral, T) template void psz::peek_device_data(T * d_arr, size_t num, size_t offset); - -CPP_PEEK(i8, int8_t); -CPP_PEEK(i16, int16_t); -CPP_PEEK(i32, int32_t); -CPP_PEEK(i64, int64_t); -CPP_PEEK(ui8, uint8_t); -CPP_PEEK(ui16, uint16_t); -CPP_PEEK(ui32, uint32_t); -CPP_PEEK(ui64, uint64_t); -CPP_PEEK(fp32, float); -CPP_PEEK(fp64, double); - -#undef CPP_PEEK - -#undef PRINT_INT_LESS_THAN_64 -#undef PRINT_UINT_LESS_THAN_64 +/** + * @file print_gpu.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-09-23 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +// #include "../detail/print_gpu.inl" +#include +#include +#include +#include "utils/print_gpu.h" +#include "utils/print_gpu.hh" + +#define PRINT_INT_LESS_THAN_64(Tliteral, T) \ + void peek_device_data_T##Tliteral(T* d_arr, size_t num, size_t offset) \ + { \ + thrust::for_each( \ + thrust::device, d_arr, d_arr + num, [=] __device__(const T i) { printf("%d\t", (int32_t)i); }); \ + printf("\n"); \ + } + +PRINT_INT_LESS_THAN_64(i8, int8_t) +PRINT_INT_LESS_THAN_64(i16, int16_t) +PRINT_INT_LESS_THAN_64(i32, int32_t) + +void peek_device_data_Ti64(int64_t* d_arr, size_t num, size_t offset) +{ + thrust::for_each(thrust::device, d_arr, d_arr + num, [=] __device__(const int64_t i) { printf("%ld\t", i); }); + printf("\n"); +} + +#define PRINT_UINT_LESS_THAN_64(Tliteral, T) \ + void peek_device_data_T##Tliteral(T* d_arr, size_t num, size_t offset) \ + { \ + thrust::for_each( \ + thrust::device, d_arr, d_arr + num, [=] __device__(const T i) { printf("%u\t", (uint32_t)i); }); \ + printf("\n"); \ + } + +PRINT_UINT_LESS_THAN_64(ui8, uint8_t) +PRINT_UINT_LESS_THAN_64(ui16, uint16_t) +PRINT_UINT_LESS_THAN_64(ui32, uint32_t) + +void peek_device_data_Tui64(uint64_t* d_arr, size_t num, size_t offset) +{ + thrust::for_each(thrust::device, d_arr, d_arr + num, [=] __device__(const uint64_t i) { printf("%lu\t", i); }); + printf("\n"); +} + +void peek_device_data_Tfp32(float* d_arr, size_t num, size_t offset) +{ + thrust::for_each(thrust::device, d_arr, d_arr + num, [=] __device__(const float i) { printf("%.7f\t", i); }); + printf("\n"); +} + +void peek_device_data_Tfp64(double* d_arr, size_t num, size_t offset) +{ + thrust::for_each(thrust::device, d_arr, d_arr + num, [=] __device__(const double i) { printf("%.7lf\t", i); }); + printf("\n"); +} + +template +void psz::peek_device_data(T* d_arr, size_t num, size_t offset) +{ + if (std::is_same::value) { // + peek_device_data_Ti8((int8_t*)d_arr, num, offset); + } + else if (std::is_same::value) { + peek_device_data_Ti16((int16_t*)d_arr, num, offset); + } + else if (std::is_same::value) { + peek_device_data_Ti32((int32_t*)d_arr, num, offset); + } + else if (std::is_same::value) { + peek_device_data_Ti64((int64_t*)d_arr, num, offset); + } + else if (std::is_same::value) { + peek_device_data_Tui8((uint8_t*)d_arr, num, offset); + } + else if (std::is_same::value) { + peek_device_data_Tui16((uint16_t*)d_arr, num, offset); + } + else if (std::is_same::value) { + peek_device_data_Tui32((uint32_t*)d_arr, num, offset); + } + else if (std::is_same::value) { + peek_device_data_Tui64((uint64_t*)d_arr, num, offset); + } + else if (std::is_same::value) { + peek_device_data_Tfp32((float*)d_arr, num, 
offset); + } + else if (std::is_same::value) { + peek_device_data_Tfp64((double*)d_arr, num, offset); + } + else { + std::runtime_error("peek_device_data cannot accept this type."); + } +} + +#define CPP_PEEK(Tliteral, T) template void psz::peek_device_data(T * d_arr, size_t num, size_t offset); + +CPP_PEEK(i8, int8_t); +CPP_PEEK(i16, int16_t); +CPP_PEEK(i32, int32_t); +CPP_PEEK(i64, int64_t); +CPP_PEEK(ui8, uint8_t); +CPP_PEEK(ui16, uint16_t); +CPP_PEEK(ui32, uint32_t); +CPP_PEEK(ui64, uint64_t); +CPP_PEEK(fp32, float); +CPP_PEEK(fp64, double); + +#undef CPP_PEEK + +#undef PRINT_INT_LESS_THAN_64 +#undef PRINT_UINT_LESS_THAN_64 diff --git a/qtensor/compression/cusz/src/utils/timer_cpu.cc b/qtensor/compression/cusz/src/utils/timer_cpu.cc index 3983bc0f..2422f6f2 100644 --- a/qtensor/compression/cusz/src/utils/timer_cpu.cc +++ b/qtensor/compression/cusz/src/utils/timer_cpu.cc @@ -1,30 +1,30 @@ -/** - * @file timer_cpu.cc - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-10-31 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "utils/timer.h" - -#include -#include - -using hires = std::chrono::high_resolution_clock; -using duration_t = std::chrono::duration; -using hires_clock_t = std::chrono::time_point; - -struct asz_timer { - hires_clock_t start, stop; -}; - -// cpu timer specific -asz_timer* asz_cputimer_create() { return new asz_timer; } -void asz_cputimer_destroy(asz_timer* t) { delete t; } -void asz_cputimer_start(asz_timer* t) { t->start = hires::now(); } -void asz_cputimer_end(asz_timer* t) { t->stop = hires::now(); } -double asz_cputime_elapsed(asz_timer* t) { return static_cast((t->stop) - (t->start)).count(); } +/** + * @file timer_cpu.cc + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-31 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "utils/timer.h" + +#include +#include + +using hires = std::chrono::high_resolution_clock; +using duration_t = std::chrono::duration; +using hires_clock_t = std::chrono::time_point; + +struct asz_timer { + hires_clock_t start, stop; +}; + +// cpu timer specific +asz_timer* asz_cputimer_create() { return new asz_timer; } +void asz_cputimer_destroy(asz_timer* t) { delete t; } +void asz_cputimer_start(asz_timer* t) { t->start = hires::now(); } +void asz_cputimer_end(asz_timer* t) { t->stop = hires::now(); } +double asz_cputime_elapsed(asz_timer* t) { return static_cast((t->stop) - (t->start)).count(); } diff --git a/qtensor/compression/cusz/src/utils/timer_gpu.cu b/qtensor/compression/cusz/src/utils/timer_gpu.cu index a44ee4bf..247c80f8 100644 --- a/qtensor/compression/cusz/src/utils/timer_gpu.cu +++ b/qtensor/compression/cusz/src/utils/timer_gpu.cu @@ -1,82 +1,82 @@ -/** - * @file timer_gpu.cu - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-10-31 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include -#include -#include "utils/timer.h" - -typedef struct asz_cudatimer { - cudaEvent_t a, b; - float milliseconds; - cudaStream_t stream; - - asz_cudatimer() { create(); } - asz_cudatimer(cudaStream_t stream) - { - create(); - this->stream = stream; - } - - void create() - { - cudaEventCreate(&a); - cudaEventCreate(&b); - } - - void destroy() - { - cudaEventDestroy(a); - cudaEventDestroy(b); - } - - // stream not involved - void start() { cudaEventRecord(a); } - - void stop() - { - cudaEventRecord(b); - cudaEventSynchronize(b); - } - - // stream involved - void stream_start() - { - 
cudaEventRecord(a, stream); // set event as not occurred - } - - void stream_stop() - { - cudaEventRecord(b, stream); - cudaEventSynchronize(b); // block host until `stream` meets `stop` - } - - // get time - float time_elapsed() - { - cudaEventElapsedTime(&milliseconds, a, b); - std::cout << "milliseconds: " << milliseconds << std::endl; - return milliseconds; - } -} asz_cudatimer; - -// cuda timer specific -asz_cudatimer* asz_cudatimer_create() { return new asz_cudatimer{}; } -void asz_cudatimer_destroy(asz_cudatimer* t) { t->destroy(); } -void asz_cudatimer_start(asz_cudatimer* t) { t->start(); } -void asz_cudatimer_end(asz_cudatimer* t) { t->stop(); } -double asz_cudatime_elapsed(asz_cudatimer* t) { return t->time_elapsed() / 1000; } - -// cuda streamtimer specific -asz_cudatimer* asz_cudastreamtimer_create(void* stream) { return new asz_cudatimer((cudaStream_t)stream); } -void asz_cudastreamtimer_destroy(asz_cudatimer* t) { t->destroy(); } -void asz_cudastreamtimer_start(asz_cudatimer* t) { t->stream_start(); } -void asz_cudastreamtimer_end(asz_cudatimer* t) { t->stream_stop(); } -double asz_cudastreamtime_elapsed(asz_cudatimer* t) { return t->time_elapsed() / 1000; } +/** + * @file timer_gpu.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-31 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include +#include +#include "utils/timer.h" + +typedef struct asz_cudatimer { + cudaEvent_t a, b; + float milliseconds; + cudaStream_t stream; + + asz_cudatimer() { create(); } + asz_cudatimer(cudaStream_t stream) + { + create(); + this->stream = stream; + } + + void create() + { + cudaEventCreate(&a); + cudaEventCreate(&b); + } + + void destroy() + { + cudaEventDestroy(a); + cudaEventDestroy(b); + } + + // stream not involved + void start() { cudaEventRecord(a); } + + void stop() + { + cudaEventRecord(b); + cudaEventSynchronize(b); + } + + // stream involved + void stream_start() + { + cudaEventRecord(a, stream); // set event as not occurred + } + + void stream_stop() + { + cudaEventRecord(b, stream); + cudaEventSynchronize(b); // block host until `stream` meets `stop` + } + + // get time + float time_elapsed() + { + cudaEventElapsedTime(&milliseconds, a, b); + std::cout << "milliseconds: " << milliseconds << std::endl; + return milliseconds; + } +} asz_cudatimer; + +// cuda timer specific +asz_cudatimer* asz_cudatimer_create() { return new asz_cudatimer{}; } +void asz_cudatimer_destroy(asz_cudatimer* t) { t->destroy(); } +void asz_cudatimer_start(asz_cudatimer* t) { t->start(); } +void asz_cudatimer_end(asz_cudatimer* t) { t->stop(); } +double asz_cudatime_elapsed(asz_cudatimer* t) { return t->time_elapsed() / 1000; } + +// cuda streamtimer specific +asz_cudatimer* asz_cudastreamtimer_create(void* stream) { return new asz_cudatimer((cudaStream_t)stream); } +void asz_cudastreamtimer_destroy(asz_cudatimer* t) { t->destroy(); } +void asz_cudastreamtimer_start(asz_cudatimer* t) { t->stream_start(); } +void asz_cudastreamtimer_end(asz_cudatimer* t) { t->stream_stop(); } +double asz_cudastreamtime_elapsed(asz_cudatimer* t) { return t->time_elapsed() / 1000; } diff --git a/qtensor/compression/cusz/src/utils/vis_stat.hh b/qtensor/compression/cusz/src/utils/vis_stat.hh index 60099138..ff27695f 100644 --- a/qtensor/compression/cusz/src/utils/vis_stat.hh +++ b/qtensor/compression/cusz/src/utils/vis_stat.hh @@ -1,137 +1,137 @@ -#ifndef UTILS_VIS_STAT_HH -#define UTILS_VIS_STAT_HH - -/** - * @file vis_stat.hh - * @author Jiannan Tian - * @brief 
Analysis and visualization of datum. - * @version 0.1 - * @date 2020-09-20 - * Created on 2020-02-09 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -using std::cerr; -using std::cout; -using std::endl; -using std::tuple; - -template -double GetEntropy(T* code, size_t l, size_t cap = 1024) -{ - if (cap == 0) { - cerr << "wrong cap" << endl; - exit(-1); - } - auto arr = new size_t[cap](); - for (size_t i = 0; i < l; i++) arr[code[i]]++; - std::vector raw(arr, arr + cap); - std::vector frequencies; - std::copy_if(raw.begin(), raw.end(), std::back_inserter(frequencies), [](double& e) { return e != 0; }); - double entropy = 0; - for (auto freq : frequencies) { entropy += -(freq * 1.0 / l) * log2(freq * 1.0 / l); } - - // cout << "entropy:\t" << entropy << endl; - delete[] arr; - return entropy; -} - -// TODO automatically omit bins that are less than 1% -template -void VisualizeHistogram( - const std::string& tag, - T* _d_POD, - size_t l, - size_t _bins = 16, - bool log_freq = false, - double override_min = 0, - double override_max = 0, - bool eliminate_zeros = false, - bool use_scientific_notation = true) -{ - std::vector _d(_d_POD, _d_POD + l); - std::vector _d_nonzero; - // std::vector arr; - // arr.reserve(_bins); - // for (size_t i = 0; i< _bins; i++) arr.push_back(0); - auto arr = new size_t[_bins](); - - if (eliminate_zeros) { - std::copy_if(_d.begin(), _d.end(), std::back_inserter(_d_nonzero), [](int i) { return i != 0; }); - } - double Min = *std::min_element(_d.begin(), _d.end()); - double Max = *std::max_element(_d.begin(), _d.end()); - // double sum = std::accumulate(_d.begin(), _d.end(), 0); - double rng = Max - Min; - // double avg = sum / l; - - cout << "\e[7m[[" << tag << "]]\e[0m"; - if (override_max > override_min) { - cout << "zoom into " << override_min << "--" << override_max << endl; - std::tie(Max, Min, rng) = std::make_tuple(override_max, override_min, override_max - override_min); - } - double step = rng / _bins; - for (size_t i = 0; i < l; i++) arr[static_cast((_d[i] - Min) / step)]++; - std::vector _viz(arr, arr + _bins); - // std::vector _viz(arr); - - // visualization - printf("\tbins:\t%zu\tbin_width:\t%lf\n", _bins, step); - // printf("count:\t%zu\tmin:\t%lf\tmax:\t%lf\trng:\t%lf\n", l, Min, Max, rng); - cout << "count:\t" << l << "\t"; - cout << "min:\t" << Min << "\t"; - cout << "max:\t" << Max << "\t"; - cout << "rng:\t" << rng << endl; - - if (log_freq) { - cout << "using log_freq" << endl; - std::for_each(_viz.begin(), _viz.end(), [](size_t& n) { n = log2(n); }); - } - - size_t longest = *std::max_element(_viz.begin(), _viz.end()); - size_t bar_str_len = 64; // scale according to the longest - std::for_each(_viz.begin(), _viz.end(), [&](size_t& n) { - n = static_cast(n / static_cast(longest) * bar_str_len); - }); - - for (size_t i = 0; i < _bins; i++) { - // normalize to width - cout << "|" - << "\33[43m"; - - for (size_t j = 0; j < bar_str_len + 1; j++) { - if (j < _viz[i]) - cout << "-"; - else if (j == _viz[i]) - cout << "\33[0m" - << "+"; - else - cout << " "; - } - cout.precision(2); - cout << " "; - if (use_scientific_notation) cout << std::scientific; - cout << Min + i * step << " -- " << Min + (i + 1) * step; - cout << " "; - cout << std::setw((int)log10(l) + 2); - cout << arr[i]; - cout << " "; - cout << std::defaultfloat << std::setw(5) << arr[i] / 
static_cast(l) * 100 << "%" << endl; - } - cout << endl; - // delete[] arr; -} - -#endif +#ifndef UTILS_VIS_STAT_HH +#define UTILS_VIS_STAT_HH + +/** + * @file vis_stat.hh + * @author Jiannan Tian + * @brief Analysis and visualization of datum. + * @version 0.1 + * @date 2020-09-20 + * Created on 2020-02-09 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +using std::cerr; +using std::cout; +using std::endl; +using std::tuple; + +template +double GetEntropy(T* code, size_t l, size_t cap = 1024) +{ + if (cap == 0) { + cerr << "wrong cap" << endl; + exit(-1); + } + auto arr = new size_t[cap](); + for (size_t i = 0; i < l; i++) arr[code[i]]++; + std::vector raw(arr, arr + cap); + std::vector frequencies; + std::copy_if(raw.begin(), raw.end(), std::back_inserter(frequencies), [](double& e) { return e != 0; }); + double entropy = 0; + for (auto freq : frequencies) { entropy += -(freq * 1.0 / l) * log2(freq * 1.0 / l); } + + // cout << "entropy:\t" << entropy << endl; + delete[] arr; + return entropy; +} + +// TODO automatically omit bins that are less than 1% +template +void VisualizeHistogram( + const std::string& tag, + T* _d_POD, + size_t l, + size_t _bins = 16, + bool log_freq = false, + double override_min = 0, + double override_max = 0, + bool eliminate_zeros = false, + bool use_scientific_notation = true) +{ + std::vector _d(_d_POD, _d_POD + l); + std::vector _d_nonzero; + // std::vector arr; + // arr.reserve(_bins); + // for (size_t i = 0; i< _bins; i++) arr.push_back(0); + auto arr = new size_t[_bins](); + + if (eliminate_zeros) { + std::copy_if(_d.begin(), _d.end(), std::back_inserter(_d_nonzero), [](int i) { return i != 0; }); + } + double Min = *std::min_element(_d.begin(), _d.end()); + double Max = *std::max_element(_d.begin(), _d.end()); + // double sum = std::accumulate(_d.begin(), _d.end(), 0); + double rng = Max - Min; + // double avg = sum / l; + + cout << "\e[7m[[" << tag << "]]\e[0m"; + if (override_max > override_min) { + cout << "zoom into " << override_min << "--" << override_max << endl; + std::tie(Max, Min, rng) = std::make_tuple(override_max, override_min, override_max - override_min); + } + double step = rng / _bins; + for (size_t i = 0; i < l; i++) arr[static_cast((_d[i] - Min) / step)]++; + std::vector _viz(arr, arr + _bins); + // std::vector _viz(arr); + + // visualization + printf("\tbins:\t%zu\tbin_width:\t%lf\n", _bins, step); + // printf("count:\t%zu\tmin:\t%lf\tmax:\t%lf\trng:\t%lf\n", l, Min, Max, rng); + cout << "count:\t" << l << "\t"; + cout << "min:\t" << Min << "\t"; + cout << "max:\t" << Max << "\t"; + cout << "rng:\t" << rng << endl; + + if (log_freq) { + cout << "using log_freq" << endl; + std::for_each(_viz.begin(), _viz.end(), [](size_t& n) { n = log2(n); }); + } + + size_t longest = *std::max_element(_viz.begin(), _viz.end()); + size_t bar_str_len = 64; // scale according to the longest + std::for_each(_viz.begin(), _viz.end(), [&](size_t& n) { + n = static_cast(n / static_cast(longest) * bar_str_len); + }); + + for (size_t i = 0; i < _bins; i++) { + // normalize to width + cout << "|" + << "\33[43m"; + + for (size_t j = 0; j < bar_str_len + 1; j++) { + if (j < _viz[i]) + cout << "-"; + else if (j == _viz[i]) + cout << "\33[0m" + << "+"; + else + cout << " "; + } + cout.precision(2); + cout << " "; + if (use_scientific_notation) cout << std::scientific; 
+ cout << Min + i * step << " -- " << Min + (i + 1) * step; + cout << " "; + cout << std::setw((int)log10(l) + 2); + cout << arr[i]; + cout << " "; + cout << std::defaultfloat << std::setw(5) << arr[i] / static_cast(l) * 100 << "%" << endl; + } + cout << endl; + // delete[] arr; +} + +#endif diff --git a/qtensor/compression/cuszp/cuSZp/CMakeLists.txt b/qtensor/compression/cuszp/cuSZp/CMakeLists.txt index d3c752ba..d6b24117 100644 --- a/qtensor/compression/cuszp/cuSZp/CMakeLists.txt +++ b/qtensor/compression/cuszp/cuSZp/CMakeLists.txt @@ -1,79 +1,79 @@ -# Specify the minimum version of CMake required to build the project -cmake_minimum_required(VERSION 3.21) - -project(cuSZp - VERSION 0.0.2 - DESCRIPTION "Error-bounded GPU lossy compression library" - ) -set(namespace "cuSZp") -enable_language(CXX) -enable_language(CUDA) - -find_package(CUDAToolkit REQUIRED) - -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_CXX_STANDARD_REQUIRED ON) -set(CMAKE_EXPORT_COMPILE_COMMANDS ON) - -#set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -debug -Wall -diag-disable=10441") -#set(CMAKE_CXX_FLAGS_RELEASE "-diag-disable=10441 -g -ftz -fma -O2 -fp-model precise -prec-div -Wall") - -#set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -ftz=true -G -allow-unsupported-compiler") -#set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -allow-unsupported-compiler") - -set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER}) -set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) -set(CMAKE_CUDA_STANDARD "17") -set(CMAKE_CXX_STANDARD_REQUIRED ON) -#set(CMAKE_CUDA_FLAGS_INIT "-std=c++17 -allow-unsupported-compiler") -set(CMAKE_CUDA_ARCHITECTURES 60 61 62 70 75) -set(CUDA_PROPAGATE_HOST_FLAGS ON) -set(CUDA_LIBRARY CUDA::cudart) - -if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) - set_property(CACHE CMAKE_BUILD_TYPE PROPERTY VALUE Release) -endif() - -add_library(${PROJECT_NAME} STATIC) - -target_sources(${PROJECT_NAME} - PRIVATE - src/cuSZp_f32.cu - src/cuSZp_f64.cu - src/cuSZp_utility.cu - src/cuSZp_timer.cu - src/cuSZp_entry_f32.cu - src/cuSZp_entry_f64.cu - ) - -target_include_directories(${PROJECT_NAME} - PRIVATE - # where the library itself will look for its internal headers - ${CMAKE_CURRENT_SOURCE_DIR}/src - PUBLIC - # where top-level project will look for the library's public headers - $ - # where external projects will look for the library's public headers - $ - ) - -#target_include_directories(${PROJECT_NAME} PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) - -target_link_libraries(${PROJECT_NAME} PRIVATE CUDA::cudart) - -set(public_headers - include/cuSZp_f32.h - include/cuSZp_f64.h - include/cuSZp_utility.h - include/cuSZp_timer.h - include/cuSZp_entry_f32.h - include/cuSZp_entry_f64.h - ) - -set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") -include(Installing) - -option(CUSZP_BUILD_EXAMPLES "Option to enable building example programs" ON) -if (CUSZP_BUILD_EXAMPLES) - add_subdirectory(examples) +# Specify the minimum version of CMake required to build the project +cmake_minimum_required(VERSION 3.21) + +project(cuSZp + VERSION 0.0.2 + DESCRIPTION "Error-bounded GPU lossy compression library" + ) +set(namespace "cuSZp") +enable_language(CXX) +enable_language(CUDA) + +find_package(CUDAToolkit REQUIRED) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +#set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -debug -Wall -diag-disable=10441") +#set(CMAKE_CXX_FLAGS_RELEASE "-diag-disable=10441 -g -ftz -fma -O2 -fp-model precise -prec-div 
-Wall") + +#set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -ftz=true -G -allow-unsupported-compiler") +#set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -allow-unsupported-compiler") + +set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER}) +set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) +set(CMAKE_CUDA_STANDARD "17") +set(CMAKE_CXX_STANDARD_REQUIRED ON) +#set(CMAKE_CUDA_FLAGS_INIT "-std=c++17 -allow-unsupported-compiler") +set(CMAKE_CUDA_ARCHITECTURES 60 61 62 70 75) +set(CUDA_PROPAGATE_HOST_FLAGS ON) +set(CUDA_LIBRARY CUDA::cudart) + +if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) + set_property(CACHE CMAKE_BUILD_TYPE PROPERTY VALUE Release) +endif() + +add_library(${PROJECT_NAME} STATIC) + +target_sources(${PROJECT_NAME} + PRIVATE + src/cuSZp_f32.cu + src/cuSZp_f64.cu + src/cuSZp_utility.cu + src/cuSZp_timer.cu + src/cuSZp_entry_f32.cu + src/cuSZp_entry_f64.cu + ) + +target_include_directories(${PROJECT_NAME} + PRIVATE + # where the library itself will look for its internal headers + ${CMAKE_CURRENT_SOURCE_DIR}/src + PUBLIC + # where top-level project will look for the library's public headers + $ + # where external projects will look for the library's public headers + $ + ) + +#target_include_directories(${PROJECT_NAME} PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + +target_link_libraries(${PROJECT_NAME} PRIVATE CUDA::cudart) + +set(public_headers + include/cuSZp_f32.h + include/cuSZp_f64.h + include/cuSZp_utility.h + include/cuSZp_timer.h + include/cuSZp_entry_f32.h + include/cuSZp_entry_f64.h + ) + +set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") +include(Installing) + +option(CUSZP_BUILD_EXAMPLES "Option to enable building example programs" ON) +if (CUSZP_BUILD_EXAMPLES) + add_subdirectory(examples) endif () \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/Config.cmake.in b/qtensor/compression/cuszp/cuSZp/Config.cmake.in index 97b7684e..8c9ad12a 100644 --- a/qtensor/compression/cuszp/cuSZp/Config.cmake.in +++ b/qtensor/compression/cuszp/cuSZp/Config.cmake.in @@ -1,5 +1,5 @@ -@PACKAGE_INIT@ - -include("${CMAKE_CURRENT_LIST_DIR}/@PROJECT_NAME@Targets.cmake") - -check_required_components(@PROJECT_NAME@) +@PACKAGE_INIT@ + +include("${CMAKE_CURRENT_LIST_DIR}/@PROJECT_NAME@Targets.cmake") + +check_required_components(@PROJECT_NAME@) diff --git a/qtensor/compression/cuszp/cuSZp/LICENSE b/qtensor/compression/cuszp/cuSZp/LICENSE index d4fb7dda..786f3f5e 100644 --- a/qtensor/compression/cuszp/cuSZp/LICENSE +++ b/qtensor/compression/cuszp/cuSZp/LICENSE @@ -1,30 +1,30 @@ -Copyright © 2023, UChicago Argonne and University of Iowa - -All Rights Reserved - -Software Name: cuSZp: An Ultra-fast GPU Error-bounded Lossy Compressor with Optimized End-to-End Performance - -By: Argonne National Laboratory, University of Iowa - -OPEN SOURCE LICENSE - -Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. -3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
- -****************************************************************************************************** - DISCLAIMER - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED -WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR -TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF -ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -****************************************************************************************************** - +Copyright © 2023, UChicago Argonne and University of Iowa + +All Rights Reserved + +Software Name: cuSZp: An Ultra-fast GPU Error-bounded Lossy Compressor with Optimized End-to-End Performance + +By: Argonne National Laboratory, University of Iowa + +OPEN SOURCE LICENSE + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. +3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +****************************************************************************************************** + DISCLAIMER + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR +TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +****************************************************************************************************** + Contact: SZ Team (szlossycompressor@gmail.com) \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/README.md b/qtensor/compression/cuszp/cuSZp/README.md index 14454bd0..4f9f090d 100644 --- a/qtensor/compression/cuszp/cuSZp/README.md +++ b/qtensor/compression/cuszp/cuSZp/README.md @@ -1,106 +1,106 @@ -# cuSZp - - -cuSZp is a user-friendly error-bounded lossy compression tool specifically designed for the compression of single- and double-precision floating-point data using NVIDIA GPUs. 
-This tool fuses all compression or decompression computations into one single kernel, achieving ultra fast end-to-end throughput. -Specifically, the cuSZp framework is structured around four pivotal stages: Quantization and Prediction, Fixed-length Encoding, Global Synchronization, and Block Bit-shuffling. -Noting that ongoing optimization efforts are being devoted to cuSZp, aimed at further improving its end-to-end performance. - -- Developer: Yafan Huang -- Contributors: Sheng Di, Xiaodong Yu, Guanpeng Li, and Franck Cappello - -## Environment Requirements -- Linux OS with NVIDIA GPUs -- Git >= 2.15 -- CMake >= 3.21 -- Cuda Toolkit >= 11.0 -- GCC >= 7.3.0 - -## Compile and Run cuSZp Prepared Executable Binary -You can compile and install cuSZp with following commands: -```shell -$ git clone https://github.com/szcompressor/cuSZp.git -$ cd cuSZp -$ mkdir build && cd build -$ cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=../install/ .. -$ make -j -$ make install -``` -After compilation, you will see a list of executable binaries ```cuSZp/install/bin/```: -- ```cuSZp_cpu_f32_api```: single-precision, host pointers (i.e. on CPU). -- ```cuSZp_gpu_f32_api```: single-precision, device pointers (i.e. on GPU). -- ```cuSZp_cpu_f64_api```: double-precision, host pointers (i.e. on CPU). -- ```cuSZp_gpu_f64_api```: double-precision, device pointers (i.e. on GPU). - -To use those binaries, try following commands. -We here use RTM pressure_2000 dataset (1.4 GB, 1008x1008x352) for single-precision example, and NWChem acd-tst.bin.d64 (6.0 GB) for double-precision example. -```shell -# Example for single-precision API -# ./cuSZp_gpu_f32_api TARGET_HPC_DATASET ERROR_MODE ERROR_BOUND -# ABS or REL -$ ./cuSZp_gpu_f32_api ./pressure_2000 REL 1e-4 -cuSZp finished! -cuSZp compression end-to-end speed: 151.564649 GB/s -cuSZp decompression end-to-end speed: 232.503219 GB/s -cuSZp compression ratio: 13.003452 - -Pass error check! -$ -# Example for double-precision API -# ./cuSZp_gpu_f64_api TARGET_HPC_DATASET ERROR_MODE ERROR_BOUND -# ABS or REL -$ ./cuSZp_gpu_f64_api ./acd-tst.bin.d64 ABS 1E-8 -cuSZp finished! -cuSZp compression end-to-end speed: 110.117965 GB/s -cuSZp decompression end-to-end speed: 222.743097 GB/s -cuSZp compression ratio: 3.990585 - -Pass error check! -``` -More HPC dataset can be downloaded from [SDRBench](https://sdrbench.github.io/). - -## Using cuSZp as an Internal API -This repository provides several examples for using cuSZp compression and decompression for different scenarios (device pointer? host pointer? f32 or f64?). -The examples can be found in ```cuSZp/examples/```. -Assuming your original data, compressed data, and reconstructed data are all device pointers (allocated on GPU), and the data type is single-precision. The compression and decompression APIs can be called as below: -```C++ -// For measuring the end-to-end throughput. -TimingGPU timer_GPU; - -// cuSZp compression. -timer_GPU.StartCounter(); // set timer -SZp_compress_deviceptr_f32(d_oriData, d_cmpBytes, nbEle, &cmpSize, errorBound, stream); -float cmpTime = timer_GPU.GetCounter(); - -// cuSZp decompression. -timer_GPU.StartCounter(); // set timer -SZp_decompress_deviceptr_f32(d_decData, d_cmpBytes, nbEle, cmpSize, errorBound, stream); -float decTime = timer_GPU.GetCounter(); -``` -More details can be checked in: -- **f32-hostptr**: ```cuSZp/examples/cuSZp_cpu_f32_api.cpp```. -- **f32-deviceptr**: ```cuSZp/examples/cuSZp_gpu_f32_api.cpp```. -- **f64-hostptr**: ```cuSZp/examples/cuSZp_cpu_f64_api.cpp```. 
-- **f64-deviceptr**: ```cuSZp/examples/cuSZp_gpu_f64_api.cpp```. - -## Citation -```bibtex -@inproceedings{cuSZp2023huang, - title = {cuSZp: An Ultra-Fast GPU Error-Bounded Lossy Compression Framework with Optimized End-to-End Performance} - author = {Huang, Yafan and Di, Sheng and Yu, Xiaodong and Li, Guanpeng and Cappello, Franck}, - year = {2023}, - isbn = {979-8-4007-0109-2/23/11}, - publisher = {Association for Computing Machinery}, - address = {Denver, CO, USA}, - doi = {10.1145/3581784.3607048}, - booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis}, - keywords = {Lossy compression; parallel computing; HPC; GPU}, - series = {SC'23} -} -``` - -## Copyright -(C) 2023 by Argonne National Laboratory and University of Iowa. More details see [COPYRIGHT](https://github.com/szcompressor/cuSZp/blob/master/LICENSE). - -## Acknowledgement -This research was supported by the Exascale Computing Project (ECP), Project Number: 17-SC-20-SC, a collaborative effort of two DOE organizations – the Office of Science and the National Nuclear Security Administration, responsible for the planning and preparation of a capable exascale ecosystem, including software, applications, hardware, advanced system engineering and early testbed platforms, to support the nation’s exascale computing imperative. The material was supported by the U.S. Department of Energy, Office of Science, Advanced Scientific Computing Research (ASCR), under contract DE-AC02-06CH11357, and supported by the National Science Foundation under Grant OAC-2003709 and OAC-2104023. We acknowledge the computing resources provided on Bebop (operated by Laboratory Computing Resource Center at Argonne) and on Theta and JLSE (operated by Argonne Leadership Computing Facility). We acknowledge the support of ARAMCO. +# cuSZp + + +cuSZp is a user-friendly error-bounded lossy compression tool specifically designed for the compression of single- and double-precision floating-point data using NVIDIA GPUs. +This tool fuses all compression or decompression computations into one single kernel, achieving ultra fast end-to-end throughput. +Specifically, the cuSZp framework is structured around four pivotal stages: Quantization and Prediction, Fixed-length Encoding, Global Synchronization, and Block Bit-shuffling. +Noting that ongoing optimization efforts are being devoted to cuSZp, aimed at further improving its end-to-end performance. + +- Developer: Yafan Huang +- Contributors: Sheng Di, Xiaodong Yu, Guanpeng Li, and Franck Cappello + +## Environment Requirements +- Linux OS with NVIDIA GPUs +- Git >= 2.15 +- CMake >= 3.21 +- Cuda Toolkit >= 11.0 +- GCC >= 7.3.0 + +## Compile and Run cuSZp Prepared Executable Binary +You can compile and install cuSZp with following commands: +```shell +$ git clone https://github.com/szcompressor/cuSZp.git +$ cd cuSZp +$ mkdir build && cd build +$ cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=../install/ .. +$ make -j +$ make install +``` +After compilation, you will see a list of executable binaries ```cuSZp/install/bin/```: +- ```cuSZp_cpu_f32_api```: single-precision, host pointers (i.e. on CPU). +- ```cuSZp_gpu_f32_api```: single-precision, device pointers (i.e. on GPU). +- ```cuSZp_cpu_f64_api```: double-precision, host pointers (i.e. on CPU). +- ```cuSZp_gpu_f64_api```: double-precision, device pointers (i.e. on GPU). + +To use those binaries, try following commands. 
+We here use RTM pressure_2000 dataset (1.4 GB, 1008x1008x352) for single-precision example, and NWChem acd-tst.bin.d64 (6.0 GB) for double-precision example. +```shell +# Example for single-precision API +# ./cuSZp_gpu_f32_api TARGET_HPC_DATASET ERROR_MODE ERROR_BOUND +# ABS or REL +$ ./cuSZp_gpu_f32_api ./pressure_2000 REL 1e-4 +cuSZp finished! +cuSZp compression end-to-end speed: 151.564649 GB/s +cuSZp decompression end-to-end speed: 232.503219 GB/s +cuSZp compression ratio: 13.003452 + +Pass error check! +$ +# Example for double-precision API +# ./cuSZp_gpu_f64_api TARGET_HPC_DATASET ERROR_MODE ERROR_BOUND +# ABS or REL +$ ./cuSZp_gpu_f64_api ./acd-tst.bin.d64 ABS 1E-8 +cuSZp finished! +cuSZp compression end-to-end speed: 110.117965 GB/s +cuSZp decompression end-to-end speed: 222.743097 GB/s +cuSZp compression ratio: 3.990585 + +Pass error check! +``` +More HPC dataset can be downloaded from [SDRBench](https://sdrbench.github.io/). + +## Using cuSZp as an Internal API +This repository provides several examples for using cuSZp compression and decompression for different scenarios (device pointer? host pointer? f32 or f64?). +The examples can be found in ```cuSZp/examples/```. +Assuming your original data, compressed data, and reconstructed data are all device pointers (allocated on GPU), and the data type is single-precision. The compression and decompression APIs can be called as below: +```C++ +// For measuring the end-to-end throughput. +TimingGPU timer_GPU; + +// cuSZp compression. +timer_GPU.StartCounter(); // set timer +SZp_compress_deviceptr_f32(d_oriData, d_cmpBytes, nbEle, &cmpSize, errorBound, stream); +float cmpTime = timer_GPU.GetCounter(); + +// cuSZp decompression. +timer_GPU.StartCounter(); // set timer +SZp_decompress_deviceptr_f32(d_decData, d_cmpBytes, nbEle, cmpSize, errorBound, stream); +float decTime = timer_GPU.GetCounter(); +``` +More details can be checked in: +- **f32-hostptr**: ```cuSZp/examples/cuSZp_cpu_f32_api.cpp```. +- **f32-deviceptr**: ```cuSZp/examples/cuSZp_gpu_f32_api.cpp```. +- **f64-hostptr**: ```cuSZp/examples/cuSZp_cpu_f64_api.cpp```. +- **f64-deviceptr**: ```cuSZp/examples/cuSZp_gpu_f64_api.cpp```. + +## Citation +```bibtex +@inproceedings{cuSZp2023huang, + title = {cuSZp: An Ultra-Fast GPU Error-Bounded Lossy Compression Framework with Optimized End-to-End Performance} + author = {Huang, Yafan and Di, Sheng and Yu, Xiaodong and Li, Guanpeng and Cappello, Franck}, + year = {2023}, + isbn = {979-8-4007-0109-2/23/11}, + publisher = {Association for Computing Machinery}, + address = {Denver, CO, USA}, + doi = {10.1145/3581784.3607048}, + booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis}, + keywords = {Lossy compression; parallel computing; HPC; GPU}, + series = {SC'23} +} +``` + +## Copyright +(C) 2023 by Argonne National Laboratory and University of Iowa. More details see [COPYRIGHT](https://github.com/szcompressor/cuSZp/blob/master/LICENSE). + +## Acknowledgement +This research was supported by the Exascale Computing Project (ECP), Project Number: 17-SC-20-SC, a collaborative effort of two DOE organizations – the Office of Science and the National Nuclear Security Administration, responsible for the planning and preparation of a capable exascale ecosystem, including software, applications, hardware, advanced system engineering and early testbed platforms, to support the nation’s exascale computing imperative. The material was supported by the U.S. 
Department of Energy, Office of Science, Advanced Scientific Computing Research (ASCR), under contract DE-AC02-06CH11357, and supported by the National Science Foundation under Grant OAC-2003709 and OAC-2104023. We acknowledge the computing resources provided on Bebop (operated by Laboratory Computing Resource Center at Argonne) and on Theta and JLSE (operated by Argonne Leadership Computing Facility). We acknowledge the support of ARAMCO. diff --git a/qtensor/compression/cuszp/cuSZp/cmake/Installing.cmake b/qtensor/compression/cuszp/cuSZp/cmake/Installing.cmake index 8a635ca6..cd5a27d0 100644 --- a/qtensor/compression/cuszp/cuSZp/cmake/Installing.cmake +++ b/qtensor/compression/cuszp/cuSZp/cmake/Installing.cmake @@ -1,67 +1,67 @@ -include(GNUInstallDirs) - -if(DEFINED CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) - message( - STATUS - "CMAKE_INSTALL_PREFIX is not set\n" - "Default value: ${CMAKE_INSTALL_PREFIX}\n" - "Will set it to ${CMAKE_SOURCE_DIR}/install" - ) - set(CMAKE_INSTALL_PREFIX - "${CMAKE_SOURCE_DIR}/install" - CACHE PATH "Where the library will be installed to" FORCE - ) -else() - message( - STATUS - "CMAKE_INSTALL_PREFIX was already set\n" - "Current value: ${CMAKE_INSTALL_PREFIX}" - ) -endif() - -set_target_properties(${PROJECT_NAME} PROPERTIES PUBLIC_HEADER "${public_headers}") - -set_target_properties(${PROJECT_NAME} PROPERTIES DEBUG_POSTFIX "d") - -install(TARGETS ${PROJECT_NAME} - EXPORT "${PROJECT_NAME}Targets" - # these get default values from GNUInstallDirs, no need to set them - #RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} # bin - #LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} # lib - #ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} # lib - # except for public headers, as we want them to be inside a library folder - PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME} # include/SomeProject - INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} # include - ) - -# generate and install export file -install(EXPORT "${PROJECT_NAME}Targets" - FILE "${PROJECT_NAME}Targets.cmake" - NAMESPACE ${namespace}:: - DESTINATION cmake - ) - -include(CMakePackageConfigHelpers) - -# generate the version file for the config file -write_basic_package_version_file( - "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake" - VERSION "${version}" - COMPATIBILITY AnyNewerVersion -) -# create config file -configure_package_config_file(${CMAKE_CURRENT_SOURCE_DIR}/Config.cmake.in - "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake" - INSTALL_DESTINATION cmake - ) -# install config files -install(FILES - "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake" - "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake" - DESTINATION cmake - ) -# generate the export targets for the build tree -export(EXPORT "${PROJECT_NAME}Targets" - FILE "${CMAKE_CURRENT_BINARY_DIR}/cmake/${PROJECT_NAME}Targets.cmake" - NAMESPACE ${namespace}:: - ) +include(GNUInstallDirs) + +if(DEFINED CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) + message( + STATUS + "CMAKE_INSTALL_PREFIX is not set\n" + "Default value: ${CMAKE_INSTALL_PREFIX}\n" + "Will set it to ${CMAKE_SOURCE_DIR}/install" + ) + set(CMAKE_INSTALL_PREFIX + "${CMAKE_SOURCE_DIR}/install" + CACHE PATH "Where the library will be installed to" FORCE + ) +else() + message( + STATUS + "CMAKE_INSTALL_PREFIX was already set\n" + "Current value: ${CMAKE_INSTALL_PREFIX}" + ) +endif() + +set_target_properties(${PROJECT_NAME} PROPERTIES PUBLIC_HEADER "${public_headers}") + +set_target_properties(${PROJECT_NAME} PROPERTIES DEBUG_POSTFIX 
"d") + +install(TARGETS ${PROJECT_NAME} + EXPORT "${PROJECT_NAME}Targets" + # these get default values from GNUInstallDirs, no need to set them + #RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} # bin + #LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} # lib + #ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} # lib + # except for public headers, as we want them to be inside a library folder + PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME} # include/SomeProject + INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} # include + ) + +# generate and install export file +install(EXPORT "${PROJECT_NAME}Targets" + FILE "${PROJECT_NAME}Targets.cmake" + NAMESPACE ${namespace}:: + DESTINATION cmake + ) + +include(CMakePackageConfigHelpers) + +# generate the version file for the config file +write_basic_package_version_file( + "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake" + VERSION "${version}" + COMPATIBILITY AnyNewerVersion +) +# create config file +configure_package_config_file(${CMAKE_CURRENT_SOURCE_DIR}/Config.cmake.in + "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake" + INSTALL_DESTINATION cmake + ) +# install config files +install(FILES + "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake" + "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake" + DESTINATION cmake + ) +# generate the export targets for the build tree +export(EXPORT "${PROJECT_NAME}Targets" + FILE "${CMAKE_CURRENT_BINARY_DIR}/cmake/${PROJECT_NAME}Targets.cmake" + NAMESPACE ${namespace}:: + ) diff --git a/qtensor/compression/cuszp/cuSZp/examples/CMakeLists.txt b/qtensor/compression/cuszp/cuSZp/examples/CMakeLists.txt index e5484362..8de5b50d 100644 --- a/qtensor/compression/cuszp/cuSZp/examples/CMakeLists.txt +++ b/qtensor/compression/cuszp/cuSZp/examples/CMakeLists.txt @@ -1,45 +1,45 @@ -# Find CUDA package -find_package(CUDA REQUIRED) - -set(install_dir ${PROJECT_BINARY_DIR}/examples/bin) -set(execName_gpu_f32 "cuSZp_gpu_f32_api") -set(execName_cpu_f32 "cuSZp_cpu_f32_api") -set(execName_gpu_f64 "cuSZp_gpu_f64_api") -set(execName_cpu_f64 "cuSZp_cpu_f64_api") -set(SRC_DIR ${PROJECT_SOURCE_DIR}/src) -set(INCLUDE_DIR ${PROJECT_SOURCE_DIR}/include) - -# Add include and library directories -include_directories(${INCLUDE_DIR}) - -# Compile headers as a library -cuda_add_library(cuSZp_libs STATIC ${SRC_DIR}/cuSZp_f32.cu - ${SRC_DIR}/cuSZp_f64.cu - ${SRC_DIR}/cuSZp_utility.cu - ${SRC_DIR}/cuSZp_timer.cu - ${SRC_DIR}/cuSZp_entry_f32.cu - ${SRC_DIR}/cuSZp_entry_f64.cu) - -# Compile executable binary -cuda_add_executable(${execName_gpu_f32} cuSZp_gpu_f32_api.cpp) -cuda_add_executable(${execName_cpu_f32} cuSZp_cpu_f32_api.cpp) -cuda_add_executable(${execName_gpu_f64} cuSZp_gpu_f64_api.cpp) -cuda_add_executable(${execName_cpu_f64} cuSZp_cpu_f64_api.cpp) - -# Link with headers -target_link_libraries(${execName_gpu_f32} cuSZp_libs) -target_link_libraries(${execName_cpu_f32} cuSZp_libs) -target_link_libraries(${execName_gpu_f64} cuSZp_libs) -target_link_libraries(${execName_cpu_f64} cuSZp_libs) - -# Set output paths for the compiled binary -set_target_properties(${execName_gpu_f32} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${install_dir}) -set_target_properties(${execName_cpu_f32} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${install_dir}) -set_target_properties(${execName_gpu_f64} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${install_dir}) -set_target_properties(${execName_cpu_f64} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${install_dir}) - -# Set installation paths for the compiled binary. 
-install(TARGETS ${execName_gpu_f32} DESTINATION bin) -install(TARGETS ${execName_cpu_f32} DESTINATION bin) -install(TARGETS ${execName_gpu_f64} DESTINATION bin) +# Find CUDA package +find_package(CUDA REQUIRED) + +set(install_dir ${PROJECT_BINARY_DIR}/examples/bin) +set(execName_gpu_f32 "cuSZp_gpu_f32_api") +set(execName_cpu_f32 "cuSZp_cpu_f32_api") +set(execName_gpu_f64 "cuSZp_gpu_f64_api") +set(execName_cpu_f64 "cuSZp_cpu_f64_api") +set(SRC_DIR ${PROJECT_SOURCE_DIR}/src) +set(INCLUDE_DIR ${PROJECT_SOURCE_DIR}/include) + +# Add include and library directories +include_directories(${INCLUDE_DIR}) + +# Compile headers as a library +cuda_add_library(cuSZp_libs STATIC ${SRC_DIR}/cuSZp_f32.cu + ${SRC_DIR}/cuSZp_f64.cu + ${SRC_DIR}/cuSZp_utility.cu + ${SRC_DIR}/cuSZp_timer.cu + ${SRC_DIR}/cuSZp_entry_f32.cu + ${SRC_DIR}/cuSZp_entry_f64.cu) + +# Compile executable binary +cuda_add_executable(${execName_gpu_f32} cuSZp_gpu_f32_api.cpp) +cuda_add_executable(${execName_cpu_f32} cuSZp_cpu_f32_api.cpp) +cuda_add_executable(${execName_gpu_f64} cuSZp_gpu_f64_api.cpp) +cuda_add_executable(${execName_cpu_f64} cuSZp_cpu_f64_api.cpp) + +# Link with headers +target_link_libraries(${execName_gpu_f32} cuSZp_libs) +target_link_libraries(${execName_cpu_f32} cuSZp_libs) +target_link_libraries(${execName_gpu_f64} cuSZp_libs) +target_link_libraries(${execName_cpu_f64} cuSZp_libs) + +# Set output paths for the compiled binary +set_target_properties(${execName_gpu_f32} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${install_dir}) +set_target_properties(${execName_cpu_f32} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${install_dir}) +set_target_properties(${execName_gpu_f64} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${install_dir}) +set_target_properties(${execName_cpu_f64} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${install_dir}) + +# Set installation paths for the compiled binary. +install(TARGETS ${execName_gpu_f32} DESTINATION bin) +install(TARGETS ${execName_cpu_f32} DESTINATION bin) +install(TARGETS ${execName_gpu_f64} DESTINATION bin) install(TARGETS ${execName_cpu_f64} DESTINATION bin) \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/examples/cuSZp_cpu_f32_api.cpp b/qtensor/compression/cuszp/cuSZp/examples/cuSZp_cpu_f32_api.cpp index f543f59c..e4d63c27 100644 --- a/qtensor/compression/cuszp/cuSZp/examples/cuSZp_cpu_f32_api.cpp +++ b/qtensor/compression/cuszp/cuSZp/examples/cuSZp_cpu_f32_api.cpp @@ -1,83 +1,83 @@ -#include -#include -#include -#include -#include -#include - -int main(int argc, char* argv[]) -{ - // Read input information. - char oriFilePath[640]; - char errorMode[20]; - int status=0; - if(argc != 4) - { - printf("Usage: cuSZp_cpu_f32_api [srcFilePath] [errorMode] [errBound] # errorMode can only be ABS or REL\n"); - printf("Example: cuSZp_cpu_f32_api testfloat_8_8_128.dat ABS 1E-2 # compress dataset with absolute 1E-2 error bound\n"); - printf(" cuSZp_cpu_f32_api testfloat_8_8_128.dat REL 1e-3 # compress dataset with relative 1E-3 error bound\n"); - exit(0); - } - sprintf(oriFilePath, "%s", argv[1]); - sprintf(errorMode, "%s", argv[2]); - float errorBound = atof(argv[3]); - - // Input data preparation. - float* oriData = NULL; - float* decData = NULL; - unsigned char* cmpBytes = NULL; - size_t nbEle = 0; - size_t cmpSize = 0; - oriData = readFloatData_Yafan(oriFilePath, &nbEle, &status); - decData = (float*)malloc(nbEle*sizeof(float)); - cmpBytes = (unsigned char*)malloc(nbEle*sizeof(float)); - - // Generating error bounds. 
- if(strcmp(errorMode, "REL")==0) - { - float max_val = oriData[0]; - float min_val = oriData[0]; - for(size_t i=0; imax_val) - max_val = oriData[i]; - else if(oriData[i] errorBound*1.1) - { - not_bound++; - // printf("not bound: %zu oriData: %f, decData: %f, errors: %f, bound: %f\n", i, oriData[i], decData[i], abs(oriData[i]-decData[i]), errBound); - } - } - if(!not_bound) printf("\033[0;32mPass error check!\033[0m\n"); - else printf("\033[0;31mFail error check!\033[0m\n"); - - // Free allocated data. - free(oriData); - free(decData); - free(cmpBytes); - return 0; +#include +#include +#include +#include +#include +#include + +int main(int argc, char* argv[]) +{ + // Read input information. + char oriFilePath[640]; + char errorMode[20]; + int status=0; + if(argc != 4) + { + printf("Usage: cuSZp_cpu_f32_api [srcFilePath] [errorMode] [errBound] # errorMode can only be ABS or REL\n"); + printf("Example: cuSZp_cpu_f32_api testfloat_8_8_128.dat ABS 1E-2 # compress dataset with absolute 1E-2 error bound\n"); + printf(" cuSZp_cpu_f32_api testfloat_8_8_128.dat REL 1e-3 # compress dataset with relative 1E-3 error bound\n"); + exit(0); + } + sprintf(oriFilePath, "%s", argv[1]); + sprintf(errorMode, "%s", argv[2]); + float errorBound = atof(argv[3]); + + // Input data preparation. + float* oriData = NULL; + float* decData = NULL; + unsigned char* cmpBytes = NULL; + size_t nbEle = 0; + size_t cmpSize = 0; + oriData = readFloatData_Yafan(oriFilePath, &nbEle, &status); + decData = (float*)malloc(nbEle*sizeof(float)); + cmpBytes = (unsigned char*)malloc(nbEle*sizeof(float)); + + // Generating error bounds. + if(strcmp(errorMode, "REL")==0) + { + float max_val = oriData[0]; + float min_val = oriData[0]; + for(size_t i=0; imax_val) + max_val = oriData[i]; + else if(oriData[i] errorBound*1.1) + { + not_bound++; + // printf("not bound: %zu oriData: %f, decData: %f, errors: %f, bound: %f\n", i, oriData[i], decData[i], abs(oriData[i]-decData[i]), errBound); + } + } + if(!not_bound) printf("\033[0;32mPass error check!\033[0m\n"); + else printf("\033[0;31mFail error check!\033[0m\n"); + + // Free allocated data. + free(oriData); + free(decData); + free(cmpBytes); + return 0; } \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/examples/cuSZp_cpu_f64_api.cpp b/qtensor/compression/cuszp/cuSZp/examples/cuSZp_cpu_f64_api.cpp index 6ed6adb1..5dcf6788 100644 --- a/qtensor/compression/cuszp/cuSZp/examples/cuSZp_cpu_f64_api.cpp +++ b/qtensor/compression/cuszp/cuSZp/examples/cuSZp_cpu_f64_api.cpp @@ -1,83 +1,83 @@ -#include -#include -#include -#include -#include -#include - -int main(int argc, char* argv[]) -{ - // Read input information. - char oriFilePath[640]; - char errorMode[20]; - int status=0; - if(argc != 4) - { - printf("Usage: cuSZp_cpu_f64_api [srcFilePath] [errorMode] [errBound] # errorMode can only be ABS or REL\n"); - printf("Example: cuSZp_cpu_f64_api testdouble_8_8_128.dat ABS 1E-2 # compress dataset with absolute 1E-2 error bound\n"); - printf(" cuSZp_cpu_f64_api testdouble_8_8_128.dat REL 1e-3 # compress dataset with relative 1E-3 error bound\n"); - exit(0); - } - sprintf(oriFilePath, "%s", argv[1]); - sprintf(errorMode, "%s", argv[2]); - double errorBound = atof(argv[3]); - - // Input data preparation. 
- double* oriData = NULL; - double* decData = NULL; - unsigned char* cmpBytes = NULL; - size_t nbEle = 0; - size_t cmpSize = 0; - oriData = readDoubleData_Yafan(oriFilePath, &nbEle, &status); - decData = (double*)malloc(nbEle*sizeof(double)); - cmpBytes = (unsigned char*)malloc(nbEle*sizeof(double)); - - // Generating error bounds. - if(strcmp(errorMode, "REL")==0) - { - double max_val = oriData[0]; - double min_val = oriData[0]; - for(size_t i=0; imax_val) - max_val = oriData[i]; - else if(oriData[i] errorBound*1.1) - { - not_bound++; - // printf("not bound: %zu oriData: %f, decData: %f, errors: %f, bound: %f\n", i, oriData[i], decData[i], abs(oriData[i]-decData[i]), errBound); - } - } - if(!not_bound) printf("\033[0;32mPass error check!\033[0m\n"); - else printf("\033[0;31mFail error check!\033[0m\n"); - - // Free allocated data. - free(oriData); - free(decData); - free(cmpBytes); - return 0; +#include +#include +#include +#include +#include +#include + +int main(int argc, char* argv[]) +{ + // Read input information. + char oriFilePath[640]; + char errorMode[20]; + int status=0; + if(argc != 4) + { + printf("Usage: cuSZp_cpu_f64_api [srcFilePath] [errorMode] [errBound] # errorMode can only be ABS or REL\n"); + printf("Example: cuSZp_cpu_f64_api testdouble_8_8_128.dat ABS 1E-2 # compress dataset with absolute 1E-2 error bound\n"); + printf(" cuSZp_cpu_f64_api testdouble_8_8_128.dat REL 1e-3 # compress dataset with relative 1E-3 error bound\n"); + exit(0); + } + sprintf(oriFilePath, "%s", argv[1]); + sprintf(errorMode, "%s", argv[2]); + double errorBound = atof(argv[3]); + + // Input data preparation. + double* oriData = NULL; + double* decData = NULL; + unsigned char* cmpBytes = NULL; + size_t nbEle = 0; + size_t cmpSize = 0; + oriData = readDoubleData_Yafan(oriFilePath, &nbEle, &status); + decData = (double*)malloc(nbEle*sizeof(double)); + cmpBytes = (unsigned char*)malloc(nbEle*sizeof(double)); + + // Generating error bounds. + if(strcmp(errorMode, "REL")==0) + { + double max_val = oriData[0]; + double min_val = oriData[0]; + for(size_t i=0; imax_val) + max_val = oriData[i]; + else if(oriData[i] errorBound*1.1) + { + not_bound++; + // printf("not bound: %zu oriData: %f, decData: %f, errors: %f, bound: %f\n", i, oriData[i], decData[i], abs(oriData[i]-decData[i]), errBound); + } + } + if(!not_bound) printf("\033[0;32mPass error check!\033[0m\n"); + else printf("\033[0;31mFail error check!\033[0m\n"); + + // Free allocated data. + free(oriData); + free(decData); + free(cmpBytes); + return 0; } \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/examples/cuSZp_gpu_f32_api.cpp b/qtensor/compression/cuszp/cuSZp/examples/cuSZp_gpu_f32_api.cpp index 7c54199d..96722d2b 100644 --- a/qtensor/compression/cuszp/cuSZp/examples/cuSZp_gpu_f32_api.cpp +++ b/qtensor/compression/cuszp/cuSZp/examples/cuSZp_gpu_f32_api.cpp @@ -1,119 +1,119 @@ -#include -#include -#include -#include -#include -#include -#include -#include - -int main(int argc, char* argv[]) -{ - // Read input information. 
- char oriFilePath[640]; - char errorMode[20]; - int status=0; - if(argc != 4) - { - printf("Usage: cuSZp_gpu_f32_api [srcFilePath] [errorMode] [errBound] # errorMode can only be ABS or REL\n"); - printf("Example: cuSZp_gpu_f32_api testfloat_8_8_128.dat ABS 1E-2 # compress dataset with absolute 1E-2 error bound\n"); - printf(" cuSZp_gpu_f32_api testfloat_8_8_128.dat REL 1e-3 # compress dataset with relative 1E-3 error bound\n"); - exit(0); - } - sprintf(oriFilePath, "%s", argv[1]); - sprintf(errorMode, "%s", argv[2]); - float errorBound = atof(argv[3]); - - // For measuring the end-to-end throughput. - TimingGPU timer_GPU; - - // Input data preparation on CPU. - float* oriData = NULL; - float* decData = NULL; - unsigned char* cmpBytes = NULL; - size_t nbEle = 0; - size_t cmpSize = 0; - oriData = readFloatData_Yafan(oriFilePath, &nbEle, &status); - decData = (float*)malloc(nbEle*sizeof(float)); - cmpBytes = (unsigned char*)malloc(nbEle*sizeof(float)); - - // Generating error bounds. - if(strcmp(errorMode, "REL")==0) - { - float max_val = oriData[0]; - float min_val = oriData[0]; - for(size_t i=0; imax_val) - max_val = oriData[i]; - else if(oriData[i] errorBound*1.1) - { - not_bound++; - // printf("not bound: %zu oriData: %f, decData: %f, errors: %f, bound: %f\n", i, oriData[i], decData[i], abs(oriData[i]-decData[i]), errBound); - } - } - if(!not_bound) printf("\033[0;32mPass error check!\033[0m\n"); - else printf("\033[0;31mFail error check!\033[0m\n"); - - // Free allocated data. - free(oriData); - free(decData); - free(cmpBytes); - cudaFree(d_oriData); - cudaFree(d_decData); - cudaFree(d_cmpBytes); - cudaStreamDestroy(stream); - return 0; +#include +#include +#include +#include +#include +#include +#include +#include + +int main(int argc, char* argv[]) +{ + // Read input information. + char oriFilePath[640]; + char errorMode[20]; + int status=0; + if(argc != 4) + { + printf("Usage: cuSZp_gpu_f32_api [srcFilePath] [errorMode] [errBound] # errorMode can only be ABS or REL\n"); + printf("Example: cuSZp_gpu_f32_api testfloat_8_8_128.dat ABS 1E-2 # compress dataset with absolute 1E-2 error bound\n"); + printf(" cuSZp_gpu_f32_api testfloat_8_8_128.dat REL 1e-3 # compress dataset with relative 1E-3 error bound\n"); + exit(0); + } + sprintf(oriFilePath, "%s", argv[1]); + sprintf(errorMode, "%s", argv[2]); + float errorBound = atof(argv[3]); + + // For measuring the end-to-end throughput. + TimingGPU timer_GPU; + + // Input data preparation on CPU. + float* oriData = NULL; + float* decData = NULL; + unsigned char* cmpBytes = NULL; + size_t nbEle = 0; + size_t cmpSize = 0; + oriData = readFloatData_Yafan(oriFilePath, &nbEle, &status); + decData = (float*)malloc(nbEle*sizeof(float)); + cmpBytes = (unsigned char*)malloc(nbEle*sizeof(float)); + + // Generating error bounds. + if(strcmp(errorMode, "REL")==0) + { + float max_val = oriData[0]; + float min_val = oriData[0]; + for(size_t i=0; imax_val) + max_val = oriData[i]; + else if(oriData[i] errorBound*1.1) + { + not_bound++; + // printf("not bound: %zu oriData: %f, decData: %f, errors: %f, bound: %f\n", i, oriData[i], decData[i], abs(oriData[i]-decData[i]), errBound); + } + } + if(!not_bound) printf("\033[0;32mPass error check!\033[0m\n"); + else printf("\033[0;31mFail error check!\033[0m\n"); + + // Free allocated data. 
+ free(oriData); + free(decData); + free(cmpBytes); + cudaFree(d_oriData); + cudaFree(d_decData); + cudaFree(d_cmpBytes); + cudaStreamDestroy(stream); + return 0; } \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/examples/cuSZp_gpu_f64_api.cpp b/qtensor/compression/cuszp/cuSZp/examples/cuSZp_gpu_f64_api.cpp index 3c03df17..7af2f303 100644 --- a/qtensor/compression/cuszp/cuSZp/examples/cuSZp_gpu_f64_api.cpp +++ b/qtensor/compression/cuszp/cuSZp/examples/cuSZp_gpu_f64_api.cpp @@ -1,120 +1,120 @@ -#include -#include -#include -#include -#include -#include -#include -#include - -int main(int argc, char* argv[]) -{ - // Read input information. - char oriFilePath[640]; - char errorMode[20]; - int status=0; - if(argc != 4) - { - printf("Usage: cuSZp_gpu_f64_api [srcFilePath] [errorMode] [errBound] # errorMode can only be ABS or REL\n"); - printf("Example: cuSZp_gpu_f64_api testdouble_8_8_128.dat ABS 1E-2 # compress dataset with absolute 1E-2 error bound\n"); - printf(" cuSZp_gpu_f64_api testdouble_8_8_128.dat REL 1e-3 # compress dataset with relative 1E-3 error bound\n"); - exit(0); - } - sprintf(oriFilePath, "%s", argv[1]); - sprintf(errorMode, "%s", argv[2]); - double errorBound = atof(argv[3]); - - // For measuring the end-to-end throughput. - TimingGPU timer_GPU; - - // Input data preparation on CPU. - double* oriData = NULL; - double* decData = NULL; - unsigned char* cmpBytes = NULL; - size_t nbEle = 0; - size_t cmpSize = 0; - oriData = readDoubleData_Yafan(oriFilePath, &nbEle, &status); - decData = (double*)malloc(nbEle*sizeof(double)); - cmpBytes = (unsigned char*)malloc(nbEle*sizeof(double)); - - // Generating error bounds. - if(strcmp(errorMode, "REL")==0) - { - double max_val = oriData[0]; - double min_val = oriData[0]; - for(size_t i=0; imax_val) - max_val = oriData[i]; - else if(oriData[i] errorBound*1.1) - { - not_bound++; - // printf("not bound: %zu oriData: %f, decData: %f, errors: %f, bound: %f\n", i, oriData[i], decData[i], abs(oriData[i]-decData[i]), errBound); - } - } - if(!not_bound) printf("\033[0;32mPass error check!\033[0m\n"); - else printf("\033[0;31mFail error check!\033[0m\n"); - - // Free allocated data. - free(oriData); - free(decData); - free(cmpBytes); - cudaFree(d_oriData); - cudaFree(d_decData); - cudaFree(d_cmpBytes); - cudaStreamDestroy(stream); - - return 0; +#include +#include +#include +#include +#include +#include +#include +#include + +int main(int argc, char* argv[]) +{ + // Read input information. + char oriFilePath[640]; + char errorMode[20]; + int status=0; + if(argc != 4) + { + printf("Usage: cuSZp_gpu_f64_api [srcFilePath] [errorMode] [errBound] # errorMode can only be ABS or REL\n"); + printf("Example: cuSZp_gpu_f64_api testdouble_8_8_128.dat ABS 1E-2 # compress dataset with absolute 1E-2 error bound\n"); + printf(" cuSZp_gpu_f64_api testdouble_8_8_128.dat REL 1e-3 # compress dataset with relative 1E-3 error bound\n"); + exit(0); + } + sprintf(oriFilePath, "%s", argv[1]); + sprintf(errorMode, "%s", argv[2]); + double errorBound = atof(argv[3]); + + // For measuring the end-to-end throughput. + TimingGPU timer_GPU; + + // Input data preparation on CPU. + double* oriData = NULL; + double* decData = NULL; + unsigned char* cmpBytes = NULL; + size_t nbEle = 0; + size_t cmpSize = 0; + oriData = readDoubleData_Yafan(oriFilePath, &nbEle, &status); + decData = (double*)malloc(nbEle*sizeof(double)); + cmpBytes = (unsigned char*)malloc(nbEle*sizeof(double)); + + // Generating error bounds. 
+ if(strcmp(errorMode, "REL")==0) + { + double max_val = oriData[0]; + double min_val = oriData[0]; + for(size_t i=0; imax_val) + max_val = oriData[i]; + else if(oriData[i] errorBound*1.1) + { + not_bound++; + // printf("not bound: %zu oriData: %f, decData: %f, errors: %f, bound: %f\n", i, oriData[i], decData[i], abs(oriData[i]-decData[i]), errBound); + } + } + if(!not_bound) printf("\033[0;32mPass error check!\033[0m\n"); + else printf("\033[0;31mFail error check!\033[0m\n"); + + // Free allocated data. + free(oriData); + free(decData); + free(cmpBytes); + cudaFree(d_oriData); + cudaFree(d_decData); + cudaFree(d_cmpBytes); + cudaStreamDestroy(stream); + + return 0; } \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/include/cuSZp_entry_f32.h b/qtensor/compression/cuszp/cuSZp/include/cuSZp_entry_f32.h index 4300d26c..5b77d73c 100644 --- a/qtensor/compression/cuszp/cuSZp/include/cuSZp_entry_f32.h +++ b/qtensor/compression/cuszp/cuSZp/include/cuSZp_entry_f32.h @@ -1,11 +1,11 @@ -#ifndef CUSZP_INCLUDE_CUSZP_ENTRY_F32_H -#define CUSZP_INCLUDE_CUSZP_ENTRY_F32_H - -#include - -void SZp_compress_hostptr_f32(float* oriData, unsigned char* cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound); -void SZp_decompress_hostptr_f32(float* decData, unsigned char* cmpBytes, size_t nbEle, size_t cmpSize, float errorBound); -void SZp_compress_deviceptr_f32(float* d_oriData, unsigned char* d_cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound, cudaStream_t stream = 0); -void SZp_decompress_deviceptr_f32(float* d_decData, unsigned char* d_cmpBytes, size_t nbEle, size_t cmpSize, float errorBound, cudaStream_t stream = 0); - +#ifndef CUSZP_INCLUDE_CUSZP_ENTRY_F32_H +#define CUSZP_INCLUDE_CUSZP_ENTRY_F32_H + +#include + +void SZp_compress_hostptr_f32(float* oriData, unsigned char* cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound); +void SZp_decompress_hostptr_f32(float* decData, unsigned char* cmpBytes, size_t nbEle, size_t cmpSize, float errorBound); +void SZp_compress_deviceptr_f32(float* d_oriData, unsigned char* d_cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound, cudaStream_t stream = 0); +void SZp_decompress_deviceptr_f32(float* d_decData, unsigned char* d_cmpBytes, size_t nbEle, size_t cmpSize, float errorBound, cudaStream_t stream = 0); + #endif // CUSZP_INCLUDE_CUSZP_ENTRY_F32_H \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/include/cuSZp_entry_f64.h b/qtensor/compression/cuszp/cuSZp/include/cuSZp_entry_f64.h index 29837263..6a591acd 100644 --- a/qtensor/compression/cuszp/cuSZp/include/cuSZp_entry_f64.h +++ b/qtensor/compression/cuszp/cuSZp/include/cuSZp_entry_f64.h @@ -1,11 +1,11 @@ -#ifndef CUSZP_INCLUDE_CUSZP_ENTRY_F64_H -#define CUSZP_INCLUDE_CUSZP_ENTRY_F64_H - -#include - -void SZp_compress_hostptr_f64(double* oriData, unsigned char* cmpBytes, size_t nbEle, size_t* cmpSize, double errorBound); -void SZp_decompress_hostptr_f64(double* decData, unsigned char* cmpBytes, size_t nbEle, size_t cmpSize, double errorBound); -void SZp_compress_deviceptr_f64(double* d_oriData, unsigned char* d_cmpBytes, size_t nbEle, size_t* cmpSize, double errorBound, cudaStream_t stream = 0); -void SZp_decompress_deviceptr_f64(double* d_decData, unsigned char* d_cmpBytes, size_t nbEle, size_t cmpSize, double errorBound, cudaStream_t stream = 0); - +#ifndef CUSZP_INCLUDE_CUSZP_ENTRY_F64_H +#define CUSZP_INCLUDE_CUSZP_ENTRY_F64_H + +#include + +void SZp_compress_hostptr_f64(double* oriData, unsigned char* cmpBytes, size_t nbEle, size_t* cmpSize, 
double errorBound); +void SZp_decompress_hostptr_f64(double* decData, unsigned char* cmpBytes, size_t nbEle, size_t cmpSize, double errorBound); +void SZp_compress_deviceptr_f64(double* d_oriData, unsigned char* d_cmpBytes, size_t nbEle, size_t* cmpSize, double errorBound, cudaStream_t stream = 0); +void SZp_decompress_deviceptr_f64(double* d_decData, unsigned char* d_cmpBytes, size_t nbEle, size_t cmpSize, double errorBound, cudaStream_t stream = 0); + #endif // CUSZP_INCLUDE_CUSZP_ENTRY_F64_H \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/include/cuSZp_f32.h b/qtensor/compression/cuszp/cuSZp/include/cuSZp_f32.h index fa91cf50..c69d349a 100644 --- a/qtensor/compression/cuszp/cuSZp/include/cuSZp_f32.h +++ b/qtensor/compression/cuszp/cuSZp/include/cuSZp_f32.h @@ -1,12 +1,12 @@ -#ifndef CUSZP_INCLUDE_CUSZP_F32_H -#define CUSZP_INCLUDE_CUSZP_F32_H - -static const int cmp_tblock_size_f32 = 32; -static const int dec_tblock_size_f32 = 32; -static const int cmp_chunk_f32 = 256; -static const int dec_chunk_f32 = 256; - -__global__ void SZp_compress_kernel_f32(const float* const __restrict__ oriData, unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const float eb, const size_t nbEle); -__global__ void SZp_decompress_kernel_f32(float* const __restrict__ decData, const unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const float eb, const size_t nbEle); - +#ifndef CUSZP_INCLUDE_CUSZP_F32_H +#define CUSZP_INCLUDE_CUSZP_F32_H + +static const int cmp_tblock_size_f32 = 32; +static const int dec_tblock_size_f32 = 32; +static const int cmp_chunk_f32 = 256; +static const int dec_chunk_f32 = 256; + +__global__ void SZp_compress_kernel_f32(const float* const __restrict__ oriData, unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const float eb, const size_t nbEle); +__global__ void SZp_decompress_kernel_f32(float* const __restrict__ decData, const unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const float eb, const size_t nbEle); + #endif // CUSZP_INCLUDE_CUSZP_F32_H \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/include/cuSZp_f64.h b/qtensor/compression/cuszp/cuSZp/include/cuSZp_f64.h index c26f5ee6..d1cc1b43 100644 --- a/qtensor/compression/cuszp/cuSZp/include/cuSZp_f64.h +++ b/qtensor/compression/cuszp/cuSZp/include/cuSZp_f64.h @@ -1,12 +1,12 @@ -#ifndef CUSZP_INCLUDE_CUSZP_F64_H -#define CUSZP_INCLUDE_CUSZP_F64_H - -static const int cmp_tblock_size_f64 = 32; -static const int dec_tblock_size_f64 = 32; -static const int cmp_chunk_f64 = 8192; -static const int dec_chunk_f64 = 8192; - -__global__ void SZp_compress_kernel_f64(const double* const __restrict__ oriData, unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const double eb, const size_t nbEle); -__global__ void SZp_decompress_kernel_f64(double* const __restrict__ decData, const unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const double eb, const size_t nbEle); - +#ifndef CUSZP_INCLUDE_CUSZP_F64_H +#define CUSZP_INCLUDE_CUSZP_F64_H + +static const int cmp_tblock_size_f64 = 32; +static const 
int dec_tblock_size_f64 = 32; +static const int cmp_chunk_f64 = 8192; +static const int dec_chunk_f64 = 8192; + +__global__ void SZp_compress_kernel_f64(const double* const __restrict__ oriData, unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const double eb, const size_t nbEle); +__global__ void SZp_decompress_kernel_f64(double* const __restrict__ decData, const unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const double eb, const size_t nbEle); + #endif // CUSZP_INCLUDE_CUSZP_F64_H \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/include/cuSZp_timer.h b/qtensor/compression/cuszp/cuSZp/include/cuSZp_timer.h index faca61c3..2777a919 100644 --- a/qtensor/compression/cuszp/cuSZp/include/cuSZp_timer.h +++ b/qtensor/compression/cuszp/cuSZp/include/cuSZp_timer.h @@ -1,31 +1,31 @@ -#ifndef CUSZP_INCLUDE_CUSZP_TIMER_H -#define CUSZP_INCLUDE_CUSZP_TIMER_H - -#include -#include - -struct PrivateTimingGPU { - cudaEvent_t start; - cudaEvent_t stop; -}; - -class TimingGPU -{ - private: - PrivateTimingGPU *privateTimingGPU; - - public: - - TimingGPU(); - - ~TimingGPU(); - - void StartCounter(); - - void StartCounterFlags(); - - float GetCounter(); - -}; - +#ifndef CUSZP_INCLUDE_CUSZP_TIMER_H +#define CUSZP_INCLUDE_CUSZP_TIMER_H + +#include +#include + +struct PrivateTimingGPU { + cudaEvent_t start; + cudaEvent_t stop; +}; + +class TimingGPU +{ + private: + PrivateTimingGPU *privateTimingGPU; + + public: + + TimingGPU(); + + ~TimingGPU(); + + void StartCounter(); + + void StartCounterFlags(); + + float GetCounter(); + +}; + #endif // CUSZP_INCLUDE_CUSZP_TIMER_H \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/include/cuSZp_utility.h b/qtensor/compression/cuszp/cuSZp/include/cuSZp_utility.h index 1e29f134..ae9b3b60 100644 --- a/qtensor/compression/cuszp/cuSZp/include/cuSZp_utility.h +++ b/qtensor/compression/cuszp/cuSZp/include/cuSZp_utility.h @@ -1,18 +1,18 @@ -#ifndef CUSZP_INCLUDE_CUSZP_UTILITY_H -#define CUSZP_INCLUDE_CUSZP_UTILITY_H - -void symTransForm_4Bytes(unsigned char data[4]); -void symTransform_8bytes(unsigned char data[8]); -unsigned char *readByteData_Yafan(char *srcFilePath, size_t *byteLength, int *status); -float *readFloatData_systemEndian_Yafan(char *srcFilePath, size_t *nbEle, int *status); -float *readFloatData_Yafan(char *srcFilePath, size_t *nbEle, int *status); -double *readDoubleData_systemEndian_Yafan(char *srcFilePath, size_t *nbEle, int *status); -double *readDoubleData_Yafan(char *srcFilePath, size_t *nbEle, int *status); -void writeByteData_Yafan(unsigned char *bytes, size_t byteLength, char *tgtFilePath, int *status); -void writeFloatData_inBytes_Yafan(float *data, size_t nbEle, char* tgtFilePath, int *status); -void writeDoubleData_inBytes_Yafan(double *data, size_t nbEle, char* tgtFilePath, int *status); -double SSIM_3d_calcWindow_float(float* data, float* other, size_t size1, size_t size0, int offset0, int offset1, int offset2, int windowSize0, int windowSize1, int windowSize2); -double computeSSIM(float* oriData, float* decData, size_t size2, size_t size1, size_t size0); -double *computePSNR(size_t nbEle, float *ori_data, float *data); - +#ifndef CUSZP_INCLUDE_CUSZP_UTILITY_H +#define CUSZP_INCLUDE_CUSZP_UTILITY_H + +void symTransForm_4Bytes(unsigned char data[4]); +void symTransform_8bytes(unsigned char data[8]); +unsigned char 
*readByteData_Yafan(char *srcFilePath, size_t *byteLength, int *status); +float *readFloatData_systemEndian_Yafan(char *srcFilePath, size_t *nbEle, int *status); +float *readFloatData_Yafan(char *srcFilePath, size_t *nbEle, int *status); +double *readDoubleData_systemEndian_Yafan(char *srcFilePath, size_t *nbEle, int *status); +double *readDoubleData_Yafan(char *srcFilePath, size_t *nbEle, int *status); +void writeByteData_Yafan(unsigned char *bytes, size_t byteLength, char *tgtFilePath, int *status); +void writeFloatData_inBytes_Yafan(float *data, size_t nbEle, char* tgtFilePath, int *status); +void writeDoubleData_inBytes_Yafan(double *data, size_t nbEle, char* tgtFilePath, int *status); +double SSIM_3d_calcWindow_float(float* data, float* other, size_t size1, size_t size0, int offset0, int offset1, int offset2, int windowSize0, int windowSize1, int windowSize2); +double computeSSIM(float* oriData, float* decData, size_t size2, size_t size1, size_t size0); +double *computePSNR(size_t nbEle, float *ori_data, float *data); + #endif // CUSZP_INCLUDE_CUSZP_UTILITY_H \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/src/cuSZp_entry_f32.cu b/qtensor/compression/cuszp/cuSZp/src/cuSZp_entry_f32.cu index 6a29939d..59749099 100644 --- a/qtensor/compression/cuszp/cuSZp/src/cuSZp_entry_f32.cu +++ b/qtensor/compression/cuszp/cuSZp/src/cuSZp_entry_f32.cu @@ -1,149 +1,149 @@ -#include "cuSZp_entry_f32.h" -#include "cuSZp_f32.h" -#include - -void SZp_compress_hostptr_f32(float* oriData, unsigned char* cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound) -{ - // Data blocking. - int bsize = cmp_tblock_size_f32; - int gsize = (nbEle + bsize * cmp_chunk_f32 - 1) / (bsize * cmp_chunk_f32); - int cmpOffSize = gsize + 1; - int pad_nbEle = gsize * bsize * cmp_chunk_f32; - - // Initializing global memory for GPU compression. - float* d_oriData; - unsigned char* d_cmpData; - unsigned int* d_cmpOffset; - int* d_flag; - cudaMalloc((void**)&d_oriData, sizeof(float)*pad_nbEle); - cudaMemcpy(d_oriData, oriData, sizeof(float)*pad_nbEle, cudaMemcpyHostToDevice); - cudaMalloc((void**)&d_cmpData, sizeof(float)*pad_nbEle); - cudaMallocManaged((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); - cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); - cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); - cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); - cudaMemset(d_oriData + nbEle, 0, (pad_nbEle - nbEle) * sizeof(float)); - - // Initializing CUDA Stream. - cudaStream_t stream; - cudaStreamCreate(&stream); - - // cuSZp GPU compression. - dim3 blockSize(bsize); - dim3 gridSize(gsize); - SZp_compress_kernel_f32<<>>(d_oriData, d_cmpData, d_cmpOffset, d_flag, errorBound, nbEle); - cudaDeviceSynchronize(); - - // Obtain compression ratio and move data back to CPU. - *cmpSize = (size_t)d_cmpOffset[cmpOffSize-1] + (nbEle+31)/32; - cudaMemcpy(cmpBytes, d_cmpData, *cmpSize*sizeof(unsigned char), cudaMemcpyDeviceToHost); - - // Free memory that is used. - cudaFree(d_oriData); - cudaFree(d_cmpData); - cudaFree(d_cmpOffset); - cudaFree(d_flag); - cudaStreamDestroy(stream); -} - - -void SZp_decompress_hostptr_f32(float* decData, unsigned char* cmpBytes, size_t nbEle, size_t cmpSize, float errorBound) -{ - // Data blocking. - int bsize = dec_tblock_size_f32; - int gsize = (nbEle + bsize * dec_chunk_f32 - 1) / (bsize * dec_chunk_f32); - int cmpOffSize = gsize + 1; - int pad_nbEle = gsize * bsize * dec_chunk_f32; - - // Initializing global memory for GPU compression. 
- float* d_decData; - unsigned char* d_cmpData; - unsigned int* d_cmpOffset; - int* d_flag; - cudaMalloc((void**)&d_decData, sizeof(float)*pad_nbEle); - cudaMemset(d_decData, 0, sizeof(float)*pad_nbEle); - cudaMalloc((void**)&d_cmpData, sizeof(float)*pad_nbEle); - cudaMemcpy(d_cmpData, cmpBytes, sizeof(unsigned char)*cmpSize, cudaMemcpyHostToDevice); - cudaMalloc((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); - cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); - cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); - cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); - - // Initializing CUDA Stream. - cudaStream_t stream; - cudaStreamCreate(&stream); - - // cuSZp GPU compression. - dim3 blockSize(bsize); - dim3 gridSize(gsize); - SZp_decompress_kernel_f32<<>>(d_decData, d_cmpData, d_cmpOffset, d_flag, errorBound, nbEle); - cudaDeviceSynchronize(); - - // Move data back to CPU. - cudaMemcpy(decData, d_decData, sizeof(float)*nbEle, cudaMemcpyDeviceToHost); - - // Free memoy that is used. - cudaFree(d_decData); - cudaFree(d_cmpData); - cudaFree(d_cmpOffset); - cudaFree(d_flag); - cudaStreamDestroy(stream); -} - - -void SZp_compress_deviceptr_f32(float* d_oriData, unsigned char* d_cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound, cudaStream_t stream) -{ - int bsize = cmp_tblock_size_f32; - int gsize = (nbEle + bsize * cmp_chunk_f32 - 1) / (bsize * cmp_chunk_f32); - int cmpOffSize = gsize + 1; - int pad_nbEle = gsize * bsize * cmp_chunk_f32; - - // Initializing global memory for GPU compression. - unsigned int* d_cmpOffset; - int* d_flag; - cudaMallocManaged((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); - cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); - cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); - cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); - // cudaMemset(d_oriData + nbEle, 0, (pad_nbEle - nbEle) * sizeof(float)); - - // cuSZp GPU compression. - dim3 blockSize(bsize); - dim3 gridSize(gsize); - SZp_compress_kernel_f32<<>>(d_oriData, d_cmpBytes, d_cmpOffset, d_flag, errorBound, nbEle); - cudaDeviceSynchronize(); - - // Obtain compression ratio and move data back to CPU. - *cmpSize = (size_t)d_cmpOffset[cmpOffSize-1] + (nbEle+31)/32; - - // Free memory that is used. - cudaFree(d_cmpOffset); - cudaFree(d_flag); -} - - -void SZp_decompress_deviceptr_f32(float* d_decData, unsigned char* d_cmpBytes, size_t nbEle, size_t cmpSize, float errorBound, cudaStream_t stream) -{ - // Data blocking. - int bsize = dec_tblock_size_f32; - int gsize = (nbEle + bsize * dec_chunk_f32 - 1) / (bsize * dec_chunk_f32); - int cmpOffSize = gsize + 1; - - // Initializing global memory for GPU compression. - unsigned int* d_cmpOffset; - int* d_flag; - cudaMalloc((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); - cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); - cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); - cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); - - // cuSZp GPU compression. - dim3 blockSize(bsize); - dim3 gridSize(gsize); - SZp_decompress_kernel_f32<<>>(d_decData, d_cmpBytes, d_cmpOffset, d_flag, errorBound, nbEle); - cudaDeviceSynchronize(); - - // Free memoy that is used. - cudaFree(d_cmpOffset); - cudaFree(d_flag); +#include "cuSZp_entry_f32.h" +#include "cuSZp_f32.h" +#include + +void SZp_compress_hostptr_f32(float* oriData, unsigned char* cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound) +{ + // Data blocking. 
+ int bsize = cmp_tblock_size_f32; + int gsize = (nbEle + bsize * cmp_chunk_f32 - 1) / (bsize * cmp_chunk_f32); + int cmpOffSize = gsize + 1; + int pad_nbEle = gsize * bsize * cmp_chunk_f32; + + // Initializing global memory for GPU compression. + float* d_oriData; + unsigned char* d_cmpData; + unsigned int* d_cmpOffset; + int* d_flag; + cudaMalloc((void**)&d_oriData, sizeof(float)*pad_nbEle); + cudaMemcpy(d_oriData, oriData, sizeof(float)*pad_nbEle, cudaMemcpyHostToDevice); + cudaMalloc((void**)&d_cmpData, sizeof(float)*pad_nbEle); + cudaMallocManaged((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); + cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); + cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); + cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); + cudaMemset(d_oriData + nbEle, 0, (pad_nbEle - nbEle) * sizeof(float)); + + // Initializing CUDA Stream. + cudaStream_t stream; + cudaStreamCreate(&stream); + + // cuSZp GPU compression. + dim3 blockSize(bsize); + dim3 gridSize(gsize); + SZp_compress_kernel_f32<<>>(d_oriData, d_cmpData, d_cmpOffset, d_flag, errorBound, nbEle); + cudaDeviceSynchronize(); + + // Obtain compression ratio and move data back to CPU. + *cmpSize = (size_t)d_cmpOffset[cmpOffSize-1] + (nbEle+31)/32; + cudaMemcpy(cmpBytes, d_cmpData, *cmpSize*sizeof(unsigned char), cudaMemcpyDeviceToHost); + + // Free memory that is used. + cudaFree(d_oriData); + cudaFree(d_cmpData); + cudaFree(d_cmpOffset); + cudaFree(d_flag); + cudaStreamDestroy(stream); +} + + +void SZp_decompress_hostptr_f32(float* decData, unsigned char* cmpBytes, size_t nbEle, size_t cmpSize, float errorBound) +{ + // Data blocking. + int bsize = dec_tblock_size_f32; + int gsize = (nbEle + bsize * dec_chunk_f32 - 1) / (bsize * dec_chunk_f32); + int cmpOffSize = gsize + 1; + int pad_nbEle = gsize * bsize * dec_chunk_f32; + + // Initializing global memory for GPU compression. + float* d_decData; + unsigned char* d_cmpData; + unsigned int* d_cmpOffset; + int* d_flag; + cudaMalloc((void**)&d_decData, sizeof(float)*pad_nbEle); + cudaMemset(d_decData, 0, sizeof(float)*pad_nbEle); + cudaMalloc((void**)&d_cmpData, sizeof(float)*pad_nbEle); + cudaMemcpy(d_cmpData, cmpBytes, sizeof(unsigned char)*cmpSize, cudaMemcpyHostToDevice); + cudaMalloc((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); + cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); + cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); + cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); + + // Initializing CUDA Stream. + cudaStream_t stream; + cudaStreamCreate(&stream); + + // cuSZp GPU compression. + dim3 blockSize(bsize); + dim3 gridSize(gsize); + SZp_decompress_kernel_f32<<>>(d_decData, d_cmpData, d_cmpOffset, d_flag, errorBound, nbEle); + cudaDeviceSynchronize(); + + // Move data back to CPU. + cudaMemcpy(decData, d_decData, sizeof(float)*nbEle, cudaMemcpyDeviceToHost); + + // Free memoy that is used. + cudaFree(d_decData); + cudaFree(d_cmpData); + cudaFree(d_cmpOffset); + cudaFree(d_flag); + cudaStreamDestroy(stream); +} + + +void SZp_compress_deviceptr_f32(float* d_oriData, unsigned char* d_cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound, cudaStream_t stream) +{ + int bsize = cmp_tblock_size_f32; + int gsize = (nbEle + bsize * cmp_chunk_f32 - 1) / (bsize * cmp_chunk_f32); + int cmpOffSize = gsize + 1; + int pad_nbEle = gsize * bsize * cmp_chunk_f32; + + // Initializing global memory for GPU compression. 
+ unsigned int* d_cmpOffset; + int* d_flag; + cudaMallocManaged((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); + cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); + cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); + cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); + // cudaMemset(d_oriData + nbEle, 0, (pad_nbEle - nbEle) * sizeof(float)); + + // cuSZp GPU compression. + dim3 blockSize(bsize); + dim3 gridSize(gsize); + SZp_compress_kernel_f32<<>>(d_oriData, d_cmpBytes, d_cmpOffset, d_flag, errorBound, nbEle); + cudaDeviceSynchronize(); + + // Obtain compression ratio and move data back to CPU. + *cmpSize = (size_t)d_cmpOffset[cmpOffSize-1] + (nbEle+31)/32; + + // Free memory that is used. + cudaFree(d_cmpOffset); + cudaFree(d_flag); +} + + +void SZp_decompress_deviceptr_f32(float* d_decData, unsigned char* d_cmpBytes, size_t nbEle, size_t cmpSize, float errorBound, cudaStream_t stream) +{ + // Data blocking. + int bsize = dec_tblock_size_f32; + int gsize = (nbEle + bsize * dec_chunk_f32 - 1) / (bsize * dec_chunk_f32); + int cmpOffSize = gsize + 1; + + // Initializing global memory for GPU compression. + unsigned int* d_cmpOffset; + int* d_flag; + cudaMalloc((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); + cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); + cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); + cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); + + // cuSZp GPU compression. + dim3 blockSize(bsize); + dim3 gridSize(gsize); + SZp_decompress_kernel_f32<<>>(d_decData, d_cmpBytes, d_cmpOffset, d_flag, errorBound, nbEle); + cudaDeviceSynchronize(); + + // Free memoy that is used. + cudaFree(d_cmpOffset); + cudaFree(d_flag); } \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/src/cuSZp_entry_f64.cu b/qtensor/compression/cuszp/cuSZp/src/cuSZp_entry_f64.cu index 8bd1e76f..926406c2 100644 --- a/qtensor/compression/cuszp/cuSZp/src/cuSZp_entry_f64.cu +++ b/qtensor/compression/cuszp/cuSZp/src/cuSZp_entry_f64.cu @@ -1,149 +1,149 @@ -#include "cuSZp_entry_f64.h" -#include "cuSZp_f64.h" - -void SZp_compress_hostptr_f64(double* oriData, unsigned char* cmpBytes, size_t nbEle, size_t* cmpSize, double errorBound) -{ - // Data blocking. - int bsize = cmp_tblock_size_f64; - int gsize = (nbEle + bsize * cmp_chunk_f64 - 1) / (bsize * cmp_chunk_f64); - int cmpOffSize = gsize + 1; - int pad_nbEle = gsize * bsize * cmp_chunk_f64; - - // Initializing global memory for GPU compression. - double* d_oriData; - unsigned char* d_cmpData; - unsigned int* d_cmpOffset; - int* d_flag; - cudaMalloc((void**)&d_oriData, sizeof(double)*pad_nbEle); - cudaMemcpy(d_oriData, oriData, sizeof(double)*pad_nbEle, cudaMemcpyHostToDevice); - cudaMalloc((void**)&d_cmpData, sizeof(double)*pad_nbEle); - cudaMallocManaged((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); - cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); - cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); - cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); - cudaMemset(d_oriData + nbEle, 0, (pad_nbEle - nbEle) * sizeof(double)); - - // Initializing CUDA Stream. - cudaStream_t stream; - cudaStreamCreate(&stream); - - // cuSZp GPU compression. - dim3 blockSize(bsize); - dim3 gridSize(gsize); - SZp_compress_kernel_f64<<>>(d_oriData, d_cmpData, d_cmpOffset, d_flag, errorBound, nbEle); - cudaDeviceSynchronize(); - - // Obtain compression ratio and move data back to CPU. 
- *cmpSize = (size_t)d_cmpOffset[cmpOffSize-1] + (nbEle+31)/32; - cudaMemcpy(cmpBytes, d_cmpData, *cmpSize*sizeof(unsigned char), cudaMemcpyDeviceToHost); - - // Free memory that is used. - cudaFree(d_oriData); - cudaFree(d_cmpData); - cudaFree(d_cmpOffset); - cudaFree(d_flag); - cudaStreamDestroy(stream); -} - - -void SZp_decompress_hostptr_f64(double* decData, unsigned char* cmpBytes, size_t nbEle, size_t cmpSize, double errorBound) -{ - // Data blocking. - int bsize = dec_tblock_size_f64; - int gsize = (nbEle + bsize * dec_chunk_f64 - 1) / (bsize * dec_chunk_f64); - int cmpOffSize = gsize + 1; - int pad_nbEle = gsize * bsize * dec_chunk_f64; - - // Initializing global memory for GPU compression. - double* d_decData; - unsigned char* d_cmpData; - unsigned int* d_cmpOffset; - int* d_flag; - cudaMalloc((void**)&d_decData, sizeof(double)*pad_nbEle); - cudaMemset(d_decData, 0, sizeof(double)*pad_nbEle); - cudaMalloc((void**)&d_cmpData, sizeof(double)*pad_nbEle); - cudaMemcpy(d_cmpData, cmpBytes, sizeof(unsigned char)*cmpSize, cudaMemcpyHostToDevice); - cudaMalloc((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); - cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); - cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); - cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); - - // Initializing CUDA Stream. - cudaStream_t stream; - cudaStreamCreate(&stream); - - // cuSZp GPU compression. - dim3 blockSize(bsize); - dim3 gridSize(gsize); - SZp_decompress_kernel_f64<<>>(d_decData, d_cmpData, d_cmpOffset, d_flag, errorBound, nbEle); - cudaDeviceSynchronize(); - - // Move data back to CPU. - cudaMemcpy(decData, d_decData, sizeof(double)*nbEle, cudaMemcpyDeviceToHost); - - // Free memoy that is used. - cudaFree(d_decData); - cudaFree(d_cmpData); - cudaFree(d_cmpOffset); - cudaFree(d_flag); - cudaStreamDestroy(stream); -} - - -void SZp_compress_deviceptr_f64(double* d_oriData, unsigned char* d_cmpBytes, size_t nbEle, size_t* cmpSize, double errorBound, cudaStream_t stream) -{ - // Data blocking. - int bsize = cmp_tblock_size_f64; - int gsize = (nbEle + bsize * cmp_chunk_f64 - 1) / (bsize * cmp_chunk_f64); - int cmpOffSize = gsize + 1; - int pad_nbEle = gsize * bsize * cmp_chunk_f64; - - // Initializing global memory for GPU compression. - unsigned int* d_cmpOffset; - int* d_flag; - cudaMallocManaged((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); - cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); - cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); - cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); - cudaMemset(d_oriData + nbEle, 0, (pad_nbEle - nbEle) * sizeof(double)); - - // cuSZp GPU compression. - dim3 blockSize(bsize); - dim3 gridSize(gsize); - SZp_compress_kernel_f64<<>>(d_oriData, d_cmpBytes, d_cmpOffset, d_flag, errorBound, nbEle); - cudaDeviceSynchronize(); - - // Obtain compression ratio and move data back to CPU. - *cmpSize = (size_t)d_cmpOffset[cmpOffSize-1] + (nbEle+31)/32; - - // Free memory that is used. - cudaFree(d_cmpOffset); - cudaFree(d_flag); -} - - -void SZp_decompress_deviceptr_f64(double* d_decData, unsigned char* d_cmpBytes, size_t nbEle, size_t cmpSize, double errorBound, cudaStream_t stream) -{ - // Data blocking. - int bsize = dec_tblock_size_f64; - int gsize = (nbEle + bsize * dec_chunk_f64 - 1) / (bsize * dec_chunk_f64); - int cmpOffSize = gsize + 1; - - // Initializing global memory for GPU compression. 
- unsigned int* d_cmpOffset; - int* d_flag; - cudaMalloc((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); - cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); - cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); - cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); - - // cuSZp GPU compression. - dim3 blockSize(bsize); - dim3 gridSize(gsize); - SZp_decompress_kernel_f64<<>>(d_decData, d_cmpBytes, d_cmpOffset, d_flag, errorBound, nbEle); - cudaDeviceSynchronize(); - - // Free memoy that is used. - cudaFree(d_cmpOffset); - cudaFree(d_flag); +#include "cuSZp_entry_f64.h" +#include "cuSZp_f64.h" + +void SZp_compress_hostptr_f64(double* oriData, unsigned char* cmpBytes, size_t nbEle, size_t* cmpSize, double errorBound) +{ + // Data blocking. + int bsize = cmp_tblock_size_f64; + int gsize = (nbEle + bsize * cmp_chunk_f64 - 1) / (bsize * cmp_chunk_f64); + int cmpOffSize = gsize + 1; + int pad_nbEle = gsize * bsize * cmp_chunk_f64; + + // Initializing global memory for GPU compression. + double* d_oriData; + unsigned char* d_cmpData; + unsigned int* d_cmpOffset; + int* d_flag; + cudaMalloc((void**)&d_oriData, sizeof(double)*pad_nbEle); + cudaMemcpy(d_oriData, oriData, sizeof(double)*pad_nbEle, cudaMemcpyHostToDevice); + cudaMalloc((void**)&d_cmpData, sizeof(double)*pad_nbEle); + cudaMallocManaged((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); + cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); + cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); + cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); + cudaMemset(d_oriData + nbEle, 0, (pad_nbEle - nbEle) * sizeof(double)); + + // Initializing CUDA Stream. + cudaStream_t stream; + cudaStreamCreate(&stream); + + // cuSZp GPU compression. + dim3 blockSize(bsize); + dim3 gridSize(gsize); + SZp_compress_kernel_f64<<>>(d_oriData, d_cmpData, d_cmpOffset, d_flag, errorBound, nbEle); + cudaDeviceSynchronize(); + + // Obtain compression ratio and move data back to CPU. + *cmpSize = (size_t)d_cmpOffset[cmpOffSize-1] + (nbEle+31)/32; + cudaMemcpy(cmpBytes, d_cmpData, *cmpSize*sizeof(unsigned char), cudaMemcpyDeviceToHost); + + // Free memory that is used. + cudaFree(d_oriData); + cudaFree(d_cmpData); + cudaFree(d_cmpOffset); + cudaFree(d_flag); + cudaStreamDestroy(stream); +} + + +void SZp_decompress_hostptr_f64(double* decData, unsigned char* cmpBytes, size_t nbEle, size_t cmpSize, double errorBound) +{ + // Data blocking. + int bsize = dec_tblock_size_f64; + int gsize = (nbEle + bsize * dec_chunk_f64 - 1) / (bsize * dec_chunk_f64); + int cmpOffSize = gsize + 1; + int pad_nbEle = gsize * bsize * dec_chunk_f64; + + // Initializing global memory for GPU compression. + double* d_decData; + unsigned char* d_cmpData; + unsigned int* d_cmpOffset; + int* d_flag; + cudaMalloc((void**)&d_decData, sizeof(double)*pad_nbEle); + cudaMemset(d_decData, 0, sizeof(double)*pad_nbEle); + cudaMalloc((void**)&d_cmpData, sizeof(double)*pad_nbEle); + cudaMemcpy(d_cmpData, cmpBytes, sizeof(unsigned char)*cmpSize, cudaMemcpyHostToDevice); + cudaMalloc((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); + cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); + cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); + cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); + + // Initializing CUDA Stream. + cudaStream_t stream; + cudaStreamCreate(&stream); + + // cuSZp GPU compression. 
+ dim3 blockSize(bsize); + dim3 gridSize(gsize); + SZp_decompress_kernel_f64<<>>(d_decData, d_cmpData, d_cmpOffset, d_flag, errorBound, nbEle); + cudaDeviceSynchronize(); + + // Move data back to CPU. + cudaMemcpy(decData, d_decData, sizeof(double)*nbEle, cudaMemcpyDeviceToHost); + + // Free memoy that is used. + cudaFree(d_decData); + cudaFree(d_cmpData); + cudaFree(d_cmpOffset); + cudaFree(d_flag); + cudaStreamDestroy(stream); +} + + +void SZp_compress_deviceptr_f64(double* d_oriData, unsigned char* d_cmpBytes, size_t nbEle, size_t* cmpSize, double errorBound, cudaStream_t stream) +{ + // Data blocking. + int bsize = cmp_tblock_size_f64; + int gsize = (nbEle + bsize * cmp_chunk_f64 - 1) / (bsize * cmp_chunk_f64); + int cmpOffSize = gsize + 1; + int pad_nbEle = gsize * bsize * cmp_chunk_f64; + + // Initializing global memory for GPU compression. + unsigned int* d_cmpOffset; + int* d_flag; + cudaMallocManaged((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); + cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); + cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); + cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); + cudaMemset(d_oriData + nbEle, 0, (pad_nbEle - nbEle) * sizeof(double)); + + // cuSZp GPU compression. + dim3 blockSize(bsize); + dim3 gridSize(gsize); + SZp_compress_kernel_f64<<>>(d_oriData, d_cmpBytes, d_cmpOffset, d_flag, errorBound, nbEle); + cudaDeviceSynchronize(); + + // Obtain compression ratio and move data back to CPU. + *cmpSize = (size_t)d_cmpOffset[cmpOffSize-1] + (nbEle+31)/32; + + // Free memory that is used. + cudaFree(d_cmpOffset); + cudaFree(d_flag); +} + + +void SZp_decompress_deviceptr_f64(double* d_decData, unsigned char* d_cmpBytes, size_t nbEle, size_t cmpSize, double errorBound, cudaStream_t stream) +{ + // Data blocking. + int bsize = dec_tblock_size_f64; + int gsize = (nbEle + bsize * dec_chunk_f64 - 1) / (bsize * dec_chunk_f64); + int cmpOffSize = gsize + 1; + + // Initializing global memory for GPU compression. + unsigned int* d_cmpOffset; + int* d_flag; + cudaMalloc((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); + cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); + cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); + cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); + + // cuSZp GPU compression. + dim3 blockSize(bsize); + dim3 gridSize(gsize); + SZp_decompress_kernel_f64<<>>(d_decData, d_cmpBytes, d_cmpOffset, d_flag, errorBound, nbEle); + cudaDeviceSynchronize(); + + // Free memoy that is used. 
+ cudaFree(d_cmpOffset); + cudaFree(d_flag); } \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/src/cuSZp_f32.cu b/qtensor/compression/cuszp/cuSZp/src/cuSZp_f32.cu index 90c2c45d..1f18bfc0 100644 --- a/qtensor/compression/cuszp/cuSZp/src/cuSZp_f32.cu +++ b/qtensor/compression/cuszp/cuSZp/src/cuSZp_f32.cu @@ -1,335 +1,335 @@ -#include "cuSZp_f32.h" - -__device__ inline int quantization_f32(float data, float recipPrecision) -{ - float dataRecip = data*recipPrecision; - int s = dataRecip>=-0.5f?0:1; - return (int)(dataRecip+0.5f) - s; -} - - -__device__ inline int get_bit_num(unsigned int x) -{ - return (sizeof(unsigned int)*8) - __clz(x); -} - - -__global__ void SZp_compress_kernel_f32(const float* const __restrict__ oriData, unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const float eb, const size_t nbEle) -{ - __shared__ unsigned int base_idx; - - const int tid = threadIdx.x; - const int idx = blockIdx.x * blockDim.x + tid; - const int lane = idx & 31; - const int warp = idx >> 5; - const int block_num = cmp_chunk_f32/32; - const int start_idx = idx * cmp_chunk_f32; - const int start_block_idx = start_idx/32; - const int rate_ofs = (nbEle+31)/32; - const float recipPrecision = 0.5f/eb; - - int temp_start_idx, temp_end_idx; - int quant_chunk_idx; - int block_idx; - int currQuant, lorenQuant, prevQuant, maxQuant; - int absQuant[cmp_chunk_f32]; - unsigned int sign_flag[block_num]; - int sign_ofs; - int fixed_rate[block_num]; - unsigned int thread_ofs = 0; - - for(int j=0; j nbEle ? 0 : quantization_f32(oriData[i], recipPrecision); - lorenQuant = currQuant - prevQuant; - prevQuant = currQuant; - sign_ofs = i % 32; - sign_flag[j] |= (lorenQuant < 0) << (31 - sign_ofs); - absQuant[quant_chunk_idx] = abs(lorenQuant); - maxQuant = maxQuant > absQuant[quant_chunk_idx] ? maxQuant : absQuant[quant_chunk_idx]; - } - - fixed_rate[j] = get_bit_num(maxQuant); - thread_ofs += (fixed_rate[j]) ? 
(32+fixed_rate[j]*32) : 0; - if(block_idx= i) thread_ofs += tmp; - } - __syncthreads(); - - if(lane==31) - { - cmpOffset[warp+1] = (thread_ofs+7)/8; - __threadfence(); - if(warp==0) - { - flag[1] = 2; - __threadfence(); - } - else - { - flag[warp+1] = 1; - __threadfence(); - } - } - __syncthreads(); - - if(warp>0) - { - if(!lane) - { - int temp_flag = 1; - while(temp_flag!=2) temp_flag = flag[warp]; - __threadfence(); - cmpOffset[warp] += cmpOffset[warp-1]; - if(warp==gridDim.x-1) cmpOffset[warp+1] += cmpOffset[warp]; - __threadfence(); - flag[warp+1] = 2; - } - - } - __syncthreads(); - - if(!lane) base_idx = cmpOffset[warp] + rate_ofs; - __syncthreads(); - - unsigned int prev_thread = __shfl_up_sync(0xffffffff, thread_ofs, 1); - unsigned int cmp_byte_ofs; - if(!lane) cmp_byte_ofs = base_idx; - else cmp_byte_ofs = base_idx + prev_thread / 8; - - for(int j=0; j> 24); - cmpData[cmp_byte_ofs++] = 0xff & (sign_flag[j] >> 16); - cmpData[cmp_byte_ofs++] = 0xff & (sign_flag[j] >> 8); - cmpData[cmp_byte_ofs++] = 0xff & sign_flag[j]; - - unsigned char tmp_char0, tmp_char1, tmp_char2, tmp_char3; - int mask = 1; - for(int i=0; i> i) << 7) | - (((absQuant[chunk_idx_start+1] & mask) >> i) << 6) | - (((absQuant[chunk_idx_start+2] & mask) >> i) << 5) | - (((absQuant[chunk_idx_start+3] & mask) >> i) << 4) | - (((absQuant[chunk_idx_start+4] & mask) >> i) << 3) | - (((absQuant[chunk_idx_start+5] & mask) >> i) << 2) | - (((absQuant[chunk_idx_start+6] & mask) >> i) << 1) | - (((absQuant[chunk_idx_start+7] & mask) >> i) << 0); - - tmp_char1 = (((absQuant[chunk_idx_start+8] & mask) >> i) << 7) | - (((absQuant[chunk_idx_start+9] & mask) >> i) << 6) | - (((absQuant[chunk_idx_start+10] & mask) >> i) << 5) | - (((absQuant[chunk_idx_start+11] & mask) >> i) << 4) | - (((absQuant[chunk_idx_start+12] & mask) >> i) << 3) | - (((absQuant[chunk_idx_start+13] & mask) >> i) << 2) | - (((absQuant[chunk_idx_start+14] & mask) >> i) << 1) | - (((absQuant[chunk_idx_start+15] & mask) >> i) << 0); - - tmp_char2 = (((absQuant[chunk_idx_start+16] & mask) >> i) << 7) | - (((absQuant[chunk_idx_start+17] & mask) >> i) << 6) | - (((absQuant[chunk_idx_start+18] & mask) >> i) << 5) | - (((absQuant[chunk_idx_start+19] & mask) >> i) << 4) | - (((absQuant[chunk_idx_start+20] & mask) >> i) << 3) | - (((absQuant[chunk_idx_start+21] & mask) >> i) << 2) | - (((absQuant[chunk_idx_start+22] & mask) >> i) << 1) | - (((absQuant[chunk_idx_start+23] & mask) >> i) << 0); - - tmp_char3 = (((absQuant[chunk_idx_start+24] & mask) >> i) << 7) | - (((absQuant[chunk_idx_start+25] & mask) >> i) << 6) | - (((absQuant[chunk_idx_start+26] & mask) >> i) << 5) | - (((absQuant[chunk_idx_start+27] & mask) >> i) << 4) | - (((absQuant[chunk_idx_start+28] & mask) >> i) << 3) | - (((absQuant[chunk_idx_start+29] & mask) >> i) << 2) | - (((absQuant[chunk_idx_start+30] & mask) >> i) << 1) | - (((absQuant[chunk_idx_start+31] & mask) >> i) << 0); - - // Move data to global memory. 
- cmpData[cmp_byte_ofs++] = tmp_char0; - cmpData[cmp_byte_ofs++] = tmp_char1; - cmpData[cmp_byte_ofs++] = tmp_char2; - cmpData[cmp_byte_ofs++] = tmp_char3; - mask <<= 1; - } - } - } -} - - -__global__ void SZp_decompress_kernel_f32(float* const __restrict__ decData, const unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const float eb, const size_t nbEle) -{ - __shared__ unsigned int base_idx; - - const int tid = threadIdx.x; - const int idx = blockIdx.x * blockDim.x + tid; - const int lane = idx & 31; - const int warp = idx >> 5; - const int block_num = dec_chunk_f32/32; - const int start_idx = idx * dec_chunk_f32; - const int start_block_idx = start_idx/32; - const int rate_ofs = (nbEle+31)/32; - - int temp_start_idx; - int block_idx; - int absQuant[32]; - int currQuant, lorenQuant, prevQuant; - int sign_ofs; - int fixed_rate[block_num]; - unsigned int thread_ofs = 0; - - for(int j=0; j= i) thread_ofs += tmp; - } - __syncthreads(); - - if(lane==31) - { - cmpOffset[warp+1] = (thread_ofs+7)/8; - __threadfence(); - if(warp==0) - { - flag[1] = 2; - __threadfence(); - } - else - { - flag[warp+1] = 1; - __threadfence(); - } - } - __syncthreads(); - - if(warp>0) - { - if(!lane) - { - int temp_flag = 1; - while(temp_flag!=2) temp_flag = flag[warp]; - __threadfence(); - cmpOffset[warp] += cmpOffset[warp-1]; - __threadfence(); - flag[warp+1] = 2; - } - } - __syncthreads(); - - if(!lane) base_idx = cmpOffset[warp] + rate_ofs; - __syncthreads(); - - unsigned int prev_thread = __shfl_up_sync(0xffffffff, thread_ofs, 1); - unsigned int cmp_byte_ofs; - if(!lane) cmp_byte_ofs = base_idx; - else cmp_byte_ofs = base_idx + prev_thread / 8; - - for(int j=0; j> 7) & 0x00000001) << i; - absQuant[1] |= ((tmp_char0 >> 6) & 0x00000001) << i; - absQuant[2] |= ((tmp_char0 >> 5) & 0x00000001) << i; - absQuant[3] |= ((tmp_char0 >> 4) & 0x00000001) << i; - absQuant[4] |= ((tmp_char0 >> 3) & 0x00000001) << i; - absQuant[5] |= ((tmp_char0 >> 2) & 0x00000001) << i; - absQuant[6] |= ((tmp_char0 >> 1) & 0x00000001) << i; - absQuant[7] |= ((tmp_char0 >> 0) & 0x00000001) << i; - - absQuant[8] |= ((tmp_char1 >> 7) & 0x00000001) << i; - absQuant[9] |= ((tmp_char1 >> 6) & 0x00000001) << i; - absQuant[10] |= ((tmp_char1 >> 5) & 0x00000001) << i; - absQuant[11] |= ((tmp_char1 >> 4) & 0x00000001) << i; - absQuant[12] |= ((tmp_char1 >> 3) & 0x00000001) << i; - absQuant[13] |= ((tmp_char1 >> 2) & 0x00000001) << i; - absQuant[14] |= ((tmp_char1 >> 1) & 0x00000001) << i; - absQuant[15] |= ((tmp_char1 >> 0) & 0x00000001) << i; - - absQuant[16] |= ((tmp_char2 >> 7) & 0x00000001) << i; - absQuant[17] |= ((tmp_char2 >> 6) & 0x00000001) << i; - absQuant[18] |= ((tmp_char2 >> 5) & 0x00000001) << i; - absQuant[19] |= ((tmp_char2 >> 4) & 0x00000001) << i; - absQuant[20] |= ((tmp_char2 >> 3) & 0x00000001) << i; - absQuant[21] |= ((tmp_char2 >> 2) & 0x00000001) << i; - absQuant[22] |= ((tmp_char2 >> 1) & 0x00000001) << i; - absQuant[23] |= ((tmp_char2 >> 0) & 0x00000001) << i; - - absQuant[24] |= ((tmp_char3 >> 7) & 0x00000001) << i; - absQuant[25] |= ((tmp_char3 >> 6) & 0x00000001) << i; - absQuant[26] |= ((tmp_char3 >> 5) & 0x00000001) << i; - absQuant[27] |= ((tmp_char3 >> 4) & 0x00000001) << i; - absQuant[28] |= ((tmp_char3 >> 3) & 0x00000001) << i; - absQuant[29] |= ((tmp_char3 >> 2) & 0x00000001) << i; - absQuant[30] |= ((tmp_char3 >> 1) & 0x00000001) << i; - absQuant[31] |= ((tmp_char3 >> 0) & 0x00000001) << i; - } - prevQuant = 0; - for(int i=0; i<32; 
i++) - { - sign_ofs = i % 32; - if(sign_flag & (1 << (31 - sign_ofs))) - lorenQuant = absQuant[i] * -1; - else - lorenQuant = absQuant[i]; - currQuant = lorenQuant + prevQuant; - if(temp_start_idx+i < nbEle){ - decData[temp_start_idx+i] = currQuant * eb * 2; - } - prevQuant = currQuant; - } - } - } +#include "cuSZp_f32.h" + +__device__ inline int quantization_f32(float data, float recipPrecision) +{ + float dataRecip = data*recipPrecision; + int s = dataRecip>=-0.5f?0:1; + return (int)(dataRecip+0.5f) - s; +} + + +__device__ inline int get_bit_num(unsigned int x) +{ + return (sizeof(unsigned int)*8) - __clz(x); +} + + +__global__ void SZp_compress_kernel_f32(const float* const __restrict__ oriData, unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const float eb, const size_t nbEle) +{ + __shared__ unsigned int base_idx; + + const int tid = threadIdx.x; + const int idx = blockIdx.x * blockDim.x + tid; + const int lane = idx & 31; + const int warp = idx >> 5; + const int block_num = cmp_chunk_f32/32; + const int start_idx = idx * cmp_chunk_f32; + const int start_block_idx = start_idx/32; + const int rate_ofs = (nbEle+31)/32; + const float recipPrecision = 0.5f/eb; + + int temp_start_idx, temp_end_idx; + int quant_chunk_idx; + int block_idx; + int currQuant, lorenQuant, prevQuant, maxQuant; + int absQuant[cmp_chunk_f32]; + unsigned int sign_flag[block_num]; + int sign_ofs; + int fixed_rate[block_num]; + unsigned int thread_ofs = 0; + + for(int j=0; j nbEle ? 0 : quantization_f32(oriData[i], recipPrecision); + lorenQuant = currQuant - prevQuant; + prevQuant = currQuant; + sign_ofs = i % 32; + sign_flag[j] |= (lorenQuant < 0) << (31 - sign_ofs); + absQuant[quant_chunk_idx] = abs(lorenQuant); + maxQuant = maxQuant > absQuant[quant_chunk_idx] ? maxQuant : absQuant[quant_chunk_idx]; + } + + fixed_rate[j] = get_bit_num(maxQuant); + thread_ofs += (fixed_rate[j]) ? 
(32+fixed_rate[j]*32) : 0; + if(block_idx= i) thread_ofs += tmp; + } + __syncthreads(); + + if(lane==31) + { + cmpOffset[warp+1] = (thread_ofs+7)/8; + __threadfence(); + if(warp==0) + { + flag[1] = 2; + __threadfence(); + } + else + { + flag[warp+1] = 1; + __threadfence(); + } + } + __syncthreads(); + + if(warp>0) + { + if(!lane) + { + int temp_flag = 1; + while(temp_flag!=2) temp_flag = flag[warp]; + __threadfence(); + cmpOffset[warp] += cmpOffset[warp-1]; + if(warp==gridDim.x-1) cmpOffset[warp+1] += cmpOffset[warp]; + __threadfence(); + flag[warp+1] = 2; + } + + } + __syncthreads(); + + if(!lane) base_idx = cmpOffset[warp] + rate_ofs; + __syncthreads(); + + unsigned int prev_thread = __shfl_up_sync(0xffffffff, thread_ofs, 1); + unsigned int cmp_byte_ofs; + if(!lane) cmp_byte_ofs = base_idx; + else cmp_byte_ofs = base_idx + prev_thread / 8; + + for(int j=0; j> 24); + cmpData[cmp_byte_ofs++] = 0xff & (sign_flag[j] >> 16); + cmpData[cmp_byte_ofs++] = 0xff & (sign_flag[j] >> 8); + cmpData[cmp_byte_ofs++] = 0xff & sign_flag[j]; + + unsigned char tmp_char0, tmp_char1, tmp_char2, tmp_char3; + int mask = 1; + for(int i=0; i> i) << 7) | + (((absQuant[chunk_idx_start+1] & mask) >> i) << 6) | + (((absQuant[chunk_idx_start+2] & mask) >> i) << 5) | + (((absQuant[chunk_idx_start+3] & mask) >> i) << 4) | + (((absQuant[chunk_idx_start+4] & mask) >> i) << 3) | + (((absQuant[chunk_idx_start+5] & mask) >> i) << 2) | + (((absQuant[chunk_idx_start+6] & mask) >> i) << 1) | + (((absQuant[chunk_idx_start+7] & mask) >> i) << 0); + + tmp_char1 = (((absQuant[chunk_idx_start+8] & mask) >> i) << 7) | + (((absQuant[chunk_idx_start+9] & mask) >> i) << 6) | + (((absQuant[chunk_idx_start+10] & mask) >> i) << 5) | + (((absQuant[chunk_idx_start+11] & mask) >> i) << 4) | + (((absQuant[chunk_idx_start+12] & mask) >> i) << 3) | + (((absQuant[chunk_idx_start+13] & mask) >> i) << 2) | + (((absQuant[chunk_idx_start+14] & mask) >> i) << 1) | + (((absQuant[chunk_idx_start+15] & mask) >> i) << 0); + + tmp_char2 = (((absQuant[chunk_idx_start+16] & mask) >> i) << 7) | + (((absQuant[chunk_idx_start+17] & mask) >> i) << 6) | + (((absQuant[chunk_idx_start+18] & mask) >> i) << 5) | + (((absQuant[chunk_idx_start+19] & mask) >> i) << 4) | + (((absQuant[chunk_idx_start+20] & mask) >> i) << 3) | + (((absQuant[chunk_idx_start+21] & mask) >> i) << 2) | + (((absQuant[chunk_idx_start+22] & mask) >> i) << 1) | + (((absQuant[chunk_idx_start+23] & mask) >> i) << 0); + + tmp_char3 = (((absQuant[chunk_idx_start+24] & mask) >> i) << 7) | + (((absQuant[chunk_idx_start+25] & mask) >> i) << 6) | + (((absQuant[chunk_idx_start+26] & mask) >> i) << 5) | + (((absQuant[chunk_idx_start+27] & mask) >> i) << 4) | + (((absQuant[chunk_idx_start+28] & mask) >> i) << 3) | + (((absQuant[chunk_idx_start+29] & mask) >> i) << 2) | + (((absQuant[chunk_idx_start+30] & mask) >> i) << 1) | + (((absQuant[chunk_idx_start+31] & mask) >> i) << 0); + + // Move data to global memory. 
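// Editor's note (illustrative sketch, not the cuSZp API): the decompression
// kernel below reverses the packing shown above -- it rebuilds the magnitudes
// plane by plane, reapplies the signs from the bitmap, undoes the delta
// (Lorenzo) prediction with a running sum, and scales by 2*eb. A host-side
// C++ equivalent for a single block:

#include <cstdint>
#include <cstdio>

static void unpack_block(const uint8_t* in, int rate, float eb, float out[32]) {
    uint32_t signFlag = (uint32_t(in[0]) << 24) | (uint32_t(in[1]) << 16) |
                        (uint32_t(in[2]) << 8)  |  uint32_t(in[3]);
    int absQuant[32] = {0};
    const uint8_t* plane = in + 4;                 // bit-planes follow the sign bitmap
    for (int i = 0; i < rate; ++i)
        for (int byte = 0; byte < 4; ++byte)
            for (int k = 0; k < 8; ++k)
                absQuant[byte * 8 + k] |= ((plane[i * 4 + byte] >> (7 - k)) & 1) << i;
    int prev = 0;
    for (int i = 0; i < 32; ++i) {                 // undo the delta prediction
        int q = (signFlag & (1u << (31 - i))) ? -absQuant[i] : absQuant[i];
        int cur = q + prev;
        out[i] = cur * eb * 2.0f;                  // dequantize with the error bound
        prev = cur;
    }
}

int main() {
    uint8_t block[4] = {0, 0, 0, 0};               // rate 0: the block is just the sign bitmap
    float out[32];
    unpack_block(block, 0, 1e-3f, out);
    std::printf("first reconstructed value: %f\n", out[0]);  // 0.0 for an all-zero block
    return 0;
}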
+ cmpData[cmp_byte_ofs++] = tmp_char0; + cmpData[cmp_byte_ofs++] = tmp_char1; + cmpData[cmp_byte_ofs++] = tmp_char2; + cmpData[cmp_byte_ofs++] = tmp_char3; + mask <<= 1; + } + } + } +} + + +__global__ void SZp_decompress_kernel_f32(float* const __restrict__ decData, const unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const float eb, const size_t nbEle) +{ + __shared__ unsigned int base_idx; + + const int tid = threadIdx.x; + const int idx = blockIdx.x * blockDim.x + tid; + const int lane = idx & 31; + const int warp = idx >> 5; + const int block_num = dec_chunk_f32/32; + const int start_idx = idx * dec_chunk_f32; + const int start_block_idx = start_idx/32; + const int rate_ofs = (nbEle+31)/32; + + int temp_start_idx; + int block_idx; + int absQuant[32]; + int currQuant, lorenQuant, prevQuant; + int sign_ofs; + int fixed_rate[block_num]; + unsigned int thread_ofs = 0; + + for(int j=0; j= i) thread_ofs += tmp; + } + __syncthreads(); + + if(lane==31) + { + cmpOffset[warp+1] = (thread_ofs+7)/8; + __threadfence(); + if(warp==0) + { + flag[1] = 2; + __threadfence(); + } + else + { + flag[warp+1] = 1; + __threadfence(); + } + } + __syncthreads(); + + if(warp>0) + { + if(!lane) + { + int temp_flag = 1; + while(temp_flag!=2) temp_flag = flag[warp]; + __threadfence(); + cmpOffset[warp] += cmpOffset[warp-1]; + __threadfence(); + flag[warp+1] = 2; + } + } + __syncthreads(); + + if(!lane) base_idx = cmpOffset[warp] + rate_ofs; + __syncthreads(); + + unsigned int prev_thread = __shfl_up_sync(0xffffffff, thread_ofs, 1); + unsigned int cmp_byte_ofs; + if(!lane) cmp_byte_ofs = base_idx; + else cmp_byte_ofs = base_idx + prev_thread / 8; + + for(int j=0; j> 7) & 0x00000001) << i; + absQuant[1] |= ((tmp_char0 >> 6) & 0x00000001) << i; + absQuant[2] |= ((tmp_char0 >> 5) & 0x00000001) << i; + absQuant[3] |= ((tmp_char0 >> 4) & 0x00000001) << i; + absQuant[4] |= ((tmp_char0 >> 3) & 0x00000001) << i; + absQuant[5] |= ((tmp_char0 >> 2) & 0x00000001) << i; + absQuant[6] |= ((tmp_char0 >> 1) & 0x00000001) << i; + absQuant[7] |= ((tmp_char0 >> 0) & 0x00000001) << i; + + absQuant[8] |= ((tmp_char1 >> 7) & 0x00000001) << i; + absQuant[9] |= ((tmp_char1 >> 6) & 0x00000001) << i; + absQuant[10] |= ((tmp_char1 >> 5) & 0x00000001) << i; + absQuant[11] |= ((tmp_char1 >> 4) & 0x00000001) << i; + absQuant[12] |= ((tmp_char1 >> 3) & 0x00000001) << i; + absQuant[13] |= ((tmp_char1 >> 2) & 0x00000001) << i; + absQuant[14] |= ((tmp_char1 >> 1) & 0x00000001) << i; + absQuant[15] |= ((tmp_char1 >> 0) & 0x00000001) << i; + + absQuant[16] |= ((tmp_char2 >> 7) & 0x00000001) << i; + absQuant[17] |= ((tmp_char2 >> 6) & 0x00000001) << i; + absQuant[18] |= ((tmp_char2 >> 5) & 0x00000001) << i; + absQuant[19] |= ((tmp_char2 >> 4) & 0x00000001) << i; + absQuant[20] |= ((tmp_char2 >> 3) & 0x00000001) << i; + absQuant[21] |= ((tmp_char2 >> 2) & 0x00000001) << i; + absQuant[22] |= ((tmp_char2 >> 1) & 0x00000001) << i; + absQuant[23] |= ((tmp_char2 >> 0) & 0x00000001) << i; + + absQuant[24] |= ((tmp_char3 >> 7) & 0x00000001) << i; + absQuant[25] |= ((tmp_char3 >> 6) & 0x00000001) << i; + absQuant[26] |= ((tmp_char3 >> 5) & 0x00000001) << i; + absQuant[27] |= ((tmp_char3 >> 4) & 0x00000001) << i; + absQuant[28] |= ((tmp_char3 >> 3) & 0x00000001) << i; + absQuant[29] |= ((tmp_char3 >> 2) & 0x00000001) << i; + absQuant[30] |= ((tmp_char3 >> 1) & 0x00000001) << i; + absQuant[31] |= ((tmp_char3 >> 0) & 0x00000001) << i; + } + prevQuant = 0; + for(int i=0; i<32; 
i++) + { + sign_ofs = i % 32; + if(sign_flag & (1 << (31 - sign_ofs))) + lorenQuant = absQuant[i] * -1; + else + lorenQuant = absQuant[i]; + currQuant = lorenQuant + prevQuant; + if(temp_start_idx+i < nbEle){ + decData[temp_start_idx+i] = currQuant * eb * 2; + } + prevQuant = currQuant; + } + } + } } \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/src/cuSZp_f64.cu b/qtensor/compression/cuszp/cuSZp/src/cuSZp_f64.cu index c92dacba..30cdfbff 100644 --- a/qtensor/compression/cuszp/cuSZp/src/cuSZp_f64.cu +++ b/qtensor/compression/cuszp/cuSZp/src/cuSZp_f64.cu @@ -1,333 +1,333 @@ -#include "cuSZp_f64.h" - -__device__ inline int quantization_f64(double data, double recipPrecision) -{ - double dataRecip = data*recipPrecision; - int s = dataRecip>=-0.5?0:1; - return (int)(dataRecip+0.5) - s; -} - - -__device__ inline int get_bit_num(unsigned int x) -{ - return (sizeof(unsigned int)*8) - __clz(x); -} - - -__global__ void SZp_compress_kernel_f64(const double* const __restrict__ oriData, unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const double eb, const size_t nbEle) -{ - __shared__ unsigned int base_idx; - - const int tid = threadIdx.x; - const int idx = blockIdx.x * blockDim.x + tid; - const int lane = idx & 31; - const int warp = idx >> 5; - const int block_num = cmp_chunk_f64/32; - const int start_idx = idx * cmp_chunk_f64; - const int start_block_idx = start_idx/32; - const int rate_ofs = (nbEle+31)/32; - const double recipPrecision = 0.5/eb; - - int temp_start_idx, temp_end_idx; - int quant_chunk_idx; - int block_idx; - int currQuant, lorenQuant, prevQuant, maxQuant; - int absQuant[cmp_chunk_f64]; - unsigned int sign_flag[block_num]; - int sign_ofs; - int fixed_rate[block_num]; - unsigned int thread_ofs = 0; - - for(int j=0; j absQuant[quant_chunk_idx] ? maxQuant : absQuant[quant_chunk_idx]; - } - - fixed_rate[j] = get_bit_num(maxQuant); - thread_ofs += (fixed_rate[j]) ? 
(32+fixed_rate[j]*32) : 0; - if(block_idx= i) thread_ofs += tmp; - } - __syncthreads(); - - if(lane==31) - { - cmpOffset[warp+1] = (thread_ofs+7)/8; - __threadfence(); - if(warp==0) - { - flag[1] = 2; - __threadfence(); - } - else - { - flag[warp+1] = 1; - __threadfence(); - } - } - __syncthreads(); - - if(warp>0) - { - if(!lane) - { - int temp_flag = 1; - while(temp_flag!=2) temp_flag = flag[warp]; - __threadfence(); - cmpOffset[warp] += cmpOffset[warp-1]; - if(warp==gridDim.x-1) cmpOffset[warp+1] += cmpOffset[warp]; - __threadfence(); - flag[warp+1] = 2; - } - - } - __syncthreads(); - - if(!lane) base_idx = cmpOffset[warp] + rate_ofs; - __syncthreads(); - - unsigned int prev_thread = __shfl_up_sync(0xffffffff, thread_ofs, 1); - unsigned int cmp_byte_ofs; - if(!lane) cmp_byte_ofs = base_idx; - else cmp_byte_ofs = base_idx + prev_thread / 8; - - for(int j=0; j> 24); - cmpData[cmp_byte_ofs++] = 0xff & (sign_flag[j] >> 16); - cmpData[cmp_byte_ofs++] = 0xff & (sign_flag[j] >> 8); - cmpData[cmp_byte_ofs++] = 0xff & sign_flag[j]; - - unsigned char tmp_char0, tmp_char1, tmp_char2, tmp_char3; - int mask = 1; - for(int i=0; i> i) << 7) | - (((absQuant[chunk_idx_start+1] & mask) >> i) << 6) | - (((absQuant[chunk_idx_start+2] & mask) >> i) << 5) | - (((absQuant[chunk_idx_start+3] & mask) >> i) << 4) | - (((absQuant[chunk_idx_start+4] & mask) >> i) << 3) | - (((absQuant[chunk_idx_start+5] & mask) >> i) << 2) | - (((absQuant[chunk_idx_start+6] & mask) >> i) << 1) | - (((absQuant[chunk_idx_start+7] & mask) >> i) << 0); - - tmp_char1 = (((absQuant[chunk_idx_start+8] & mask) >> i) << 7) | - (((absQuant[chunk_idx_start+9] & mask) >> i) << 6) | - (((absQuant[chunk_idx_start+10] & mask) >> i) << 5) | - (((absQuant[chunk_idx_start+11] & mask) >> i) << 4) | - (((absQuant[chunk_idx_start+12] & mask) >> i) << 3) | - (((absQuant[chunk_idx_start+13] & mask) >> i) << 2) | - (((absQuant[chunk_idx_start+14] & mask) >> i) << 1) | - (((absQuant[chunk_idx_start+15] & mask) >> i) << 0); - - tmp_char2 = (((absQuant[chunk_idx_start+16] & mask) >> i) << 7) | - (((absQuant[chunk_idx_start+17] & mask) >> i) << 6) | - (((absQuant[chunk_idx_start+18] & mask) >> i) << 5) | - (((absQuant[chunk_idx_start+19] & mask) >> i) << 4) | - (((absQuant[chunk_idx_start+20] & mask) >> i) << 3) | - (((absQuant[chunk_idx_start+21] & mask) >> i) << 2) | - (((absQuant[chunk_idx_start+22] & mask) >> i) << 1) | - (((absQuant[chunk_idx_start+23] & mask) >> i) << 0); - - tmp_char3 = (((absQuant[chunk_idx_start+24] & mask) >> i) << 7) | - (((absQuant[chunk_idx_start+25] & mask) >> i) << 6) | - (((absQuant[chunk_idx_start+26] & mask) >> i) << 5) | - (((absQuant[chunk_idx_start+27] & mask) >> i) << 4) | - (((absQuant[chunk_idx_start+28] & mask) >> i) << 3) | - (((absQuant[chunk_idx_start+29] & mask) >> i) << 2) | - (((absQuant[chunk_idx_start+30] & mask) >> i) << 1) | - (((absQuant[chunk_idx_start+31] & mask) >> i) << 0); - - // Move data to global memory. 
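// Editor's note (illustrative sketch): in both kernels, the loop built on
// __shfl_up_sync turns each lane's private bit count (thread_ofs) into an
// inclusive prefix sum across the warp, so lane 31 ends up holding the warp's
// total, which it converts to bytes and publishes through cmpOffset. A serial
// C++ equivalent of that scan, with made-up per-lane counts:

#include <cstdio>

int main() {
    unsigned thread_ofs[32];
    for (int lane = 0; lane < 32; ++lane) thread_ofs[lane] = lane + 1;  // toy bit counts
    for (int lane = 1; lane < 32; ++lane)                               // inclusive scan
        thread_ofs[lane] += thread_ofs[lane - 1];
    std::printf("warp total (lane 31): %u bits -> %u bytes\n",
                thread_ofs[31], (thread_ofs[31] + 7) / 8);              // 528 bits -> 66 bytes
    return 0;
}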
- cmpData[cmp_byte_ofs++] = tmp_char0; - cmpData[cmp_byte_ofs++] = tmp_char1; - cmpData[cmp_byte_ofs++] = tmp_char2; - cmpData[cmp_byte_ofs++] = tmp_char3; - mask <<= 1; - } - } - } -} - - -__global__ void SZp_decompress_kernel_f64(double* const __restrict__ decData, const unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const double eb, const size_t nbEle) -{ - __shared__ unsigned int base_idx; - - const int tid = threadIdx.x; - const int idx = blockIdx.x * blockDim.x + tid; - const int lane = idx & 31; - const int warp = idx >> 5; - const int block_num = dec_chunk_f64/32; - const int start_idx = idx * dec_chunk_f64; - const int start_block_idx = start_idx/32; - const int rate_ofs = (nbEle+31)/32; - - int temp_start_idx; - int block_idx; - int absQuant[32]; - int currQuant, lorenQuant, prevQuant; - int sign_ofs; - int fixed_rate[block_num]; - unsigned int thread_ofs = 0; - - for(int j=0; j= i) thread_ofs += tmp; - } - __syncthreads(); - - if(lane==31) - { - cmpOffset[warp+1] = (thread_ofs+7)/8; - __threadfence(); - if(warp==0) - { - flag[1] = 2; - __threadfence(); - } - else - { - flag[warp+1] = 1; - __threadfence(); - } - } - __syncthreads(); - - if(warp>0) - { - if(!lane) - { - int temp_flag = 1; - while(temp_flag!=2) temp_flag = flag[warp]; - __threadfence(); - cmpOffset[warp] += cmpOffset[warp-1]; - __threadfence(); - flag[warp+1] = 2; - } - } - __syncthreads(); - - if(!lane) base_idx = cmpOffset[warp] + rate_ofs; - __syncthreads(); - - unsigned int prev_thread = __shfl_up_sync(0xffffffff, thread_ofs, 1); - unsigned int cmp_byte_ofs; - if(!lane) cmp_byte_ofs = base_idx; - else cmp_byte_ofs = base_idx + prev_thread / 8; - - for(int j=0; j> 7) & 0x00000001) << i; - absQuant[1] |= ((tmp_char0 >> 6) & 0x00000001) << i; - absQuant[2] |= ((tmp_char0 >> 5) & 0x00000001) << i; - absQuant[3] |= ((tmp_char0 >> 4) & 0x00000001) << i; - absQuant[4] |= ((tmp_char0 >> 3) & 0x00000001) << i; - absQuant[5] |= ((tmp_char0 >> 2) & 0x00000001) << i; - absQuant[6] |= ((tmp_char0 >> 1) & 0x00000001) << i; - absQuant[7] |= ((tmp_char0 >> 0) & 0x00000001) << i; - - absQuant[8] |= ((tmp_char1 >> 7) & 0x00000001) << i; - absQuant[9] |= ((tmp_char1 >> 6) & 0x00000001) << i; - absQuant[10] |= ((tmp_char1 >> 5) & 0x00000001) << i; - absQuant[11] |= ((tmp_char1 >> 4) & 0x00000001) << i; - absQuant[12] |= ((tmp_char1 >> 3) & 0x00000001) << i; - absQuant[13] |= ((tmp_char1 >> 2) & 0x00000001) << i; - absQuant[14] |= ((tmp_char1 >> 1) & 0x00000001) << i; - absQuant[15] |= ((tmp_char1 >> 0) & 0x00000001) << i; - - absQuant[16] |= ((tmp_char2 >> 7) & 0x00000001) << i; - absQuant[17] |= ((tmp_char2 >> 6) & 0x00000001) << i; - absQuant[18] |= ((tmp_char2 >> 5) & 0x00000001) << i; - absQuant[19] |= ((tmp_char2 >> 4) & 0x00000001) << i; - absQuant[20] |= ((tmp_char2 >> 3) & 0x00000001) << i; - absQuant[21] |= ((tmp_char2 >> 2) & 0x00000001) << i; - absQuant[22] |= ((tmp_char2 >> 1) & 0x00000001) << i; - absQuant[23] |= ((tmp_char2 >> 0) & 0x00000001) << i; - - absQuant[24] |= ((tmp_char3 >> 7) & 0x00000001) << i; - absQuant[25] |= ((tmp_char3 >> 6) & 0x00000001) << i; - absQuant[26] |= ((tmp_char3 >> 5) & 0x00000001) << i; - absQuant[27] |= ((tmp_char3 >> 4) & 0x00000001) << i; - absQuant[28] |= ((tmp_char3 >> 3) & 0x00000001) << i; - absQuant[29] |= ((tmp_char3 >> 2) & 0x00000001) << i; - absQuant[30] |= ((tmp_char3 >> 1) & 0x00000001) << i; - absQuant[31] |= ((tmp_char3 >> 0) & 0x00000001) << i; - } - prevQuant = 0; - for(int i=0; 
i<32; i++) - { - sign_ofs = i % 32; - if(sign_flag & (1 << (31 - sign_ofs))) - lorenQuant = absQuant[i] * -1; - else - lorenQuant = absQuant[i]; - currQuant = lorenQuant + prevQuant; - decData[temp_start_idx+i] = currQuant * eb * 2; - prevQuant = currQuant; - } - } - } -} +#include "cuSZp_f64.h" + +__device__ inline int quantization_f64(double data, double recipPrecision) +{ + double dataRecip = data*recipPrecision; + int s = dataRecip>=-0.5?0:1; + return (int)(dataRecip+0.5) - s; +} + + +__device__ inline int get_bit_num(unsigned int x) +{ + return (sizeof(unsigned int)*8) - __clz(x); +} + + +__global__ void SZp_compress_kernel_f64(const double* const __restrict__ oriData, unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const double eb, const size_t nbEle) +{ + __shared__ unsigned int base_idx; + + const int tid = threadIdx.x; + const int idx = blockIdx.x * blockDim.x + tid; + const int lane = idx & 31; + const int warp = idx >> 5; + const int block_num = cmp_chunk_f64/32; + const int start_idx = idx * cmp_chunk_f64; + const int start_block_idx = start_idx/32; + const int rate_ofs = (nbEle+31)/32; + const double recipPrecision = 0.5/eb; + + int temp_start_idx, temp_end_idx; + int quant_chunk_idx; + int block_idx; + int currQuant, lorenQuant, prevQuant, maxQuant; + int absQuant[cmp_chunk_f64]; + unsigned int sign_flag[block_num]; + int sign_ofs; + int fixed_rate[block_num]; + unsigned int thread_ofs = 0; + + for(int j=0; j absQuant[quant_chunk_idx] ? maxQuant : absQuant[quant_chunk_idx]; + } + + fixed_rate[j] = get_bit_num(maxQuant); + thread_ofs += (fixed_rate[j]) ? (32+fixed_rate[j]*32) : 0; + if(block_idx= i) thread_ofs += tmp; + } + __syncthreads(); + + if(lane==31) + { + cmpOffset[warp+1] = (thread_ofs+7)/8; + __threadfence(); + if(warp==0) + { + flag[1] = 2; + __threadfence(); + } + else + { + flag[warp+1] = 1; + __threadfence(); + } + } + __syncthreads(); + + if(warp>0) + { + if(!lane) + { + int temp_flag = 1; + while(temp_flag!=2) temp_flag = flag[warp]; + __threadfence(); + cmpOffset[warp] += cmpOffset[warp-1]; + if(warp==gridDim.x-1) cmpOffset[warp+1] += cmpOffset[warp]; + __threadfence(); + flag[warp+1] = 2; + } + + } + __syncthreads(); + + if(!lane) base_idx = cmpOffset[warp] + rate_ofs; + __syncthreads(); + + unsigned int prev_thread = __shfl_up_sync(0xffffffff, thread_ofs, 1); + unsigned int cmp_byte_ofs; + if(!lane) cmp_byte_ofs = base_idx; + else cmp_byte_ofs = base_idx + prev_thread / 8; + + for(int j=0; j> 24); + cmpData[cmp_byte_ofs++] = 0xff & (sign_flag[j] >> 16); + cmpData[cmp_byte_ofs++] = 0xff & (sign_flag[j] >> 8); + cmpData[cmp_byte_ofs++] = 0xff & sign_flag[j]; + + unsigned char tmp_char0, tmp_char1, tmp_char2, tmp_char3; + int mask = 1; + for(int i=0; i> i) << 7) | + (((absQuant[chunk_idx_start+1] & mask) >> i) << 6) | + (((absQuant[chunk_idx_start+2] & mask) >> i) << 5) | + (((absQuant[chunk_idx_start+3] & mask) >> i) << 4) | + (((absQuant[chunk_idx_start+4] & mask) >> i) << 3) | + (((absQuant[chunk_idx_start+5] & mask) >> i) << 2) | + (((absQuant[chunk_idx_start+6] & mask) >> i) << 1) | + (((absQuant[chunk_idx_start+7] & mask) >> i) << 0); + + tmp_char1 = (((absQuant[chunk_idx_start+8] & mask) >> i) << 7) | + (((absQuant[chunk_idx_start+9] & mask) >> i) << 6) | + (((absQuant[chunk_idx_start+10] & mask) >> i) << 5) | + (((absQuant[chunk_idx_start+11] & mask) >> i) << 4) | + (((absQuant[chunk_idx_start+12] & mask) >> i) << 3) | + (((absQuant[chunk_idx_start+13] & mask) >> i) 
<< 2) | + (((absQuant[chunk_idx_start+14] & mask) >> i) << 1) | + (((absQuant[chunk_idx_start+15] & mask) >> i) << 0); + + tmp_char2 = (((absQuant[chunk_idx_start+16] & mask) >> i) << 7) | + (((absQuant[chunk_idx_start+17] & mask) >> i) << 6) | + (((absQuant[chunk_idx_start+18] & mask) >> i) << 5) | + (((absQuant[chunk_idx_start+19] & mask) >> i) << 4) | + (((absQuant[chunk_idx_start+20] & mask) >> i) << 3) | + (((absQuant[chunk_idx_start+21] & mask) >> i) << 2) | + (((absQuant[chunk_idx_start+22] & mask) >> i) << 1) | + (((absQuant[chunk_idx_start+23] & mask) >> i) << 0); + + tmp_char3 = (((absQuant[chunk_idx_start+24] & mask) >> i) << 7) | + (((absQuant[chunk_idx_start+25] & mask) >> i) << 6) | + (((absQuant[chunk_idx_start+26] & mask) >> i) << 5) | + (((absQuant[chunk_idx_start+27] & mask) >> i) << 4) | + (((absQuant[chunk_idx_start+28] & mask) >> i) << 3) | + (((absQuant[chunk_idx_start+29] & mask) >> i) << 2) | + (((absQuant[chunk_idx_start+30] & mask) >> i) << 1) | + (((absQuant[chunk_idx_start+31] & mask) >> i) << 0); + + // Move data to global memory. + cmpData[cmp_byte_ofs++] = tmp_char0; + cmpData[cmp_byte_ofs++] = tmp_char1; + cmpData[cmp_byte_ofs++] = tmp_char2; + cmpData[cmp_byte_ofs++] = tmp_char3; + mask <<= 1; + } + } + } +} + + +__global__ void SZp_decompress_kernel_f64(double* const __restrict__ decData, const unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const double eb, const size_t nbEle) +{ + __shared__ unsigned int base_idx; + + const int tid = threadIdx.x; + const int idx = blockIdx.x * blockDim.x + tid; + const int lane = idx & 31; + const int warp = idx >> 5; + const int block_num = dec_chunk_f64/32; + const int start_idx = idx * dec_chunk_f64; + const int start_block_idx = start_idx/32; + const int rate_ofs = (nbEle+31)/32; + + int temp_start_idx; + int block_idx; + int absQuant[32]; + int currQuant, lorenQuant, prevQuant; + int sign_ofs; + int fixed_rate[block_num]; + unsigned int thread_ofs = 0; + + for(int j=0; j= i) thread_ofs += tmp; + } + __syncthreads(); + + if(lane==31) + { + cmpOffset[warp+1] = (thread_ofs+7)/8; + __threadfence(); + if(warp==0) + { + flag[1] = 2; + __threadfence(); + } + else + { + flag[warp+1] = 1; + __threadfence(); + } + } + __syncthreads(); + + if(warp>0) + { + if(!lane) + { + int temp_flag = 1; + while(temp_flag!=2) temp_flag = flag[warp]; + __threadfence(); + cmpOffset[warp] += cmpOffset[warp-1]; + __threadfence(); + flag[warp+1] = 2; + } + } + __syncthreads(); + + if(!lane) base_idx = cmpOffset[warp] + rate_ofs; + __syncthreads(); + + unsigned int prev_thread = __shfl_up_sync(0xffffffff, thread_ofs, 1); + unsigned int cmp_byte_ofs; + if(!lane) cmp_byte_ofs = base_idx; + else cmp_byte_ofs = base_idx + prev_thread / 8; + + for(int j=0; j> 7) & 0x00000001) << i; + absQuant[1] |= ((tmp_char0 >> 6) & 0x00000001) << i; + absQuant[2] |= ((tmp_char0 >> 5) & 0x00000001) << i; + absQuant[3] |= ((tmp_char0 >> 4) & 0x00000001) << i; + absQuant[4] |= ((tmp_char0 >> 3) & 0x00000001) << i; + absQuant[5] |= ((tmp_char0 >> 2) & 0x00000001) << i; + absQuant[6] |= ((tmp_char0 >> 1) & 0x00000001) << i; + absQuant[7] |= ((tmp_char0 >> 0) & 0x00000001) << i; + + absQuant[8] |= ((tmp_char1 >> 7) & 0x00000001) << i; + absQuant[9] |= ((tmp_char1 >> 6) & 0x00000001) << i; + absQuant[10] |= ((tmp_char1 >> 5) & 0x00000001) << i; + absQuant[11] |= ((tmp_char1 >> 4) & 0x00000001) << i; + absQuant[12] |= ((tmp_char1 >> 3) & 0x00000001) << i; + absQuant[13] |= ((tmp_char1 >> 
2) & 0x00000001) << i; + absQuant[14] |= ((tmp_char1 >> 1) & 0x00000001) << i; + absQuant[15] |= ((tmp_char1 >> 0) & 0x00000001) << i; + + absQuant[16] |= ((tmp_char2 >> 7) & 0x00000001) << i; + absQuant[17] |= ((tmp_char2 >> 6) & 0x00000001) << i; + absQuant[18] |= ((tmp_char2 >> 5) & 0x00000001) << i; + absQuant[19] |= ((tmp_char2 >> 4) & 0x00000001) << i; + absQuant[20] |= ((tmp_char2 >> 3) & 0x00000001) << i; + absQuant[21] |= ((tmp_char2 >> 2) & 0x00000001) << i; + absQuant[22] |= ((tmp_char2 >> 1) & 0x00000001) << i; + absQuant[23] |= ((tmp_char2 >> 0) & 0x00000001) << i; + + absQuant[24] |= ((tmp_char3 >> 7) & 0x00000001) << i; + absQuant[25] |= ((tmp_char3 >> 6) & 0x00000001) << i; + absQuant[26] |= ((tmp_char3 >> 5) & 0x00000001) << i; + absQuant[27] |= ((tmp_char3 >> 4) & 0x00000001) << i; + absQuant[28] |= ((tmp_char3 >> 3) & 0x00000001) << i; + absQuant[29] |= ((tmp_char3 >> 2) & 0x00000001) << i; + absQuant[30] |= ((tmp_char3 >> 1) & 0x00000001) << i; + absQuant[31] |= ((tmp_char3 >> 0) & 0x00000001) << i; + } + prevQuant = 0; + for(int i=0; i<32; i++) + { + sign_ofs = i % 32; + if(sign_flag & (1 << (31 - sign_ofs))) + lorenQuant = absQuant[i] * -1; + else + lorenQuant = absQuant[i]; + currQuant = lorenQuant + prevQuant; + decData[temp_start_idx+i] = currQuant * eb * 2; + prevQuant = currQuant; + } + } + } +} diff --git a/qtensor/compression/cuszp/cuSZp/src/cuSZp_timer.cu b/qtensor/compression/cuszp/cuSZp/src/cuSZp_timer.cu index 74c81c30..5148af98 100644 --- a/qtensor/compression/cuszp/cuSZp/src/cuSZp_timer.cu +++ b/qtensor/compression/cuszp/cuSZp/src/cuSZp_timer.cu @@ -1,31 +1,31 @@ -#include "cuSZp_timer.h" - -TimingGPU::TimingGPU() { privateTimingGPU = new PrivateTimingGPU; } - -TimingGPU::~TimingGPU() { } - -void TimingGPU::StartCounter() -{ - cudaEventCreate(&((*privateTimingGPU).start)); - cudaEventCreate(&((*privateTimingGPU).stop)); - cudaEventRecord((*privateTimingGPU).start,0); -} - -void TimingGPU::StartCounterFlags() -{ - int eventflags = cudaEventBlockingSync; - - cudaEventCreateWithFlags(&((*privateTimingGPU).start),eventflags); - cudaEventCreateWithFlags(&((*privateTimingGPU).stop),eventflags); - cudaEventRecord((*privateTimingGPU).start,0); -} - -// Gets the counter in ms -float TimingGPU::GetCounter() -{ - float time; - cudaEventRecord((*privateTimingGPU).stop, 0); - cudaEventSynchronize((*privateTimingGPU).stop); - cudaEventElapsedTime(&time,(*privateTimingGPU).start,(*privateTimingGPU).stop); - return time; -} +#include "cuSZp_timer.h" + +TimingGPU::TimingGPU() { privateTimingGPU = new PrivateTimingGPU; } + +TimingGPU::~TimingGPU() { } + +void TimingGPU::StartCounter() +{ + cudaEventCreate(&((*privateTimingGPU).start)); + cudaEventCreate(&((*privateTimingGPU).stop)); + cudaEventRecord((*privateTimingGPU).start,0); +} + +void TimingGPU::StartCounterFlags() +{ + int eventflags = cudaEventBlockingSync; + + cudaEventCreateWithFlags(&((*privateTimingGPU).start),eventflags); + cudaEventCreateWithFlags(&((*privateTimingGPU).stop),eventflags); + cudaEventRecord((*privateTimingGPU).start,0); +} + +// Gets the counter in ms +float TimingGPU::GetCounter() +{ + float time; + cudaEventRecord((*privateTimingGPU).stop, 0); + cudaEventSynchronize((*privateTimingGPU).stop); + cudaEventElapsedTime(&time,(*privateTimingGPU).start,(*privateTimingGPU).stop); + return time; +} diff --git a/qtensor/compression/cuszp/cuSZp/src/cuSZp_utility.cu b/qtensor/compression/cuszp/cuSZp/src/cuSZp_utility.cu index 077951f8..d72c17a0 100644 --- 
a/qtensor/compression/cuszp/cuSZp/src/cuSZp_utility.cu +++ b/qtensor/compression/cuszp/cuSZp/src/cuSZp_utility.cu @@ -1,614 +1,614 @@ -// -// Created by Yafan Huang on 5/31/22. -// Copied from SZ2, QCAT, and SZx. -// -#include -#include -#include -#include -#include -#include "cuSZp_utility.h" - -/*Macro Definition for Processing Data*/ -#define SZ_SCES 0 //successful -#define SZ_NSCS -1 //Not successful -#define SZ_FERR -2 //Failed to open input file -#define SZ_TERR -3 //wrong data type (should be only float or double) -#define RW_SCES 0 -#define RW_FERR 1 -#define RW_TERR 2 -#define LITTLE_ENDIAN_SYSTEM 0 -#define QCAT_BUFS 64 - - -/*Global Varaibles for Processing Data*/ -int dataEndianType_Yafan = 0; -int sysEndianType_Yafan = 0; //0 means little endian, 1 means big endian - - -typedef union llfloat -{ - float value; - unsigned int ivalue; - unsigned char byte[4]; -} llfloat; - - -typedef union lldouble -{ - double value; - uint64_t lvalue; - unsigned char byte[8]; -} lldouble; - - -/** ************************************************************************ - * @brief Reverse 4-bit-length unsigned char array. - * - * @param data[4] 4-bit-length unsigned char array. - * *********************************************************************** */ -void symTransForm_4Bytes(unsigned char data[4]) -{ - unsigned char tmp = data[0]; - data[0] = data[3]; - data[3] = tmp; - - tmp = data[1]; - data[1] = data[2]; - data[2] = tmp; -} - - -/** ************************************************************************ - * @brief Reverse 8-bit-length unsigned char array. - * - * @param data[8] 8-bit-length unsigned char array. - * *********************************************************************** */ -void symTransform_8bytes(unsigned char data[8]) -{ - unsigned char tmp = data[0]; - data[0] = data[7]; - data[7] = tmp; - - tmp = data[1]; - data[1] = data[6]; - data[6] = tmp; - - tmp = data[2]; - data[2] = data[5]; - data[5] = tmp; - - tmp = data[3]; - data[3] = data[4]; - data[4] = tmp; -} - - -/** ************************************************************************ - * @brief Read byte data from path to source binary format file. - * Usually used for decompressing data from input file. - * Variables byteLength and status can be obtained through this function. - * - * @param srcFilePath input source file path - * @param byteLength the length of byte array - * @param status data processing states (macro definitions) - * - * @return byteBuf unsigned char array with length byteLength - * *********************************************************************** */ -unsigned char *readByteData_Yafan(char *srcFilePath, size_t *byteLength, int *status) -{ - FILE *pFile = fopen(srcFilePath, "rb"); - if (pFile == NULL) - { - printf("Failed to open input file. 1\n"); - *status = RW_FERR; - return 0; - } - fseek(pFile, 0, SEEK_END); - *byteLength = ftell(pFile); - fclose(pFile); - - unsigned char *byteBuf = ( unsigned char *)malloc((*byteLength)*sizeof(unsigned char)); //sizeof(char)==1 - - pFile = fopen(srcFilePath, "rb"); - if (pFile == NULL) - { - printf("Failed to open input file. 2\n"); - *status = RW_FERR; - return 0; - } - fread(byteBuf, 1, *byteLength, pFile); - fclose(pFile); - *status = RW_SCES; - return byteBuf; -} - - -/** ************************************************************************ - * @brief Read float data from path to source binary format file in endian systems. - * Usually used for compressing data from input file. 
- * Variables nbEle and status can be obtained through this function. - * - * @param srcFilePath input source file path - * @param nbEle the length of float array - * @param status data processing states (macro definitions) - * - * @return daBuf float array with length nbEle - * *********************************************************************** */ -float *readFloatData_systemEndian_Yafan(char *srcFilePath, size_t *nbEle, int *status) -{ - size_t inSize; - FILE *pFile = fopen(srcFilePath, "rb"); - if (pFile == NULL) - { - printf("Failed to open input file. 1\n"); - *status = RW_FERR; - return NULL; - } - fseek(pFile, 0, SEEK_END); - inSize = ftell(pFile); - *nbEle = inSize/4; - fclose(pFile); - - if(inSize<=0) - { - printf("Error: input file is wrong!\n"); - *status = RW_FERR; - } - - float *daBuf = (float *)malloc(inSize); - - pFile = fopen(srcFilePath, "rb"); - if (pFile == NULL) - { - printf("Failed to open input file. 2\n"); - *status = RW_FERR; - return NULL; - } - fread(daBuf, 4, *nbEle, pFile); - fclose(pFile); - *status = RW_SCES; - return daBuf; -} - - -/** ************************************************************************ - * @brief Read float data from path to source binary format file. - * Usually used for compressing data from input file. - * Variables nbEle and status can be obtained through this function. - * - * @param srcFilePath input source file path - * @param nbEle the length of float array - * @param status data processing states (macro definitions) - * - * @return daBuf float array with length nbEle - * *********************************************************************** */ -float *readFloatData_Yafan(char *srcFilePath, size_t *nbEle, int *status) -{ - int state = RW_SCES; - if(dataEndianType_Yafan==sysEndianType_Yafan) - { - float *daBuf = readFloatData_systemEndian_Yafan(srcFilePath, nbEle, &state); - *status = state; - return daBuf; - } - else - { - size_t i,j; - - size_t byteLength; - unsigned char* bytes = readByteData_Yafan(srcFilePath, &byteLength, &state); - if(state == RW_FERR) - { - *status = RW_FERR; - return NULL; - } - float *daBuf = (float *)malloc(byteLength); - *nbEle = byteLength/4; - - llfloat buf; - for(i = 0;i<*nbEle;i++) - { - j = i*4; - memcpy(buf.byte, bytes+j, 4); - symTransForm_4Bytes(buf.byte); - daBuf[i] = buf.value; - } - free(bytes); - return daBuf; - } -} - -/** ************************************************************************ - * @brief Read double data from path to source binary format file in endian systems. - * Usually used for compressing data from input file. - * Variables nbEle and status can be obtained through this function. - * - * @param srcFilePath input source file path - * @param nbEle the length of double array - * @param status data processing states (macro definitions) - * - * @return daBuf double array with length nbEle - * *********************************************************************** */ -double *readDoubleData_systemEndian_Yafan(char *srcFilePath, size_t *nbEle, int *status) -{ - size_t inSize; - FILE *pFile = fopen(srcFilePath, "rb"); - if (pFile == NULL) - { - printf("Failed to open input file. 1\n"); - *status = SZ_FERR; - return NULL; - } - fseek(pFile, 0, SEEK_END); - inSize = ftell(pFile); - *nbEle = inSize/8; //only support double in this version - fclose(pFile); - - double *daBuf = (double *)malloc(inSize); - - pFile = fopen(srcFilePath, "rb"); - if (pFile == NULL) - { - printf("Failed to open input file. 
2\n"); - *status = SZ_FERR; - return NULL; - } - fread(daBuf, 8, *nbEle, pFile); - fclose(pFile); - *status = SZ_SCES; - return daBuf; -} - - -/** ************************************************************************ - * @brief Read double data from path to source binary format file. - * Usually used for compressing data from input file. - * Variables nbEle and status can be obtained through this function. - * - * @param srcFilePath input source file path - * @param nbEle the length of double array - * @param status data processing states (macro definitions) - * - * @return daBuf double array with length nbEle - * *********************************************************************** */ -double *readDoubleData_Yafan(char *srcFilePath, size_t *nbEle, int *status) -{ - int state = SZ_SCES; - if(dataEndianType_Yafan==sysEndianType_Yafan) - { - double *daBuf = readDoubleData_systemEndian_Yafan(srcFilePath, nbEle,&state); - *status = state; - return daBuf; - } - else - { - size_t i,j; - - size_t byteLength; - unsigned char* bytes = readByteData_Yafan(srcFilePath, &byteLength, &state); - if(state==SZ_FERR) - { - *status = SZ_FERR; - return NULL; - } - double *daBuf = (double *)malloc(byteLength); - *nbEle = byteLength/8; - - lldouble buf; - for(i = 0;i<*nbEle;i++) - { - j = i*8; - memcpy(buf.byte, bytes+j, 8); - symTransform_8bytes(buf.byte); - daBuf[i] = buf.value; - } - free(bytes); - return daBuf; - } -} - - -/** ************************************************************************ - * @brief Write byte data to binary format file. - * Usually used for writing compressed data. - * Variable status can be obtained/switched through this function. - * - * @param bytes unsigned char array (compressed data) - * @param byteLength the length of unsigned char array - * @param tgtFilePath output file path - * @param status data processing states (macro definitions) - * *********************************************************************** */ -void writeByteData_Yafan(unsigned char *bytes, size_t byteLength, char *tgtFilePath, int *status) -{ - FILE *pFile = fopen(tgtFilePath, "wb"); - if (pFile == NULL) - { - printf("Failed to open input file. 3\n"); - *status = RW_FERR; - return; - } - - fwrite(bytes, 1, byteLength, pFile); //write outSize bytes - fclose(pFile); - *status = RW_SCES; -} - - -/** ************************************************************************ - * @brief Write float data to binary format file. - * Usually used for writing decompressed (reconstructed) data. - * Variable status can be obtained/switched through this function. 
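// Editor's note (illustrative sketch): when the data and system endianness
// differ, the readers and writers in this file reinterpret each float through
// the llfloat union and reverse its bytes with symTransForm_4Bytes. An
// equivalent written with memcpy instead of the union:

#include <cstdio>
#include <cstring>

static float swap_float_bytes(float v) {
    unsigned char b[4];
    std::memcpy(b, &v, 4);                          // raw bytes of the float
    unsigned char r[4] = {b[3], b[2], b[1], b[0]};  // reversed order
    std::memcpy(&v, r, 4);
    return v;
}

int main() {
    float x = 1.0f;
    std::printf("%f\n", swap_float_bytes(swap_float_bytes(x)));  // swapping twice restores 1.000000
    return 0;
}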
- * - * @param bytes unsigned char array (compressed data) - * @param nbEle the length of float array - * @param tgtFilePath output file path - * @param status data processing states (macro definitions) - * *********************************************************************** */ -void writeFloatData_inBytes_Yafan(float *data, size_t nbEle, char* tgtFilePath, int *status) -{ - size_t i = 0; - int state = RW_SCES; - llfloat buf; - unsigned char* bytes = (unsigned char*)malloc(nbEle*sizeof(float)); - for(i=0;idata[index]) - xMin=data[index]; - if(xMaxother[index]) - yMin=other[index]; - if(yMaxsize0) { - printf("ERROR: windowSize0 = %d > %zu\n", windowSize0, size0); - } - if(windowSize1>size1) { - printf("ERROR: windowSize1 = %d > %zu\n", windowSize1, size1); - } - if(windowSize2>size2) { - printf("ERROR: windowSize2 = %d > %zu\n", windowSize2, size2); - } - //offsetInc0=windowSize0/2; - //offsetInc1=windowSize1/2; - //offsetInc2=windowSize2/2; - offsetInc0=windowShift0; - offsetInc1=windowShift1; - offsetInc2=windowShift2; - for(offset2=0; offset2+windowSize2<=size2; offset2+=offsetInc2) { //MOVING WINDOW - for(offset1=0; offset1+windowSize1<=size1; offset1+=offsetInc1) { //MOVING WINDOW - for(offset0=0; offset0+windowSize0<=size0; offset0+=offsetInc0) { //MOVING WINDOW - nw++; - ssimSum+=SSIM_3d_calcWindow_float(oriData, decData, size1, size0, offset0, offset1, offset2, windowSize0, windowSize1, windowSize2); - } - } - } - return ssimSum/nw; -} - -/** ************************************************************************ - * @brief Calculate PSNR between 3D original and decompressed (reconstructed) data. - * API for computing PSNR. - * - * @param nbEle the length of float array - * @param ori_data original float array - * @param dec_data decompressed (reconstructed) float array - * - * @return result 6-length double array, which contains: - * 0. *Mean Square Error (MSE)* - * 1. *Value Range (Max-Min)* - * 2. *Peak Signal-to-noise Ratio (PSNR)* - * 3. Squared Error - * 4. Normalized Squared Error - * 5. Normalized Squared MSE - * *********************************************************************** */ -double *computePSNR(size_t nbEle, float *ori_data, float *data) { - size_t i = 0; - double Max = 0, Min = 0, diffMax = 0; - Max = ori_data[0]; - Min = ori_data[0]; - diffMax = data[0] > ori_data[0] ? 
data[0] - ori_data[0] : ori_data[0] - data[0]; - - //diffMax = fabs(data[0] - ori_data[0]); - double sum1 = 0, sum2 = 0, sum22 = 0; - - for (i = 0; i < nbEle; i++) { - sum1 += ori_data[i]; - sum2 += data[i]; - sum22 += data[i] * data[i]; - } - double mean1 = sum1 / nbEle; - double mean2 = sum2 / nbEle; - - double sum3 = 0, sum4 = 0; - double sum = 0, prodSum = 0, relerr = 0; - - double maxpw_relerr = 0; - for (i = 0; i < nbEle; i++) { - if (Max < ori_data[i]) Max = ori_data[i]; - if (Min > ori_data[i]) Min = ori_data[i]; - - float err = fabs(data[i] - ori_data[i]); - if (ori_data[i] != 0) { - relerr = err / fabs(ori_data[i]); - if (maxpw_relerr < relerr) - maxpw_relerr = relerr; - } - - if (diffMax < err) - diffMax = err; - prodSum += (ori_data[i] - mean1) * (data[i] - mean2); - sum3 += (ori_data[i] - mean1) * (ori_data[i] - mean1); - sum4 += (data[i] - mean2) * (data[i] - mean2); - sum += err * err; - } - double std1 = sqrt(sum3 / nbEle); - double std2 = sqrt(sum4 / nbEle); - double ee = prodSum / nbEle; - double acEff = ee / std1 / std2; - - double mse = sum / nbEle; - double range = Max - Min; - double psnr = 20 * log10(range) - 10 * log10(mse); - double normErr = sqrt(sum); - double normErr_norm = normErr / sqrt(sum22); - double nrmse = sqrt(mse) / range; - double *result = (double *) malloc(sizeof(double) * 6); - result[0] = mse; - result[1] = range; - result[2] = psnr; - result[3] = normErr; - result[4] = normErr_norm; - result[5] = nrmse; - - return result; +// +// Created by Yafan Huang on 5/31/22. +// Copied from SZ2, QCAT, and SZx. +// +#include +#include +#include +#include +#include +#include "cuSZp_utility.h" + +/*Macro Definition for Processing Data*/ +#define SZ_SCES 0 //successful +#define SZ_NSCS -1 //Not successful +#define SZ_FERR -2 //Failed to open input file +#define SZ_TERR -3 //wrong data type (should be only float or double) +#define RW_SCES 0 +#define RW_FERR 1 +#define RW_TERR 2 +#define LITTLE_ENDIAN_SYSTEM 0 +#define QCAT_BUFS 64 + + +/*Global Varaibles for Processing Data*/ +int dataEndianType_Yafan = 0; +int sysEndianType_Yafan = 0; //0 means little endian, 1 means big endian + + +typedef union llfloat +{ + float value; + unsigned int ivalue; + unsigned char byte[4]; +} llfloat; + + +typedef union lldouble +{ + double value; + uint64_t lvalue; + unsigned char byte[8]; +} lldouble; + + +/** ************************************************************************ + * @brief Reverse 4-bit-length unsigned char array. + * + * @param data[4] 4-bit-length unsigned char array. + * *********************************************************************** */ +void symTransForm_4Bytes(unsigned char data[4]) +{ + unsigned char tmp = data[0]; + data[0] = data[3]; + data[3] = tmp; + + tmp = data[1]; + data[1] = data[2]; + data[2] = tmp; +} + + +/** ************************************************************************ + * @brief Reverse 8-bit-length unsigned char array. + * + * @param data[8] 8-bit-length unsigned char array. + * *********************************************************************** */ +void symTransform_8bytes(unsigned char data[8]) +{ + unsigned char tmp = data[0]; + data[0] = data[7]; + data[7] = tmp; + + tmp = data[1]; + data[1] = data[6]; + data[6] = tmp; + + tmp = data[2]; + data[2] = data[5]; + data[5] = tmp; + + tmp = data[3]; + data[3] = data[4]; + data[4] = tmp; +} + + +/** ************************************************************************ + * @brief Read byte data from path to source binary format file. 
+ * Usually used for decompressing data from input file. + * Variables byteLength and status can be obtained through this function. + * + * @param srcFilePath input source file path + * @param byteLength the length of byte array + * @param status data processing states (macro definitions) + * + * @return byteBuf unsigned char array with length byteLength + * *********************************************************************** */ +unsigned char *readByteData_Yafan(char *srcFilePath, size_t *byteLength, int *status) +{ + FILE *pFile = fopen(srcFilePath, "rb"); + if (pFile == NULL) + { + printf("Failed to open input file. 1\n"); + *status = RW_FERR; + return 0; + } + fseek(pFile, 0, SEEK_END); + *byteLength = ftell(pFile); + fclose(pFile); + + unsigned char *byteBuf = ( unsigned char *)malloc((*byteLength)*sizeof(unsigned char)); //sizeof(char)==1 + + pFile = fopen(srcFilePath, "rb"); + if (pFile == NULL) + { + printf("Failed to open input file. 2\n"); + *status = RW_FERR; + return 0; + } + fread(byteBuf, 1, *byteLength, pFile); + fclose(pFile); + *status = RW_SCES; + return byteBuf; +} + + +/** ************************************************************************ + * @brief Read float data from path to source binary format file in endian systems. + * Usually used for compressing data from input file. + * Variables nbEle and status can be obtained through this function. + * + * @param srcFilePath input source file path + * @param nbEle the length of float array + * @param status data processing states (macro definitions) + * + * @return daBuf float array with length nbEle + * *********************************************************************** */ +float *readFloatData_systemEndian_Yafan(char *srcFilePath, size_t *nbEle, int *status) +{ + size_t inSize; + FILE *pFile = fopen(srcFilePath, "rb"); + if (pFile == NULL) + { + printf("Failed to open input file. 1\n"); + *status = RW_FERR; + return NULL; + } + fseek(pFile, 0, SEEK_END); + inSize = ftell(pFile); + *nbEle = inSize/4; + fclose(pFile); + + if(inSize<=0) + { + printf("Error: input file is wrong!\n"); + *status = RW_FERR; + } + + float *daBuf = (float *)malloc(inSize); + + pFile = fopen(srcFilePath, "rb"); + if (pFile == NULL) + { + printf("Failed to open input file. 2\n"); + *status = RW_FERR; + return NULL; + } + fread(daBuf, 4, *nbEle, pFile); + fclose(pFile); + *status = RW_SCES; + return daBuf; +} + + +/** ************************************************************************ + * @brief Read float data from path to source binary format file. + * Usually used for compressing data from input file. + * Variables nbEle and status can be obtained through this function. 
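// Editor's note (illustrative sketch): the readers above all follow the same
// pattern -- seek to the end to learn the file size, rewind, then read the
// whole file into a freshly allocated buffer. The same idea written with
// std::ifstream instead of the C stdio calls used in the original:

#include <cstdio>
#include <fstream>
#include <vector>

int main(int argc, char** argv) {
    if (argc < 2) { std::printf("usage: %s <file>\n", argv[0]); return 1; }
    std::ifstream f(argv[1], std::ios::binary | std::ios::ate);   // open positioned at the end
    if (!f) { std::printf("Failed to open input file.\n"); return 1; }
    std::streamsize byteLength = f.tellg();                       // file size in bytes
    f.seekg(0);
    std::vector<unsigned char> bytes(static_cast<size_t>(byteLength));
    f.read(reinterpret_cast<char*>(bytes.data()), byteLength);
    std::printf("read %zu bytes (%zu floats)\n",
                bytes.size(), bytes.size() / sizeof(float));
    return 0;
}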
+ * + * @param srcFilePath input source file path + * @param nbEle the length of float array + * @param status data processing states (macro definitions) + * + * @return daBuf float array with length nbEle + * *********************************************************************** */ +float *readFloatData_Yafan(char *srcFilePath, size_t *nbEle, int *status) +{ + int state = RW_SCES; + if(dataEndianType_Yafan==sysEndianType_Yafan) + { + float *daBuf = readFloatData_systemEndian_Yafan(srcFilePath, nbEle, &state); + *status = state; + return daBuf; + } + else + { + size_t i,j; + + size_t byteLength; + unsigned char* bytes = readByteData_Yafan(srcFilePath, &byteLength, &state); + if(state == RW_FERR) + { + *status = RW_FERR; + return NULL; + } + float *daBuf = (float *)malloc(byteLength); + *nbEle = byteLength/4; + + llfloat buf; + for(i = 0;i<*nbEle;i++) + { + j = i*4; + memcpy(buf.byte, bytes+j, 4); + symTransForm_4Bytes(buf.byte); + daBuf[i] = buf.value; + } + free(bytes); + return daBuf; + } +} + +/** ************************************************************************ + * @brief Read double data from path to source binary format file in endian systems. + * Usually used for compressing data from input file. + * Variables nbEle and status can be obtained through this function. + * + * @param srcFilePath input source file path + * @param nbEle the length of double array + * @param status data processing states (macro definitions) + * + * @return daBuf double array with length nbEle + * *********************************************************************** */ +double *readDoubleData_systemEndian_Yafan(char *srcFilePath, size_t *nbEle, int *status) +{ + size_t inSize; + FILE *pFile = fopen(srcFilePath, "rb"); + if (pFile == NULL) + { + printf("Failed to open input file. 1\n"); + *status = SZ_FERR; + return NULL; + } + fseek(pFile, 0, SEEK_END); + inSize = ftell(pFile); + *nbEle = inSize/8; //only support double in this version + fclose(pFile); + + double *daBuf = (double *)malloc(inSize); + + pFile = fopen(srcFilePath, "rb"); + if (pFile == NULL) + { + printf("Failed to open input file. 2\n"); + *status = SZ_FERR; + return NULL; + } + fread(daBuf, 8, *nbEle, pFile); + fclose(pFile); + *status = SZ_SCES; + return daBuf; +} + + +/** ************************************************************************ + * @brief Read double data from path to source binary format file. + * Usually used for compressing data from input file. + * Variables nbEle and status can be obtained through this function. 
+ * + * @param srcFilePath input source file path + * @param nbEle the length of double array + * @param status data processing states (macro definitions) + * + * @return daBuf double array with length nbEle + * *********************************************************************** */ +double *readDoubleData_Yafan(char *srcFilePath, size_t *nbEle, int *status) +{ + int state = SZ_SCES; + if(dataEndianType_Yafan==sysEndianType_Yafan) + { + double *daBuf = readDoubleData_systemEndian_Yafan(srcFilePath, nbEle,&state); + *status = state; + return daBuf; + } + else + { + size_t i,j; + + size_t byteLength; + unsigned char* bytes = readByteData_Yafan(srcFilePath, &byteLength, &state); + if(state==SZ_FERR) + { + *status = SZ_FERR; + return NULL; + } + double *daBuf = (double *)malloc(byteLength); + *nbEle = byteLength/8; + + lldouble buf; + for(i = 0;i<*nbEle;i++) + { + j = i*8; + memcpy(buf.byte, bytes+j, 8); + symTransform_8bytes(buf.byte); + daBuf[i] = buf.value; + } + free(bytes); + return daBuf; + } +} + + +/** ************************************************************************ + * @brief Write byte data to binary format file. + * Usually used for writing compressed data. + * Variable status can be obtained/switched through this function. + * + * @param bytes unsigned char array (compressed data) + * @param byteLength the length of unsigned char array + * @param tgtFilePath output file path + * @param status data processing states (macro definitions) + * *********************************************************************** */ +void writeByteData_Yafan(unsigned char *bytes, size_t byteLength, char *tgtFilePath, int *status) +{ + FILE *pFile = fopen(tgtFilePath, "wb"); + if (pFile == NULL) + { + printf("Failed to open input file. 3\n"); + *status = RW_FERR; + return; + } + + fwrite(bytes, 1, byteLength, pFile); //write outSize bytes + fclose(pFile); + *status = RW_SCES; +} + + +/** ************************************************************************ + * @brief Write float data to binary format file. + * Usually used for writing decompressed (reconstructed) data. + * Variable status can be obtained/switched through this function. 
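// Editor's note (illustrative sketch): computePSNR further below reports
// psnr = 20*log10(range) - 10*log10(mse), where range is max - min of the
// original data and mse is the mean squared error of the reconstruction.
// A tiny worked example with made-up values:

#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
    const double ori[4] = {0.0, 1.0, 2.0, 3.0};   // "original" data
    const double dec[4] = {0.1, 0.9, 2.1, 2.9};   // "reconstructed" data
    double mn = ori[0], mx = ori[0], se = 0.0;
    for (int i = 0; i < 4; ++i) {
        mn = std::min(mn, ori[i]);
        mx = std::max(mx, ori[i]);
        se += (ori[i] - dec[i]) * (ori[i] - dec[i]);
    }
    double mse  = se / 4.0;                                             // 0.01
    double psnr = 20.0 * std::log10(mx - mn) - 10.0 * std::log10(mse);  // ~29.54 dB
    std::printf("psnr = %.2f dB\n", psnr);
    return 0;
}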
+ * + * @param bytes unsigned char array (compressed data) + * @param nbEle the length of float array + * @param tgtFilePath output file path + * @param status data processing states (macro definitions) + * *********************************************************************** */ +void writeFloatData_inBytes_Yafan(float *data, size_t nbEle, char* tgtFilePath, int *status) +{ + size_t i = 0; + int state = RW_SCES; + llfloat buf; + unsigned char* bytes = (unsigned char*)malloc(nbEle*sizeof(float)); + for(i=0;idata[index]) + xMin=data[index]; + if(xMaxother[index]) + yMin=other[index]; + if(yMaxsize0) { + printf("ERROR: windowSize0 = %d > %zu\n", windowSize0, size0); + } + if(windowSize1>size1) { + printf("ERROR: windowSize1 = %d > %zu\n", windowSize1, size1); + } + if(windowSize2>size2) { + printf("ERROR: windowSize2 = %d > %zu\n", windowSize2, size2); + } + //offsetInc0=windowSize0/2; + //offsetInc1=windowSize1/2; + //offsetInc2=windowSize2/2; + offsetInc0=windowShift0; + offsetInc1=windowShift1; + offsetInc2=windowShift2; + for(offset2=0; offset2+windowSize2<=size2; offset2+=offsetInc2) { //MOVING WINDOW + for(offset1=0; offset1+windowSize1<=size1; offset1+=offsetInc1) { //MOVING WINDOW + for(offset0=0; offset0+windowSize0<=size0; offset0+=offsetInc0) { //MOVING WINDOW + nw++; + ssimSum+=SSIM_3d_calcWindow_float(oriData, decData, size1, size0, offset0, offset1, offset2, windowSize0, windowSize1, windowSize2); + } + } + } + return ssimSum/nw; +} + +/** ************************************************************************ + * @brief Calculate PSNR between 3D original and decompressed (reconstructed) data. + * API for computing PSNR. + * + * @param nbEle the length of float array + * @param ori_data original float array + * @param dec_data decompressed (reconstructed) float array + * + * @return result 6-length double array, which contains: + * 0. *Mean Square Error (MSE)* + * 1. *Value Range (Max-Min)* + * 2. *Peak Signal-to-noise Ratio (PSNR)* + * 3. Squared Error + * 4. Normalized Squared Error + * 5. Normalized Squared MSE + * *********************************************************************** */ +double *computePSNR(size_t nbEle, float *ori_data, float *data) { + size_t i = 0; + double Max = 0, Min = 0, diffMax = 0; + Max = ori_data[0]; + Min = ori_data[0]; + diffMax = data[0] > ori_data[0] ? 
data[0] - ori_data[0] : ori_data[0] - data[0]; + + //diffMax = fabs(data[0] - ori_data[0]); + double sum1 = 0, sum2 = 0, sum22 = 0; + + for (i = 0; i < nbEle; i++) { + sum1 += ori_data[i]; + sum2 += data[i]; + sum22 += data[i] * data[i]; + } + double mean1 = sum1 / nbEle; + double mean2 = sum2 / nbEle; + + double sum3 = 0, sum4 = 0; + double sum = 0, prodSum = 0, relerr = 0; + + double maxpw_relerr = 0; + for (i = 0; i < nbEle; i++) { + if (Max < ori_data[i]) Max = ori_data[i]; + if (Min > ori_data[i]) Min = ori_data[i]; + + float err = fabs(data[i] - ori_data[i]); + if (ori_data[i] != 0) { + relerr = err / fabs(ori_data[i]); + if (maxpw_relerr < relerr) + maxpw_relerr = relerr; + } + + if (diffMax < err) + diffMax = err; + prodSum += (ori_data[i] - mean1) * (data[i] - mean2); + sum3 += (ori_data[i] - mean1) * (ori_data[i] - mean1); + sum4 += (data[i] - mean2) * (data[i] - mean2); + sum += err * err; + } + double std1 = sqrt(sum3 / nbEle); + double std2 = sqrt(sum4 / nbEle); + double ee = prodSum / nbEle; + double acEff = ee / std1 / std2; + + double mse = sum / nbEle; + double range = Max - Min; + double psnr = 20 * log10(range) - 10 * log10(mse); + double normErr = sqrt(sum); + double normErr_norm = normErr / sqrt(sum22); + double nrmse = sqrt(mse) / range; + double *result = (double *) malloc(sizeof(double) * 6); + result[0] = mse; + result[1] = range; + result[2] = psnr; + result[3] = normErr; + result[4] = normErr_norm; + result[5] = nrmse; + + return result; } \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp_interface.cpp b/qtensor/compression/cuszp/cuSZp_interface.cpp index e46d7e04..5d241b18 100644 --- a/qtensor/compression/cuszp/cuSZp_interface.cpp +++ b/qtensor/compression/cuszp/cuSZp_interface.cpp @@ -1,137 +1,137 @@ -#include -#include -// #include -// #include -// #include -#include -#include -#include -#include -#include -#include -#include -#include - -#define CHECK_CUDA(x) \ - TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor") -#define CHECK_CONTIGUOUS(x) \ - TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") -#define CHECK_INPUT(x) \ - CHECK_CUDA(x); \ - CHECK_CONTIGUOUS(x) - -torch::Tensor compress(torch::Tensor input, float error_bound, - std::string mode) { - CHECK_INPUT(input); - // Get the input tensor's data pointer and size - float *d_input_data = input.data_ptr(); - int64_t num_elements = input.numel(); - size_t compressed_size = 0; - - // Cuda allocate memory for the compressed output - unsigned char *d_compressed_data; - cudaMalloc((void **)&d_compressed_data, num_elements * sizeof(float)); - cudaMemset(d_compressed_data, 0, num_elements * sizeof(float)); - printf("f ptr %p\n", d_input_data); - // Initializing CUDA Stream. - cudaStream_t stream; - cudaStreamCreate(&stream); - - // Just a warmup. 
- SZp_compress_deviceptr_f32(d_input_data, d_compressed_data, num_elements, - &compressed_size, error_bound, stream); - // Ensure on a 4096 boundary - // compressed_size = (compressed_size + 4095) / 4096 * 4096; - // Create a new tensor on the GPU from the compressed output - - cudaStreamSynchronize(stream); - - cudaError_t err = cudaGetLastError(); - printf("after comp\n"); - if (err != cudaSuccess) { - printf("CUDA error: %s\n", cudaGetErrorString(err)); - exit(EXIT_FAILURE); - } - - - // torch::Tensor test_t = torch::zeros(5); - err = cudaGetLastError(); - printf("after comp\n"); - if (err != cudaSuccess) { - printf("CUDA error: %s\n", cudaGetErrorString(err)); - exit(EXIT_FAILURE); - } - - - torch::Tensor output = torch::empty( - {compressed_size}, torch::TensorOptions() - .dtype(torch::kUInt8) - .device(torch::kCUDA) - .layout(at::kStrided) - .memory_format(torch::MemoryFormat::Contiguous)); - // write from d_compressed_data - cudaMemcpy(output.data_ptr(), d_compressed_data, - compressed_size, cudaMemcpyDeviceToDevice); - // Sync free - cudaStreamSynchronize(stream); - - printf("after comp2\n"); - err = cudaGetLastError(); - if (err != cudaSuccess) { - printf("CUDA error: %s\n", cudaGetErrorString(err)); - exit(EXIT_FAILURE); - } - - // cudaMemGetInfo(&free_byte, &total_byte); - // printf("GPU memory usage before output: used = %f, free = %f MB, total = %f - // MB\n", - // (double)(total_byte - free_byte) / 1024.0 / 1024.0, (double)free_byte - // / 1024.0 / 1024.0, (double)total_byte / 1024.0 / 1024.0); - cudaFree(d_compressed_data); - cudaStreamDestroy(stream); - CHECK_INPUT(output); - return output; -} - -torch::Tensor decompress(torch::Tensor compressed_data, int64_t num_elements, - size_t compressed_size, float error_bound, - std::string mode) { - CHECK_INPUT(compressed_data); - // Get the input tensor's data pointer and size - unsigned char *d_compressed_data = compressed_data.data_ptr(); - - // torch::Tensor decompressed_data = torch::empty( - // , torch::TensorOptions() - // .dtype(torch::kFloat32) - // .device(torch::kCUDA) - // .memory_format(torch::MemoryFormat::Contiguous)); - torch::Tensor decompressed_data = torch::zeros( - {num_elements}, torch::TensorOptions() - .dtype(torch::kFloat32) - .device(torch::kCUDA) - .memory_format(torch::MemoryFormat::Contiguous)); - float *d_decompressed_data = decompressed_data.data_ptr(); - - // Initializing CUDA Stream. 
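Taken together, the two entry points in this extension define the Python-side contract: compress takes a contiguous float32 CUDA tensor, an error bound, and a mode string and returns a uint8 CUDA tensor holding the compressed stream, while decompress needs the original element count, the compressed byte count, and the same error bound to rebuild a float32 CUDA tensor. A minimal usage sketch follows; it assumes the extension has been built and imports as `cuszp` (as in the setup.py later in this patch), and the tensor size and error bound are illustrative only.

import torch
import cuszp  # extension module exposed by the PYBIND11_MODULE above

x = torch.randn(1 << 20, dtype=torch.float32, device='cuda').contiguous()
err_bound = 1e-3

comp = cuszp.compress(x, err_bound, "rel")          # uint8 CUDA tensor
comp_bytes = comp.numel() * comp.element_size()     # compressed size in bytes

y = cuszp.decompress(comp, x.numel(), comp_bytes, err_bound, "rel")

print("compression ratio:", x.numel() * 4 / comp_bytes)
print("max abs error:", (x - y).abs().max().item())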
- cudaStream_t stream; - cudaStreamCreate(&stream); - - SZp_decompress_deviceptr_f32(d_decompressed_data, d_compressed_data, - num_elements, compressed_size, error_bound, - stream); - cudaStreamSynchronize(stream); - // Check cuda errors - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - printf("CUDA error: %s\n", cudaGetErrorString(err)); - exit(EXIT_FAILURE); - } - cudaStreamDestroy(stream); - CHECK_INPUT(decompressed_data); - return decompressed_data; -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("compress", &compress, "Compress a PyTorch tensor using cuSZp"); - m.def("decompress", &decompress, "Decompress a PyTorch tensor using cuSZp"); -} +#include +#include +// #include +// #include +// #include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CHECK_CUDA(x) \ + TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) \ + TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") +#define CHECK_INPUT(x) \ + CHECK_CUDA(x); \ + CHECK_CONTIGUOUS(x) + +torch::Tensor compress(torch::Tensor input, float error_bound, + std::string mode) { + CHECK_INPUT(input); + // Get the input tensor's data pointer and size + float *d_input_data = input.data_ptr(); + int64_t num_elements = input.numel(); + size_t compressed_size = 0; + + // Cuda allocate memory for the compressed output + unsigned char *d_compressed_data; + cudaMalloc((void **)&d_compressed_data, num_elements * sizeof(float)); + cudaMemset(d_compressed_data, 0, num_elements * sizeof(float)); + printf("f ptr %p\n", d_input_data); + // Initializing CUDA Stream. + cudaStream_t stream; + cudaStreamCreate(&stream); + + // Just a warmup. + SZp_compress_deviceptr_f32(d_input_data, d_compressed_data, num_elements, + &compressed_size, error_bound, stream); + // Ensure on a 4096 boundary + // compressed_size = (compressed_size + 4095) / 4096 * 4096; + // Create a new tensor on the GPU from the compressed output + + cudaStreamSynchronize(stream); + + cudaError_t err = cudaGetLastError(); + printf("after comp\n"); + if (err != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } + + + // torch::Tensor test_t = torch::zeros(5); + err = cudaGetLastError(); + printf("after comp\n"); + if (err != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } + + + torch::Tensor output = torch::empty( + {compressed_size}, torch::TensorOptions() + .dtype(torch::kUInt8) + .device(torch::kCUDA) + .layout(at::kStrided) + .memory_format(torch::MemoryFormat::Contiguous)); + // write from d_compressed_data + cudaMemcpy(output.data_ptr(), d_compressed_data, + compressed_size, cudaMemcpyDeviceToDevice); + // Sync free + cudaStreamSynchronize(stream); + + printf("after comp2\n"); + err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } + + // cudaMemGetInfo(&free_byte, &total_byte); + // printf("GPU memory usage before output: used = %f, free = %f MB, total = %f + // MB\n", + // (double)(total_byte - free_byte) / 1024.0 / 1024.0, (double)free_byte + // / 1024.0 / 1024.0, (double)total_byte / 1024.0 / 1024.0); + cudaFree(d_compressed_data); + cudaStreamDestroy(stream); + CHECK_INPUT(output); + return output; +} + +torch::Tensor decompress(torch::Tensor compressed_data, int64_t num_elements, + size_t compressed_size, float error_bound, + std::string mode) { + CHECK_INPUT(compressed_data); + // Get the input 
tensor's data pointer and size + unsigned char *d_compressed_data = compressed_data.data_ptr(); + + // torch::Tensor decompressed_data = torch::empty( + // , torch::TensorOptions() + // .dtype(torch::kFloat32) + // .device(torch::kCUDA) + // .memory_format(torch::MemoryFormat::Contiguous)); + torch::Tensor decompressed_data = torch::zeros( + {num_elements}, torch::TensorOptions() + .dtype(torch::kFloat32) + .device(torch::kCUDA) + .memory_format(torch::MemoryFormat::Contiguous)); + float *d_decompressed_data = decompressed_data.data_ptr(); + + // Initializing CUDA Stream. + cudaStream_t stream; + cudaStreamCreate(&stream); + + SZp_decompress_deviceptr_f32(d_decompressed_data, d_compressed_data, + num_elements, compressed_size, error_bound, + stream); + cudaStreamSynchronize(stream); + // Check cuda errors + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } + cudaStreamDestroy(stream); + CHECK_INPUT(decompressed_data); + return decompressed_data; +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("compress", &compress, "Compress a PyTorch tensor using cuSZp"); + m.def("decompress", &decompress, "Decompress a PyTorch tensor using cuSZp"); +} diff --git a/qtensor/compression/cuszp/cuszp_wrapper.py b/qtensor/compression/cuszp/cuszp_wrapper.py index 54d04549..6d0f8ff4 100644 --- a/qtensor/compression/cuszp/cuszp_wrapper.py +++ b/qtensor/compression/cuszp/cuszp_wrapper.py @@ -1,113 +1,113 @@ -import numpy as np -import ctypes -from ctypes import * -import random -#from qtensor.tools.lazy_import import cupy as cp -import cupy as cp -import time -import torch - -import cuszp - -from pathlib import Path - -def cuszp_device_compress(oriData, absErrBound,threshold): - - oriData = oriData.flatten() - x = torch.as_tensor(oriData, device='cuda') - - ori_real = x.real - ori_imag = x.imag - x = x.contiguous() - x = torch.cat((ori_real, ori_imag)) - x = torch.flatten(x) - bitmap = None - d = torch.max(x) - torch.min(x) - d = d.item() - absErrBound = float(absErrBound*(d)) - threshold = threshold*(d) - truth_values = torch.abs(x)<=threshold - x[truth_values] = 0.0 - - o_bytes = cuszp.compress(x, absErrBound, "rel") - outSize = o_bytes.numel()*o_bytes.element_size() - - return (o_bytes,bitmap, absErrBound), outSize - - -def cuszp_device_decompress(nbEle, cmpBytes): - - (cmpBytes, bitmap, absErrBound) = cmpBytes - - newData = cuszp.decompress( - cmpBytes, - nbEle, - cmpBytes.numel()*cmpBytes.element_size(), - absErrBound, - "rel", - ) - - arr = cp.asarray(newData) - res = arr - c_res = cp.zeros(int(nbEle/2), np.complex64) - c_res.real = res[0:int(nbEle/2)] - c_res.imag = res[int(nbEle/2):] - - return (c_res, None) - -### Example of device compress/decompress wrapper usage -class Comp(): - def __init__(self): - self.name = "dummy" - -def free_compressed(ptr): - p_ptr = ctypes.addressof(ptr) - p_int = ctypes.cast(p_ptr, ctypes.POINTER(ctypes.c_uint64)) - decomp_int = p_int.contents - #cp.cuda.runtime.free(decomp_int.value) - - -if __name__ == "__main__": - - DATA_SIZE = int(1024*64) - MAX_D = 10.0 - MIN_D = -10.0 - RANGE = MAX_D - MIN_D - r2r_threshold = 0.01 - r2r_error = 0.01 - ranga_vr = RANGE - in_vector = np.zeros((DATA_SIZE,)) - for i in range(0,int(DATA_SIZE/4)): - in_vector[i] = 0.0 - for i in range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): - in_vector[i] = 5.0 - for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): - in_vector[i] = random.uniform(MIN_D, MAX_D) - for i in range(int(3*DATA_SIZE/4), 
int(3*DATA_SIZE/4)+6): - in_vector[i] = -7.0 - for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): - in_vector[i] = 0.001 - - print(DATA_SIZE) - in_vector = in_vector.astype('complex64') - in_vector_gpu = cp.asarray(in_vector) - - #in_vector_gpu = cp.asarray(in_vector) - # variable = ctypes.c_size_t(0) - # outSize = ctypes.pointer(variable) - for i in range(2): - s_time = time.time() - o_bytes, outSize = cuszp_device_compress(in_vector_gpu, r2r_error, r2r_threshold) - print("Time python: "+str(time.time()-s_time)) - print(outSize) - print("Compress Success...starting decompress ") - comp = Comp() - - s_time = time.time() - (d_bytes,ptr )= cuszp_device_decompress(DATA_SIZE*2, o_bytes) - #free_compressed(o_bytes[0]) - #cp.cuda.runtime.free(d_bytes) - print("Time python: "+str(time.time()-s_time)) - #for i in d_bytes: - # print(i) - print("Decompress Success") +import numpy as np +import ctypes +from ctypes import * +import random +#from qtensor.tools.lazy_import import cupy as cp +import cupy as cp +import time +import torch + +import cuszp + +from pathlib import Path + +def cuszp_device_compress(oriData, absErrBound,threshold): + + oriData = oriData.flatten() + x = torch.as_tensor(oriData, device='cuda') + + ori_real = x.real + ori_imag = x.imag + x = x.contiguous() + x = torch.cat((ori_real, ori_imag)) + x = torch.flatten(x) + bitmap = None + d = torch.max(x) - torch.min(x) + d = d.item() + absErrBound = float(absErrBound*(d)) + threshold = threshold*(d) + truth_values = torch.abs(x)<=threshold + x[truth_values] = 0.0 + + o_bytes = cuszp.compress(x, absErrBound, "rel") + outSize = o_bytes.numel()*o_bytes.element_size() + + return (o_bytes,bitmap, absErrBound), outSize + + +def cuszp_device_decompress(nbEle, cmpBytes): + + (cmpBytes, bitmap, absErrBound) = cmpBytes + + newData = cuszp.decompress( + cmpBytes, + nbEle, + cmpBytes.numel()*cmpBytes.element_size(), + absErrBound, + "rel", + ) + + arr = cp.asarray(newData) + res = arr + c_res = cp.zeros(int(nbEle/2), np.complex64) + c_res.real = res[0:int(nbEle/2)] + c_res.imag = res[int(nbEle/2):] + + return (c_res, None) + +### Example of device compress/decompress wrapper usage +class Comp(): + def __init__(self): + self.name = "dummy" + +def free_compressed(ptr): + p_ptr = ctypes.addressof(ptr) + p_int = ctypes.cast(p_ptr, ctypes.POINTER(ctypes.c_uint64)) + decomp_int = p_int.contents + #cp.cuda.runtime.free(decomp_int.value) + + +if __name__ == "__main__": + + DATA_SIZE = int(1024*64) + MAX_D = 10.0 + MIN_D = -10.0 + RANGE = MAX_D - MIN_D + r2r_threshold = 0.01 + r2r_error = 0.01 + ranga_vr = RANGE + in_vector = np.zeros((DATA_SIZE,)) + for i in range(0,int(DATA_SIZE/4)): + in_vector[i] = 0.0 + for i in range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): + in_vector[i] = 5.0 + for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): + in_vector[i] = random.uniform(MIN_D, MAX_D) + for i in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): + in_vector[i] = -7.0 + for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): + in_vector[i] = 0.001 + + print(DATA_SIZE) + in_vector = in_vector.astype('complex64') + in_vector_gpu = cp.asarray(in_vector) + + #in_vector_gpu = cp.asarray(in_vector) + # variable = ctypes.c_size_t(0) + # outSize = ctypes.pointer(variable) + for i in range(2): + s_time = time.time() + o_bytes, outSize = cuszp_device_compress(in_vector_gpu, r2r_error, r2r_threshold) + print("Time python: "+str(time.time()-s_time)) + print(outSize) + print("Compress Success...starting decompress ") + comp = Comp() + + s_time = time.time() + (d_bytes,ptr )= 
cuszp_device_decompress(DATA_SIZE*2, o_bytes) + #free_compressed(o_bytes[0]) + #cp.cuda.runtime.free(d_bytes) + print("Time python: "+str(time.time()-s_time)) + #for i in d_bytes: + # print(i) + print("Decompress Success") diff --git a/qtensor/compression/cuszp/gnncuszp.py b/qtensor/compression/cuszp/gnncuszp.py index 381a1989..76bd8197 100644 --- a/qtensor/compression/cuszp/gnncuszp.py +++ b/qtensor/compression/cuszp/gnncuszp.py @@ -1,347 +1,347 @@ -import cuszp -import torch -from statcollector import StatCollector -# Create a class that performs compression and decompression on a tensor - - -class Compressor(torch.nn.Module): - def __init__(self, err_mode, err_bound, device, num_nodes,statcollector:StatCollector): - super(Compressor, self).__init__() - self.err_mode = err_mode - self.err_bound = err_bound - self.device = device - self.compressor = cuszp - self.num_nodes = num_nodes - self.sc = statcollector - - def compress(self, x): - # Ensure float32 type - if not x.dtype == torch.float32: - raise TypeError("x must be of type torch.float32") - x = x.contiguous() - if self.err_mode == "rel" or self.err_mode == "relative": - # Value-range error bound - x_max = torch.max(x) - x_min = torch.min(x) - # Compute the err_bound - err_bound = (x_max - x_min) * self.err_bound - # print("min =", x_min, "max =", x_max, "err_bound =", err_bound) - self.sc.add_tensor_stat("Min Value", x_min.item()) - self.sc.add_tensor_stat("Max Value", x_max.item()) - - elif self.err_mode == "abs" or self.err_mode == "absolute": - err_bound = self.err_bound - else: - raise ValueError("err_mode must be 'rel / relative' or 'abs / absolute'") - self.sc.add_tensor_stat("Absolute Error Bound", err_bound.item()) - - return CompressedElement(x, self.compressor.compress(x, err_bound, self.err_mode), err_bound, self.device) - - def decompress(self, comp_element): - if not isinstance(comp_element, CompressedElement): - raise TypeError("comp_element must be an instance of CompressedElement") - compressed_size = ( - comp_element.compressed_data.numel() - * comp_element.compressed_data.element_size() - ) - decompressed = self.compressor.decompress( - comp_element.compressed_data, - comp_element.uncompressed_elements, - compressed_size, - comp_element.err_bound, - self.err_mode, - ) - # Reshape decompressed to match original shape - decompressed = decompressed.reshape(comp_element.original_shape) - return decompressed - - def pack_hook(self, x): - if ( - x.dtype == torch.float32 - and x.requires_grad - and not x.is_sparse - and isinstance(x, torch.Tensor) - and x.shape[0] == self.num_nodes - ): - # print("Packing", x.shape) - t0 = self.sc.new_clock() - self.sc.sync_start_time(t0) - - compressed = self.compress(x) - - self.sc.sync_end_time(t0) - self.sc.increment_epoch_stat("Total Compression Time (s)",self.sc.get_elapsed_time(t0)) - - # print("Uncompressed size =", (x.numel() * x.element_size()) / 1024 / 1024) - # print( - # "Compressed size =", - # ( - # compressed.compressed_data.numel() - # * compressed.compressed_data.element_size() - # ) - # / 1024 - # / 1024, - # ) - # print( - # "Compression Ratio = ", - # (x.numel() * x.element_size()) - # / ( - # compressed.compressed_data.numel() - # * compressed.compressed_data.element_size() - # ), - # ) - csize = compressed.compressed_data.numel()*compressed.compressed_data.element_size() - osize = x.numel() * x.element_size() - self.sc.add_tensor_stat("Uncompressed Size (bytes)", osize) - self.sc.add_tensor_stat("Compressed Size (bytes)", csize) - 
self.sc.increment_epoch_stat("Average CR", osize/csize) - self.sc.increment_epoch_stat("Aggregate Uncompressed Tensor Size (bytes)", osize) - self.sc.increment_epoch_stat("Aggregate Compressed Tensor Size (bytes)", csize) - # print( "Data Saved", ((x.numel() * x.element_size()) - (compressed.compressed_data.numel() * compressed.compressed_data.element_size()))/1024/1024) - # print("Testing decompress,", decompressed) - # print("Compressed data", compressed.compressed_data) - # print("Decompressed shape =", decompressed.shape) - # print("X shape = ", x.shape) - # abs_error = torch.abs(x - decompressed) - # max_error = torch.max(abs_error) - # if max_error > self.err_bound * 1.1: - # # Print the location of the max error and the values - # print("Max error location =", torch.argmax(torch.abs(x - decompressed))) - # print("Max error value =", max_error) - # location = torch.argmax(torch.abs(x - decompressed)) - # # Print row and column of max error - # print("Row =", int(location / x.shape[1])) - # print("Column =", location % x.shape[1]) - # # Count the number of elements that are > self.err_bound * 1.1 - # bound_err_cnt = torch.sum(abs_error > self.err_bound * 1.1) - # print("Number of elements > err_bound * 1.1 =", bound_err_cnt) - # print("X value =", x[int(location / x.shape[1])][location % x.shape[1]]) - # print( - # "Decompressed value =", - # decompressed[int(location / x.shape[1])][location % x.shape[1]], - # ) - # raise ValueError( - # "Error bound exceeded! Max error = ", max_error - # ) - # # Ensure max_error <= err_bound - - # print("Max error =", max_error) - # Ensure x is freed - # delete x - self.sc.increment_epoch_stat("Compressed Tensor Count",1) - self.sc.register_tensor_row_and_update() - - - del x - # empty cache - torch.cuda.empty_cache() - return compressed - else: - return x - - def unpack_hook(self, x): - if isinstance(x, CompressedElement): - # print("Unpacking", x.name) - # print("Unpacking") - t0 = self.sc.new_clock() - self.sc.sync_start_time(t0) - - decompressed = self.decompress(x) - - self.sc.sync_end_time(t0) - self.sc.increment_epoch_stat("Total Decompression Time (s)",self.sc.get_elapsed_time(t0)) - - # print("Unpacked") - # print("Unpacked to", decompressed) - return decompressed - else: - return x - - -# Create class for a compressed element that is used by the Compressor class - - -class CompressedElement(torch.nn.Module): - def __init__(self, x, compressed, err_bound, device): - super(CompressedElement, self).__init__() - self.device = device - # self.compressor = cuszp - self.compressed_data = compressed - self.uncompressed_elements = x.numel() - self.original_shape = x.shape - self.err_bound = err_bound - -import numpy as np -import ctypes -from ctypes import * -import random -from qtensor.tools.lazy_import import cupy as cp -import time -import torch - -from pathlib import Path - - - -def quant_device_compress(oriData, nbEle, blockSize,threshold): - #print(nbEle) - ori_nbEle = nbEle - variable = ctypes.c_size_t(0) - outSize = ctypes.pointer(variable) - - oriData = oriData.flatten() - ori_real = oriData.real - ori_imag = oriData.imag - oriData = cp.concatenate((ori_real, ori_imag)) - sample = oriData[::2] - max_val = cp.amax(oriData).get() - min_val = cp.amin(oriData).get() - d = max_val - min_val - if d.dtype == np.complex64: - d = d.real - threshold = threshold*(d) - s_1 = time.time() - truth_values = abs(oriData)<=threshold - oriData[truth_values] = 0.0 - truth_values = cp.invert(truth_values) - ori_len = oriData.shape[0] - nonzero_percent = 
cp.count_nonzero(oriData)/oriData.shape[0] - print("Percent nonzero: "+str(nonzero_percent)) - - isGrouped = False - if nonzero_percent<=0.5: - isGrouped=True - oriData = oriData[truth_values] - - nbEle = oriData.shape[0] - - # oriData = cp.reshape(oriData, (-1, blockSize)) # Reshape to blocksize - tensor = torch.as_tensor(oriData, device='cuda') - # print("Min val: "+str(cp.amin(oriData).get())+" range: "+str(d)) -# scale = d/255.0 -# zero_point = -1*round(min_val*scale) - 128 - - scale = d/((2**8) - 1) - #zero_point = -1*round(min_val*scale) - zero_point = -1*round(min_val*scale)+32 -# q_tensor = torch.quantize_per_tensor(tensor, scale, zero_point, dtype=torch.qint8) - - q_tensor = torch.quantize_per_tensor(tensor, scale, zero_point, dtype=torch.qint8) - del tensor - torch.cuda.empty_cache() - if isGrouped: - bitmap = cp.packbits(truth_values) - else: - bitmap = None - del truth_values - #q_ten2 = torch.dequantize(q_tensor) - #print(tensor) - #print(q_ten2) - #print("Max PW error") - #print(torch.max(torch.div(torch.abs(torch.sub(tensor[tensor!=0.0],q_ten2[tensor!=0.0])),tensor[tensor!=0.0]))) - return (q_tensor, bitmap, isGrouped), (nbEle/4)+(ori_len/8) - - -def quant_device_decompress(nbEle, cmpBytes, owner, dtype): - (q_tensor, bitmap, isGrouped) = cmpBytes - if isGrouped: - bitmap = cp.unpackbits(bitmap) - restored = torch.dequantize(q_tensor) - arr = cp.asarray(restored) - # uint8_t* cmpbytes, size_t len, size_t compressed_len, float r2r_error - - # decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, num_elements_eff) - # -- Workaround to convert GPU pointer to int - # p_decompressed_ptr = ctypes.addressof(newData) - # cast to int64 pointer - # (effectively converting pointer to pointer to addr to pointer to int64) - # p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) - # decompressed_int = p_decompressed_int.contents - # # -- - # pointer_for_free = decompressed_int.value - # # self.decompressed_own.append(decompressed_int.value) - # mem = cp.cuda.UnownedMemory(decompressed_int.value, nbEle*4, owner, device_id=0) - # mem_ptr = cp.cuda.memory.MemoryPointer(mem, 0) - #print("mem ptr") - #print(mem_ptr) - # arr = cp.ndarray(shape=(nbEle,), dtype=np.float32, memptr=mem_ptr) - #print(nbEle) - if isGrouped: - res = cp.zeros((nbEle,)) - # ## need to convert newData to cupy - cp.place(res,bitmap,arr) - - c_res = cp.zeros(int(nbEle/2), np.complex64) - #c_res.real = arr[0:int(nbEle/2)] - #c_res.imag = arr[int(nbEle/2):] - - c_res.real = res[0:int(nbEle/2)] - c_res.imag = res[int(nbEle/2):] - else: - c_res = cp.zeros(int(nbEle/2), np.complex64) - c_res.real = arr[0:int(nbEle/2)] - c_res.imag = arr[int(nbEle/2):] - return (c_res, None) - -### Example of device compress/decompress wrapper usage -class Comp(): - def __init__(self): - self.name = "dummy" - -def free_compressed(ptr): - p_ptr = ctypes.addressof(ptr) - p_int = ctypes.cast(p_ptr, ctypes.POINTER(ctypes.c_uint64)) - decomp_int = p_int.contents - cp.cuda.runtime.free(decomp_int.value) - - -if __name__ == "__main__": - - DATA_SIZE = int(1024) - MAX_D = 10.0 - MIN_D = -10.0 - RANGE = MAX_D - MIN_D - r2r_threshold = 0.002 - r2r_error = 0.0001 - - in_vector = np.fromfile("all_sample.bin", dtype=np.complex64) - #print(np.max(in_vector)) - DATA_SIZE = len(in_vector) - #range_vr = np.max(in_vector)-np.min(in_vector) - #r2r_threshold = r2r_threshold*range_vr - #r2r_error = r2r_error*range_vr - #in_vector = np.zeros((DATA_SIZE,)) - #for i in range(0,int(DATA_SIZE/4)): - # in_vector[i] = 0.0 - #for i in 
range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): - # in_vector[i] = 5.0 - #for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): - # in_vector[i] = random.uniform(MIN_D, MAX_D) - #for i in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): - # in_vector[i] = -7.0 - #for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): - # in_vector[i] = 0.001 - - print(DATA_SIZE) - #in_vector = in_vector.astype('float32') - in_vector_gpu = cp.asarray(in_vector) - - # variable = ctypes.c_size_t(0) - # outSize = ctypes.pointer(variable) - for i in range(200): - s_time = time.time() - o_bytes, outSize = quant_device_compress(in_vector_gpu, DATA_SIZE, 256, r2r_threshold) - print("Time python: "+str(time.time()-s_time)) - # print(outSize[0]) - print("Compress Success...starting decompress ") - comp = Comp() - - s_time = time.time() - (d_bytes,ptr )= quant_device_decompress(DATA_SIZE*2, o_bytes, comp, in_vector_gpu.dtype) - - # free_compressed(o_bytes[0]) - # cp.cuda.runtime.free(ptr) - print("Time python: "+str(time.time()-s_time)) - #for i in d_bytes: - # print(i) +import cuszp +import torch +from statcollector import StatCollector +# Create a class that performs compression and decompression on a tensor + + +class Compressor(torch.nn.Module): + def __init__(self, err_mode, err_bound, device, num_nodes,statcollector:StatCollector): + super(Compressor, self).__init__() + self.err_mode = err_mode + self.err_bound = err_bound + self.device = device + self.compressor = cuszp + self.num_nodes = num_nodes + self.sc = statcollector + + def compress(self, x): + # Ensure float32 type + if not x.dtype == torch.float32: + raise TypeError("x must be of type torch.float32") + x = x.contiguous() + if self.err_mode == "rel" or self.err_mode == "relative": + # Value-range error bound + x_max = torch.max(x) + x_min = torch.min(x) + # Compute the err_bound + err_bound = (x_max - x_min) * self.err_bound + # print("min =", x_min, "max =", x_max, "err_bound =", err_bound) + self.sc.add_tensor_stat("Min Value", x_min.item()) + self.sc.add_tensor_stat("Max Value", x_max.item()) + + elif self.err_mode == "abs" or self.err_mode == "absolute": + err_bound = self.err_bound + else: + raise ValueError("err_mode must be 'rel / relative' or 'abs / absolute'") + self.sc.add_tensor_stat("Absolute Error Bound", err_bound.item()) + + return CompressedElement(x, self.compressor.compress(x, err_bound, self.err_mode), err_bound, self.device) + + def decompress(self, comp_element): + if not isinstance(comp_element, CompressedElement): + raise TypeError("comp_element must be an instance of CompressedElement") + compressed_size = ( + comp_element.compressed_data.numel() + * comp_element.compressed_data.element_size() + ) + decompressed = self.compressor.decompress( + comp_element.compressed_data, + comp_element.uncompressed_elements, + compressed_size, + comp_element.err_bound, + self.err_mode, + ) + # Reshape decompressed to match original shape + decompressed = decompressed.reshape(comp_element.original_shape) + return decompressed + + def pack_hook(self, x): + if ( + x.dtype == torch.float32 + and x.requires_grad + and not x.is_sparse + and isinstance(x, torch.Tensor) + and x.shape[0] == self.num_nodes + ): + # print("Packing", x.shape) + t0 = self.sc.new_clock() + self.sc.sync_start_time(t0) + + compressed = self.compress(x) + + self.sc.sync_end_time(t0) + self.sc.increment_epoch_stat("Total Compression Time (s)",self.sc.get_elapsed_time(t0)) + + # print("Uncompressed size =", (x.numel() * x.element_size()) / 1024 / 1024) + # print( + # "Compressed size 
=", + # ( + # compressed.compressed_data.numel() + # * compressed.compressed_data.element_size() + # ) + # / 1024 + # / 1024, + # ) + # print( + # "Compression Ratio = ", + # (x.numel() * x.element_size()) + # / ( + # compressed.compressed_data.numel() + # * compressed.compressed_data.element_size() + # ), + # ) + csize = compressed.compressed_data.numel()*compressed.compressed_data.element_size() + osize = x.numel() * x.element_size() + self.sc.add_tensor_stat("Uncompressed Size (bytes)", osize) + self.sc.add_tensor_stat("Compressed Size (bytes)", csize) + self.sc.increment_epoch_stat("Average CR", osize/csize) + self.sc.increment_epoch_stat("Aggregate Uncompressed Tensor Size (bytes)", osize) + self.sc.increment_epoch_stat("Aggregate Compressed Tensor Size (bytes)", csize) + # print( "Data Saved", ((x.numel() * x.element_size()) - (compressed.compressed_data.numel() * compressed.compressed_data.element_size()))/1024/1024) + # print("Testing decompress,", decompressed) + # print("Compressed data", compressed.compressed_data) + # print("Decompressed shape =", decompressed.shape) + # print("X shape = ", x.shape) + # abs_error = torch.abs(x - decompressed) + # max_error = torch.max(abs_error) + # if max_error > self.err_bound * 1.1: + # # Print the location of the max error and the values + # print("Max error location =", torch.argmax(torch.abs(x - decompressed))) + # print("Max error value =", max_error) + # location = torch.argmax(torch.abs(x - decompressed)) + # # Print row and column of max error + # print("Row =", int(location / x.shape[1])) + # print("Column =", location % x.shape[1]) + # # Count the number of elements that are > self.err_bound * 1.1 + # bound_err_cnt = torch.sum(abs_error > self.err_bound * 1.1) + # print("Number of elements > err_bound * 1.1 =", bound_err_cnt) + # print("X value =", x[int(location / x.shape[1])][location % x.shape[1]]) + # print( + # "Decompressed value =", + # decompressed[int(location / x.shape[1])][location % x.shape[1]], + # ) + # raise ValueError( + # "Error bound exceeded! 
Max error = ", max_error + # ) + # # Ensure max_error <= err_bound + + # print("Max error =", max_error) + # Ensure x is freed + # delete x + self.sc.increment_epoch_stat("Compressed Tensor Count",1) + self.sc.register_tensor_row_and_update() + + + del x + # empty cache + torch.cuda.empty_cache() + return compressed + else: + return x + + def unpack_hook(self, x): + if isinstance(x, CompressedElement): + # print("Unpacking", x.name) + # print("Unpacking") + t0 = self.sc.new_clock() + self.sc.sync_start_time(t0) + + decompressed = self.decompress(x) + + self.sc.sync_end_time(t0) + self.sc.increment_epoch_stat("Total Decompression Time (s)",self.sc.get_elapsed_time(t0)) + + # print("Unpacked") + # print("Unpacked to", decompressed) + return decompressed + else: + return x + + +# Create class for a compressed element that is used by the Compressor class + + +class CompressedElement(torch.nn.Module): + def __init__(self, x, compressed, err_bound, device): + super(CompressedElement, self).__init__() + self.device = device + # self.compressor = cuszp + self.compressed_data = compressed + self.uncompressed_elements = x.numel() + self.original_shape = x.shape + self.err_bound = err_bound + +import numpy as np +import ctypes +from ctypes import * +import random +from qtensor.tools.lazy_import import cupy as cp +import time +import torch + +from pathlib import Path + + + +def quant_device_compress(oriData, nbEle, blockSize,threshold): + #print(nbEle) + ori_nbEle = nbEle + variable = ctypes.c_size_t(0) + outSize = ctypes.pointer(variable) + + oriData = oriData.flatten() + ori_real = oriData.real + ori_imag = oriData.imag + oriData = cp.concatenate((ori_real, ori_imag)) + sample = oriData[::2] + max_val = cp.amax(oriData).get() + min_val = cp.amin(oriData).get() + d = max_val - min_val + if d.dtype == np.complex64: + d = d.real + threshold = threshold*(d) + s_1 = time.time() + truth_values = abs(oriData)<=threshold + oriData[truth_values] = 0.0 + truth_values = cp.invert(truth_values) + ori_len = oriData.shape[0] + nonzero_percent = cp.count_nonzero(oriData)/oriData.shape[0] + print("Percent nonzero: "+str(nonzero_percent)) + + isGrouped = False + if nonzero_percent<=0.5: + isGrouped=True + oriData = oriData[truth_values] + + nbEle = oriData.shape[0] + + # oriData = cp.reshape(oriData, (-1, blockSize)) # Reshape to blocksize + tensor = torch.as_tensor(oriData, device='cuda') + # print("Min val: "+str(cp.amin(oriData).get())+" range: "+str(d)) +# scale = d/255.0 +# zero_point = -1*round(min_val*scale) - 128 + + scale = d/((2**8) - 1) + #zero_point = -1*round(min_val*scale) + zero_point = -1*round(min_val*scale)+32 +# q_tensor = torch.quantize_per_tensor(tensor, scale, zero_point, dtype=torch.qint8) + + q_tensor = torch.quantize_per_tensor(tensor, scale, zero_point, dtype=torch.qint8) + del tensor + torch.cuda.empty_cache() + if isGrouped: + bitmap = cp.packbits(truth_values) + else: + bitmap = None + del truth_values + #q_ten2 = torch.dequantize(q_tensor) + #print(tensor) + #print(q_ten2) + #print("Max PW error") + #print(torch.max(torch.div(torch.abs(torch.sub(tensor[tensor!=0.0],q_ten2[tensor!=0.0])),tensor[tensor!=0.0]))) + return (q_tensor, bitmap, isGrouped), (nbEle/4)+(ori_len/8) + + +def quant_device_decompress(nbEle, cmpBytes, owner, dtype): + (q_tensor, bitmap, isGrouped) = cmpBytes + if isGrouped: + bitmap = cp.unpackbits(bitmap) + restored = torch.dequantize(q_tensor) + arr = cp.asarray(restored) + # uint8_t* cmpbytes, size_t len, size_t compressed_len, float r2r_error + + # decompressed_ptr 
= self.cuszx_decompress(isCuPy, cmp_bytes, num_elements_eff) + # -- Workaround to convert GPU pointer to int + # p_decompressed_ptr = ctypes.addressof(newData) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + # p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + # decompressed_int = p_decompressed_int.contents + # # -- + # pointer_for_free = decompressed_int.value + # # self.decompressed_own.append(decompressed_int.value) + # mem = cp.cuda.UnownedMemory(decompressed_int.value, nbEle*4, owner, device_id=0) + # mem_ptr = cp.cuda.memory.MemoryPointer(mem, 0) + #print("mem ptr") + #print(mem_ptr) + # arr = cp.ndarray(shape=(nbEle,), dtype=np.float32, memptr=mem_ptr) + #print(nbEle) + if isGrouped: + res = cp.zeros((nbEle,)) + # ## need to convert newData to cupy + cp.place(res,bitmap,arr) + + c_res = cp.zeros(int(nbEle/2), np.complex64) + #c_res.real = arr[0:int(nbEle/2)] + #c_res.imag = arr[int(nbEle/2):] + + c_res.real = res[0:int(nbEle/2)] + c_res.imag = res[int(nbEle/2):] + else: + c_res = cp.zeros(int(nbEle/2), np.complex64) + c_res.real = arr[0:int(nbEle/2)] + c_res.imag = arr[int(nbEle/2):] + return (c_res, None) + +### Example of device compress/decompress wrapper usage +class Comp(): + def __init__(self): + self.name = "dummy" + +def free_compressed(ptr): + p_ptr = ctypes.addressof(ptr) + p_int = ctypes.cast(p_ptr, ctypes.POINTER(ctypes.c_uint64)) + decomp_int = p_int.contents + cp.cuda.runtime.free(decomp_int.value) + + +if __name__ == "__main__": + + DATA_SIZE = int(1024) + MAX_D = 10.0 + MIN_D = -10.0 + RANGE = MAX_D - MIN_D + r2r_threshold = 0.002 + r2r_error = 0.0001 + + in_vector = np.fromfile("all_sample.bin", dtype=np.complex64) + #print(np.max(in_vector)) + DATA_SIZE = len(in_vector) + #range_vr = np.max(in_vector)-np.min(in_vector) + #r2r_threshold = r2r_threshold*range_vr + #r2r_error = r2r_error*range_vr + #in_vector = np.zeros((DATA_SIZE,)) + #for i in range(0,int(DATA_SIZE/4)): + # in_vector[i] = 0.0 + #for i in range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): + # in_vector[i] = 5.0 + #for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): + # in_vector[i] = random.uniform(MIN_D, MAX_D) + #for i in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): + # in_vector[i] = -7.0 + #for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): + # in_vector[i] = 0.001 + + print(DATA_SIZE) + #in_vector = in_vector.astype('float32') + in_vector_gpu = cp.asarray(in_vector) + + # variable = ctypes.c_size_t(0) + # outSize = ctypes.pointer(variable) + for i in range(200): + s_time = time.time() + o_bytes, outSize = quant_device_compress(in_vector_gpu, DATA_SIZE, 256, r2r_threshold) + print("Time python: "+str(time.time()-s_time)) + # print(outSize[0]) + print("Compress Success...starting decompress ") + comp = Comp() + + s_time = time.time() + (d_bytes,ptr )= quant_device_decompress(DATA_SIZE*2, o_bytes, comp, in_vector_gpu.dtype) + + # free_compressed(o_bytes[0]) + # cp.cuda.runtime.free(ptr) + print("Time python: "+str(time.time()-s_time)) + #for i in d_bytes: + # print(i) print("Decompress Success") \ No newline at end of file diff --git a/qtensor/compression/cuszp/setup.py b/qtensor/compression/cuszp/setup.py index 3bc77e8f..33ab8839 100644 --- a/qtensor/compression/cuszp/setup.py +++ b/qtensor/compression/cuszp/setup.py @@ -1,28 +1,28 @@ -from setuptools import setup, Extension -from torch.utils import cpp_extension -import os - -cuSZp_install = os.path.join(os.path.dirname(os.path.abspath(__file__)), 
'cuSZp') -cuSZp_include = os.path.join(cuSZp_install, 'include') -cuSZp_src = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'cuSZp', 'src') -# Retrieve list of source files -cuSZp_src_files = [] -for root, dirs, files in os.walk(cuSZp_src): - for file in files: - if file.endswith('.cu'): - cuSZp_src_files.append(os.path.join(root, file)) -cuSZp_src_files.append('cuSZp_interface.cpp') - -# define the extension module -cuSZp_extension = cpp_extension.CUDAExtension( - name='cuszp', - sources=cuSZp_src_files, - include_dirs=[cuSZp_include], -) - -# build the extension module -setup( - name='cuszp', - ext_modules=[cuSZp_extension], - cmdclass={'build_ext': cpp_extension.BuildExtension} -) +from setuptools import setup, Extension +from torch.utils import cpp_extension +import os + +cuSZp_install = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'cuSZp') +cuSZp_include = os.path.join(cuSZp_install, 'include') +cuSZp_src = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'cuSZp', 'src') +# Retrieve list of source files +cuSZp_src_files = [] +for root, dirs, files in os.walk(cuSZp_src): + for file in files: + if file.endswith('.cu'): + cuSZp_src_files.append(os.path.join(root, file)) +cuSZp_src_files.append('cuSZp_interface.cpp') + +# define the extension module +cuSZp_extension = cpp_extension.CUDAExtension( + name='cuszp', + sources=cuSZp_src_files, + include_dirs=[cuSZp_include], +) + +# build the extension module +setup( + name='cuszp', + ext_modules=[cuSZp_extension], + cmdclass={'build_ext': cpp_extension.BuildExtension} +) diff --git a/qtensor/compression/newsz/newsz.cu b/qtensor/compression/newsz/newsz.cu index 3ef211d5..00a394b6 100644 --- a/qtensor/compression/newsz/newsz.cu +++ b/qtensor/compression/newsz/newsz.cu @@ -1,248 +1,248 @@ -#include -#include "newsz.h" -#include -#include -#include -// #include "cuCompactor.cuh" - -#include "nvcomp/lz4.hpp" -#include "nvcomp.hpp" -#include "nvcomp/nvcompManagerFactory.hpp" - -#define BLKS 40 -#define THDS 128 -#define FULL_MASK 0xffffffff - -__device__ int g_ints; - -struct int_predicate -{ - - __host__ __device__ - bool operator()(const int x) - { - return x>0; - } -}; - -struct to_copy -{ - __host__ __device__ - bool operator()(const uint8_t x) - { - return x==1; - } -}; - - - - -__global__ void compress(float *data, float *scales, float *zeropts, int8_t *out){ - int bid = blockIdx.x; - int tid = threadIdx.x; - extern __shared__ float scratchpad[]; - __shared__ float min; - __shared__ float max; - - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage1; - - float item = data[blockIdx.x*blockDim.x+threadIdx.x]; - - float tmax = BlockReduce(temp_storage1).Reduce(item, cub::Max()); - float tmin = BlockReduce(temp_storage1).Reduce(item, cub::Min()); - - if (threadIdx.x==0) - { - max = tmax; - min = tmin; - } - - __syncthreads(); - - float vrange = max - min; - float scale = vrange/((2^8) - 1); - int zeropt = -1*lrintf(min*scale) - (2^7); - - int q_item = lrintf(item/scale) + zeropt; - - // Clamp quantized value - if(q_item>127)q_item = 127; - if(q_item <-128)q_item = -128; - int8_t q_val = (int8_t)(0xff & q_item); - out[blockIdx.x*blockDim.x+threadIdx.x] = q_val; - if (threadIdx.x==0) - { - scales[blockIdx.x] = scale; - zeropts[blockIdx.x]= zeropt; - } - -} - -__global__ void decompress(int8_t *q_data, float *scales, float *zeropts, float *out){ - int bid = blockIdx.x; - int tid = threadIdx.x; - extern __shared__ float scratchpad[]; - __shared__ float min; - __shared__ 
float max; - - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage1; - - int8_t q_val = q_data[blockIdx.x*blockDim.x+threadIdx.x]; - - out[blockIdx.x*blockDim.x+threadIdx.x] = (q_val - zeropts[bid])*scales[bid]; -} - -__global__ void p_ints(){ - printf("codebook entries used: %d\n", g_ints); -} - -unsigned char* SZ_device_compress(float *data, size_t num_elements, int blocksize, size_t *outsize){ - float *scales, *zeropts; - int8_t *q_out; - unsigned char *cmpbytes; - int num_blocks = num_elements/blocksize; - - cudaMalloc(&scales, sizeof(float)*num_blocks); - cudaMalloc(&zeropts,sizeof(float)*num_blocks); - cudaMalloc(&q_out, num_elements); - - using namespace nvcomp; - - cudaStream_t stream; - cudaStreamCreate(&stream); - - const int chunk_size = 1 << 16; - nvcompType_t data_type = NVCOMP_TYPE_CHAR; - - - - compress<<>>(data, scales, zeropts, q_out); - cudaDeviceSynchronize(); - - LZ4Manager nvcomp_manager{chunk_size, data_type, stream}; - CompressionConfig comp_config = nvcomp_manager.configure_compression(num_elements); - - uint8_t* comp_buffer; - cudaMalloc(&comp_buffer, comp_config.max_compressed_buffer_size); - - nvcomp_manager.compress((const uint8_t *)q_out, comp_buffer, comp_config); - - size_t c_size = nvcomp_manager.get_compressed_output_size(comp_buffer); - cudaFree(q_out); - - *outsize = sizeof(float)*(num_blocks+num_blocks)+c_size; - cudaMalloc(&cmpbytes, *outsize); - - cudaMemcpy(cmpbytes, (unsigned char *)scales, sizeof(float)*num_blocks, cudaMemcpyDeviceToDevice); - cudaMemcpy(cmpbytes+sizeof(float)*num_blocks, (unsigned char *)zeropts, sizeof(float)*num_blocks, cudaMemcpyDeviceToDevice); - cudaMemcpy(cmpbytes+sizeof(float)*num_blocks+sizeof(float)*num_blocks, comp_buffer, c_size, cudaMemcpyDeviceToDevice); - - float h_firstscale; - cudaMemcpy(&h_firstscale, cmpbytes, sizeof(float), cudaMemcpyDeviceToHost); - cudaFree(scales); - cudaFree(zeropts); - cudaFree(comp_buffer); - return cmpbytes; -} - -float* SZ_device_decompress(unsigned char *cmpbytes, size_t num_elements, int blocksize, size_t *cmpsize){ - float *scales, *zeropts; - uint8_t *q_cmp; - int8_t *q_vals; - float *out; - int num_blocks = num_elements/blocksize; - size_t c_size = *cmpsize-(2*sizeof(float)*num_blocks); - - float first_val, *d_first; - - cudaMalloc(&d_first, sizeof(float)); - cudaMemcpy((unsigned char *)&first_val, cmpbytes, sizeof(float), cudaMemcpyDeviceToHost); - - - - cudaMalloc((void **)&scales, sizeof(float)*num_blocks); - cudaMalloc((void **)&zeropts,sizeof(float)*num_blocks); - cudaMalloc((void **)&q_cmp, c_size); - cudaMemcpy((unsigned char *)scales, cmpbytes, sizeof(float)*num_blocks, cudaMemcpyDeviceToDevice); - - cudaMemcpy((unsigned char *)zeropts, cmpbytes+sizeof(float)*num_blocks, sizeof(float)*num_blocks, cudaMemcpyDeviceToDevice); - - cudaMemcpy(q_cmp, cmpbytes+sizeof(float)*num_blocks+sizeof(float)*num_blocks, c_size, cudaMemcpyDeviceToDevice); - cudaStream_t stream; - cudaStreamCreate(&stream); - - const int chunk_size = 1 << 16; - - - nvcompType_t data_type = NVCOMP_TYPE_CHAR; - - auto decomp_manager = nvcomp::create_manager(q_cmp, stream); - - nvcomp::DecompressionConfig decomp_config = decomp_manager->configure_decompression((uint8_t *)q_cmp); - cudaMalloc(&q_vals, num_elements); - - decomp_manager->decompress((uint8_t*)q_vals, (uint8_t*)q_cmp, decomp_config); - cudaFree(q_cmp); - - cudaMalloc(&out, sizeof(float)*num_elements); - - decompress<<>>(q_vals, scales, zeropts, out); - cudaDeviceSynchronize(); - - cudaFree(scales); - 
cudaFree(zeropts); - cudaFree(q_vals); - - return out; -} - -int main(int argc, char** argv){ - char oriFilePath[640], outputFilePath[645]; - float* data; - size_t nbEle; - if(argc < 3) - { - printf("Usage: testfloat_compress_fastmode2 [srcFilePath] [block size] [err bound] [--cuda]\n"); - printf("Example: testfloat_compress_fastmode2 testfloat_8_8_128.dat 64 1E-3 --cuda\n"); - exit(0); - } - - sprintf(oriFilePath, "%s", argv[1]); - int blockSize = atoi(argv[2]); - float errBound = atof(argv[3]); - nbEle = atoi(argv[4]); - - data = (float*)malloc(sizeof(float)*nbEle); - sprintf(outputFilePath, "%s.sznew", oriFilePath); - - FILE *in_file; - in_file = fopen(oriFilePath, "rb"); - - fread(data, sizeof(float), nbEle, in_file); - fclose(in_file); - - float max = data[0]; - float min = data[0]; - for(int i=0;i=max){ - max = data[i]; - } - if(data[i]<=min){ - min = data[i]; - } - } - errBound = errBound*(max-min); - - // Move to device - float *d_data; - unsigned char *cmpbytes; - size_t outsize; - cudaMalloc(&d_data, sizeof(float)*nbEle); - cudaMemcpy(d_data, data, sizeof(float)*nbEle, cudaMemcpyHostToDevice); - //SZ_device_compress(d_data, nbEle, errBound, blockSize, cmpbytes, &outsize); - - cudaFree(d_data); - -} +#include +#include "newsz.h" +#include +#include +#include +// #include "cuCompactor.cuh" + +#include "nvcomp/lz4.hpp" +#include "nvcomp.hpp" +#include "nvcomp/nvcompManagerFactory.hpp" + +#define BLKS 40 +#define THDS 128 +#define FULL_MASK 0xffffffff + +__device__ int g_ints; + +struct int_predicate +{ + + __host__ __device__ + bool operator()(const int x) + { + return x>0; + } +}; + +struct to_copy +{ + __host__ __device__ + bool operator()(const uint8_t x) + { + return x==1; + } +}; + + + + +__global__ void compress(float *data, float *scales, float *zeropts, int8_t *out){ + int bid = blockIdx.x; + int tid = threadIdx.x; + extern __shared__ float scratchpad[]; + __shared__ float min; + __shared__ float max; + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage1; + + float item = data[blockIdx.x*blockDim.x+threadIdx.x]; + + float tmax = BlockReduce(temp_storage1).Reduce(item, cub::Max()); + float tmin = BlockReduce(temp_storage1).Reduce(item, cub::Min()); + + if (threadIdx.x==0) + { + max = tmax; + min = tmin; + } + + __syncthreads(); + + float vrange = max - min; + float scale = vrange/((2^8) - 1); + int zeropt = -1*lrintf(min*scale) - (2^7); + + int q_item = lrintf(item/scale) + zeropt; + + // Clamp quantized value + if(q_item>127)q_item = 127; + if(q_item <-128)q_item = -128; + int8_t q_val = (int8_t)(0xff & q_item); + out[blockIdx.x*blockDim.x+threadIdx.x] = q_val; + if (threadIdx.x==0) + { + scales[blockIdx.x] = scale; + zeropts[blockIdx.x]= zeropt; + } + +} + +__global__ void decompress(int8_t *q_data, float *scales, float *zeropts, float *out){ + int bid = blockIdx.x; + int tid = threadIdx.x; + extern __shared__ float scratchpad[]; + __shared__ float min; + __shared__ float max; + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage1; + + int8_t q_val = q_data[blockIdx.x*blockDim.x+threadIdx.x]; + + out[blockIdx.x*blockDim.x+threadIdx.x] = (q_val - zeropts[bid])*scales[bid]; +} + +__global__ void p_ints(){ + printf("codebook entries used: %d\n", g_ints); +} + +unsigned char* SZ_device_compress(float *data, size_t num_elements, int blocksize, size_t *outsize){ + float *scales, *zeropts; + int8_t *q_out; + unsigned char *cmpbytes; + int num_blocks = num_elements/blocksize; + + 
cudaMalloc(&scales, sizeof(float)*num_blocks); + cudaMalloc(&zeropts,sizeof(float)*num_blocks); + cudaMalloc(&q_out, num_elements); + + using namespace nvcomp; + + cudaStream_t stream; + cudaStreamCreate(&stream); + + const int chunk_size = 1 << 16; + nvcompType_t data_type = NVCOMP_TYPE_CHAR; + + + + compress<<>>(data, scales, zeropts, q_out); + cudaDeviceSynchronize(); + + LZ4Manager nvcomp_manager{chunk_size, data_type, stream}; + CompressionConfig comp_config = nvcomp_manager.configure_compression(num_elements); + + uint8_t* comp_buffer; + cudaMalloc(&comp_buffer, comp_config.max_compressed_buffer_size); + + nvcomp_manager.compress((const uint8_t *)q_out, comp_buffer, comp_config); + + size_t c_size = nvcomp_manager.get_compressed_output_size(comp_buffer); + cudaFree(q_out); + + *outsize = sizeof(float)*(num_blocks+num_blocks)+c_size; + cudaMalloc(&cmpbytes, *outsize); + + cudaMemcpy(cmpbytes, (unsigned char *)scales, sizeof(float)*num_blocks, cudaMemcpyDeviceToDevice); + cudaMemcpy(cmpbytes+sizeof(float)*num_blocks, (unsigned char *)zeropts, sizeof(float)*num_blocks, cudaMemcpyDeviceToDevice); + cudaMemcpy(cmpbytes+sizeof(float)*num_blocks+sizeof(float)*num_blocks, comp_buffer, c_size, cudaMemcpyDeviceToDevice); + + float h_firstscale; + cudaMemcpy(&h_firstscale, cmpbytes, sizeof(float), cudaMemcpyDeviceToHost); + cudaFree(scales); + cudaFree(zeropts); + cudaFree(comp_buffer); + return cmpbytes; +} + +float* SZ_device_decompress(unsigned char *cmpbytes, size_t num_elements, int blocksize, size_t *cmpsize){ + float *scales, *zeropts; + uint8_t *q_cmp; + int8_t *q_vals; + float *out; + int num_blocks = num_elements/blocksize; + size_t c_size = *cmpsize-(2*sizeof(float)*num_blocks); + + float first_val, *d_first; + + cudaMalloc(&d_first, sizeof(float)); + cudaMemcpy((unsigned char *)&first_val, cmpbytes, sizeof(float), cudaMemcpyDeviceToHost); + + + + cudaMalloc((void **)&scales, sizeof(float)*num_blocks); + cudaMalloc((void **)&zeropts,sizeof(float)*num_blocks); + cudaMalloc((void **)&q_cmp, c_size); + cudaMemcpy((unsigned char *)scales, cmpbytes, sizeof(float)*num_blocks, cudaMemcpyDeviceToDevice); + + cudaMemcpy((unsigned char *)zeropts, cmpbytes+sizeof(float)*num_blocks, sizeof(float)*num_blocks, cudaMemcpyDeviceToDevice); + + cudaMemcpy(q_cmp, cmpbytes+sizeof(float)*num_blocks+sizeof(float)*num_blocks, c_size, cudaMemcpyDeviceToDevice); + cudaStream_t stream; + cudaStreamCreate(&stream); + + const int chunk_size = 1 << 16; + + + nvcompType_t data_type = NVCOMP_TYPE_CHAR; + + auto decomp_manager = nvcomp::create_manager(q_cmp, stream); + + nvcomp::DecompressionConfig decomp_config = decomp_manager->configure_decompression((uint8_t *)q_cmp); + cudaMalloc(&q_vals, num_elements); + + decomp_manager->decompress((uint8_t*)q_vals, (uint8_t*)q_cmp, decomp_config); + cudaFree(q_cmp); + + cudaMalloc(&out, sizeof(float)*num_elements); + + decompress<<>>(q_vals, scales, zeropts, out); + cudaDeviceSynchronize(); + + cudaFree(scales); + cudaFree(zeropts); + cudaFree(q_vals); + + return out; +} + +int main(int argc, char** argv){ + char oriFilePath[640], outputFilePath[645]; + float* data; + size_t nbEle; + if(argc < 3) + { + printf("Usage: testfloat_compress_fastmode2 [srcFilePath] [block size] [err bound] [--cuda]\n"); + printf("Example: testfloat_compress_fastmode2 testfloat_8_8_128.dat 64 1E-3 --cuda\n"); + exit(0); + } + + sprintf(oriFilePath, "%s", argv[1]); + int blockSize = atoi(argv[2]); + float errBound = atof(argv[3]); + nbEle = atoi(argv[4]); + + data = 
(float*)malloc(sizeof(float)*nbEle); + sprintf(outputFilePath, "%s.sznew", oriFilePath); + + FILE *in_file; + in_file = fopen(oriFilePath, "rb"); + + fread(data, sizeof(float), nbEle, in_file); + fclose(in_file); + + float max = data[0]; + float min = data[0]; + for(int i=0;i=max){ + max = data[i]; + } + if(data[i]<=min){ + min = data[i]; + } + } + errBound = errBound*(max-min); + + // Move to device + float *d_data; + unsigned char *cmpbytes; + size_t outsize; + cudaMalloc(&d_data, sizeof(float)*nbEle); + cudaMemcpy(d_data, data, sizeof(float)*nbEle, cudaMemcpyHostToDevice); + //SZ_device_compress(d_data, nbEle, errBound, blockSize, cmpbytes, &outsize); + + cudaFree(d_data); + +} diff --git a/qtensor/compression/newsz/newsz.h b/qtensor/compression/newsz/newsz.h index c537b889..1022e20a 100644 --- a/qtensor/compression/newsz/newsz.h +++ b/qtensor/compression/newsz/newsz.h @@ -1,3 +1,3 @@ - -unsigned char* SZ_device_compress(float *data, size_t num_elements, int blocksize, size_t *outsize); -float* SZ_device_decompress(unsigned char *cmpbytes, size_t num_elements, int blocksize, size_t *cmpsize); + +unsigned char* SZ_device_compress(float *data, size_t num_elements, int blocksize, size_t *outsize); +float* SZ_device_decompress(unsigned char *cmpbytes, size_t num_elements, int blocksize, size_t *cmpsize); diff --git a/qtensor/compression/newsz/newsz_wrapper.cu b/qtensor/compression/newsz/newsz_wrapper.cu index d067560d..a692af9b 100644 --- a/qtensor/compression/newsz/newsz_wrapper.cu +++ b/qtensor/compression/newsz/newsz_wrapper.cu @@ -1,21 +1,21 @@ -#include "newsz.h" -#include - -extern "C"{ - - unsigned char* newSZ_device_compress(float *oriData, size_t *outSize, size_t nbEle, int blockSize){ - //unsigned char* cmpbytes; - return SZ_device_compress(oriData, nbEle, blockSize, outSize); - //printf("in wrap cmpbytes: %p\n", cmpbytes); - //return cmpbytes; - } - - float* newSZ_device_decompress(size_t nbEle, unsigned char* cmpBytes, int blocksize, size_t cmpsize){ - size_t *cmpsize_ptr; - *cmpsize_ptr = cmpsize; - - float *res = SZ_device_decompress(cmpBytes, nbEle, blocksize, cmpsize_ptr); - return res; - } - -} +#include "newsz.h" +#include + +extern "C"{ + + unsigned char* newSZ_device_compress(float *oriData, size_t *outSize, size_t nbEle, int blockSize){ + //unsigned char* cmpbytes; + return SZ_device_compress(oriData, nbEle, blockSize, outSize); + //printf("in wrap cmpbytes: %p\n", cmpbytes); + //return cmpbytes; + } + + float* newSZ_device_decompress(size_t nbEle, unsigned char* cmpBytes, int blocksize, size_t cmpsize){ + size_t *cmpsize_ptr; + *cmpsize_ptr = cmpsize; + + float *res = SZ_device_decompress(cmpBytes, nbEle, blocksize, cmpsize_ptr); + return res; + } + +} diff --git a/qtensor/compression/newsz/newsz_wrapper.py b/qtensor/compression/newsz/newsz_wrapper.py index d40304fb..4cbc2692 100644 --- a/qtensor/compression/newsz/newsz_wrapper.py +++ b/qtensor/compression/newsz/newsz_wrapper.py @@ -1,161 +1,161 @@ -import numpy as np -import ctypes -from ctypes import * -import random -from qtensor.tools.lazy_import import cupy as cp -import time -import torch - -from pathlib import Path -LIB_PATH = str(Path(__file__).parent/'libnewsz_wrapper.so') - -NVCOMP_PATH = str(Path(__file__).parent/'libnvcomp.so') -#NVCOMP_PATH= './libnvcomp.so' -#LIB_PATH = './libnewsz_wrapper.so' - -# unsigned char* newSZ_device_compress(float *oriData, size_t *outSize, size_t nbEle, int blockSize) -def get_device_compress(): - dll_base = ctypes.CDLL(NVCOMP_PATH,mode=ctypes.RTLD_GLOBAL) - dll = 
ctypes.CDLL(LIB_PATH, mode=ctypes.RTLD_GLOBAL) - func = dll.newSZ_device_compress - func.argtypes = [POINTER(c_float), POINTER(c_size_t), c_size_t, c_int] - func.restype = POINTER(c_ubyte) - return func - -# float* newSZ_device_decompress(size_t nbEle, unsigned char* cmpBytes, int blocksize, size_t cmpsize) -def get_device_decompress(): - - dll_base = ctypes.CDLL(NVCOMP_PATH,mode=ctypes.RTLD_GLOBAL) - dll = ctypes.CDLL(LIB_PATH, mode=ctypes.RTLD_GLOBAL) - func = dll.newSZ_device_decompress - func.argtypes = [c_size_t, POINTER(c_ubyte), c_int, c_size_t] - func.restype = POINTER(c_float) - return func - - -def newsz_device_compress(oriData, nbEle, blockSize,threshold): - __cuszx_device_compress = get_device_compress() - ori_nbEle = nbEle - variable = ctypes.c_size_t(0) - outSize = ctypes.pointer(variable) - - oriData = oriData.flatten() - ori_real = oriData.real - ori_imag = oriData.imag - oriData = cp.concatenate((ori_real, ori_imag)) - sample = oriData[::2] - d = cp.amax(oriData) - cp.amin(oriData) - d = d.get() - if d.dtype == np.complex64: - d = d.real - threshold = threshold*(d) - truth_values = abs(oriData)<=threshold - oriData[truth_values] = 0.0 - nbEle = oriData.shape[0] - - - oriData_p = ctypes.cast(oriData.data.ptr, ctypes.POINTER(c_float)) - # newSZ_device_compress(float *oriData, size_t *outSize, size_t nbEle, int blockSize) - o_bytes = __cuszx_device_compress(oriData_p, outSize, np.ulonglong(nbEle), np.int32(blockSize)) - #print("testing") - #print(o_bytes.value) - return (o_bytes,outSize.contents.value, blockSize), outSize - - -def newsz_device_decompress(nbEle, cmpBytes, owner, dtype): - __cuszx_device_decompress=get_device_decompress() - (cmpBytes, cmpsize, blockSize) = cmpBytes - - nbEle_p = ctypes.c_size_t(nbEle) - # float* newSZ_device_decompress(size_t nbEle, unsigned char* cmpBytes, int blocksize, size_t cmpsize) - newData = __cuszx_device_decompress(nbEle_p, cmpBytes, np.int32(blockSize), ctypes.c_size_t(cmpsize)) - # decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, num_elements_eff) - # -- Workaround to convert GPU pointer to int - p_decompressed_ptr = ctypes.addressof(newData) - # cast to int64 pointer - # (effectively converting pointer to pointer to addr to pointer to int64) - p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) - decompressed_int = p_decompressed_int.contents - # -- - pointer_for_free = decompressed_int.value - # self.decompressed_own.append(decompressed_int.value) - mem = cp.cuda.UnownedMemory(decompressed_int.value, nbEle*4, owner, device_id=0) - mem_ptr = cp.cuda.memory.MemoryPointer(mem, 0) - #print("mem ptr") - #print(mem_ptr) - arr = cp.ndarray(shape=(nbEle,), dtype=np.float32, memptr=mem_ptr) - # res = cp.zeros((nbEle,)) - # ## need to convert newData to cupy - # cp.place(res,bitmap,arr) - - c_res = cp.zeros(int(nbEle/2), np.complex64) - c_res.real = arr[0:int(nbEle/2)] - c_res.imag = arr[int(nbEle/2):] - return (c_res, pointer_for_free) - -### Example of device compress/decompress wrapper usage -class Comp(): - def __init__(self): - self.name = "dummy" - -def free_compressed(ptr): - p_ptr = ctypes.addressof(ptr) - p_int = ctypes.cast(p_ptr, ctypes.POINTER(ctypes.c_uint64)) - decomp_int = p_int.contents - cp.cuda.runtime.free(decomp_int.value) - - -if __name__ == "__main__": - - DATA_SIZE = int(1024) - MAX_D = 10.0 - MIN_D = -10.0 - RANGE = MAX_D - MIN_D - r2r_threshold = 0.002 - r2r_error = 0.0001 - - in_vector = np.fromfile("all_sample.bin", dtype=np.complex64) - #print(np.max(in_vector)) - 
DATA_SIZE = len(in_vector) - #range_vr = np.max(in_vector)-np.min(in_vector) - #r2r_threshold = r2r_threshold*range_vr - #r2r_error = r2r_error*range_vr - #in_vector = np.zeros((DATA_SIZE,)) - #for i in range(0,int(DATA_SIZE/4)): - # in_vector[i] = 0.0 - #for i in range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): - # in_vector[i] = 5.0 - #for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): - # in_vector[i] = random.uniform(MIN_D, MAX_D) - #for i in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): - # in_vector[i] = -7.0 - #for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): - # in_vector[i] = 0.001 - - print(DATA_SIZE) - #in_vector = in_vector.astype('float32') - in_vector_gpu = cp.asarray(in_vector) - - # variable = ctypes.c_size_t(0) - # outSize = ctypes.pointer(variable) - #print(in_vector[0:16]) - for i in range(200): - s_time = time.time() - #o_bytes, outSize = cusz_device_compress(in_vector_gpu, r2r_error, DATA_SIZE, 256, r2r_threshold) - - o_bytes, outSize = newsz_device_compress(in_vector_gpu, DATA_SIZE, 256,r2r_threshold) - print("Time python: "+str(time.time()-s_time)) - print(outSize[0]) - print("Compress Success...starting decompress ") - comp = Comp() - - s_time = time.time() - #(d_bytes,ptr )= cusz_device_decompress(DATA_SIZE*2, o_bytes, comp, in_vector_gpu.dtype) - - (d_bytes, ptr) = newsz_device_decompress(DATA_SIZE*2, o_bytes, comp, in_vector_gpu.dtype) - free_compressed(o_bytes[0]) - cp.cuda.runtime.free(ptr) - print("Time python: "+str(time.time()-s_time)) - #for i in d_bytes: - # print(i) - print("Decompress Success") +import numpy as np +import ctypes +from ctypes import * +import random +from qtensor.tools.lazy_import import cupy as cp +import time +import torch + +from pathlib import Path +LIB_PATH = str(Path(__file__).parent/'libnewsz_wrapper.so') + +NVCOMP_PATH = str(Path(__file__).parent/'libnvcomp.so') +#NVCOMP_PATH= './libnvcomp.so' +#LIB_PATH = './libnewsz_wrapper.so' + +# unsigned char* newSZ_device_compress(float *oriData, size_t *outSize, size_t nbEle, int blockSize) +def get_device_compress(): + dll_base = ctypes.CDLL(NVCOMP_PATH,mode=ctypes.RTLD_GLOBAL) + dll = ctypes.CDLL(LIB_PATH, mode=ctypes.RTLD_GLOBAL) + func = dll.newSZ_device_compress + func.argtypes = [POINTER(c_float), POINTER(c_size_t), c_size_t, c_int] + func.restype = POINTER(c_ubyte) + return func + +# float* newSZ_device_decompress(size_t nbEle, unsigned char* cmpBytes, int blocksize, size_t cmpsize) +def get_device_decompress(): + + dll_base = ctypes.CDLL(NVCOMP_PATH,mode=ctypes.RTLD_GLOBAL) + dll = ctypes.CDLL(LIB_PATH, mode=ctypes.RTLD_GLOBAL) + func = dll.newSZ_device_decompress + func.argtypes = [c_size_t, POINTER(c_ubyte), c_int, c_size_t] + func.restype = POINTER(c_float) + return func + + +def newsz_device_compress(oriData, nbEle, blockSize,threshold): + __cuszx_device_compress = get_device_compress() + ori_nbEle = nbEle + variable = ctypes.c_size_t(0) + outSize = ctypes.pointer(variable) + + oriData = oriData.flatten() + ori_real = oriData.real + ori_imag = oriData.imag + oriData = cp.concatenate((ori_real, ori_imag)) + sample = oriData[::2] + d = cp.amax(oriData) - cp.amin(oriData) + d = d.get() + if d.dtype == np.complex64: + d = d.real + threshold = threshold*(d) + truth_values = abs(oriData)<=threshold + oriData[truth_values] = 0.0 + nbEle = oriData.shape[0] + + + oriData_p = ctypes.cast(oriData.data.ptr, ctypes.POINTER(c_float)) + # newSZ_device_compress(float *oriData, size_t *outSize, size_t nbEle, int blockSize) + o_bytes = __cuszx_device_compress(oriData_p, outSize, 
np.ulonglong(nbEle), np.int32(blockSize)) + #print("testing") + #print(o_bytes.value) + return (o_bytes,outSize.contents.value, blockSize), outSize + + +def newsz_device_decompress(nbEle, cmpBytes, owner, dtype): + __cuszx_device_decompress=get_device_decompress() + (cmpBytes, cmpsize, blockSize) = cmpBytes + + nbEle_p = ctypes.c_size_t(nbEle) + # float* newSZ_device_decompress(size_t nbEle, unsigned char* cmpBytes, int blocksize, size_t cmpsize) + newData = __cuszx_device_decompress(nbEle_p, cmpBytes, np.int32(blockSize), ctypes.c_size_t(cmpsize)) + # decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, num_elements_eff) + # -- Workaround to convert GPU pointer to int + p_decompressed_ptr = ctypes.addressof(newData) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + decompressed_int = p_decompressed_int.contents + # -- + pointer_for_free = decompressed_int.value + # self.decompressed_own.append(decompressed_int.value) + mem = cp.cuda.UnownedMemory(decompressed_int.value, nbEle*4, owner, device_id=0) + mem_ptr = cp.cuda.memory.MemoryPointer(mem, 0) + #print("mem ptr") + #print(mem_ptr) + arr = cp.ndarray(shape=(nbEle,), dtype=np.float32, memptr=mem_ptr) + # res = cp.zeros((nbEle,)) + # ## need to convert newData to cupy + # cp.place(res,bitmap,arr) + + c_res = cp.zeros(int(nbEle/2), np.complex64) + c_res.real = arr[0:int(nbEle/2)] + c_res.imag = arr[int(nbEle/2):] + return (c_res, pointer_for_free) + +### Example of device compress/decompress wrapper usage +class Comp(): + def __init__(self): + self.name = "dummy" + +def free_compressed(ptr): + p_ptr = ctypes.addressof(ptr) + p_int = ctypes.cast(p_ptr, ctypes.POINTER(ctypes.c_uint64)) + decomp_int = p_int.contents + cp.cuda.runtime.free(decomp_int.value) + + +if __name__ == "__main__": + + DATA_SIZE = int(1024) + MAX_D = 10.0 + MIN_D = -10.0 + RANGE = MAX_D - MIN_D + r2r_threshold = 0.002 + r2r_error = 0.0001 + + in_vector = np.fromfile("all_sample.bin", dtype=np.complex64) + #print(np.max(in_vector)) + DATA_SIZE = len(in_vector) + #range_vr = np.max(in_vector)-np.min(in_vector) + #r2r_threshold = r2r_threshold*range_vr + #r2r_error = r2r_error*range_vr + #in_vector = np.zeros((DATA_SIZE,)) + #for i in range(0,int(DATA_SIZE/4)): + # in_vector[i] = 0.0 + #for i in range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): + # in_vector[i] = 5.0 + #for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): + # in_vector[i] = random.uniform(MIN_D, MAX_D) + #for i in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): + # in_vector[i] = -7.0 + #for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): + # in_vector[i] = 0.001 + + print(DATA_SIZE) + #in_vector = in_vector.astype('float32') + in_vector_gpu = cp.asarray(in_vector) + + # variable = ctypes.c_size_t(0) + # outSize = ctypes.pointer(variable) + #print(in_vector[0:16]) + for i in range(200): + s_time = time.time() + #o_bytes, outSize = cusz_device_compress(in_vector_gpu, r2r_error, DATA_SIZE, 256, r2r_threshold) + + o_bytes, outSize = newsz_device_compress(in_vector_gpu, DATA_SIZE, 256,r2r_threshold) + print("Time python: "+str(time.time()-s_time)) + print(outSize[0]) + print("Compress Success...starting decompress ") + comp = Comp() + + s_time = time.time() + #(d_bytes,ptr )= cusz_device_decompress(DATA_SIZE*2, o_bytes, comp, in_vector_gpu.dtype) + + (d_bytes, ptr) = newsz_device_decompress(DATA_SIZE*2, o_bytes, comp, in_vector_gpu.dtype) + free_compressed(o_bytes[0]) 
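        # Note on memory ownership: free_compressed() releases the device buffer that the
        # C library allocated for the compressed stream (o_bytes[0]), and the
        # cp.cuda.runtime.free(ptr) call below releases the raw decompressed buffer.
        # newsz_device_decompress builds the returned complex array by copying the
        # real/imag halves out of a cp.cuda.UnownedMemory view, so d_bytes stays valid
        # after both pointers are freed.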
+ cp.cuda.runtime.free(ptr) + print("Time python: "+str(time.time()-s_time)) + #for i in d_bytes: + # print(i) + print("Decompress Success") diff --git a/qtensor/compression/szp/include/cuSZp.h b/qtensor/compression/szp/include/cuSZp.h index 0a168f34..d94e2943 100644 --- a/qtensor/compression/szp/include/cuSZp.h +++ b/qtensor/compression/szp/include/cuSZp.h @@ -1,12 +1,12 @@ -#ifndef CUSZP_INCLUDE_CUSZP_H -#define CUSZP_INCLUDE_CUSZP_H - -static const int cmp_tblock_size = 32; // 32 should be the best, not need to modify. -static const int dec_tblock_size = 32; // 32 should be the best, not need to modify. -static const int cmp_chunk = 8192; -static const int dec_chunk = 8192; - -__global__ void SZp_compress_kernel(const float* const __restrict__ oriData, unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const float eb, const size_t nbEle); -__global__ void SZp_decompress_kernel(float* const __restrict__ decData, const unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const float eb, const size_t nbEle); - +#ifndef CUSZP_INCLUDE_CUSZP_H +#define CUSZP_INCLUDE_CUSZP_H + +static const int cmp_tblock_size = 32; // 32 should be the best, not need to modify. +static const int dec_tblock_size = 32; // 32 should be the best, not need to modify. +static const int cmp_chunk = 8192; +static const int dec_chunk = 8192; + +__global__ void SZp_compress_kernel(const float* const __restrict__ oriData, unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const float eb, const size_t nbEle); +__global__ void SZp_decompress_kernel(float* const __restrict__ decData, const unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const float eb, const size_t nbEle); + #endif // CUSZP_INCLUDE_CUSZP_H \ No newline at end of file diff --git a/qtensor/compression/szp/include/cuSZp_entry.h b/qtensor/compression/szp/include/cuSZp_entry.h index 5acd97a5..fcdcb420 100644 --- a/qtensor/compression/szp/include/cuSZp_entry.h +++ b/qtensor/compression/szp/include/cuSZp_entry.h @@ -1,12 +1,12 @@ -#ifndef CUSZP_INCLUDE_CUSZP_ENTRY_H -#define CUSZP_INCLUDE_CUSZP_ENTRY_H - -#include - -void SZp_compress_hostptr(float* oriData, unsigned char* cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound); -void SZp_decompress_hostptr(float* decData, unsigned char* cmpBytes, size_t nbEle, size_t cmpSize, float errorBound); -extern "C" void SZp_compress_deviceptr(float* d_oriData, unsigned char* d_cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound, cudaStream_t stream = 0); -void SZp_dev_new(float* d_oriData, unsigned char* d_cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound, cudaStream_t stream = 0); -extern "C" void SZp_decompress_deviceptr(float* d_decData, unsigned char* d_cmpBytes, size_t nbEle, size_t cmpSize, float errorBound, cudaStream_t stream = 0); - +#ifndef CUSZP_INCLUDE_CUSZP_ENTRY_H +#define CUSZP_INCLUDE_CUSZP_ENTRY_H + +#include + +void SZp_compress_hostptr(float* oriData, unsigned char* cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound); +void SZp_decompress_hostptr(float* decData, unsigned char* cmpBytes, size_t nbEle, size_t cmpSize, float errorBound); +extern "C" void SZp_compress_deviceptr(float* d_oriData, unsigned char* d_cmpBytes, size_t nbEle, size_t* cmpSize, 
float errorBound, cudaStream_t stream = 0); +void SZp_dev_new(float* d_oriData, unsigned char* d_cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound, cudaStream_t stream = 0); +extern "C" void SZp_decompress_deviceptr(float* d_decData, unsigned char* d_cmpBytes, size_t nbEle, size_t cmpSize, float errorBound, cudaStream_t stream = 0); + #endif // CUSZP_INCLUDE_CUSZP_ENTRY_H \ No newline at end of file diff --git a/qtensor/compression/szp/include/cuSZp_timer.h b/qtensor/compression/szp/include/cuSZp_timer.h index faca61c3..2777a919 100644 --- a/qtensor/compression/szp/include/cuSZp_timer.h +++ b/qtensor/compression/szp/include/cuSZp_timer.h @@ -1,31 +1,31 @@ -#ifndef CUSZP_INCLUDE_CUSZP_TIMER_H -#define CUSZP_INCLUDE_CUSZP_TIMER_H - -#include -#include - -struct PrivateTimingGPU { - cudaEvent_t start; - cudaEvent_t stop; -}; - -class TimingGPU -{ - private: - PrivateTimingGPU *privateTimingGPU; - - public: - - TimingGPU(); - - ~TimingGPU(); - - void StartCounter(); - - void StartCounterFlags(); - - float GetCounter(); - -}; - +#ifndef CUSZP_INCLUDE_CUSZP_TIMER_H +#define CUSZP_INCLUDE_CUSZP_TIMER_H + +#include +#include + +struct PrivateTimingGPU { + cudaEvent_t start; + cudaEvent_t stop; +}; + +class TimingGPU +{ + private: + PrivateTimingGPU *privateTimingGPU; + + public: + + TimingGPU(); + + ~TimingGPU(); + + void StartCounter(); + + void StartCounterFlags(); + + float GetCounter(); + +}; + #endif // CUSZP_INCLUDE_CUSZP_TIMER_H \ No newline at end of file diff --git a/qtensor/compression/szp/include/cuSZp_utility.h b/qtensor/compression/szp/include/cuSZp_utility.h index e698633f..32af7040 100644 --- a/qtensor/compression/szp/include/cuSZp_utility.h +++ b/qtensor/compression/szp/include/cuSZp_utility.h @@ -1,14 +1,14 @@ -#ifndef CUSZP_INCLUDE_CUSZP_UTILITY_H -#define CUSZP_INCLUDE_CUSZP_UTILITY_H - -void symTransForm_4Bytes(unsigned char data[4]); -unsigned char *readByteData_Yafan(char *srcFilePath, size_t *byteLength, int *status); -float *readFloatData_systemEndian_Yafan(char *srcFilePath, size_t *nbEle, int *status); -float *readFloatData_Yafan(char *srcFilePath, size_t *nbEle, int *status); -void writeByteData_Yafan(unsigned char *bytes, size_t byteLength, char *tgtFilePath, int *status); -void writeFloatData_inBytes_Yafan(float *data, size_t nbEle, char* tgtFilePath, int *status); -double SSIM_3d_calcWindow_float(float* data, float* other, size_t size1, size_t size0, int offset0, int offset1, int offset2, int windowSize0, int windowSize1, int windowSize2); -double computeSSIM(float* oriData, float* decData, size_t size2, size_t size1, size_t size0); -double *computePSNR(size_t nbEle, float *ori_data, float *data); - +#ifndef CUSZP_INCLUDE_CUSZP_UTILITY_H +#define CUSZP_INCLUDE_CUSZP_UTILITY_H + +void symTransForm_4Bytes(unsigned char data[4]); +unsigned char *readByteData_Yafan(char *srcFilePath, size_t *byteLength, int *status); +float *readFloatData_systemEndian_Yafan(char *srcFilePath, size_t *nbEle, int *status); +float *readFloatData_Yafan(char *srcFilePath, size_t *nbEle, int *status); +void writeByteData_Yafan(unsigned char *bytes, size_t byteLength, char *tgtFilePath, int *status); +void writeFloatData_inBytes_Yafan(float *data, size_t nbEle, char* tgtFilePath, int *status); +double SSIM_3d_calcWindow_float(float* data, float* other, size_t size1, size_t size0, int offset0, int offset1, int offset2, int windowSize0, int windowSize1, int windowSize2); +double computeSSIM(float* oriData, float* decData, size_t size2, size_t size1, size_t size0); +double 
*computePSNR(size_t nbEle, float *ori_data, float *data); + #endif // CUSZP_INCLUDE_CUSZP_UTILITY_H \ No newline at end of file diff --git a/qtensor/compression/szp/src/cuSZp.cu b/qtensor/compression/szp/src/cuSZp.cu index c58cf21f..f506ee97 100644 --- a/qtensor/compression/szp/src/cuSZp.cu +++ b/qtensor/compression/szp/src/cuSZp.cu @@ -1,393 +1,393 @@ -#include "cuSZp.h" - -__device__ inline int quantization(float data, float recipPrecision) -{ - float dataRecip = data*recipPrecision; - int s = dataRecip>=-0.5f?0:1; - return (int)(dataRecip+0.5f) - s; -} - - -__device__ inline int get_bit_num(unsigned int x) -{ - return (sizeof(unsigned int)*8) - __clz(x); -} - - -__global__ void SZp_compress_kernel(const float* const __restrict__ oriData, unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const float eb, const size_t nbEle) -{ - __shared__ unsigned int base_idx; - - const int tid = threadIdx.x; - const int bid = blockIdx.x; - const int idx = bid * blockDim.x + tid; - const int lane = idx & 31; - const int warp = idx >> 5; - const int block_num = cmp_chunk/32; - const int rate_ofs = (nbEle+31)/32; - const float recipPrecision = 0.5f/eb; - - int base_start_idx; - int base_block_start_idx, base_block_end_idx; - int quant_chunk_idx; - int block_idx; - int currQuant, lorenQuant, prevQuant, maxQuant; - int absQuant[cmp_chunk]; - unsigned int sign_flag[block_num]; - int sign_ofs; - int fixed_rate[block_num]; - unsigned int thread_ofs = 0; - - // Prequantization + Lorenzo Prediction + Fixed-length encoding + store fixed-length to global memory. - base_start_idx = warp * cmp_chunk * 32; - for(int j=0; j absQuant[quant_chunk_idx] ? maxQuant : absQuant[quant_chunk_idx]; - } - - // Record block info. - fixed_rate[j] = get_bit_num(maxQuant); - thread_ofs += (fixed_rate[j]) ? (32+fixed_rate[j]*32) : 0; - // Write block fixed rate to compressed data. - if(block_idx= i) thread_ofs += tmp; - } - __syncthreads(); - - // Write warp(i.e. thread-block)-level prefix-sum to global-memory. - if(lane==31) - { - cmpOffset[warp+1] = (thread_ofs+7)/8; - if(warp==0) - flag[1] = 2; - else - flag[warp+1] = 1; - } - __syncthreads(); - - // Global-level prefix-sum (exclusive). - if(warp>0) - { - if(!lane) - { - int temp_flag = 1; - while(temp_flag!=2) temp_flag = flag[warp]; - __threadfence(); - cmpOffset[warp] += cmpOffset[warp-1]; - __threadfence(); - flag[warp+1] = 2; - } - } - else - { - if(!lane) cmpOffset[0] = 0; - } - __syncthreads(); - - // Assigning compression bytes by given prefix-sum results. - if(!lane) base_idx = cmpOffset[warp] + rate_ofs; - __syncthreads(); - - // Bit shuffle for each index, also storing data to global memory. - unsigned int base_cmp_byte_ofs = base_idx; - unsigned int cmp_byte_ofs; - unsigned int tmp_byte_ofs = 0; - unsigned int cur_byte_ofs = 0; - for(int j=0; j= i) tmp_byte_ofs += tmp; - } - unsigned int prev_thread = __shfl_up_sync(0xffffffff, tmp_byte_ofs, 1); - if(!lane) cmp_byte_ofs = base_cmp_byte_ofs + cur_byte_ofs; - else cmp_byte_ofs = base_cmp_byte_ofs + cur_byte_ofs + prev_thread; - - // Operation for each block, if zero block then do nothing. - if(fixed_rate[j]) - { - // Assign sign information for one block. - cmpData[cmp_byte_ofs++] = 0xff & (sign_flag[j] >> 24); - cmpData[cmp_byte_ofs++] = 0xff & (sign_flag[j] >> 16); - cmpData[cmp_byte_ofs++] = 0xff & (sign_flag[j] >> 8); - cmpData[cmp_byte_ofs++] = 0xff & sign_flag[j]; - - // Assign quant bit information for one block by bit-shuffle. 
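            // Bit-plane layout: for bit position i (0 <= i < fixed_rate[j]), the i-th bit of the
            // 32 |quant| values in this block is gathered into four bytes (tmp_char0..3).
            // A non-zero block therefore costs 4 sign bytes + 4*fixed_rate[j] payload bytes,
            // matching the 32 + 32*fixed_rate[j] bits accumulated in thread_ofs above.
            // Example: fixed_rate = 5 -> 4 + 20 = 24 bytes for 32 floats (6 bits/value).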
- unsigned char tmp_char0, tmp_char1, tmp_char2, tmp_char3; - int mask = 1; - for(int i=0; i> i) << 7) | - (((absQuant[chunk_idx_start+1] & mask) >> i) << 6) | - (((absQuant[chunk_idx_start+2] & mask) >> i) << 5) | - (((absQuant[chunk_idx_start+3] & mask) >> i) << 4) | - (((absQuant[chunk_idx_start+4] & mask) >> i) << 3) | - (((absQuant[chunk_idx_start+5] & mask) >> i) << 2) | - (((absQuant[chunk_idx_start+6] & mask) >> i) << 1) | - (((absQuant[chunk_idx_start+7] & mask) >> i) << 0); - - // Get ith bit in 8~15 quant, and store to tmp_char1. - tmp_char1 = (((absQuant[chunk_idx_start+8] & mask) >> i) << 7) | - (((absQuant[chunk_idx_start+9] & mask) >> i) << 6) | - (((absQuant[chunk_idx_start+10] & mask) >> i) << 5) | - (((absQuant[chunk_idx_start+11] & mask) >> i) << 4) | - (((absQuant[chunk_idx_start+12] & mask) >> i) << 3) | - (((absQuant[chunk_idx_start+13] & mask) >> i) << 2) | - (((absQuant[chunk_idx_start+14] & mask) >> i) << 1) | - (((absQuant[chunk_idx_start+15] & mask) >> i) << 0); - - // Get ith bit in 16~23 quant, and store to tmp_char2. - tmp_char2 = (((absQuant[chunk_idx_start+16] & mask) >> i) << 7) | - (((absQuant[chunk_idx_start+17] & mask) >> i) << 6) | - (((absQuant[chunk_idx_start+18] & mask) >> i) << 5) | - (((absQuant[chunk_idx_start+19] & mask) >> i) << 4) | - (((absQuant[chunk_idx_start+20] & mask) >> i) << 3) | - (((absQuant[chunk_idx_start+21] & mask) >> i) << 2) | - (((absQuant[chunk_idx_start+22] & mask) >> i) << 1) | - (((absQuant[chunk_idx_start+23] & mask) >> i) << 0); - - // Get ith bit in 24-31 quant, and store to tmp_char3. - tmp_char3 = (((absQuant[chunk_idx_start+24] & mask) >> i) << 7) | - (((absQuant[chunk_idx_start+25] & mask) >> i) << 6) | - (((absQuant[chunk_idx_start+26] & mask) >> i) << 5) | - (((absQuant[chunk_idx_start+27] & mask) >> i) << 4) | - (((absQuant[chunk_idx_start+28] & mask) >> i) << 3) | - (((absQuant[chunk_idx_start+29] & mask) >> i) << 2) | - (((absQuant[chunk_idx_start+30] & mask) >> i) << 1) | - (((absQuant[chunk_idx_start+31] & mask) >> i) << 0); - - // Move data to global memory. - cmpData[cmp_byte_ofs++] = tmp_char0; - cmpData[cmp_byte_ofs++] = tmp_char1; - cmpData[cmp_byte_ofs++] = tmp_char2; - cmpData[cmp_byte_ofs++] = tmp_char3; - mask <<= 1; - } - } - - // Index updating across different iterations. - cur_byte_ofs += __shfl_sync(0xffffffff, tmp_byte_ofs, 31); - } -} - - - -__global__ void SZp_decompress_kernel(float* const __restrict__ decData, const unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const float eb, const size_t nbEle) -{ - __shared__ unsigned int base_idx; - - const int tid = threadIdx.x; - const int bid = blockIdx.x; - const int idx = bid * blockDim.x + tid; - const int lane = idx & 31; - const int warp = idx >> 5; - const int block_num = dec_chunk/32; - const int rate_ofs = (nbEle+31)/32; - - int base_start_idx; - int base_block_start_idx; - int block_idx; - int absQuant[32]; - int currQuant, lorenQuant, prevQuant; - int sign_ofs; - int fixed_rate[block_num]; - unsigned int thread_ofs = 0; - - // Obtain fixed rate information for each block. - for(int j=0; j= i) thread_ofs += tmp; - } - __syncthreads(); - - // Write warp(i.e. thread-block)-level prefix-sum to global-memory. - if(lane==31) - { - cmpOffset[warp+1] = (thread_ofs+7)/8; - if(warp==0) - flag[1] = 2; - else - flag[warp+1] = 1; - } - __syncthreads(); - - // Global-level prefix-sum (exclusive). 
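    // Chained (serialized) scan across warps: lane 31 of each warp has published its byte
    // count at cmpOffset[warp+1]; warp 0 seeds flag[1] = 2. Each later warp spins until
    // flag[warp] == 2, adds cmpOffset[warp-1] so that cmpOffset[warp] becomes the exclusive
    // prefix (total bytes of all earlier warps), then releases the next warp via flag[warp+1] = 2.
    // The final read offset is this prefix plus rate_ofs, the per-block header region at the front.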
- if(warp>0) - { - if(!lane) - { - int temp_flag = 1; - while(temp_flag!=2) temp_flag = flag[warp]; - __threadfence(); - cmpOffset[warp] += cmpOffset[warp-1]; - __threadfence(); - flag[warp+1] = 2; - } - } - else - { - if(!lane) cmpOffset[0] = 0; - } - __syncthreads(); - - // Retrieving compression bytes and reconstruct decompression data. - if(!lane) base_idx = cmpOffset[warp] + rate_ofs; - __syncthreads(); - - // Restore bit-shuffle for each block. - unsigned int base_cmp_byte_ofs = base_idx; - unsigned int cmp_byte_ofs; - unsigned int tmp_byte_ofs = 0; - unsigned int cur_byte_ofs = 0; - base_start_idx = warp * dec_chunk * 32; - for(int j=0; j= i) tmp_byte_ofs += tmp; - } - unsigned int prev_thread = __shfl_up_sync(0xffffffff, tmp_byte_ofs, 1); - if(!lane) cmp_byte_ofs = base_cmp_byte_ofs + cur_byte_ofs; - else cmp_byte_ofs = base_cmp_byte_ofs + cur_byte_ofs + prev_thread; - - // Operation for each block, if zero block then do nothing. - if(fixed_rate[j]) - { - // Retrieve sign information for one block. - sign_flag = (0xff000000 & (cmpData[cmp_byte_ofs++] << 24)) | - (0x00ff0000 & (cmpData[cmp_byte_ofs++] << 16)) | - (0x0000ff00 & (cmpData[cmp_byte_ofs++] << 8)) | - (0x000000ff & cmpData[cmp_byte_ofs++]); - - // Retrieve quant data for one block. - unsigned char tmp_char0, tmp_char1, tmp_char2, tmp_char3; - for(int i=0; i<32; i++) absQuant[i] = 0; - for(int i=0; i> 7) & 0x00000001) << i; - absQuant[1] |= ((tmp_char0 >> 6) & 0x00000001) << i; - absQuant[2] |= ((tmp_char0 >> 5) & 0x00000001) << i; - absQuant[3] |= ((tmp_char0 >> 4) & 0x00000001) << i; - absQuant[4] |= ((tmp_char0 >> 3) & 0x00000001) << i; - absQuant[5] |= ((tmp_char0 >> 2) & 0x00000001) << i; - absQuant[6] |= ((tmp_char0 >> 1) & 0x00000001) << i; - absQuant[7] |= ((tmp_char0 >> 0) & 0x00000001) << i; - - // Get ith bit in 8~15 abs quant from global memory. - absQuant[8] |= ((tmp_char1 >> 7) & 0x00000001) << i; - absQuant[9] |= ((tmp_char1 >> 6) & 0x00000001) << i; - absQuant[10] |= ((tmp_char1 >> 5) & 0x00000001) << i; - absQuant[11] |= ((tmp_char1 >> 4) & 0x00000001) << i; - absQuant[12] |= ((tmp_char1 >> 3) & 0x00000001) << i; - absQuant[13] |= ((tmp_char1 >> 2) & 0x00000001) << i; - absQuant[14] |= ((tmp_char1 >> 1) & 0x00000001) << i; - absQuant[15] |= ((tmp_char1 >> 0) & 0x00000001) << i; - - // Get ith bit in 16-23 abs quant from global memory. - absQuant[16] |= ((tmp_char2 >> 7) & 0x00000001) << i; - absQuant[17] |= ((tmp_char2 >> 6) & 0x00000001) << i; - absQuant[18] |= ((tmp_char2 >> 5) & 0x00000001) << i; - absQuant[19] |= ((tmp_char2 >> 4) & 0x00000001) << i; - absQuant[20] |= ((tmp_char2 >> 3) & 0x00000001) << i; - absQuant[21] |= ((tmp_char2 >> 2) & 0x00000001) << i; - absQuant[22] |= ((tmp_char2 >> 1) & 0x00000001) << i; - absQuant[23] |= ((tmp_char2 >> 0) & 0x00000001) << i; - - // // Get ith bit in 24-31 abs quant from global memory. - absQuant[24] |= ((tmp_char3 >> 7) & 0x00000001) << i; - absQuant[25] |= ((tmp_char3 >> 6) & 0x00000001) << i; - absQuant[26] |= ((tmp_char3 >> 5) & 0x00000001) << i; - absQuant[27] |= ((tmp_char3 >> 4) & 0x00000001) << i; - absQuant[28] |= ((tmp_char3 >> 3) & 0x00000001) << i; - absQuant[29] |= ((tmp_char3 >> 2) & 0x00000001) << i; - absQuant[30] |= ((tmp_char3 >> 1) & 0x00000001) << i; - absQuant[31] |= ((tmp_char3 >> 0) & 0x00000001) << i; - } - - // Delorenzo and store data back to decompression data. 
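            // Inverse Lorenzo: each decoded value is the running sum of the signed deltas,
            // with the sign taken from bit (31 - i) of sign_flag. The reconstruction
            // currQuant * eb * 2 mirrors the forward prequantization by 1/(2*eb), so the
            // absolute error of each reconstructed value stays within the error bound eb.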
- prevQuant = 0; - for(int i=0; i<32; i++) - { - sign_ofs = i % 32; - if(sign_flag & (1 << (31 - sign_ofs))) - lorenQuant = absQuant[i] * -1; - else - lorenQuant = absQuant[i]; - currQuant = lorenQuant + prevQuant; - decData[base_block_start_idx+i] = currQuant * eb * 2; - prevQuant = currQuant; - } - } - - // Index updating across different iterations. - cur_byte_ofs += __shfl_sync(0xffffffff, tmp_byte_ofs, 31); - } +#include "cuSZp.h" + +__device__ inline int quantization(float data, float recipPrecision) +{ + float dataRecip = data*recipPrecision; + int s = dataRecip>=-0.5f?0:1; + return (int)(dataRecip+0.5f) - s; +} + + +__device__ inline int get_bit_num(unsigned int x) +{ + return (sizeof(unsigned int)*8) - __clz(x); +} + + +__global__ void SZp_compress_kernel(const float* const __restrict__ oriData, unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const float eb, const size_t nbEle) +{ + __shared__ unsigned int base_idx; + + const int tid = threadIdx.x; + const int bid = blockIdx.x; + const int idx = bid * blockDim.x + tid; + const int lane = idx & 31; + const int warp = idx >> 5; + const int block_num = cmp_chunk/32; + const int rate_ofs = (nbEle+31)/32; + const float recipPrecision = 0.5f/eb; + + int base_start_idx; + int base_block_start_idx, base_block_end_idx; + int quant_chunk_idx; + int block_idx; + int currQuant, lorenQuant, prevQuant, maxQuant; + int absQuant[cmp_chunk]; + unsigned int sign_flag[block_num]; + int sign_ofs; + int fixed_rate[block_num]; + unsigned int thread_ofs = 0; + + // Prequantization + Lorenzo Prediction + Fixed-length encoding + store fixed-length to global memory. + base_start_idx = warp * cmp_chunk * 32; + for(int j=0; j absQuant[quant_chunk_idx] ? maxQuant : absQuant[quant_chunk_idx]; + } + + // Record block info. + fixed_rate[j] = get_bit_num(maxQuant); + thread_ofs += (fixed_rate[j]) ? (32+fixed_rate[j]*32) : 0; + // Write block fixed rate to compressed data. + if(block_idx= i) thread_ofs += tmp; + } + __syncthreads(); + + // Write warp(i.e. thread-block)-level prefix-sum to global-memory. + if(lane==31) + { + cmpOffset[warp+1] = (thread_ofs+7)/8; + if(warp==0) + flag[1] = 2; + else + flag[warp+1] = 1; + } + __syncthreads(); + + // Global-level prefix-sum (exclusive). + if(warp>0) + { + if(!lane) + { + int temp_flag = 1; + while(temp_flag!=2) temp_flag = flag[warp]; + __threadfence(); + cmpOffset[warp] += cmpOffset[warp-1]; + __threadfence(); + flag[warp+1] = 2; + } + } + else + { + if(!lane) cmpOffset[0] = 0; + } + __syncthreads(); + + // Assigning compression bytes by given prefix-sum results. + if(!lane) base_idx = cmpOffset[warp] + rate_ofs; + __syncthreads(); + + // Bit shuffle for each index, also storing data to global memory. + unsigned int base_cmp_byte_ofs = base_idx; + unsigned int cmp_byte_ofs; + unsigned int tmp_byte_ofs = 0; + unsigned int cur_byte_ofs = 0; + for(int j=0; j= i) tmp_byte_ofs += tmp; + } + unsigned int prev_thread = __shfl_up_sync(0xffffffff, tmp_byte_ofs, 1); + if(!lane) cmp_byte_ofs = base_cmp_byte_ofs + cur_byte_ofs; + else cmp_byte_ofs = base_cmp_byte_ofs + cur_byte_ofs + prev_thread; + + // Operation for each block, if zero block then do nothing. + if(fixed_rate[j]) + { + // Assign sign information for one block. 
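            // The four bytes below emit sign_flag[j] most-significant byte first; judging from
            // the decompression kernel, bit (31 - i) of sign_flag[j] marks a negative quantized
            // delta for element i of the block. The |delta| magnitudes then follow as
            // fixed_rate[j] bit planes of 4 bytes each.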
+ cmpData[cmp_byte_ofs++] = 0xff & (sign_flag[j] >> 24); + cmpData[cmp_byte_ofs++] = 0xff & (sign_flag[j] >> 16); + cmpData[cmp_byte_ofs++] = 0xff & (sign_flag[j] >> 8); + cmpData[cmp_byte_ofs++] = 0xff & sign_flag[j]; + + // Assign quant bit information for one block by bit-shuffle. + unsigned char tmp_char0, tmp_char1, tmp_char2, tmp_char3; + int mask = 1; + for(int i=0; i> i) << 7) | + (((absQuant[chunk_idx_start+1] & mask) >> i) << 6) | + (((absQuant[chunk_idx_start+2] & mask) >> i) << 5) | + (((absQuant[chunk_idx_start+3] & mask) >> i) << 4) | + (((absQuant[chunk_idx_start+4] & mask) >> i) << 3) | + (((absQuant[chunk_idx_start+5] & mask) >> i) << 2) | + (((absQuant[chunk_idx_start+6] & mask) >> i) << 1) | + (((absQuant[chunk_idx_start+7] & mask) >> i) << 0); + + // Get ith bit in 8~15 quant, and store to tmp_char1. + tmp_char1 = (((absQuant[chunk_idx_start+8] & mask) >> i) << 7) | + (((absQuant[chunk_idx_start+9] & mask) >> i) << 6) | + (((absQuant[chunk_idx_start+10] & mask) >> i) << 5) | + (((absQuant[chunk_idx_start+11] & mask) >> i) << 4) | + (((absQuant[chunk_idx_start+12] & mask) >> i) << 3) | + (((absQuant[chunk_idx_start+13] & mask) >> i) << 2) | + (((absQuant[chunk_idx_start+14] & mask) >> i) << 1) | + (((absQuant[chunk_idx_start+15] & mask) >> i) << 0); + + // Get ith bit in 16~23 quant, and store to tmp_char2. + tmp_char2 = (((absQuant[chunk_idx_start+16] & mask) >> i) << 7) | + (((absQuant[chunk_idx_start+17] & mask) >> i) << 6) | + (((absQuant[chunk_idx_start+18] & mask) >> i) << 5) | + (((absQuant[chunk_idx_start+19] & mask) >> i) << 4) | + (((absQuant[chunk_idx_start+20] & mask) >> i) << 3) | + (((absQuant[chunk_idx_start+21] & mask) >> i) << 2) | + (((absQuant[chunk_idx_start+22] & mask) >> i) << 1) | + (((absQuant[chunk_idx_start+23] & mask) >> i) << 0); + + // Get ith bit in 24-31 quant, and store to tmp_char3. + tmp_char3 = (((absQuant[chunk_idx_start+24] & mask) >> i) << 7) | + (((absQuant[chunk_idx_start+25] & mask) >> i) << 6) | + (((absQuant[chunk_idx_start+26] & mask) >> i) << 5) | + (((absQuant[chunk_idx_start+27] & mask) >> i) << 4) | + (((absQuant[chunk_idx_start+28] & mask) >> i) << 3) | + (((absQuant[chunk_idx_start+29] & mask) >> i) << 2) | + (((absQuant[chunk_idx_start+30] & mask) >> i) << 1) | + (((absQuant[chunk_idx_start+31] & mask) >> i) << 0); + + // Move data to global memory. + cmpData[cmp_byte_ofs++] = tmp_char0; + cmpData[cmp_byte_ofs++] = tmp_char1; + cmpData[cmp_byte_ofs++] = tmp_char2; + cmpData[cmp_byte_ofs++] = tmp_char3; + mask <<= 1; + } + } + + // Index updating across different iterations. + cur_byte_ofs += __shfl_sync(0xffffffff, tmp_byte_ofs, 31); + } +} + + + +__global__ void SZp_decompress_kernel(float* const __restrict__ decData, const unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const float eb, const size_t nbEle) +{ + __shared__ unsigned int base_idx; + + const int tid = threadIdx.x; + const int bid = blockIdx.x; + const int idx = bid * blockDim.x + tid; + const int lane = idx & 31; + const int warp = idx >> 5; + const int block_num = dec_chunk/32; + const int rate_ofs = (nbEle+31)/32; + + int base_start_idx; + int base_block_start_idx; + int block_idx; + int absQuant[32]; + int currQuant, lorenQuant, prevQuant; + int sign_ofs; + int fixed_rate[block_num]; + unsigned int thread_ofs = 0; + + // Obtain fixed rate information for each block. + for(int j=0; j= i) thread_ofs += tmp; + } + __syncthreads(); + + // Write warp(i.e. 
thread-block)-level prefix-sum to global-memory. + if(lane==31) + { + cmpOffset[warp+1] = (thread_ofs+7)/8; + if(warp==0) + flag[1] = 2; + else + flag[warp+1] = 1; + } + __syncthreads(); + + // Global-level prefix-sum (exclusive). + if(warp>0) + { + if(!lane) + { + int temp_flag = 1; + while(temp_flag!=2) temp_flag = flag[warp]; + __threadfence(); + cmpOffset[warp] += cmpOffset[warp-1]; + __threadfence(); + flag[warp+1] = 2; + } + } + else + { + if(!lane) cmpOffset[0] = 0; + } + __syncthreads(); + + // Retrieving compression bytes and reconstruct decompression data. + if(!lane) base_idx = cmpOffset[warp] + rate_ofs; + __syncthreads(); + + // Restore bit-shuffle for each block. + unsigned int base_cmp_byte_ofs = base_idx; + unsigned int cmp_byte_ofs; + unsigned int tmp_byte_ofs = 0; + unsigned int cur_byte_ofs = 0; + base_start_idx = warp * dec_chunk * 32; + for(int j=0; j= i) tmp_byte_ofs += tmp; + } + unsigned int prev_thread = __shfl_up_sync(0xffffffff, tmp_byte_ofs, 1); + if(!lane) cmp_byte_ofs = base_cmp_byte_ofs + cur_byte_ofs; + else cmp_byte_ofs = base_cmp_byte_ofs + cur_byte_ofs + prev_thread; + + // Operation for each block, if zero block then do nothing. + if(fixed_rate[j]) + { + // Retrieve sign information for one block. + sign_flag = (0xff000000 & (cmpData[cmp_byte_ofs++] << 24)) | + (0x00ff0000 & (cmpData[cmp_byte_ofs++] << 16)) | + (0x0000ff00 & (cmpData[cmp_byte_ofs++] << 8)) | + (0x000000ff & cmpData[cmp_byte_ofs++]); + + // Retrieve quant data for one block. + unsigned char tmp_char0, tmp_char1, tmp_char2, tmp_char3; + for(int i=0; i<32; i++) absQuant[i] = 0; + for(int i=0; i> 7) & 0x00000001) << i; + absQuant[1] |= ((tmp_char0 >> 6) & 0x00000001) << i; + absQuant[2] |= ((tmp_char0 >> 5) & 0x00000001) << i; + absQuant[3] |= ((tmp_char0 >> 4) & 0x00000001) << i; + absQuant[4] |= ((tmp_char0 >> 3) & 0x00000001) << i; + absQuant[5] |= ((tmp_char0 >> 2) & 0x00000001) << i; + absQuant[6] |= ((tmp_char0 >> 1) & 0x00000001) << i; + absQuant[7] |= ((tmp_char0 >> 0) & 0x00000001) << i; + + // Get ith bit in 8~15 abs quant from global memory. + absQuant[8] |= ((tmp_char1 >> 7) & 0x00000001) << i; + absQuant[9] |= ((tmp_char1 >> 6) & 0x00000001) << i; + absQuant[10] |= ((tmp_char1 >> 5) & 0x00000001) << i; + absQuant[11] |= ((tmp_char1 >> 4) & 0x00000001) << i; + absQuant[12] |= ((tmp_char1 >> 3) & 0x00000001) << i; + absQuant[13] |= ((tmp_char1 >> 2) & 0x00000001) << i; + absQuant[14] |= ((tmp_char1 >> 1) & 0x00000001) << i; + absQuant[15] |= ((tmp_char1 >> 0) & 0x00000001) << i; + + // Get ith bit in 16-23 abs quant from global memory. + absQuant[16] |= ((tmp_char2 >> 7) & 0x00000001) << i; + absQuant[17] |= ((tmp_char2 >> 6) & 0x00000001) << i; + absQuant[18] |= ((tmp_char2 >> 5) & 0x00000001) << i; + absQuant[19] |= ((tmp_char2 >> 4) & 0x00000001) << i; + absQuant[20] |= ((tmp_char2 >> 3) & 0x00000001) << i; + absQuant[21] |= ((tmp_char2 >> 2) & 0x00000001) << i; + absQuant[22] |= ((tmp_char2 >> 1) & 0x00000001) << i; + absQuant[23] |= ((tmp_char2 >> 0) & 0x00000001) << i; + + // // Get ith bit in 24-31 abs quant from global memory. 
+ absQuant[24] |= ((tmp_char3 >> 7) & 0x00000001) << i; + absQuant[25] |= ((tmp_char3 >> 6) & 0x00000001) << i; + absQuant[26] |= ((tmp_char3 >> 5) & 0x00000001) << i; + absQuant[27] |= ((tmp_char3 >> 4) & 0x00000001) << i; + absQuant[28] |= ((tmp_char3 >> 3) & 0x00000001) << i; + absQuant[29] |= ((tmp_char3 >> 2) & 0x00000001) << i; + absQuant[30] |= ((tmp_char3 >> 1) & 0x00000001) << i; + absQuant[31] |= ((tmp_char3 >> 0) & 0x00000001) << i; + } + + // Delorenzo and store data back to decompression data. + prevQuant = 0; + for(int i=0; i<32; i++) + { + sign_ofs = i % 32; + if(sign_flag & (1 << (31 - sign_ofs))) + lorenQuant = absQuant[i] * -1; + else + lorenQuant = absQuant[i]; + currQuant = lorenQuant + prevQuant; + decData[base_block_start_idx+i] = currQuant * eb * 2; + prevQuant = currQuant; + } + } + + // Index updating across different iterations. + cur_byte_ofs += __shfl_sync(0xffffffff, tmp_byte_ofs, 31); + } } \ No newline at end of file diff --git a/qtensor/compression/szp/src/cuSZp_entry.cu b/qtensor/compression/szp/src/cuSZp_entry.cu index a04d8348..e92e669a 100644 --- a/qtensor/compression/szp/src/cuSZp_entry.cu +++ b/qtensor/compression/szp/src/cuSZp_entry.cu @@ -1,147 +1,147 @@ -#include "cuSZp_entry.h" -#include "cuSZp.h" - -void SZp_compress_hostptr(float* oriData, unsigned char* cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound) -{ - // Data blocking. - int bsize = cmp_tblock_size; - int gsize = (nbEle + bsize * cmp_chunk - 1) / (bsize * cmp_chunk); - int cmpOffSize = gsize + 1; - int pad_nbEle = gsize * bsize * cmp_chunk; - - // Initializing global memory for GPU compression. - float* d_oriData; - unsigned char* d_cmpData; - unsigned int* d_cmpOffset; - int* d_flag; - unsigned int glob_sync; - cudaMalloc((void**)&d_oriData, sizeof(float)*pad_nbEle); - cudaMemcpy(d_oriData, oriData, sizeof(float)*pad_nbEle, cudaMemcpyHostToDevice); - cudaMalloc((void**)&d_cmpData, sizeof(float)*pad_nbEle); - cudaMalloc((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); - cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); - cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); - cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); - - // Initializing CUDA Stream. - cudaStream_t stream; - cudaStreamCreate(&stream); - - // cuSZp GPU compression. - dim3 blockSize(bsize); - dim3 gridSize(gsize); - SZp_compress_kernel<<>>(d_oriData, d_cmpData, d_cmpOffset, d_flag, errorBound, nbEle); - - // Obtain compression ratio and move data back to CPU. - cudaMemcpy(&glob_sync, d_cmpOffset+cmpOffSize-2, sizeof(unsigned int), cudaMemcpyDeviceToHost); - *cmpSize = (size_t)glob_sync + (nbEle+31)/32; - cudaMemcpy(cmpBytes, d_cmpData, *cmpSize*sizeof(unsigned char), cudaMemcpyDeviceToHost); - - // Free memory that is used. - cudaFree(d_oriData); - cudaFree(d_cmpData); - cudaFree(d_cmpOffset); - cudaFree(d_flag); - cudaStreamDestroy(stream); -} - - -void SZp_decompress_hostptr(float* decData, unsigned char* cmpBytes, size_t nbEle, size_t cmpSize, float errorBound) -{ - // Data blocking. - int bsize = dec_tblock_size; - int gsize = (nbEle + bsize * dec_chunk - 1) / (bsize * dec_chunk); - int cmpOffSize = gsize + 1; - int pad_nbEle = gsize * bsize * dec_chunk; - - // Initializing global memory for GPU compression. 
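    // Each thread block runs cmp_tblock_size (32) threads and each thread compresses
    // cmp_chunk (8192) consecutive values, so one block covers 32 * 8192 = 262,144 elements.
    // gsize is the number of such blocks and pad_nbEle rounds nbEle up to that granularity,
    // e.g. nbEle = 1,000,000 -> gsize = 4, pad_nbEle = 1,048,576.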
- float* d_decData; - unsigned char* d_cmpData; - unsigned int* d_cmpOffset; - int* d_flag; - cudaMalloc((void**)&d_decData, sizeof(float)*pad_nbEle); - cudaMemset(d_decData, 0, sizeof(float)*pad_nbEle); - cudaMalloc((void**)&d_cmpData, sizeof(float)*pad_nbEle); - cudaMemcpy(d_cmpData, cmpBytes, sizeof(unsigned char)*cmpSize, cudaMemcpyHostToDevice); - cudaMalloc((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); - cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); - cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); - cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); - - // Initializing CUDA Stream. - cudaStream_t stream; - cudaStreamCreate(&stream); - - // cuSZp GPU compression. - dim3 blockSize(bsize); - dim3 gridSize(gsize); - SZp_decompress_kernel<<>>(d_decData, d_cmpData, d_cmpOffset, d_flag, errorBound, nbEle); - - // Move data back to CPU. - cudaMemcpy(decData, d_decData, sizeof(float)*nbEle, cudaMemcpyDeviceToHost); - - // Free memoy that is used. - cudaFree(d_decData); - cudaFree(d_cmpData); - cudaFree(d_cmpOffset); - cudaFree(d_flag); - cudaStreamDestroy(stream); -} - - -void SZp_compress_deviceptr(float* d_oriData, unsigned char* d_cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound, cudaStream_t stream) -{ - // Data blocking. - int bsize = cmp_tblock_size; - int gsize = (nbEle + bsize * cmp_chunk - 1) / (bsize * cmp_chunk); - int cmpOffSize = gsize + 1; - - // Initializing global memory for GPU compression. - unsigned int* d_cmpOffset; - int* d_flag; - unsigned int glob_sync; - cudaMalloc((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); - cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); - cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); - cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); - - // cuSZp GPU compression. - dim3 blockSize(bsize); - dim3 gridSize(gsize); - SZp_compress_kernel<<>>(d_oriData, d_cmpBytes, d_cmpOffset, d_flag, errorBound, nbEle); - cudaDeviceSynchronize(); - // Obtain compression ratio and move data back to CPU. - cudaMemcpy(&glob_sync, d_cmpOffset+cmpOffSize-2, sizeof(unsigned int), cudaMemcpyDeviceToHost); - *cmpSize = (size_t)glob_sync + (nbEle+31)/32; - - // Free memory that is used. - cudaFree(d_cmpOffset); - cudaFree(d_flag); -} - - -void SZp_decompress_deviceptr(float* d_decData, unsigned char* d_cmpBytes, size_t nbEle, size_t cmpSize, float errorBound, cudaStream_t stream) -{ - // Data blocking. - int bsize = dec_tblock_size; - int gsize = (nbEle + bsize * dec_chunk - 1) / (bsize * dec_chunk); - int cmpOffSize = gsize + 1; - - // Initializing global memory for GPU compression. - unsigned int* d_cmpOffset; - int* d_flag; - cudaMalloc((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); - cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); - cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); - cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); - cudaMemset(d_decData, 0, sizeof(float)*nbEle); - - // cuSZp GPU compression. - dim3 blockSize(bsize); - dim3 gridSize(gsize); - SZp_decompress_kernel<<>>(d_decData, d_cmpBytes, d_cmpOffset, d_flag, errorBound, nbEle); - cudaDeviceSynchronize(); - // Free memoy that is used. - cudaFree(d_cmpOffset); - cudaFree(d_flag); -} +#include "cuSZp_entry.h" +#include "cuSZp.h" + +void SZp_compress_hostptr(float* oriData, unsigned char* cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound) +{ + // Data blocking. 
+ int bsize = cmp_tblock_size; + int gsize = (nbEle + bsize * cmp_chunk - 1) / (bsize * cmp_chunk); + int cmpOffSize = gsize + 1; + int pad_nbEle = gsize * bsize * cmp_chunk; + + // Initializing global memory for GPU compression. + float* d_oriData; + unsigned char* d_cmpData; + unsigned int* d_cmpOffset; + int* d_flag; + unsigned int glob_sync; + cudaMalloc((void**)&d_oriData, sizeof(float)*pad_nbEle); + cudaMemcpy(d_oriData, oriData, sizeof(float)*pad_nbEle, cudaMemcpyHostToDevice); + cudaMalloc((void**)&d_cmpData, sizeof(float)*pad_nbEle); + cudaMalloc((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); + cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); + cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); + cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); + + // Initializing CUDA Stream. + cudaStream_t stream; + cudaStreamCreate(&stream); + + // cuSZp GPU compression. + dim3 blockSize(bsize); + dim3 gridSize(gsize); + SZp_compress_kernel<<>>(d_oriData, d_cmpData, d_cmpOffset, d_flag, errorBound, nbEle); + + // Obtain compression ratio and move data back to CPU. + cudaMemcpy(&glob_sync, d_cmpOffset+cmpOffSize-2, sizeof(unsigned int), cudaMemcpyDeviceToHost); + *cmpSize = (size_t)glob_sync + (nbEle+31)/32; + cudaMemcpy(cmpBytes, d_cmpData, *cmpSize*sizeof(unsigned char), cudaMemcpyDeviceToHost); + + // Free memory that is used. + cudaFree(d_oriData); + cudaFree(d_cmpData); + cudaFree(d_cmpOffset); + cudaFree(d_flag); + cudaStreamDestroy(stream); +} + + +void SZp_decompress_hostptr(float* decData, unsigned char* cmpBytes, size_t nbEle, size_t cmpSize, float errorBound) +{ + // Data blocking. + int bsize = dec_tblock_size; + int gsize = (nbEle + bsize * dec_chunk - 1) / (bsize * dec_chunk); + int cmpOffSize = gsize + 1; + int pad_nbEle = gsize * bsize * dec_chunk; + + // Initializing global memory for GPU compression. + float* d_decData; + unsigned char* d_cmpData; + unsigned int* d_cmpOffset; + int* d_flag; + cudaMalloc((void**)&d_decData, sizeof(float)*pad_nbEle); + cudaMemset(d_decData, 0, sizeof(float)*pad_nbEle); + cudaMalloc((void**)&d_cmpData, sizeof(float)*pad_nbEle); + cudaMemcpy(d_cmpData, cmpBytes, sizeof(unsigned char)*cmpSize, cudaMemcpyHostToDevice); + cudaMalloc((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); + cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); + cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); + cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); + + // Initializing CUDA Stream. + cudaStream_t stream; + cudaStreamCreate(&stream); + + // cuSZp GPU compression. + dim3 blockSize(bsize); + dim3 gridSize(gsize); + SZp_decompress_kernel<<>>(d_decData, d_cmpData, d_cmpOffset, d_flag, errorBound, nbEle); + + // Move data back to CPU. + cudaMemcpy(decData, d_decData, sizeof(float)*nbEle, cudaMemcpyDeviceToHost); + + // Free memoy that is used. + cudaFree(d_decData); + cudaFree(d_cmpData); + cudaFree(d_cmpOffset); + cudaFree(d_flag); + cudaStreamDestroy(stream); +} + + +void SZp_compress_deviceptr(float* d_oriData, unsigned char* d_cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound, cudaStream_t stream) +{ + // Data blocking. + int bsize = cmp_tblock_size; + int gsize = (nbEle + bsize * cmp_chunk - 1) / (bsize * cmp_chunk); + int cmpOffSize = gsize + 1; + + // Initializing global memory for GPU compression. 
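    // Device-pointer variant: d_oriData and d_cmpBytes already live on the GPU and are owned
    // by the caller; only the small per-warp offset and flag arrays (gsize + 1 entries each)
    // are allocated here. After the kernel and cudaDeviceSynchronize, the prefix-sum entry
    // copied back into glob_sync plus the (nbEle + 31) / 32-byte block-rate header gives *cmpSize.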
+ unsigned int* d_cmpOffset; + int* d_flag; + unsigned int glob_sync; + cudaMalloc((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); + cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); + cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); + cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); + + // cuSZp GPU compression. + dim3 blockSize(bsize); + dim3 gridSize(gsize); + SZp_compress_kernel<<>>(d_oriData, d_cmpBytes, d_cmpOffset, d_flag, errorBound, nbEle); + cudaDeviceSynchronize(); + // Obtain compression ratio and move data back to CPU. + cudaMemcpy(&glob_sync, d_cmpOffset+cmpOffSize-2, sizeof(unsigned int), cudaMemcpyDeviceToHost); + *cmpSize = (size_t)glob_sync + (nbEle+31)/32; + + // Free memory that is used. + cudaFree(d_cmpOffset); + cudaFree(d_flag); +} + + +void SZp_decompress_deviceptr(float* d_decData, unsigned char* d_cmpBytes, size_t nbEle, size_t cmpSize, float errorBound, cudaStream_t stream) +{ + // Data blocking. + int bsize = dec_tblock_size; + int gsize = (nbEle + bsize * dec_chunk - 1) / (bsize * dec_chunk); + int cmpOffSize = gsize + 1; + + // Initializing global memory for GPU compression. + unsigned int* d_cmpOffset; + int* d_flag; + cudaMalloc((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); + cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); + cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); + cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); + cudaMemset(d_decData, 0, sizeof(float)*nbEle); + + // cuSZp GPU compression. + dim3 blockSize(bsize); + dim3 gridSize(gsize); + SZp_decompress_kernel<<>>(d_decData, d_cmpBytes, d_cmpOffset, d_flag, errorBound, nbEle); + cudaDeviceSynchronize(); + // Free memoy that is used. + cudaFree(d_cmpOffset); + cudaFree(d_flag); +} diff --git a/qtensor/compression/szp/src/cuSZp_timer.cu b/qtensor/compression/szp/src/cuSZp_timer.cu index 74c81c30..5148af98 100644 --- a/qtensor/compression/szp/src/cuSZp_timer.cu +++ b/qtensor/compression/szp/src/cuSZp_timer.cu @@ -1,31 +1,31 @@ -#include "cuSZp_timer.h" - -TimingGPU::TimingGPU() { privateTimingGPU = new PrivateTimingGPU; } - -TimingGPU::~TimingGPU() { } - -void TimingGPU::StartCounter() -{ - cudaEventCreate(&((*privateTimingGPU).start)); - cudaEventCreate(&((*privateTimingGPU).stop)); - cudaEventRecord((*privateTimingGPU).start,0); -} - -void TimingGPU::StartCounterFlags() -{ - int eventflags = cudaEventBlockingSync; - - cudaEventCreateWithFlags(&((*privateTimingGPU).start),eventflags); - cudaEventCreateWithFlags(&((*privateTimingGPU).stop),eventflags); - cudaEventRecord((*privateTimingGPU).start,0); -} - -// Gets the counter in ms -float TimingGPU::GetCounter() -{ - float time; - cudaEventRecord((*privateTimingGPU).stop, 0); - cudaEventSynchronize((*privateTimingGPU).stop); - cudaEventElapsedTime(&time,(*privateTimingGPU).start,(*privateTimingGPU).stop); - return time; -} +#include "cuSZp_timer.h" + +TimingGPU::TimingGPU() { privateTimingGPU = new PrivateTimingGPU; } + +TimingGPU::~TimingGPU() { } + +void TimingGPU::StartCounter() +{ + cudaEventCreate(&((*privateTimingGPU).start)); + cudaEventCreate(&((*privateTimingGPU).stop)); + cudaEventRecord((*privateTimingGPU).start,0); +} + +void TimingGPU::StartCounterFlags() +{ + int eventflags = cudaEventBlockingSync; + + cudaEventCreateWithFlags(&((*privateTimingGPU).start),eventflags); + cudaEventCreateWithFlags(&((*privateTimingGPU).stop),eventflags); + cudaEventRecord((*privateTimingGPU).start,0); +} + +// Gets the counter in ms +float TimingGPU::GetCounter() +{ + float time; + 
cudaEventRecord((*privateTimingGPU).stop, 0); + cudaEventSynchronize((*privateTimingGPU).stop); + cudaEventElapsedTime(&time,(*privateTimingGPU).start,(*privateTimingGPU).stop); + return time; +} diff --git a/qtensor/compression/szp/src/cuSZp_utility.cu b/qtensor/compression/szp/src/cuSZp_utility.cu index 784d378a..ac4006d7 100644 --- a/qtensor/compression/szp/src/cuSZp_utility.cu +++ b/qtensor/compression/szp/src/cuSZp_utility.cu @@ -1,493 +1,493 @@ -// -// Created by Yafan Huang on 5/31/22. -// Copied from SZx. -// -#include -#include -#include -#include -#include -#include "cuSZp_utility.h" - -/*Macro Definition for Processing Data*/ -// #define SZ_SCES 0 //successful -#define RW_SCES 0 -#define RW_FERR 1 -#define RW_TERR 2 -#define LITTLE_ENDIAN_SYSTEM 0 -#define QCAT_BUFS 64 - -/*Global Varaibles for Processing Data*/ -int dataEndianType_Yafan = 0; -int sysEndianType_Yafan = 0; //0 means little endian, 1 means big endian - -typedef union lint32 -{ - int ivalue; - unsigned int uivalue; - unsigned char byte[4]; -} lint32; - -typedef union llfloat -{ - float value; - unsigned int ivalue; - unsigned char byte[4]; -} llfloat; - -/** ************************************************************************ - * @brief Reverse 4-bit-length unsigned char array. - * - * @param data[4] 4-bit-length unsigned char array. - * *********************************************************************** */ -void symTransForm_4Bytes(unsigned char data[4]) -{ - unsigned char tmp = data[0]; - data[0] = data[3]; - data[3] = tmp; - - tmp = data[1]; - data[1] = data[2]; - data[2] = tmp; -} - -/** ************************************************************************ - * @brief Read byte data from path to source binary format file. - * Usually used for decompressing data from input file. - * Variables byteLength and status can be obtained through this function. - * - * @param srcFilePath input source file path - * @param byteLength the length of byte array - * @param status data processing states (macro definitions) - * - * @return byteBuf unsigned char array with length byteLength - * *********************************************************************** */ -unsigned char *readByteData_Yafan(char *srcFilePath, size_t *byteLength, int *status) -{ - FILE *pFile = fopen(srcFilePath, "rb"); - if (pFile == NULL) - { - printf("Failed to open input file. 1\n"); - *status = RW_FERR; - return 0; - } - fseek(pFile, 0, SEEK_END); - *byteLength = ftell(pFile); - fclose(pFile); - - unsigned char *byteBuf = ( unsigned char *)malloc((*byteLength)*sizeof(unsigned char)); //sizeof(char)==1 - - pFile = fopen(srcFilePath, "rb"); - if (pFile == NULL) - { - printf("Failed to open input file. 2\n"); - *status = RW_FERR; - return 0; - } - fread(byteBuf, 1, *byteLength, pFile); - fclose(pFile); - *status = RW_SCES; - return byteBuf; -} - -/** ************************************************************************ - * @brief Read float data from path to source binary format file in endian systems. - * Usually used for compressing data from input file. - * Variables nbEle and status can be obtained through this function. 
- * - * @param srcFilePath input source file path - * @param nbEle the length of float array - * @param status data processing states (macro definitions) - * - * @return daBuf float array with length nbEle - * *********************************************************************** */ -float *readFloatData_systemEndian_Yafan(char *srcFilePath, size_t *nbEle, int *status) -{ - size_t inSize; - FILE *pFile = fopen(srcFilePath, "rb"); - if (pFile == NULL) - { - printf("Failed to open input file. 1\n"); - *status = RW_FERR; - return NULL; - } - fseek(pFile, 0, SEEK_END); - inSize = ftell(pFile); - *nbEle = inSize/4; - fclose(pFile); - - if(inSize<=0) - { - printf("Error: input file is wrong!\n"); - *status = RW_FERR; - } - - float *daBuf = (float *)malloc(inSize); - - pFile = fopen(srcFilePath, "rb"); - if (pFile == NULL) - { - printf("Failed to open input file. 2\n"); - *status = RW_FERR; - return NULL; - } - fread(daBuf, 4, *nbEle, pFile); - fclose(pFile); - *status = RW_SCES; - return daBuf; -} - -/** ************************************************************************ - * @brief Read float data from path to source binary format file. - * Usually used for compressing data from input file. - * Variables nbEle and status can be obtained through this function. - * - * @param srcFilePath input source file path - * @param nbEle the length of float array - * @param status data processing states (macro definitions) - * - * @return daBuf float array with length nbEle - * *********************************************************************** */ -float *readFloatData_Yafan(char *srcFilePath, size_t *nbEle, int *status) -{ - int state = RW_SCES; - if(dataEndianType_Yafan==sysEndianType_Yafan) - { - float *daBuf = readFloatData_systemEndian_Yafan(srcFilePath, nbEle, &state); - *status = state; - return daBuf; - } - else - { - size_t i,j; - - size_t byteLength; - unsigned char* bytes = readByteData_Yafan(srcFilePath, &byteLength, &state); - if(state == RW_FERR) - { - *status = RW_FERR; - return NULL; - } - float *daBuf = (float *)malloc(byteLength); - *nbEle = byteLength/4; - - llfloat buf; - for(i = 0;i<*nbEle;i++) - { - j = i*4; - memcpy(buf.byte, bytes+j, 4); - symTransForm_4Bytes(buf.byte); - daBuf[i] = buf.value; - } - free(bytes); - return daBuf; - } -} - -/** ************************************************************************ - * @brief Write byte data to binary format file. - * Usually used for writing compressed data. - * Variable status can be obtained/switched through this function. - * - * @param bytes unsigned char array (compressed data) - * @param byteLength the length of unsigned char array - * @param tgtFilePath output file path - * @param status data processing states (macro definitions) - * *********************************************************************** */ -void writeByteData_Yafan(unsigned char *bytes, size_t byteLength, char *tgtFilePath, int *status) -{ - FILE *pFile = fopen(tgtFilePath, "wb"); - if (pFile == NULL) - { - printf("Failed to open input file. 3\n"); - *status = RW_FERR; - return; - } - - fwrite(bytes, 1, byteLength, pFile); //write outSize bytes - fclose(pFile); - *status = RW_SCES; -} - -/** ************************************************************************ - * @brief Write float data to binary format file. - * Usually used for writing decompressed (reconstructed) data. - * Variable status can be obtained/switched through this function. 
- * - * @param bytes unsigned char array (compressed data) - * @param nbEle the length of float array - * @param tgtFilePath output file path - * @param status data processing states (macro definitions) - * *********************************************************************** */ -void writeFloatData_inBytes_Yafan(float *data, size_t nbEle, char* tgtFilePath, int *status) -{ - size_t i = 0; - int state = RW_SCES; - llfloat buf; - unsigned char* bytes = (unsigned char*)malloc(nbEle*sizeof(float)); - for(i=0;idata[index]) - xMin=data[index]; - if(xMaxother[index]) - yMin=other[index]; - if(yMaxsize0) { - printf("ERROR: windowSize0 = %d > %zu\n", windowSize0, size0); - } - if(windowSize1>size1) { - printf("ERROR: windowSize1 = %d > %zu\n", windowSize1, size1); - } - if(windowSize2>size2) { - printf("ERROR: windowSize2 = %d > %zu\n", windowSize2, size2); - } - //offsetInc0=windowSize0/2; - //offsetInc1=windowSize1/2; - //offsetInc2=windowSize2/2; - offsetInc0=windowShift0; - offsetInc1=windowShift1; - offsetInc2=windowShift2; - for(offset2=0; offset2+windowSize2<=size2; offset2+=offsetInc2) { //MOVING WINDOW - for(offset1=0; offset1+windowSize1<=size1; offset1+=offsetInc1) { //MOVING WINDOW - for(offset0=0; offset0+windowSize0<=size0; offset0+=offsetInc0) { //MOVING WINDOW - nw++; - ssimSum+=SSIM_3d_calcWindow_float(oriData, decData, size1, size0, offset0, offset1, offset2, windowSize0, windowSize1, windowSize2); - } - } - } - return ssimSum/nw; -} - - -/** ************************************************************************ - * @brief Calculate PSNR between 3D original and decompressed (reconstructed) data. - * API for computing PSNR. - * - * @param nbEle the length of float array - * @param ori_data original float array - * @param dec_data decompressed (reconstructed) float array - * - * @return result 6-length double array, which contains: - * 0. *Mean Square Error (MSE)* - * 1. *Value Range (Max-Min)* - * 2. *Peak Signal-to-noise Ratio (PSNR)* - * 3. Squared Error - * 4. Normalized Squared Error - * 5. Normalized Squared MSE - * *********************************************************************** */ -double *computePSNR(size_t nbEle, float *ori_data, float *data) { - size_t i = 0; - double Max = 0, Min = 0, diffMax = 0; - Max = ori_data[0]; - Min = ori_data[0]; - diffMax = data[0] > ori_data[0] ? 
data[0] - ori_data[0] : ori_data[0] - data[0]; - - //diffMax = fabs(data[0] - ori_data[0]); - double sum1 = 0, sum2 = 0, sum22 = 0; - - for (i = 0; i < nbEle; i++) { - sum1 += ori_data[i]; - sum2 += data[i]; - sum22 += data[i] * data[i]; - } - double mean1 = sum1 / nbEle; - double mean2 = sum2 / nbEle; - - double sum3 = 0, sum4 = 0; - double sum = 0, prodSum = 0, relerr = 0; - - double maxpw_relerr = 0; - for (i = 0; i < nbEle; i++) { - if (Max < ori_data[i]) Max = ori_data[i]; - if (Min > ori_data[i]) Min = ori_data[i]; - - float err = fabs(data[i] - ori_data[i]); - if (ori_data[i] != 0) { - relerr = err / fabs(ori_data[i]); - if (maxpw_relerr < relerr) - maxpw_relerr = relerr; - } - - if (diffMax < err) - diffMax = err; - prodSum += (ori_data[i] - mean1) * (data[i] - mean2); - sum3 += (ori_data[i] - mean1) * (ori_data[i] - mean1); - sum4 += (data[i] - mean2) * (data[i] - mean2); - sum += err * err; - } - double std1 = sqrt(sum3 / nbEle); - double std2 = sqrt(sum4 / nbEle); - double ee = prodSum / nbEle; - double acEff = ee / std1 / std2; - - double mse = sum / nbEle; - double range = Max - Min; - double psnr = 20 * log10(range) - 10 * log10(mse); - double normErr = sqrt(sum); - double normErr_norm = normErr / sqrt(sum22); - double nrmse = sqrt(mse) / range; - double *result = (double *) malloc(sizeof(double) * 6); - result[0] = mse; - result[1] = range; - result[2] = psnr; - result[3] = normErr; - result[4] = normErr_norm; - result[5] = nrmse; - - return result; +// +// Created by Yafan Huang on 5/31/22. +// Copied from SZx. +// +#include +#include +#include +#include +#include +#include "cuSZp_utility.h" + +/*Macro Definition for Processing Data*/ +// #define SZ_SCES 0 //successful +#define RW_SCES 0 +#define RW_FERR 1 +#define RW_TERR 2 +#define LITTLE_ENDIAN_SYSTEM 0 +#define QCAT_BUFS 64 + +/*Global Varaibles for Processing Data*/ +int dataEndianType_Yafan = 0; +int sysEndianType_Yafan = 0; //0 means little endian, 1 means big endian + +typedef union lint32 +{ + int ivalue; + unsigned int uivalue; + unsigned char byte[4]; +} lint32; + +typedef union llfloat +{ + float value; + unsigned int ivalue; + unsigned char byte[4]; +} llfloat; + +/** ************************************************************************ + * @brief Reverse 4-bit-length unsigned char array. + * + * @param data[4] 4-bit-length unsigned char array. + * *********************************************************************** */ +void symTransForm_4Bytes(unsigned char data[4]) +{ + unsigned char tmp = data[0]; + data[0] = data[3]; + data[3] = tmp; + + tmp = data[1]; + data[1] = data[2]; + data[2] = tmp; +} + +/** ************************************************************************ + * @brief Read byte data from path to source binary format file. + * Usually used for decompressing data from input file. + * Variables byteLength and status can be obtained through this function. + * + * @param srcFilePath input source file path + * @param byteLength the length of byte array + * @param status data processing states (macro definitions) + * + * @return byteBuf unsigned char array with length byteLength + * *********************************************************************** */ +unsigned char *readByteData_Yafan(char *srcFilePath, size_t *byteLength, int *status) +{ + FILE *pFile = fopen(srcFilePath, "rb"); + if (pFile == NULL) + { + printf("Failed to open input file. 
1\n"); + *status = RW_FERR; + return 0; + } + fseek(pFile, 0, SEEK_END); + *byteLength = ftell(pFile); + fclose(pFile); + + unsigned char *byteBuf = ( unsigned char *)malloc((*byteLength)*sizeof(unsigned char)); //sizeof(char)==1 + + pFile = fopen(srcFilePath, "rb"); + if (pFile == NULL) + { + printf("Failed to open input file. 2\n"); + *status = RW_FERR; + return 0; + } + fread(byteBuf, 1, *byteLength, pFile); + fclose(pFile); + *status = RW_SCES; + return byteBuf; +} + +/** ************************************************************************ + * @brief Read float data from path to source binary format file in endian systems. + * Usually used for compressing data from input file. + * Variables nbEle and status can be obtained through this function. + * + * @param srcFilePath input source file path + * @param nbEle the length of float array + * @param status data processing states (macro definitions) + * + * @return daBuf float array with length nbEle + * *********************************************************************** */ +float *readFloatData_systemEndian_Yafan(char *srcFilePath, size_t *nbEle, int *status) +{ + size_t inSize; + FILE *pFile = fopen(srcFilePath, "rb"); + if (pFile == NULL) + { + printf("Failed to open input file. 1\n"); + *status = RW_FERR; + return NULL; + } + fseek(pFile, 0, SEEK_END); + inSize = ftell(pFile); + *nbEle = inSize/4; + fclose(pFile); + + if(inSize<=0) + { + printf("Error: input file is wrong!\n"); + *status = RW_FERR; + } + + float *daBuf = (float *)malloc(inSize); + + pFile = fopen(srcFilePath, "rb"); + if (pFile == NULL) + { + printf("Failed to open input file. 2\n"); + *status = RW_FERR; + return NULL; + } + fread(daBuf, 4, *nbEle, pFile); + fclose(pFile); + *status = RW_SCES; + return daBuf; +} + +/** ************************************************************************ + * @brief Read float data from path to source binary format file. + * Usually used for compressing data from input file. + * Variables nbEle and status can be obtained through this function. + * + * @param srcFilePath input source file path + * @param nbEle the length of float array + * @param status data processing states (macro definitions) + * + * @return daBuf float array with length nbEle + * *********************************************************************** */ +float *readFloatData_Yafan(char *srcFilePath, size_t *nbEle, int *status) +{ + int state = RW_SCES; + if(dataEndianType_Yafan==sysEndianType_Yafan) + { + float *daBuf = readFloatData_systemEndian_Yafan(srcFilePath, nbEle, &state); + *status = state; + return daBuf; + } + else + { + size_t i,j; + + size_t byteLength; + unsigned char* bytes = readByteData_Yafan(srcFilePath, &byteLength, &state); + if(state == RW_FERR) + { + *status = RW_FERR; + return NULL; + } + float *daBuf = (float *)malloc(byteLength); + *nbEle = byteLength/4; + + llfloat buf; + for(i = 0;i<*nbEle;i++) + { + j = i*4; + memcpy(buf.byte, bytes+j, 4); + symTransForm_4Bytes(buf.byte); + daBuf[i] = buf.value; + } + free(bytes); + return daBuf; + } +} + +/** ************************************************************************ + * @brief Write byte data to binary format file. + * Usually used for writing compressed data. + * Variable status can be obtained/switched through this function. 
+ * + * @param bytes unsigned char array (compressed data) + * @param byteLength the length of unsigned char array + * @param tgtFilePath output file path + * @param status data processing states (macro definitions) + * *********************************************************************** */ +void writeByteData_Yafan(unsigned char *bytes, size_t byteLength, char *tgtFilePath, int *status) +{ + FILE *pFile = fopen(tgtFilePath, "wb"); + if (pFile == NULL) + { + printf("Failed to open input file. 3\n"); + *status = RW_FERR; + return; + } + + fwrite(bytes, 1, byteLength, pFile); //write outSize bytes + fclose(pFile); + *status = RW_SCES; +} + +/** ************************************************************************ + * @brief Write float data to binary format file. + * Usually used for writing decompressed (reconstructed) data. + * Variable status can be obtained/switched through this function. + * + * @param bytes unsigned char array (compressed data) + * @param nbEle the length of float array + * @param tgtFilePath output file path + * @param status data processing states (macro definitions) + * *********************************************************************** */ +void writeFloatData_inBytes_Yafan(float *data, size_t nbEle, char* tgtFilePath, int *status) +{ + size_t i = 0; + int state = RW_SCES; + llfloat buf; + unsigned char* bytes = (unsigned char*)malloc(nbEle*sizeof(float)); + for(i=0;idata[index]) + xMin=data[index]; + if(xMaxother[index]) + yMin=other[index]; + if(yMaxsize0) { + printf("ERROR: windowSize0 = %d > %zu\n", windowSize0, size0); + } + if(windowSize1>size1) { + printf("ERROR: windowSize1 = %d > %zu\n", windowSize1, size1); + } + if(windowSize2>size2) { + printf("ERROR: windowSize2 = %d > %zu\n", windowSize2, size2); + } + //offsetInc0=windowSize0/2; + //offsetInc1=windowSize1/2; + //offsetInc2=windowSize2/2; + offsetInc0=windowShift0; + offsetInc1=windowShift1; + offsetInc2=windowShift2; + for(offset2=0; offset2+windowSize2<=size2; offset2+=offsetInc2) { //MOVING WINDOW + for(offset1=0; offset1+windowSize1<=size1; offset1+=offsetInc1) { //MOVING WINDOW + for(offset0=0; offset0+windowSize0<=size0; offset0+=offsetInc0) { //MOVING WINDOW + nw++; + ssimSum+=SSIM_3d_calcWindow_float(oriData, decData, size1, size0, offset0, offset1, offset2, windowSize0, windowSize1, windowSize2); + } + } + } + return ssimSum/nw; +} + + +/** ************************************************************************ + * @brief Calculate PSNR between 3D original and decompressed (reconstructed) data. + * API for computing PSNR. + * + * @param nbEle the length of float array + * @param ori_data original float array + * @param dec_data decompressed (reconstructed) float array + * + * @return result 6-length double array, which contains: + * 0. *Mean Square Error (MSE)* + * 1. *Value Range (Max-Min)* + * 2. *Peak Signal-to-noise Ratio (PSNR)* + * 3. Squared Error + * 4. Normalized Squared Error + * 5. Normalized Squared MSE + * *********************************************************************** */ +double *computePSNR(size_t nbEle, float *ori_data, float *data) { + size_t i = 0; + double Max = 0, Min = 0, diffMax = 0; + Max = ori_data[0]; + Min = ori_data[0]; + diffMax = data[0] > ori_data[0] ? 
data[0] - ori_data[0] : ori_data[0] - data[0]; + + //diffMax = fabs(data[0] - ori_data[0]); + double sum1 = 0, sum2 = 0, sum22 = 0; + + for (i = 0; i < nbEle; i++) { + sum1 += ori_data[i]; + sum2 += data[i]; + sum22 += data[i] * data[i]; + } + double mean1 = sum1 / nbEle; + double mean2 = sum2 / nbEle; + + double sum3 = 0, sum4 = 0; + double sum = 0, prodSum = 0, relerr = 0; + + double maxpw_relerr = 0; + for (i = 0; i < nbEle; i++) { + if (Max < ori_data[i]) Max = ori_data[i]; + if (Min > ori_data[i]) Min = ori_data[i]; + + float err = fabs(data[i] - ori_data[i]); + if (ori_data[i] != 0) { + relerr = err / fabs(ori_data[i]); + if (maxpw_relerr < relerr) + maxpw_relerr = relerr; + } + + if (diffMax < err) + diffMax = err; + prodSum += (ori_data[i] - mean1) * (data[i] - mean2); + sum3 += (ori_data[i] - mean1) * (ori_data[i] - mean1); + sum4 += (data[i] - mean2) * (data[i] - mean2); + sum += err * err; + } + double std1 = sqrt(sum3 / nbEle); + double std2 = sqrt(sum4 / nbEle); + double ee = prodSum / nbEle; + double acEff = ee / std1 / std2; + + double mse = sum / nbEle; + double range = Max - Min; + double psnr = 20 * log10(range) - 10 * log10(mse); + double normErr = sqrt(sum); + double normErr_norm = normErr / sqrt(sum22); + double nrmse = sqrt(mse) / range; + double *result = (double *) malloc(sizeof(double) * 6); + result[0] = mse; + result[1] = range; + result[2] = psnr; + result[3] = normErr; + result[4] = normErr_norm; + result[5] = nrmse; + + return result; } \ No newline at end of file diff --git a/qtensor/compression/szp/src/cuSZp_wrapper.cu b/qtensor/compression/szp/src/cuSZp_wrapper.cu index 4d83f283..803dbbe1 100644 --- a/qtensor/compression/szp/src/cuSZp_wrapper.cu +++ b/qtensor/compression/szp/src/cuSZp_wrapper.cu @@ -1,37 +1,37 @@ -#include "cuSZp_entry.h" -#include "cuSZp_timer.h" -#include "cuSZp_utility.h" -#include "cuSZp.h" - - -extern "C"{ - /** Before entering SZp_compress, must allocate on device: - * - d_cmpBytes - */ - unsigned char* cuSZp_device_compress(float *oriData, size_t *outSize, float absErrBound, size_t nbEle){ - unsigned char *d_cmpBytes, *d_finalCmpBytes; - cudaStream_t stream; - cudaStreamCreate(&stream); - cudaMalloc((void**)&d_cmpBytes, sizeof(float)*nbEle); - SZp_compress_deviceptr(oriData, d_cmpBytes, nbEle, outSize, absErrBound, stream); - cudaMalloc((void**)&d_finalCmpBytes, *outSize); - cudaMemcpy(d_finalCmpBytes, d_cmpBytes, *outSize, cudaMemcpyDeviceToDevice); - cudaFree(d_cmpBytes); - //cudaFree(oriData); - return d_finalCmpBytes; - } - - /** Before entering SZp_decompress, must allocate on device: - * - d_decData - */ - float* cuSZp_device_decompress(size_t nbEle, unsigned char* cmpBytes, size_t cmpSize, float errorBound){ - float *d_decData; - cudaStream_t stream; - cudaStreamCreate(&stream); - cudaMalloc((void**)&d_decData, sizeof(float)*nbEle); - SZp_decompress_deviceptr(d_decData, cmpBytes, nbEle, cmpSize, errorBound, stream); - cudaFree(cmpBytes); - return d_decData; - } - -} +#include "cuSZp_entry.h" +#include "cuSZp_timer.h" +#include "cuSZp_utility.h" +#include "cuSZp.h" + + +extern "C"{ + /** Before entering SZp_compress, must allocate on device: + * - d_cmpBytes + */ + unsigned char* cuSZp_device_compress(float *oriData, size_t *outSize, float absErrBound, size_t nbEle){ + unsigned char *d_cmpBytes, *d_finalCmpBytes; + cudaStream_t stream; + cudaStreamCreate(&stream); + cudaMalloc((void**)&d_cmpBytes, sizeof(float)*nbEle); + SZp_compress_deviceptr(oriData, d_cmpBytes, nbEle, outSize, absErrBound, stream); + 
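[Editor's note] The six values returned by computePSNR above can be reproduced with a few NumPy reductions. A sketch for reference only, mirroring the formulas in the C code (mse, value range, PSNR, squared-error norm, normalized error, NRMSE):

    import numpy as np

    def psnr_metrics(ori, dec):
        ori = ori.astype(np.float64)
        dec = dec.astype(np.float64)
        err = dec - ori
        mse = np.mean(err ** 2)                              # result[0]
        value_range = ori.max() - ori.min()                  # result[1]
        psnr = 20 * np.log10(value_range) - 10 * np.log10(mse)  # result[2]
        norm_err = np.sqrt(np.sum(err ** 2))                 # result[3]
        norm_err_rel = norm_err / np.sqrt(np.sum(dec ** 2))  # result[4]
        nrmse = np.sqrt(mse) / value_range                   # result[5]
        return mse, value_range, psnr, norm_err, norm_err_rel, nrmse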
cudaMalloc((void**)&d_finalCmpBytes, *outSize); + cudaMemcpy(d_finalCmpBytes, d_cmpBytes, *outSize, cudaMemcpyDeviceToDevice); + cudaFree(d_cmpBytes); + //cudaFree(oriData); + return d_finalCmpBytes; + } + + /** Before entering SZp_decompress, must allocate on device: + * - d_decData + */ + float* cuSZp_device_decompress(size_t nbEle, unsigned char* cmpBytes, size_t cmpSize, float errorBound){ + float *d_decData; + cudaStream_t stream; + cudaStreamCreate(&stream); + cudaMalloc((void**)&d_decData, sizeof(float)*nbEle); + SZp_decompress_deviceptr(d_decData, cmpBytes, nbEle, cmpSize, errorBound, stream); + cudaFree(cmpBytes); + return d_decData; + } + +} diff --git a/qtensor/compression/szp/src/cuSZp_wrapper.py b/qtensor/compression/szp/src/cuSZp_wrapper.py index 6f4053ba..9abe1fb1 100644 --- a/qtensor/compression/szp/src/cuSZp_wrapper.py +++ b/qtensor/compression/szp/src/cuSZp_wrapper.py @@ -1,190 +1,190 @@ -import numpy as np -import ctypes -from ctypes import * -import random -from qtensor.tools.lazy_import import cupy as cp -import time -import torch - -from pathlib import Path -#LIB_PATH = str(Path(__file__).parent/'libcuszp_wrapper.so') -LIB_PATH = '/home/mkshah5/QTensor/qtensor/compression/szp/src/libcuszp_wrapper.so' -# unsigned char* cuSZp_device_compress(float *oriData, size_t *outSize, float absErrBound, size_t nbEle){ - -def get_device_compress(): - dll = ctypes.CDLL(LIB_PATH, mode=ctypes.RTLD_GLOBAL) - func = dll.cuSZp_device_compress - # Returns: unsigned char *bytes - # Needs: float *oriData, size_t *outSize, float absErrBound, size_t nbEle, int blockSize, float threshold - func.argtypes = [POINTER(c_float), POINTER(c_size_t), c_float, c_size_t] - func.restype = POINTER(c_ubyte) - return func - -# float* cuSZp_device_decompress(size_t nbEle, unsigned char* cmpBytes, size_t cmpSize, float errorBound){ - -def get_device_decompress(): - dll = ctypes.CDLL(LIB_PATH, mode=ctypes.RTLD_GLOBAL) - func = dll.cuSZp_device_decompress - # Returns: float *newData - # Needs: size_t nbEle, unsigned char *cmpBytes - func.argtypes = [c_size_t, POINTER(c_ubyte), c_size_t, c_float] - func.restype = POINTER(c_float) - return func - - - -def cuszp_device_compress(oriData, absErrBound, nbEle,threshold): - __cuszp_device_compress = get_device_compress() - - ori_nbEle = nbEle - variable = ctypes.c_size_t(0) - outSize = ctypes.pointer(variable) - - oriData = oriData.flatten() - #ori_real = oriData.real - #ori_imag = oriData.imag - #oriData = cp.concatenate((ori_real, ori_imag)) - #sample = oriData[::2] - - - d = cp.amax(oriData) - cp.amin(oriData) - #print("max min time (s): " +str(time.time()-v_time)) - d = d.get() - if d.dtype == np.complex64: - #d = min(d.real, d.imag) - d = d.real - absErrBound = absErrBound*(d) - threshold = threshold*(d) - s_1 = time.time() - #print(cp.get_array_module(oriData)) - truth_values = cp.absolute(oriData)<=threshold - #oriData[truth_values] = 0.0 - truth_values = cp.invert(truth_values) - # oriData = oriData[truth_values] - bitmap = truth_values - nbEle = oriData.shape[0]*2 - - - oriData_p = ctypes.cast(oriData.data.ptr, ctypes.POINTER(c_float)) - #print("starting") - # float *oriData, size_t *outSize, float absErrBound, size_t nbEle - o_bytes = __cuszp_device_compress(oriData_p, outSize,np.float32(absErrBound), np.ulonglong(nbEle)) - - mempool = cp.get_default_memory_pool() - pinned_mempool = cp.get_default_pinned_memory_pool() - #del oriData - - #print("tg and max time (s): "+str(time.time()-s_1)) - #print("bitmap shape: "+str(bitmap.shape[0])) - #print("percent 
nonzero bytes: "+str(bitmap[cp.nonzero(bitmap)].shape[0]/bitmap.shape[0])) - #print("CR") - #print((ori_nbEle*4)/(outSize[0] + bitmap.shape[0]/8)) - return (o_bytes,bitmap, absErrBound), outSize - - -def cuszp_device_decompress(nbEle, cmpBytes, cmpSize, owner, dtype): - __cuszp_device_decompress=get_device_decompress() - (cmpBytes, bitmap, absErrBound) = cmpBytes - #print("bitmap len:" +str(len(bitmap))) - #print(nbEle) - #tmp_nbEle = nbEle - # tmp_nbEle = cp.count_nonzero(bitmap).item() -# print(tmp_nbEle) - nbEle_p = ctypes.c_size_t(nbEle) - # size_t nbEle, unsigned char* cmpBytes, size_t cmpSize, float errorBound - newData = __cuszp_device_decompress(nbEle_p,cmpBytes, np.ulonglong(cmpSize), np.float32(absErrBound)) - - # decompressed_ptr = self.cuszp_decompress(isCuPy, cmp_bytes, num_elements_eff) - # -- Workaround to convert GPU pointer to int - p_decompressed_ptr = ctypes.addressof(newData) - # cast to int64 pointer - # (effectively converting pointer to pointer to addr to pointer to int64) - p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) - decompressed_int = p_decompressed_int.contents - # -- - pointer_for_free = decompressed_int.value - # self.decompressed_own.append(decompressed_int.value) - mem = cp.cuda.UnownedMemory(decompressed_int.value, nbEle, owner, device_id=0) - mem_ptr = cp.cuda.memory.MemoryPointer(mem, 0) - #print("mem ptr") - #print(mem_ptr) - arr = cp.ndarray(shape=nbEle, dtype=cp.float32, memptr=mem_ptr) -# print("attempt alloc") - # res = cp.zeros(nbEle,dtype=cp.float32) -# print("alloc passed") - ## need to convert newData to cupy - # cp.putmask(res,bitmap,arr) - mempool = cp.get_default_memory_pool() - pinned_mempool = cp.get_default_pinned_memory_pool() - #del arr - - #print(res[0]) - #print(res[int(nbEle/2)]) - #reshaped_data = arr.reshape(-1,2) - reshaped_data = arr.reshape(-1,2) - #c_res = arr - c_res = reshaped_data.view(dtype=np.complex64) - #print(c_res[0]) - #c_res = cp.zeros(int(nbEle/2), np.complex64) - #c_res.real = res[0:int(nbEle/2)] - #c_res.imag = res[int(nbEle/2):] - #del res - #del bitmap - #mempool.free_all_blocks() - #pinned_mempool.free_all_blocks() - - return (c_res, pointer_for_free) - -### Example of device compress/decompress wrapper usage -class Comp(): - def __init__(self): - self.name = "dummy" - -if __name__ == "__main__": - - DATA_SIZE = int(1024) - MAX_D = 10.0 - MIN_D = -10.0 - RANGE = MAX_D - MIN_D - r2r_threshold = 0.002 - r2r_error = 0.0001 - - in_vector = np.fromfile("real_sample.bin", dtype=np.float32) - #print(np.max(in_vector)) - DATA_SIZE = len(in_vector) - #range_vr = np.max(in_vector)-np.min(in_vector) - #r2r_threshold = r2r_threshold*range_vr - #r2r_error = r2r_error*range_vr - #in_vector = np.zeros((DATA_SIZE,)) - #for i in range(0,int(DATA_SIZE/4)): - # in_vector[i] = 0.0 - #for i in range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): - # in_vector[i] = 5.0 - #for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): - # in_vector[i] = random.uniform(MIN_D, MAX_D) - #for i in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): - # in_vector[i] = -7.0 - #for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): - # in_vector[i] = 0.001 - - print(DATA_SIZE) - in_vector = in_vector.astype('float32') - in_vector_gpu = cp.asarray(in_vector) - - # variable = ctypes.c_size_t(0) - # outSize = ctypes.pointer(variable) - for i in range(30): - s_time = time.time() - o_bytes, outSize = cuszp_device_compress(in_vector_gpu, r2r_error, DATA_SIZE,r2r_threshold) - print("Time python: "+str(time.time()-s_time)) - 
print(outSize[0]) - print("Compress Success...starting decompress ") - comp = Comp() - - s_time = time.time() - (d_bytes,ptr )= cuszp_device_decompress(DATA_SIZE, o_bytes,outSize[0], comp, in_vector_gpu.dtype) - - print("Time python: "+str(time.time()-s_time)) - #for i in d_bytes: - # print(i) - print("Decompress Success") +import numpy as np +import ctypes +from ctypes import * +import random +from qtensor.tools.lazy_import import cupy as cp +import time +import torch + +from pathlib import Path +#LIB_PATH = str(Path(__file__).parent/'libcuszp_wrapper.so') +LIB_PATH = '/home/mkshah5/QTensor/qtensor/compression/szp/src/libcuszp_wrapper.so' +# unsigned char* cuSZp_device_compress(float *oriData, size_t *outSize, float absErrBound, size_t nbEle){ + +def get_device_compress(): + dll = ctypes.CDLL(LIB_PATH, mode=ctypes.RTLD_GLOBAL) + func = dll.cuSZp_device_compress + # Returns: unsigned char *bytes + # Needs: float *oriData, size_t *outSize, float absErrBound, size_t nbEle, int blockSize, float threshold + func.argtypes = [POINTER(c_float), POINTER(c_size_t), c_float, c_size_t] + func.restype = POINTER(c_ubyte) + return func + +# float* cuSZp_device_decompress(size_t nbEle, unsigned char* cmpBytes, size_t cmpSize, float errorBound){ + +def get_device_decompress(): + dll = ctypes.CDLL(LIB_PATH, mode=ctypes.RTLD_GLOBAL) + func = dll.cuSZp_device_decompress + # Returns: float *newData + # Needs: size_t nbEle, unsigned char *cmpBytes + func.argtypes = [c_size_t, POINTER(c_ubyte), c_size_t, c_float] + func.restype = POINTER(c_float) + return func + + + +def cuszp_device_compress(oriData, absErrBound, nbEle,threshold): + __cuszp_device_compress = get_device_compress() + + ori_nbEle = nbEle + variable = ctypes.c_size_t(0) + outSize = ctypes.pointer(variable) + + oriData = oriData.flatten() + #ori_real = oriData.real + #ori_imag = oriData.imag + #oriData = cp.concatenate((ori_real, ori_imag)) + #sample = oriData[::2] + + + d = cp.amax(oriData) - cp.amin(oriData) + #print("max min time (s): " +str(time.time()-v_time)) + d = d.get() + if d.dtype == np.complex64: + #d = min(d.real, d.imag) + d = d.real + absErrBound = absErrBound*(d) + threshold = threshold*(d) + s_1 = time.time() + #print(cp.get_array_module(oriData)) + truth_values = cp.absolute(oriData)<=threshold + #oriData[truth_values] = 0.0 + truth_values = cp.invert(truth_values) + # oriData = oriData[truth_values] + bitmap = truth_values + nbEle = oriData.shape[0]*2 + + + oriData_p = ctypes.cast(oriData.data.ptr, ctypes.POINTER(c_float)) + #print("starting") + # float *oriData, size_t *outSize, float absErrBound, size_t nbEle + o_bytes = __cuszp_device_compress(oriData_p, outSize,np.float32(absErrBound), np.ulonglong(nbEle)) + + mempool = cp.get_default_memory_pool() + pinned_mempool = cp.get_default_pinned_memory_pool() + #del oriData + + #print("tg and max time (s): "+str(time.time()-s_1)) + #print("bitmap shape: "+str(bitmap.shape[0])) + #print("percent nonzero bytes: "+str(bitmap[cp.nonzero(bitmap)].shape[0]/bitmap.shape[0])) + #print("CR") + #print((ori_nbEle*4)/(outSize[0] + bitmap.shape[0]/8)) + return (o_bytes,bitmap, absErrBound), outSize + + +def cuszp_device_decompress(nbEle, cmpBytes, cmpSize, owner, dtype): + __cuszp_device_decompress=get_device_decompress() + (cmpBytes, bitmap, absErrBound) = cmpBytes + #print("bitmap len:" +str(len(bitmap))) + #print(nbEle) + #tmp_nbEle = nbEle + # tmp_nbEle = cp.count_nonzero(bitmap).item() +# print(tmp_nbEle) + nbEle_p = ctypes.c_size_t(nbEle) + # size_t nbEle, unsigned char* cmpBytes, size_t 
cmpSize, float errorBound + newData = __cuszp_device_decompress(nbEle_p,cmpBytes, np.ulonglong(cmpSize), np.float32(absErrBound)) + + # decompressed_ptr = self.cuszp_decompress(isCuPy, cmp_bytes, num_elements_eff) + # -- Workaround to convert GPU pointer to int + p_decompressed_ptr = ctypes.addressof(newData) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + decompressed_int = p_decompressed_int.contents + # -- + pointer_for_free = decompressed_int.value + # self.decompressed_own.append(decompressed_int.value) + mem = cp.cuda.UnownedMemory(decompressed_int.value, nbEle, owner, device_id=0) + mem_ptr = cp.cuda.memory.MemoryPointer(mem, 0) + #print("mem ptr") + #print(mem_ptr) + arr = cp.ndarray(shape=nbEle, dtype=cp.float32, memptr=mem_ptr) +# print("attempt alloc") + # res = cp.zeros(nbEle,dtype=cp.float32) +# print("alloc passed") + ## need to convert newData to cupy + # cp.putmask(res,bitmap,arr) + mempool = cp.get_default_memory_pool() + pinned_mempool = cp.get_default_pinned_memory_pool() + #del arr + + #print(res[0]) + #print(res[int(nbEle/2)]) + #reshaped_data = arr.reshape(-1,2) + reshaped_data = arr.reshape(-1,2) + #c_res = arr + c_res = reshaped_data.view(dtype=np.complex64) + #print(c_res[0]) + #c_res = cp.zeros(int(nbEle/2), np.complex64) + #c_res.real = res[0:int(nbEle/2)] + #c_res.imag = res[int(nbEle/2):] + #del res + #del bitmap + #mempool.free_all_blocks() + #pinned_mempool.free_all_blocks() + + return (c_res, pointer_for_free) + +### Example of device compress/decompress wrapper usage +class Comp(): + def __init__(self): + self.name = "dummy" + +if __name__ == "__main__": + + DATA_SIZE = int(1024) + MAX_D = 10.0 + MIN_D = -10.0 + RANGE = MAX_D - MIN_D + r2r_threshold = 0.002 + r2r_error = 0.0001 + + in_vector = np.fromfile("real_sample.bin", dtype=np.float32) + #print(np.max(in_vector)) + DATA_SIZE = len(in_vector) + #range_vr = np.max(in_vector)-np.min(in_vector) + #r2r_threshold = r2r_threshold*range_vr + #r2r_error = r2r_error*range_vr + #in_vector = np.zeros((DATA_SIZE,)) + #for i in range(0,int(DATA_SIZE/4)): + # in_vector[i] = 0.0 + #for i in range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): + # in_vector[i] = 5.0 + #for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): + # in_vector[i] = random.uniform(MIN_D, MAX_D) + #for i in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): + # in_vector[i] = -7.0 + #for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): + # in_vector[i] = 0.001 + + print(DATA_SIZE) + in_vector = in_vector.astype('float32') + in_vector_gpu = cp.asarray(in_vector) + + # variable = ctypes.c_size_t(0) + # outSize = ctypes.pointer(variable) + for i in range(30): + s_time = time.time() + o_bytes, outSize = cuszp_device_compress(in_vector_gpu, r2r_error, DATA_SIZE,r2r_threshold) + print("Time python: "+str(time.time()-s_time)) + print(outSize[0]) + print("Compress Success...starting decompress ") + comp = Comp() + + s_time = time.time() + (d_bytes,ptr )= cuszp_device_decompress(DATA_SIZE, o_bytes,outSize[0], comp, in_vector_gpu.dtype) + + print("Time python: "+str(time.time()-s_time)) + #for i in d_bytes: + # print(i) + print("Decompress Success") diff --git a/qtensor/compression/torch_quant/torch_quant.py b/qtensor/compression/torch_quant/torch_quant.py index c5f04fc6..bbea4657 100644 --- a/qtensor/compression/torch_quant/torch_quant.py +++ b/qtensor/compression/torch_quant/torch_quant.py @@ -1,174 +1,174 @@ -import numpy 
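[Editor's note] The decompress wrapper above receives a raw device pointer from the shared library and turns it into a CuPy array without copying. A condensed sketch of that workaround, with illustrative names; the size argument is given in bytes here:

    import ctypes
    import cupy as cp

    def wrap_device_pointer(c_float_ptr, n_elements, owner):
        # Reinterpret the ctypes POINTER(c_float) object as a 64-bit device address.
        addr = ctypes.cast(ctypes.addressof(c_float_ptr),
                           ctypes.POINTER(ctypes.c_uint64)).contents.value
        # Build a CuPy view over memory that CuPy does not own (no copy, no free).
        mem = cp.cuda.UnownedMemory(addr, n_elements * 4, owner, device_id=0)
        memptr = cp.cuda.memory.MemoryPointer(mem, 0)
        return cp.ndarray(shape=(n_elements,), dtype=cp.float32, memptr=memptr), addr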
as np -import ctypes -from ctypes import * -import random -from qtensor.tools.lazy_import import cupy as cp -import time -import torch - -from pathlib import Path - - - -def quant_device_compress(oriData, nbEle, blockSize,threshold): - #print(nbEle) - ori_nbEle = nbEle - variable = ctypes.c_size_t(0) - outSize = ctypes.pointer(variable) - - oriData = oriData.flatten() - ori_real = oriData.real - ori_imag = oriData.imag - oriData = cp.concatenate((ori_real, ori_imag)) - sample = oriData[::2] - max_val = cp.amax(oriData).get() - min_val = cp.amin(oriData).get() - d = max_val - min_val - if d.dtype == np.complex64: - d = d.real - threshold = threshold*(d) - s_1 = time.time() - truth_values = abs(oriData)<=threshold - oriData[truth_values] = 0.0 - truth_values = cp.invert(truth_values) - ori_len = oriData.shape[0] - nonzero_percent = cp.count_nonzero(oriData)/oriData.shape[0] - print("Percent nonzero: "+str(nonzero_percent)) - - isGrouped = False - if nonzero_percent<=0.5: - isGrouped=True - oriData = oriData[truth_values] - - nbEle = oriData.shape[0] - - # oriData = cp.reshape(oriData, (-1, blockSize)) # Reshape to blocksize - tensor = torch.as_tensor(oriData, device='cuda') - # print("Min val: "+str(cp.amin(oriData).get())+" range: "+str(d)) -# scale = d/255.0 -# zero_point = -1*round(min_val*scale) - 128 - - scale = d/((2**8) - 1) - #zero_point = -1*round(min_val*scale) - zero_point = -1*round(min_val*scale)+32 -# q_tensor = torch.quantize_per_tensor(tensor, scale, zero_point, dtype=torch.qint8) - - q_tensor = torch.quantize_per_tensor(tensor, scale, zero_point, dtype=torch.qint8) - del tensor - torch.cuda.empty_cache() - if isGrouped: - bitmap = cp.packbits(truth_values) - else: - bitmap = None - del truth_values - #q_ten2 = torch.dequantize(q_tensor) - #print(tensor) - #print(q_ten2) - #print("Max PW error") - #print(torch.max(torch.div(torch.abs(torch.sub(tensor[tensor!=0.0],q_ten2[tensor!=0.0])),tensor[tensor!=0.0]))) - return (q_tensor, bitmap, isGrouped), (nbEle/4)+(ori_len/8) - - -def quant_device_decompress(nbEle, cmpBytes, owner, dtype): - (q_tensor, bitmap, isGrouped) = cmpBytes - if isGrouped: - bitmap = cp.unpackbits(bitmap) - restored = torch.dequantize(q_tensor) - arr = cp.asarray(restored) - # uint8_t* cmpbytes, size_t len, size_t compressed_len, float r2r_error - - # decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, num_elements_eff) - # -- Workaround to convert GPU pointer to int - # p_decompressed_ptr = ctypes.addressof(newData) - # cast to int64 pointer - # (effectively converting pointer to pointer to addr to pointer to int64) - # p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) - # decompressed_int = p_decompressed_int.contents - # # -- - # pointer_for_free = decompressed_int.value - # # self.decompressed_own.append(decompressed_int.value) - # mem = cp.cuda.UnownedMemory(decompressed_int.value, nbEle*4, owner, device_id=0) - # mem_ptr = cp.cuda.memory.MemoryPointer(mem, 0) - #print("mem ptr") - #print(mem_ptr) - # arr = cp.ndarray(shape=(nbEle,), dtype=np.float32, memptr=mem_ptr) - #print(nbEle) - if isGrouped: - res = cp.zeros((nbEle,)) - # ## need to convert newData to cupy - cp.place(res,bitmap,arr) - - c_res = cp.zeros(int(nbEle/2), np.complex64) - #c_res.real = arr[0:int(nbEle/2)] - #c_res.imag = arr[int(nbEle/2):] - - c_res.real = res[0:int(nbEle/2)] - c_res.imag = res[int(nbEle/2):] - else: - c_res = cp.zeros(int(nbEle/2), np.complex64) - c_res.real = arr[0:int(nbEle/2)] - c_res.imag = arr[int(nbEle/2):] - return 
(c_res, None) - -### Example of device compress/decompress wrapper usage -class Comp(): - def __init__(self): - self.name = "dummy" - -def free_compressed(ptr): - p_ptr = ctypes.addressof(ptr) - p_int = ctypes.cast(p_ptr, ctypes.POINTER(ctypes.c_uint64)) - decomp_int = p_int.contents - cp.cuda.runtime.free(decomp_int.value) - - -if __name__ == "__main__": - - DATA_SIZE = int(1024) - MAX_D = 10.0 - MIN_D = -10.0 - RANGE = MAX_D - MIN_D - r2r_threshold = 0.002 - r2r_error = 0.0001 - - in_vector = np.fromfile("all_sample.bin", dtype=np.complex64) - #print(np.max(in_vector)) - DATA_SIZE = len(in_vector) - #range_vr = np.max(in_vector)-np.min(in_vector) - #r2r_threshold = r2r_threshold*range_vr - #r2r_error = r2r_error*range_vr - #in_vector = np.zeros((DATA_SIZE,)) - #for i in range(0,int(DATA_SIZE/4)): - # in_vector[i] = 0.0 - #for i in range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): - # in_vector[i] = 5.0 - #for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): - # in_vector[i] = random.uniform(MIN_D, MAX_D) - #for i in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): - # in_vector[i] = -7.0 - #for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): - # in_vector[i] = 0.001 - - print(DATA_SIZE) - #in_vector = in_vector.astype('float32') - in_vector_gpu = cp.asarray(in_vector) - - # variable = ctypes.c_size_t(0) - # outSize = ctypes.pointer(variable) - for i in range(200): - s_time = time.time() - o_bytes, outSize = quant_device_compress(in_vector_gpu, DATA_SIZE, 256, r2r_threshold) - print("Time python: "+str(time.time()-s_time)) - # print(outSize[0]) - print("Compress Success...starting decompress ") - comp = Comp() - - s_time = time.time() - (d_bytes,ptr )= quant_device_decompress(DATA_SIZE*2, o_bytes, comp, in_vector_gpu.dtype) - - # free_compressed(o_bytes[0]) - # cp.cuda.runtime.free(ptr) - print("Time python: "+str(time.time()-s_time)) - #for i in d_bytes: - # print(i) - print("Decompress Success") +import numpy as np +import ctypes +from ctypes import * +import random +from qtensor.tools.lazy_import import cupy as cp +import time +import torch + +from pathlib import Path + + + +def quant_device_compress(oriData, nbEle, blockSize,threshold): + #print(nbEle) + ori_nbEle = nbEle + variable = ctypes.c_size_t(0) + outSize = ctypes.pointer(variable) + + oriData = oriData.flatten() + ori_real = oriData.real + ori_imag = oriData.imag + oriData = cp.concatenate((ori_real, ori_imag)) + sample = oriData[::2] + max_val = cp.amax(oriData).get() + min_val = cp.amin(oriData).get() + d = max_val - min_val + if d.dtype == np.complex64: + d = d.real + threshold = threshold*(d) + s_1 = time.time() + truth_values = abs(oriData)<=threshold + oriData[truth_values] = 0.0 + truth_values = cp.invert(truth_values) + ori_len = oriData.shape[0] + nonzero_percent = cp.count_nonzero(oriData)/oriData.shape[0] + print("Percent nonzero: "+str(nonzero_percent)) + + isGrouped = False + if nonzero_percent<=0.5: + isGrouped=True + oriData = oriData[truth_values] + + nbEle = oriData.shape[0] + + # oriData = cp.reshape(oriData, (-1, blockSize)) # Reshape to blocksize + tensor = torch.as_tensor(oriData, device='cuda') + # print("Min val: "+str(cp.amin(oriData).get())+" range: "+str(d)) +# scale = d/255.0 +# zero_point = -1*round(min_val*scale) - 128 + + scale = d/((2**8) - 1) + #zero_point = -1*round(min_val*scale) + zero_point = -1*round(min_val*scale)+32 +# q_tensor = torch.quantize_per_tensor(tensor, scale, zero_point, dtype=torch.qint8) + + q_tensor = torch.quantize_per_tensor(tensor, scale, zero_point, dtype=torch.qint8) + del 
tensor + torch.cuda.empty_cache() + if isGrouped: + bitmap = cp.packbits(truth_values) + else: + bitmap = None + del truth_values + #q_ten2 = torch.dequantize(q_tensor) + #print(tensor) + #print(q_ten2) + #print("Max PW error") + #print(torch.max(torch.div(torch.abs(torch.sub(tensor[tensor!=0.0],q_ten2[tensor!=0.0])),tensor[tensor!=0.0]))) + return (q_tensor, bitmap, isGrouped), (nbEle/4)+(ori_len/8) + + +def quant_device_decompress(nbEle, cmpBytes, owner, dtype): + (q_tensor, bitmap, isGrouped) = cmpBytes + if isGrouped: + bitmap = cp.unpackbits(bitmap) + restored = torch.dequantize(q_tensor) + arr = cp.asarray(restored) + # uint8_t* cmpbytes, size_t len, size_t compressed_len, float r2r_error + + # decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, num_elements_eff) + # -- Workaround to convert GPU pointer to int + # p_decompressed_ptr = ctypes.addressof(newData) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + # p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + # decompressed_int = p_decompressed_int.contents + # # -- + # pointer_for_free = decompressed_int.value + # # self.decompressed_own.append(decompressed_int.value) + # mem = cp.cuda.UnownedMemory(decompressed_int.value, nbEle*4, owner, device_id=0) + # mem_ptr = cp.cuda.memory.MemoryPointer(mem, 0) + #print("mem ptr") + #print(mem_ptr) + # arr = cp.ndarray(shape=(nbEle,), dtype=np.float32, memptr=mem_ptr) + #print(nbEle) + if isGrouped: + res = cp.zeros((nbEle,)) + # ## need to convert newData to cupy + cp.place(res,bitmap,arr) + + c_res = cp.zeros(int(nbEle/2), np.complex64) + #c_res.real = arr[0:int(nbEle/2)] + #c_res.imag = arr[int(nbEle/2):] + + c_res.real = res[0:int(nbEle/2)] + c_res.imag = res[int(nbEle/2):] + else: + c_res = cp.zeros(int(nbEle/2), np.complex64) + c_res.real = arr[0:int(nbEle/2)] + c_res.imag = arr[int(nbEle/2):] + return (c_res, None) + +### Example of device compress/decompress wrapper usage +class Comp(): + def __init__(self): + self.name = "dummy" + +def free_compressed(ptr): + p_ptr = ctypes.addressof(ptr) + p_int = ctypes.cast(p_ptr, ctypes.POINTER(ctypes.c_uint64)) + decomp_int = p_int.contents + cp.cuda.runtime.free(decomp_int.value) + + +if __name__ == "__main__": + + DATA_SIZE = int(1024) + MAX_D = 10.0 + MIN_D = -10.0 + RANGE = MAX_D - MIN_D + r2r_threshold = 0.002 + r2r_error = 0.0001 + + in_vector = np.fromfile("all_sample.bin", dtype=np.complex64) + #print(np.max(in_vector)) + DATA_SIZE = len(in_vector) + #range_vr = np.max(in_vector)-np.min(in_vector) + #r2r_threshold = r2r_threshold*range_vr + #r2r_error = r2r_error*range_vr + #in_vector = np.zeros((DATA_SIZE,)) + #for i in range(0,int(DATA_SIZE/4)): + # in_vector[i] = 0.0 + #for i in range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): + # in_vector[i] = 5.0 + #for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): + # in_vector[i] = random.uniform(MIN_D, MAX_D) + #for i in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): + # in_vector[i] = -7.0 + #for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): + # in_vector[i] = 0.001 + + print(DATA_SIZE) + #in_vector = in_vector.astype('float32') + in_vector_gpu = cp.asarray(in_vector) + + # variable = ctypes.c_size_t(0) + # outSize = ctypes.pointer(variable) + for i in range(200): + s_time = time.time() + o_bytes, outSize = quant_device_compress(in_vector_gpu, DATA_SIZE, 256, r2r_threshold) + print("Time python: "+str(time.time()-s_time)) + # print(outSize[0]) + print("Compress Success...starting decompress ") 
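[Editor's note] quant_device_compress above combines two steps: values below the threshold are dropped and remembered in a packed bitmap, and the survivors are quantized to 8 bits with a single scale and zero point. A stripped-down round-trip sketch of that scheme, assuming a real float32 CuPy input; the 255-level scale follows the code above, other details are simplified:

    import cupy as cp
    import torch

    def quantize_roundtrip(values_gpu, threshold):
        keep = cp.abs(values_gpu) > threshold        # mask of retained entries
        bitmap = cp.packbits(keep)                   # 1 bit per original element
        kept = values_gpu[keep]
        t = torch.as_tensor(kept, device='cuda')
        scale = float((kept.max() - kept.min()) / 255.0)
        q = torch.quantize_per_tensor(t, scale, 0, dtype=torch.qint8)

        # Reconstruction: dequantize, then scatter back into a dense array.
        restored = cp.asarray(torch.dequantize(q))
        dense = cp.zeros(values_gpu.shape[0], dtype=cp.float32)
        mask = cp.unpackbits(bitmap)[:values_gpu.shape[0]]
        cp.place(dense, mask, restored)
        return dense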
+ comp = Comp() + + s_time = time.time() + (d_bytes,ptr )= quant_device_decompress(DATA_SIZE*2, o_bytes, comp, in_vector_gpu.dtype) + + # free_compressed(o_bytes[0]) + # cp.cuda.runtime.free(ptr) + print("Time python: "+str(time.time()-s_time)) + #for i in d_bytes: + # print(i) + print("Decompress Success") diff --git a/qtensor/compression/torch_quant/torch_quant_perchannel.py b/qtensor/compression/torch_quant/torch_quant_perchannel.py index a41606b2..24cf703e 100644 --- a/qtensor/compression/torch_quant/torch_quant_perchannel.py +++ b/qtensor/compression/torch_quant/torch_quant_perchannel.py @@ -1,203 +1,203 @@ -import numpy as np -import ctypes -from ctypes import * -import random -from qtensor.tools.lazy_import import cupy as cp -import time -import torch - -from pathlib import Path - -BS = 32 - -def quant_device_compress(oriData, nbEle, blockSize,threshold): - #print(nbEle) - ori_nbEle = nbEle - variable = ctypes.c_size_t(0) - outSize = ctypes.pointer(variable) - - oriData = oriData.flatten() - ori_real = oriData.real - ori_imag = oriData.imag - oriData = cp.concatenate((ori_real, ori_imag)) - sample = oriData[::2] - max_val = cp.amax(oriData).get() - min_val = cp.amin(oriData).get() - d = max_val - min_val - if d.dtype == np.complex64: - d = d.real - threshold = threshold*(d) - s_1 = time.time() - truth_values = abs(oriData)<=threshold - oriData[truth_values] = 0.0 - truth_values = cp.invert(truth_values) - ori_len = oriData.shape[0] - nonzero_percent = cp.count_nonzero(oriData)/oriData.shape[0] - print("Percent nonzero: "+str(nonzero_percent)) - - isGrouped = False - if nonzero_percent<=0.5: - isGrouped=True - oriData = oriData[truth_values] - - nbEle = oriData.shape[0] - - # oriData = cp.reshape(oriData, (-1, blockSize)) # Reshape to blocksize - tensor = torch.as_tensor(oriData, device='cuda') - # print("Min val: "+str(cp.amin(oriData).get())+" range: "+str(d)) -# scale = d/255.0 -# zero_point = -1*round(min_val*scale) - 128 - if isGrouped: - pad_rows = int(nbEle/BS) - if nbEle%BS != 0: - pad_rows +=1 - - padded = torch.zeros(pad_rows*BS, device='cuda') - padded[:nbEle] = tensor - tensor = padded - tensor = torch.reshape(tensor, (-1, BS)) - maxs = torch.flatten(torch.max(tensor, dim=1)[0]) - mins = torch.flatten(torch.min(tensor, dim=1)[0]) - - #scales = torch.ones(tensor.shape[0], device='cuda') - #scales = torch.mul(scales, d/255.0) - #print(d) - #print(torch.max(torch.sub(maxs,mins))) - scales = torch.abs(torch.sub(maxs,mins))/127.0 - zero_points = torch.zeros(tensor.shape[0], device='cuda') - #zero_points = torch.round(torch.div(torch.add(maxs,mins)/2,scales)) - #zero_points = torch.neg(torch.round(torch.div(mins,scales)))+64 - - #print(zero_points) - - #scale = d/((2**8) - 1) - #zero_point = -1*round(min_val*scale) - #zero_point = -1*round(min_val*scale)+32 -# q_tensor = torch.quantize_per_tensor(tensor, scale, zero_point, dtype=torch.qint8) - #tensor = torch.flatten(tensor) - #tensor = torch.split(tensor, BS) - #print(maxs) - #print(mins) - #print(scales) - - q_tensor = torch.quantize_per_channel(tensor, scales, zero_points,0, dtype=torch.qint8) - #q_tensor = torch.quantize_per_tensor(tensor, scale, zero_point, dtype=torch.qint8) - del tensor - torch.cuda.empty_cache() - if isGrouped: - bitmap = cp.packbits(truth_values) - else: - bitmap = None - del truth_values - #q_ten2 = torch.dequantize(q_tensor) - #print(tensor) - #print(q_ten2) - #print("Max PW error") - #print(torch.max(torch.div(torch.abs(torch.sub(tensor[tensor!=0.0],q_ten2[tensor!=0.0])),tensor[tensor!=0.0]))) - 
return (q_tensor, bitmap, isGrouped), (nbEle/2)+(ori_len/8) - - -def quant_device_decompress(nbEle, cmpBytes, owner, dtype): - (q_tensor, bitmap, isGrouped) = cmpBytes - if isGrouped: - bitmap = cp.unpackbits(bitmap) - restored = torch.flatten(torch.dequantize(q_tensor)) - - arr = cp.asarray(restored) - # uint8_t* cmpbytes, size_t len, size_t compressed_len, float r2r_error - - # decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, num_elements_eff) - # -- Workaround to convert GPU pointer to int - # p_decompressed_ptr = ctypes.addressof(newData) - # cast to int64 pointer - # (effectively converting pointer to pointer to addr to pointer to int64) - # p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) - # decompressed_int = p_decompressed_int.contents - # # -- - # pointer_for_free = decompressed_int.value - # # self.decompressed_own.append(decompressed_int.value) - # mem = cp.cuda.UnownedMemory(decompressed_int.value, nbEle*4, owner, device_id=0) - # mem_ptr = cp.cuda.memory.MemoryPointer(mem, 0) - #print("mem ptr") - #print(mem_ptr) - # arr = cp.ndarray(shape=(nbEle,), dtype=np.float32, memptr=mem_ptr) - #print(nbEle) - if isGrouped: - res = cp.zeros((nbEle,)) - # ## need to convert newData to cupy - cp.place(res,bitmap,arr) - - c_res = cp.zeros(int(nbEle/2), np.complex64) - #c_res.real = arr[0:int(nbEle/2)] - #c_res.imag = arr[int(nbEle/2):] - - c_res.real = res[0:int(nbEle/2)] - c_res.imag = res[int(nbEle/2):] - else: - c_res = cp.zeros(int(nbEle/2), np.complex64) - c_res.real = arr[0:int(nbEle/2)] - c_res.imag = arr[int(nbEle/2):] - return (c_res, None) - -### Example of device compress/decompress wrapper usage -class Comp(): - def __init__(self): - self.name = "dummy" - -def free_compressed(ptr): - p_ptr = ctypes.addressof(ptr) - p_int = ctypes.cast(p_ptr, ctypes.POINTER(ctypes.c_uint64)) - decomp_int = p_int.contents - cp.cuda.runtime.free(decomp_int.value) - - -if __name__ == "__main__": - - DATA_SIZE = int(1024) - MAX_D = 10.0 - MIN_D = -10.0 - RANGE = MAX_D - MIN_D - r2r_threshold = 0.002 - r2r_error = 0.0001 - - in_vector = np.fromfile("all_sample.bin", dtype=np.complex64) - #print(np.max(in_vector)) - DATA_SIZE = len(in_vector) - #range_vr = np.max(in_vector)-np.min(in_vector) - #r2r_threshold = r2r_threshold*range_vr - #r2r_error = r2r_error*range_vr - #in_vector = np.zeros((DATA_SIZE,)) - #for i in range(0,int(DATA_SIZE/4)): - # in_vector[i] = 0.0 - #for i in range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): - # in_vector[i] = 5.0 - #for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): - # in_vector[i] = random.uniform(MIN_D, MAX_D) - #for i in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): - # in_vector[i] = -7.0 - #for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): - # in_vector[i] = 0.001 - - print(DATA_SIZE) - #in_vector = in_vector.astype('float32') - in_vector_gpu = cp.asarray(in_vector) - - # variable = ctypes.c_size_t(0) - # outSize = ctypes.pointer(variable) - for i in range(200): - s_time = time.time() - o_bytes, outSize = quant_device_compress(in_vector_gpu, DATA_SIZE, 256, r2r_threshold) - print("Time python: "+str(time.time()-s_time)) - # print(outSize[0]) - print("Compress Success...starting decompress ") - comp = Comp() - - s_time = time.time() - (d_bytes,ptr )= quant_device_decompress(DATA_SIZE*2, o_bytes, comp, in_vector_gpu.dtype) - - # free_compressed(o_bytes[0]) - # cp.cuda.runtime.free(ptr) - print("Time python: "+str(time.time()-s_time)) - #for i in d_bytes: - # print(i) - print("Decompress Success") +import numpy as 
np +import ctypes +from ctypes import * +import random +from qtensor.tools.lazy_import import cupy as cp +import time +import torch + +from pathlib import Path + +BS = 32 + +def quant_device_compress(oriData, nbEle, blockSize,threshold): + #print(nbEle) + ori_nbEle = nbEle + variable = ctypes.c_size_t(0) + outSize = ctypes.pointer(variable) + + oriData = oriData.flatten() + ori_real = oriData.real + ori_imag = oriData.imag + oriData = cp.concatenate((ori_real, ori_imag)) + sample = oriData[::2] + max_val = cp.amax(oriData).get() + min_val = cp.amin(oriData).get() + d = max_val - min_val + if d.dtype == np.complex64: + d = d.real + threshold = threshold*(d) + s_1 = time.time() + truth_values = abs(oriData)<=threshold + oriData[truth_values] = 0.0 + truth_values = cp.invert(truth_values) + ori_len = oriData.shape[0] + nonzero_percent = cp.count_nonzero(oriData)/oriData.shape[0] + print("Percent nonzero: "+str(nonzero_percent)) + + isGrouped = False + if nonzero_percent<=0.5: + isGrouped=True + oriData = oriData[truth_values] + + nbEle = oriData.shape[0] + + # oriData = cp.reshape(oriData, (-1, blockSize)) # Reshape to blocksize + tensor = torch.as_tensor(oriData, device='cuda') + # print("Min val: "+str(cp.amin(oriData).get())+" range: "+str(d)) +# scale = d/255.0 +# zero_point = -1*round(min_val*scale) - 128 + if isGrouped: + pad_rows = int(nbEle/BS) + if nbEle%BS != 0: + pad_rows +=1 + + padded = torch.zeros(pad_rows*BS, device='cuda') + padded[:nbEle] = tensor + tensor = padded + tensor = torch.reshape(tensor, (-1, BS)) + maxs = torch.flatten(torch.max(tensor, dim=1)[0]) + mins = torch.flatten(torch.min(tensor, dim=1)[0]) + + #scales = torch.ones(tensor.shape[0], device='cuda') + #scales = torch.mul(scales, d/255.0) + #print(d) + #print(torch.max(torch.sub(maxs,mins))) + scales = torch.abs(torch.sub(maxs,mins))/127.0 + zero_points = torch.zeros(tensor.shape[0], device='cuda') + #zero_points = torch.round(torch.div(torch.add(maxs,mins)/2,scales)) + #zero_points = torch.neg(torch.round(torch.div(mins,scales)))+64 + + #print(zero_points) + + #scale = d/((2**8) - 1) + #zero_point = -1*round(min_val*scale) + #zero_point = -1*round(min_val*scale)+32 +# q_tensor = torch.quantize_per_tensor(tensor, scale, zero_point, dtype=torch.qint8) + #tensor = torch.flatten(tensor) + #tensor = torch.split(tensor, BS) + #print(maxs) + #print(mins) + #print(scales) + + q_tensor = torch.quantize_per_channel(tensor, scales, zero_points,0, dtype=torch.qint8) + #q_tensor = torch.quantize_per_tensor(tensor, scale, zero_point, dtype=torch.qint8) + del tensor + torch.cuda.empty_cache() + if isGrouped: + bitmap = cp.packbits(truth_values) + else: + bitmap = None + del truth_values + #q_ten2 = torch.dequantize(q_tensor) + #print(tensor) + #print(q_ten2) + #print("Max PW error") + #print(torch.max(torch.div(torch.abs(torch.sub(tensor[tensor!=0.0],q_ten2[tensor!=0.0])),tensor[tensor!=0.0]))) + return (q_tensor, bitmap, isGrouped), (nbEle/2)+(ori_len/8) + + +def quant_device_decompress(nbEle, cmpBytes, owner, dtype): + (q_tensor, bitmap, isGrouped) = cmpBytes + if isGrouped: + bitmap = cp.unpackbits(bitmap) + restored = torch.flatten(torch.dequantize(q_tensor)) + + arr = cp.asarray(restored) + # uint8_t* cmpbytes, size_t len, size_t compressed_len, float r2r_error + + # decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, num_elements_eff) + # -- Workaround to convert GPU pointer to int + # p_decompressed_ptr = ctypes.addressof(newData) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr 
to pointer to int64) + # p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + # decompressed_int = p_decompressed_int.contents + # # -- + # pointer_for_free = decompressed_int.value + # # self.decompressed_own.append(decompressed_int.value) + # mem = cp.cuda.UnownedMemory(decompressed_int.value, nbEle*4, owner, device_id=0) + # mem_ptr = cp.cuda.memory.MemoryPointer(mem, 0) + #print("mem ptr") + #print(mem_ptr) + # arr = cp.ndarray(shape=(nbEle,), dtype=np.float32, memptr=mem_ptr) + #print(nbEle) + if isGrouped: + res = cp.zeros((nbEle,)) + # ## need to convert newData to cupy + cp.place(res,bitmap,arr) + + c_res = cp.zeros(int(nbEle/2), np.complex64) + #c_res.real = arr[0:int(nbEle/2)] + #c_res.imag = arr[int(nbEle/2):] + + c_res.real = res[0:int(nbEle/2)] + c_res.imag = res[int(nbEle/2):] + else: + c_res = cp.zeros(int(nbEle/2), np.complex64) + c_res.real = arr[0:int(nbEle/2)] + c_res.imag = arr[int(nbEle/2):] + return (c_res, None) + +### Example of device compress/decompress wrapper usage +class Comp(): + def __init__(self): + self.name = "dummy" + +def free_compressed(ptr): + p_ptr = ctypes.addressof(ptr) + p_int = ctypes.cast(p_ptr, ctypes.POINTER(ctypes.c_uint64)) + decomp_int = p_int.contents + cp.cuda.runtime.free(decomp_int.value) + + +if __name__ == "__main__": + + DATA_SIZE = int(1024) + MAX_D = 10.0 + MIN_D = -10.0 + RANGE = MAX_D - MIN_D + r2r_threshold = 0.002 + r2r_error = 0.0001 + + in_vector = np.fromfile("all_sample.bin", dtype=np.complex64) + #print(np.max(in_vector)) + DATA_SIZE = len(in_vector) + #range_vr = np.max(in_vector)-np.min(in_vector) + #r2r_threshold = r2r_threshold*range_vr + #r2r_error = r2r_error*range_vr + #in_vector = np.zeros((DATA_SIZE,)) + #for i in range(0,int(DATA_SIZE/4)): + # in_vector[i] = 0.0 + #for i in range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): + # in_vector[i] = 5.0 + #for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): + # in_vector[i] = random.uniform(MIN_D, MAX_D) + #for i in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): + # in_vector[i] = -7.0 + #for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): + # in_vector[i] = 0.001 + + print(DATA_SIZE) + #in_vector = in_vector.astype('float32') + in_vector_gpu = cp.asarray(in_vector) + + # variable = ctypes.c_size_t(0) + # outSize = ctypes.pointer(variable) + for i in range(200): + s_time = time.time() + o_bytes, outSize = quant_device_compress(in_vector_gpu, DATA_SIZE, 256, r2r_threshold) + print("Time python: "+str(time.time()-s_time)) + # print(outSize[0]) + print("Compress Success...starting decompress ") + comp = Comp() + + s_time = time.time() + (d_bytes,ptr )= quant_device_decompress(DATA_SIZE*2, o_bytes, comp, in_vector_gpu.dtype) + + # free_compressed(o_bytes[0]) + # cp.cuda.runtime.free(ptr) + print("Time python: "+str(time.time()-s_time)) + #for i in d_bytes: + # print(i) + print("Decompress Success") From c8447ad7692378823d188a6006cdf215680aca9c Mon Sep 17 00:00:00 2001 From: Danil Date: Fri, 22 Mar 2024 05:34:48 +0000 Subject: [PATCH 122/126] trying to make cuSZp work --- qtensor/compression/Compressor.py | 22 ++++++++++++++------ qtensor/compression/szp/src/cuSZp_wrapper.py | 4 ++-- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/qtensor/compression/Compressor.py b/qtensor/compression/Compressor.py index fecfc45d..520f7c5a 100644 --- a/qtensor/compression/Compressor.py +++ b/qtensor/compression/Compressor.py @@ -5,8 +5,8 @@ print(Path(__file__).parent/'szx/src/') sys.path.append(str(Path(__file__).parent/'szx/src/')) 
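[Editor's note] The per-channel variant in torch_quant_perchannel.py above reshapes the retained values into rows of BS=32 elements and gives every row its own scale, so each block's error is bounded by its local range rather than the global one. A minimal sketch of that step, assuming a PyTorch build where per-channel quantization of CUDA tensors is available; padding and the 127-level scale follow the patch, the integer zero points are a simplification:

    import torch

    def quantize_per_block(flat, block=32):
        # Pad to a multiple of the block size, then view as (n_blocks, block).
        n = flat.numel()
        pad = (-n) % block
        padded = torch.nn.functional.pad(flat, (0, pad))
        rows = padded.reshape(-1, block)
        # One scale per row; rows with zero range would need a small epsilon.
        scales = (rows.max(dim=1).values - rows.min(dim=1).values).abs() / 127.0
        zero_points = torch.zeros(rows.shape[0], dtype=torch.int64, device=flat.device)
        return torch.quantize_per_channel(rows, scales, zero_points, 0, dtype=torch.qint8)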
sys.path.append('./szx/src') -# sys.path.append(str(Path(__file__).parent/'szp/src/')) -# sys.path.append('./szp/src') +sys.path.append(str(Path(__file__).parent/'szp/src/')) +sys.path.append('./szp/src') sys.path.append(str(Path(__file__).parent/'cusz/src')) sys.path.append('./cusz/src') @@ -19,7 +19,7 @@ import torch try: from cuszx_wrapper import cuszx_host_compress, cuszx_host_decompress, cuszx_device_compress, cuszx_device_decompress - # from cuSZp_wrapper import cuszp_device_compress, cuszp_device_decompress + from cuSZp_wrapper import cuszp_device_compress, cuszp_device_decompress from cusz_wrapper import cusz_device_compress, cusz_device_decompress from torch_quant_perchannel import quant_device_compress, quant_device_decompress from newsz_wrapper import newsz_device_compress, newsz_device_decompress @@ -166,14 +166,24 @@ def free_decompressed(self): self.decompressed_own = [] def free_compressed(self, ptr): + import ctypes, cupy cmp_bytes, num_elements_eff, shape, dtype, _ = ptr + p_decompressed_ptr = ctypes.addressof(cmp_bytes[0]) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + decompressed_int = p_decompressed_int.contents + cupy.cuda.runtime.free(decompressed_int.value) + cupy.get_default_memory_pool().free_all_blocks() del cmp_bytes def compress(self, data): isCupy, num_elements_eff = _get_data_info(data) dtype = data.dtype - cmp_bytes, outSize_ptr = cuszp_device_compress(data, self.r2r_error,self.r2r_threshold) - return (cmp_bytes, num_elements_eff, data.shape, dtype, outSize_ptr) + print("Compressing") + print(type(data), type(num_elements_eff)) + cmp_bytes, outSize_ptr = cuszp_device_compress(data, self.r2r_error,num_elements_eff, self.r2r_threshold) + return (cmp_bytes, num_elements_eff, data.shape, dtype, outSize_ptr.contents.value) # return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr.contents.value) def compress_size(self, ptr): @@ -182,7 +192,7 @@ def compress_size(self, ptr): def decompress(self, obj): import cupy cmp_bytes, num_elements_eff, shape, dtype, cmpsize = obj - decompressed_ptr = cuszp_device_decompress(num_elements_eff, cmp_bytes) + decompressed_ptr = cuszp_device_decompress(num_elements_eff, cmp_bytes, cmpsize, self, dtype) arr_cp = decompressed_ptr[0] arr = cupy.reshape(arr_cp, shape) diff --git a/qtensor/compression/szp/src/cuSZp_wrapper.py b/qtensor/compression/szp/src/cuSZp_wrapper.py index 9abe1fb1..4e887a3b 100644 --- a/qtensor/compression/szp/src/cuSZp_wrapper.py +++ b/qtensor/compression/szp/src/cuSZp_wrapper.py @@ -7,8 +7,8 @@ import torch from pathlib import Path -#LIB_PATH = str(Path(__file__).parent/'libcuszp_wrapper.so') -LIB_PATH = '/home/mkshah5/QTensor/qtensor/compression/szp/src/libcuszp_wrapper.so' +LIB_PATH = str(Path(__file__).parent/'libcuszp_wrapper.so') +#LIB_PATH = '/home/mkshah5/QTensor/qtensor/compression/szp/src/libcuszp_wrapper.so' # unsigned char* cuSZp_device_compress(float *oriData, size_t *outSize, float absErrBound, size_t nbEle){ def get_device_compress(): From 447fc329fad4a8ea48449e3f429de7e531322d7e Mon Sep 17 00:00:00 2001 From: Danil Date: Fri, 26 Apr 2024 06:32:50 +0000 Subject: [PATCH 123/126] use cuszp module in compressor; add compressors to init.py --- qtensor/compression/Compressor.py | 64 ++++++++++++++----- qtensor/compression/__init__.py | 4 ++ .../tests/test_compressed_tensor.py | 55 ++++++++++------ 
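[Editor's note] free_compressed above releases memory that was cudaMalloc'ed inside the C library, so it recovers the raw address from the returned ctypes pointer and hands it to the CUDA runtime. In sketch form, with an illustrative function name:

    import ctypes
    import cupy as cp

    def free_device_buffer(c_ptr):
        # Recover the 64-bit device address stored in the ctypes pointer object.
        addr = ctypes.cast(ctypes.addressof(c_ptr),
                           ctypes.POINTER(ctypes.c_uint64)).contents.value
        cp.cuda.runtime.free(addr)
        cp.get_default_memory_pool().free_all_blocks()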
 qtensor/compression/tests/test_memory_leak.py |  3 +
 .../performance_measurement_decorator.py      |  4 +-
 5 files changed, 94 insertions(+), 36 deletions(-)

diff --git a/qtensor/compression/Compressor.py b/qtensor/compression/Compressor.py
index 520f7c5a..407e8410 100644
--- a/qtensor/compression/Compressor.py
+++ b/qtensor/compression/Compressor.py
@@ -17,6 +17,7 @@
 
 
 import torch
+import cuszp
 try:
     from cuszx_wrapper import cuszx_host_compress, cuszx_host_decompress, cuszx_device_compress, cuszx_device_decompress
     from cuSZp_wrapper import cuszp_device_compress, cuszp_device_decompress
@@ -157,17 +158,30 @@ def __init__(self, r2r_error=1e-3, r2r_threshold=1e-3):
 
     def free_decompressed(self):
         import cupy
-        print("Cleanup", len(self.decompressed_own))
+        print("Decompressed data Cleanup", len(self.decompressed_own))
         for x in self.decompressed_own:
-            del x
-        cupy.get_default_memory_pool().free_all_blocks()
-        cupy.get_default_pinned_memory_pool().free_all_blocks()
-        torch.cuda.empty_cache()
+            cupy.cuda.runtime.free(x)
+            # del x
+            # need to run this for every x?
+            cupy.get_default_memory_pool().free_all_blocks()
+            #cupy.get_default_pinned_memory_pool().free_all_blocks()
+            #torch.cuda.empty_cache()
         self.decompressed_own = []
+        #cupy.get_default_memory_pool().free_all_blocks()
+        #cupy.get_default_pinned_memory_pool().free_all_blocks()
+        #torch.cuda.empty_cache()
+        #self.decompressed_own = []
 
     def free_compressed(self, ptr):
+        #return
         import ctypes, cupy
-        cmp_bytes, num_elements_eff, shape, dtype, _ = ptr
+        #cmp_bytes, num_elements_eff, shape, dtype, _ = ptr
+        cmp_t_real, cmp_t_imag, shape, dtype = ptr
+        del cmp_t_real
+        del cmp_t_imag
+        torch.cuda.empty_cache()
+        return
+        print(f"Freeing compressed data {num_elements_eff}")
         p_decompressed_ptr = ctypes.addressof(cmp_bytes[0])
         # cast to int64 pointer
         # (effectively converting pointer to pointer to addr to pointer to int64)
@@ -175,28 +189,48 @@ def free_compressed(self, ptr):
         decompressed_int = p_decompressed_int.contents
         cupy.cuda.runtime.free(decompressed_int.value)
         cupy.get_default_memory_pool().free_all_blocks()
-        del cmp_bytes
+        #del cmp_bytes
 
     def compress(self, data):
         isCupy, num_elements_eff = _get_data_info(data)
         dtype = data.dtype
-        print("Compressing")
-        print(type(data), type(num_elements_eff))
-        cmp_bytes, outSize_ptr = cuszp_device_compress(data, self.r2r_error, num_elements_eff, self.r2r_threshold)
-        return (cmp_bytes, num_elements_eff, data.shape, dtype, outSize_ptr.contents.value)
+        # convert cupy to torch
+        data_imag = torch.as_tensor(data.imag, device='cuda').contiguous()
+        data_real = torch.as_tensor(data.real, device='cuda').contiguous()
+        print(f"cuszp Compressing {type(data)}")
+        #cmp_bytes, outSize_ptr = cuszp_device_compress(data, self.r2r_error, num_elements_eff, self.r2r_threshold)
+        cmp_t_real = cuszp.compress(data_real, self.r2r_error, 'rel')
+        cmp_t_imag = cuszp.compress(data_imag, self.r2r_error, 'rel')
+        return (cmp_t_real, cmp_t_imag, data.shape, dtype)
         # return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr.contents.value)
 
     def compress_size(self, ptr):
-        return ptr[4]
+        #return ptr[4]
+        return ptr[0].nbytes + ptr[1].nbytes
 
     def decompress(self, obj):
        import cupy
-        cmp_bytes, num_elements_eff, shape, dtype, cmpsize = obj
-        decompressed_ptr = cuszp_device_decompress(num_elements_eff, cmp_bytes, cmpsize, self, dtype)
+        #cmp_bytes, num_elements_eff, shape, dtype, cmpsize = obj
+        #decompressed_ptr = cuszp_device_decompress(num_elements_eff, cmp_bytes, cmpsize, self, dtype)
+        cmp_t_real, cmp_t_imag, shape, dtype = obj
+        num_elements_decompressed = 1
+        for s in shape:
+            num_elements_decompressed *= s
+        decomp_t_real = cuszp.decompress(cmp_t_real, num_elements_decompressed, cmp_t_real.nbytes, self.r2r_error, 'rel')
+        decomp_t_imag = cuszp.decompress(cmp_t_imag, num_elements_decompressed, cmp_t_imag.nbytes, self.r2r_error, 'rel')
+        decomp_t = decomp_t_real + 1j * decomp_t_imag
+        arr_cp = cupy.asarray(decomp_t)
+        arr = cupy.reshape(arr_cp, shape)
+        return arr
         arr_cp = decompressed_ptr[0]
+        # Cupy memory management might not deallocate memory properly
+        #arr = cupy.reshape(arr_cp, shape)
+        #self.decompressed_own.append(arr)
+        # Use pointer instead, as in cuszx
+        arr_cp = decompressed_ptr[0]
+        self.decompressed_own.append(decompressed_ptr[1])
         arr = cupy.reshape(arr_cp, shape)
-        self.decompressed_own.append(arr)
         return arr
 
 
 class TorchCompressor(Compressor):
diff --git a/qtensor/compression/__init__.py b/qtensor/compression/__init__.py
index 9e320426..7a69a45c 100644
--- a/qtensor/compression/__init__.py
+++ b/qtensor/compression/__init__.py
@@ -4,7 +4,11 @@
     CUSZCompressor,
     CUSZXCompressor,
     ProfileCompressor,
+    CUSZPCompressor,
+    TorchCompressor,
 )
 from .CompressedTensor import CompressedTensor, Tensor
 from .compressed_contraction import compressed_contract, compressed_sum
 from .cost_estimation import compressed_contraction_cost
+
+
diff --git a/qtensor/compression/tests/test_compressed_tensor.py b/qtensor/compression/tests/test_compressed_tensor.py
index 29ad3243..dd71f97d 100644
--- a/qtensor/compression/tests/test_compressed_tensor.py
+++ b/qtensor/compression/tests/test_compressed_tensor.py
@@ -1,10 +1,16 @@
 from qtensor.compression import CompressedTensor
-from qtensor.compression import NumpyCompressor, CUSZCompressor
+from qtensor.compression import (
+    NumpyCompressor,
+    CUSZPCompressor,
+    CUSZXCompressor,
+    TorchCompressor,
+)
 from qtree.optimizer import Var
 from qtree.system_defs import NP_ARRAY_TYPE
 import pytest
 import numpy as np
 
+
 def test_empty_tensor():
     shape = (2, 3, 4)
     indices = [Var(i, size=s) for i, s in enumerate(shape)]
@@ -42,27 +48,38 @@ def test_slice_tensor():
     assert S.data is not None
     assert np.allclose(t.get_chunk([1, 2]), S.data)
 
-@pytest.mark.parametrize(argnames=["shape", "compressor", "dtype"],
-                         argvalues=[
-                             ((2, 3, 4), NumpyCompressor(), np.float32),
-                             ((2, 3, 4), NumpyCompressor(), np.float64),
-                             ((2, 3, 4), CUSZCompressor(), np.float32),
-                             ((2, 3, 4), CUSZCompressor(), np.float64),
-                             ((2, 3, 4), CUSZCompressor(), np.complex128),
-                             ((2,)*20, CUSZCompressor(), np.float32),
-                             ((2,)*20, CUSZCompressor(), np.complex64),
-                             # Not supported:
-                             #((2,)*20, CUSZCompressor(), np.float64)
-                         ]
-                         )
-def test_compressors(shape, compressor, dtype):
-    print(shape, compressor, dtype)
+
+@pytest.mark.parametrize(
+    argnames=["shape", "compressor_cls", "dtype"],
+    argvalues=[
+        ((2, 3, 4), NumpyCompressor, np.float32),
+        ((2, 3, 4), NumpyCompressor, np.float64),
+        #((2,) * 20, TorchCompressor, np.complex64),
+        ((2,) * 20, CUSZXCompressor, np.complex64),
+        ((2,) * 20, CUSZPCompressor, np.complex64),
+
+        # Not supported:
+        # ((2, 3, 4), CUSZXCompressor, np.float32),
+        # ((2, 3, 4), CUSZXCompressor, np.float64),
+        # ((2, 3, 4), CUSZXCompressor, np.complex128),
+        # ((2,)*20, CUSZXCompressor, np.float32),
+        # ((2,)*20, CUSZCompressor(), np.float64)
+    ],
+)
+def test_compressors(shape, compressor_cls, dtype):
+    print(shape, compressor_cls, dtype)
+    compressor = compressor_cls()
     import cupy
+
     indices = [Var(i, size=s) for i, s in enumerate(shape)]
     if dtype is np.complex128:
-        data = cupy.random.random(shape, dtype=np.float64) + 1j*cupy.random.random(shape, dtype=np.float64)
+        data = cupy.random.random(shape, dtype=np.float64) + 1j * cupy.random.random(
+            shape, dtype=np.float64
+        )
     elif dtype is np.complex64:
-        data = cupy.random.random(shape, dtype=np.float32) + 1j*cupy.random.random(shape, dtype=np.float32)
+        data = cupy.random.random(shape, dtype=np.float32) + 1j * cupy.random.random(
+            shape, dtype=np.float32
+        )
     else:
         data = cupy.random.random(shape, dtype=dtype)
     t = CompressedTensor("myT", indices, data=data, compressor=compressor)
@@ -76,4 +93,4 @@
     ref = cupy.asnumpy(s.data)
     assert np.allclose(ch, ref)
 
-    assert np.allclose(ch, data[1], rtol=0.1, atol=.01)
+    assert np.allclose(ch, data[1], rtol=0.15, atol=0.05)
diff --git a/qtensor/compression/tests/test_memory_leak.py b/qtensor/compression/tests/test_memory_leak.py
index e0ca675c..34f327a2 100644
--- a/qtensor/compression/tests/test_memory_leak.py
+++ b/qtensor/compression/tests/test_memory_leak.py
@@ -106,3 +106,6 @@ def test_leak_contract():
         print(
             f"== [{j}] Memory history: {[np.round(x, 2) for x in _mem_histories]} GB =="
         )
+
+if __name__ == "__main__":
+    test_leak_contract()
diff --git a/qtensor/contraction_backends/performance_measurement_decorator.py b/qtensor/contraction_backends/performance_measurement_decorator.py
index 39efffa7..c2ae5bc3 100644
--- a/qtensor/contraction_backends/performance_measurement_decorator.py
+++ b/qtensor/contraction_backends/performance_measurement_decorator.py
@@ -77,7 +77,7 @@ def check_store(self):
         if isinstance(self.backend, CompressionBackend):
             gpu_mem += 8*2**self.backend.max_tw
         self.mem_history.append(dict(
-            mem=gpu_mem,
+            mem=total_mem,
             cupy_bufsize=mempool.total_bytes(),
             nvmem = self._get_nvsmi_mem(),
             cupybuf=mempool.total_bytes(),
             tensors=len(self.object_store),
             tensors_sizes=[len(tensor.indices) for tensor in self.object_store.values()]
         ))
         # --
-        print('MH', self.mem_history[-1])
+        #print('MH', self.mem_history[-1])
         if cupy_mem>1024**2:
             self._print("CuPy memory usage", cupy_mem/1024/1024, "MB. Total MB:", mempool.total_bytes()/1024**2)

From 6137d2c50958875bc4ff6fe41211422e1e0c9fb1 Mon Sep 17 00:00:00 2001
From: Danil
Date: Fri, 26 Apr 2024 07:02:24 +0000
Subject: [PATCH 124/126] add test for compressed energy exp calculation

---
 qtensor/compression/compressed_contraction.py |  2 +-
 .../test_compressed_energy_expectation.py     | 24 +++++++++++++++++++
 2 files changed, 25 insertions(+), 1 deletion(-)
 create mode 100644 qtensor/compression/tests/test_compressed_energy_expectation.py

diff --git a/qtensor/compression/compressed_contraction.py b/qtensor/compression/compressed_contraction.py
index 893987f9..1023021c 100644
--- a/qtensor/compression/compressed_contraction.py
+++ b/qtensor/compression/compressed_contraction.py
@@ -203,5 +203,5 @@ def compressed_sum(A:Tensor, sum_ixs,
             R = Tensor(new_tensor_name, result_indices, data=chunk)
         else:
             R.set_chunk(r_i, chunk)
-    compressor.compressor.free_decompressed()
+    compressor.free_decompressed()
     return R
diff --git a/qtensor/compression/tests/test_compressed_energy_expectation.py b/qtensor/compression/tests/test_compressed_energy_expectation.py
new file mode 100644
index 00000000..895999f9
--- /dev/null
+++ b/qtensor/compression/tests/test_compressed_energy_expectation.py
@@ -0,0 +1,24 @@
+import qtensor
+import numpy as np
+from qtensor.compression import CUSZPCompressor
+import qtensor.tests
+
+def test_compress_energy_expect():
+    G, gamma, beta = qtensor.tests.get_test_problem(n=10, p=2, type='random')
+    edge = list(G.edges())[0]
+    composer = qtensor.QtreeQAOAComposer(G, gamma=gamma, beta=beta)
+    composer.energy_expectation_lightcone(edge)
+    circuit = composer.circuit
+    base_backend = qtensor.contraction_backends.get_backend('cupy')
+    compressor = CUSZPCompressor(r2r_error=1e-4, r2r_threshold=1e-4)
+    backend = qtensor.contraction_backends.CompressionBackend(base_backend, compressor, max_tw=6)
+    sim = qtensor.QtreeSimulator(backend=backend)
+    res = sim.simulate(circuit)
+    sim_exact = qtensor.QtreeSimulator(backend=base_backend)
+    ref = sim_exact.simulate(circuit)
+    print(f'exact: {ref}, compressed: {res}')
+    assert np.allclose(res, ref, atol=1e-4, rtol=0.05)
+
+if __name__ == '__main__':
+    test_compress_energy_expect()
+    print('test passed!')

From 1506748f617a3c4ee1d18779568b60c36ccf4dc9 Mon Sep 17 00:00:00 2001
From: Danil
Date: Wed, 8 May 2024 04:28:37 +0000
Subject: [PATCH 125/126] fix cuszp implementation

---
 qtensor/compression/Compressor.py | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/qtensor/compression/Compressor.py b/qtensor/compression/Compressor.py
index 407e8410..39e2aef4 100644
--- a/qtensor/compression/Compressor.py
+++ b/qtensor/compression/Compressor.py
@@ -176,9 +176,8 @@ def free_compressed(self, ptr):
         #return
         import ctypes, cupy
         #cmp_bytes, num_elements_eff, shape, dtype, _ = ptr
-        cmp_t_real, cmp_t_imag, shape, dtype = ptr
-        del cmp_t_real
-        del cmp_t_imag
+        cmp_t, shape, dtype = ptr
+        del cmp_t
         torch.cuda.empty_cache()
         return
         print(f"Freeing compressed data {num_elements_eff}")
@@ -194,31 +193,34 @@ def free_compressed(self, ptr):
     def compress(self, data):
         isCupy, num_elements_eff = _get_data_info(data)
         dtype = data.dtype
+        shape = data.shape
         # convert cupy to torch
-        data_imag = torch.as_tensor(data.imag, device='cuda').contiguous()
-        data_real = torch.as_tensor(data.real, device='cuda').contiguous()
-        print(f"cuszp Compressing {type(data)}")
+        # TODO: cast to one array of double the number of elements
+        torch_data = torch.tensor(data, device='cuda')
+        data_view = torch.view_as_real(torch_data)
+        #print(f"cuszp Compressing {type(data)}")
         #cmp_bytes, outSize_ptr = cuszp_device_compress(data, self.r2r_error, num_elements_eff, self.r2r_threshold)
-        cmp_t_real = cuszp.compress(data_real, self.r2r_error, 'rel')
-        cmp_t_imag = cuszp.compress(data_imag, self.r2r_error, 'rel')
-        return (cmp_t_real, cmp_t_imag, data.shape, dtype)
+        cmp_t = cuszp.compress(data_view, self.r2r_error, 'rel')
+        return (cmp_t, shape, dtype)
         # return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr.contents.value)
 
     def compress_size(self, ptr):
         #return ptr[4]
-        return ptr[0].nbytes + ptr[1].nbytes
+        return ptr[0].nbytes
 
     def decompress(self, obj):
         import cupy
         #cmp_bytes, num_elements_eff, shape, dtype, cmpsize = obj
        #decompressed_ptr = cuszp_device_decompress(num_elements_eff, cmp_bytes, cmpsize, self, dtype)
-        cmp_t_real, cmp_t_imag, shape, dtype = obj
+        cmp_t, shape, dtype = obj
         num_elements_decompressed = 1
         for s in shape:
             num_elements_decompressed *= s
-        decomp_t_real = cuszp.decompress(cmp_t_real, num_elements_decompressed, cmp_t_real.nbytes, self.r2r_error, 'rel')
-        decomp_t_imag = cuszp.decompress(cmp_t_imag, num_elements_decompressed, cmp_t_imag.nbytes, self.r2r_error, 'rel')
-        decomp_t = decomp_t_real + 1j * decomp_t_imag
+        # Number of elements is twice because the shape is for complex numbers
+        num_elements_decompressed *= 2
+        decomp_t_float = cuszp.decompress(cmp_t, num_elements_decompressed, cmp_t.nbytes, self.r2r_error, 'rel')
+        decomp_t_float = decomp_t_float.view(decomp_t_float.shape[0]//2, 2)
+        decomp_t = torch.view_as_complex(decomp_t_float)
         arr_cp = cupy.asarray(decomp_t)
         arr = cupy.reshape(arr_cp, shape)
         return arr

From 82fb5865db52f54bad3a23d8feb44aebada3a065 Mon Sep 17 00:00:00 2001
From: Dan Lykov
Date: Thu, 9 May 2024 00:05:27 -0500
Subject: [PATCH 126/126] cuszp compressor import optional

---
 qtensor/compression/Compressor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/qtensor/compression/Compressor.py b/qtensor/compression/Compressor.py
index 39e2aef4..75db40ab 100644
--- a/qtensor/compression/Compressor.py
+++ b/qtensor/compression/Compressor.py
@@ -17,8 +17,8 @@
 
 
 import torch
-import cuszp
 try:
+    import cuszp
     from cuszx_wrapper import cuszx_host_compress, cuszx_host_decompress, cuszx_device_compress, cuszx_device_decompress
     from cuSZp_wrapper import cuszp_device_compress, cuszp_device_decompress
     from cusz_wrapper import cusz_device_compress, cusz_device_decompress